1 //
2 // Copyright 2003-2008 Rational Discovery LLC and Greg Landrum
3 //   @@ All Rights Reserved @@
4 //  This file is part of the RDKit.
5 //  The contents are covered by the terms of the BSD license
6 //  which is included in the file license.txt, found at the root
7 //  of the RDKit source tree.
8 //
9 #include <cstring>
10 
11 #include <RDBoost/Wrap.h>
12 #include <RDBoost/import_array.h>
13 
14 namespace python = boost::python;
15 
16 #include <ML/InfoTheory/InfoGainFuncs.h>
17 
18 /***********************************************
19 
20    constructs a variable table for the data passed in
21    The table for a given variable records the number of times each possible
22  value
23     of that variable appears for each possible result of the function.
24 
25   **Arguments**
26 
27    - vals: pointer to double, contains the values of the variable,
28      should be sorted
29 
30    - nVals: int, the length of _vals_
31 
32    - cuts: pointer to int, the indices of the quantization bounds
33 
34    - nCuts: int, the length of _cuts_
35 
36    - starts: pointer to int, the potential starting points for quantization
37  bounds
38 
39    - nStarts: int, the length of _starts_
40 
41    - results: poitner to int, the result codes
42 
43    - nPossibleRes: int, the number of possible result codes
44 
45 
46   **Returns**
47 
48     _varTable_ (a pointer to int), which is also modified in place.
49 
50   **Notes:**
51 
52     - _varTable_ is modified in place
53 
54     - the _results_ array is assumed to be _nVals_ long
55 
56  ***********************************************/
GenVarTable(double * vals,int nVals,long int * cuts,int nCuts,long int * starts,long int * results,int nPossibleRes,long int * varTable)57 long int *GenVarTable(double *vals, int nVals, long int *cuts, int nCuts,
58                       long int *starts, long int *results, int nPossibleRes,
59                       long int *varTable) {
60   RDUNUSED_PARAM(vals);
61   int nBins = nCuts + 1;
62   int idx, i, iTab;
63 
64   memset(varTable, 0, nBins * nPossibleRes * sizeof(long int));
65   idx = 0;
66   for (i = 0; i < nCuts; i++) {
67     int cut = cuts[i];
68     iTab = i * nPossibleRes;
69     while (idx < starts[cut]) {
70       varTable[iTab + results[idx]] += 1;
71       idx++;
72     }
73   }
74   iTab = nCuts * nPossibleRes;
75   while (idx < nVals) {
76     varTable[iTab + results[idx]] += 1;
77     idx++;
78   }
79   return varTable;
80 }
81 
82 /***********************************************
83 
84  This actually does the recursion required by *cQuantize_RecurseOnBounds()*,
85   we do things this way to avoid having to convert things back and forth
86   from Python objects
87 
88   **Arguments**
89 
90    - vals: pointer to double, contains the values of the variable,
91      should be sorted
92 
93    - nVals: int, the length of _vals_
94 
95    - cuts: pointer to int, the indices of the quantization bounds
96 
97    - nCuts: int, the length of _cuts_
98 
99    - which: int, the quant bound being modified here
100 
101    - starts: pointer to int, the potential starting points for quantization
102  bounds
103 
104    - nStarts: int, the length of _starts_
105 
106    - results: poitner to int, the result codes
107 
108    - nPossibleRes: int, the number of possible result codes
109 
110 
111   **Returns**
112 
113     a double, the expected information gain for the best bounds found
114       (which are found in _cuts_ )
115 
116   **Notes:**
117 
118     - _cuts_ is modified in place
119 
120     - the _results_ array is assumed to be _nVals_ long
121 
122  ***********************************************/
RecurseHelper(double * vals,int nVals,long int * cuts,int nCuts,int which,long int * starts,int nStarts,long int * results,int nPossibleRes)123 double RecurseHelper(double *vals, int nVals, long int *cuts, int nCuts,
124                      int which, long int *starts, int nStarts,
125                      long int *results, int nPossibleRes) {
126   PRECONDITION(vals, "bad vals pointer");
127 
128   double maxGain = -1e6, gainHere;
129   long int *bestCuts, *tCuts;
130   long int *varTable = nullptr;
131   int highestCutHere = nStarts - nCuts + which;
132   int i, nBounds = nCuts;
133 
134   varTable = (long int *)calloc((nCuts + 1) * nPossibleRes, sizeof(long int));
135   bestCuts = (long int *)calloc(nCuts, sizeof(long int));
136   tCuts = (long int *)calloc(nCuts, sizeof(long int));
137   CHECK_INVARIANT(varTable, "failed to allocate memory");
138   CHECK_INVARIANT(bestCuts, "failed to allocate memory");
139   CHECK_INVARIANT(tCuts, "failed to allocate memory");
140   GenVarTable(vals, nVals, cuts, nCuts, starts, results, nPossibleRes,
141               varTable);
142   while (cuts[which] <= highestCutHere) {
143     gainHere = RDInfoTheory::InfoEntropyGain(varTable, nCuts + 1, nPossibleRes);
144     if (gainHere > maxGain) {
145       maxGain = gainHere;
146       memcpy(bestCuts, cuts, nCuts * sizeof(long int));
147     }
148 
149     // recurse on the next vars if needed
150     if (which < nBounds - 1) {
151       memcpy(tCuts, cuts, nCuts * sizeof(long int));
152       gainHere = RecurseHelper(vals, nVals, tCuts, nCuts, which + 1, starts,
153                                nStarts, results, nPossibleRes);
154       if (gainHere > maxGain) {
155         maxGain = gainHere;
156         memcpy(bestCuts, tCuts, nCuts * sizeof(long int));
157       }
158     }
159 
160     // update this cut
161     int oldCut = cuts[which];
162     cuts[which] += 1;
163     int top, bot;
164     bot = starts[oldCut];
165     if (oldCut + 1 < nStarts) {
166       top = starts[oldCut + 1];
167     } else {
168       top = starts[nStarts - 1];
169     }
170     for (i = bot; i < top; i++) {
171       int v = results[i];
172       varTable[which * nPossibleRes + v] += 1;
173       varTable[(which + 1) * nPossibleRes + v] -= 1;
174     }
175     for (i = which + 1; i < nBounds; i++) {
176       if (cuts[i] == cuts[i - 1]) {
177         cuts[i] += 1;
178       }
179     }
180   }
181   memcpy(cuts, bestCuts, nCuts * sizeof(long int));
182   free(tCuts);
183   free(bestCuts);
184   free(varTable);
185   return maxGain;
186 }
187 
188 /***********************************************
189 
190    Recursively finds the best quantization boundaries
191 
192    **Arguments**
193 
194      - vals: a 1D Numeric array with the values of the variables,
195        this should be sorted
196 
197      - cuts: a list with the indices of the quantization bounds
198        (indices are into _starts_ )
199 
200      - which: an integer indicating which bound is being adjusted here
201        (and index into _cuts_ )
202 
203      - starts: a list of potential starting points for quantization bounds
204 
205      - results: a 1D Numeric array of integer result codes
206 
207      - nPossibleRes: an integer with the number of possible result codes
208 
209    **Returns**
210 
211      - a 2-tuple containing:
212 
213        1) the best information gain found so far
214 
215        2) a list of the quantization bound indices ( _cuts_ for the best case)
216 
217    **Notes**
218 
219     - this is not even remotely efficient, which is why a C replacement
220       was written
221 
222     - this is a drop-in replacement for *ML.Data.Quantize._PyRecurseBounds*
223 
224  ***********************************************/
cQuantize_RecurseOnBounds(python::object vals,python::list pyCuts,int which,python::list pyStarts,python::object results,int nPossibleRes)225 static python::tuple cQuantize_RecurseOnBounds(python::object vals,
226                                                python::list pyCuts, int which,
227                                                python::list pyStarts,
228                                                python::object results,
229                                                int nPossibleRes) {
230   PyArrayObject *contigVals, *contigResults;
231   long int *cuts, *starts;
232 
233   /*
234     -------
235 
236     Setup code
237 
238     -------
239   */
240   contigVals = reinterpret_cast<PyArrayObject *>(
241       PyArray_ContiguousFromObject(vals.ptr(), NPY_DOUBLE, 1, 1));
242   if (!contigVals) {
243     throw_value_error("could not convert value argument");
244   }
245 
246   contigResults = reinterpret_cast<PyArrayObject *>(
247       PyArray_ContiguousFromObject(results.ptr(), NPY_LONG, 1, 1));
248   if (!contigResults) {
249     throw_value_error("could not convert results argument");
250   }
251 
252   python::ssize_t nCuts = python::len(pyCuts);
253   cuts = (long int *)calloc(nCuts, sizeof(long int));
254   CHECK_INVARIANT(cuts, "failed to allocate memory");
255   for (python::ssize_t i = 0; i < nCuts; i++) {
256     python::object elem = pyCuts[i];
257     cuts[i] = python::extract<long int>(elem);
258   }
259 
260   python::ssize_t nStarts = python::len(pyStarts);
261   starts = (long int *)calloc(nStarts, sizeof(long int));
262   CHECK_INVARIANT(starts, "failed to allocate memory");
263   for (python::ssize_t i = 0; i < nStarts; i++) {
264     python::object elem = pyStarts[i];
265     starts[i] = python::extract<long int>(elem);
266   }
267 
268   // do the real work
269   double gain = RecurseHelper(
270       (double *)PyArray_DATA(contigVals), PyArray_DIM(contigVals, 0), cuts,
271       nCuts, which, starts, nStarts, (long int *)PyArray_DATA(contigResults),
272       nPossibleRes);
273 
274   /*
275     -------
276 
277     Construct the return value
278 
279     -------
280   */
281   python::list cutObj;
282   for (python::ssize_t i = 0; i < nCuts; i++) {
283     cutObj.append(cuts[i]);
284   }
285   free(cuts);
286   free(starts);
287   return python::make_tuple(gain, cutObj);
288 }
289 
cQuantize_FindStartPoints(python::object values,python::object results,int nData)290 static python::list cQuantize_FindStartPoints(python::object values,
291                                               python::object results,
292                                               int nData) {
293   python::list startPts;
294 
295   if (nData < 2) {
296     return startPts;
297   }
298 
299   auto *contigVals = reinterpret_cast<PyArrayObject *>(
300       PyArray_ContiguousFromObject(values.ptr(), NPY_DOUBLE, 1, 1));
301   if (!contigVals) {
302     throw_value_error("could not convert value argument");
303   }
304 
305   auto *vals = (double *)PyArray_DATA(contigVals);
306 
307   auto *contigResults = reinterpret_cast<PyArrayObject *>(
308       PyArray_ContiguousFromObject(results.ptr(), NPY_LONG, 1, 1));
309   if (!contigResults) {
310     throw_value_error("could not convert results argument");
311   }
312 
313   long *res = (long *)PyArray_DATA(contigResults);
314 
315   bool firstBlock = true;
316   long lastBlockAct = -2, blockAct = res[0];
317   int lastDiv = -1;
318   double tol = 1e-8;
319 
320   int i = 1;
321   while (i < nData) {
322     while (i < nData && vals[i] - vals[i - 1] <= tol) {
323       if (res[i] != blockAct) {
324         blockAct = -1;
325       }
326       ++i;
327     }
328     if (firstBlock) {
329       firstBlock = false;
330       lastBlockAct = blockAct;
331       lastDiv = i;
332     } else {
333       if (blockAct == -1 || lastBlockAct == -1 || blockAct != lastBlockAct) {
334         startPts.append(lastDiv);
335         lastDiv = i;
336         lastBlockAct = blockAct;
337       } else {
338         lastDiv = i;
339       }
340     }
341     if (i < nData) {
342       blockAct = res[i];
343     }
344     ++i;
345   }
346 
347   // catch the case that the last point also sets a bin:
348   if (blockAct != lastBlockAct) {
349     startPts.append(lastDiv);
350   }
351 
352   return startPts;
353 }
354 
BOOST_PYTHON_MODULE(cQuantize)355 BOOST_PYTHON_MODULE(cQuantize) {
356   rdkit_import_array();
357 
358   python::def("_RecurseOnBounds", cQuantize_RecurseOnBounds,
359               (python::arg("vals"), python::arg("pyCuts"), python::arg("which"),
360                python::arg("pyStarts"), python::arg("results"),
361                python::arg("nPossibleRes")),
362               "TODO: provide docstring");
363   python::def(
364       "_FindStartPoints", cQuantize_FindStartPoints,
365       (python::arg("values"), python::arg("results"), python::arg("nData")),
366       "TODO: provide docstring");
367 }
368