1 //
2 // Copyright 2003-2008 Rational Discovery LLC and Greg Landrum
3 // @@ All Rights Reserved @@
4 // This file is part of the RDKit.
5 // The contents are covered by the terms of the BSD license
6 // which is included in the file license.txt, found at the root
7 // of the RDKit source tree.
8 //
9 #include <cstring>
10
11 #include <RDBoost/Wrap.h>
12 #include <RDBoost/import_array.h>
13
14 namespace python = boost::python;
15
16 #include <ML/InfoTheory/InfoGainFuncs.h>
17
18 /***********************************************
19
20 constructs a variable table for the data passed in
21 The table for a given variable records the number of times each possible
22 value
23 of that variable appears for each possible result of the function.
24
25 **Arguments**
26
27 - vals: pointer to double, contains the values of the variable,
28 should be sorted
29
30 - nVals: int, the length of _vals_
31
32 - cuts: pointer to int, the indices of the quantization bounds
33
34 - nCuts: int, the length of _cuts_
35
36 - starts: pointer to int, the potential starting points for quantization
37 bounds
38
39 - nStarts: int, the length of _starts_
40
41 - results: poitner to int, the result codes
42
43 - nPossibleRes: int, the number of possible result codes
44
45
46 **Returns**
47
48 _varTable_ (a pointer to int), which is also modified in place.
49
50 **Notes:**
51
52 - _varTable_ is modified in place
53
54 - the _results_ array is assumed to be _nVals_ long
55
56 ***********************************************/
GenVarTable(double * vals,int nVals,long int * cuts,int nCuts,long int * starts,long int * results,int nPossibleRes,long int * varTable)57 long int *GenVarTable(double *vals, int nVals, long int *cuts, int nCuts,
58 long int *starts, long int *results, int nPossibleRes,
59 long int *varTable) {
60 RDUNUSED_PARAM(vals);
61 int nBins = nCuts + 1;
62 int idx, i, iTab;
63
64 memset(varTable, 0, nBins * nPossibleRes * sizeof(long int));
65 idx = 0;
66 for (i = 0; i < nCuts; i++) {
67 int cut = cuts[i];
68 iTab = i * nPossibleRes;
69 while (idx < starts[cut]) {
70 varTable[iTab + results[idx]] += 1;
71 idx++;
72 }
73 }
74 iTab = nCuts * nPossibleRes;
75 while (idx < nVals) {
76 varTable[iTab + results[idx]] += 1;
77 idx++;
78 }
79 return varTable;
80 }
81
82 /***********************************************
83
84 This actually does the recursion required by *cQuantize_RecurseOnBounds()*,
85 we do things this way to avoid having to convert things back and forth
86 from Python objects
87
88 **Arguments**
89
90 - vals: pointer to double, contains the values of the variable,
91 should be sorted
92
93 - nVals: int, the length of _vals_
94
95 - cuts: pointer to int, the indices of the quantization bounds
96
97 - nCuts: int, the length of _cuts_
98
99 - which: int, the quant bound being modified here
100
101 - starts: pointer to int, the potential starting points for quantization
102 bounds
103
104 - nStarts: int, the length of _starts_
105
106 - results: poitner to int, the result codes
107
108 - nPossibleRes: int, the number of possible result codes
109
110
111 **Returns**
112
113 a double, the expected information gain for the best bounds found
114 (which are found in _cuts_ )
115
116 **Notes:**
117
118 - _cuts_ is modified in place
119
120 - the _results_ array is assumed to be _nVals_ long
121
122 ***********************************************/
RecurseHelper(double * vals,int nVals,long int * cuts,int nCuts,int which,long int * starts,int nStarts,long int * results,int nPossibleRes)123 double RecurseHelper(double *vals, int nVals, long int *cuts, int nCuts,
124 int which, long int *starts, int nStarts,
125 long int *results, int nPossibleRes) {
126 PRECONDITION(vals, "bad vals pointer");
127
128 double maxGain = -1e6, gainHere;
129 long int *bestCuts, *tCuts;
130 long int *varTable = nullptr;
131 int highestCutHere = nStarts - nCuts + which;
132 int i, nBounds = nCuts;
133
134 varTable = (long int *)calloc((nCuts + 1) * nPossibleRes, sizeof(long int));
135 bestCuts = (long int *)calloc(nCuts, sizeof(long int));
136 tCuts = (long int *)calloc(nCuts, sizeof(long int));
137 CHECK_INVARIANT(varTable, "failed to allocate memory");
138 CHECK_INVARIANT(bestCuts, "failed to allocate memory");
139 CHECK_INVARIANT(tCuts, "failed to allocate memory");
140 GenVarTable(vals, nVals, cuts, nCuts, starts, results, nPossibleRes,
141 varTable);
142 while (cuts[which] <= highestCutHere) {
143 gainHere = RDInfoTheory::InfoEntropyGain(varTable, nCuts + 1, nPossibleRes);
144 if (gainHere > maxGain) {
145 maxGain = gainHere;
146 memcpy(bestCuts, cuts, nCuts * sizeof(long int));
147 }
148
149 // recurse on the next vars if needed
150 if (which < nBounds - 1) {
151 memcpy(tCuts, cuts, nCuts * sizeof(long int));
152 gainHere = RecurseHelper(vals, nVals, tCuts, nCuts, which + 1, starts,
153 nStarts, results, nPossibleRes);
154 if (gainHere > maxGain) {
155 maxGain = gainHere;
156 memcpy(bestCuts, tCuts, nCuts * sizeof(long int));
157 }
158 }
159
160 // update this cut
161 int oldCut = cuts[which];
162 cuts[which] += 1;
163 int top, bot;
164 bot = starts[oldCut];
165 if (oldCut + 1 < nStarts) {
166 top = starts[oldCut + 1];
167 } else {
168 top = starts[nStarts - 1];
169 }
170 for (i = bot; i < top; i++) {
171 int v = results[i];
172 varTable[which * nPossibleRes + v] += 1;
173 varTable[(which + 1) * nPossibleRes + v] -= 1;
174 }
175 for (i = which + 1; i < nBounds; i++) {
176 if (cuts[i] == cuts[i - 1]) {
177 cuts[i] += 1;
178 }
179 }
180 }
181 memcpy(cuts, bestCuts, nCuts * sizeof(long int));
182 free(tCuts);
183 free(bestCuts);
184 free(varTable);
185 return maxGain;
186 }
187
188 /***********************************************
189
190 Recursively finds the best quantization boundaries
191
192 **Arguments**
193
194 - vals: a 1D Numeric array with the values of the variables,
195 this should be sorted
196
197 - cuts: a list with the indices of the quantization bounds
198 (indices are into _starts_ )
199
200 - which: an integer indicating which bound is being adjusted here
201 (and index into _cuts_ )
202
203 - starts: a list of potential starting points for quantization bounds
204
205 - results: a 1D Numeric array of integer result codes
206
207 - nPossibleRes: an integer with the number of possible result codes
208
209 **Returns**
210
211 - a 2-tuple containing:
212
213 1) the best information gain found so far
214
215 2) a list of the quantization bound indices ( _cuts_ for the best case)
216
217 **Notes**
218
219 - this is not even remotely efficient, which is why a C replacement
220 was written
221
222 - this is a drop-in replacement for *ML.Data.Quantize._PyRecurseBounds*
223
224 ***********************************************/
cQuantize_RecurseOnBounds(python::object vals,python::list pyCuts,int which,python::list pyStarts,python::object results,int nPossibleRes)225 static python::tuple cQuantize_RecurseOnBounds(python::object vals,
226 python::list pyCuts, int which,
227 python::list pyStarts,
228 python::object results,
229 int nPossibleRes) {
230 PyArrayObject *contigVals, *contigResults;
231 long int *cuts, *starts;
232
233 /*
234 -------
235
236 Setup code
237
238 -------
239 */
240 contigVals = reinterpret_cast<PyArrayObject *>(
241 PyArray_ContiguousFromObject(vals.ptr(), NPY_DOUBLE, 1, 1));
242 if (!contigVals) {
243 throw_value_error("could not convert value argument");
244 }
245
246 contigResults = reinterpret_cast<PyArrayObject *>(
247 PyArray_ContiguousFromObject(results.ptr(), NPY_LONG, 1, 1));
248 if (!contigResults) {
249 throw_value_error("could not convert results argument");
250 }
251
252 python::ssize_t nCuts = python::len(pyCuts);
253 cuts = (long int *)calloc(nCuts, sizeof(long int));
254 CHECK_INVARIANT(cuts, "failed to allocate memory");
255 for (python::ssize_t i = 0; i < nCuts; i++) {
256 python::object elem = pyCuts[i];
257 cuts[i] = python::extract<long int>(elem);
258 }
259
260 python::ssize_t nStarts = python::len(pyStarts);
261 starts = (long int *)calloc(nStarts, sizeof(long int));
262 CHECK_INVARIANT(starts, "failed to allocate memory");
263 for (python::ssize_t i = 0; i < nStarts; i++) {
264 python::object elem = pyStarts[i];
265 starts[i] = python::extract<long int>(elem);
266 }
267
268 // do the real work
269 double gain = RecurseHelper(
270 (double *)PyArray_DATA(contigVals), PyArray_DIM(contigVals, 0), cuts,
271 nCuts, which, starts, nStarts, (long int *)PyArray_DATA(contigResults),
272 nPossibleRes);
273
274 /*
275 -------
276
277 Construct the return value
278
279 -------
280 */
281 python::list cutObj;
282 for (python::ssize_t i = 0; i < nCuts; i++) {
283 cutObj.append(cuts[i]);
284 }
285 free(cuts);
286 free(starts);
287 return python::make_tuple(gain, cutObj);
288 }
289
cQuantize_FindStartPoints(python::object values,python::object results,int nData)290 static python::list cQuantize_FindStartPoints(python::object values,
291 python::object results,
292 int nData) {
293 python::list startPts;
294
295 if (nData < 2) {
296 return startPts;
297 }
298
299 auto *contigVals = reinterpret_cast<PyArrayObject *>(
300 PyArray_ContiguousFromObject(values.ptr(), NPY_DOUBLE, 1, 1));
301 if (!contigVals) {
302 throw_value_error("could not convert value argument");
303 }
304
305 auto *vals = (double *)PyArray_DATA(contigVals);
306
307 auto *contigResults = reinterpret_cast<PyArrayObject *>(
308 PyArray_ContiguousFromObject(results.ptr(), NPY_LONG, 1, 1));
309 if (!contigResults) {
310 throw_value_error("could not convert results argument");
311 }
312
313 long *res = (long *)PyArray_DATA(contigResults);
314
315 bool firstBlock = true;
316 long lastBlockAct = -2, blockAct = res[0];
317 int lastDiv = -1;
318 double tol = 1e-8;
319
320 int i = 1;
321 while (i < nData) {
322 while (i < nData && vals[i] - vals[i - 1] <= tol) {
323 if (res[i] != blockAct) {
324 blockAct = -1;
325 }
326 ++i;
327 }
328 if (firstBlock) {
329 firstBlock = false;
330 lastBlockAct = blockAct;
331 lastDiv = i;
332 } else {
333 if (blockAct == -1 || lastBlockAct == -1 || blockAct != lastBlockAct) {
334 startPts.append(lastDiv);
335 lastDiv = i;
336 lastBlockAct = blockAct;
337 } else {
338 lastDiv = i;
339 }
340 }
341 if (i < nData) {
342 blockAct = res[i];
343 }
344 ++i;
345 }
346
347 // catch the case that the last point also sets a bin:
348 if (blockAct != lastBlockAct) {
349 startPts.append(lastDiv);
350 }
351
352 return startPts;
353 }
354
BOOST_PYTHON_MODULE(cQuantize)355 BOOST_PYTHON_MODULE(cQuantize) {
356 rdkit_import_array();
357
358 python::def("_RecurseOnBounds", cQuantize_RecurseOnBounds,
359 (python::arg("vals"), python::arg("pyCuts"), python::arg("which"),
360 python::arg("pyStarts"), python::arg("results"),
361 python::arg("nPossibleRes")),
362 "TODO: provide docstring");
363 python::def(
364 "_FindStartPoints", cQuantize_FindStartPoints,
365 (python::arg("values"), python::arg("results"), python::arg("nData")),
366 "TODO: provide docstring");
367 }
368