1 // $Id$
2 //
3 // Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC
4 //
5 // @@ All Rights Reserved @@
6 // This file is part of the RDKit.
7 // The contents are covered by the terms of the BSD license
8 // which is included in the file license.txt, found at the root
9 // of the RDKit source tree.
10 //
11 #define PY_ARRAY_UNIQUE_SYMBOL rdmetric_array_API
12 #include <RDBoost/python.h>
13 #include <RDBoost/boost_numpy.h>
14
15 #include <RDBoost/PySequenceHolder.h>
16 #include <RDBoost/Wrap.h>
17 #include <RDBoost/import_array.h>
18
19 #include <RDGeneral/types.h>
20
21 #include <DataManip/MetricMatrixCalc/MetricMatrixCalc.h>
22 #include <DataManip/MetricMatrixCalc/MetricFuncs.h>
23 #include <DataStructs/BitVects.h>
24 #include <string>
25
26 using namespace RDDataManip;
27
28 void wrap_MMcalc();
29
30 namespace python = boost::python;
31 namespace RDDataManip {
32
getEuclideanDistMat(python::object descripMat)33 PyObject *getEuclideanDistMat(python::object descripMat) {
34 // Bit of a pain involved here, we accept three types of PyObjects here
35 // 1. A Numeric Array
36 // - first find what 'type' of entry we have (float, double and int is all
37 // we recognize for now)
38 // - then point to contiguous piece of memory from the array that contains
39 // the data with a type*
40 // - then make a new type** pointer so that double index into this
41 // contiguous memory will work
42 // and then pass it along to the distance calculator
43 // 2. A list of Numeric Vector (or 1D arrays)
44 // - in this case wrap descripMat with a PySequenceHolder<type*> where
45 // type is the
46 // type of entry in vector (accepted types are int, double and float
47 // - Then pass the PySequenceHolder to the metric calculator
48 // 3. A list (or tuple) of lists (or tuple)
49 // - In this case other than wrapping descripMat with a PySequenceHolder
50 // each of the individual list in there are also wrapped by a
51 // PySequenceHolder
52 // - so the distance calculator is passed in a
53 // "PySequenceHolder<PySequenceHolder<double>>"
54 // - FIX: not that we always convert entry values to double here, even if
55 // we passed
56 // in a list of list of ints (or floats). Given that lists can be
57 // heterogeneous, I do not
58 // know how to ask a list what type of entries if contains.
59 //
60 // OK my brain is going to explode now
61
62 // first deal with situation where we have an Numeric Array
63 PyObject *descMatObj = descripMat.ptr();
64 PyArrayObject *distRes;
65 if (PyArray_Check(descMatObj)) {
66 // get the dimensions of the array
67 int nrows = PyArray_DIM((PyArrayObject *)descMatObj, 0);
68 int ncols = PyArray_DIM((PyArrayObject *)descMatObj, 1);
69 int i;
70 CHECK_INVARIANT((nrows > 0) && (ncols > 0), "");
71
72 npy_intp dMatLen = nrows * (nrows - 1) / 2;
73
74 // now that we have the dimensions declare the distance matrix which is
75 // always a
76 // 1D double array
77 distRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
78
79 // grab a pointer to the data in the array so that we can directly put
80 // values in there
81 // and avoid copying :
82 auto *dMat = (double *)PyArray_DATA(distRes);
83
84 PyArrayObject *copy;
85 copy = (PyArrayObject *)PyArray_ContiguousFromObject(
86 descMatObj, PyArray_DESCR((PyArrayObject *)descMatObj)->type_num, 2, 2);
87 // if we have double array
88 if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num == NPY_DOUBLE) {
89 auto *desc = (double *)PyArray_DATA((PyArrayObject *)descMatObj);
90
91 // REVIEW: create an adaptor object to hold a double * and support
92 // operator[]() so that we don't have to do this stuff:
93
94 // here is the 2D array trick this so that when the distance calaculator
95 // asks for desc2D[i] we basically get the ith row as double*
96 auto **desc2D = new double *[nrows];
97 for (i = 0; i < nrows; i++) {
98 desc2D[i] = desc;
99 desc += ncols;
100 }
101 MetricMatrixCalc<double **, double *> mmCalc;
102 mmCalc.setMetricFunc(&EuclideanDistanceMetric<double *, double *>);
103 mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
104
105 delete[] desc2D;
106 // we got the distance matrix we are happy so return
107 return PyArray_Return(distRes);
108 }
109
110 // if we have a float array
111 else if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num ==
112 NPY_FLOAT) {
113 auto *desc = (float *)PyArray_DATA(copy);
114 auto **desc2D = new float *[nrows];
115 for (i = 0; i < nrows; i++) {
116 desc2D[i] = desc;
117 desc += ncols;
118 }
119 MetricMatrixCalc<float **, float *> mmCalc;
120 mmCalc.setMetricFunc(&EuclideanDistanceMetric<float *, float *>);
121 mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
122 delete[] desc2D;
123 return PyArray_Return(distRes);
124 }
125
126 // if we have an integer array
127 else if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num == NPY_INT) {
128 int *desc = (int *)PyArray_DATA(copy);
129 auto **desc2D = new int *[nrows];
130 for (i = 0; i < nrows; i++) {
131 desc2D[i] = desc;
132 desc += ncols;
133 }
134 MetricMatrixCalc<int **, int *> mmCalc;
135 mmCalc.setMetricFunc(&EuclideanDistanceMetric<int *, int *>);
136 mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
137 delete[] desc2D;
138 return PyArray_Return(distRes);
139 } else {
140 // unrecognized type for the matrix, throw up
141 throw_value_error(
142 "The array has to be of type int, float, or double for "
143 "GetEuclideanDistMat");
144 }
145 } // done with an array input
146 else {
147 // REVIEW: removed a ton of code here
148
149 // we have probably have a list or a tuple
150
151 unsigned int ncols = 0;
152 unsigned int nrows =
153 python::extract<unsigned int>(descripMat.attr("__len__")());
154 CHECK_INVARIANT(nrows > 0, "Empty list passed in");
155
156 npy_intp dMatLen = nrows * (nrows - 1) / 2;
157 distRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
158 auto *dMat = (double *)PyArray_DATA(distRes);
159
160 // assume that we a have a list of list of values (that can be extracted to
161 // double)
162 std::vector<PySequenceHolder<double> > dData;
163 dData.reserve(nrows);
164 for (unsigned int i = 0; i < nrows; i++) {
165 // PySequenceHolder<double> row(seq[i]);
166 PySequenceHolder<double> row(descripMat[i]);
167 if (i == 0) {
168 ncols = row.size();
169 } else if (row.size() != ncols) {
170 throw_value_error("All subsequences must be the same length");
171 }
172 dData.push_back(row);
173 }
174
175 MetricMatrixCalc<std::vector<PySequenceHolder<double> >,
176 PySequenceHolder<double> > mmCalc;
177 mmCalc.setMetricFunc(&EuclideanDistanceMetric<PySequenceHolder<double>,
178 PySequenceHolder<double> >);
179 mmCalc.calcMetricMatrix(dData, nrows, ncols, dMat);
180 }
181 return PyArray_Return(distRes);
182 }
183
getTanimotoDistMat(python::object bitVectList)184 PyObject *getTanimotoDistMat(python::object bitVectList) {
185 // we will assume here that we have a either a list of ExplicitBitVectors or
186 // SparseBitVects
187 int nrows = python::extract<int>(bitVectList.attr("__len__")());
188 CHECK_INVARIANT(nrows > 1, "");
189
190 // First check what type of vector we have
191 python::object v1 = bitVectList[0];
192 python::extract<ExplicitBitVect> ebvWorks(v1);
193 python::extract<SparseBitVect> sbvWorks(v1);
194 if (!ebvWorks.check() && !sbvWorks.check()) {
195 throw_value_error(
196 "GetTanimotoDistMat can only take a sequence of ExplicitBitVects or "
197 "SparseBitvects");
198 }
199
200 npy_intp dMatLen = nrows * (nrows - 1) / 2;
201 auto *simRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
202 auto *sMat = (double *)PyArray_DATA(simRes);
203
204 if (ebvWorks.check()) {
205 PySequenceHolder<ExplicitBitVect> dData(bitVectList);
206 MetricMatrixCalc<PySequenceHolder<ExplicitBitVect>, ExplicitBitVect> mmCalc;
207 mmCalc.setMetricFunc(
208 &TanimotoDistanceMetric<ExplicitBitVect, ExplicitBitVect>);
209 mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
210 } else if (sbvWorks.check()) {
211 PySequenceHolder<SparseBitVect> dData(bitVectList);
212 MetricMatrixCalc<PySequenceHolder<SparseBitVect>, SparseBitVect> mmCalc;
213 mmCalc.setMetricFunc(&TanimotoDistanceMetric<SparseBitVect, SparseBitVect>);
214 mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
215 }
216 return PyArray_Return(simRes);
217 }
218
getTanimotoSimMat(python::object bitVectList)219 PyObject *getTanimotoSimMat(python::object bitVectList) {
220 // we will assume here that we have a either a list of ExplicitBitVectors or
221 // SparseBitVects
222 int nrows = python::extract<int>(bitVectList.attr("__len__")());
223 CHECK_INVARIANT(nrows > 1, "");
224
225 // First check what type of vector we have
226 python::object v1 = bitVectList[0];
227 python::extract<ExplicitBitVect> ebvWorks(v1);
228 python::extract<SparseBitVect> sbvWorks(v1);
229 if (!ebvWorks.check() && !sbvWorks.check()) {
230 throw_value_error(
231 "GetTanimotoDistMat can only take a sequence of ExplicitBitVects or "
232 "SparseBitvects");
233 }
234
235 npy_intp dMatLen = nrows * (nrows - 1) / 2;
236 auto *simRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
237 auto *sMat = (double *)PyArray_DATA(simRes);
238
239 if (ebvWorks.check()) {
240 PySequenceHolder<ExplicitBitVect> dData(bitVectList);
241 MetricMatrixCalc<PySequenceHolder<ExplicitBitVect>, ExplicitBitVect> mmCalc;
242 mmCalc.setMetricFunc(
243 &TanimotoSimilarityMetric<ExplicitBitVect, ExplicitBitVect>);
244 mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
245 } else if (sbvWorks.check()) {
246 PySequenceHolder<SparseBitVect> dData(bitVectList);
247 MetricMatrixCalc<PySequenceHolder<SparseBitVect>, SparseBitVect> mmCalc;
248 mmCalc.setMetricFunc(
249 &TanimotoSimilarityMetric<SparseBitVect, SparseBitVect>);
250 mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
251 }
252 return PyArray_Return(simRes);
253 }
254 }
255
// Python module definition: sets the module docstring, initializes the NumPy
// C API and exposes the three matrix calculators defined in RDDataManip.
BOOST_PYTHON_MODULE(rdMetricMatrixCalc) {
  python::scope().attr("__doc__") =
      "Module containing the calculator for metric matrix calculation, \n"
      "e.g. similarity and distance matrices";

  // has to run before any of the PyArray_* calls made by the wrapped functions
  rdkit_import_array();

  // The docstrings are built from adjacent string literals so that the
  // indentation of this source file never becomes part of the text.
  std::string docString;
  docString =
      "Compute the distance matrix from a descriptor matrix using the "
      "Euclidean distance metric\n\n"
      "  ARGUMENTS: \n"
      "\n"
      "    descripMat - A python object of any one of the following types \n"
      "                   1. A numeric array of dimensions n by m where n is "
      "the number of items in the data set \n"
      "                       and m is the number of descriptors \n"
      "                   2. A list of Numeric Vectors (or 1D arrays), each "
      "entry in the list corresponds \n"
      "                       to descriptor vector for one item \n"
      "                   3. A list (or tuple) of lists (or tuples) of values, "
      "where the values can be extracted to \n"
      "                       double. \n\n"
      "  RETURNS: \n"
      "    A numeric one-dimensional array containing the lower triangle "
      "elements of the symmetric distance matrix\n\n";
  python::def("GetEuclideanDistMat", RDDataManip::getEuclideanDistMat,
              docString.c_str());

  docString =
      "Compute the distance matrix from a list of BitVects using the Tanimoto "
      "distance metric\n\n"
      "  ARGUMENTS: \n"
      "\n"
      "    bitVectList - a list of bit vectors. The entries can be either "
      "ExplicitBitVects or SparseBitVects; \n"
      "                  the type is determined from the first entry of the "
      "list\n\n"
      "  RETURNS: \n"
      "    A numeric 1 dimensional array containing the lower triangle "
      "elements of the\n"
      "    symmetric distance matrix\n\n";
  python::def("GetTanimotoDistMat", RDDataManip::getTanimotoDistMat,
              docString.c_str());

  docString =
      "Compute the similarity matrix from a list of BitVects \n\n"
      "  ARGUMENTS: \n"
      "\n"
      "    bitVectList - a list of bit vectors. The entries can be either "
      "ExplicitBitVects or SparseBitVects; \n"
      "                  the type is determined from the first entry of the "
      "list\n\n"
      "  RETURNS: \n"
      "    A numeric 1 dimensional array containing the lower triangle "
      "elements of the symmetric similarity matrix\n\n";
  python::def("GetTanimotoSimMat", RDDataManip::getTanimotoSimMat,
              docString.c_str());
}
303