1 // $Id$
2 //
3 //  Copyright (C) 2003-2008 Greg Landrum and Rational Discovery LLC
4 //
5 //  @@ All Rights Reserved @@
6 //  This file is part of the RDKit.
7 //  The contents are covered by the terms of the BSD license
8 //  which is included in the file license.txt, found at the root
9 //  of the RDKit source tree.
10 //
11 #define PY_ARRAY_UNIQUE_SYMBOL rdmetric_array_API
12 #include <RDBoost/python.h>
13 #include <RDBoost/boost_numpy.h>
14 
15 #include <RDBoost/PySequenceHolder.h>
16 #include <RDBoost/Wrap.h>
17 #include <RDBoost/import_array.h>
18 
19 #include <RDGeneral/types.h>
20 
21 #include <DataManip/MetricMatrixCalc/MetricMatrixCalc.h>
22 #include <DataManip/MetricMatrixCalc/MetricFuncs.h>
23 #include <DataStructs/BitVects.h>
24 #include <string>
25 
26 using namespace RDDataManip;
27 
28 void wrap_MMcalc();
29 
30 namespace python = boost::python;
31 namespace RDDataManip {
32 
getEuclideanDistMat(python::object descripMat)33 PyObject *getEuclideanDistMat(python::object descripMat) {
34   // Bit of a pain involved here, we accept three types of PyObjects here
35   // 1. A Numeric Array
36   //     - first find what 'type' of entry we have (float, double and int is all
37   //     we recognize for now)
38   //     - then point to contiguous piece of memory from the array that contains
39   //     the data with a type*
40   //     - then make a new type** pointer so that double index into this
41   //     contiguous memory will work
42   //       and then pass it along to the distance calculator
43   // 2. A list of Numeric Vector (or 1D arrays)
44   //     - in this case wrap descripMat with a PySequenceHolder<type*> where
45   //     type is the
46   //       type of entry in vector (accepted types are int, double and float
47   //     - Then pass the PySequenceHolder to the metric calculator
48   // 3. A list (or tuple) of lists (or tuple)
49   //     - In this case other than wrapping descripMat with a PySequenceHolder
50   //       each of the individual list in there are also wrapped by a
51   //       PySequenceHolder
52   //     - so the distance calculator is passed in a
53   //     "PySequenceHolder<PySequenceHolder<double>>"
54   //     - FIX: not that we always convert entry values to double here, even if
55   //     we passed
56   //       in a list of list of ints (or floats). Given that lists can be
57   //       heterogeneous, I do not
58   //       know how to ask a list what type of entries if contains.
59   //
60   //  OK my brain is going to explode now
61 
62   // first deal with situation where we have an Numeric Array
63   PyObject *descMatObj = descripMat.ptr();
64   PyArrayObject *distRes;
65   if (PyArray_Check(descMatObj)) {
66     // get the dimensions of the array
67     int nrows = PyArray_DIM((PyArrayObject *)descMatObj, 0);
68     int ncols = PyArray_DIM((PyArrayObject *)descMatObj, 1);
69     int i;
70     CHECK_INVARIANT((nrows > 0) && (ncols > 0), "");
71 
72     npy_intp dMatLen = nrows * (nrows - 1) / 2;
73 
74     // now that we have the dimensions declare the distance matrix which is
75     // always a
76     // 1D double array
77     distRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
78 
79     // grab a pointer to the data in the array so that we can directly put
80     // values in there
81     // and avoid copying :
82     auto *dMat = (double *)PyArray_DATA(distRes);
83 
84     PyArrayObject *copy;
85     copy = (PyArrayObject *)PyArray_ContiguousFromObject(
86         descMatObj, PyArray_DESCR((PyArrayObject *)descMatObj)->type_num, 2, 2);
87     // if we have double array
88     if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num == NPY_DOUBLE) {
89       auto *desc = (double *)PyArray_DATA((PyArrayObject *)descMatObj);
90 
91       // REVIEW: create an adaptor object to hold a double * and support
92       //  operator[]() so that we don't have to do this stuff:
93 
94       // here is the 2D array trick this so that when the distance calaculator
95       // asks for desc2D[i] we basically get the ith row as double*
96       auto **desc2D = new double *[nrows];
97       for (i = 0; i < nrows; i++) {
98         desc2D[i] = desc;
99         desc += ncols;
100       }
101       MetricMatrixCalc<double **, double *> mmCalc;
102       mmCalc.setMetricFunc(&EuclideanDistanceMetric<double *, double *>);
103       mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
104 
105       delete[] desc2D;
106       // we got the distance matrix we are happy so return
107       return PyArray_Return(distRes);
108     }
109 
110     // if we have a float array
111     else if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num ==
112              NPY_FLOAT) {
113       auto *desc = (float *)PyArray_DATA(copy);
114       auto **desc2D = new float *[nrows];
115       for (i = 0; i < nrows; i++) {
116         desc2D[i] = desc;
117         desc += ncols;
118       }
119       MetricMatrixCalc<float **, float *> mmCalc;
120       mmCalc.setMetricFunc(&EuclideanDistanceMetric<float *, float *>);
121       mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
122       delete[] desc2D;
123       return PyArray_Return(distRes);
124     }
125 
126     // if we have an integer array
127     else if (PyArray_DESCR((PyArrayObject *)descMatObj)->type_num == NPY_INT) {
128       int *desc = (int *)PyArray_DATA(copy);
129       auto **desc2D = new int *[nrows];
130       for (i = 0; i < nrows; i++) {
131         desc2D[i] = desc;
132         desc += ncols;
133       }
134       MetricMatrixCalc<int **, int *> mmCalc;
135       mmCalc.setMetricFunc(&EuclideanDistanceMetric<int *, int *>);
136       mmCalc.calcMetricMatrix(desc2D, nrows, ncols, dMat);
137       delete[] desc2D;
138       return PyArray_Return(distRes);
139     } else {
140       // unrecognized type for the matrix, throw up
141       throw_value_error(
142           "The array has to be of type int, float, or double for "
143           "GetEuclideanDistMat");
144     }
145   }  // done with an array input
146   else {
147     // REVIEW: removed a ton of code here
148 
149     // we have probably have a list or a tuple
150 
151     unsigned int ncols = 0;
152     unsigned int nrows =
153         python::extract<unsigned int>(descripMat.attr("__len__")());
154     CHECK_INVARIANT(nrows > 0, "Empty list passed in");
155 
156     npy_intp dMatLen = nrows * (nrows - 1) / 2;
157     distRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
158     auto *dMat = (double *)PyArray_DATA(distRes);
159 
160     // assume that we a have a list of list of values (that can be extracted to
161     // double)
162     std::vector<PySequenceHolder<double> > dData;
163     dData.reserve(nrows);
164     for (unsigned int i = 0; i < nrows; i++) {
165       // PySequenceHolder<double> row(seq[i]);
166       PySequenceHolder<double> row(descripMat[i]);
167       if (i == 0) {
168         ncols = row.size();
169       } else if (row.size() != ncols) {
170         throw_value_error("All subsequences must be the same length");
171       }
172       dData.push_back(row);
173     }
174 
175     MetricMatrixCalc<std::vector<PySequenceHolder<double> >,
176                      PySequenceHolder<double> > mmCalc;
177     mmCalc.setMetricFunc(&EuclideanDistanceMetric<PySequenceHolder<double>,
178                                                   PySequenceHolder<double> >);
179     mmCalc.calcMetricMatrix(dData, nrows, ncols, dMat);
180   }
181   return PyArray_Return(distRes);
182 }
183 
getTanimotoDistMat(python::object bitVectList)184 PyObject *getTanimotoDistMat(python::object bitVectList) {
185   // we will assume here that we have a either a list of ExplicitBitVectors or
186   // SparseBitVects
187   int nrows = python::extract<int>(bitVectList.attr("__len__")());
188   CHECK_INVARIANT(nrows > 1, "");
189 
190   // First check what type of vector we have
191   python::object v1 = bitVectList[0];
192   python::extract<ExplicitBitVect> ebvWorks(v1);
193   python::extract<SparseBitVect> sbvWorks(v1);
194   if (!ebvWorks.check() && !sbvWorks.check()) {
195     throw_value_error(
196         "GetTanimotoDistMat can only take a sequence of ExplicitBitVects or "
197         "SparseBitvects");
198   }
199 
200   npy_intp dMatLen = nrows * (nrows - 1) / 2;
201   auto *simRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
202   auto *sMat = (double *)PyArray_DATA(simRes);
203 
204   if (ebvWorks.check()) {
205     PySequenceHolder<ExplicitBitVect> dData(bitVectList);
206     MetricMatrixCalc<PySequenceHolder<ExplicitBitVect>, ExplicitBitVect> mmCalc;
207     mmCalc.setMetricFunc(
208         &TanimotoDistanceMetric<ExplicitBitVect, ExplicitBitVect>);
209     mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
210   } else if (sbvWorks.check()) {
211     PySequenceHolder<SparseBitVect> dData(bitVectList);
212     MetricMatrixCalc<PySequenceHolder<SparseBitVect>, SparseBitVect> mmCalc;
213     mmCalc.setMetricFunc(&TanimotoDistanceMetric<SparseBitVect, SparseBitVect>);
214     mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
215   }
216   return PyArray_Return(simRes);
217 }
218 
getTanimotoSimMat(python::object bitVectList)219 PyObject *getTanimotoSimMat(python::object bitVectList) {
220   // we will assume here that we have a either a list of ExplicitBitVectors or
221   // SparseBitVects
222   int nrows = python::extract<int>(bitVectList.attr("__len__")());
223   CHECK_INVARIANT(nrows > 1, "");
224 
225   // First check what type of vector we have
226   python::object v1 = bitVectList[0];
227   python::extract<ExplicitBitVect> ebvWorks(v1);
228   python::extract<SparseBitVect> sbvWorks(v1);
229   if (!ebvWorks.check() && !sbvWorks.check()) {
230     throw_value_error(
231         "GetTanimotoDistMat can only take a sequence of ExplicitBitVects or "
232         "SparseBitvects");
233   }
234 
235   npy_intp dMatLen = nrows * (nrows - 1) / 2;
236   auto *simRes = (PyArrayObject *)PyArray_SimpleNew(1, &dMatLen, NPY_DOUBLE);
237   auto *sMat = (double *)PyArray_DATA(simRes);
238 
239   if (ebvWorks.check()) {
240     PySequenceHolder<ExplicitBitVect> dData(bitVectList);
241     MetricMatrixCalc<PySequenceHolder<ExplicitBitVect>, ExplicitBitVect> mmCalc;
242     mmCalc.setMetricFunc(
243         &TanimotoSimilarityMetric<ExplicitBitVect, ExplicitBitVect>);
244     mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
245   } else if (sbvWorks.check()) {
246     PySequenceHolder<SparseBitVect> dData(bitVectList);
247     MetricMatrixCalc<PySequenceHolder<SparseBitVect>, SparseBitVect> mmCalc;
248     mmCalc.setMetricFunc(
249         &TanimotoSimilarityMetric<SparseBitVect, SparseBitVect>);
250     mmCalc.calcMetricMatrix(dData, nrows, 0, sMat);
251   }
252   return PyArray_Return(simRes);
253 }
254 }
255 
// Python module definition: sets the module docstring, initialises the numpy
// C-API and exposes the three matrix calculators as module-level functions.
BOOST_PYTHON_MODULE(rdMetricMatrixCalc) {
  python::scope().attr("__doc__") =
      "Module containing the calculator for metric matrix calculation, \n"
      "e.g. similarity and distance matrices";

  // must run before any of the wrapped functions touches the numpy C-API
  rdkit_import_array();

  std::string docString;

  docString =
      "Compute the distance matrix from a descriptor matrix using the "
      "Euclidean distance metric\n\n"
      "  ARGUMENTS: \n"
      "\n"
      "    descripMat - A python object of any one of the following types \n"
      "                   1. A numeric array of dimensions n by m where n is "
      "the number of items in the data set \n"
      "                       and m is the number of descriptors \n"
      "                   2. A list of Numeric Vectors (or 1D arrays), each "
      "entry in the list corresponds \n"
      "                       to descriptor vector for one item \n"
      "                   3. A list (or tuple) of lists (or tuples) of values, "
      "where the values can be extracted to \n"
      "                       double. \n\n"
      "  RETURNS: \n"
      "    A numeric one-dimensional array containing the lower triangle "
      "elements of the symmetric distance matrix\n\n";
  python::def("GetEuclideanDistMat", RDDataManip::getEuclideanDistMat,
              docString.c_str());

  // the wrapped function picks the ExplicitBitVect or SparseBitVect code path
  // from the first element, so both types are documented as supported
  docString =
      "Compute the distance matrix from a list of BitVects using the Tanimoto "
      "distance metric\n\n"
      "  ARGUMENTS: \n"
      "\n"
      "    bitVectList - a list of bit vectors, either ExplicitBitVects or "
      "SparseBitVects \n"
      "                  (the type is determined from the first element)\n\n"
      "  RETURNS: \n"
      "    A numeric 1 dimensional array containing the lower triangle "
      "elements of the\n"
      "    symmetric distance matrix\n\n";
  python::def("GetTanimotoDistMat", RDDataManip::getTanimotoDistMat,
              docString.c_str());

  docString =
      "Compute the similarity matrix from a list of BitVects using the "
      "Tanimoto similarity metric\n\n"
      "  ARGUMENTS: \n"
      "\n"
      "    bitVectList - a list of bit vectors, either ExplicitBitVects or "
      "SparseBitVects \n"
      "                  (the type is determined from the first element)\n\n"
      "  RETURNS: \n"
      "    A numeric 1 dimensional array containing the lower triangle "
      "elements of the symmetric similarity matrix\n\n";
  python::def("GetTanimotoSimMat", RDDataManip::getTanimotoSimMat,
              docString.c_str());
}
303