1 #include "Python.h"
2 #include <stdio.h>
3 #include <string.h>
4 #include <float.h>
5 #include "cluster.h"
6 
7 
8 /* ========================================================================= */
9 /* -- Helper routines ------------------------------------------------------ */
10 /* ========================================================================= */
11 
12 static char
extract_single_character(PyObject * object,const char variable[],const char allowed[])13 extract_single_character(PyObject* object, const char variable[],
14                          const char allowed[])
15 {
16     Py_UCS4 ch;
17     Py_ssize_t n;
18     if (!PyUnicode_Check(object)) {
19         PyErr_Format(PyExc_ValueError, "%s should be a string", variable);
20         return 0;
21     }
22     if (PyUnicode_READY(object) == -1) return 0;
23     n = PyUnicode_GET_LENGTH(object);
24     if (n != 1) {
25         PyErr_Format(PyExc_ValueError,
26                      "%s should be a single character", variable);
27         return 0;
28     }
29     ch = PyUnicode_READ_CHAR(object, 0);
30     if (ch < 128) {
31         const char c = ch;
32         if (strchr(allowed, c)) return c;
33     }
34     PyErr_Format(PyExc_ValueError,
35                  "unknown %s function specified (should be one of '%s')",
36                  variable, allowed);
37     return 0;
38 }
39 
40 static int
distance_converter(PyObject * object,void * pointer)41 distance_converter(PyObject* object, void* pointer)
42 {
43     char c;
44 
45     c = extract_single_character(object, "dist", "ebcauxsk");
46     if (c == 0) return 0;
47     *((char*)pointer) = c;
48     return 1;
49 }
50 
51 static int
method_treecluster_converter(PyObject * object,void * pointer)52 method_treecluster_converter(PyObject* object, void* pointer)
53 {
54     char c;
55 
56     c = extract_single_character(object, "method", "csma");
57     if (c == 0) return 0;
58     *((char*)pointer) = c;
59     return 1;
60 }
61 
62 static int
method_kcluster_converter(PyObject * object,void * pointer)63 method_kcluster_converter(PyObject* object, void* pointer)
64 {
65     char c;
66 
67     c = extract_single_character(object, "method", "am");
68     if (c == 0) return 0;
69     *((char*)pointer) = c;
70     return 1;
71 }
72 
73 static int
method_clusterdistance_converter(PyObject * object,void * pointer)74 method_clusterdistance_converter(PyObject* object, void* pointer)
75 {
76     char c;
77 
78     c = extract_single_character(object, "method", "amsxv");
79     if (c == 0) return 0;
80     *((char*)pointer) = c;
81     return 1;
82 }
83 
84 /* -- data ----------------------------------------------------------------- */
85 
86 typedef struct {
87     int nrows;
88     int ncols;
89     double** values;
90     Py_buffer view;
91 } Data;
92 
93 static int
data_converter(PyObject * object,void * pointer)94 data_converter(PyObject* object, void* pointer)
95 {
96     Data* data = pointer;
97     int nrows;
98     int ncols;
99     int i;
100     double** values = data->values;
101     Py_buffer* view = &data->view;
102     const char* p;
103     Py_ssize_t stride;
104     const int flag = PyBUF_ND | PyBUF_STRIDES;
105 
106     if (object == NULL) goto exit;
107     if (object == Py_None) return 1;
108 
109     if (PyObject_GetBuffer(object, view, flag) == -1) {
110         PyErr_SetString(PyExc_RuntimeError,
111                         "data matrix has unexpected format.");
112         return 0;
113     }
114 
115     if (view->ndim != 2) {
116         PyErr_Format(PyExc_RuntimeError,
117                      "data matrix has incorrect rank %d (expected 2)",
118                      view->ndim);
119         goto exit;
120     }
121     if (view->itemsize != sizeof(double)) {
122         PyErr_SetString(PyExc_RuntimeError,
123                         "data matrix has incorrect data type");
124         goto exit;
125     }
126     nrows = (int) view->shape[0];
127     ncols = (int) view->shape[1];
128     if (nrows != view->shape[0] || ncols != view->shape[1]) {
129         PyErr_Format(PyExc_ValueError,
130             "data matrix is too large (dimensions = %zd x %zd)",
131             view->shape[0], view->shape[1]);
132         goto exit;
133     }
134     if (nrows < 1 || ncols < 1) {
135         PyErr_SetString(PyExc_ValueError, "data matrix is empty");
136         goto exit;
137     }
138     stride = view->strides[0];
139     if (view->strides[1] != view->itemsize) {
140         PyErr_SetString(PyExc_RuntimeError, "data is not contiguous");
141         goto exit;
142     }
143     values = PyMem_Malloc(nrows*sizeof(double*));
144     if (!values) {
145         PyErr_NoMemory();
146         goto exit;
147     }
148     for (i = 0, p = view->buf; i < nrows; i++, p += stride)
149         values[i] = (double*)p;
150     data->values = values;
151     data->nrows = nrows;
152     data->ncols = ncols;
153     return Py_CLEANUP_SUPPORTED;
154 
155 exit:
156     if (values) PyMem_Free(values);
157     PyBuffer_Release(view);
158     return 0;
159 }
160 
161 /* -- mask ----------------------------------------------------------------- */
162 
163 typedef struct {
164     int** values;
165     Py_buffer view;
166 } Mask;
167 
168 static int
mask_converter(PyObject * object,void * pointer)169 mask_converter(PyObject* object, void* pointer)
170 {
171     Mask* mask = pointer;
172     int nrows;
173     int ncols;
174     int i;
175     int** values = mask->values;
176     Py_buffer* view = &mask->view;
177     const char* p;
178     Py_ssize_t stride;
179     const int flag = PyBUF_ND | PyBUF_STRIDES;
180 
181     if (object == NULL) goto exit;
182     if (object == Py_None) return 1;
183 
184     if (PyObject_GetBuffer(object, view, flag) == -1) {
185         PyErr_SetString(PyExc_RuntimeError, "mask has unexpected format.");
186         return 0;
187     }
188 
189     if (view->ndim != 2) {
190         PyErr_Format(PyExc_ValueError,
191                      "mask has incorrect rank %d (expected 2)", view->ndim);
192         goto exit;
193     }
194     if (view->itemsize != sizeof(int)) {
195         PyErr_SetString(PyExc_RuntimeError, "mask has incorrect data type");
196         goto exit;
197     }
198     nrows = (int) view->shape[0];
199     ncols = (int) view->shape[1];
200     if (nrows != view->shape[0] || ncols != view->shape[1]) {
201         PyErr_Format(PyExc_ValueError,
202                      "mask is too large (dimensions = %zd x %zd)",
203                      view->shape[0], view->shape[1]);
204         goto exit;
205     }
206     stride = view->strides[0];
207     if (view->strides[1] != view->itemsize) {
208         PyErr_SetString(PyExc_RuntimeError, "mask is not contiguous");
209         goto exit;
210     }
211     values = PyMem_Malloc(nrows*sizeof(int*));
212     if (!values) {
213         PyErr_NoMemory();
214         goto exit;
215     }
216     for (i = 0, p = view->buf; i < nrows; i++, p += stride)
217         values[i] = (int*)p;
218     mask->values = values;
219     return Py_CLEANUP_SUPPORTED;
220 
221 exit:
222     if (values) PyMem_Free(values);
223     PyBuffer_Release(view);
224     return 0;
225 }
226 
227 /* -- 1d array ------------------------------------------------------------- */
228 
229 static int
vector_converter(PyObject * object,void * pointer)230 vector_converter(PyObject* object, void* pointer)
231 {
232     Py_buffer* view = pointer;
233     int ndata;
234     const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
235 
236     if (object == NULL) goto exit;
237 
238     if (PyObject_GetBuffer(object, view, flag) == -1) {
239         PyErr_SetString(PyExc_RuntimeError, "unexpected format.");
240         return 0;
241     }
242 
243     if (view->ndim != 1) {
244         PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 1)",
245                      view->ndim);
246         goto exit;
247     }
248     if (view->itemsize != sizeof(double)) {
249         PyErr_SetString(PyExc_RuntimeError, "array has incorrect data type");
250         goto exit;
251     }
252     ndata = (int) view->shape[0];
253     if (ndata != view->shape[0]) {
254         PyErr_Format(PyExc_ValueError,
255                      "array is too large (size = %zd)", view->shape[0]);
256         goto exit;
257     }
258     return Py_CLEANUP_SUPPORTED;
259 
260 exit:
261     PyBuffer_Release(view);
262     return 0;
263 }
264 
265 static int
vector_none_converter(PyObject * object,void * pointer)266 vector_none_converter(PyObject* object, void* pointer)
267 {
268     if (object == Py_None) return 1;
269     return vector_converter(object, pointer);
270 }
271 
272 /* -- clusterid ------------------------------------------------------------ */
273 
274 static int
check_clusterid(Py_buffer clusterid,int nitems)275 check_clusterid(Py_buffer clusterid, int nitems) {
276     int i, j;
277     int *p = clusterid.buf;
278     int nclusters = 0;
279     int* number;
280 
281     if (nitems != clusterid.shape[0]) {
282         PyErr_Format(PyExc_ValueError, "incorrect size (%zd, expected %d)",
283                      clusterid.shape[0], nitems);
284         return 0;
285     }
286     for (i = 0; i < nitems; i++) {
287         j = p[i];
288         if (j > nclusters) nclusters = j;
289         if (j < 0) {
290             PyErr_SetString(PyExc_ValueError, "negative cluster number found");
291             return 0;
292         }
293     }
294     nclusters++;
295     /* -- Count the number of items in each cluster --------------------- */
296     number = calloc(nclusters, sizeof(int));
297     if (!number) {
298         PyErr_NoMemory();
299         return 0;
300     }
301     for (i = 0; i < nitems; i++) {
302         j = p[i];
303         number[j]++;
304     }
305     for (j = 0; j < nclusters; j++) if (number[j] == 0) break;
306     PyMem_Free(number);
307     if (j < nclusters) {
308         PyErr_Format(PyExc_ValueError, "cluster %d is empty", j);
309         return 0;
310     }
311     return nclusters;
312 }
313 
314 /* -- distance ----------------------------------------------------------- */
315 
316 typedef struct {
317     int n;
318     double** values;
319     Py_buffer* views;
320     Py_buffer view;
321 } Distancematrix;
322 
323 static int
_convert_list_to_distancematrix(PyObject * list,Distancematrix * distances)324 _convert_list_to_distancematrix(PyObject* list, Distancematrix* distances)
325 {
326     int i;
327     double** values;
328     Py_buffer* view;
329     Py_buffer* views;
330     const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
331     const int n = (int) PyList_GET_SIZE(list);
332 
333     if (n != PyList_GET_SIZE(list)) {
334         PyErr_SetString(PyExc_ValueError, "distance matrix is too large");
335         return 0;
336     }
337     values = PyMem_Malloc(n*sizeof(double*));
338     if (!values) {
339         PyErr_NoMemory();
340         return 0;
341     }
342     distances->values = values;
343     views = PyMem_Malloc(n*sizeof(Py_buffer));
344     if (!views) {
345         PyErr_NoMemory();
346         return 0;
347     }
348     view = views;
349     for (i = 0; i < n; i++, view++) {
350         PyObject* item = PyList_GET_ITEM(list, i);
351         view->len = -1;
352         if (PyObject_GetBuffer(item, view, flag) == -1) {
353             PyErr_Format(PyExc_RuntimeError, "failed to parse row %d.", i);
354             view--;
355             break;
356         }
357         if (view->ndim != 1) {
358             PyErr_Format(PyExc_ValueError,
359                          "row %d has incorrect rank (%d expected 1)",
360                          i, view->ndim);
361             break;
362         }
363         if (view->itemsize != sizeof(double)) {
364             PyErr_Format(PyExc_RuntimeError,
365                          "row %d has incorrect data type", i);
366             break;
367         }
368         if (view->shape[0] != i) {
369             PyErr_Format(PyExc_RuntimeError,
370                          "row %d has incorrect size %zd (expected %d)",
371                          i, view->shape[0], i);
372             break;
373         }
374         values[i] = view->buf;
375     }
376     if (i < n) {
377         for ( ; view >= views; view--) PyBuffer_Release(view);
378         PyMem_Free(views);
379         return 0;
380     }
381     distances->n = n;
382     distances->view.len = 0;
383     distances->views = views;
384     distances->values = values;
385     return 1;
386 }
387 
388 static int
_convert_array_to_distancematrix(PyObject * array,Distancematrix * distances)389 _convert_array_to_distancematrix(PyObject* array, Distancematrix* distances)
390 {
391     int i;
392     int n;
393     double** values;
394     double* p;
395     Py_buffer* view = &distances->view;
396     const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
397 
398     if (PyObject_GetBuffer(array, view, flag) == -1) {
399         PyErr_SetString(PyExc_RuntimeError,
400                         "distance matrix has unexpected format.");
401         return 0;
402     }
403 
404     if (view->len == 0) {
405         PyBuffer_Release(view);
406         PyErr_SetString(PyExc_ValueError, "distance matrix is empty");
407         return 0;
408     }
409     if (view->itemsize != sizeof(double)) {
410         PyErr_SetString(PyExc_RuntimeError,
411                         "distance matrix has an incorrect data type");
412         return 0;
413     }
414     if (view->ndim == 1) {
415         int m = (int) view->shape[0];
416         if (m != view->shape[0]) {
417             PyErr_Format(PyExc_ValueError,
418                          "distance matrix is too large (size = %zd)",
419                          view->shape[0]);
420             return 0;
421         }
422         n = (int)(1+sqrt(1+8*m)/2); /* rounds to (1+sqrt(1+8*m))/2 */
423         if (n*n-n != 2 * m) {
424             PyErr_SetString(PyExc_ValueError,
425                             "distance matrix has unexpected size.");
426             return 0;
427         }
428         distances->n = n;
429         values = PyMem_Malloc(n*sizeof(double*));
430         if (!values) {
431             PyErr_NoMemory();
432             return 0;
433         }
434         distances->values = values;
435         for (p = view->buf, i = 0; i < n; p += i, i++) values[i] = p;
436     }
437     else if (view->ndim == 2) {
438         n = (int) view->shape[0];
439         if (n != view->shape[0]) {
440             PyErr_Format(PyExc_ValueError,
441                          "distance matrix is too large (size = %zd)",
442                          view->shape[0]);
443             return 0;
444         }
445         distances->n = n;
446         if (view->shape[1] != n) {
447             PyErr_SetString(PyExc_ValueError,
448                             "distance matrix is not square.");
449             return 0;
450         }
451         values = PyMem_Malloc(n*sizeof(double*));
452         if (!values) {
453             PyErr_NoMemory();
454             return 0;
455         }
456         distances->values = values;
457         for (p = view->buf, i = 0; i < n; p += n, i++) values[i] = p;
458     }
459     else {
460         PyErr_Format(PyExc_ValueError,
461                      "distance matrix has incorrect rank %d (expected 1 or 2)",
462                      view->ndim);
463         return 0;
464     }
465     return 1;
466 }
467 
468 static int
distancematrix_converter(PyObject * argument,void * pointer)469 distancematrix_converter(PyObject* argument, void* pointer)
470 {
471     Distancematrix* distances = pointer;
472     double** values;
473 
474     if (argument == NULL) goto exit;
475     if (argument == Py_None) return 1;
476     if (PyList_Check(argument)) {
477         if (_convert_list_to_distancematrix(argument, distances))
478             return Py_CLEANUP_SUPPORTED;
479     }
480     else {
481         if (_convert_array_to_distancematrix(argument, distances))
482             return Py_CLEANUP_SUPPORTED;
483     }
484 
485 exit:
486     values = distances->values;
487     if (values == NULL) return 0;
488     else {
489         int i;
490         const int n = distances->n;
491         Py_buffer* views = distances->views;
492         if (views) {
493             for (i = 0; i < n; i++) PyBuffer_Release(&views[i]);
494             PyMem_Free(views);
495         }
496         else if (distances->view.len) {
497             PyBuffer_Release(&distances->view);
498         }
499         PyMem_Free(values);
500     }
501     return 0;
502 }
503 
504 /* -- celldata ------------------------------------------------------------- */
505 
506 typedef struct {
507     int nx;
508     int ny;
509     int nz;
510     double*** values;
511     Py_buffer view;
512 } Celldata;
513 
514 static int
celldata_converter(PyObject * argument,void * pointer)515 celldata_converter(PyObject* argument, void* pointer)
516 {
517     int i, n;
518     double* p;
519     Celldata* celldata = pointer;
520     double*** ppp = celldata->values;
521     double** pp = ppp ? ppp[0] : NULL;
522     int nx;
523     int ny;
524     int nz;
525     Py_buffer* view = &celldata->view;
526     const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
527 
528     if (argument == NULL) goto exit;
529 
530     if (PyObject_GetBuffer(argument, view, flag) == -1) {
531         PyErr_SetString(PyExc_RuntimeError,
532                         "celldata array has unexpected format.");
533         return 0;
534     }
535 
536     nx = (int) view->shape[0];
537     ny = (int) view->shape[1];
538     nz = (int) view->shape[2];
539     if (nx != view->shape[0] || ny != view->shape[1] || nz != view->shape[2]) {
540         PyErr_SetString(PyExc_RuntimeError, "celldata array too large");
541         goto exit;
542     }
543     if (view->itemsize != sizeof(double)) {
544         PyErr_SetString(PyExc_RuntimeError,
545                         "celldata array has incorrect data type");
546         goto exit;
547     }
548     pp = PyMem_Malloc(nx*ny*sizeof(double*));
549     ppp = PyMem_Malloc(nx*sizeof(double**));
550     if (!pp || !ppp) {
551         PyErr_NoMemory();
552         goto exit;
553     }
554     p = view->buf;
555     n = nx * ny;
556     for (i = 0; i < n; i++, p += nz) pp[i] = p;
557     for (i = 0; i < nx; i++, pp += ny) ppp[i] = pp;
558     celldata->values = ppp;
559     celldata->nx = nx;
560     celldata->ny = ny;
561     celldata->nz = nz;
562     return Py_CLEANUP_SUPPORTED;
563 
564 exit:
565     if (pp) PyMem_Free(pp);
566     if (ppp) PyMem_Free(ppp);
567     PyBuffer_Release(view);
568     return 0;
569 }
570 
571 
572 /* -- index ---------------------------------------------------------------- */
573 
574 static int
index_converter(PyObject * argument,void * pointer)575 index_converter(PyObject* argument, void* pointer)
576 {
577     Py_buffer* view = pointer;
578     int n;
579     const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
580 
581     if (argument == NULL) goto exit;
582 
583     if (PyObject_GetBuffer(argument, view, flag) == -1) {
584         PyErr_SetString(PyExc_RuntimeError, "unexpected format.");
585         return 0;
586     }
587 
588     if (view->ndim != 1) {
589         PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 1)",
590                      view->ndim);
591         goto exit;
592     }
593     if (view->itemsize != sizeof(int)) {
594         PyErr_SetString(PyExc_RuntimeError,
595                         "argument has incorrect data type");
596         goto exit;
597     }
598     n = (int) view->shape[0];
599     if (n != view->shape[0]) {
600         PyErr_Format(PyExc_ValueError,
601             "array size is too large (size = %zd)", view->shape[0]);
602         goto exit;
603     }
604     return Py_CLEANUP_SUPPORTED;
605 
606 exit:
607     PyBuffer_Release(view);
608     return 0;
609 }
610 
611 /* -- index2d ------------------------------------------------------------- */
612 
613 static int
index2d_converter(PyObject * argument,void * pointer)614 index2d_converter(PyObject* argument, void* pointer)
615 {
616     Py_buffer* view = pointer;
617     int n;
618     const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
619 
620     if (argument == NULL) goto exit;
621 
622     if (PyObject_GetBuffer(argument, view, flag) == -1) {
623         PyErr_SetString(PyExc_RuntimeError, "unexpected format.");
624         return 0;
625     }
626 
627     if (view->ndim != 2) {
628         PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 2)",
629                      view->ndim);
630         goto exit;
631     }
632     if (view->itemsize != sizeof(int)) {
633         PyErr_SetString(PyExc_RuntimeError,
634                         "argument has incorrect data type");
635         goto exit;
636     }
637     n = (int) view->shape[0];
638     if (n != view->shape[0]) {
639         PyErr_Format(PyExc_ValueError,
640             "array size is too large (size = %zd)", view->shape[0]);
641         goto exit;
642     }
643     if (view->shape[1] != 2) {
644         PyErr_Format(PyExc_ValueError,
645             "array has %zd columns (expected 2)", view->shape[1]);
646         goto exit;
647     }
648     return Py_CLEANUP_SUPPORTED;
649 
650 exit:
651     PyBuffer_Release(view);
652     return 0;
653 }
654 
655 /* ========================================================================= */
656 /* -- Classes -------------------------------------------------------------- */
657 /* ========================================================================= */
658 
659 typedef struct {
660     PyObject_HEAD
661     Node node;
662 } PyNode;
663 
664 static int
PyNode_init(PyNode * self,PyObject * args,PyObject * kwds)665 PyNode_init(PyNode *self, PyObject *args, PyObject *kwds)
666 {
667     int left, right;
668     double distance = 0.0;
669     static char *kwlist[] = {"left", "right", "distance", NULL};
670 
671     if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|d", kwlist,
672                                       &left, &right, &distance))
673         return -1;
674     self->node.left = left;
675     self->node.right = right;
676     self->node.distance = distance;
677     return 0;
678 }
679 
680 static PyObject*
PyNode_repr(PyNode * self)681 PyNode_repr(PyNode* self)
682 {
683     char string[64];
684 
685     sprintf(string, "(%d, %d): %g",
686                    self->node.left, self->node.right, self->node.distance);
687     return PyUnicode_FromString(string);
688 }
689 
690 static char PyNode_left__doc__[] =
691 "integer representing the first member of this node";
692 
693 static PyObject*
PyNode_getleft(PyNode * self,void * closure)694 PyNode_getleft(PyNode* self, void* closure)
695 {
696     int left = self->node.left;
697 
698     return PyLong_FromLong((long)left);
699 }
700 
701 static int
PyNode_setleft(PyNode * self,PyObject * value,void * closure)702 PyNode_setleft(PyNode* self, PyObject* value, void* closure)
703 {
704     long left = PyLong_AsLong(value);
705 
706     if (PyErr_Occurred()) return -1;
707     self->node.left = (int) left;
708     return 0;
709 }
710 
711 static char PyNode_right__doc__[] =
712 "integer representing the second member of this node";
713 
714 static PyObject*
PyNode_getright(PyNode * self,void * closure)715 PyNode_getright(PyNode* self, void* closure)
716 {
717     int right = self->node.right;
718 
719     return PyLong_FromLong((long)right);
720 }
721 
722 static int
PyNode_setright(PyNode * self,PyObject * value,void * closure)723 PyNode_setright(PyNode* self, PyObject* value, void* closure)
724 {
725     long right = PyLong_AsLong(value);
726 
727     if (PyErr_Occurred()) return -1;
728     self->node.right = (int) right;
729     return 0;
730 }
731 
732 static PyObject*
PyNode_getdistance(PyNode * self,void * closure)733 PyNode_getdistance(PyNode* self, void* closure)
734 {
735     return PyFloat_FromDouble(self->node.distance);
736 }
737 
738 static int
PyNode_setdistance(PyNode * self,PyObject * value,void * closure)739 PyNode_setdistance(PyNode* self, PyObject* value, void* closure)
740 {
741     const double distance = PyFloat_AsDouble(value);
742 
743     if (PyErr_Occurred()) return -1;
744     self->node.distance = distance;
745     return 0;
746 }
747 
748 static char PyNode_distance__doc__[] =
749 "the distance between the two members of this node\n";
750 
751 static PyGetSetDef PyNode_getset[] = {
752     {"left",
753      (getter)PyNode_getleft,
754      (setter)PyNode_setleft,
755      PyNode_left__doc__, NULL},
756     {"right",
757      (getter)PyNode_getright,
758      (setter)PyNode_setright,
759      PyNode_right__doc__, NULL},
760     {"distance",
761      (getter)PyNode_getdistance,
762      (setter)PyNode_setdistance,
763      PyNode_distance__doc__, NULL},
764     {NULL}  /* Sentinel */
765 };
766 
767 static char PyNode_doc[] =
768 "A Node object describes a single node in a hierarchical clustering tree.\n"
769 "The integer attributes 'left' and 'right' represent the two members that\n"
770 "make up this node; the floating point attribute 'distance' contains the\n"
771 "distance between the two members of this node.\n";
772 
773 static PyTypeObject PyNodeType = {
774     PyVarObject_HEAD_INIT(NULL, 0)
775     "_cluster.Node",           /* tp_name */
776     sizeof(PyNode),            /* tp_basicsize */
777     0,                         /* tp_itemsize */
778     0,                         /* tp_dealloc */
779     0,                         /* tp_print */
780     0,                         /* tp_getattr */
781     0,                         /* tp_setattr */
782     0,                         /* tp_compare */
783     (reprfunc)PyNode_repr,     /* tp_repr */
784     0,                         /* tp_as_number */
785     0,                         /* tp_as_sequence */
786     0,                         /* tp_as_mapping */
787     0,                         /* tp_hash */
788     0,                         /* tp_call */
789     0,                         /* tp_str */
790     0,                         /* tp_getattro */
791     0,                         /* tp_setattro */
792     0,                         /* tp_as_buffer */
793     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,          /*tp_flags*/
794     PyNode_doc,                /* tp_doc */
795     0,                         /* tp_traverse */
796     0,                         /* tp_clear */
797     0,                         /* tp_richcompare */
798     0,                         /* tp_weaklistoffset */
799     0,                         /* tp_iter */
800     0,                         /* tp_iternext */
801     0,                         /* tp_methods */
802     0,                         /* tp_members */
803     PyNode_getset,             /* tp_getset */
804     0,                         /* tp_base */
805     0,                         /* tp_dict */
806     0,                         /* tp_descr_get */
807     0,                         /* tp_descr_set */
808     0,                         /* tp_dictoffset */
809     (initproc)PyNode_init,     /* tp_init */
810 };
811 
812 typedef struct {
813     PyObject_HEAD
814     Node* nodes;
815     int n;
816 } PyTree;
817 
818 static void
PyTree_dealloc(PyTree * self)819 PyTree_dealloc(PyTree* self)
820 {
821     if (self->n) PyMem_Free(self->nodes);
822     Py_TYPE(self)->tp_free((PyObject*)self);
823 }
824 
825 static PyObject*
PyTree_new(PyTypeObject * type,PyObject * args,PyObject * kwds)826 PyTree_new(PyTypeObject *type, PyObject* args, PyObject* kwds)
827 {
828     int i, j;
829     int n;
830     Node* nodes;
831     PyObject* arg = NULL;
832     int* flag;
833     PyTree* self;
834 
835     self = (PyTree *)type->tp_alloc(type, 0);
836     if (!self) return NULL;
837 
838     if (!PyArg_ParseTuple(args, "|O", &arg)) {
839         Py_DECREF(self);
840         return NULL;
841     }
842 
843     if (arg == NULL) {
844         self->n = 0;
845         self->nodes = NULL;
846         return (PyObject*)self;
847     }
848 
849     if (!PyList_Check(arg)) {
850         Py_DECREF(self);
851         PyErr_SetString(PyExc_TypeError,
852                         "Argument should be a list of Node objects");
853         return NULL;
854     }
855 
856     n = (int) PyList_GET_SIZE(arg);
857     if (n != PyList_GET_SIZE(arg)) {
858         Py_DECREF(self);
859         PyErr_Format(PyExc_ValueError,
860                      "List is too large (size = %zd)", PyList_GET_SIZE(arg));
861         return NULL;
862     }
863     if (n < 1) {
864         Py_DECREF(self);
865         PyErr_SetString(PyExc_ValueError, "List is empty");
866         return NULL;
867     }
868     nodes = PyMem_Malloc(n*sizeof(Node));
869     if (!nodes) {
870         Py_DECREF(self);
871         return PyErr_NoMemory();
872     }
873     for (i = 0; i < n; i++) {
874         PyNode* p;
875         PyObject* row = PyList_GET_ITEM(arg, i);
876         if (!PyType_IsSubtype(Py_TYPE(row), &PyNodeType)) {
877             PyMem_Free(nodes);
878             Py_DECREF(self);
879             PyErr_Format(PyExc_TypeError,
880                          "Row %d in list is not a Node object", i);
881             return NULL;
882         }
883         p = (PyNode*)row;
884         nodes[i] = p->node;
885     }
886     /* --- Check if this is a bona fide tree ------------------------------- */
887     flag = PyMem_Malloc((2*n+1)*sizeof(int));
888     if (!flag) {
889         PyMem_Free(nodes);
890         Py_DECREF(self);
891         return PyErr_NoMemory();
892     }
893     for (i = 0; i < 2*n+1; i++) flag[i] = 0;
894     for (i = 0; i < n; i++) {
895         j = nodes[i].left;
896         if (j < 0) {
897             j = -j-1;
898             if (j >= i) break;
899         }
900         else j += n;
901         if (flag[j]) break;
902         flag[j] = 1;
903         j = nodes[i].right;
904         if (j < 0) {
905           j = -j-1;
906           if (j >= i) break;
907         }
908         else j += n;
909         if (flag[j]) break;
910         flag[j] = 1;
911     }
912     PyMem_Free(flag);
913     if (i < n) {
914         /* break encountered */
915         PyMem_Free(nodes);
916         Py_DECREF(self);
917         PyErr_SetString(PyExc_ValueError, "Inconsistent tree");
918         return NULL;
919     }
920     self->n = n;
921     self->nodes = nodes;
922     return (PyObject*)self;
923 }
924 
925 static PyObject*
PyTree_str(PyTree * self)926 PyTree_str(PyTree* self)
927 {
928     int i;
929     const int n = self->n;
930     char string[128];
931     Node node;
932     PyObject* line;
933     PyObject* output;
934     PyObject* temp;
935 
936     output = PyUnicode_FromString("");
937     for (i = 0; i < n; i++) {
938         node = self->nodes[i];
939         sprintf(string, "(%d, %d): %g", node.left, node.right, node.distance);
940         if (i < n-1) strcat(string, "\n");
941         line = PyUnicode_FromString(string);
942         if (!line) {
943             Py_DECREF(output);
944             return NULL;
945         }
946         temp = PyUnicode_Concat(output, line);
947         if (!temp) {
948             Py_DECREF(output);
949             Py_DECREF(line);
950             return NULL;
951         }
952         output = temp;
953     }
954     return output;
955 }
956 
957 static int
PyTree_length(PyTree * self)958 PyTree_length(PyTree *self)
959 {
960     return self->n;
961 }
962 
963 static PyObject*
PyTree_subscript(PyTree * self,PyObject * item)964 PyTree_subscript(PyTree* self, PyObject* item)
965 {
966     if (PyIndex_Check(item)) {
967         PyNode* result;
968         Py_ssize_t i;
969         i = PyNumber_AsSsize_t(item, PyExc_IndexError);
970         if (i == -1 && PyErr_Occurred())
971             return NULL;
972         if (i < 0)
973             i += self->n;
974         if (i < 0 || i >= self->n) {
975             PyErr_SetString(PyExc_IndexError, "tree index out of range");
976             return NULL;
977         }
978         result = (PyNode*) PyNodeType.tp_alloc(&PyNodeType, 0);
979         if (!result) return PyErr_NoMemory();
980         result->node = self->nodes[i];
981         return (PyObject*) result;
982     }
983     else if (PySlice_Check(item)) {
984         Py_ssize_t i, j;
985         Py_ssize_t start, stop, step, slicelength;
986         if (PySlice_GetIndicesEx(item, self->n, &start, &stop, &step,
987                                  &slicelength) == -1) return NULL;
988         if (slicelength == 0) return PyList_New(0);
989         else {
990             PyNode* node;
991             PyObject* result = PyList_New(slicelength);
992             if (!result) return PyErr_NoMemory();
993             for (i = 0, j = start; i < slicelength; i++, j += step) {
994                 node = (PyNode*) PyNodeType.tp_alloc(&PyNodeType, 0);
995                 if (!node) {
996                     Py_DECREF(result);
997                     return PyErr_NoMemory();
998                 }
999                 node->node = self->nodes[j];
1000                 PyList_SET_ITEM(result, i, (PyObject*)node);
1001             }
1002             return result;
1003         }
1004     }
1005     else {
1006         PyErr_Format(PyExc_TypeError,
1007                      "tree indices must be integers, not %.200s",
1008                      item->ob_type->tp_name);
1009         return NULL;
1010     }
1011 }
1012 
1013 static PyMappingMethods PyTree_mapping = {
1014     (lenfunc)PyTree_length,           /* mp_length */
1015     (binaryfunc)PyTree_subscript,     /* mp_subscript */
1016 };
1017 
1018 static char PyTree_scale__doc__[] =
1019 "mytree.scale()\n"
1020 "\n"
1021 "Scale the node distances in the tree such that they are all between one\n"
1022 "and zero.\n";
1023 
1024 static PyObject*
PyTree_scale(PyTree * self)1025 PyTree_scale(PyTree* self)
1026 {
1027     int i;
1028     const int n = self->n;
1029     Node* nodes = self->nodes;
1030     double maximum = DBL_MIN;
1031 
1032     for (i = 0; i < n; i++) {
1033         double distance = nodes[i].distance;
1034         if (distance > maximum) maximum = distance;
1035     }
1036     if (maximum != 0.0)
1037         for (i = 0; i < n; i++) nodes[i].distance /= maximum;
1038     Py_INCREF(Py_None);
1039     return Py_None;
1040 }
1041 
1042 static char PyTree_cut__doc__[] =
1043 "mytree.cut(nclusters) -> array\n"
1044 "\n"
1045 "Divide the elements in a hierarchical clustering result mytree into\n"
1046 "clusters, and return an array with the number of the cluster to which each\n"
1047 "element was assigned. The number of clusters is given by nclusters.\n";
1048 
1049 static PyObject*
PyTree_cut(PyTree * self,PyObject * args)1050 PyTree_cut(PyTree* self, PyObject* args)
1051 {
1052     int ok = -1;
1053     int nclusters;
1054     const int n = self->n + 1;
1055     Py_buffer indices = {0};
1056 
1057     if (!PyArg_ParseTuple(args, "O&i",
1058                           index_converter, &indices, &nclusters)) goto exit;
1059     if (nclusters < 1) {
1060         PyErr_SetString(PyExc_ValueError,
1061                         "requested number of clusters should be positive");
1062         goto exit;
1063     }
1064     if (nclusters > n) {
1065         PyErr_SetString(PyExc_ValueError,
1066                         "more clusters requested than items available");
1067         goto exit;
1068     }
1069     if (indices.shape[0] != n) {
1070         PyErr_SetString(PyExc_RuntimeError,
1071                         "indices array inconsistent with tree");
1072         goto exit;
1073     }
1074     ok = cuttree(n, self->nodes, nclusters, indices.buf);
1075 
1076 exit:
1077     index_converter(NULL, &indices);
1078     if (ok == -1) return NULL;
1079     if (ok == 0) return PyErr_NoMemory();
1080     Py_INCREF(Py_None);
1081     return Py_None;
1082 }
1083 
1084 static char PyTree_sort__doc__[] =
1085 "mytree.sort(order) -> array\n"
1086 "\n"
1087 "Sort a hierarchical clustering tree by switching the left and right\n"
1088 "subnode of nodes such that the elements in the left-to-right order of the\n"
1089 "tree tend to have increasing order values.\n"
1090 "\n"
1091 "Return the indices of the elements in the left-to-right order in the\n"
1092 "hierarchical clustering tree, such that the element with index indices[i]\n"
1093 "occurs at position i in the dendrogram.\n";
1094 
1095 static PyObject*
PyTree_sort(PyTree * self,PyObject * args)1096 PyTree_sort(PyTree* self, PyObject* args)
1097 {
1098     int ok = -1;
1099     Py_buffer indices = {0};
1100     const int n = self->n;
1101     Py_buffer order = {0};
1102 
1103     if (n == 0) {
1104         PyErr_SetString(PyExc_ValueError, "tree is empty");
1105         return NULL;
1106     }
1107     if (!PyArg_ParseTuple(args, "O&O&",
1108                           index_converter, &indices,
1109                           vector_converter, &order)) goto exit;
1110     if (indices.shape[0] != n + 1) {
1111         PyErr_SetString(PyExc_RuntimeError,
1112                         "indices array inconsistent with tree");
1113         goto exit;
1114     }
1115     if (order.shape[0] != n + 1) {
1116         PyErr_Format(PyExc_ValueError,
1117             "order array has incorrect size %zd (expected %d)",
1118             order.shape[0], n + 1);
1119         goto exit;
1120     }
1121     ok = sorttree(n, self->nodes, order.buf, indices.buf);
1122 exit:
1123     index_converter(NULL, &indices);
1124     vector_converter(NULL, &order);
1125     if (ok == -1) return NULL;
1126     if (ok == 0) return PyErr_NoMemory();
1127     Py_INCREF(Py_None);
1128     return Py_None;
1129 }
1130 
1131 static PyMethodDef PyTree_methods[] = {
1132     {"scale", (PyCFunction)PyTree_scale, METH_NOARGS, PyTree_scale__doc__},
1133     {"cut", (PyCFunction)PyTree_cut, METH_VARARGS, PyTree_cut__doc__},
1134     {"sort", (PyCFunction)PyTree_sort, METH_VARARGS, PyTree_sort__doc__},
1135     {NULL}  /* Sentinel */
1136 };
1137 
1138 static char PyTree_doc[] =
1139 "Tree objects store a hierarchical clustering solution.\n"
1140 "Individual nodes in the tree can be accessed with tree[i], where i is\n"
1141 "an integer. Whereas the tree itself is a read-only object, tree[:]\n"
1142 "returns a list of all the nodes, which can then be modified. To create\n"
1143 "a new Tree from this list, use Tree(list).\n"
1144 "See the description of the Node class for more information.";
1145 
1146 static PyTypeObject PyTreeType = {
1147     PyVarObject_HEAD_INIT(NULL, 0)
1148     "_cluster.Tree",             /* tp_name */
1149     sizeof(PyTree),              /* tp_basicsize */
1150     0,                           /* tp_itemsize */
1151     (destructor)PyTree_dealloc,  /* tp_dealloc */
1152     0,                           /* tp_print */
1153     0,                           /* tp_getattr */
1154     0,                           /* tp_setattr */
1155     0,                           /* tp_compare */
1156     0,                           /* tp_repr */
1157     0,                           /* tp_as_number */
1158     0,                           /* tp_as_sequence */
1159     &PyTree_mapping,             /* tp_as_mapping */
1160     0,                           /* tp_hash */
1161     0,                           /* tp_call */
1162     (reprfunc)PyTree_str,        /* tp_str */
1163     0,                           /* tp_getattro */
1164     0,                           /* tp_setattro */
1165     0,                           /* tp_as_buffer */
1166     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,          /*tp_flags*/
1167     PyTree_doc,                  /* tp_doc */
1168     0,                           /* tp_traverse */
1169     0,                           /* tp_clear */
1170     0,                           /* tp_richcompare */
1171     0,                           /* tp_weaklistoffset */
1172     0,                           /* tp_iter */
1173     0,                           /* tp_iternext */
1174     PyTree_methods,              /* tp_methods */
1175     NULL,                        /* tp_members */
1176     0,                           /* tp_getset */
1177     0,                           /* tp_base */
1178     0,                           /* tp_dict */
1179     0,                           /* tp_descr_get */
1180     0,                           /* tp_descr_set */
1181     0,                           /* tp_dictoffset */
1182     0,                           /* tp_init */
1183     0,                           /* tp_alloc */
1184     (newfunc)PyTree_new,         /* tp_new */
1185 };
1186 
1187 /* ========================================================================= */
1188 /* -- Methods -------------------------------------------------------------- */
1189 /* ========================================================================= */
1190 
1191 /* version */
1192 static char version__doc__[] =
1193 "version() -> string\n"
1194 "\n"
1195 "Return the version number of the C Clustering Library as a string.\n";
1196 
1197 static PyObject*
py_version(PyObject * self)1198 py_version(PyObject* self)
1199 {
1200     return PyUnicode_FromString( CLUSTERVERSION );
1201 }
1202 
1203 /* kcluster */
1204 static char kcluster__doc__[] =
1205 "kcluster(data, nclusters, mask, weight, transpose, npass, method,\n"
1206 "         dist, clusterid) -> None\n"
1207 "\n"
1208 "This function implements k-means clustering.\n"
1209 "\n"
1210 "Arguments:\n"
1211 "\n"
1212 " - data: nrows x ncols array containing the data to be clustered\n"
1213 "\n"
1214 " - nclusters: number of clusters (the 'k' in k-means)\n"
1215 "\n"
1216 " - mask: nrows x ncols array of integers, showing which data are\n"
1217 "   missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
1218 "\n"
1219 " - weight: the weights to be used when calculating distances\n"
1220 " - transpose:\n"
1221 "\n"
1222 "   - if equal to 0, rows are clustered;\n"
1223 "   - if equal to 1, columns are clustered.\n"
1224 "\n"
1225 " - npass: number of times the k-means clustering algorithm is\n"
1226 "   performed, each time with a different (random) initial\n"
1227 "   condition. If npass == 0, then the assignments in clusterid\n"
1228 "   are used as the initial condition.\n"
1229 "\n"
1230 " - method: specifies how the center of a cluster is found:\n"
1231 "\n"
1232 "   - method == 'a': arithmetic mean\n"
1233 "   - method == 'm': median\n"
1234 "\n"
1235 " - dist: specifies the distance function to be used:\n"
1236 "\n"
1237 "   - dist == 'e': Euclidean distance\n"
1238 "   - dist == 'b': City Block distance\n"
1239 "   - dist == 'c': Pearson correlation\n"
1240 "   - dist == 'a': absolute value of the correlation\n"
1241 "   - dist == 'u': uncentered correlation\n"
1242 "   - dist == 'x': absolute uncentered correlation\n"
1243 "   - dist == 's': Spearman's rank correlation\n"
1244 "   - dist == 'k': Kendall's tau\n"
1245 "\n"
1246 " - clusterid: array in which the final clustering solution will be\n"
1247 "   stored (output variable). If npass == 0, then clusterid is also used\n"
1248 "   as an input variable, containing the initial condition from which\n"
1249 "   the EM algorithm should start. In this case, the k-means algorithm\n"
1250 "   is fully deterministic.\n"
1251 "\n";
1252 
1253 static PyObject*
py_kcluster(PyObject * self,PyObject * args,PyObject * keywords)1254 py_kcluster(PyObject* self, PyObject* args, PyObject* keywords)
1255 {
1256     int nclusters = 2;
1257     int nrows, ncols;
1258     int nitems;
1259     int ndata;
1260     Data data = {0};
1261     Mask mask = {0};
1262     Py_buffer weight = {0};
1263     int transpose = 0;
1264     int npass = 1;
1265     char method = 'a';
1266     char dist = 'e';
1267     Py_buffer clusterid = {0};
1268     double error;
1269     int ifound = 0;
1270 
1271     static char* kwlist[] = {"data",
1272                              "nclusters",
1273                              "mask",
1274                              "weight",
1275                              "transpose",
1276                              "npass",
1277                              "method",
1278                              "dist",
1279                              "clusterid",
1280                               NULL};
1281 
1282     if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&iO&O&iiO&O&O&", kwlist,
1283                                      data_converter, &data,
1284                                      &nclusters,
1285                                      mask_converter, &mask,
1286                                      vector_converter, &weight,
1287                                      &transpose,
1288                                      &npass,
1289                                      method_kcluster_converter, &method,
1290                                      distance_converter, &dist,
1291                                      index_converter, &clusterid)) return NULL;
1292     if (!data.values) {
1293         PyErr_SetString(PyExc_RuntimeError, "data is None");
1294         goto exit;
1295     }
1296     if (!mask.values) {
1297         PyErr_SetString(PyExc_RuntimeError, "mask is None");
1298         goto exit;
1299     }
1300     if (data.nrows != mask.view.shape[0] ||
1301         data.ncols != mask.view.shape[1]) {
1302         PyErr_Format(PyExc_ValueError,
1303             "mask has incorrect dimensions %zd x %zd (expected %d x %d)",
1304             mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
1305         goto exit;
1306     }
1307     nrows = data.nrows;
1308     ncols = data.ncols;
1309     ndata = transpose ? nrows : ncols;
1310     nitems = transpose ? ncols : nrows;
1311     if (weight.shape[0] != ndata) {
1312         PyErr_Format(PyExc_ValueError,
1313                      "weight has incorrect size %zd (expected %d)",
1314                      weight.shape[0], ndata);
1315         goto exit;
1316     }
1317     if (nclusters < 1) {
1318         PyErr_SetString(PyExc_ValueError, "nclusters should be positive");
1319         goto exit;
1320     }
1321     if (nitems < nclusters) {
1322         PyErr_SetString(PyExc_ValueError,
1323                         "more clusters than items to be clustered");
1324         goto exit;
1325     }
1326     if (npass < 0) {
1327         PyErr_SetString(PyExc_RuntimeError, "expected a non-negative integer");
1328         goto exit;
1329     }
1330     else if (npass == 0) {
1331         int n = check_clusterid(clusterid, nitems);
1332         if (n == 0) goto exit;
1333         if (n != nclusters) {
1334             PyErr_SetString(PyExc_ValueError,
1335                             "more clusters requested than found in clusterid");
1336             goto exit;
1337         }
1338     }
1339     kcluster(nclusters,
1340              nrows,
1341              ncols,
1342              data.values,
1343              mask.values,
1344              weight.buf,
1345              transpose,
1346              npass,
1347              method,
1348              dist,
1349              clusterid.buf,
1350              &error,
1351              &ifound);
1352 exit:
1353     data_converter(NULL, &data);
1354     mask_converter(NULL, &mask);
1355     vector_converter(NULL, &weight);
1356     index_converter(NULL, &clusterid);
1357     if (ifound) return Py_BuildValue("di", error, ifound);
1358     return NULL;
1359 }
1360 /* end of wrapper for kcluster */
1361 
1362 /* kmedoids */
1363 static char kmedoids__doc__[] =
1364 "kmedoids(distance, nclusters, npass, clusterid) -> error, nfound\n"
1365 "\n"
1366 "This function implements k-medoids clustering.\n"
1367 "\n"
1368 "Arguments:\n"
1369 " - distance: The distance matrix between the elements. There are three\n"
1370 "   ways in which you can pass a distance matrix:\n"
1371 "\n"
1372 "   1. a 2D Numerical Python array (in which only the left-lower\n"
1373 "      part of the array will be accessed);\n"
1374 "   2. a 1D Numerical Python array containing the distances\n"
1375 "      consecutively;\n"
1376 "   3. a list of rows containing the lower-triangular part of\n"
1377 "      the distance matrix.\n"
1378 "\n"
1379 "   Examples are:\n"
1380 "\n"
1381 "       >>> from numpy import array\n"
1382 "       >>> distance = array([[0.0, 1.1, 2.3],\n"
1383 "       ...                   [1.1, 0.0, 4.5],\n"
1384 "       ...                   [2.3, 4.5, 0.0]])\n"
1385 "       >>> # (option #1)\n"
1386 "       >>> distance = array([1.1, 2.3, 4.5])\n"
1387 "       >>> # (option #2)\n"
1388 "       >>> distance = [array([]),\n"
1389 "       ...             array([1.1]),\n"
1390 "       ...             array([2.3, 4.5])]\n"
1391 "       >>> # (option #3)\n"
1392 "\n"
1393 "   These three correspond to the same distance matrix.\n"
1394 "\n"
1395 " - nclusters: number of clusters (the 'k' in k-medoids)\n"
1396 "\n"
1397 " - npass: number of times the k-medoids clustering algorithm is\n"
1398 "   performed, each time with a different (random) initial\n"
1399 "   condition. If npass == 0, then the assignments in clusterid\n"
1400 "   are used as the initial condition.\n"
1401 "\n"
1402 " - clusterid: array in which the final clustering solution will be\n"
1403 "   stored (output variable). If npass == 0, then clusterid is also used\n"
1404 "   as an input variable, containing the initial condition from which\n"
1405 "   the EM algorithm should start. In this case, the k-medoids algorithm\n"
1406 "   is fully deterministic.\n"
1407 "\n"
1408 "Return values:\n"
1409 " - error: the within-cluster sum of distances for the returned k-means\n"
1410 "   clustering solution;\n"
1411 " - nfound: the number of times this solution was found.\n";
1412 
1413 static PyObject*
py_kmedoids(PyObject * self,PyObject * args,PyObject * keywords)1414 py_kmedoids(PyObject* self, PyObject* args, PyObject* keywords)
1415 {
1416     int nclusters = 2;
1417     Distancematrix distances = {0};
1418     Py_buffer clusterid = {0};
1419     int npass = 1;
1420     double error;
1421     int ifound = -2;
1422 
1423     static char* kwlist[] = {"distance",
1424                              "nclusters",
1425                              "npass",
1426                              "clusterid",
1427                               NULL};
1428 
1429     if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&iiO&", kwlist,
1430                                      distancematrix_converter, &distances,
1431                                      &nclusters,
1432                                      &npass,
1433                                      index_converter, &clusterid)) return NULL;
1434     if (npass < 0) {
1435         PyErr_SetString(PyExc_RuntimeError, "expected a non-negative integer");
1436         goto exit;
1437     }
1438     else if (npass == 0) {
1439         int n = check_clusterid(clusterid, distances.n);
1440         if (n == 0) goto exit;
1441         if (n != nclusters) {
1442             PyErr_SetString(PyExc_RuntimeError,
1443                             "more clusters requested than found in clusterid");
1444             goto exit;
1445         }
1446     }
1447     if (nclusters <= 0) {
1448         PyErr_SetString(PyExc_ValueError,
1449                         "nclusters should be a positive integer");
1450         goto exit;
1451     }
1452     if (distances.n < nclusters) {
1453         PyErr_SetString(PyExc_ValueError,
1454                         "more clusters requested than items to be clustered");
1455         goto exit;
1456     }
1457     kmedoids(nclusters,
1458              distances.n,
1459              distances.values,
1460              npass,
1461              clusterid.buf,
1462              &error,
1463              &ifound);
1464 
1465 exit:
1466     distancematrix_converter(NULL, &distances);
1467     index_converter(NULL, &clusterid);
1468     switch (ifound) {
1469         case -2:
1470             return NULL;
1471         case -1:
1472             return PyErr_NoMemory();
1473         case 0: /* should not occur */
1474             PyErr_SetString(PyExc_RuntimeError,
1475                         "error in kmedoids input arguments");
1476             return NULL;
1477         default:
1478             return Py_BuildValue("di", error, ifound);
1479     }
1480 }
1481 /* end of wrapper for kmedoids */
1482 
1483 /* treecluster */
1484 static char treecluster__doc__[] =
1485 "treecluster(tree, data, mask, weight, transpose, dist, method,\n"
1486 "            distancematrix) -> None\n"
1487 "\n"
1488 "This function implements the pairwise single, complete, centroid, and\n"
1489 "average linkage hierarchical clustering methods.\n"
1490 "\n"
1491 "Arguments:\n"
1492 " - tree: an empty Tree object; its nodes will be filled by treecluster\n"
1493 "   to describe the hierarchical clustering result. See the description\n"
1494 "   of the Tree class for more information.\n"
1495 "\n"
1496 " - data: nrows x ncols array containing the data to be clustered.\n"
1497 "   Either data or distancematrix (see below) should be None.\n"
1498 "\n"
1499 " - mask: nrows x ncols array of integers, showing which data are\n"
1500 "   missing. If mask[i,j]==0, then data[i,j] is missing.\n"
1501 "\n"
1502 " - weight: the weights to be used when calculating distances.\n"
1503 "\n"
1504 " - transpose:\n"
1505 "\n"
1506 "   - if equal to 0, rows are clustered;\n"
1507 "   - if equal to 1, columns are clustered.\n"
1508 "\n"
1509 " - dist: specifies the distance function to be used:\n"
1510 "\n"
1511 "   - dist == 'e': Euclidean distance\n"
1512 "   - dist == 'b': City Block distance\n"
1513 "   - dist == 'c': Pearson correlation\n"
1514 "   - dist == 'a': absolute value of the correlation\n"
1515 "   - dist == 'u': uncentered correlation\n"
1516 "   - dist == 'x': absolute uncentered correlation\n"
1517 "   - dist == 's': Spearman's rank correlation\n"
1518 "   - dist == 'k': Kendall's tau\n"
1519 "\n"
1520 " - method: specifies which linkage method is used:\n"
1521 "\n"
1522 "   - method == 's': Single pairwise linkage\n"
1523 "   - method == 'm': Complete (maximum) pairwise linkage (default)\n"
1524 "   - method == 'c': Centroid linkage\n"
1525 "   - method == 'a': Average pairwise linkage\n"
1526 "\n"
1527 " - distancematrix:  The distance matrix between the elements.\n"
1528 "   Either data (see above) or distancematrix should be None.\n"
1529 "   There are three ways in which you can pass a distance matrix:\n"
1530 "\n"
1531 "   1. a 2D Numerical Python array (in which only the left-lower\n"
1532 "      part of the array will be accessed);\n"
1533 "   2. a 1D Numerical Python array containing the distances\n"
1534 "      consecutively;\n"
1535 "   3. a list of rows containing the lower-triangular part of\n"
1536 "      the distance matrix.\n"
1537 "\n"
1538 "   Examples are:\n"
1539 "\n"
1540 "       >>> from numpy import array\n"
1541 "       >>> distance = array([[0.0, 1.1, 2.3],\n"
1542 "       ...                   [1.1, 0.0, 4.5],\n"
1543 "       ...                   [2.3, 4.5, 0.0]])\n"
1544 "       >>> # option 1.\n"
1545 "       >>> distance = array([1.1, 2.3, 4.5])\n"
1546 "       >>> # option 2.\n"
1547 "       >>> distance = [array([]),\n"
1548 "       ...             array([1.1]),\n"
1549 "       ...             array([2.3, 4.5])]\n"
1550 "       >>> # option 3.\n"
1551 "\n"
1552 "   These three correspond to the same distance matrix.\n"
1553 "\n"
1554 "   PLEASE NOTE:\n"
1555 "   As the treecluster routine may shuffle the values in the\n"
1556 "   distance matrix as part of the clustering algorithm, be sure\n"
1557 "   to save this array in a different variable before calling\n"
1558 "   treecluster if you need it later.\n"
1559 "\n"
1560 "Either data or distancematrix should be None. If distancematrix is None,\n"
1561 "the hierarchical clustering solution is calculated from the values in\n"
1562 "the argument data. Instead if data is None, the hierarchical clustering\n"
1563 "solution is calculated from the distance matrix.\n"
1564 "Pairwise centroid-linkage clustering can be calculated only from the data\n"
1565 "and not from the distance matrix.\n"
1566 "Pairwise single-, maximum-, and average-linkage clustering can be\n"
1567 "calculated from either the data or from the distance matrix.\n";
1568 
1569 static PyObject*
py_treecluster(PyObject * self,PyObject * args,PyObject * keywords)1570 py_treecluster(PyObject* self, PyObject* args, PyObject* keywords)
1571 {
1572     Data data = {0};
1573     Mask mask = {0};
1574     Py_buffer weight = {0};
1575     int transpose = 0;
1576     char dist = 'e';
1577     char method = 'm';
1578     Distancematrix distances = {0};
1579     PyTree* tree = NULL;
1580     Node* nodes;
1581     int nitems;
1582 
1583     static char* kwlist[] = {"tree",
1584                              "data",
1585                              "mask",
1586                              "weight",
1587                              "transpose",
1588                              "method",
1589                              "dist",
1590                              "distancematrix",
1591                               NULL };
1592 
1593     if (!PyArg_ParseTupleAndKeywords(args, keywords, "O!O&O&O&iO&O&O&", kwlist,
1594                                      &PyTreeType, &tree,
1595                                      data_converter, &data,
1596                                      mask_converter, &mask,
1597                                      vector_none_converter, &weight,
1598                                      &transpose,
1599                                      method_treecluster_converter, &method,
1600                                      distance_converter, &dist,
1601                                      distancematrix_converter, &distances))
1602         return NULL;
1603 
1604     if (tree->n != 0) {
1605         PyErr_SetString(PyExc_RuntimeError, "expected an empty tree");
1606         goto exit;
1607     }
1608     if (data.values != NULL && distances.values != NULL) {
1609         PyErr_SetString(PyExc_ValueError,
1610             "use either data or distancematrix, do not use both");
1611         goto exit;
1612     }
1613     if (data.values == NULL && distances.values == NULL) {
1614         PyErr_SetString(PyExc_ValueError,
1615                         "neither data nor distancematrix was given");
1616         goto exit;
1617     }
1618 
1619     if (data.values) /* use the values in data, not the distance matrix */ {
1620         int nrows;
1621         int ncols;
1622         int ndata;
1623 
1624         if (!mask.values) {
1625             PyErr_SetString(PyExc_RuntimeError, "mask is None");
1626             goto exit;
1627         }
1628         if (!weight.buf) {
1629             PyErr_SetString(PyExc_RuntimeError, "weight is None");
1630             goto exit;
1631         }
1632         nrows = data.nrows;
1633         ncols = data.ncols;
1634         if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
1635             PyErr_Format(PyExc_ValueError,
1636                 "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
1637                 mask.view.shape[0], mask.view.shape[1],
1638                 data.nrows, data.ncols);
1639             goto exit;
1640         }
1641         ndata = transpose ? nrows : ncols;
1642         nitems = transpose ? ncols : nrows;
1643         if (weight.shape[0] != ndata) {
1644             PyErr_Format(PyExc_RuntimeError,
1645                          "weight has incorrect size %zd (expected %d)",
1646                          weight.shape[0], ndata);
1647             goto exit;
1648         }
1649 
1650         nodes = treecluster(nrows,
1651                             ncols,
1652                             data.values,
1653                             mask.values,
1654                             weight.buf,
1655                             transpose,
1656                             dist,
1657                             method,
1658                             NULL);
1659     }
1660     else { /* use the distance matrix instead of the values in data */
1661         if (!strchr("sma", method)) {
1662             PyErr_SetString(PyExc_ValueError,
1663                             "argument method should be 's', 'm', or 'a' "
1664                             "when specifying the distance matrix");
1665             goto exit;
1666         }
1667         nitems = distances.n;
1668         nodes = treecluster(nitems,
1669                             nitems,
1670                             0,
1671                             0,
1672                             0,
1673                             transpose,
1674                             dist,
1675                             method,
1676                             distances.values);
1677     }
1678 
1679     if (!nodes) {
1680         PyErr_NoMemory();
1681         goto exit;
1682     }
1683     tree->nodes = nodes;
1684     tree->n = nitems-1;
1685 
1686 exit:
1687     data_converter(NULL, &data);
1688     mask_converter(NULL, &mask);
1689     vector_none_converter(NULL, &weight);
1690     distancematrix_converter(NULL, &distances);
1691     if (tree == NULL || tree->n == 0) return NULL;
1692     Py_INCREF(Py_None);
1693     return Py_None;
1694 }
1695 /* end of wrapper for treecluster */
1696 
1697 /* somcluster */
1698 static char somcluster__doc__[] =
1699 "somcluster(clusterid, celldata, data, mask, weight, transpose,\n"
1700 "           inittau, niter, dist) -> None\n"
1701 "\n"
1702 "This function implements a self-organizing map on a rectangular grid.\n"
1703 "\n"
1704 "Arguments:\n"
1705 " - clusterid: array with two columns, with the number of rows equal\n"
1706 "   to the number of items being clustered. Upon return, each row\n"
1707 "   in the array contains the x and y coordinates of the cell in the\n"
1708 "   the rectangular SOM grid to which the item was assigned.\n"
1709 "\n"
1710 " - celldata: array with dimensions nxgrid x nygrid x number of columns\n"
1711 "   if rows are being clustered, or nxgrid x nygrid x number of rows\n"
1712 "   if columns are being clustered, where nxgrid is the horizontal\n"
1713 "   dimension of the rectangular SOM map and nygrid is the vertical\n"
1714 "   dimension of the rectangular SOM map.\n"
1715 "   Upon return, each element [ix, iy] of this array contains the\n"
1716 "   data for the centroid of the cluster in the SOM grid cell with\n"
1717 "   coordinates [ix, iy].\n"
1718 "\n"
1719 " - data: nrows x ncols array containing the data to be clustered.\n"
1720 "\n"
1721 " - mask: nrows x ncols array of integers, showing which data are\n"
1722 "   missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
1723 "\n"
1724 " - weight: the weights to be used when calculating distances\n"
1725 "\n"
1726 " - transpose:\n"
1727 "\n"
1728 "   - if equal to 0, rows are clustered;\n"
1729 "   - if equal to 1, columns are clustered.\n"
1730 "\n"
1731 " - inittau: the initial value of tau (the neighborbood function)\n"
1732 "\n"
1733 " - niter: the number of iterations\n"
1734 "\n"
1735 " - dist: specifies the distance function to be used:\n"
1736 "\n"
1737 "   - dist == 'e': Euclidean distance\n"
1738 "   - dist == 'b': City Block distance\n"
1739 "   - dist == 'c': Pearson correlation\n"
1740 "   - dist == 'a': absolute value of the correlation\n"
1741 "   - dist == 'u': uncentered correlation\n"
1742 "   - dist == 'x': absolute uncentered correlation\n"
1743 "   - dist == 's': Spearman's rank correlation\n"
1744 "   - dist == 'k': Kendall's tau\n";
1745 
1746 static PyObject*
py_somcluster(PyObject * self,PyObject * args,PyObject * keywords)1747 py_somcluster(PyObject* self, PyObject* args, PyObject* keywords)
1748 {
1749     int nrows;
1750     int ncols;
1751     int ndata;
1752     Data data = {0};
1753     Mask mask = {0};
1754     Py_buffer weight = {0};
1755     int transpose = 0;
1756     double inittau = 0.02;
1757     int niter = 1;
1758     char dist = 'e';
1759     Py_buffer indices = {0};
1760     Celldata celldata = {0};
1761     PyObject* result = NULL;
1762 
1763     static char* kwlist[] = {"clusterids",
1764                              "celldata",
1765                              "data",
1766                              "mask",
1767                              "weight",
1768                              "transpose",
1769                              "inittau",
1770                              "niter",
1771                              "dist",
1772                              NULL};
1773 
1774     if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&O&idiO&", kwlist,
1775                                      index2d_converter, &indices,
1776                                      celldata_converter, &celldata,
1777                                      data_converter, &data,
1778                                      mask_converter, &mask,
1779                                      vector_converter, &weight,
1780                                      &transpose,
1781                                      &inittau,
1782                                      &niter,
1783                                      distance_converter, &dist)) return NULL;
1784     if (niter < 1) {
1785         PyErr_SetString(PyExc_ValueError,
1786                       "number of iterations (niter) should be positive");
1787         goto exit;
1788     }
1789     if (!data.values) {
1790         PyErr_SetString(PyExc_RuntimeError, "data is None");
1791         goto exit;
1792     }
1793     if (!mask.values) {
1794         PyErr_SetString(PyExc_RuntimeError, "mask is None");
1795         goto exit;
1796     }
1797     nrows = data.nrows;
1798     ncols = data.ncols;
1799     if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
1800         PyErr_Format(PyExc_ValueError,
1801             "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
1802             mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
1803         goto exit;
1804     }
1805     ndata = transpose ? nrows : ncols;
1806     if (weight.shape[0] != ndata) {
1807         PyErr_Format(PyExc_RuntimeError,
1808                      "weight has incorrect size %zd (expected %d)",
1809                      weight.shape[0], ndata);
1810         goto exit;
1811     }
1812     if (celldata.nz != ndata) {
1813         PyErr_Format(PyExc_RuntimeError,
1814                     "the celldata array size is not consistent with the data "
1815                     "(last dimension is %d; expected %d)", celldata.nz, ndata);
1816         goto exit;
1817     }
1818     somcluster(nrows,
1819                ncols,
1820                data.values,
1821                mask.values,
1822                weight.buf,
1823                transpose,
1824                celldata.nx,
1825                celldata.ny,
1826                inittau,
1827                niter,
1828                dist,
1829                celldata.values,
1830                indices.buf);
1831     Py_INCREF(Py_None);
1832     result = Py_None;
1833 
1834 exit:
1835     data_converter(NULL, &data);
1836     vector_converter(NULL, &weight);
1837     index2d_converter(NULL, &indices);
1838     celldata_converter(NULL, &celldata);
1839     return result;
1840 }
1841 /* end of wrapper for somcluster */
1842 
1843 /* clusterdistance */
1844 static char clusterdistance__doc__[] =
1845 "clusterdistance(data, mask, weight, index1, index2, dist, method,\n"
1846 "                transpose) -> distance between two clusters\n"
1847 "\n"
1848 "Arguments:\n"
1849 "\n"
1850 " - data: nrows x ncols array containing the data values.\n"
1851 "\n"
1852 " - mask: nrows x ncols array of integers, showing which data are\n"
1853 "   missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
1854 "\n"
1855 " - weight: the weights to be used when calculating distances\n"
1856 "\n"
1857 " - index1: 1D array identifying which items belong to the first\n"
1858 "   cluster.\n"
1859 "\n"
1860 " - index2: 1D array identifying which items belong to the second\n"
1861 "   cluster.\n"
1862 "\n"
1863 " - dist: specifies the distance function to be used:\n"
1864 "\n"
1865 "   - dist == 'e': Euclidean distance\n"
1866 "   - dist == 'b': City Block distance\n"
1867 "   - dist == 'c': Pearson correlation\n"
1868 "   - dist == 'a': absolute value of the correlation\n"
1869 "   - dist == 'u': uncentered correlation\n"
1870 "   - dist == 'x': absolute uncentered correlation\n"
1871 "   - dist == 's': Spearman's rank correlation\n"
1872 "   - dist == 'k': Kendall's tau\n"
1873 "\n"
1874 " - method: specifies how the distance between two clusters is defined:\n"
1875 "\n"
1876 "   - method == 'a': the distance between the arithmetic means of the\n"
1877 "     two clusters\n"
1878 "   - method == 'm': the distance between the medians of the two\n"
1879 "     clusters\n"
1880 "   - method == 's': the smallest pairwise distance between members\n"
1881 "     of the two clusters\n"
1882 "   - method == 'x': the largest pairwise distance between members of\n"
1883 "     the two clusters\n"
1884 "   - method == 'v': average of the pairwise distances between\n"
1885 "     members of the clusters\n"
1886 "\n"
1887 " - transpose:\n"
1888 "\n"
1889 "   - if equal to 0: clusters of rows are considered;\n"
1890 "   - if equal to 1: clusters of columns are considered.\n"
1891 "\n";
1892 
1893 static PyObject*
py_clusterdistance(PyObject * self,PyObject * args,PyObject * keywords)1894 py_clusterdistance(PyObject* self, PyObject* args, PyObject* keywords)
1895 {
1896     double distance;
1897     int nrows;
1898     int ncols;
1899     int ndata;
1900     Data data = {0};
1901     Mask mask = {0};
1902     Py_buffer weight = {0};
1903     char dist = 'e';
1904     char method = 'a';
1905     int transpose = 0;
1906     Py_buffer index1 = {0};
1907     Py_buffer index2 = {0};
1908     PyObject* result = NULL;
1909 
1910     static char* kwlist[] = {"data",
1911                              "mask",
1912                              "weight",
1913                              "index1",
1914                              "index2",
1915                              "method",
1916                              "dist",
1917                              "transpose",
1918                               NULL};
1919 
1920     if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&O&O&O&i", kwlist,
1921                                      data_converter, &data,
1922                                      mask_converter, &mask,
1923                                      vector_converter, &weight,
1924                                      index_converter, &index1,
1925                                      index_converter, &index2,
1926                                      method_clusterdistance_converter, &method,
1927                                      distance_converter, &dist,
1928                                      &transpose)) return NULL;
1929     if (!data.values) {
1930         PyErr_SetString(PyExc_RuntimeError, "data is None");
1931         goto exit;
1932     }
1933     if (!mask.values) {
1934         PyErr_SetString(PyExc_RuntimeError, "mask is None");
1935         goto exit;
1936     }
1937     nrows = data.nrows;
1938     ncols = data.ncols;
1939     ndata = transpose ? nrows : ncols;
1940     if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
1941         PyErr_Format(PyExc_ValueError,
1942             "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
1943             mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
1944         goto exit;
1945     }
1946     if (weight.shape[0] != ndata) {
1947         PyErr_Format(PyExc_RuntimeError,
1948                      "weight has incorrect size %zd (expected %d)",
1949                      weight.shape[0], ndata);
1950         goto exit;
1951     }
1952 
1953     distance = clusterdistance(nrows,
1954                                ncols,
1955                                data.values,
1956                                mask.values,
1957                                weight.buf,
1958                                (int) index1.shape[0],
1959                                (int) index2.shape[0],
1960                                index1.buf,
1961                                index2.buf,
1962                                dist,
1963                                method,
1964                                transpose);
1965 
1966     if (distance < -0.5) /* Actually -1.0; avoiding roundoff errors */
1967         PyErr_SetString(PyExc_IndexError, "index out of range");
1968     else
1969         result = PyFloat_FromDouble(distance);
1970 exit:
1971     data_converter(NULL, &data);
1972     mask_converter(NULL, &mask);
1973     vector_converter(NULL, &weight);
1974     index_converter(NULL, &index1);
1975     index_converter(NULL, &index2);
1976     return result;
1977 }
1978 /* end of wrapper for clusterdistance */
1979 
1980 /* clustercentroids */
1981 static char clustercentroids__doc__[] =
1982 "clustercentroids(data, mask, clusterid, method, transpose) -> cdata, cmask\n"
1983 "\n"
1984 "The clustercentroids routine calculates the cluster centroids, given to\n"
1985 "which cluster each element belongs. The centroid is defined as either\n"
1986 "the mean or the median over all elements for each dimension.\n"
1987 "\n"
1988 "Arguments:\n"
1989 " - data: nrows x ncols array containing the data values.\n"
1990 "\n"
1991 " - mask: nrows x ncols array of integers, showing which data are\n"
1992 "   missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
1993 "\n"
1994 " - clusterid: array containing the cluster number for each item.\n"
1995 "   The cluster number should be non-negative.\n"
1996 "\n"
1997 " - method: specifies whether the centroid is calculated from the\n"
1998 "   arithmetic mean (method == 'a', default) or the median\n"
1999 "   (method == 'm') over each dimension.\n"
2000 "\n"
2001 " - transpose: if equal to 0, row clusters are considered;\n"
2002 "   if equal to 1, column clusters are considered.\n"
2003 "\n"
2004 " - cdata: 2D array containing, upon return, the cluster centroids.\n"
2005 "   If transpose == 0, then the dimensions of cdata should be\n"
2006 "   nclusters x ncols.\n"
2007 "   If transpose == 1, then the dimensions of cdata should be \n"
2008 "   nrows x nclusters.\n"
2009 "\n"
2010 " - cmask: 2D array of integers describing, upon return,  which elements\n"
2011 "   in cdata, if any, are missing.\n";
2012 
2013 static PyObject*
py_clustercentroids(PyObject * self,PyObject * args,PyObject * keywords)2014 py_clustercentroids(PyObject* self, PyObject* args, PyObject* keywords)
2015 {
2016     int nrows;
2017     int ncols;
2018     int nclusters;
2019     Data data = {0};
2020     Mask mask = {0};
2021     Data cdata = {0};
2022     Mask cmask = {0};
2023     Py_buffer clusterid = {0};
2024     char method = 'a';
2025     int transpose = 0;
2026     int ok = -1;
2027 
2028     static char* kwlist[] = {"data",
2029                              "mask",
2030                              "clusterid",
2031                              "method",
2032                              "transpose",
2033                              "cdata",
2034                              "cmask",
2035                               NULL };
2036 
2037     if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&iO&O&", kwlist,
2038                                      data_converter, &data,
2039                                      mask_converter, &mask,
2040                                      index_converter, &clusterid,
2041                                      method_kcluster_converter, &method,
2042                                      &transpose,
2043                                      data_converter, &cdata,
2044                                      mask_converter, &cmask)) return NULL;
2045     if (!data.values) {
2046         PyErr_SetString(PyExc_RuntimeError, "data is None");
2047         goto exit;
2048     }
2049     if (!mask.values) {
2050         PyErr_SetString(PyExc_RuntimeError, "mask is None");
2051         goto exit;
2052     }
2053     nrows = data.nrows;
2054     ncols = data.ncols;
2055     if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
2056         PyErr_Format(PyExc_ValueError,
2057             "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
2058             mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
2059         goto exit;
2060     }
2061     if (transpose == 0) {
2062         nclusters = check_clusterid(clusterid, nrows);
2063         nrows = nclusters;
2064     }
2065     else {
2066         nclusters = check_clusterid(clusterid, ncols);
2067         ncols = nclusters;
2068     }
2069     if (nclusters == 0) goto exit;
2070     if (cdata.nrows != nrows) {
2071         PyErr_Format(PyExc_RuntimeError,
2072                      "cdata has incorrect number of rows (%d, expected %d)",
2073                      cdata.nrows, nrows);
2074         goto exit;
2075     }
2076     if (cdata.ncols != ncols) {
2077         PyErr_Format(PyExc_RuntimeError,
2078                      "cdata has incorrect number of columns (%d, expected %d)",
2079                      cdata.ncols, ncols);
2080         goto exit;
2081     }
2082     if (cmask.view.shape[0] != nrows) {
2083         PyErr_Format(PyExc_RuntimeError,
2084                      "cmask has incorrect number of rows (%zd, expected %d)",
2085                      cmask.view.shape[0], nrows);
2086         goto exit;
2087     }
2088     if (cmask.view.shape[1] != ncols) {
2089         PyErr_Format(PyExc_RuntimeError,
2090                      "cmask has incorrect number of columns "
2091                      "(%zd, expected %d)", cmask.view.shape[1], ncols);
2092         goto exit;
2093     }
2094     ok = getclustercentroids(nclusters,
2095                              data.nrows,
2096                              data.ncols,
2097                              data.values,
2098                              mask.values,
2099                              clusterid.buf,
2100                              cdata.values,
2101                              cmask.values,
2102                              transpose,
2103                              method);
2104 exit:
2105     data_converter(NULL, &data);
2106     mask_converter(NULL, &mask);
2107     data_converter(NULL, &cdata);
2108     mask_converter(NULL, &cmask);
2109     index_converter(NULL, &clusterid);
2110     if (ok == -1) return NULL;
2111     if (ok == 0) return PyErr_NoMemory();
2112     Py_INCREF(Py_None);
2113     return Py_None;
2114 }
2115 /* end of wrapper for clustercentroids */
2116 
2117 /* distancematrix */
2118 static char distancematrix__doc__[] =
2119 "distancematrix(data, mask, weight, transpose, dist, distancematrix)\n"
2120 "              -> None\n"
2121 "\n"
2122 "This function calculuates the distance matrix between the data values.\n"
2123 "\n"
2124 "Arguments:\n"
2125 "\n"
2126 " - data: nrows x ncols array containing the data values.\n"
2127 "\n"
2128 " - mask: nrows x ncols array of integers, showing which data are\n"
2129 "   missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
2130 "\n"
2131 " - weight: the weights to be used when calculating distances.\n"
2132 "\n"
2133 " - transpose: if equal to 0: the distances between rows are\n"
2134 "   calculated;\n"
2135 "   if equal to 1, the distances between columns are calculated.\n"
2136 "\n"
2137 " - dist: specifies the distance function to be used:\n"
2138 "\n"
2139 "   - dist == 'e': Euclidean distance\n"
2140 "   - dist == 'b': City Block distance\n"
2141 "   - dist == 'c': Pearson correlation\n"
2142 "   - dist == 'a': absolute value of the correlation\n"
2143 "   - dist == 'u': uncentered correlation\n"
2144 "   - dist == 'x': absolute uncentered correlation\n"
2145 "   - dist == 's': Spearman's rank correlation\n"
2146 "   - dist == 'k': Kendall's tau\n"
2147 "\n"
2148 " - distancematrix: Upon return, the distance matrix as a list of 1D\n"
2149 "   arrays. The number of columns in each row is equal to the row number\n"
2150 "   (i.e., len(distancematrix[i]) == i).\n"
2151 "   An example of the return value is:\n"
2152 "\n"
2153 "    matrix = [[],\n"
2154 "              array([1.]),\n"
2155 "              array([7., 3.]),\n"
2156 "              array([4., 2., 6.])]\n"
2157 "\n"
2158 "This corresponds to the distance matrix:\n"
2159 "\n"
2160 "    [0.\t1.\t7.\t4.]\n"
2161 "    [1.\t0.\t3.\t2.]\n"
2162 "    [7.\t3.\t0.\t6.]\n"
2163 "    [4.\t2.\t6.\t0.]\n";
2164 
2165 static PyObject*
py_distancematrix(PyObject * self,PyObject * args,PyObject * keywords)2166 py_distancematrix(PyObject* self, PyObject* args, PyObject* keywords)
2167 {
2168     PyObject* list;
2169     Distancematrix distances = {0};
2170     Data data = {0};
2171     Mask mask = {0};
2172     Py_buffer weight = {0};
2173     int transpose = 0;
2174     char dist = 'e';
2175     int nrows, ncols, ndata;
2176     PyObject* result = NULL;
2177 
2178     /* -- Read the input variables --------------------------------------- */
2179     static char* kwlist[] = {"data",
2180                              "mask",
2181                              "weight",
2182                              "transpose",
2183                              "dist",
2184                              "distancematrix",
2185                               NULL};
2186 
2187     if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&iO&O!", kwlist,
2188                                      data_converter, &data,
2189                                      mask_converter, &mask,
2190                                      vector_converter, &weight,
2191                                      &transpose,
2192                                      distance_converter, &dist,
2193                                      &PyList_Type, &list)) return NULL;
2194     if (!data.values) {
2195         PyErr_SetString(PyExc_RuntimeError, "data is None");
2196         goto exit;
2197     }
2198     if (!mask.values) {
2199         PyErr_SetString(PyExc_RuntimeError, "mask is None");
2200         goto exit;
2201     }
2202     nrows = data.nrows;
2203     ncols = data.ncols;
2204     if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
2205         PyErr_Format(PyExc_ValueError,
2206             "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
2207             mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
2208         goto exit;
2209     }
2210     ndata = (transpose == 0) ? ncols : nrows;
2211     if (weight.shape[0] != ndata) {
2212         PyErr_Format(PyExc_ValueError,
2213                      "weight has incorrect size %zd (expected %d)",
2214                      weight.shape[0], ndata);
2215         goto exit;
2216     }
2217     if (_convert_list_to_distancematrix(list, &distances) == 0) goto exit;
2218 
2219     distancematrix(nrows,
2220                    ncols,
2221                    data.values,
2222                    mask.values,
2223                    weight.buf,
2224                    dist,
2225                    transpose,
2226                    distances.values);
2227 
2228     Py_INCREF(Py_None);
2229     result = Py_None;
2230 exit:
2231     data_converter(NULL, &data);
2232     mask_converter(NULL, &mask);
2233     vector_converter(NULL, &weight);
2234     distancematrix_converter(NULL, &distances);
2235     return result;
2236 }
2237 /* end of wrapper for distancematrix */
2238 
2239 /* pca */
2240 static char pca__doc__[] =
2241 "pca(data, columnmean, coordinates, pc, eigenvalues) -> None\n"
2242 "\n"
2243 "This function calculates the principal component decomposition\n"
2244 "of the values in data.\n"
2245 "\n"
2246 "Arguments:\n"
2247 "\n"
2248 " - data: nrows x ncols array containing the data values.\n"
2249 "\n"
2250 " - columnmean: array of size nrows) in which the mean of each column\n"
2251 "               will be sorted.\n"
2252 "\n"
2253 " - coordinates: nrows x nmin array in which the coordinates of the\n"
2254 "                data along the principal components will be stored;\n"
2255 "                nmin is min(nrows, ncols).\n"
2256 "\n"
2257 " - pc : the principal components as an nmin x ncols array, where nmin\n"
2258 "        is min(nrows, ncols).\n"
2259 "\n"
2260 " - eigenvalues: array of size min(nrows, ncols), in which the\n"
2261 "                eigenvalues will be stored, sorted by the magnitude\n"
2262 "                of the eigenvalues, with the largest eigenvalues\n"
2263 "                appearing first.\n"
2264 "\n"
2265 "Adding the column means to the dot product of the coordinates and the\n"
2266 "principal components, i.e.\n"
2267 "\n"
2268 "   columnmean + dot(coordinates, pc)\n"
2269 "\n"
2270 "recreates the data matrix.\n";
2271 
2272 static PyObject*
py_pca(PyObject * self,PyObject * args)2273 py_pca(PyObject* self, PyObject* args)
2274 {
2275     Py_buffer eigenvalues = {0};
2276     double** u;
2277     double** v;
2278     Data data = {0};
2279     Data pc = {0};
2280     Data coordinates = {0};
2281     Py_buffer mean = {0};
2282     int nrows, ncols;
2283     int nmin;
2284     int error = -2;
2285     double* p;
2286     double** values;
2287     int i, j;
2288 
2289     if (!PyArg_ParseTuple(args, "O&O&O&O&O&",
2290                           data_converter, &data,
2291                           vector_converter, &mean,
2292                           data_converter, &coordinates,
2293                           data_converter, &pc,
2294                           vector_converter, &eigenvalues)) return NULL;
2295 
2296     values = data.values;
2297     if (!values) {
2298         PyErr_SetString(PyExc_RuntimeError, "data is None");
2299         goto exit;
2300     }
2301     nrows = data.nrows;
2302     ncols = data.ncols;
2303     if (mean.shape[0] != ncols) {
2304         PyErr_Format(PyExc_RuntimeError,
2305                      "columnmean has inconsistent size %zd (expected %d)",
2306                      mean.shape[0], ncols);
2307         goto exit;
2308     }
2309     nmin = nrows < ncols ? nrows : ncols;
2310     if (pc.nrows != nmin || pc.ncols != ncols) {
2311         PyErr_Format(PyExc_RuntimeError,
2312                      "pc has inconsistent size %zd x %zd (expected %d x %d)",
2313                      mean.shape[0], mean.shape[1], nmin, ncols);
2314         goto exit;
2315     }
2316     if (coordinates.nrows != nrows || coordinates.ncols != nmin) {
2317         PyErr_Format(PyExc_RuntimeError,
2318             "coordinates has inconsistent size %zd x %zd (expected %d x %d)",
2319             mean.shape[0], mean.shape[1], nrows, nmin);
2320         goto exit;
2321     }
2322     if (nrows >= ncols) {
2323         u = coordinates.values;
2324         v = pc.values;
2325     }
2326     else { /* nrows < ncolums */
2327         u = pc.values;
2328         v = coordinates.values;
2329     }
2330     /* -- Calculate the mean of each column ------------------------------ */
2331     p = mean.buf;
2332     for (j = 0; j < ncols; j++) {
2333         p[j] = 0.0;
2334         for (i = 0; i < nrows; i++) p[j] += values[i][j];
2335         p[j] /= nrows;
2336     }
2337     /* --   Subtract the mean of each column ----------------------------- */
2338     for (i = 0; i < nrows; i++)
2339         for (j = 0; j < ncols; j++)
2340             u[i][j] = values[i][j] - p[j];
2341     /* -- Perform the principal component analysis ----------------------- */
2342     error = pca(nrows, ncols, u, v, eigenvalues.buf);
2343     /* ------------------------------------------------------------------- */
2344 exit:
2345     data_converter(NULL, &data);
2346     vector_converter(NULL, &mean);
2347     data_converter(NULL, &pc);
2348     data_converter(NULL, &coordinates);
2349     vector_converter(NULL, &eigenvalues);
2350     if (error == 0) {
2351         Py_INCREF(Py_None);
2352         return Py_None;
2353     }
2354     if (error == -1) return PyErr_NoMemory();
2355     else if (error > 0)
2356         PyErr_SetString(PyExc_RuntimeError,
2357             "Singular value decomposition failed to converge");
2358     return NULL;
2359 }
2360 /* end of wrapper for pca */
2361 
2362 /* ========================================================================= */
2363 /* -- The methods table ---------------------------------------------------- */
2364 /* ========================================================================= */
2365 
2366 
2367 static struct PyMethodDef cluster_methods[] = {
2368     {"version", (PyCFunction) py_version, METH_NOARGS, version__doc__},
2369     {"kcluster",
2370      (PyCFunction) py_kcluster,
2371      METH_VARARGS | METH_KEYWORDS,
2372      kcluster__doc__
2373     },
2374     {"kmedoids",
2375      (PyCFunction) py_kmedoids,
2376      METH_VARARGS | METH_KEYWORDS,
2377      kmedoids__doc__
2378     },
2379     {"treecluster",
2380      (PyCFunction) py_treecluster,
2381      METH_VARARGS | METH_KEYWORDS,
2382      treecluster__doc__
2383     },
2384     {"somcluster",
2385      (PyCFunction) py_somcluster,
2386      METH_VARARGS | METH_KEYWORDS,
2387      somcluster__doc__
2388     },
2389     {"clusterdistance",
2390      (PyCFunction) py_clusterdistance,
2391      METH_VARARGS | METH_KEYWORDS,
2392      clusterdistance__doc__
2393     },
2394     {"clustercentroids",
2395      (PyCFunction) py_clustercentroids,
2396      METH_VARARGS | METH_KEYWORDS,
2397      clustercentroids__doc__
2398     },
2399     {"distancematrix",
2400      (PyCFunction) py_distancematrix,
2401      METH_VARARGS | METH_KEYWORDS,
2402      distancematrix__doc__
2403     },
2404     {"pca",
2405      (PyCFunction) py_pca,
2406      METH_VARARGS | METH_KEYWORDS,
2407      pca__doc__
2408     },
2409     {NULL, NULL, 0, NULL} /* sentinel */
2410 };
2411 
2412 /* ========================================================================= */
2413 /* -- Initialization ------------------------------------------------------- */
2414 /* ========================================================================= */
2415 
2416 static struct PyModuleDef moduledef = {
2417     PyModuleDef_HEAD_INIT,
2418     "_cluster",
2419     "C Clustering Library",
2420     -1,
2421     cluster_methods,
2422     NULL,
2423     NULL,
2424     NULL,
2425     NULL
2426 };
2427 
2428 PyObject *
PyInit__cluster(void)2429 PyInit__cluster(void)
2430 {
2431     PyObject *module;
2432 
2433     PyNodeType.tp_new = PyType_GenericNew;
2434     if (PyType_Ready(&PyNodeType) < 0)
2435         return NULL;
2436     if (PyType_Ready(&PyTreeType) < 0)
2437         return NULL;
2438 
2439     module = PyModule_Create(&moduledef);
2440     if (module == NULL) return NULL;
2441 
2442     Py_INCREF(&PyTreeType);
2443     if (PyModule_AddObject(module, "Tree", (PyObject*) &PyTreeType) < 0) {
2444         Py_DECREF(module);
2445         Py_DECREF(&PyTreeType);
2446         return NULL;
2447     }
2448 
2449     Py_INCREF(&PyNodeType);
2450     if (PyModule_AddObject(module, "Node", (PyObject*) &PyNodeType) < 0) {
2451         Py_DECREF(module);
2452         Py_DECREF(&PyNodeType);
2453         return NULL;
2454     }
2455 
2456     return module;
2457 }
2458