1 #include "Python.h"
2 #include <stdio.h>
3 #include <string.h>
4 #include <float.h>
5 #include "cluster.h"
6
7
8 /* ========================================================================= */
9 /* -- Helper routines ------------------------------------------------------ */
10 /* ========================================================================= */
11
12 static char
extract_single_character(PyObject * object,const char variable[],const char allowed[])13 extract_single_character(PyObject* object, const char variable[],
14 const char allowed[])
15 {
16 Py_UCS4 ch;
17 Py_ssize_t n;
18 if (!PyUnicode_Check(object)) {
19 PyErr_Format(PyExc_ValueError, "%s should be a string", variable);
20 return 0;
21 }
22 if (PyUnicode_READY(object) == -1) return 0;
23 n = PyUnicode_GET_LENGTH(object);
24 if (n != 1) {
25 PyErr_Format(PyExc_ValueError,
26 "%s should be a single character", variable);
27 return 0;
28 }
29 ch = PyUnicode_READ_CHAR(object, 0);
30 if (ch < 128) {
31 const char c = ch;
32 if (strchr(allowed, c)) return c;
33 }
34 PyErr_Format(PyExc_ValueError,
35 "unknown %s function specified (should be one of '%s')",
36 variable, allowed);
37 return 0;
38 }
39
40 static int
distance_converter(PyObject * object,void * pointer)41 distance_converter(PyObject* object, void* pointer)
42 {
43 char c;
44
45 c = extract_single_character(object, "dist", "ebcauxsk");
46 if (c == 0) return 0;
47 *((char*)pointer) = c;
48 return 1;
49 }
50
51 static int
method_treecluster_converter(PyObject * object,void * pointer)52 method_treecluster_converter(PyObject* object, void* pointer)
53 {
54 char c;
55
56 c = extract_single_character(object, "method", "csma");
57 if (c == 0) return 0;
58 *((char*)pointer) = c;
59 return 1;
60 }
61
62 static int
method_kcluster_converter(PyObject * object,void * pointer)63 method_kcluster_converter(PyObject* object, void* pointer)
64 {
65 char c;
66
67 c = extract_single_character(object, "method", "am");
68 if (c == 0) return 0;
69 *((char*)pointer) = c;
70 return 1;
71 }
72
73 static int
method_clusterdistance_converter(PyObject * object,void * pointer)74 method_clusterdistance_converter(PyObject* object, void* pointer)
75 {
76 char c;
77
78 c = extract_single_character(object, "method", "amsxv");
79 if (c == 0) return 0;
80 *((char*)pointer) = c;
81 return 1;
82 }
83
84 /* -- data ----------------------------------------------------------------- */
85
86 typedef struct {
87 int nrows;
88 int ncols;
89 double** values;
90 Py_buffer view;
91 } Data;
92
93 static int
data_converter(PyObject * object,void * pointer)94 data_converter(PyObject* object, void* pointer)
95 {
96 Data* data = pointer;
97 int nrows;
98 int ncols;
99 int i;
100 double** values = data->values;
101 Py_buffer* view = &data->view;
102 const char* p;
103 Py_ssize_t stride;
104 const int flag = PyBUF_ND | PyBUF_STRIDES;
105
106 if (object == NULL) goto exit;
107 if (object == Py_None) return 1;
108
109 if (PyObject_GetBuffer(object, view, flag) == -1) {
110 PyErr_SetString(PyExc_RuntimeError,
111 "data matrix has unexpected format.");
112 return 0;
113 }
114
115 if (view->ndim != 2) {
116 PyErr_Format(PyExc_RuntimeError,
117 "data matrix has incorrect rank %d (expected 2)",
118 view->ndim);
119 goto exit;
120 }
121 if (view->itemsize != sizeof(double)) {
122 PyErr_SetString(PyExc_RuntimeError,
123 "data matrix has incorrect data type");
124 goto exit;
125 }
126 nrows = (int) view->shape[0];
127 ncols = (int) view->shape[1];
128 if (nrows != view->shape[0] || ncols != view->shape[1]) {
129 PyErr_Format(PyExc_ValueError,
130 "data matrix is too large (dimensions = %zd x %zd)",
131 view->shape[0], view->shape[1]);
132 goto exit;
133 }
134 if (nrows < 1 || ncols < 1) {
135 PyErr_SetString(PyExc_ValueError, "data matrix is empty");
136 goto exit;
137 }
138 stride = view->strides[0];
139 if (view->strides[1] != view->itemsize) {
140 PyErr_SetString(PyExc_RuntimeError, "data is not contiguous");
141 goto exit;
142 }
143 values = PyMem_Malloc(nrows*sizeof(double*));
144 if (!values) {
145 PyErr_NoMemory();
146 goto exit;
147 }
148 for (i = 0, p = view->buf; i < nrows; i++, p += stride)
149 values[i] = (double*)p;
150 data->values = values;
151 data->nrows = nrows;
152 data->ncols = ncols;
153 return Py_CLEANUP_SUPPORTED;
154
155 exit:
156 if (values) PyMem_Free(values);
157 PyBuffer_Release(view);
158 return 0;
159 }
160
161 /* -- mask ----------------------------------------------------------------- */
162
163 typedef struct {
164 int** values;
165 Py_buffer view;
166 } Mask;
167
168 static int
mask_converter(PyObject * object,void * pointer)169 mask_converter(PyObject* object, void* pointer)
170 {
171 Mask* mask = pointer;
172 int nrows;
173 int ncols;
174 int i;
175 int** values = mask->values;
176 Py_buffer* view = &mask->view;
177 const char* p;
178 Py_ssize_t stride;
179 const int flag = PyBUF_ND | PyBUF_STRIDES;
180
181 if (object == NULL) goto exit;
182 if (object == Py_None) return 1;
183
184 if (PyObject_GetBuffer(object, view, flag) == -1) {
185 PyErr_SetString(PyExc_RuntimeError, "mask has unexpected format.");
186 return 0;
187 }
188
189 if (view->ndim != 2) {
190 PyErr_Format(PyExc_ValueError,
191 "mask has incorrect rank %d (expected 2)", view->ndim);
192 goto exit;
193 }
194 if (view->itemsize != sizeof(int)) {
195 PyErr_SetString(PyExc_RuntimeError, "mask has incorrect data type");
196 goto exit;
197 }
198 nrows = (int) view->shape[0];
199 ncols = (int) view->shape[1];
200 if (nrows != view->shape[0] || ncols != view->shape[1]) {
201 PyErr_Format(PyExc_ValueError,
202 "mask is too large (dimensions = %zd x %zd)",
203 view->shape[0], view->shape[1]);
204 goto exit;
205 }
206 stride = view->strides[0];
207 if (view->strides[1] != view->itemsize) {
208 PyErr_SetString(PyExc_RuntimeError, "mask is not contiguous");
209 goto exit;
210 }
211 values = PyMem_Malloc(nrows*sizeof(int*));
212 if (!values) {
213 PyErr_NoMemory();
214 goto exit;
215 }
216 for (i = 0, p = view->buf; i < nrows; i++, p += stride)
217 values[i] = (int*)p;
218 mask->values = values;
219 return Py_CLEANUP_SUPPORTED;
220
221 exit:
222 if (values) PyMem_Free(values);
223 PyBuffer_Release(view);
224 return 0;
225 }
226
227 /* -- 1d array ------------------------------------------------------------- */
228
229 static int
vector_converter(PyObject * object,void * pointer)230 vector_converter(PyObject* object, void* pointer)
231 {
232 Py_buffer* view = pointer;
233 int ndata;
234 const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
235
236 if (object == NULL) goto exit;
237
238 if (PyObject_GetBuffer(object, view, flag) == -1) {
239 PyErr_SetString(PyExc_RuntimeError, "unexpected format.");
240 return 0;
241 }
242
243 if (view->ndim != 1) {
244 PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 1)",
245 view->ndim);
246 goto exit;
247 }
248 if (view->itemsize != sizeof(double)) {
249 PyErr_SetString(PyExc_RuntimeError, "array has incorrect data type");
250 goto exit;
251 }
252 ndata = (int) view->shape[0];
253 if (ndata != view->shape[0]) {
254 PyErr_Format(PyExc_ValueError,
255 "array is too large (size = %zd)", view->shape[0]);
256 goto exit;
257 }
258 return Py_CLEANUP_SUPPORTED;
259
260 exit:
261 PyBuffer_Release(view);
262 return 0;
263 }
264
265 static int
vector_none_converter(PyObject * object,void * pointer)266 vector_none_converter(PyObject* object, void* pointer)
267 {
268 if (object == Py_None) return 1;
269 return vector_converter(object, pointer);
270 }
271
272 /* -- clusterid ------------------------------------------------------------ */
273
274 static int
check_clusterid(Py_buffer clusterid,int nitems)275 check_clusterid(Py_buffer clusterid, int nitems) {
276 int i, j;
277 int *p = clusterid.buf;
278 int nclusters = 0;
279 int* number;
280
281 if (nitems != clusterid.shape[0]) {
282 PyErr_Format(PyExc_ValueError, "incorrect size (%zd, expected %d)",
283 clusterid.shape[0], nitems);
284 return 0;
285 }
286 for (i = 0; i < nitems; i++) {
287 j = p[i];
288 if (j > nclusters) nclusters = j;
289 if (j < 0) {
290 PyErr_SetString(PyExc_ValueError, "negative cluster number found");
291 return 0;
292 }
293 }
294 nclusters++;
295 /* -- Count the number of items in each cluster --------------------- */
296 number = calloc(nclusters, sizeof(int));
297 if (!number) {
298 PyErr_NoMemory();
299 return 0;
300 }
301 for (i = 0; i < nitems; i++) {
302 j = p[i];
303 number[j]++;
304 }
305 for (j = 0; j < nclusters; j++) if (number[j] == 0) break;
306 PyMem_Free(number);
307 if (j < nclusters) {
308 PyErr_Format(PyExc_ValueError, "cluster %d is empty", j);
309 return 0;
310 }
311 return nclusters;
312 }
313
314 /* -- distance ----------------------------------------------------------- */
315
316 typedef struct {
317 int n;
318 double** values;
319 Py_buffer* views;
320 Py_buffer view;
321 } Distancematrix;
322
323 static int
_convert_list_to_distancematrix(PyObject * list,Distancematrix * distances)324 _convert_list_to_distancematrix(PyObject* list, Distancematrix* distances)
325 {
326 int i;
327 double** values;
328 Py_buffer* view;
329 Py_buffer* views;
330 const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
331 const int n = (int) PyList_GET_SIZE(list);
332
333 if (n != PyList_GET_SIZE(list)) {
334 PyErr_SetString(PyExc_ValueError, "distance matrix is too large");
335 return 0;
336 }
337 values = PyMem_Malloc(n*sizeof(double*));
338 if (!values) {
339 PyErr_NoMemory();
340 return 0;
341 }
342 distances->values = values;
343 views = PyMem_Malloc(n*sizeof(Py_buffer));
344 if (!views) {
345 PyErr_NoMemory();
346 return 0;
347 }
348 view = views;
349 for (i = 0; i < n; i++, view++) {
350 PyObject* item = PyList_GET_ITEM(list, i);
351 view->len = -1;
352 if (PyObject_GetBuffer(item, view, flag) == -1) {
353 PyErr_Format(PyExc_RuntimeError, "failed to parse row %d.", i);
354 view--;
355 break;
356 }
357 if (view->ndim != 1) {
358 PyErr_Format(PyExc_ValueError,
359 "row %d has incorrect rank (%d expected 1)",
360 i, view->ndim);
361 break;
362 }
363 if (view->itemsize != sizeof(double)) {
364 PyErr_Format(PyExc_RuntimeError,
365 "row %d has incorrect data type", i);
366 break;
367 }
368 if (view->shape[0] != i) {
369 PyErr_Format(PyExc_RuntimeError,
370 "row %d has incorrect size %zd (expected %d)",
371 i, view->shape[0], i);
372 break;
373 }
374 values[i] = view->buf;
375 }
376 if (i < n) {
377 for ( ; view >= views; view--) PyBuffer_Release(view);
378 PyMem_Free(views);
379 return 0;
380 }
381 distances->n = n;
382 distances->view.len = 0;
383 distances->views = views;
384 distances->values = values;
385 return 1;
386 }
387
388 static int
_convert_array_to_distancematrix(PyObject * array,Distancematrix * distances)389 _convert_array_to_distancematrix(PyObject* array, Distancematrix* distances)
390 {
391 int i;
392 int n;
393 double** values;
394 double* p;
395 Py_buffer* view = &distances->view;
396 const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
397
398 if (PyObject_GetBuffer(array, view, flag) == -1) {
399 PyErr_SetString(PyExc_RuntimeError,
400 "distance matrix has unexpected format.");
401 return 0;
402 }
403
404 if (view->len == 0) {
405 PyBuffer_Release(view);
406 PyErr_SetString(PyExc_ValueError, "distance matrix is empty");
407 return 0;
408 }
409 if (view->itemsize != sizeof(double)) {
410 PyErr_SetString(PyExc_RuntimeError,
411 "distance matrix has an incorrect data type");
412 return 0;
413 }
414 if (view->ndim == 1) {
415 int m = (int) view->shape[0];
416 if (m != view->shape[0]) {
417 PyErr_Format(PyExc_ValueError,
418 "distance matrix is too large (size = %zd)",
419 view->shape[0]);
420 return 0;
421 }
422 n = (int)(1+sqrt(1+8*m)/2); /* rounds to (1+sqrt(1+8*m))/2 */
423 if (n*n-n != 2 * m) {
424 PyErr_SetString(PyExc_ValueError,
425 "distance matrix has unexpected size.");
426 return 0;
427 }
428 distances->n = n;
429 values = PyMem_Malloc(n*sizeof(double*));
430 if (!values) {
431 PyErr_NoMemory();
432 return 0;
433 }
434 distances->values = values;
435 for (p = view->buf, i = 0; i < n; p += i, i++) values[i] = p;
436 }
437 else if (view->ndim == 2) {
438 n = (int) view->shape[0];
439 if (n != view->shape[0]) {
440 PyErr_Format(PyExc_ValueError,
441 "distance matrix is too large (size = %zd)",
442 view->shape[0]);
443 return 0;
444 }
445 distances->n = n;
446 if (view->shape[1] != n) {
447 PyErr_SetString(PyExc_ValueError,
448 "distance matrix is not square.");
449 return 0;
450 }
451 values = PyMem_Malloc(n*sizeof(double*));
452 if (!values) {
453 PyErr_NoMemory();
454 return 0;
455 }
456 distances->values = values;
457 for (p = view->buf, i = 0; i < n; p += n, i++) values[i] = p;
458 }
459 else {
460 PyErr_Format(PyExc_ValueError,
461 "distance matrix has incorrect rank %d (expected 1 or 2)",
462 view->ndim);
463 return 0;
464 }
465 return 1;
466 }
467
468 static int
distancematrix_converter(PyObject * argument,void * pointer)469 distancematrix_converter(PyObject* argument, void* pointer)
470 {
471 Distancematrix* distances = pointer;
472 double** values;
473
474 if (argument == NULL) goto exit;
475 if (argument == Py_None) return 1;
476 if (PyList_Check(argument)) {
477 if (_convert_list_to_distancematrix(argument, distances))
478 return Py_CLEANUP_SUPPORTED;
479 }
480 else {
481 if (_convert_array_to_distancematrix(argument, distances))
482 return Py_CLEANUP_SUPPORTED;
483 }
484
485 exit:
486 values = distances->values;
487 if (values == NULL) return 0;
488 else {
489 int i;
490 const int n = distances->n;
491 Py_buffer* views = distances->views;
492 if (views) {
493 for (i = 0; i < n; i++) PyBuffer_Release(&views[i]);
494 PyMem_Free(views);
495 }
496 else if (distances->view.len) {
497 PyBuffer_Release(&distances->view);
498 }
499 PyMem_Free(values);
500 }
501 return 0;
502 }
503
504 /* -- celldata ------------------------------------------------------------- */
505
506 typedef struct {
507 int nx;
508 int ny;
509 int nz;
510 double*** values;
511 Py_buffer view;
512 } Celldata;
513
514 static int
celldata_converter(PyObject * argument,void * pointer)515 celldata_converter(PyObject* argument, void* pointer)
516 {
517 int i, n;
518 double* p;
519 Celldata* celldata = pointer;
520 double*** ppp = celldata->values;
521 double** pp = ppp ? ppp[0] : NULL;
522 int nx;
523 int ny;
524 int nz;
525 Py_buffer* view = &celldata->view;
526 const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
527
528 if (argument == NULL) goto exit;
529
530 if (PyObject_GetBuffer(argument, view, flag) == -1) {
531 PyErr_SetString(PyExc_RuntimeError,
532 "celldata array has unexpected format.");
533 return 0;
534 }
535
536 nx = (int) view->shape[0];
537 ny = (int) view->shape[1];
538 nz = (int) view->shape[2];
539 if (nx != view->shape[0] || ny != view->shape[1] || nz != view->shape[2]) {
540 PyErr_SetString(PyExc_RuntimeError, "celldata array too large");
541 goto exit;
542 }
543 if (view->itemsize != sizeof(double)) {
544 PyErr_SetString(PyExc_RuntimeError,
545 "celldata array has incorrect data type");
546 goto exit;
547 }
548 pp = PyMem_Malloc(nx*ny*sizeof(double*));
549 ppp = PyMem_Malloc(nx*sizeof(double**));
550 if (!pp || !ppp) {
551 PyErr_NoMemory();
552 goto exit;
553 }
554 p = view->buf;
555 n = nx * ny;
556 for (i = 0; i < n; i++, p += nz) pp[i] = p;
557 for (i = 0; i < nx; i++, pp += ny) ppp[i] = pp;
558 celldata->values = ppp;
559 celldata->nx = nx;
560 celldata->ny = ny;
561 celldata->nz = nz;
562 return Py_CLEANUP_SUPPORTED;
563
564 exit:
565 if (pp) PyMem_Free(pp);
566 if (ppp) PyMem_Free(ppp);
567 PyBuffer_Release(view);
568 return 0;
569 }
570
571
572 /* -- index ---------------------------------------------------------------- */
573
574 static int
index_converter(PyObject * argument,void * pointer)575 index_converter(PyObject* argument, void* pointer)
576 {
577 Py_buffer* view = pointer;
578 int n;
579 const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
580
581 if (argument == NULL) goto exit;
582
583 if (PyObject_GetBuffer(argument, view, flag) == -1) {
584 PyErr_SetString(PyExc_RuntimeError, "unexpected format.");
585 return 0;
586 }
587
588 if (view->ndim != 1) {
589 PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 1)",
590 view->ndim);
591 goto exit;
592 }
593 if (view->itemsize != sizeof(int)) {
594 PyErr_SetString(PyExc_RuntimeError,
595 "argument has incorrect data type");
596 goto exit;
597 }
598 n = (int) view->shape[0];
599 if (n != view->shape[0]) {
600 PyErr_Format(PyExc_ValueError,
601 "array size is too large (size = %zd)", view->shape[0]);
602 goto exit;
603 }
604 return Py_CLEANUP_SUPPORTED;
605
606 exit:
607 PyBuffer_Release(view);
608 return 0;
609 }
610
611 /* -- index2d ------------------------------------------------------------- */
612
613 static int
index2d_converter(PyObject * argument,void * pointer)614 index2d_converter(PyObject* argument, void* pointer)
615 {
616 Py_buffer* view = pointer;
617 int n;
618 const int flag = PyBUF_ND | PyBUF_C_CONTIGUOUS;
619
620 if (argument == NULL) goto exit;
621
622 if (PyObject_GetBuffer(argument, view, flag) == -1) {
623 PyErr_SetString(PyExc_RuntimeError, "unexpected format.");
624 return 0;
625 }
626
627 if (view->ndim != 2) {
628 PyErr_Format(PyExc_ValueError, "incorrect rank %d (expected 2)",
629 view->ndim);
630 goto exit;
631 }
632 if (view->itemsize != sizeof(int)) {
633 PyErr_SetString(PyExc_RuntimeError,
634 "argument has incorrect data type");
635 goto exit;
636 }
637 n = (int) view->shape[0];
638 if (n != view->shape[0]) {
639 PyErr_Format(PyExc_ValueError,
640 "array size is too large (size = %zd)", view->shape[0]);
641 goto exit;
642 }
643 if (view->shape[1] != 2) {
644 PyErr_Format(PyExc_ValueError,
645 "array has %zd columns (expected 2)", view->shape[1]);
646 goto exit;
647 }
648 return Py_CLEANUP_SUPPORTED;
649
650 exit:
651 PyBuffer_Release(view);
652 return 0;
653 }
654
655 /* ========================================================================= */
656 /* -- Classes -------------------------------------------------------------- */
657 /* ========================================================================= */
658
659 typedef struct {
660 PyObject_HEAD
661 Node node;
662 } PyNode;
663
664 static int
PyNode_init(PyNode * self,PyObject * args,PyObject * kwds)665 PyNode_init(PyNode *self, PyObject *args, PyObject *kwds)
666 {
667 int left, right;
668 double distance = 0.0;
669 static char *kwlist[] = {"left", "right", "distance", NULL};
670
671 if (!PyArg_ParseTupleAndKeywords(args, kwds, "ii|d", kwlist,
672 &left, &right, &distance))
673 return -1;
674 self->node.left = left;
675 self->node.right = right;
676 self->node.distance = distance;
677 return 0;
678 }
679
680 static PyObject*
PyNode_repr(PyNode * self)681 PyNode_repr(PyNode* self)
682 {
683 char string[64];
684
685 sprintf(string, "(%d, %d): %g",
686 self->node.left, self->node.right, self->node.distance);
687 return PyUnicode_FromString(string);
688 }
689
690 static char PyNode_left__doc__[] =
691 "integer representing the first member of this node";
692
693 static PyObject*
PyNode_getleft(PyNode * self,void * closure)694 PyNode_getleft(PyNode* self, void* closure)
695 {
696 int left = self->node.left;
697
698 return PyLong_FromLong((long)left);
699 }
700
701 static int
PyNode_setleft(PyNode * self,PyObject * value,void * closure)702 PyNode_setleft(PyNode* self, PyObject* value, void* closure)
703 {
704 long left = PyLong_AsLong(value);
705
706 if (PyErr_Occurred()) return -1;
707 self->node.left = (int) left;
708 return 0;
709 }
710
711 static char PyNode_right__doc__[] =
712 "integer representing the second member of this node";
713
714 static PyObject*
PyNode_getright(PyNode * self,void * closure)715 PyNode_getright(PyNode* self, void* closure)
716 {
717 int right = self->node.right;
718
719 return PyLong_FromLong((long)right);
720 }
721
722 static int
PyNode_setright(PyNode * self,PyObject * value,void * closure)723 PyNode_setright(PyNode* self, PyObject* value, void* closure)
724 {
725 long right = PyLong_AsLong(value);
726
727 if (PyErr_Occurred()) return -1;
728 self->node.right = (int) right;
729 return 0;
730 }
731
732 static PyObject*
PyNode_getdistance(PyNode * self,void * closure)733 PyNode_getdistance(PyNode* self, void* closure)
734 {
735 return PyFloat_FromDouble(self->node.distance);
736 }
737
738 static int
PyNode_setdistance(PyNode * self,PyObject * value,void * closure)739 PyNode_setdistance(PyNode* self, PyObject* value, void* closure)
740 {
741 const double distance = PyFloat_AsDouble(value);
742
743 if (PyErr_Occurred()) return -1;
744 self->node.distance = distance;
745 return 0;
746 }
747
748 static char PyNode_distance__doc__[] =
749 "the distance between the two members of this node\n";
750
751 static PyGetSetDef PyNode_getset[] = {
752 {"left",
753 (getter)PyNode_getleft,
754 (setter)PyNode_setleft,
755 PyNode_left__doc__, NULL},
756 {"right",
757 (getter)PyNode_getright,
758 (setter)PyNode_setright,
759 PyNode_right__doc__, NULL},
760 {"distance",
761 (getter)PyNode_getdistance,
762 (setter)PyNode_setdistance,
763 PyNode_distance__doc__, NULL},
764 {NULL} /* Sentinel */
765 };
766
767 static char PyNode_doc[] =
768 "A Node object describes a single node in a hierarchical clustering tree.\n"
769 "The integer attributes 'left' and 'right' represent the two members that\n"
770 "make up this node; the floating point attribute 'distance' contains the\n"
771 "distance between the two members of this node.\n";
772
773 static PyTypeObject PyNodeType = {
774 PyVarObject_HEAD_INIT(NULL, 0)
775 "_cluster.Node", /* tp_name */
776 sizeof(PyNode), /* tp_basicsize */
777 0, /* tp_itemsize */
778 0, /* tp_dealloc */
779 0, /* tp_print */
780 0, /* tp_getattr */
781 0, /* tp_setattr */
782 0, /* tp_compare */
783 (reprfunc)PyNode_repr, /* tp_repr */
784 0, /* tp_as_number */
785 0, /* tp_as_sequence */
786 0, /* tp_as_mapping */
787 0, /* tp_hash */
788 0, /* tp_call */
789 0, /* tp_str */
790 0, /* tp_getattro */
791 0, /* tp_setattro */
792 0, /* tp_as_buffer */
793 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
794 PyNode_doc, /* tp_doc */
795 0, /* tp_traverse */
796 0, /* tp_clear */
797 0, /* tp_richcompare */
798 0, /* tp_weaklistoffset */
799 0, /* tp_iter */
800 0, /* tp_iternext */
801 0, /* tp_methods */
802 0, /* tp_members */
803 PyNode_getset, /* tp_getset */
804 0, /* tp_base */
805 0, /* tp_dict */
806 0, /* tp_descr_get */
807 0, /* tp_descr_set */
808 0, /* tp_dictoffset */
809 (initproc)PyNode_init, /* tp_init */
810 };
811
812 typedef struct {
813 PyObject_HEAD
814 Node* nodes;
815 int n;
816 } PyTree;
817
818 static void
PyTree_dealloc(PyTree * self)819 PyTree_dealloc(PyTree* self)
820 {
821 if (self->n) PyMem_Free(self->nodes);
822 Py_TYPE(self)->tp_free((PyObject*)self);
823 }
824
825 static PyObject*
PyTree_new(PyTypeObject * type,PyObject * args,PyObject * kwds)826 PyTree_new(PyTypeObject *type, PyObject* args, PyObject* kwds)
827 {
828 int i, j;
829 int n;
830 Node* nodes;
831 PyObject* arg = NULL;
832 int* flag;
833 PyTree* self;
834
835 self = (PyTree *)type->tp_alloc(type, 0);
836 if (!self) return NULL;
837
838 if (!PyArg_ParseTuple(args, "|O", &arg)) {
839 Py_DECREF(self);
840 return NULL;
841 }
842
843 if (arg == NULL) {
844 self->n = 0;
845 self->nodes = NULL;
846 return (PyObject*)self;
847 }
848
849 if (!PyList_Check(arg)) {
850 Py_DECREF(self);
851 PyErr_SetString(PyExc_TypeError,
852 "Argument should be a list of Node objects");
853 return NULL;
854 }
855
856 n = (int) PyList_GET_SIZE(arg);
857 if (n != PyList_GET_SIZE(arg)) {
858 Py_DECREF(self);
859 PyErr_Format(PyExc_ValueError,
860 "List is too large (size = %zd)", PyList_GET_SIZE(arg));
861 return NULL;
862 }
863 if (n < 1) {
864 Py_DECREF(self);
865 PyErr_SetString(PyExc_ValueError, "List is empty");
866 return NULL;
867 }
868 nodes = PyMem_Malloc(n*sizeof(Node));
869 if (!nodes) {
870 Py_DECREF(self);
871 return PyErr_NoMemory();
872 }
873 for (i = 0; i < n; i++) {
874 PyNode* p;
875 PyObject* row = PyList_GET_ITEM(arg, i);
876 if (!PyType_IsSubtype(Py_TYPE(row), &PyNodeType)) {
877 PyMem_Free(nodes);
878 Py_DECREF(self);
879 PyErr_Format(PyExc_TypeError,
880 "Row %d in list is not a Node object", i);
881 return NULL;
882 }
883 p = (PyNode*)row;
884 nodes[i] = p->node;
885 }
886 /* --- Check if this is a bona fide tree ------------------------------- */
887 flag = PyMem_Malloc((2*n+1)*sizeof(int));
888 if (!flag) {
889 PyMem_Free(nodes);
890 Py_DECREF(self);
891 return PyErr_NoMemory();
892 }
893 for (i = 0; i < 2*n+1; i++) flag[i] = 0;
894 for (i = 0; i < n; i++) {
895 j = nodes[i].left;
896 if (j < 0) {
897 j = -j-1;
898 if (j >= i) break;
899 }
900 else j += n;
901 if (flag[j]) break;
902 flag[j] = 1;
903 j = nodes[i].right;
904 if (j < 0) {
905 j = -j-1;
906 if (j >= i) break;
907 }
908 else j += n;
909 if (flag[j]) break;
910 flag[j] = 1;
911 }
912 PyMem_Free(flag);
913 if (i < n) {
914 /* break encountered */
915 PyMem_Free(nodes);
916 Py_DECREF(self);
917 PyErr_SetString(PyExc_ValueError, "Inconsistent tree");
918 return NULL;
919 }
920 self->n = n;
921 self->nodes = nodes;
922 return (PyObject*)self;
923 }
924
925 static PyObject*
PyTree_str(PyTree * self)926 PyTree_str(PyTree* self)
927 {
928 int i;
929 const int n = self->n;
930 char string[128];
931 Node node;
932 PyObject* line;
933 PyObject* output;
934 PyObject* temp;
935
936 output = PyUnicode_FromString("");
937 for (i = 0; i < n; i++) {
938 node = self->nodes[i];
939 sprintf(string, "(%d, %d): %g", node.left, node.right, node.distance);
940 if (i < n-1) strcat(string, "\n");
941 line = PyUnicode_FromString(string);
942 if (!line) {
943 Py_DECREF(output);
944 return NULL;
945 }
946 temp = PyUnicode_Concat(output, line);
947 if (!temp) {
948 Py_DECREF(output);
949 Py_DECREF(line);
950 return NULL;
951 }
952 output = temp;
953 }
954 return output;
955 }
956
957 static int
PyTree_length(PyTree * self)958 PyTree_length(PyTree *self)
959 {
960 return self->n;
961 }
962
963 static PyObject*
PyTree_subscript(PyTree * self,PyObject * item)964 PyTree_subscript(PyTree* self, PyObject* item)
965 {
966 if (PyIndex_Check(item)) {
967 PyNode* result;
968 Py_ssize_t i;
969 i = PyNumber_AsSsize_t(item, PyExc_IndexError);
970 if (i == -1 && PyErr_Occurred())
971 return NULL;
972 if (i < 0)
973 i += self->n;
974 if (i < 0 || i >= self->n) {
975 PyErr_SetString(PyExc_IndexError, "tree index out of range");
976 return NULL;
977 }
978 result = (PyNode*) PyNodeType.tp_alloc(&PyNodeType, 0);
979 if (!result) return PyErr_NoMemory();
980 result->node = self->nodes[i];
981 return (PyObject*) result;
982 }
983 else if (PySlice_Check(item)) {
984 Py_ssize_t i, j;
985 Py_ssize_t start, stop, step, slicelength;
986 if (PySlice_GetIndicesEx(item, self->n, &start, &stop, &step,
987 &slicelength) == -1) return NULL;
988 if (slicelength == 0) return PyList_New(0);
989 else {
990 PyNode* node;
991 PyObject* result = PyList_New(slicelength);
992 if (!result) return PyErr_NoMemory();
993 for (i = 0, j = start; i < slicelength; i++, j += step) {
994 node = (PyNode*) PyNodeType.tp_alloc(&PyNodeType, 0);
995 if (!node) {
996 Py_DECREF(result);
997 return PyErr_NoMemory();
998 }
999 node->node = self->nodes[j];
1000 PyList_SET_ITEM(result, i, (PyObject*)node);
1001 }
1002 return result;
1003 }
1004 }
1005 else {
1006 PyErr_Format(PyExc_TypeError,
1007 "tree indices must be integers, not %.200s",
1008 item->ob_type->tp_name);
1009 return NULL;
1010 }
1011 }
1012
1013 static PyMappingMethods PyTree_mapping = {
1014 (lenfunc)PyTree_length, /* mp_length */
1015 (binaryfunc)PyTree_subscript, /* mp_subscript */
1016 };
1017
1018 static char PyTree_scale__doc__[] =
1019 "mytree.scale()\n"
1020 "\n"
1021 "Scale the node distances in the tree such that they are all between one\n"
1022 "and zero.\n";
1023
1024 static PyObject*
PyTree_scale(PyTree * self)1025 PyTree_scale(PyTree* self)
1026 {
1027 int i;
1028 const int n = self->n;
1029 Node* nodes = self->nodes;
1030 double maximum = DBL_MIN;
1031
1032 for (i = 0; i < n; i++) {
1033 double distance = nodes[i].distance;
1034 if (distance > maximum) maximum = distance;
1035 }
1036 if (maximum != 0.0)
1037 for (i = 0; i < n; i++) nodes[i].distance /= maximum;
1038 Py_INCREF(Py_None);
1039 return Py_None;
1040 }
1041
1042 static char PyTree_cut__doc__[] =
1043 "mytree.cut(nclusters) -> array\n"
1044 "\n"
1045 "Divide the elements in a hierarchical clustering result mytree into\n"
1046 "clusters, and return an array with the number of the cluster to which each\n"
1047 "element was assigned. The number of clusters is given by nclusters.\n";
1048
1049 static PyObject*
PyTree_cut(PyTree * self,PyObject * args)1050 PyTree_cut(PyTree* self, PyObject* args)
1051 {
1052 int ok = -1;
1053 int nclusters;
1054 const int n = self->n + 1;
1055 Py_buffer indices = {0};
1056
1057 if (!PyArg_ParseTuple(args, "O&i",
1058 index_converter, &indices, &nclusters)) goto exit;
1059 if (nclusters < 1) {
1060 PyErr_SetString(PyExc_ValueError,
1061 "requested number of clusters should be positive");
1062 goto exit;
1063 }
1064 if (nclusters > n) {
1065 PyErr_SetString(PyExc_ValueError,
1066 "more clusters requested than items available");
1067 goto exit;
1068 }
1069 if (indices.shape[0] != n) {
1070 PyErr_SetString(PyExc_RuntimeError,
1071 "indices array inconsistent with tree");
1072 goto exit;
1073 }
1074 ok = cuttree(n, self->nodes, nclusters, indices.buf);
1075
1076 exit:
1077 index_converter(NULL, &indices);
1078 if (ok == -1) return NULL;
1079 if (ok == 0) return PyErr_NoMemory();
1080 Py_INCREF(Py_None);
1081 return Py_None;
1082 }
1083
1084 static char PyTree_sort__doc__[] =
1085 "mytree.sort(order) -> array\n"
1086 "\n"
1087 "Sort a hierarchical clustering tree by switching the left and right\n"
1088 "subnode of nodes such that the elements in the left-to-right order of the\n"
1089 "tree tend to have increasing order values.\n"
1090 "\n"
1091 "Return the indices of the elements in the left-to-right order in the\n"
1092 "hierarchical clustering tree, such that the element with index indices[i]\n"
1093 "occurs at position i in the dendrogram.\n";
1094
1095 static PyObject*
PyTree_sort(PyTree * self,PyObject * args)1096 PyTree_sort(PyTree* self, PyObject* args)
1097 {
1098 int ok = -1;
1099 Py_buffer indices = {0};
1100 const int n = self->n;
1101 Py_buffer order = {0};
1102
1103 if (n == 0) {
1104 PyErr_SetString(PyExc_ValueError, "tree is empty");
1105 return NULL;
1106 }
1107 if (!PyArg_ParseTuple(args, "O&O&",
1108 index_converter, &indices,
1109 vector_converter, &order)) goto exit;
1110 if (indices.shape[0] != n + 1) {
1111 PyErr_SetString(PyExc_RuntimeError,
1112 "indices array inconsistent with tree");
1113 goto exit;
1114 }
1115 if (order.shape[0] != n + 1) {
1116 PyErr_Format(PyExc_ValueError,
1117 "order array has incorrect size %zd (expected %d)",
1118 order.shape[0], n + 1);
1119 goto exit;
1120 }
1121 ok = sorttree(n, self->nodes, order.buf, indices.buf);
1122 exit:
1123 index_converter(NULL, &indices);
1124 vector_converter(NULL, &order);
1125 if (ok == -1) return NULL;
1126 if (ok == 0) return PyErr_NoMemory();
1127 Py_INCREF(Py_None);
1128 return Py_None;
1129 }
1130
1131 static PyMethodDef PyTree_methods[] = {
1132 {"scale", (PyCFunction)PyTree_scale, METH_NOARGS, PyTree_scale__doc__},
1133 {"cut", (PyCFunction)PyTree_cut, METH_VARARGS, PyTree_cut__doc__},
1134 {"sort", (PyCFunction)PyTree_sort, METH_VARARGS, PyTree_sort__doc__},
1135 {NULL} /* Sentinel */
1136 };
1137
1138 static char PyTree_doc[] =
1139 "Tree objects store a hierarchical clustering solution.\n"
1140 "Individual nodes in the tree can be accessed with tree[i], where i is\n"
1141 "an integer. Whereas the tree itself is a read-only object, tree[:]\n"
1142 "returns a list of all the nodes, which can then be modified. To create\n"
1143 "a new Tree from this list, use Tree(list).\n"
1144 "See the description of the Node class for more information.";
1145
1146 static PyTypeObject PyTreeType = {
1147 PyVarObject_HEAD_INIT(NULL, 0)
1148 "_cluster.Tree", /* tp_name */
1149 sizeof(PyTree), /* tp_basicsize */
1150 0, /* tp_itemsize */
1151 (destructor)PyTree_dealloc, /* tp_dealloc */
1152 0, /* tp_print */
1153 0, /* tp_getattr */
1154 0, /* tp_setattr */
1155 0, /* tp_compare */
1156 0, /* tp_repr */
1157 0, /* tp_as_number */
1158 0, /* tp_as_sequence */
1159 &PyTree_mapping, /* tp_as_mapping */
1160 0, /* tp_hash */
1161 0, /* tp_call */
1162 (reprfunc)PyTree_str, /* tp_str */
1163 0, /* tp_getattro */
1164 0, /* tp_setattro */
1165 0, /* tp_as_buffer */
1166 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /*tp_flags*/
1167 PyTree_doc, /* tp_doc */
1168 0, /* tp_traverse */
1169 0, /* tp_clear */
1170 0, /* tp_richcompare */
1171 0, /* tp_weaklistoffset */
1172 0, /* tp_iter */
1173 0, /* tp_iternext */
1174 PyTree_methods, /* tp_methods */
1175 NULL, /* tp_members */
1176 0, /* tp_getset */
1177 0, /* tp_base */
1178 0, /* tp_dict */
1179 0, /* tp_descr_get */
1180 0, /* tp_descr_set */
1181 0, /* tp_dictoffset */
1182 0, /* tp_init */
1183 0, /* tp_alloc */
1184 (newfunc)PyTree_new, /* tp_new */
1185 };
1186
1187 /* ========================================================================= */
1188 /* -- Methods -------------------------------------------------------------- */
1189 /* ========================================================================= */
1190
1191 /* version */
1192 static char version__doc__[] =
1193 "version() -> string\n"
1194 "\n"
1195 "Return the version number of the C Clustering Library as a string.\n";
1196
1197 static PyObject*
py_version(PyObject * self)1198 py_version(PyObject* self)
1199 {
1200 return PyUnicode_FromString( CLUSTERVERSION );
1201 }
1202
1203 /* kcluster */
1204 static char kcluster__doc__[] =
1205 "kcluster(data, nclusters, mask, weight, transpose, npass, method,\n"
1206 " dist, clusterid) -> None\n"
1207 "\n"
1208 "This function implements k-means clustering.\n"
1209 "\n"
1210 "Arguments:\n"
1211 "\n"
1212 " - data: nrows x ncols array containing the data to be clustered\n"
1213 "\n"
1214 " - nclusters: number of clusters (the 'k' in k-means)\n"
1215 "\n"
1216 " - mask: nrows x ncols array of integers, showing which data are\n"
1217 " missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
1218 "\n"
1219 " - weight: the weights to be used when calculating distances\n"
1220 " - transpose:\n"
1221 "\n"
1222 " - if equal to 0, rows are clustered;\n"
1223 " - if equal to 1, columns are clustered.\n"
1224 "\n"
1225 " - npass: number of times the k-means clustering algorithm is\n"
1226 " performed, each time with a different (random) initial\n"
1227 " condition. If npass == 0, then the assignments in clusterid\n"
1228 " are used as the initial condition.\n"
1229 "\n"
1230 " - method: specifies how the center of a cluster is found:\n"
1231 "\n"
1232 " - method == 'a': arithmetic mean\n"
1233 " - method == 'm': median\n"
1234 "\n"
1235 " - dist: specifies the distance function to be used:\n"
1236 "\n"
1237 " - dist == 'e': Euclidean distance\n"
1238 " - dist == 'b': City Block distance\n"
1239 " - dist == 'c': Pearson correlation\n"
1240 " - dist == 'a': absolute value of the correlation\n"
1241 " - dist == 'u': uncentered correlation\n"
1242 " - dist == 'x': absolute uncentered correlation\n"
1243 " - dist == 's': Spearman's rank correlation\n"
1244 " - dist == 'k': Kendall's tau\n"
1245 "\n"
1246 " - clusterid: array in which the final clustering solution will be\n"
1247 " stored (output variable). If npass == 0, then clusterid is also used\n"
1248 " as an input variable, containing the initial condition from which\n"
1249 " the EM algorithm should start. In this case, the k-means algorithm\n"
1250 " is fully deterministic.\n"
1251 "\n";
1252
1253 static PyObject*
py_kcluster(PyObject * self,PyObject * args,PyObject * keywords)1254 py_kcluster(PyObject* self, PyObject* args, PyObject* keywords)
1255 {
1256 int nclusters = 2;
1257 int nrows, ncols;
1258 int nitems;
1259 int ndata;
1260 Data data = {0};
1261 Mask mask = {0};
1262 Py_buffer weight = {0};
1263 int transpose = 0;
1264 int npass = 1;
1265 char method = 'a';
1266 char dist = 'e';
1267 Py_buffer clusterid = {0};
1268 double error;
1269 int ifound = 0;
1270
1271 static char* kwlist[] = {"data",
1272 "nclusters",
1273 "mask",
1274 "weight",
1275 "transpose",
1276 "npass",
1277 "method",
1278 "dist",
1279 "clusterid",
1280 NULL};
1281
1282 if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&iO&O&iiO&O&O&", kwlist,
1283 data_converter, &data,
1284 &nclusters,
1285 mask_converter, &mask,
1286 vector_converter, &weight,
1287 &transpose,
1288 &npass,
1289 method_kcluster_converter, &method,
1290 distance_converter, &dist,
1291 index_converter, &clusterid)) return NULL;
1292 if (!data.values) {
1293 PyErr_SetString(PyExc_RuntimeError, "data is None");
1294 goto exit;
1295 }
1296 if (!mask.values) {
1297 PyErr_SetString(PyExc_RuntimeError, "mask is None");
1298 goto exit;
1299 }
1300 if (data.nrows != mask.view.shape[0] ||
1301 data.ncols != mask.view.shape[1]) {
1302 PyErr_Format(PyExc_ValueError,
1303 "mask has incorrect dimensions %zd x %zd (expected %d x %d)",
1304 mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
1305 goto exit;
1306 }
1307 nrows = data.nrows;
1308 ncols = data.ncols;
1309 ndata = transpose ? nrows : ncols;
1310 nitems = transpose ? ncols : nrows;
1311 if (weight.shape[0] != ndata) {
1312 PyErr_Format(PyExc_ValueError,
1313 "weight has incorrect size %zd (expected %d)",
1314 weight.shape[0], ndata);
1315 goto exit;
1316 }
1317 if (nclusters < 1) {
1318 PyErr_SetString(PyExc_ValueError, "nclusters should be positive");
1319 goto exit;
1320 }
1321 if (nitems < nclusters) {
1322 PyErr_SetString(PyExc_ValueError,
1323 "more clusters than items to be clustered");
1324 goto exit;
1325 }
1326 if (npass < 0) {
1327 PyErr_SetString(PyExc_RuntimeError, "expected a non-negative integer");
1328 goto exit;
1329 }
1330 else if (npass == 0) {
1331 int n = check_clusterid(clusterid, nitems);
1332 if (n == 0) goto exit;
1333 if (n != nclusters) {
1334 PyErr_SetString(PyExc_ValueError,
1335 "more clusters requested than found in clusterid");
1336 goto exit;
1337 }
1338 }
1339 kcluster(nclusters,
1340 nrows,
1341 ncols,
1342 data.values,
1343 mask.values,
1344 weight.buf,
1345 transpose,
1346 npass,
1347 method,
1348 dist,
1349 clusterid.buf,
1350 &error,
1351 &ifound);
1352 exit:
1353 data_converter(NULL, &data);
1354 mask_converter(NULL, &mask);
1355 vector_converter(NULL, &weight);
1356 index_converter(NULL, &clusterid);
1357 if (ifound) return Py_BuildValue("di", error, ifound);
1358 return NULL;
1359 }
1360 /* end of wrapper for kcluster */
1361
1362 /* kmedoids */
1363 static char kmedoids__doc__[] =
1364 "kmedoids(distance, nclusters, npass, clusterid) -> error, nfound\n"
1365 "\n"
1366 "This function implements k-medoids clustering.\n"
1367 "\n"
1368 "Arguments:\n"
1369 " - distance: The distance matrix between the elements. There are three\n"
1370 " ways in which you can pass a distance matrix:\n"
1371 "\n"
1372 " 1. a 2D Numerical Python array (in which only the left-lower\n"
1373 " part of the array will be accessed);\n"
1374 " 2. a 1D Numerical Python array containing the distances\n"
1375 " consecutively;\n"
1376 " 3. a list of rows containing the lower-triangular part of\n"
1377 " the distance matrix.\n"
1378 "\n"
1379 " Examples are:\n"
1380 "\n"
1381 " >>> from numpy import array\n"
1382 " >>> distance = array([[0.0, 1.1, 2.3],\n"
1383 " ... [1.1, 0.0, 4.5],\n"
1384 " ... [2.3, 4.5, 0.0]])\n"
1385 " >>> # (option #1)\n"
1386 " >>> distance = array([1.1, 2.3, 4.5])\n"
1387 " >>> # (option #2)\n"
1388 " >>> distance = [array([]),\n"
1389 " ... array([1.1]),\n"
1390 " ... array([2.3, 4.5])]\n"
1391 " >>> # (option #3)\n"
1392 "\n"
1393 " These three correspond to the same distance matrix.\n"
1394 "\n"
1395 " - nclusters: number of clusters (the 'k' in k-medoids)\n"
1396 "\n"
1397 " - npass: number of times the k-medoids clustering algorithm is\n"
1398 " performed, each time with a different (random) initial\n"
1399 " condition. If npass == 0, then the assignments in clusterid\n"
1400 " are used as the initial condition.\n"
1401 "\n"
1402 " - clusterid: array in which the final clustering solution will be\n"
1403 " stored (output variable). If npass == 0, then clusterid is also used\n"
1404 " as an input variable, containing the initial condition from which\n"
1405 " the EM algorithm should start. In this case, the k-medoids algorithm\n"
1406 " is fully deterministic.\n"
1407 "\n"
1408 "Return values:\n"
1409 " - error: the within-cluster sum of distances for the returned k-means\n"
1410 " clustering solution;\n"
1411 " - nfound: the number of times this solution was found.\n";
1412
1413 static PyObject*
py_kmedoids(PyObject * self,PyObject * args,PyObject * keywords)1414 py_kmedoids(PyObject* self, PyObject* args, PyObject* keywords)
1415 {
1416 int nclusters = 2;
1417 Distancematrix distances = {0};
1418 Py_buffer clusterid = {0};
1419 int npass = 1;
1420 double error;
1421 int ifound = -2;
1422
1423 static char* kwlist[] = {"distance",
1424 "nclusters",
1425 "npass",
1426 "clusterid",
1427 NULL};
1428
1429 if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&iiO&", kwlist,
1430 distancematrix_converter, &distances,
1431 &nclusters,
1432 &npass,
1433 index_converter, &clusterid)) return NULL;
1434 if (npass < 0) {
1435 PyErr_SetString(PyExc_RuntimeError, "expected a non-negative integer");
1436 goto exit;
1437 }
1438 else if (npass == 0) {
1439 int n = check_clusterid(clusterid, distances.n);
1440 if (n == 0) goto exit;
1441 if (n != nclusters) {
1442 PyErr_SetString(PyExc_RuntimeError,
1443 "more clusters requested than found in clusterid");
1444 goto exit;
1445 }
1446 }
1447 if (nclusters <= 0) {
1448 PyErr_SetString(PyExc_ValueError,
1449 "nclusters should be a positive integer");
1450 goto exit;
1451 }
1452 if (distances.n < nclusters) {
1453 PyErr_SetString(PyExc_ValueError,
1454 "more clusters requested than items to be clustered");
1455 goto exit;
1456 }
1457 kmedoids(nclusters,
1458 distances.n,
1459 distances.values,
1460 npass,
1461 clusterid.buf,
1462 &error,
1463 &ifound);
1464
1465 exit:
1466 distancematrix_converter(NULL, &distances);
1467 index_converter(NULL, &clusterid);
1468 switch (ifound) {
1469 case -2:
1470 return NULL;
1471 case -1:
1472 return PyErr_NoMemory();
1473 case 0: /* should not occur */
1474 PyErr_SetString(PyExc_RuntimeError,
1475 "error in kmedoids input arguments");
1476 return NULL;
1477 default:
1478 return Py_BuildValue("di", error, ifound);
1479 }
1480 }
1481 /* end of wrapper for kmedoids */
1482
1483 /* treecluster */
1484 static char treecluster__doc__[] =
1485 "treecluster(tree, data, mask, weight, transpose, dist, method,\n"
1486 " distancematrix) -> None\n"
1487 "\n"
1488 "This function implements the pairwise single, complete, centroid, and\n"
1489 "average linkage hierarchical clustering methods.\n"
1490 "\n"
1491 "Arguments:\n"
1492 " - tree: an empty Tree object; its nodes will be filled by treecluster\n"
1493 " to describe the hierarchical clustering result. See the description\n"
1494 " of the Tree class for more information.\n"
1495 "\n"
1496 " - data: nrows x ncols array containing the data to be clustered.\n"
1497 " Either data or distancematrix (see below) should be None.\n"
1498 "\n"
1499 " - mask: nrows x ncols array of integers, showing which data are\n"
1500 " missing. If mask[i,j]==0, then data[i,j] is missing.\n"
1501 "\n"
1502 " - weight: the weights to be used when calculating distances.\n"
1503 "\n"
1504 " - transpose:\n"
1505 "\n"
1506 " - if equal to 0, rows are clustered;\n"
1507 " - if equal to 1, columns are clustered.\n"
1508 "\n"
1509 " - dist: specifies the distance function to be used:\n"
1510 "\n"
1511 " - dist == 'e': Euclidean distance\n"
1512 " - dist == 'b': City Block distance\n"
1513 " - dist == 'c': Pearson correlation\n"
1514 " - dist == 'a': absolute value of the correlation\n"
1515 " - dist == 'u': uncentered correlation\n"
1516 " - dist == 'x': absolute uncentered correlation\n"
1517 " - dist == 's': Spearman's rank correlation\n"
1518 " - dist == 'k': Kendall's tau\n"
1519 "\n"
1520 " - method: specifies which linkage method is used:\n"
1521 "\n"
1522 " - method == 's': Single pairwise linkage\n"
1523 " - method == 'm': Complete (maximum) pairwise linkage (default)\n"
1524 " - method == 'c': Centroid linkage\n"
1525 " - method == 'a': Average pairwise linkage\n"
1526 "\n"
1527 " - distancematrix: The distance matrix between the elements.\n"
1528 " Either data (see above) or distancematrix should be None.\n"
1529 " There are three ways in which you can pass a distance matrix:\n"
1530 "\n"
1531 " 1. a 2D Numerical Python array (in which only the left-lower\n"
1532 " part of the array will be accessed);\n"
1533 " 2. a 1D Numerical Python array containing the distances\n"
1534 " consecutively;\n"
1535 " 3. a list of rows containing the lower-triangular part of\n"
1536 " the distance matrix.\n"
1537 "\n"
1538 " Examples are:\n"
1539 "\n"
1540 " >>> from numpy import array\n"
1541 " >>> distance = array([[0.0, 1.1, 2.3],\n"
1542 " ... [1.1, 0.0, 4.5],\n"
1543 " ... [2.3, 4.5, 0.0]])\n"
1544 " >>> # option 1.\n"
1545 " >>> distance = array([1.1, 2.3, 4.5])\n"
1546 " >>> # option 2.\n"
1547 " >>> distance = [array([]),\n"
1548 " ... array([1.1]),\n"
1549 " ... array([2.3, 4.5])]\n"
1550 " >>> # option 3.\n"
1551 "\n"
1552 " These three correspond to the same distance matrix.\n"
1553 "\n"
1554 " PLEASE NOTE:\n"
1555 " As the treecluster routine may shuffle the values in the\n"
1556 " distance matrix as part of the clustering algorithm, be sure\n"
1557 " to save this array in a different variable before calling\n"
1558 " treecluster if you need it later.\n"
1559 "\n"
1560 "Either data or distancematrix should be None. If distancematrix is None,\n"
1561 "the hierarchical clustering solution is calculated from the values in\n"
1562 "the argument data. Instead if data is None, the hierarchical clustering\n"
1563 "solution is calculated from the distance matrix.\n"
1564 "Pairwise centroid-linkage clustering can be calculated only from the data\n"
1565 "and not from the distance matrix.\n"
1566 "Pairwise single-, maximum-, and average-linkage clustering can be\n"
1567 "calculated from either the data or from the distance matrix.\n";
1568
1569 static PyObject*
py_treecluster(PyObject * self,PyObject * args,PyObject * keywords)1570 py_treecluster(PyObject* self, PyObject* args, PyObject* keywords)
1571 {
1572 Data data = {0};
1573 Mask mask = {0};
1574 Py_buffer weight = {0};
1575 int transpose = 0;
1576 char dist = 'e';
1577 char method = 'm';
1578 Distancematrix distances = {0};
1579 PyTree* tree = NULL;
1580 Node* nodes;
1581 int nitems;
1582
1583 static char* kwlist[] = {"tree",
1584 "data",
1585 "mask",
1586 "weight",
1587 "transpose",
1588 "method",
1589 "dist",
1590 "distancematrix",
1591 NULL };
1592
1593 if (!PyArg_ParseTupleAndKeywords(args, keywords, "O!O&O&O&iO&O&O&", kwlist,
1594 &PyTreeType, &tree,
1595 data_converter, &data,
1596 mask_converter, &mask,
1597 vector_none_converter, &weight,
1598 &transpose,
1599 method_treecluster_converter, &method,
1600 distance_converter, &dist,
1601 distancematrix_converter, &distances))
1602 return NULL;
1603
1604 if (tree->n != 0) {
1605 PyErr_SetString(PyExc_RuntimeError, "expected an empty tree");
1606 goto exit;
1607 }
1608 if (data.values != NULL && distances.values != NULL) {
1609 PyErr_SetString(PyExc_ValueError,
1610 "use either data or distancematrix, do not use both");
1611 goto exit;
1612 }
1613 if (data.values == NULL && distances.values == NULL) {
1614 PyErr_SetString(PyExc_ValueError,
1615 "neither data nor distancematrix was given");
1616 goto exit;
1617 }
1618
1619 if (data.values) /* use the values in data, not the distance matrix */ {
1620 int nrows;
1621 int ncols;
1622 int ndata;
1623
1624 if (!mask.values) {
1625 PyErr_SetString(PyExc_RuntimeError, "mask is None");
1626 goto exit;
1627 }
1628 if (!weight.buf) {
1629 PyErr_SetString(PyExc_RuntimeError, "weight is None");
1630 goto exit;
1631 }
1632 nrows = data.nrows;
1633 ncols = data.ncols;
1634 if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
1635 PyErr_Format(PyExc_ValueError,
1636 "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
1637 mask.view.shape[0], mask.view.shape[1],
1638 data.nrows, data.ncols);
1639 goto exit;
1640 }
1641 ndata = transpose ? nrows : ncols;
1642 nitems = transpose ? ncols : nrows;
1643 if (weight.shape[0] != ndata) {
1644 PyErr_Format(PyExc_RuntimeError,
1645 "weight has incorrect size %zd (expected %d)",
1646 weight.shape[0], ndata);
1647 goto exit;
1648 }
1649
1650 nodes = treecluster(nrows,
1651 ncols,
1652 data.values,
1653 mask.values,
1654 weight.buf,
1655 transpose,
1656 dist,
1657 method,
1658 NULL);
1659 }
1660 else { /* use the distance matrix instead of the values in data */
1661 if (!strchr("sma", method)) {
1662 PyErr_SetString(PyExc_ValueError,
1663 "argument method should be 's', 'm', or 'a' "
1664 "when specifying the distance matrix");
1665 goto exit;
1666 }
1667 nitems = distances.n;
1668 nodes = treecluster(nitems,
1669 nitems,
1670 0,
1671 0,
1672 0,
1673 transpose,
1674 dist,
1675 method,
1676 distances.values);
1677 }
1678
1679 if (!nodes) {
1680 PyErr_NoMemory();
1681 goto exit;
1682 }
1683 tree->nodes = nodes;
1684 tree->n = nitems-1;
1685
1686 exit:
1687 data_converter(NULL, &data);
1688 mask_converter(NULL, &mask);
1689 vector_none_converter(NULL, &weight);
1690 distancematrix_converter(NULL, &distances);
1691 if (tree == NULL || tree->n == 0) return NULL;
1692 Py_INCREF(Py_None);
1693 return Py_None;
1694 }
1695 /* end of wrapper for treecluster */
1696
1697 /* somcluster */
1698 static char somcluster__doc__[] =
1699 "somcluster(clusterid, celldata, data, mask, weight, transpose,\n"
1700 " inittau, niter, dist) -> None\n"
1701 "\n"
1702 "This function implements a self-organizing map on a rectangular grid.\n"
1703 "\n"
1704 "Arguments:\n"
1705 " - clusterid: array with two columns, with the number of rows equal\n"
1706 " to the number of items being clustered. Upon return, each row\n"
1707 " in the array contains the x and y coordinates of the cell in the\n"
1708 " the rectangular SOM grid to which the item was assigned.\n"
1709 "\n"
1710 " - celldata: array with dimensions nxgrid x nygrid x number of columns\n"
1711 " if rows are being clustered, or nxgrid x nygrid x number of rows\n"
1712 " if columns are being clustered, where nxgrid is the horizontal\n"
1713 " dimension of the rectangular SOM map and nygrid is the vertical\n"
1714 " dimension of the rectangular SOM map.\n"
1715 " Upon return, each element [ix, iy] of this array contains the\n"
1716 " data for the centroid of the cluster in the SOM grid cell with\n"
1717 " coordinates [ix, iy].\n"
1718 "\n"
1719 " - data: nrows x ncols array containing the data to be clustered.\n"
1720 "\n"
1721 " - mask: nrows x ncols array of integers, showing which data are\n"
1722 " missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
1723 "\n"
1724 " - weight: the weights to be used when calculating distances\n"
1725 "\n"
1726 " - transpose:\n"
1727 "\n"
1728 " - if equal to 0, rows are clustered;\n"
1729 " - if equal to 1, columns are clustered.\n"
1730 "\n"
1731 " - inittau: the initial value of tau (the neighborbood function)\n"
1732 "\n"
1733 " - niter: the number of iterations\n"
1734 "\n"
1735 " - dist: specifies the distance function to be used:\n"
1736 "\n"
1737 " - dist == 'e': Euclidean distance\n"
1738 " - dist == 'b': City Block distance\n"
1739 " - dist == 'c': Pearson correlation\n"
1740 " - dist == 'a': absolute value of the correlation\n"
1741 " - dist == 'u': uncentered correlation\n"
1742 " - dist == 'x': absolute uncentered correlation\n"
1743 " - dist == 's': Spearman's rank correlation\n"
1744 " - dist == 'k': Kendall's tau\n";
1745
1746 static PyObject*
py_somcluster(PyObject * self,PyObject * args,PyObject * keywords)1747 py_somcluster(PyObject* self, PyObject* args, PyObject* keywords)
1748 {
1749 int nrows;
1750 int ncols;
1751 int ndata;
1752 Data data = {0};
1753 Mask mask = {0};
1754 Py_buffer weight = {0};
1755 int transpose = 0;
1756 double inittau = 0.02;
1757 int niter = 1;
1758 char dist = 'e';
1759 Py_buffer indices = {0};
1760 Celldata celldata = {0};
1761 PyObject* result = NULL;
1762
1763 static char* kwlist[] = {"clusterids",
1764 "celldata",
1765 "data",
1766 "mask",
1767 "weight",
1768 "transpose",
1769 "inittau",
1770 "niter",
1771 "dist",
1772 NULL};
1773
1774 if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&O&idiO&", kwlist,
1775 index2d_converter, &indices,
1776 celldata_converter, &celldata,
1777 data_converter, &data,
1778 mask_converter, &mask,
1779 vector_converter, &weight,
1780 &transpose,
1781 &inittau,
1782 &niter,
1783 distance_converter, &dist)) return NULL;
1784 if (niter < 1) {
1785 PyErr_SetString(PyExc_ValueError,
1786 "number of iterations (niter) should be positive");
1787 goto exit;
1788 }
1789 if (!data.values) {
1790 PyErr_SetString(PyExc_RuntimeError, "data is None");
1791 goto exit;
1792 }
1793 if (!mask.values) {
1794 PyErr_SetString(PyExc_RuntimeError, "mask is None");
1795 goto exit;
1796 }
1797 nrows = data.nrows;
1798 ncols = data.ncols;
1799 if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
1800 PyErr_Format(PyExc_ValueError,
1801 "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
1802 mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
1803 goto exit;
1804 }
1805 ndata = transpose ? nrows : ncols;
1806 if (weight.shape[0] != ndata) {
1807 PyErr_Format(PyExc_RuntimeError,
1808 "weight has incorrect size %zd (expected %d)",
1809 weight.shape[0], ndata);
1810 goto exit;
1811 }
1812 if (celldata.nz != ndata) {
1813 PyErr_Format(PyExc_RuntimeError,
1814 "the celldata array size is not consistent with the data "
1815 "(last dimension is %d; expected %d)", celldata.nz, ndata);
1816 goto exit;
1817 }
1818 somcluster(nrows,
1819 ncols,
1820 data.values,
1821 mask.values,
1822 weight.buf,
1823 transpose,
1824 celldata.nx,
1825 celldata.ny,
1826 inittau,
1827 niter,
1828 dist,
1829 celldata.values,
1830 indices.buf);
1831 Py_INCREF(Py_None);
1832 result = Py_None;
1833
1834 exit:
1835 data_converter(NULL, &data);
1836 vector_converter(NULL, &weight);
1837 index2d_converter(NULL, &indices);
1838 celldata_converter(NULL, &celldata);
1839 return result;
1840 }
1841 /* end of wrapper for somcluster */
1842
1843 /* clusterdistance */
1844 static char clusterdistance__doc__[] =
1845 "clusterdistance(data, mask, weight, index1, index2, dist, method,\n"
1846 " transpose) -> distance between two clusters\n"
1847 "\n"
1848 "Arguments:\n"
1849 "\n"
1850 " - data: nrows x ncols array containing the data values.\n"
1851 "\n"
1852 " - mask: nrows x ncols array of integers, showing which data are\n"
1853 " missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
1854 "\n"
1855 " - weight: the weights to be used when calculating distances\n"
1856 "\n"
1857 " - index1: 1D array identifying which items belong to the first\n"
1858 " cluster.\n"
1859 "\n"
1860 " - index2: 1D array identifying which items belong to the second\n"
1861 " cluster.\n"
1862 "\n"
1863 " - dist: specifies the distance function to be used:\n"
1864 "\n"
1865 " - dist == 'e': Euclidean distance\n"
1866 " - dist == 'b': City Block distance\n"
1867 " - dist == 'c': Pearson correlation\n"
1868 " - dist == 'a': absolute value of the correlation\n"
1869 " - dist == 'u': uncentered correlation\n"
1870 " - dist == 'x': absolute uncentered correlation\n"
1871 " - dist == 's': Spearman's rank correlation\n"
1872 " - dist == 'k': Kendall's tau\n"
1873 "\n"
1874 " - method: specifies how the distance between two clusters is defined:\n"
1875 "\n"
1876 " - method == 'a': the distance between the arithmetic means of the\n"
1877 " two clusters\n"
1878 " - method == 'm': the distance between the medians of the two\n"
1879 " clusters\n"
1880 " - method == 's': the smallest pairwise distance between members\n"
1881 " of the two clusters\n"
1882 " - method == 'x': the largest pairwise distance between members of\n"
1883 " the two clusters\n"
1884 " - method == 'v': average of the pairwise distances between\n"
1885 " members of the clusters\n"
1886 "\n"
1887 " - transpose:\n"
1888 "\n"
1889 " - if equal to 0: clusters of rows are considered;\n"
1890 " - if equal to 1: clusters of columns are considered.\n"
1891 "\n";
1892
1893 static PyObject*
py_clusterdistance(PyObject * self,PyObject * args,PyObject * keywords)1894 py_clusterdistance(PyObject* self, PyObject* args, PyObject* keywords)
1895 {
1896 double distance;
1897 int nrows;
1898 int ncols;
1899 int ndata;
1900 Data data = {0};
1901 Mask mask = {0};
1902 Py_buffer weight = {0};
1903 char dist = 'e';
1904 char method = 'a';
1905 int transpose = 0;
1906 Py_buffer index1 = {0};
1907 Py_buffer index2 = {0};
1908 PyObject* result = NULL;
1909
1910 static char* kwlist[] = {"data",
1911 "mask",
1912 "weight",
1913 "index1",
1914 "index2",
1915 "method",
1916 "dist",
1917 "transpose",
1918 NULL};
1919
1920 if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&O&O&O&i", kwlist,
1921 data_converter, &data,
1922 mask_converter, &mask,
1923 vector_converter, &weight,
1924 index_converter, &index1,
1925 index_converter, &index2,
1926 method_clusterdistance_converter, &method,
1927 distance_converter, &dist,
1928 &transpose)) return NULL;
1929 if (!data.values) {
1930 PyErr_SetString(PyExc_RuntimeError, "data is None");
1931 goto exit;
1932 }
1933 if (!mask.values) {
1934 PyErr_SetString(PyExc_RuntimeError, "mask is None");
1935 goto exit;
1936 }
1937 nrows = data.nrows;
1938 ncols = data.ncols;
1939 ndata = transpose ? nrows : ncols;
1940 if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
1941 PyErr_Format(PyExc_ValueError,
1942 "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
1943 mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
1944 goto exit;
1945 }
1946 if (weight.shape[0] != ndata) {
1947 PyErr_Format(PyExc_RuntimeError,
1948 "weight has incorrect size %zd (expected %d)",
1949 weight.shape[0], ndata);
1950 goto exit;
1951 }
1952
1953 distance = clusterdistance(nrows,
1954 ncols,
1955 data.values,
1956 mask.values,
1957 weight.buf,
1958 (int) index1.shape[0],
1959 (int) index2.shape[0],
1960 index1.buf,
1961 index2.buf,
1962 dist,
1963 method,
1964 transpose);
1965
1966 if (distance < -0.5) /* Actually -1.0; avoiding roundoff errors */
1967 PyErr_SetString(PyExc_IndexError, "index out of range");
1968 else
1969 result = PyFloat_FromDouble(distance);
1970 exit:
1971 data_converter(NULL, &data);
1972 mask_converter(NULL, &mask);
1973 vector_converter(NULL, &weight);
1974 index_converter(NULL, &index1);
1975 index_converter(NULL, &index2);
1976 return result;
1977 }
1978 /* end of wrapper for clusterdistance */
1979
1980 /* clustercentroids */
1981 static char clustercentroids__doc__[] =
1982 "clustercentroids(data, mask, clusterid, method, transpose) -> cdata, cmask\n"
1983 "\n"
1984 "The clustercentroids routine calculates the cluster centroids, given to\n"
1985 "which cluster each element belongs. The centroid is defined as either\n"
1986 "the mean or the median over all elements for each dimension.\n"
1987 "\n"
1988 "Arguments:\n"
1989 " - data: nrows x ncols array containing the data values.\n"
1990 "\n"
1991 " - mask: nrows x ncols array of integers, showing which data are\n"
1992 " missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
1993 "\n"
1994 " - clusterid: array containing the cluster number for each item.\n"
1995 " The cluster number should be non-negative.\n"
1996 "\n"
1997 " - method: specifies whether the centroid is calculated from the\n"
1998 " arithmetic mean (method == 'a', default) or the median\n"
1999 " (method == 'm') over each dimension.\n"
2000 "\n"
2001 " - transpose: if equal to 0, row clusters are considered;\n"
2002 " if equal to 1, column clusters are considered.\n"
2003 "\n"
2004 " - cdata: 2D array containing, upon return, the cluster centroids.\n"
2005 " If transpose == 0, then the dimensions of cdata should be\n"
2006 " nclusters x ncols.\n"
2007 " If transpose == 1, then the dimensions of cdata should be \n"
2008 " nrows x nclusters.\n"
2009 "\n"
2010 " - cmask: 2D array of integers describing, upon return, which elements\n"
2011 " in cdata, if any, are missing.\n";
2012
2013 static PyObject*
py_clustercentroids(PyObject * self,PyObject * args,PyObject * keywords)2014 py_clustercentroids(PyObject* self, PyObject* args, PyObject* keywords)
2015 {
2016 int nrows;
2017 int ncols;
2018 int nclusters;
2019 Data data = {0};
2020 Mask mask = {0};
2021 Data cdata = {0};
2022 Mask cmask = {0};
2023 Py_buffer clusterid = {0};
2024 char method = 'a';
2025 int transpose = 0;
2026 int ok = -1;
2027
2028 static char* kwlist[] = {"data",
2029 "mask",
2030 "clusterid",
2031 "method",
2032 "transpose",
2033 "cdata",
2034 "cmask",
2035 NULL };
2036
2037 if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&O&iO&O&", kwlist,
2038 data_converter, &data,
2039 mask_converter, &mask,
2040 index_converter, &clusterid,
2041 method_kcluster_converter, &method,
2042 &transpose,
2043 data_converter, &cdata,
2044 mask_converter, &cmask)) return NULL;
2045 if (!data.values) {
2046 PyErr_SetString(PyExc_RuntimeError, "data is None");
2047 goto exit;
2048 }
2049 if (!mask.values) {
2050 PyErr_SetString(PyExc_RuntimeError, "mask is None");
2051 goto exit;
2052 }
2053 nrows = data.nrows;
2054 ncols = data.ncols;
2055 if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
2056 PyErr_Format(PyExc_ValueError,
2057 "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
2058 mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
2059 goto exit;
2060 }
2061 if (transpose == 0) {
2062 nclusters = check_clusterid(clusterid, nrows);
2063 nrows = nclusters;
2064 }
2065 else {
2066 nclusters = check_clusterid(clusterid, ncols);
2067 ncols = nclusters;
2068 }
2069 if (nclusters == 0) goto exit;
2070 if (cdata.nrows != nrows) {
2071 PyErr_Format(PyExc_RuntimeError,
2072 "cdata has incorrect number of rows (%d, expected %d)",
2073 cdata.nrows, nrows);
2074 goto exit;
2075 }
2076 if (cdata.ncols != ncols) {
2077 PyErr_Format(PyExc_RuntimeError,
2078 "cdata has incorrect number of columns (%d, expected %d)",
2079 cdata.ncols, ncols);
2080 goto exit;
2081 }
2082 if (cmask.view.shape[0] != nrows) {
2083 PyErr_Format(PyExc_RuntimeError,
2084 "cmask has incorrect number of rows (%zd, expected %d)",
2085 cmask.view.shape[0], nrows);
2086 goto exit;
2087 }
2088 if (cmask.view.shape[1] != ncols) {
2089 PyErr_Format(PyExc_RuntimeError,
2090 "cmask has incorrect number of columns "
2091 "(%zd, expected %d)", cmask.view.shape[1], ncols);
2092 goto exit;
2093 }
2094 ok = getclustercentroids(nclusters,
2095 data.nrows,
2096 data.ncols,
2097 data.values,
2098 mask.values,
2099 clusterid.buf,
2100 cdata.values,
2101 cmask.values,
2102 transpose,
2103 method);
2104 exit:
2105 data_converter(NULL, &data);
2106 mask_converter(NULL, &mask);
2107 data_converter(NULL, &cdata);
2108 mask_converter(NULL, &cmask);
2109 index_converter(NULL, &clusterid);
2110 if (ok == -1) return NULL;
2111 if (ok == 0) return PyErr_NoMemory();
2112 Py_INCREF(Py_None);
2113 return Py_None;
2114 }
2115 /* end of wrapper for clustercentroids */
2116
2117 /* distancematrix */
2118 static char distancematrix__doc__[] =
2119 "distancematrix(data, mask, weight, transpose, dist, distancematrix)\n"
2120 " -> None\n"
2121 "\n"
2122 "This function calculuates the distance matrix between the data values.\n"
2123 "\n"
2124 "Arguments:\n"
2125 "\n"
2126 " - data: nrows x ncols array containing the data values.\n"
2127 "\n"
2128 " - mask: nrows x ncols array of integers, showing which data are\n"
2129 " missing. If mask[i,j] == 0, then data[i,j] is missing.\n"
2130 "\n"
2131 " - weight: the weights to be used when calculating distances.\n"
2132 "\n"
2133 " - transpose: if equal to 0: the distances between rows are\n"
2134 " calculated;\n"
2135 " if equal to 1, the distances between columns are calculated.\n"
2136 "\n"
2137 " - dist: specifies the distance function to be used:\n"
2138 "\n"
2139 " - dist == 'e': Euclidean distance\n"
2140 " - dist == 'b': City Block distance\n"
2141 " - dist == 'c': Pearson correlation\n"
2142 " - dist == 'a': absolute value of the correlation\n"
2143 " - dist == 'u': uncentered correlation\n"
2144 " - dist == 'x': absolute uncentered correlation\n"
2145 " - dist == 's': Spearman's rank correlation\n"
2146 " - dist == 'k': Kendall's tau\n"
2147 "\n"
2148 " - distancematrix: Upon return, the distance matrix as a list of 1D\n"
2149 " arrays. The number of columns in each row is equal to the row number\n"
2150 " (i.e., len(distancematrix[i]) == i).\n"
2151 " An example of the return value is:\n"
2152 "\n"
2153 " matrix = [[],\n"
2154 " array([1.]),\n"
2155 " array([7., 3.]),\n"
2156 " array([4., 2., 6.])]\n"
2157 "\n"
2158 "This corresponds to the distance matrix:\n"
2159 "\n"
2160 " [0.\t1.\t7.\t4.]\n"
2161 " [1.\t0.\t3.\t2.]\n"
2162 " [7.\t3.\t0.\t6.]\n"
2163 " [4.\t2.\t6.\t0.]\n";
2164
2165 static PyObject*
py_distancematrix(PyObject * self,PyObject * args,PyObject * keywords)2166 py_distancematrix(PyObject* self, PyObject* args, PyObject* keywords)
2167 {
2168 PyObject* list;
2169 Distancematrix distances = {0};
2170 Data data = {0};
2171 Mask mask = {0};
2172 Py_buffer weight = {0};
2173 int transpose = 0;
2174 char dist = 'e';
2175 int nrows, ncols, ndata;
2176 PyObject* result = NULL;
2177
2178 /* -- Read the input variables --------------------------------------- */
2179 static char* kwlist[] = {"data",
2180 "mask",
2181 "weight",
2182 "transpose",
2183 "dist",
2184 "distancematrix",
2185 NULL};
2186
2187 if (!PyArg_ParseTupleAndKeywords(args, keywords, "O&O&O&iO&O!", kwlist,
2188 data_converter, &data,
2189 mask_converter, &mask,
2190 vector_converter, &weight,
2191 &transpose,
2192 distance_converter, &dist,
2193 &PyList_Type, &list)) return NULL;
2194 if (!data.values) {
2195 PyErr_SetString(PyExc_RuntimeError, "data is None");
2196 goto exit;
2197 }
2198 if (!mask.values) {
2199 PyErr_SetString(PyExc_RuntimeError, "mask is None");
2200 goto exit;
2201 }
2202 nrows = data.nrows;
2203 ncols = data.ncols;
2204 if (nrows != mask.view.shape[0] || ncols != mask.view.shape[1]) {
2205 PyErr_Format(PyExc_ValueError,
2206 "mask has incorrect dimensions (%zd x %zd, expected %d x %d)",
2207 mask.view.shape[0], mask.view.shape[1], data.nrows, data.ncols);
2208 goto exit;
2209 }
2210 ndata = (transpose == 0) ? ncols : nrows;
2211 if (weight.shape[0] != ndata) {
2212 PyErr_Format(PyExc_ValueError,
2213 "weight has incorrect size %zd (expected %d)",
2214 weight.shape[0], ndata);
2215 goto exit;
2216 }
2217 if (_convert_list_to_distancematrix(list, &distances) == 0) goto exit;
2218
2219 distancematrix(nrows,
2220 ncols,
2221 data.values,
2222 mask.values,
2223 weight.buf,
2224 dist,
2225 transpose,
2226 distances.values);
2227
2228 Py_INCREF(Py_None);
2229 result = Py_None;
2230 exit:
2231 data_converter(NULL, &data);
2232 mask_converter(NULL, &mask);
2233 vector_converter(NULL, &weight);
2234 distancematrix_converter(NULL, &distances);
2235 return result;
2236 }
2237 /* end of wrapper for distancematrix */
2238
2239 /* pca */
2240 static char pca__doc__[] =
2241 "pca(data, columnmean, coordinates, pc, eigenvalues) -> None\n"
2242 "\n"
2243 "This function calculates the principal component decomposition\n"
2244 "of the values in data.\n"
2245 "\n"
2246 "Arguments:\n"
2247 "\n"
2248 " - data: nrows x ncols array containing the data values.\n"
2249 "\n"
2250 " - columnmean: array of size nrows) in which the mean of each column\n"
2251 " will be sorted.\n"
2252 "\n"
2253 " - coordinates: nrows x nmin array in which the coordinates of the\n"
2254 " data along the principal components will be stored;\n"
2255 " nmin is min(nrows, ncols).\n"
2256 "\n"
2257 " - pc : the principal components as an nmin x ncols array, where nmin\n"
2258 " is min(nrows, ncols).\n"
2259 "\n"
2260 " - eigenvalues: array of size min(nrows, ncols), in which the\n"
2261 " eigenvalues will be stored, sorted by the magnitude\n"
2262 " of the eigenvalues, with the largest eigenvalues\n"
2263 " appearing first.\n"
2264 "\n"
2265 "Adding the column means to the dot product of the coordinates and the\n"
2266 "principal components, i.e.\n"
2267 "\n"
2268 " columnmean + dot(coordinates, pc)\n"
2269 "\n"
2270 "recreates the data matrix.\n";
2271
2272 static PyObject*
py_pca(PyObject * self,PyObject * args)2273 py_pca(PyObject* self, PyObject* args)
2274 {
2275 Py_buffer eigenvalues = {0};
2276 double** u;
2277 double** v;
2278 Data data = {0};
2279 Data pc = {0};
2280 Data coordinates = {0};
2281 Py_buffer mean = {0};
2282 int nrows, ncols;
2283 int nmin;
2284 int error = -2;
2285 double* p;
2286 double** values;
2287 int i, j;
2288
2289 if (!PyArg_ParseTuple(args, "O&O&O&O&O&",
2290 data_converter, &data,
2291 vector_converter, &mean,
2292 data_converter, &coordinates,
2293 data_converter, &pc,
2294 vector_converter, &eigenvalues)) return NULL;
2295
2296 values = data.values;
2297 if (!values) {
2298 PyErr_SetString(PyExc_RuntimeError, "data is None");
2299 goto exit;
2300 }
2301 nrows = data.nrows;
2302 ncols = data.ncols;
2303 if (mean.shape[0] != ncols) {
2304 PyErr_Format(PyExc_RuntimeError,
2305 "columnmean has inconsistent size %zd (expected %d)",
2306 mean.shape[0], ncols);
2307 goto exit;
2308 }
2309 nmin = nrows < ncols ? nrows : ncols;
2310 if (pc.nrows != nmin || pc.ncols != ncols) {
2311 PyErr_Format(PyExc_RuntimeError,
2312 "pc has inconsistent size %zd x %zd (expected %d x %d)",
2313 mean.shape[0], mean.shape[1], nmin, ncols);
2314 goto exit;
2315 }
2316 if (coordinates.nrows != nrows || coordinates.ncols != nmin) {
2317 PyErr_Format(PyExc_RuntimeError,
2318 "coordinates has inconsistent size %zd x %zd (expected %d x %d)",
2319 mean.shape[0], mean.shape[1], nrows, nmin);
2320 goto exit;
2321 }
2322 if (nrows >= ncols) {
2323 u = coordinates.values;
2324 v = pc.values;
2325 }
2326 else { /* nrows < ncolums */
2327 u = pc.values;
2328 v = coordinates.values;
2329 }
2330 /* -- Calculate the mean of each column ------------------------------ */
2331 p = mean.buf;
2332 for (j = 0; j < ncols; j++) {
2333 p[j] = 0.0;
2334 for (i = 0; i < nrows; i++) p[j] += values[i][j];
2335 p[j] /= nrows;
2336 }
2337 /* -- Subtract the mean of each column ----------------------------- */
2338 for (i = 0; i < nrows; i++)
2339 for (j = 0; j < ncols; j++)
2340 u[i][j] = values[i][j] - p[j];
2341 /* -- Perform the principal component analysis ----------------------- */
2342 error = pca(nrows, ncols, u, v, eigenvalues.buf);
2343 /* ------------------------------------------------------------------- */
2344 exit:
2345 data_converter(NULL, &data);
2346 vector_converter(NULL, &mean);
2347 data_converter(NULL, &pc);
2348 data_converter(NULL, &coordinates);
2349 vector_converter(NULL, &eigenvalues);
2350 if (error == 0) {
2351 Py_INCREF(Py_None);
2352 return Py_None;
2353 }
2354 if (error == -1) return PyErr_NoMemory();
2355 else if (error > 0)
2356 PyErr_SetString(PyExc_RuntimeError,
2357 "Singular value decomposition failed to converge");
2358 return NULL;
2359 }
2360 /* end of wrapper for pca */
2361
2362 /* ========================================================================= */
2363 /* -- The methods table ---------------------------------------------------- */
2364 /* ========================================================================= */
2365
2366
2367 static struct PyMethodDef cluster_methods[] = {
2368 {"version", (PyCFunction) py_version, METH_NOARGS, version__doc__},
2369 {"kcluster",
2370 (PyCFunction) py_kcluster,
2371 METH_VARARGS | METH_KEYWORDS,
2372 kcluster__doc__
2373 },
2374 {"kmedoids",
2375 (PyCFunction) py_kmedoids,
2376 METH_VARARGS | METH_KEYWORDS,
2377 kmedoids__doc__
2378 },
2379 {"treecluster",
2380 (PyCFunction) py_treecluster,
2381 METH_VARARGS | METH_KEYWORDS,
2382 treecluster__doc__
2383 },
2384 {"somcluster",
2385 (PyCFunction) py_somcluster,
2386 METH_VARARGS | METH_KEYWORDS,
2387 somcluster__doc__
2388 },
2389 {"clusterdistance",
2390 (PyCFunction) py_clusterdistance,
2391 METH_VARARGS | METH_KEYWORDS,
2392 clusterdistance__doc__
2393 },
2394 {"clustercentroids",
2395 (PyCFunction) py_clustercentroids,
2396 METH_VARARGS | METH_KEYWORDS,
2397 clustercentroids__doc__
2398 },
2399 {"distancematrix",
2400 (PyCFunction) py_distancematrix,
2401 METH_VARARGS | METH_KEYWORDS,
2402 distancematrix__doc__
2403 },
2404 {"pca",
2405 (PyCFunction) py_pca,
2406 METH_VARARGS | METH_KEYWORDS,
2407 pca__doc__
2408 },
2409 {NULL, NULL, 0, NULL} /* sentinel */
2410 };
2411
2412 /* ========================================================================= */
2413 /* -- Initialization ------------------------------------------------------- */
2414 /* ========================================================================= */
2415
2416 static struct PyModuleDef moduledef = {
2417 PyModuleDef_HEAD_INIT,
2418 "_cluster",
2419 "C Clustering Library",
2420 -1,
2421 cluster_methods,
2422 NULL,
2423 NULL,
2424 NULL,
2425 NULL
2426 };
2427
2428 PyObject *
PyInit__cluster(void)2429 PyInit__cluster(void)
2430 {
2431 PyObject *module;
2432
2433 PyNodeType.tp_new = PyType_GenericNew;
2434 if (PyType_Ready(&PyNodeType) < 0)
2435 return NULL;
2436 if (PyType_Ready(&PyTreeType) < 0)
2437 return NULL;
2438
2439 module = PyModule_Create(&moduledef);
2440 if (module == NULL) return NULL;
2441
2442 Py_INCREF(&PyTreeType);
2443 if (PyModule_AddObject(module, "Tree", (PyObject*) &PyTreeType) < 0) {
2444 Py_DECREF(module);
2445 Py_DECREF(&PyTreeType);
2446 return NULL;
2447 }
2448
2449 Py_INCREF(&PyNodeType);
2450 if (PyModule_AddObject(module, "Node", (PyObject*) &PyNodeType) < 0) {
2451 Py_DECREF(module);
2452 Py_DECREF(&PyNodeType);
2453 return NULL;
2454 }
2455
2456 return module;
2457 }
2458