1 #include <sys/types.h>
2 #include "Python.h"
3 #include "distance.h"
4 
5 static PyObject *pydistance_error;         /* distance.error */
6 
7 struct nw_matrix {
8 	PyObject_HEAD
9 	struct matrix _m;
10 } nw_matrix_t;
11 
12 static char 	pydistance__doc__[] =
13 "String distance comparison functions\n"
14 "\nThe distance library is used to compare pieces of data for similarity.\n"
15 "Specifically, it contains a number of methods to find the 'edit distance'\n"
16 "between inputs, or the number of differences between them. These\n"
17 "differences are calculated using various mechanisms.\n"
18 "\n"
19 "Each fuction returns the calculated distance between the two inputs. A\n"
20 "distance of 0 indicates that the strings are the same. A distance less\n"
21 "than 0 indicates that an error has occurred. For the Hamming and Jaccard\n"
22 "distances, this is because the two inputs are of different sizes.\n"
23 "\nLorenzo Seidenari wrote the Levenshtein distance implementation.\n"
24 "\nAUTHORS\n\n"
25 "Lorenzo Seidenari wrote the Levenshtein distance implementation.\n\n"
26 "Jose Nazario <jose@monkey.org> wrote the implementations of the Damerau,\n"
27 "Hamming and Jaccard distance algorithms along with the Needleman-Wunsch\n"
28 "Levenshtein distance implementation here.\n\n"
29 "Evan Cooke adapted the adler32 routine from Jean-loup Gailly and Mark\n"
30 "Adler used in the bloom distance function, which he also wrote.\n"
31 "\nSEE ALSO\n\n"
32 "V. I. Levenshtein, 'Binary codes capable of correcting deletions,\n"
33 "insertions and reversals', Doklady Akademii Nauk SSSR, 4, 163, 845-848,\n"
34 "1965.\n"
35 "\n"
36 "Burton Bloom, 'Space/time trade-offs in hash coding with allowable\n"
37 "errors', Communications of the ACM, 7, 13, 422-426, July 1970.\n"
38 "\n"
39 "Fred J. Damerau, 'A technique for computer detection and correction of\n"
40 "spelling errors', Communications of the ACM, 3, 7, 171-176, March 1964.\n"
41 "\n"
42 "Burton Bloom, 'Space/time trade-offs in hash coding with allowable\n"
43 "errors', Communications of the ACM, 7, 13, 422-426, July 1970.\n"
44 "\n"
45 "R. W. Hamming, 'Error Detecting and Error Correcting Codes', Bell System\n"
46 "Tech Journal, 9, 147-160, April 1950.\n"
47 "\n"
48 "P. Jaccard, 'Etude comparative de la distribution florale dans une\n"
49 "portion des Alpes et du Jura', Bull Soc Vaudoise Sci Nat, 44, 223-270,\n"
50 "1908.\n";
51 
52 static PyObject *
raisePydistanceError(char * errmsg)53 raisePydistanceError(char *errmsg)
54 {
55         PyErr_SetString(pydistance_error, errmsg);
56         return NULL;
57 }
58 
59 static char	pydistance__levenshtein__doc__[] =
60 "levenshtein(string1, string2)\n\n"
61 "Levenshtein distance (LD) is a measure of the similarity between two\n"
62 "inputs, which we will refer to as the source s and the target input t.\n"
63 "The distance is the number of deletions, insertions, or substitutions\n"
64 "required to transform s into t.  For example,\n"
65 "\n"
66 "If s is 'test' and t is 'test', then LD(s,t) = 0, because no \n"
67 "transformations are needed. The strings are already identical.\n"
68 "\n"
69 "If s is 'test' and t is 'tent', then LD(s,t) = 1, because one \n"
70 "substitution (change 's' to 'n') is sufficient to transform s into t.\n"
71 "\n"
72 "The greater the Levenshtein distance, the more different the inputs are.\n"
73 "\n"
74 "Levenshtein distance is named after the Russian scientist Vladimir \n"
75 "Levenshtein, who devised the algorithm in 1965. If you can't spell or \n"
76 "pronounce Levenshtein, the metric is also sometimes called edit distance.";
77 
78 static PyObject *
pydistance_levenshtein(PyObject * na,PyObject * args)79 pydistance_levenshtein(PyObject *na, PyObject *args)
80 {
81 	char *str1, *str2;
82 	int ret;
83 
84 	if (!PyArg_ParseTuple(args, "ss", &str1, &str2)) return NULL;
85 	ret = levenshtein_d(str1, strlen(str1), str2, strlen(str2));
86 	return PyInt_FromLong((long) ret);
87 }
88 
89 static char	pydistance__damerau__doc__[] =
90 "damerau(string1, string2)\n\n"
91 "The Damerau distance is almost identical to the Levenshtein distance but\n"
92 "can tolerate adjascent characters that have been swapped, a common\n"
93 "spelling mistake or typographic error. For example, if s is 'abcd' and t\n"
94 "is 'acbd', then DD(s,t) is 0 because of the transposition of 'b' and 'c'.\n"
95 "Other costs found in the Levenshtein distance are identical.";
96 
97 static PyObject *
pydistance_damerau(PyObject * na,PyObject * args)98 pydistance_damerau(PyObject *na, PyObject *args)
99 {
100 	char *str1, *str2;
101 	int ret;
102 
103 	if (!PyArg_ParseTuple(args, "ss", &str1, &str2)) return NULL;
104 	ret = damerau_d(str1, strlen(str1), str2, strlen(str2));
105 	return PyInt_FromLong((long) ret);
106 }
107 
108 static char 	pydistance__hamming__doc__[] =
109 "hamming(string1, string2)\n\n"
110 "The Hamming distance H is defined only for inputs of the same length.\n"
111 "For two inputs s and t, H(s, t) is the number of places in which the two\n"
112 "string differ, i.e., have different characters.";
113 
114 static PyObject *
pydistance_hamming(PyObject * na,PyObject * args)115 pydistance_hamming(PyObject *na, PyObject *args)
116 {
117 	char *str1, *str2;
118 	int ret;
119 
120 	if (!PyArg_ParseTuple(args, "ss", &str1, &str2)) return NULL;
121 	ret = hamming_d(str1, strlen(str1), str2, strlen(str2));
122 	return PyInt_FromLong((long) ret);
123 }
124 
125 static char	pydistance__jaccard__doc__[] =
126 "jaccard(string1, string2)\n\n"
127 "The Jaccard similarity between two strings is the ratio of the size of\n"
128 "their intersection to the size of their union. This distance is 1 minus\n"
129 "this similarity sscore. Indentical strings have a Jacard distance of 0,\n"
130 "completely dissimilar strings have a score of 1. The two inputs must be\n"
131 "of the same size.";
132 
133 static PyObject *
pydistance_jaccard(PyObject * na,PyObject * args)134 pydistance_jaccard(PyObject *na, PyObject *args)
135 {
136 	char *str1, *str2;
137 	float ret;
138 
139 	if (!PyArg_ParseTuple(args, "ss", &str1, &str2)) return NULL;
140 	ret = jaccard_d(str1, strlen(str1), str2, strlen(str2));
141 	return PyFloat_FromDouble((long) ret);
142 }
143 
144 static char	pydistance__minkowski__doc__[] =
145 "minkowski(string1, string2, power)\n\n"
146 "The Minkowski distance between two strings is the geometric distance\n"
147 "between two inputs and uses a variable scaling factor, power.  When this\n"
148 "value is 1 the Minkowski distance is equal to the Manhattan distance.\n"
149 "When power is 2 it yields the Euclidian distance between two strings.\n"
150 "When this value grows, the value of the Minkowski distance tends towards\n"
151 "a Chebyshev result. Therefore by inceasing power, one can place more\n"
152 "numerical value on the largest distance (in terms of elements in the two\n"
153 "vectors in question). \n"
154 "\n"
155 "A disadvantage of the Minkowski method is that if one element in the \n"
156 "vectors has a wider range than the other elements then that large range\n"
157 "may 'dilute' the distances of the small-range elements.";
158 
159 static PyObject *
pydistance_minkowski(PyObject * na,PyObject * args)160 pydistance_minkowski(PyObject *na, PyObject *args)
161 {
162 	char *str1, *str2;
163 	int power;
164 	double ret;
165 
166 	if (!PyArg_ParseTuple(args, "ssi", &str1, &str2, &power)) return NULL;
167 	ret = minkowski_d(str1, strlen(str1), str2, strlen(str2), power);
168 	return PyFloat_FromDouble((long) ret);
169 }
170 
171 static char	pydistance__manhattan__doc__[] =
172 "manhattan(string1, string2)\n\n"
173 "The Manhattan distance is a special case of the Minkowski distance where\n"
174 "the 'power' is set to 1.";
175 
176 static PyObject *
pydistance_manhattan(PyObject * na,PyObject * args)177 pydistance_manhattan(PyObject *na, PyObject *args)
178 {
179 	char *str1, *str2;
180 	double ret;
181 
182 	if (!PyArg_ParseTuple(args, "ss", &str1, &str2)) return NULL;
183 	ret = MANHATTAN_D(str1, strlen(str1), str2, strlen(str2));
184 	return PyFloat_FromDouble((long) ret);
185 }
186 
187 static char	pydistance__euclid__doc__[] =
188 "euclid(string1, string2)\n\n"
189 "The Euclid distance is a special case of the Minkowski distance where\n"
190 "the 'power' is set to 2.";
191 
192 static PyObject *
pydistance_euclid(PyObject * na,PyObject * args)193 pydistance_euclid(PyObject *na, PyObject *args)
194 {
195 	char *str1, *str2;
196 	double ret;
197 
198 	if (!PyArg_ParseTuple(args, "ss", &str1, &str2)) return NULL;
199 	ret = EUCLID_D(str1, strlen(str1), str2, strlen(str2));
200 	return PyFloat_FromDouble((long) ret);
201 }
202 
203 static char	pydistance__bloom__doc__[] =
204 "bloom(string1, string2)\n\n"
205 "A Bloom Filter (BF) is a method of encoding a variable length piece of\n"
206 "input data to a fixed length signature.  The basic idea is to map\n"
207 "attributes of the data to specific positions in a binary vector.  First,\n"
208 "the result vector is initialized to all 0's.  Next, a series of hash\n"
209 "functions are applied to the data to map some part of the input data\n"
210 "to a position in the result vector.  Thus, if a is the input data,\n"
211 "and m is length of the result vector, then hash(a) produces a value in the\n"
212 "range 0...m-1.\n"
213 "\n"
214 "A similar method encoding data into a fixed length result vector can be\n"
215 "accomplished using a crytographic digest.  A digest function like MD5\n"
216 "is designed such that the difference between any two digests does not in\n"
217 "any way correlate to similar differences in input messages.  This is \n"
218 "an important property in certain applications however it may also be \n"
219 "desirable that the difference between digests corresponds to\n"
220 "differences in the input messages.  Because a Bloom filter is derived from\n"
221 "attributes of the input message, the difference between any two Bloom\n"
222 "Filter digests corresponds to differences in the input data.\n"
223 "\n"
224 "This Bloom filter implementation is meant take any byte sequence as input\n"
225 "so a generic hash function is used.  The input stream is divided into\n"
226 "chunks and hashed using a 32 bit adler modulo m.  In order to get the most\n"
227 "complete coverage, the chunk N+1 includes all but one of the bytes in\n"
228 "chunk N and one byte not included in chunk N.  (i.e. the chunk window\n"
229 "slides to right one byte per hash.)\n"
230 "\n"
231 "The difference between any two Bloom filter digests is computed by taking\n"
232 "the logical and of the two digests and computing the number of bit of\n"
233 "similarity.  The similar count is then divided by maximum of the total\n"
234 "number of bit in either digest.  This acts to normalize the bit count for\n"
235 "large and small signatures.  The similarity is then turned into a distance\n"
236 "by take 1 - normalized bit count.\n";
237 
238 static PyObject *
pydistance_bloom(PyObject * na,PyObject * args)239 pydistance_bloom(PyObject *na, PyObject *args)
240 {
241         char *str1, *str2, *d1, *d2;
242 	double ret;
243 	int bits;
244 
245         if (!PyArg_ParseTuple(args, "ssi", &str1, &str2, &bits)) return NULL;
246 	d1 = (char *)malloc(bits);
247 	d2 = (char *)malloc(bits);
248 	if (!d1 || !d2)
249 		raisePydistanceError("Couldn't allocate memory.");
250 
251         bloom_create(str1, strlen(str1), d1, bits);
252         bloom_create(str2, strlen(str2), d2, bits);
253 	ret = bloom_d(d1, d1, bits);
254 	free(d1);
255 	free(d2);
256 	return PyFloat_FromDouble((long) ret);
257 }
258 
259 static char	pydistance__needleman_wunsch__doc__[] =
260 "The Levenshtein distance algorithm assumes that the cost of all insertions \n"
261 "or conversions is equal. However, in some scenarios this may not be\n"
262 "desirable and may mask the acceptable distances between inputs.\n"
263 "\n"
264 "A modified Levenshtein distance algorithm, also known as the Needleman-\n"
265 "Wunsch distance algorithm, accepts a cost matrix as an extra input. This\n"
266 "matrix object (distance.nw_matrix) contains two cost matricies for each \n"
267 "pair of characters to convert from and to. The costs of inserting this \n"
268 "character and converting between characters is listed in this matrix.\n";
269 
270 static PyObject *
pydistance_needleman_wunsch(PyObject * na,PyObject * args)271 pydistance_needleman_wunsch(PyObject *na, PyObject *args)
272 {
273         char *str1, *str2;
274 	float ret;
275 	struct nw_matrix *nw_m;
276 	struct matrix *m;
277 	PyObject *tmp;
278 
279         if (!PyArg_ParseTuple(args, "ssO", &str1, &str2, &nw_m, &tmp))
280 		return NULL;
281 	m = (struct matrix *)(&nw_m->_m);
282 	ret = needleman_wunsch_d(str1, strlen(str1), str2, strlen(str2), m);
283 	return PyFloat_FromDouble(ret);
284 }
285 
286 static PyObject *
NwMatrix_new(PyTypeObject * type,PyObject * args,PyObject * kwds)287 NwMatrix_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
288 {
289 	struct nw_matrix *self;
290 	self = (struct nw_matrix *)type->tp_alloc(type, 0);
291 	Py_INCREF(self);
292 	return (PyObject *)self;
293 }
294 
295 static void
NwMatrix_dealloc(struct nw_matrix * self)296 NwMatrix_dealloc(struct nw_matrix *self)
297 {
298 	Py_DECREF(self);
299 	self->ob_type->tp_free((PyObject*)self);
300 }
301 
302 static PyObject *
NwMatrix_init(struct nw_matrix * self,PyObject * args,PyObject * kwds)303 NwMatrix_init(struct nw_matrix *self, PyObject *args, PyObject *kwds)
304 {
305 	static char *kwlist[] = {"insertion", "conversion", NULL};
306 	double ins = 1.0, conv = 1.0;
307 	int x, y;
308 
309 	if (! PyArg_ParseTupleAndKeywords(args, kwds, "|dd", kwlist,
310 					  &ins, &conv))
311 		return NULL;
312 	/* initialize the matrix ... */
313         for (x = 0; x < 255; x++) {
314                 for (y = 0; y < 255; y++) {
315                         self->_m.conversion[x][y] = conv;
316                         self->_m.insertion[x][y] = ins;
317                 }
318         }
319 	Py_INCREF(self);
320 	return 0;
321 }
322 
323 static char pydistance__nw_matrix__setConversion__doc__[] =
324 	"setConversion(source, dest, cost)\n"
325 	"set the cost of a conversion";
326 
327 static PyObject *
pydistance_nw_matrix_setConversion(struct nw_matrix * self,PyObject * args)328 pydistance_nw_matrix_setConversion(struct nw_matrix * self, PyObject *args)
329 {
330         char *s, *d;
331 	int from, to;
332         double val;
333 
334         if (!PyArg_ParseTuple(args, "ssd", &s, &d, &val))
335 		return NULL;
336 
337 	from = s[0];
338 	to = d[0];
339 	Py_DECREF(self);
340 	Py_INCREF(self);
341 	self->_m.conversion[from][to] = val;
342 	return PyFloat_FromDouble(0.0);
343 }
344 
345 static char pydistance__nw_matrix__setInsertion__doc__[] =
346 	"setInsertion(source, dest, cost)\n"
347 	"set the cost of an insertion";
348 
349 static PyObject *
pydistance_nw_matrix_setInsertion(struct nw_matrix * self,PyObject * args)350 pydistance_nw_matrix_setInsertion(struct nw_matrix * self, PyObject *args)
351 {
352         char *s, *d;
353 	int from, to;
354         double val;
355 
356         if (!PyArg_ParseTuple(args, "ssd", &s, &d, &val))
357 		return NULL;
358 
359 	from = s[0];
360 	to = d[0];
361 	Py_DECREF(self);
362 	Py_INCREF(self);
363 	self->_m.insertion[from][to] = val;
364 	return PyFloat_FromDouble(0.0);
365 }
366 
367 #define mkMethod(x)                                             \
368     {#x, pydistance_##x, METH_VARARGS, pydistance__##x##__doc__}
369 
370 static PyMethodDef pydistance_methods[] = {
371 	mkMethod(levenshtein),
372 	mkMethod(hamming),
373 	mkMethod(damerau),
374 	mkMethod(jaccard),
375 	mkMethod(minkowski),
376 	mkMethod(manhattan),
377 	mkMethod(euclid),
378 	mkMethod(bloom),
379 	mkMethod(needleman_wunsch),
380 	{NULL, NULL}
381 };
382 
383 #define mkNwMethod(x)                                             \
384     {#x, pydistance_nw_matrix_##x, METH_VARARGS, pydistance__nw_matrix__##x##__doc__}
385 
386 static PyMethodDef nw_matrix_methods[] = {
387         mkNwMethod(setConversion),
388         mkNwMethod(setInsertion),
389         {NULL, NULL}
390 };
391 
392 /* see http://www.python.org/doc/2.3/ext/node22.html */
393 
394 static PyTypeObject distance_MatrixType = {
395     PyObject_HEAD_INIT(NULL)
396     0,                         /*ob_size*/
397     "distance.nw_matrix",      /*tp_name*/
398     sizeof(nw_matrix_t),         /*tp_basicsize*/
399     0,                         /*tp_itemsize*/
400     (destructor)NwMatrix_dealloc, /*tp_dealloc*/
401     0,                         /*tp_print*/
402     0,                         /*tp_getattr*/
403     0,                         /*tp_setattr*/
404     0,                         /*tp_compare*/
405     0,                         /*tp_repr*/
406     0,                         /*tp_as_number*/
407     0,                         /*tp_as_sequence*/
408     0,                         /*tp_as_mapping*/
409     0,                         /*tp_hash */
410     0,                         /*tp_call*/
411     0,                         /*tp_str*/
412     0,                         /*tp_getattro*/
413     0,                         /*tp_setattro*/
414     0,                         /*tp_as_buffer*/
415     Py_TPFLAGS_DEFAULT,        /*tp_flags*/
416     "Needleman Wunsch cost matrix",           /* tp_doc */
417     0,                         /* tp_traverse */
418     0,                         /* tp_clear */
419     0,                         /* tp_richcompare */
420     0,                         /* tp_weaklistoffset */
421     0,                         /* tp_iter */
422     0,                         /* tp_iternext */
423     nw_matrix_methods,         /* tp_methods */
424     0,				/* tp_members */
425     0,                         /* tp_getset */
426     0,                         /* tp_base */
427     0,                         /* tp_dict */
428     0,                         /* tp_descr_get */
429     0,                         /* tp_descr_set */
430     0,                         /* tp_dictoffset */
431     (initproc)NwMatrix_init,   /* tp_init */
432     0,                         /* tp_alloc */
433     NwMatrix_new,                 /* tp_new */
434 };
435 
436 
437 #if 0
438 static PyObject *
439 pydistance_init(PyObject *na, PyObject *args)
440 {
441 	if (!PyArg_ParseTuple(args, ":init")) return NULL;
442 	Py_INCREF(Py_None);
443 	return Py_None;
444 }
445 #endif 	/* 0 */
446 
447 DL_EXPORT(void)
initdistance(void)448 initdistance(void)
449 {
450 	PyObject *m;
451 	distance_MatrixType.tp_new = PyType_GenericNew;
452 	if (PyType_Ready(&distance_MatrixType) < 0)
453 		return;
454 	m = Py_InitModule3("distance", pydistance_methods, pydistance__doc__);
455 	pydistance_error = PyErr_NewException("distance.error", NULL, NULL);
456 	Py_INCREF(pydistance_error);
457 	Py_INCREF(&distance_MatrixType);
458 	PyModule_AddObject(m, "error", pydistance_error);
459 	PyModule_AddObject(m, "nw_matrix", (PyObject *)&distance_MatrixType);
460 	PyModule_AddStringConstant(m, "__version__", "0.2.1");
461 }
462