1 // crm_expr_clump.c - automatically cluster unlabelled documents
2
3 // Copyright 2009 William S. Yerazunis.
4 // This file is under GPLv3, as described in COPYING.
5
6 /////////////////////////////////////////////////////////////////////
7 // Original spec by Bill Yerazunis, original code by Joe Langeway,
8 // recode for CRM114 use by Bill Yerazunis.
9 //
10 // This code section (crm_expr_clump and subsidiary routines) is
11 // dual-licensed to both William S. Yerazunis and Joe Langeway,
12 // including the right to reuse this code in any way desired,
13 // including the right to relicense it under any other terms as
14 // desired.
15 /////////////////////////////////////////////////////////////////////
16
17 // This file is part of on going research and should not be considered
18 // a finished product, a reliable tool, an example of good software
19 // engineering, or a reflection of any quality of Joe's besides his
20 // tendency towards long hours.
21 //
22 // Here's what's going on:
23 //
24 // Documents are fed in with calls to "clump" and the distance between each
25 // document is recorded in a matrix. We then find clusters for automatic
26 // classification without the need for a gold standard judgement ahead
27 // of time.
28 //
29 // Cluster assignments start at index 1 and negative numbers indicate
30 // permanent assignments made by crm.
31
32 // include some standard files
33 #include "crm114_sysincludes.h"
34
35 // include any local crm114 configuration file
36 #include "crm114_config.h"
37
38 // include the crm114 data structures file
39 #include "crm114_structs.h"
40
41 // and include the routine declarations file
42 #include "crm114.h"
43
44 // the globals used when we need a big buffer - allocated once, used
45 // wherever needed. These are sized to the same size as the data window.
46 // Do not mutate/realloc these.
47 extern char *outbuf;
48
49 #define MAX_CLUSTERS 4096
50 #define CLUSTER_LABEL_LEN 32
51 #define DOCUMENT_TAG_LEN 32
52
53 typedef struct mythical_clumper_header
54 {
55 long max_documents, n_documents;
56 long document_offsets_offset;//list of offsets to documents
57 long clusters_offset; //cluster assignments of documents
58 long distance_matrix_offset;
59 long cluster_labels_offset;
60 long document_tags_offset;
61 long n_perma_clusters;
62 long file_length; //is the offset of new files when learning
63 long last_action; //0 = made clumps, non-zero means make clumps if you've got
64 // a chance and we're told not to
65 long n_clusters;
66 } CLUMPER_HEADER_STRUCT;
67
68 typedef struct mythical_clumper_state
69 {
70 char *file_origin;
71 CLUMPER_HEADER_STRUCT *header;
72 long *document_offsets;
73 long *cluster_assignments;
74 float *distance_matrix;
75 char (*cluster_labels)[CLUSTER_LABEL_LEN];
76 char (*document_tags)[DOCUMENT_TAG_LEN];
77 } CLUMPER_STATE_STRUCT;
78
79 // tracing for this module
80 int joe_trace = 0;
81
82 long max_documents = 1000;
83
make_new_clumper_backing_file(char * filename)84 static void make_new_clumper_backing_file(char *filename)
85 {
86 CLUMPER_HEADER_STRUCT H, *h = &H;
87 FILE *f;
88 long i;
89 h->max_documents = max_documents;
90 h->n_documents = 0;
91 h->document_offsets_offset = sizeof(CLUMPER_HEADER_STRUCT);
92 h->clusters_offset = h->document_offsets_offset +
93 sizeof(long) * max_documents;
94 h->distance_matrix_offset = h->clusters_offset +
95 sizeof(long) * max_documents;
96 h->cluster_labels_offset = h->distance_matrix_offset + ( sizeof(float) *
97 max_documents * (max_documents + 1) / 2);
98 h->document_tags_offset = h->cluster_labels_offset +
99 ( sizeof(char) * max_documents * CLUSTER_LABEL_LEN);
100 h->file_length = h->document_tags_offset +
101 ( sizeof(char) * max_documents * DOCUMENT_TAG_LEN);
102 h->n_perma_clusters = 0;
103 h->n_clusters = 0;
104 crm_force_munmap_filename(filename);
105 f = fopen(filename, "wb");
106 dontcare = fwrite(h, 1, sizeof(CLUMPER_HEADER_STRUCT), f);
107 i = h->file_length - sizeof(CLUMPER_HEADER_STRUCT);
108 if(joe_trace)
109 fprintf(stderr, "about to write %ld zeros to backing file\n", i);
110 while(i--)
111 fputc('\0', f);
112 fflush(f);
113 fclose(f);
114 if(joe_trace)
115 fprintf(stderr, "Just wrote backing file for clumper size %ld\n",
116 h->file_length);
117 }
118
map_file(CLUMPER_STATE_STRUCT * s,char * filename)119 static int map_file(CLUMPER_STATE_STRUCT *s, char *filename)
120 {
121 struct stat statee;
122 if(stat(filename, &statee))
123 {
124 nonfatalerror("Couldn't stat file!", filename);
125 return -1;
126 }
127
128 s->file_origin = crm_mmap_file
129 (filename,
130 0, statee.st_size,
131 PROT_READ | PROT_WRITE,
132 MAP_SHARED,
133 NULL);
134 if(s->file_origin == MAP_FAILED)
135 {
136 nonfatalerror("Couldn't mmap file!", filename);
137 return -1;
138 }
139 if(joe_trace)
140 fprintf(stderr,"Definitely think I've mapped a file.\n");
141
142 s->header = (CLUMPER_HEADER_STRUCT *)(s->file_origin);
143 s->document_offsets =
144 (void *)( s->file_origin + s->header->document_offsets_offset );
145 s->cluster_assignments =
146 (void *)( s->file_origin + s->header->clusters_offset );
147 s->distance_matrix =
148 (void *)( s->file_origin + s->header->distance_matrix_offset );
149 s->cluster_labels =
150 (void *)( s->file_origin + s->header->cluster_labels_offset );
151 s->document_tags =
152 (void *)( s->file_origin + s->header->document_tags_offset );
153 return 0;
154 }
155
unmap_file(CLUMPER_STATE_STRUCT * s)156 static void unmap_file(CLUMPER_STATE_STRUCT *s)
157 {
158 crm_munmap_file ((void *) s->file_origin);
159 }
160
aref_dist_mat(float * m,int j,int i)161 static float *aref_dist_mat(float *m, int j, int i)
162 {
163 if(i < j) {int t = i; i = j; j = t;}
164 return m + i * (i - 1) / 2 + j;
165 }
166
get_document_affinity(unsigned int * doc1,unsigned int * doc2)167 static float get_document_affinity(unsigned int *doc1, unsigned int *doc2)
168 {
169 int u = 0, l1 = 0, l2 = 0;
170 for(;;)
171 if(doc1[l1] == 0)
172 {
173 while(doc2[l2] != 0) l2++;
174 break;
175 }
176 else if(doc2[l2] == 0)
177 {
178 while(doc1[l1] != 0) l1++;
179 break;
180 }
181 else if(doc1[l1] == doc2[l2])
182 {
183 u++; l1++; l2++;
184 }
185 else if(doc1[l1] < doc2[l2])
186 l1++;
187 else if(doc1[l1] > doc2[l2])
188 l2++;
189 else
190 {
191 fprintf(stderr, "panic in the disco! ");
192 break;
193 }
194 if(joe_trace)
195 fprintf(stderr, "Just compared two documents u=%d l1=%d l2=%d\n",
196 u, l1, l2);
197 return pow((double)(1.0 + u * u) / (double)(1.0 + l1 * l2), 0.2);
198 }
199
compare_features(const void * a,const void * b)200 static int compare_features(const void *a, const void *b)
201 {
202 if(*(unsigned int *)a < *(unsigned int *)b)
203 return -1;
204 if(*(unsigned int *)a > *(unsigned int *)b)
205 return 1;
206 return 0;
207 }
208
eat_document(char * text,long text_len,long * ate,regex_t * regee,unsigned int * feature_space,long max_features,long long flags)209 static int eat_document
210 ( char *text, long text_len, long *ate,
211 regex_t *regee,
212 unsigned int *feature_space, long max_features,
213 long long flags)
214 {
215 long n_features = 0, i, j;
216 unsigned int hash_pipe[OSB_BAYES_WINDOW_LEN];
217 int hash_coefs[] = { 1, 3, 5, 11, 23, 47};
218 regmatch_t match[1];
219 char *t_start;
220 long t_len;
221 long f;
222 unsigned long long unigram, unique, string;
223
224 unigram = flags & CRM_UNIGRAM;
225 string = flags & CRM_STRING;
226 unique = flags & (CRM_UNIQUE | CRM_STRING);
227
228 *ate = 0;
229
230 for(i = 0; i < OSB_BAYES_WINDOW_LEN; i++)
231 hash_pipe[i] = 0xdeadbeef;
232 while(text_len > 0 && n_features < max_features - 1)
233 {
234 if(crm_regexec (regee, text, text_len, 1, match, 0, NULL))
235 //no match or regex error, we're done
236 break;
237 else
238 {
239 t_start = text + match->rm_so;
240 t_len = match->rm_eo - match->rm_so;
241 if(string)
242 {
243 text += match->rm_so + 1;
244 text_len -= match->rm_so + 1;
245 *ate += match->rm_so + 1;
246 }else
247 {
248 text += match->rm_eo;
249 text_len -= match->rm_eo;
250 *ate += match->rm_eo;
251 }
252
253 for(i = OSB_BAYES_WINDOW_LEN - 1; i > 0; i--)
254 hash_pipe[i] = hash_pipe[i - 1];
255 hash_pipe[0] = strnhash(t_start, t_len);
256 }
257 f = 0;
258 if(unigram)
259 feature_space[n_features++] = hash_pipe[0];
260 else
261 for(i = 1; i < OSB_BAYES_WINDOW_LEN && hash_pipe[i] != 0xdeadbeef; i++)
262 feature_space[n_features++] =
263 hash_pipe[0] + hash_pipe[i] * hash_coefs[i];
264
265 }
266 qsort(feature_space, n_features, sizeof(unsigned int), compare_features);
267
268 if(unique)
269 {
270 i = 0;
271 for(j = 1; j < n_features; j++)
272 if(feature_space[i] != feature_space[j])
273 feature_space[++i] = feature_space[j];
274 feature_space[++i] = 0;
275 n_features = i + 1; //the zero counts
276 } else
277 feature_space[n_features++] = 0;
278 return n_features;
279 }
280
281
find_closest_document(CLUMPER_STATE_STRUCT * s,char * text,long text_len,regex_t * regee,long long flags)282 static long find_closest_document
283 (CLUMPER_STATE_STRUCT *s,
284 char *text, long text_len,
285 regex_t *regee,
286 long long flags)
287 {
288 unsigned int feature_space[32768];
289 long n, i, b = -1;
290 float b_s = 0.0, n_s;
291 n = eat_document(text, text_len, &i,
292 regee, feature_space, 32768,
293 flags);
294 for(i = 0; i < s->header->n_documents; i++)
295 {
296 n_s = get_document_affinity
297 (feature_space, (unsigned int *)(s->file_origin + s->document_offsets[i]));
298 if(n_s > b_s)
299 {
300 b = i;
301 b_s = n_s;
302 }
303 }
304 return b;
305 }
306
307 typedef struct mythical_cluster_head
308 {
309 struct mythical_cluster_head *head, *next_head, *prev_head, *next;
310 long count;
311 } CLUSTER_HEAD_STRUCT;
312
join_clusters(CLUSTER_HEAD_STRUCT * a,CLUSTER_HEAD_STRUCT * b)313 static void join_clusters(CLUSTER_HEAD_STRUCT *a, CLUSTER_HEAD_STRUCT *b)
314 {
315 if(joe_trace)
316 fprintf(stderr, "Joining clusters of sizes %ld and %ld\n,",
317 a->head->count, b->head->count);
318
319 while(a->next) a = a->next;
320 b = b->head;
321 a->next = b;
322 a->head->count += b->count;
323 b->count = 0; //though we wont actually touch this value anymore
324 if(b->prev_head)
325 b->prev_head->next_head = b->next_head;
326 if(b->next_head)
327 b->next_head->prev_head = b->prev_head;
328 b->next_head = NULL;
329 b->prev_head = NULL;
330 do
331 b->head = a->head;
332 while( (b = b->next) );
333 }
334
agglomerative_averaging_cluster(CLUMPER_STATE_STRUCT * s,long goal)335 static void agglomerative_averaging_cluster(CLUMPER_STATE_STRUCT *s, long goal)
336 {
337 long i, j, k, l, n = s->header->n_documents;
338 CLUSTER_HEAD_STRUCT *clusters = malloc(n * sizeof(CLUSTER_HEAD_STRUCT)),
339 *a, *b, *c, first_head_ptr;
340 float *M = malloc( (n * (n + 1) / 2 - 1) * sizeof(float)), d, e;
341 long ck, cl, ckl;
342
343 if(joe_trace)
344 fprintf(stderr, "agglomerative averaging clustering...\n");
345
346 for(i = 1; i < s->header->n_documents - 1; i++)
347 {
348 clusters[i].head = &clusters[i];
349 clusters[i].prev_head = &clusters[i - 1];
350 clusters[i].next_head = &clusters[i + 1];
351 clusters[i].next = NULL;
352 clusters[i].count = 1;
353 }
354 clusters[0].head = &clusters[0];
355 clusters[0].prev_head = &first_head_ptr;
356 clusters[0].next_head = &clusters[1];
357 clusters[0].next = NULL;
358 clusters[0].count = 1;
359 if(s->header->n_documents > 1) //don't muck the first one!
360 {
361 clusters[s->header->n_documents-1].head =
362 &clusters[s->header->n_documents - 1];
363 clusters[s->header->n_documents - 1].prev_head =
364 &clusters[s->header->n_documents - 2];
365 clusters[s->header->n_documents - 1].next = NULL;
366 clusters[s->header->n_documents - 1].count = 1;
367 }
368 //always make sure the chain ends
369 clusters[s->header->n_documents - 1].next_head = NULL;
370
371 first_head_ptr.next_head = &clusters[0];
372
373 j = (n * (n + 1) / 2 - 1);
374 for(i = 0; i < j; i++)
375 M[i] = s->distance_matrix[i];
376
377 for(a = first_head_ptr.next_head; a; a = a->next_head)
378 if(s->cluster_assignments[a - clusters] < 0)
379 for(b = a->next_head; b; b = b->next_head)
380 if(s->cluster_assignments[a - clusters]
381 == s->cluster_assignments[b - clusters])
382 {
383 k = a - clusters;
384 l = b - clusters;
385 ck = clusters[k].count;
386 cl = clusters[l].count;
387 ckl = ck + cl;
388
389 for(c = &clusters[0]; c; c = c->next_head)
390 {
391 i = c - clusters;
392 if(i == k || i == l)
393 continue;
394 *aref_dist_mat(M, k, i) = (ck * *aref_dist_mat(M, k, i) +
395 cl * *aref_dist_mat(M, l, i) ) / ckl;
396 *aref_dist_mat(M, l, i) = 0.0;
397 }
398 join_clusters(&clusters[k], &clusters[l]);
399 n--;
400 }
401
402 while(n > goal)
403 {
404 l = 0; k = 0;
405 d = -1.0;
406 for(a = first_head_ptr.next_head; a; a = a->next_head)
407 for(b = a->next_head; b; b = b->next_head)
408 {
409 i = a - clusters;
410 j = b - clusters;
411 if(s->cluster_assignments[i] < 0 && s->cluster_assignments[j] < 0)
412 e = *aref_dist_mat(M, i, j) = -1000000000.0;
413 else
414 e = *aref_dist_mat(M, i, j);
415 if( e > d )
416 {
417 if(s->cluster_assignments[j] < 0)
418 {
419 k = j;
420 l = i;
421 } else
422 {
423 k = i;
424 l = j;
425 }
426 d = e;
427 }
428 }
429 if(l == 0 && k == 0)
430 {
431 fprintf(stderr, "CLUMP FAILED TO JOIN ENOUGH CLUMPS!\n");
432 break;
433 }
434 ck = clusters[k].count;
435 cl = clusters[l].count;
436 ckl = ck + cl;
437
438 for(a = &clusters[0]; a; a = a->next_head)
439 {
440 i = a - clusters;
441 if(i == k || i == l)
442 continue;
443 *aref_dist_mat(M, k, i) = (ck * *aref_dist_mat(M, k, i) +
444 cl * *aref_dist_mat(M, l, i) ) / ckl;
445 *aref_dist_mat(M, l, i) = 0.0;
446 }
447 join_clusters(&clusters[k], &clusters[l]);
448 n--;
449 }
450
451 i = s->header->n_perma_clusters + 1;
452 for(a = &clusters[0]; a; a = a->next_head)
453 {
454 if(s->cluster_assignments[a - clusters] < 0)
455 j = -s->cluster_assignments[a - clusters];
456 else
457 j = i++;
458 for(b = a; b; b = b->next)
459 s->cluster_assignments[b - clusters] = j;
460 }
461
462 s->header->n_clusters = n;
463 free(M);
464 }
465
index_to_pair(long t,long * i,long * j)466 static void index_to_pair(long t, long *i, long *j)
467 {
468 long p = 2 * t;
469 *i = (long)sqrt(p);
470 if(*i * *i + *i > p)
471 (*i)--;
472 *j = t - (*i * (*i + 1) / 2);
473 }
474
compare_float_ptrs(const void * a,const void * b)475 static int compare_float_ptrs(const void *a, const void *b)
476 {
477 if(**(float **)a > **(float **)b)
478 return 1;
479 if(**(float **)a < **(float **)b)
480 return -1;
481 return 0;
482 }
483
agglomerative_nearest_cluster(CLUMPER_STATE_STRUCT * s,long goal)484 static void agglomerative_nearest_cluster(CLUMPER_STATE_STRUCT *s, long goal)
485 {
486 long i, j, k, l, m, n = s->header->n_documents;
487 CLUSTER_HEAD_STRUCT *clusters = malloc(n * sizeof(CLUSTER_HEAD_STRUCT)),
488 *a, *b, first_head_ptr;
489 float **M = malloc( (n * (n + 1) / 2 - 1) * sizeof(float *));
490
491 if(joe_trace)
492 fprintf(stderr, "agglomerative nearest clustering...\n");
493
494 for(i = 1; i < s->header->n_documents - 1; i++)
495 {
496 clusters[i].head = &clusters[i];
497 clusters[i].prev_head = &clusters[i - 1];
498 clusters[i].next_head = &clusters[i + 1];
499 clusters[i].next = NULL;
500 clusters[i].count = 1;
501 }
502 clusters[0].head = &clusters[0];
503 clusters[0].prev_head = &first_head_ptr;
504 clusters[0].next_head = &clusters[1];
505 clusters[0].next = NULL;
506 clusters[0].count = 1;
507 if(s->header->n_documents > 1) //don't muck the first one!
508 {
509 clusters[s->header->n_documents - 1].head =
510 &clusters[s->header->n_documents - 1];
511 clusters[s->header->n_documents - 1].prev_head =
512 &clusters[s->header->n_documents - 2];
513 clusters[s->header->n_documents - 1].next = NULL;
514 clusters[s->header->n_documents - 1].count = 1;
515 }
516 //always make sure the chain ends
517 clusters[s->header->n_documents - 1].next_head = NULL;
518
519 first_head_ptr.next_head = &clusters[0];
520
521 j = (n * (n + 1) / 2 - 1);
522 for(i = 0; i < j; i++)
523 M[i] = &s->distance_matrix[i];
524 qsort(M, j, sizeof(float *), compare_float_ptrs);
525
526 for(a = first_head_ptr.next_head; a; a = a->next_head)
527 if(s->cluster_assignments[a - clusters] < 0)
528 for(b = a->next_head; b; b = b->next_head)
529 if(s->cluster_assignments[a - clusters]
530 == s->cluster_assignments[b - clusters])
531 {
532 k = a - clusters;
533 l = b - clusters;
534 join_clusters(&clusters[k], &clusters[l]);
535 n--;
536 }
537 i = j;
538 while(n > goal)
539 {
540 do
541 {
542 k = M[--i] - s->distance_matrix;
543 index_to_pair(k, &l, &m);
544 } while(clusters[m].head == clusters[l].head);
545
546 join_clusters(&clusters[m], &clusters[l]);
547 n--;
548 }
549
550 i = s->header->n_perma_clusters + 1;
551 for(a = &clusters[0]; a; a = a->next_head)
552 {
553 if(s->cluster_assignments[a - clusters] < 0)
554 j = -s->cluster_assignments[a - clusters];
555 else
556 j = i++;
557 for(b = a; b; b = b->next)
558 s->cluster_assignments[b - clusters] = j;
559 }
560 free(M);
561 s->header->n_clusters = n;
562 }
563
square(double a)564 double square(double a) {return a * a;}
minf(double a,double b)565 double minf(double a, double b) {return a < b ? a : b;}
566
567 #define H_BUCKETS 50
thresholding_average_cluster(CLUMPER_STATE_STRUCT * s)568 static void thresholding_average_cluster(CLUMPER_STATE_STRUCT *s)
569 {
570 long i, j, k, l, ck, cl, ckl, n = s->header->n_documents;
571 CLUSTER_HEAD_STRUCT *clusters = malloc(n * sizeof(CLUSTER_HEAD_STRUCT)),
572 *a, *b, *c, first_head_ptr;
573 long H[H_BUCKETS], C[H_BUCKETS];
574 float A[H_BUCKETS], t_A, t, t_score, scoro,gM;
575 float min, max, scale;
576 float *M = malloc( (n * (n + 1) / 2 - 1) * sizeof(float)), d, e;
577
578 if(joe_trace)
579 fprintf(stderr, "threshold average clustering...\n");
580
581 for(i = 1; i < s->header->n_documents - 1; i++)
582 {
583 clusters[i].head = &clusters[i];
584 clusters[i].prev_head = &clusters[i - 1];
585 clusters[i].next_head = &clusters[i + 1];
586 clusters[i].next = NULL;
587 clusters[i].count = 1;
588 }
589 clusters[0].head = &clusters[0];
590 clusters[0].prev_head = &first_head_ptr;
591 clusters[0].next_head = &clusters[1];
592 clusters[0].next = NULL;
593 clusters[0].count = 1;
594 if(s->header->n_documents > 1) //don't muck the first one!
595 {
596 clusters[s->header->n_documents-1].head =
597 &clusters[s->header->n_documents - 1];
598 clusters[s->header->n_documents - 1].prev_head =
599 &clusters[s->header->n_documents - 2];
600 clusters[s->header->n_documents - 1].next = NULL;
601 clusters[s->header->n_documents - 1].count = 1;
602 }
603 //always make sure the chain ends
604 clusters[s->header->n_documents - 1].next_head = NULL;
605
606 first_head_ptr.next_head = &clusters[0];
607
608 j = (n * (n + 1) / 2 - 1);
609 for(i = 0; i < j; i++)
610 M[i] = s->distance_matrix[i];
611
612
613 for(i = 0; i < H_BUCKETS; i++)
614 H[i] = 0.0;
615 j = n * (n - 1) / 2;
616 min = 1000000000.0;
617 max = -1000000000.0;
618
619 for(i = 0; i < j; i++)
620 {
621 if(M[i] < min)
622 min = s->distance_matrix[i];
623 if(M[i] > max)
624 max = s->distance_matrix[i];
625 }
626 scale = (max - min) / ((float)H_BUCKETS - 0.1);
627 if (scale == 0)
628 scale = 1.0;
629 for(i = 0; i < j; i++)
630 H[ (int)( (M[i] - min) / scale ) ]++;
631 if(joe_trace)
632 {
633 fprintf(stderr, "Histogram of document distances:\n");
634 for(i = 0; i < H_BUCKETS; i++)
635 {
636 for(k = 0; k < H[i]; k += 100)
637 fputc('*', stderr);
638 fputc('\n', stderr);
639 }
640 }
641
642 k = 0;
643 t_A = 0.0;
644 for(i = 0; i < H_BUCKETS; i++)
645 {
646 k = C[i] = H[i] + k;
647 t_A = A[i] = H[i] * (min + (i + 0.5) * scale) + t_A;
648 }
649 gM = t_A / (float)j;
650 t_score = 0.0;
651 t = -1.0;
652 for(i = 2; i < H_BUCKETS - 2; i++)
653 {
654 scoro = square(gM - (t_A - A[i]) / (k - C[i])) * (k - C[i])
655 + square(gM - A[i] / C[i]) * C[i];
656 if(scoro > t_score)
657 {
658 t_score = scoro;
659 t = min + scale * (float)(i );
660 }
661 }
662 if(joe_trace)
663 fprintf(stderr, "min = %f, max = %f, t = %f\n", min, max, t);
664 for(a = first_head_ptr.next_head; a; a = a->next_head)
665 if(s->cluster_assignments[a - clusters] < 0)
666 for(b = a->next_head; b; b = b->next_head)
667 if(s->cluster_assignments[a - clusters]
668 == s->cluster_assignments[b - clusters])
669 {
670 k = a - clusters;
671 l = b - clusters;
672 ck = clusters[k].count;
673 cl = clusters[l].count;
674 ckl = ck + cl;
675
676 for(c = &clusters[0]; c; c = c->next_head)
677 {
678 i = c - clusters;
679 if(i == k || i == l)
680 continue;
681 //*aref_dist_mat(M, k, i) = (ck * *aref_dist_mat(M, k, i) +
682 // cl * *aref_dist_mat(M, l, i) ) / ckl;
683 *aref_dist_mat(M, k, i) =
684 minf(*aref_dist_mat(M, k, i), *aref_dist_mat(M, l, i));
685 *aref_dist_mat(M, l, i) = 0.0;
686 }
687 join_clusters(&clusters[k], &clusters[l]);
688 n--;
689 }
690
691 for(;;)
692 {
693 l = 0; k = 0;
694 d = -1.0;
695 for(a = first_head_ptr.next_head; a; a = a->next_head)
696 for(b = a->next_head; b; b = b->next_head)
697 {
698 i = a - clusters;
699 j = b - clusters;
700 if(s->cluster_assignments[i] < 0 && s->cluster_assignments[j] < 0)
701 {
702 e = *aref_dist_mat(M, i, j) = -1000000000.0;
703 if(joe_trace)
704 fprintf(stderr, rand() & 0x1 ? "wonk!\n" : " wonk!\n");
705 }
706 else
707 e = *aref_dist_mat(M, i, j);
708 if( e > d )
709 {
710 if(s->cluster_assignments[j] < 0)
711 {
712 k = j;
713 l = i;
714 } else
715 {
716 k = i;
717 l = j;
718 }
719 d = e;
720 }
721 }
722 if(joe_trace)
723 fprintf(stderr, "l = %ld, k = %ld, d = %f\n", l, k, d);
724 if( (l == 0 && k == 0) || d < t) //we're done
725 break;
726
727 ck = clusters[k].count;
728 cl = clusters[l].count;
729 ckl = ck + cl;
730
731 for(a = &clusters[0]; a; a = a->next_head)
732 {
733 i = a - clusters;
734 if(i == k || i == l)
735 continue;
736 *aref_dist_mat(M, k, i) = (ck * *aref_dist_mat(M, k, i) +
737 cl * *aref_dist_mat(M, l, i) ) / ckl;
738 *aref_dist_mat(M, l, i) = 0.0;
739 }
740 join_clusters(&clusters[k], &clusters[l]);
741 n--;
742 }
743
744 i = s->header->n_perma_clusters + 1;
745 for(a = &clusters[0]; a; a = a->next_head)
746 {
747 if(s->cluster_assignments[a - clusters] < 0)
748 j = -s->cluster_assignments[a - clusters];
749 else
750 j = i++;
751 for(b = a; b; b = b->next)
752 s->cluster_assignments[b - clusters] = j;
753 }
754
755 s->header->n_clusters = n;
756 free(M);
757
758 }
759
assign_perma_cluster(CLUMPER_STATE_STRUCT * s,long doc,char * lab)760 static void assign_perma_cluster(CLUMPER_STATE_STRUCT *s,
761 long doc,
762 char *lab)
763 {
764 long i;
765 for(i = 1; i <= s->header->n_perma_clusters; i++)
766 if(0 == strcmp(s->cluster_labels[i], lab))
767 break;
768 if(i > s->header->n_perma_clusters)
769 {
770 i = ++(s->header->n_perma_clusters);
771 strcpy(s->cluster_labels[i], lab);
772 }
773 s->cluster_assignments[doc] = -i;
774 }
775
max(long a,long b)776 long max(long a, long b) {return a > b ? a : b;}
777
crm_expr_clump(CSL_CELL * csl,ARGPARSE_BLOCK * apb)778 int crm_expr_clump(CSL_CELL *csl, ARGPARSE_BLOCK *apb)
779 {
780 char *txtptr;
781 long txtstart;
782 long txtlen;
783 char htext[MAX_PATTERN];
784 char filename[MAX_PATTERN];
785 long htext_len;
786
787 char regex_text[MAX_PATTERN]; // the regex pattern
788 long regex_text_len;
789 char param_text[MAX_PATTERN];
790 long param_text_len;
791 unsigned long long bychunk;
792 int n_clusters = 0;
793
794 char tag[DOCUMENT_TAG_LEN];
795 char class[CLUSTER_LABEL_LEN];
796
797 struct stat statbuf;
798
799 CLUMPER_STATE_STRUCT S, *s = &S;
800
801 regex_t regee;
802 regmatch_t matchee[2];
803
804 long i, j, k, l;
805
806
807 if (crm_exec_box_restriction(csl, apb, &txtptr, &txtstart, &txtlen) != 0)
808 return 0;
809
810 crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len);
811 htext_len = apb->p1len;
812 htext_len = crm_nexpandvar (htext, htext_len, MAX_PATTERN);
813 i = 0;
814 while (htext[i] < 0x021) i++;
815 j = i;
816 while (htext[j] >= 0x021) j++;
817 htext[j] = '\000';
818 strcpy (filename, &htext[i]);
819
820 //use regex_text and regee to grab parameters
821 crm_get_pgm_arg (param_text, MAX_PATTERN, apb->s2start, apb->s2len);
822 param_text_len = apb->s2len;
823 param_text_len = crm_nexpandvar (param_text, param_text_len, MAX_PATTERN);
824
825 param_text[ param_text_len ] = '\0';
826 if(joe_trace)
827 fprintf( stderr, "param_text = %s\n", param_text );
828
829 strcpy(regex_text, "n_clusters[[:space:]]*=[[:space:]]*([0-9]+)");
830 if( crm_regcomp (®ee, regex_text, strlen(regex_text), REG_EXTENDED) )
831 {
832 nonfatalerror("Problem compiling regex to grab params:", regex_text);
833 return 0;
834 }
835 if(!crm_regexec (®ee, param_text, param_text_len, 2, matchee, 0, NULL))
836 {
837 param_text[matchee[1].rm_eo + 1] = '\0';
838 if(joe_trace)
839 fprintf(stderr, "¶m_text[matchee[1].rm_so] = %s\n",
840 ¶m_text[matchee[1].rm_so]);
841 n_clusters = atol(¶m_text[matchee[1].rm_so]);
842 if(joe_trace)
843 fprintf(stderr, "n_clusters = %d\n", n_clusters);
844 }
845 strcpy(regex_text, "tag[[:space:]]*=[[:space:]]*([[:graph:]]+)");
846 if( crm_regcomp (®ee, regex_text, strlen(regex_text), REG_EXTENDED) )
847 {
848 nonfatalerror("Problem compiling regex to grab params:", regex_text);
849 return 0;
850 }
851 if(!crm_regexec (®ee, param_text, param_text_len, 2, matchee, 0, NULL))
852 {
853 param_text[matchee[1].rm_eo] = '\0';
854 strcpy(tag, ¶m_text[matchee[1].rm_so]);
855 } else
856 tag[0] = '\0';
857 strcpy(regex_text, "clump[[:space:]]*=[[:space:]]*([[:graph:]]+)");
858 if( crm_regcomp (®ee, regex_text, strlen(regex_text), REG_EXTENDED) )
859 {
860 nonfatalerror("Problem compiling regex to grab params:", regex_text);
861 return 0;
862 }
863 if(!crm_regexec (®ee, param_text, param_text_len, 2, matchee, 0, NULL))
864 {
865 param_text[matchee[1].rm_eo] = '\0';
866 strcpy(class, ¶m_text[matchee[1].rm_so]);
867 } else
868 class[0] = '\0';
869
870 strcpy(regex_text, "max_documents[[:space:]]*=[[:space:]]*([[:graph:]]+)");
871 if( crm_regcomp (®ee, regex_text, strlen(regex_text), REG_EXTENDED) )
872 {
873 nonfatalerror("Problem compiling regex to grab params:", regex_text);
874 return 0;
875 }
876 if(!crm_regexec (®ee, param_text, param_text_len, 2, matchee, 0, NULL))
877 {
878 param_text[matchee[1].rm_eo] = '\0';
879 max_documents = atol(¶m_text[matchee[1].rm_so]);
880 }
881 //we've already got a default max_documents
882
883 crm_get_pgm_arg (regex_text, MAX_PATTERN, apb->s1start, apb->s1len);
884 regex_text_len = apb->s1len;
885 if(regex_text_len == 0)
886 {
887 strcpy(regex_text, "[[:graph:]]+");
888 regex_text_len = strlen( regex_text );
889 }
890 regex_text[regex_text_len] = '\0';
891 regex_text_len = crm_nexpandvar (regex_text, regex_text_len, MAX_PATTERN);
892 if( crm_regcomp (®ee, regex_text, regex_text_len, REG_EXTENDED) )
893 {
894 nonfatalerror("Problem compiling this regex:", regex_text);
895 return 0;
896 }
897
898 bychunk = apb->sflags & CRM_BYCHUNK;
899
900 if (apb->sflags & CRM_REFUTE)
901 {
902 if(map_file(s, filename))
903 //we already nonfatalerrored
904 return 0;
905 if(tag[0]) {
906 for(i = s->header->n_documents; i >= 0; i--)
907 if(0 == strcmp(tag, s->document_tags[i]))
908 break;
909 }
910 else
911 i = find_closest_document(s, txtptr + txtstart, txtlen,
912 ®ee, apb->sflags);
913 if(i < 0)
914 {
915 unmap_file(s);
916 return 0;
917 }
918 memmove(s->file_origin + s->document_offsets[i],
919 s->file_origin + s->document_offsets[i + 1],
920 (s->header->file_length - s->document_offsets[i + 1]) );
921 memmove(&s->document_tags[i],
922 &s->document_tags[i + 1],
923 sizeof(char) * DOCUMENT_TAG_LEN *(s->header->n_documents - i - 1 ));
924 memmove(&s->cluster_labels[i],
925 &s->cluster_labels[i + 1],
926 sizeof(char) * CLUSTER_LABEL_LEN*(s->header->n_documents - i - 1 ));
927 s->header->n_documents--;
928 j = s->document_offsets[i + 1] - s->document_offsets[i];
929 for(k = i; k < s->header->n_documents; k++)
930 {
931 s->document_offsets[k] = s->document_offsets[k + 1] - j;
932 s->cluster_assignments[k] = s->cluster_assignments[k + 1];
933 }
934 s->header->file_length -= j;
935 for(k = 0; k < s->header->n_documents; k++)
936 for(l = max(k + 1, i); l < s->header->n_documents; l++)
937 *aref_dist_mat(s->distance_matrix, k, l) =
938 *aref_dist_mat(s->distance_matrix, k, l + 1);
939 if(n_clusters > 0)
940 {
941 if(bychunk)
942 agglomerative_averaging_cluster(s, n_clusters);
943 else
944 agglomerative_nearest_cluster(s, n_clusters);
945 }
946 else if(n_clusters == 0)
947 {
948 if(bychunk)
949 thresholding_average_cluster(s);
950 else
951 thresholding_average_cluster(s);
952 }
953 l = s->header->file_length;
954
955 unmap_file(s);
956 crm_force_munmap_filename(filename);
957 dontcare = truncate(filename, l);
958 return 0;
959 } else
960 { //LEARNIN'!
961 long n;
962 unsigned int feature_space[32768];
963 FILE *f;
964 if(stat(filename, &statbuf))
965 make_new_clumper_backing_file(filename);
966 if(txtlen == 0)
967 {
968 if(tag[0] && class[0]) //is not null
969 {
970 if(map_file(s, filename))
971 //we already nonfatalerrored
972 return 0;
973 for(i = s->header->n_documents - 1; i >= 0; i++)
974 if(0 == strcmp(tag, s->document_tags[i]))
975 break;
976 if(i >= 0)
977 assign_perma_cluster(s, i, class);
978 unmap_file(s);
979 }
980 return 0;
981 }
982
983 n = eat_document
984 ( txtptr + txtstart, txtlen, &i,
985 ®ee,
986 feature_space, 32768,
987 apb->sflags );
988 crm_force_munmap_filename(filename);
989 f = fopen(filename, "ab+");
990 dontcare = fwrite(feature_space, n, sizeof(unsigned int), f);
991 fclose(f);
992 if(map_file(s, filename))
993 //we already nonfatalerrored
994 return 0;
995 if(s->header->n_documents >= s->header->max_documents)
996 {
997 nonfatalerror("This clump backing file is full and cannot"
998 " assimilate new documents!", filename);
999 unmap_file(s);
1000 return 0;
1001 }
1002 i = s->header->n_documents++;
1003
1004 s->document_offsets[i] = s->header->file_length;
1005 s->header->file_length += sizeof(unsigned int) * n;
1006 for(j = 0; j < i; j++)
1007 *aref_dist_mat(s->distance_matrix, j, i) = get_document_affinity(
1008 (unsigned int *)( s->file_origin + s->document_offsets[i]),
1009 (unsigned int *)( s->file_origin + s->document_offsets[j]) );
1010 strcpy(s->document_tags[i], tag);
1011 if(class[0])
1012 assign_perma_cluster(s, i, class);
1013 else
1014 s->cluster_assignments[i] = 0;
1015
1016 if(n_clusters > 0)
1017 {
1018 if(bychunk)
1019 agglomerative_averaging_cluster(s, n_clusters);
1020 else
1021 agglomerative_nearest_cluster(s, n_clusters);
1022 }
1023 else if(n_clusters == 0)
1024 {
1025 if(bychunk)
1026 thresholding_average_cluster(s);
1027 else
1028 thresholding_average_cluster(s);
1029 }
1030 unmap_file(s);
1031 return 0;
1032 }
1033 }
1034
sprint_lab(CLUMPER_STATE_STRUCT * s,char * b,int l)1035 int sprint_lab(CLUMPER_STATE_STRUCT *s, char *b, int l)
1036 {
1037 if(l == 0)
1038 {
1039 strcpy(b, "unassigned");
1040 return strlen("unassigned");
1041 }
1042 if(s->cluster_labels[l][0] != '\0')
1043 return sprintf(b, "%s", s->cluster_labels[l]);
1044 else
1045 return sprintf(b, "clump_#%d", l);
1046 }
1047
sprint_tag(CLUMPER_STATE_STRUCT * s,char * b,int d)1048 int sprint_tag(CLUMPER_STATE_STRUCT *s, char *b, int d)
1049 {
1050 if(s->document_tags[d][0] != '\0')
1051 return sprintf(b, "%s", s->document_tags[d]);
1052 else
1053 return sprintf(b, "document_#%d", d);
1054 }
1055
crm_expr_pmulc(CSL_CELL * csl,ARGPARSE_BLOCK * apb)1056 int crm_expr_pmulc(CSL_CELL *csl, ARGPARSE_BLOCK *apb)
1057 {
1058 char *txtptr;
1059 long txtstart;
1060 long txtlen;
1061 char htext[MAX_PATTERN];
1062 char filename[MAX_PATTERN];
1063 long htext_len;
1064
1065 char regex_text[MAX_PATTERN]; // the regex pattern
1066 long regex_text_len;
1067 unsigned long long bychunk;
1068
1069 char out_var[MAX_PATTERN];
1070 long out_var_len;
1071
1072 float A[MAX_CLUSTERS], T;
1073 long N[MAX_CLUSTERS];
1074 double p[MAX_CLUSTERS], pR[MAX_CLUSTERS];
1075
1076 unsigned int feature_space[32768];
1077 long n;
1078
1079 long closest_doc = -1;
1080 float closest_doc_affinity;
1081
1082 long out_len = 0;
1083
1084 CLUMPER_STATE_STRUCT S, *s = &S;
1085
1086 regex_t regee;
1087
1088 long i, j;
1089
1090
1091 if (crm_exec_box_restriction(csl, apb, &txtptr, &txtstart, &txtlen) != 0)
1092 return 0;
1093
1094 crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len);
1095 htext_len = apb->p1len;
1096 htext_len = crm_nexpandvar (htext, htext_len, MAX_PATTERN);
1097 i = 0;
1098 while (htext[i] < 0x021) i++;
1099 j = i;
1100 while (htext[j] >= 0x021) j++;
1101 htext[j] = '\000';
1102 strcpy (filename, &htext[i]);
1103
1104 //grab output variable name
1105 crm_get_pgm_arg (out_var, MAX_PATTERN, apb->p2start, apb->p2len);
1106 out_var_len = apb->p2len;
1107 out_var_len = crm_nexpandvar (out_var, out_var_len, MAX_PATTERN);
1108
1109 crm_get_pgm_arg (regex_text, MAX_PATTERN, apb->s1start, apb->s1len);
1110 regex_text_len = apb->s1len;
1111 if(regex_text_len == 0)
1112 {
1113 strcpy(regex_text, "[[:graph:]]+");
1114 regex_text_len = strlen( regex_text );
1115 }
1116 regex_text[regex_text_len] = '\0';
1117 regex_text_len = crm_nexpandvar (regex_text, regex_text_len, MAX_PATTERN);
1118 if( crm_regcomp (®ee, regex_text, regex_text_len, REG_EXTENDED) )
1119 {
1120 nonfatalerror("Problem compiling this regex:", regex_text);
1121 return 0;
1122 }
1123
1124 bychunk = apb->sflags & CRM_BYCHUNK;
1125
1126 if(map_file(s, filename))
1127 //we already nonfatalerrored
1128 return 0;
1129
1130 if(txtlen == 0)
1131 {
1132 for(i = 0; i < s->header->n_documents; i++)
1133 {
1134 A[0] = 0.0;
1135 N[0] = 1;
1136 for(j = 0; j < s->header->n_documents; j++)
1137 if(i != j && s->cluster_assignments[i] == s->cluster_assignments[j])
1138 {
1139 N[0]++;
1140 if(bychunk)
1141 A[0] += *aref_dist_mat(s->distance_matrix, i, j);
1142 else
1143 if(*aref_dist_mat(s->distance_matrix, i, j) > A[0])
1144 A[0] = *aref_dist_mat(s->distance_matrix, i, j);
1145 }
1146 if(bychunk)
1147 A[0] /= (float)N[0];
1148 out_len += sprintf(outbuf + out_len, "%ld (", i);
1149 out_len += sprint_tag(s, outbuf + out_len, i);
1150 out_len += sprintf(outbuf + out_len,
1151 ") clump: %ld (", s->cluster_assignments[i]);
1152 out_len += sprint_lab(s, outbuf + out_len, s->cluster_assignments[i]);
1153 out_len += sprintf(outbuf + out_len, ") affinity: %0.4f\n", A[0]);
1154
1155 }
1156 outbuf[out_len] = '\0';
1157 if(out_var_len)
1158 crm_destructive_alter_nvariable(out_var, out_var_len, outbuf, out_len);
1159 unmap_file(s);
1160 return 0;
1161 } else
1162 {
1163 if(joe_trace)
1164 fprintf(stderr, "pmulcing!\n");
1165 n = eat_document
1166 ( txtptr + txtstart, txtlen, &i,
1167 ®ee,
1168 feature_space, 32768,
1169 apb->sflags );
1170 closest_doc_affinity = -1.0;
1171 for(i = 0; i <= s->header->n_clusters; i++)
1172 {
1173 A[i] = 0.0;
1174 N[i] = 0;
1175 }
1176 if(bychunk)
1177 {
1178 for(i = 0; i < s->header->n_documents; i++)
1179 {
1180 j = s->cluster_assignments[i];
1181 if(j == 0)
1182 continue;
1183 T = get_document_affinity(feature_space, (void *)(s->file_origin +
1184 s->document_offsets[i]));
1185 A[j] += T;
1186 if(T > closest_doc_affinity)
1187 {
1188 closest_doc = i;
1189 closest_doc_affinity = T;
1190 }
1191 N[j]++;
1192 }
1193 for(i = 1; i <= s->header->n_clusters; i++)
1194 T += A[i] /= N[i];
1195 } else
1196 {
1197 for(i = 0; i < s->header->n_documents; i++)
1198 {
1199 j = s->cluster_assignments[i];
1200 T = get_document_affinity(feature_space, (void *)(s->file_origin +
1201 s->document_offsets[i]));
1202 if(T > A[j])
1203 A[j] = T;
1204 if(T > closest_doc_affinity)
1205 {
1206 closest_doc = i;
1207 closest_doc_affinity = T;
1208 }
1209 N[j]++;
1210 }
1211 }
1212 T = 0.0000001;
1213 j = 1;
1214 for(i = 1; i <= s->header->n_clusters; i++)
1215 {
1216 if(A[i] > A[j])
1217 j = i;
1218 if(A[i] == 0.0)
1219 p[i] = 0.0;
1220 else
1221 p[i] = normalized_gauss(1.0 / A[i] - 1.0, 0.5);
1222 //p[i] = A[i];
1223 T += p[i];
1224 }
1225 if(s->header->n_clusters < 2)
1226 p[j = 1] = 0.0;
1227 for(i = 1; i <= s->header->n_clusters; i++)
1228 {
1229 p[i] /= T;
1230 pR[i] = 10 * ( log10(0.0000001 + p[i])
1231 - log10(1.0000001 - p[i]) );
1232 }
1233
1234 if(joe_trace)
1235 fprintf(stderr, "generating output...\n");
1236
1237 if(p[j] > 0.5)
1238 out_len += sprintf(outbuf + out_len,
1239 "PMULC succeeds; success probabilty: %0.4f pR: %0.4f\n", p[j], pR[j]);
1240 else
1241 out_len += sprintf(outbuf + out_len,
1242 "PMULC fails; success probabilty: %0.4f pR: %0.4f\n", p[j], pR[j]);
1243 out_len += sprintf(outbuf + out_len,
1244 "Best match to clump #%ld (", j);
1245 out_len += sprint_lab(s, outbuf + out_len, j);
1246 out_len += sprintf(outbuf + out_len,
1247 ") prob: %0.4f pR: %0.4f\n", p[j], pR[j]);
1248 out_len += sprintf(outbuf + out_len,
1249 "Closest document: #%ld (", closest_doc);
1250 out_len += sprint_tag(s, outbuf + out_len, closest_doc);
1251 out_len += sprintf(outbuf + out_len,
1252 ") affinity: %0.4f\n", closest_doc_affinity);
1253 out_len += sprintf(outbuf + out_len,
1254 "Total features in input file: %ld\n", n);
1255 for(i = 1; i <= s->header->n_clusters; i++)
1256 {
1257 out_len += sprintf(outbuf + out_len,
1258 "%ld: (", i);
1259 out_len += sprint_lab(s, outbuf + out_len, i);
1260 out_len += sprintf(outbuf + out_len,
1261 "): documents: %ld affinity: %0.4f prob: %0.4f pR: %0.4f\n",
1262 N[i], A[i], p[i], pR[i]);
1263 }
1264
1265 if (p[j] > 0.5000)
1266 {
1267 if (user_trace)
1268 fprintf (stderr, "CLUMP was a SUCCESS, continuing execution.\n");
1269 }
1270 else
1271 {
1272 if (user_trace)
1273 fprintf (stderr, "CLUMP was a FAIL, skipping forward.\n");
1274 // and do what we do for a FAIL here
1275 csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
1276 csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1;
1277 };
1278 outbuf[out_len] = '\0';
1279 if(joe_trace)
1280 fprintf(stderr, "JOE_TRACE:\n%s", outbuf);
1281 if(out_var_len)
1282 crm_destructive_alter_nvariable(out_var, out_var_len, outbuf, out_len);
1283 unmap_file(s);
1284 return 0;
1285 }
1286
1287 }
1288