1 //	crm_expr_clump.c - automatically cluster unlabelled documents
2 
3 // Copyright 2009 William S. Yerazunis.
4 // This file is under GPLv3, as described in COPYING.
5 
6 /////////////////////////////////////////////////////////////////////
7 //     Original spec by Bill Yerazunis, original code by Joe Langeway,
8 //     recode for CRM114 use by Bill Yerazunis.
9 //
10 //     This code section (crm_expr_clump and subsidiary routines) is
11 //     dual-licensed to both William S. Yerazunis and Joe Langeway,
12 //     including the right to reuse this code in any way desired,
13 //     including the right to relicense it under any other terms as
14 //     desired.
15 /////////////////////////////////////////////////////////////////////
16 
17 //   This file is part of on going research and should not be considered
18 //   a finished product, a reliable tool, an example of good software
19 //   engineering, or a reflection of any quality of Joe's besides his
20 //   tendency towards long hours.
21 //
22 //   Here's what's going on:
23 //
24 //   Documents are fed in with calls to "clump" and the distance between each
25 //   document is recorded in a matrix. We then find clusters for automatic
26 //   classification without the need for a gold standard judgement ahead
27 //   of time.
28 //
29 //   Cluster assignments start at index 1 and negative numbers indicate
30 //   permanent assignments made by crm.
31 
32 //  include some standard files
33 #include "crm114_sysincludes.h"
34 
35 //  include any local crm114 configuration file
36 #include "crm114_config.h"
37 
38 //  include the crm114 data structures file
39 #include "crm114_structs.h"
40 
41 //  and include the routine declarations file
42 #include "crm114.h"
43 
44 //    the globals used when we need a big buffer  - allocated once, used
45 //    wherever needed.  These are sized to the same size as the data window.
46 //    Do not mutate/realloc these.
47 extern char *outbuf;
48 
49 #define MAX_CLUSTERS 4096
50 #define CLUSTER_LABEL_LEN 32
51 #define DOCUMENT_TAG_LEN 32
52 
53 typedef struct mythical_clumper_header
54 {
55   long max_documents, n_documents;
56   long document_offsets_offset;//list of offsets to documents
57   long clusters_offset; //cluster assignments of documents
58   long distance_matrix_offset;
59   long cluster_labels_offset;
60   long document_tags_offset;
61   long n_perma_clusters;
62   long file_length; //is the offset of new files when learning
63   long last_action; //0 = made clumps, non-zero means make clumps if you've got
64                         // a chance and we're told not to
65   long n_clusters;
66 } CLUMPER_HEADER_STRUCT;
67 
68 typedef struct mythical_clumper_state
69 {
70   char *file_origin;
71   CLUMPER_HEADER_STRUCT *header;
72   long *document_offsets;
73   long *cluster_assignments;
74   float *distance_matrix;
75   char (*cluster_labels)[CLUSTER_LABEL_LEN];
76   char (*document_tags)[DOCUMENT_TAG_LEN];
77 } CLUMPER_STATE_STRUCT;
78 
79 // tracing for this module
80 int joe_trace = 0;
81 
82 long max_documents = 1000;
83 
make_new_clumper_backing_file(char * filename)84 static void make_new_clumper_backing_file(char *filename)
85 {
86   CLUMPER_HEADER_STRUCT H, *h = &H;
87   FILE *f;
88   long i;
89   h->max_documents = max_documents;
90   h->n_documents = 0;
91   h->document_offsets_offset = sizeof(CLUMPER_HEADER_STRUCT);
92   h->clusters_offset = h->document_offsets_offset +
93                           sizeof(long) * max_documents;
94   h->distance_matrix_offset = h->clusters_offset +
95                           sizeof(long) * max_documents;
96   h->cluster_labels_offset = h->distance_matrix_offset + ( sizeof(float) *
97                max_documents * (max_documents + 1) / 2);
98   h->document_tags_offset = h->cluster_labels_offset +
99                            ( sizeof(char) * max_documents * CLUSTER_LABEL_LEN);
100   h->file_length = h->document_tags_offset +
101                            ( sizeof(char) * max_documents * DOCUMENT_TAG_LEN);
102   h->n_perma_clusters = 0;
103   h->n_clusters = 0;
104   crm_force_munmap_filename(filename);
105   f = fopen(filename, "wb");
106   dontcare = fwrite(h, 1, sizeof(CLUMPER_HEADER_STRUCT), f);
107   i = h->file_length - sizeof(CLUMPER_HEADER_STRUCT);
108   if(joe_trace)
109     fprintf(stderr, "about to write %ld zeros to backing file\n", i);
110   while(i--)
111     fputc('\0', f);
112   fflush(f);
113   fclose(f);
114   if(joe_trace)
115     fprintf(stderr, "Just wrote backing file for clumper size %ld\n",
116         h->file_length);
117 }
118 
map_file(CLUMPER_STATE_STRUCT * s,char * filename)119 static int map_file(CLUMPER_STATE_STRUCT *s, char *filename)
120 {
121   struct stat statee;
122   if(stat(filename, &statee))
123   {
124     nonfatalerror("Couldn't stat file!", filename);
125     return -1;
126   }
127 
128   s->file_origin = crm_mmap_file
129             (filename,
130              0, statee.st_size,
131              PROT_READ | PROT_WRITE,
132              MAP_SHARED,
133              NULL);
134   if(s->file_origin == MAP_FAILED)
135   {
136     nonfatalerror("Couldn't mmap file!", filename);
137     return -1;
138   }
139   if(joe_trace)
140     fprintf(stderr,"Definitely think I've mapped a file.\n");
141 
142   s->header = (CLUMPER_HEADER_STRUCT *)(s->file_origin);
143   s->document_offsets =
144       (void *)( s->file_origin + s->header->document_offsets_offset );
145   s->cluster_assignments =
146       (void *)( s->file_origin + s->header->clusters_offset );
147   s->distance_matrix =
148       (void *)( s->file_origin + s->header->distance_matrix_offset );
149   s->cluster_labels =
150       (void *)( s->file_origin + s->header->cluster_labels_offset );
151   s->document_tags =
152       (void *)( s->file_origin + s->header->document_tags_offset );
153   return 0;
154 }
155 
unmap_file(CLUMPER_STATE_STRUCT * s)156 static void unmap_file(CLUMPER_STATE_STRUCT *s)
157 {
158   crm_munmap_file ((void *) s->file_origin);
159 }
160 
aref_dist_mat(float * m,int j,int i)161 static float *aref_dist_mat(float *m, int j, int i)
162 {
163   if(i < j) {int t = i; i = j; j = t;}
164   return m + i * (i - 1) / 2 + j;
165 }
166 
get_document_affinity(unsigned int * doc1,unsigned int * doc2)167 static float get_document_affinity(unsigned int *doc1, unsigned int *doc2)
168 {
169   int u = 0, l1 = 0, l2 = 0;
170   for(;;)
171     if(doc1[l1] == 0)
172     {
173         while(doc2[l2] != 0) l2++;
174         break;
175     }
176     else if(doc2[l2] == 0)
177     {
178         while(doc1[l1] != 0) l1++;
179         break;
180     }
181     else if(doc1[l1] == doc2[l2])
182     {
183       u++; l1++; l2++;
184     }
185     else if(doc1[l1] < doc2[l2])
186         l1++;
187     else if(doc1[l1] > doc2[l2])
188         l2++;
189     else
190     {
191       fprintf(stderr, "panic in the disco!  ");
192       break;
193     }
194   if(joe_trace)
195         fprintf(stderr, "Just compared two documents u=%d l1=%d l2=%d\n",
196                  u, l1, l2);
197   return pow((double)(1.0 + u * u) / (double)(1.0 + l1 * l2), 0.2);
198 }
199 
compare_features(const void * a,const void * b)200 static int compare_features(const void *a, const void *b)
201 {
202   if(*(unsigned int *)a < *(unsigned int *)b)
203     return -1;
204   if(*(unsigned int *)a > *(unsigned int *)b)
205     return 1;
206   return 0;
207 }
208 
eat_document(char * text,long text_len,long * ate,regex_t * regee,unsigned int * feature_space,long max_features,long long flags)209 static int eat_document
210         (       char *text, long text_len, long *ate,
211                 regex_t *regee,
212                 unsigned int *feature_space, long max_features,
213                 long long flags)
214 {
215   long n_features = 0, i, j;
216   unsigned int hash_pipe[OSB_BAYES_WINDOW_LEN];
217   int hash_coefs[] = { 1, 3, 5, 11, 23, 47};
218   regmatch_t match[1];
219   char *t_start;
220   long t_len;
221   long f;
222   unsigned long long unigram, unique, string;
223 
224   unigram = flags & CRM_UNIGRAM;
225   string = flags & CRM_STRING;
226   unique = flags & (CRM_UNIQUE | CRM_STRING);
227 
228   *ate = 0;
229 
230   for(i = 0; i < OSB_BAYES_WINDOW_LEN; i++)
231     hash_pipe[i] = 0xdeadbeef;
232   while(text_len > 0 && n_features < max_features - 1)
233   {
234     if(crm_regexec (regee, text, text_len, 1, match, 0, NULL))
235       //no match or regex error, we're done
236       break;
237     else
238     {
239       t_start = text + match->rm_so;
240       t_len = match->rm_eo - match->rm_so;
241       if(string)
242       {
243         text += match->rm_so + 1;
244         text_len -= match->rm_so + 1;
245         *ate += match->rm_so + 1;
246       }else
247       {
248         text += match->rm_eo;
249         text_len -= match->rm_eo;
250         *ate += match->rm_eo;
251       }
252 
253       for(i = OSB_BAYES_WINDOW_LEN - 1; i > 0; i--)
254         hash_pipe[i] = hash_pipe[i - 1];
255       hash_pipe[0] = strnhash(t_start, t_len);
256     }
257     f = 0;
258     if(unigram)
259       feature_space[n_features++] = hash_pipe[0];
260     else
261       for(i = 1; i < OSB_BAYES_WINDOW_LEN && hash_pipe[i] != 0xdeadbeef; i++)
262         feature_space[n_features++] =
263                 hash_pipe[0] + hash_pipe[i] * hash_coefs[i];
264 
265   }
266   qsort(feature_space, n_features, sizeof(unsigned int), compare_features);
267 
268   if(unique)
269   {
270     i = 0;
271     for(j = 1; j < n_features; j++)
272       if(feature_space[i] != feature_space[j])
273         feature_space[++i] = feature_space[j];
274     feature_space[++i] = 0;
275     n_features = i + 1; //the zero counts
276   } else
277     feature_space[n_features++] = 0;
278   return n_features;
279 }
280 
281 
find_closest_document(CLUMPER_STATE_STRUCT * s,char * text,long text_len,regex_t * regee,long long flags)282 static long find_closest_document
283           (CLUMPER_STATE_STRUCT *s,
284           char *text, long text_len,
285           regex_t *regee,
286          long long flags)
287 {
288   unsigned int feature_space[32768];
289   long n, i, b = -1;
290   float b_s = 0.0, n_s;
291   n = eat_document(text, text_len, &i,
292                     regee, feature_space, 32768,
293                     flags);
294   for(i = 0; i < s->header->n_documents; i++)
295   {
296     n_s = get_document_affinity
297            (feature_space, (unsigned int *)(s->file_origin + s->document_offsets[i]));
298     if(n_s > b_s)
299     {
300       b = i;
301       b_s = n_s;
302     }
303   }
304   return b;
305 }
306 
307 typedef struct mythical_cluster_head
308 {
309   struct mythical_cluster_head *head, *next_head, *prev_head, *next;
310   long count;
311 } CLUSTER_HEAD_STRUCT;
312 
join_clusters(CLUSTER_HEAD_STRUCT * a,CLUSTER_HEAD_STRUCT * b)313 static void join_clusters(CLUSTER_HEAD_STRUCT *a, CLUSTER_HEAD_STRUCT *b)
314 {
315   if(joe_trace)
316     fprintf(stderr, "Joining clusters of sizes %ld and %ld\n,",
317         a->head->count, b->head->count);
318 
319   while(a->next) a = a->next;
320   b = b->head;
321   a->next = b;
322   a->head->count += b->count;
323   b->count = 0; //though we wont actually touch this value anymore
324   if(b->prev_head)
325     b->prev_head->next_head = b->next_head;
326   if(b->next_head)
327     b->next_head->prev_head = b->prev_head;
328   b->next_head = NULL;
329   b->prev_head = NULL;
330   do
331     b->head = a->head;
332   while( (b = b->next) );
333 }
334 
agglomerative_averaging_cluster(CLUMPER_STATE_STRUCT * s,long goal)335 static void agglomerative_averaging_cluster(CLUMPER_STATE_STRUCT *s, long goal)
336 {
337   long i, j, k, l, n = s->header->n_documents;
338   CLUSTER_HEAD_STRUCT *clusters = malloc(n * sizeof(CLUSTER_HEAD_STRUCT)),
339                         *a, *b, *c, first_head_ptr;
340   float *M = malloc( (n * (n + 1) / 2 - 1) * sizeof(float)), d, e;
341   long ck, cl, ckl;
342 
343   if(joe_trace)
344     fprintf(stderr, "agglomerative averaging clustering...\n");
345 
346   for(i = 1; i < s->header->n_documents - 1; i++)
347   {
348     clusters[i].head = &clusters[i];
349     clusters[i].prev_head = &clusters[i - 1];
350     clusters[i].next_head = &clusters[i + 1];
351     clusters[i].next = NULL;
352     clusters[i].count = 1;
353   }
354   clusters[0].head = &clusters[0];
355   clusters[0].prev_head = &first_head_ptr;
356   clusters[0].next_head = &clusters[1];
357   clusters[0].next = NULL;
358   clusters[0].count = 1;
359   if(s->header->n_documents > 1) //don't muck the first one!
360   {
361     clusters[s->header->n_documents-1].head =
362                                 &clusters[s->header->n_documents - 1];
363     clusters[s->header->n_documents - 1].prev_head =
364                                 &clusters[s->header->n_documents - 2];
365     clusters[s->header->n_documents - 1].next = NULL;
366     clusters[s->header->n_documents - 1].count = 1;
367   }
368   //always make sure the chain ends
369   clusters[s->header->n_documents - 1].next_head = NULL;
370 
371   first_head_ptr.next_head = &clusters[0];
372 
373   j = (n * (n + 1) / 2 - 1);
374   for(i = 0; i < j; i++)
375     M[i] = s->distance_matrix[i];
376 
377   for(a = first_head_ptr.next_head; a; a = a->next_head)
378     if(s->cluster_assignments[a - clusters] < 0)
379       for(b = a->next_head; b; b = b->next_head)
380         if(s->cluster_assignments[a - clusters]
381             == s->cluster_assignments[b - clusters])
382         {
383           k = a - clusters;
384           l = b - clusters;
385           ck = clusters[k].count;
386           cl = clusters[l].count;
387           ckl = ck + cl;
388 
389           for(c = &clusters[0]; c; c = c->next_head)
390           {
391             i = c - clusters;
392             if(i == k || i == l)
393               continue;
394             *aref_dist_mat(M, k, i) = (ck * *aref_dist_mat(M, k, i) +
395                                       cl * *aref_dist_mat(M, l, i) ) / ckl;
396             *aref_dist_mat(M, l, i) = 0.0;
397           }
398           join_clusters(&clusters[k], &clusters[l]);
399           n--;
400         }
401 
402   while(n > goal)
403   {
404     l = 0; k = 0;
405     d = -1.0;
406     for(a = first_head_ptr.next_head; a; a = a->next_head)
407       for(b = a->next_head; b; b = b->next_head)
408       {
409         i = a - clusters;
410         j = b - clusters;
411         if(s->cluster_assignments[i] < 0 && s->cluster_assignments[j] < 0)
412           e = *aref_dist_mat(M, i, j) = -1000000000.0;
413         else
414           e = *aref_dist_mat(M, i, j);
415         if( e > d )
416         {
417           if(s->cluster_assignments[j] < 0)
418           {
419             k = j;
420             l = i;
421           } else
422           {
423             k = i;
424             l = j;
425           }
426           d = e;
427         }
428       }
429     if(l == 0 && k == 0)
430     {
431       fprintf(stderr, "CLUMP FAILED TO JOIN ENOUGH CLUMPS!\n");
432       break;
433     }
434     ck = clusters[k].count;
435     cl = clusters[l].count;
436     ckl = ck + cl;
437 
438     for(a = &clusters[0]; a; a = a->next_head)
439     {
440       i = a - clusters;
441       if(i == k || i == l)
442         continue;
443       *aref_dist_mat(M, k, i) = (ck * *aref_dist_mat(M, k, i) +
444                                  cl * *aref_dist_mat(M, l, i) ) / ckl;
445       *aref_dist_mat(M, l, i) = 0.0;
446     }
447     join_clusters(&clusters[k], &clusters[l]);
448     n--;
449   }
450 
451   i = s->header->n_perma_clusters + 1;
452   for(a = &clusters[0]; a; a = a->next_head)
453   {
454     if(s->cluster_assignments[a - clusters] < 0)
455       j = -s->cluster_assignments[a - clusters];
456     else
457       j = i++;
458     for(b = a; b; b = b->next)
459       s->cluster_assignments[b - clusters] = j;
460   }
461 
462   s->header->n_clusters = n;
463   free(M);
464 }
465 
index_to_pair(long t,long * i,long * j)466 static void index_to_pair(long t, long *i, long *j)
467 {
468   long p = 2 * t;
469   *i = (long)sqrt(p);
470   if(*i * *i + *i > p)
471     (*i)--;
472   *j = t - (*i * (*i + 1) / 2);
473 }
474 
compare_float_ptrs(const void * a,const void * b)475 static int compare_float_ptrs(const void *a, const void *b)
476 {
477   if(**(float **)a > **(float **)b)
478     return 1;
479   if(**(float **)a < **(float **)b)
480     return -1;
481   return 0;
482 }
483 
agglomerative_nearest_cluster(CLUMPER_STATE_STRUCT * s,long goal)484 static void agglomerative_nearest_cluster(CLUMPER_STATE_STRUCT *s, long goal)
485 {
486   long i, j, k, l, m, n = s->header->n_documents;
487   CLUSTER_HEAD_STRUCT *clusters = malloc(n * sizeof(CLUSTER_HEAD_STRUCT)),
488                         *a, *b, first_head_ptr;
489   float **M = malloc( (n * (n + 1) / 2 - 1) * sizeof(float *));
490 
491   if(joe_trace)
492     fprintf(stderr, "agglomerative nearest clustering...\n");
493 
494   for(i = 1; i < s->header->n_documents - 1; i++)
495   {
496     clusters[i].head = &clusters[i];
497     clusters[i].prev_head = &clusters[i - 1];
498     clusters[i].next_head = &clusters[i + 1];
499     clusters[i].next = NULL;
500     clusters[i].count = 1;
501   }
502   clusters[0].head = &clusters[0];
503   clusters[0].prev_head = &first_head_ptr;
504   clusters[0].next_head = &clusters[1];
505   clusters[0].next = NULL;
506   clusters[0].count = 1;
507   if(s->header->n_documents > 1) //don't muck the first one!
508   {
509     clusters[s->header->n_documents - 1].head =
510                                 &clusters[s->header->n_documents - 1];
511     clusters[s->header->n_documents - 1].prev_head =
512                                 &clusters[s->header->n_documents - 2];
513     clusters[s->header->n_documents - 1].next = NULL;
514     clusters[s->header->n_documents - 1].count = 1;
515   }
516   //always make sure the chain ends
517   clusters[s->header->n_documents - 1].next_head = NULL;
518 
519   first_head_ptr.next_head = &clusters[0];
520 
521   j = (n * (n + 1) / 2 - 1);
522   for(i = 0; i < j; i++)
523     M[i] = &s->distance_matrix[i];
524   qsort(M, j, sizeof(float *), compare_float_ptrs);
525 
526   for(a = first_head_ptr.next_head; a; a = a->next_head)
527     if(s->cluster_assignments[a - clusters] < 0)
528       for(b = a->next_head; b; b = b->next_head)
529         if(s->cluster_assignments[a - clusters]
530             == s->cluster_assignments[b - clusters])
531         {
532           k = a - clusters;
533           l = b - clusters;
534           join_clusters(&clusters[k], &clusters[l]);
535           n--;
536         }
537   i = j;
538   while(n > goal)
539   {
540     do
541     {
542       k = M[--i] - s->distance_matrix;
543       index_to_pair(k, &l, &m);
544     } while(clusters[m].head == clusters[l].head);
545 
546     join_clusters(&clusters[m], &clusters[l]);
547     n--;
548   }
549 
550   i = s->header->n_perma_clusters + 1;
551   for(a = &clusters[0]; a; a = a->next_head)
552   {
553     if(s->cluster_assignments[a - clusters] < 0)
554       j = -s->cluster_assignments[a - clusters];
555     else
556       j = i++;
557     for(b = a; b; b = b->next)
558       s->cluster_assignments[b - clusters] = j;
559   }
560   free(M);
561   s->header->n_clusters = n;
562 }
563 
square(double a)564 double square(double a) {return a * a;}
minf(double a,double b)565 double minf(double a, double b) {return a < b ? a : b;}
566 
567 #define H_BUCKETS 50
thresholding_average_cluster(CLUMPER_STATE_STRUCT * s)568 static void thresholding_average_cluster(CLUMPER_STATE_STRUCT *s)
569 {
570   long i, j, k, l, ck, cl, ckl, n = s->header->n_documents;
571   CLUSTER_HEAD_STRUCT *clusters = malloc(n * sizeof(CLUSTER_HEAD_STRUCT)),
572                         *a, *b, *c, first_head_ptr;
573   long H[H_BUCKETS], C[H_BUCKETS];
574   float A[H_BUCKETS], t_A, t, t_score, scoro,gM;
575   float min, max, scale;
576   float *M = malloc( (n * (n + 1) / 2 - 1) * sizeof(float)), d, e;
577 
578   if(joe_trace)
579     fprintf(stderr, "threshold average clustering...\n");
580 
581   for(i = 1; i < s->header->n_documents - 1; i++)
582   {
583     clusters[i].head = &clusters[i];
584     clusters[i].prev_head = &clusters[i - 1];
585     clusters[i].next_head = &clusters[i + 1];
586     clusters[i].next = NULL;
587     clusters[i].count = 1;
588   }
589   clusters[0].head = &clusters[0];
590   clusters[0].prev_head = &first_head_ptr;
591   clusters[0].next_head = &clusters[1];
592   clusters[0].next = NULL;
593   clusters[0].count = 1;
594   if(s->header->n_documents > 1) //don't muck the first one!
595   {
596     clusters[s->header->n_documents-1].head =
597                                 &clusters[s->header->n_documents - 1];
598     clusters[s->header->n_documents - 1].prev_head =
599                                 &clusters[s->header->n_documents - 2];
600     clusters[s->header->n_documents - 1].next = NULL;
601     clusters[s->header->n_documents - 1].count = 1;
602   }
603   //always make sure the chain ends
604   clusters[s->header->n_documents - 1].next_head = NULL;
605 
606   first_head_ptr.next_head = &clusters[0];
607 
608   j = (n * (n + 1) / 2 - 1);
609   for(i = 0; i < j; i++)
610     M[i] = s->distance_matrix[i];
611 
612 
613   for(i = 0; i < H_BUCKETS; i++)
614     H[i] = 0.0;
615   j = n * (n - 1) / 2;
616   min = 1000000000.0;
617   max = -1000000000.0;
618 
619   for(i = 0; i < j; i++)
620   {
621     if(M[i] < min)
622       min = s->distance_matrix[i];
623     if(M[i] > max)
624       max = s->distance_matrix[i];
625   }
626   scale = (max - min) / ((float)H_BUCKETS - 0.1);
627   if (scale == 0)
628     scale = 1.0;
629   for(i = 0; i < j; i++)
630     H[ (int)( (M[i] - min) / scale ) ]++;
631   if(joe_trace)
632   {
633     fprintf(stderr, "Histogram of document distances:\n");
634     for(i = 0; i < H_BUCKETS; i++)
635     {
636       for(k = 0; k < H[i]; k += 100)
637         fputc('*', stderr);
638       fputc('\n', stderr);
639     }
640   }
641 
642   k = 0;
643   t_A = 0.0;
644   for(i = 0; i < H_BUCKETS; i++)
645   {
646     k = C[i] = H[i] + k;
647     t_A = A[i] = H[i] * (min + (i + 0.5) * scale) + t_A;
648   }
649   gM = t_A / (float)j;
650   t_score = 0.0;
651   t = -1.0;
652   for(i = 2; i < H_BUCKETS - 2; i++)
653   {
654     scoro = square(gM - (t_A - A[i]) / (k - C[i])) * (k - C[i])
655           + square(gM - A[i] / C[i]) * C[i];
656     if(scoro > t_score)
657     {
658       t_score = scoro;
659       t = min + scale * (float)(i );
660     }
661   }
662   if(joe_trace)
663     fprintf(stderr, "min = %f, max = %f, t = %f\n", min, max, t);
664   for(a = first_head_ptr.next_head; a; a = a->next_head)
665     if(s->cluster_assignments[a - clusters] < 0)
666       for(b = a->next_head; b; b = b->next_head)
667         if(s->cluster_assignments[a - clusters]
668             == s->cluster_assignments[b - clusters])
669         {
670           k = a - clusters;
671           l = b - clusters;
672           ck = clusters[k].count;
673           cl = clusters[l].count;
674           ckl = ck + cl;
675 
676           for(c = &clusters[0]; c; c = c->next_head)
677           {
678             i = c - clusters;
679             if(i == k || i == l)
680               continue;
681             //*aref_dist_mat(M, k, i) = (ck * *aref_dist_mat(M, k, i) +
682             //                         cl * *aref_dist_mat(M, l, i) ) / ckl;
683             *aref_dist_mat(M, k, i) =
684                 minf(*aref_dist_mat(M, k, i), *aref_dist_mat(M, l, i));
685             *aref_dist_mat(M, l, i) = 0.0;
686           }
687           join_clusters(&clusters[k], &clusters[l]);
688           n--;
689         }
690 
691   for(;;)
692   {
693     l = 0; k = 0;
694     d = -1.0;
695     for(a = first_head_ptr.next_head; a; a = a->next_head)
696       for(b = a->next_head; b; b = b->next_head)
697       {
698         i = a - clusters;
699         j = b - clusters;
700         if(s->cluster_assignments[i] < 0 && s->cluster_assignments[j] < 0)
701         {
702           e = *aref_dist_mat(M, i, j) = -1000000000.0;
703           if(joe_trace)
704             fprintf(stderr, rand() & 0x1 ? "wonk!\n" : " wonk!\n");
705         }
706         else
707           e = *aref_dist_mat(M, i, j);
708         if( e > d )
709         {
710           if(s->cluster_assignments[j] < 0)
711           {
712             k = j;
713             l = i;
714           } else
715           {
716             k = i;
717             l = j;
718           }
719           d = e;
720         }
721       }
722     if(joe_trace)
723       fprintf(stderr, "l = %ld, k = %ld, d = %f\n", l, k, d);
724     if( (l == 0 && k == 0) || d < t) //we're done
725       break;
726 
727     ck = clusters[k].count;
728     cl = clusters[l].count;
729     ckl = ck + cl;
730 
731     for(a = &clusters[0]; a; a = a->next_head)
732     {
733       i = a - clusters;
734       if(i == k || i == l)
735         continue;
736       *aref_dist_mat(M, k, i) = (ck * *aref_dist_mat(M, k, i) +
737                                  cl * *aref_dist_mat(M, l, i) ) / ckl;
738       *aref_dist_mat(M, l, i) = 0.0;
739     }
740     join_clusters(&clusters[k], &clusters[l]);
741     n--;
742   }
743 
744   i = s->header->n_perma_clusters + 1;
745   for(a = &clusters[0]; a; a = a->next_head)
746   {
747     if(s->cluster_assignments[a - clusters] < 0)
748       j = -s->cluster_assignments[a - clusters];
749     else
750       j = i++;
751     for(b = a; b; b = b->next)
752       s->cluster_assignments[b - clusters] = j;
753   }
754 
755   s->header->n_clusters = n;
756   free(M);
757 
758 }
759 
assign_perma_cluster(CLUMPER_STATE_STRUCT * s,long doc,char * lab)760 static void assign_perma_cluster(CLUMPER_STATE_STRUCT *s,
761                                 long doc,
762                                 char *lab)
763 {
764   long i;
765   for(i = 1; i <= s->header->n_perma_clusters; i++)
766     if(0 == strcmp(s->cluster_labels[i], lab))
767       break;
768   if(i > s->header->n_perma_clusters)
769   {
770     i = ++(s->header->n_perma_clusters);
771     strcpy(s->cluster_labels[i], lab);
772   }
773   s->cluster_assignments[doc] = -i;
774 }
775 
max(long a,long b)776 long max(long a, long b) {return a > b ? a : b;}
777 
crm_expr_clump(CSL_CELL * csl,ARGPARSE_BLOCK * apb)778 int crm_expr_clump(CSL_CELL *csl, ARGPARSE_BLOCK *apb)
779 {
780   char *txtptr;
781   long txtstart;
782   long txtlen;
783   char htext[MAX_PATTERN];
784   char filename[MAX_PATTERN];
785   long htext_len;
786 
787   char regex_text[MAX_PATTERN];  //  the regex pattern
788   long regex_text_len;
789   char param_text[MAX_PATTERN];
790   long param_text_len;
791   unsigned long long bychunk;
792   int n_clusters = 0;
793 
794   char tag[DOCUMENT_TAG_LEN];
795   char class[CLUSTER_LABEL_LEN];
796 
797   struct stat statbuf;
798 
799   CLUMPER_STATE_STRUCT S, *s = &S;
800 
801   regex_t regee;
802   regmatch_t matchee[2];
803 
804   long i, j, k, l;
805 
806 
807   if (crm_exec_box_restriction(csl, apb, &txtptr, &txtstart, &txtlen) != 0)
808     return 0;
809 
810   crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len);
811   htext_len = apb->p1len;
812   htext_len = crm_nexpandvar (htext, htext_len, MAX_PATTERN);
813   i = 0;
814   while (htext[i] < 0x021) i++;
815   j = i;
816   while (htext[j] >= 0x021) j++;
817   htext[j] = '\000';
818   strcpy (filename, &htext[i]);
819 
820   //use regex_text and regee to grab parameters
821   crm_get_pgm_arg (param_text, MAX_PATTERN, apb->s2start, apb->s2len);
822   param_text_len = apb->s2len;
823   param_text_len = crm_nexpandvar (param_text, param_text_len, MAX_PATTERN);
824 
825   param_text[ param_text_len ] = '\0';
826   if(joe_trace)
827     fprintf( stderr, "param_text = %s\n", param_text );
828 
829   strcpy(regex_text, "n_clusters[[:space:]]*=[[:space:]]*([0-9]+)");
830   if( crm_regcomp (&regee, regex_text, strlen(regex_text), REG_EXTENDED) )
831   {
832     nonfatalerror("Problem compiling regex to grab params:", regex_text);
833     return 0;
834   }
835   if(!crm_regexec (&regee, param_text, param_text_len, 2, matchee, 0, NULL))
836   {
837     param_text[matchee[1].rm_eo + 1] = '\0';
838     if(joe_trace)
839       fprintf(stderr, "&param_text[matchee[1].rm_so] = %s\n",
840                 &param_text[matchee[1].rm_so]);
841     n_clusters = atol(&param_text[matchee[1].rm_so]);
842     if(joe_trace)
843       fprintf(stderr, "n_clusters = %d\n", n_clusters);
844   }
845   strcpy(regex_text, "tag[[:space:]]*=[[:space:]]*([[:graph:]]+)");
846   if( crm_regcomp (&regee, regex_text, strlen(regex_text), REG_EXTENDED) )
847   {
848     nonfatalerror("Problem compiling regex to grab params:", regex_text);
849     return 0;
850   }
851   if(!crm_regexec (&regee, param_text, param_text_len, 2, matchee, 0, NULL))
852   {
853     param_text[matchee[1].rm_eo] = '\0';
854     strcpy(tag, &param_text[matchee[1].rm_so]);
855   } else
856     tag[0] = '\0';
857   strcpy(regex_text, "clump[[:space:]]*=[[:space:]]*([[:graph:]]+)");
858   if( crm_regcomp (&regee, regex_text, strlen(regex_text), REG_EXTENDED) )
859   {
860     nonfatalerror("Problem compiling regex to grab params:", regex_text);
861     return 0;
862   }
863   if(!crm_regexec (&regee, param_text, param_text_len, 2, matchee, 0, NULL))
864   {
865     param_text[matchee[1].rm_eo] = '\0';
866     strcpy(class, &param_text[matchee[1].rm_so]);
867   } else
868     class[0] = '\0';
869 
870   strcpy(regex_text, "max_documents[[:space:]]*=[[:space:]]*([[:graph:]]+)");
871   if( crm_regcomp (&regee, regex_text, strlen(regex_text), REG_EXTENDED) )
872   {
873     nonfatalerror("Problem compiling regex to grab params:", regex_text);
874     return 0;
875   }
876   if(!crm_regexec (&regee, param_text, param_text_len, 2, matchee, 0, NULL))
877   {
878     param_text[matchee[1].rm_eo] = '\0';
879     max_documents = atol(&param_text[matchee[1].rm_so]);
880   }
881   //we've already got a default max_documents
882 
883   crm_get_pgm_arg (regex_text, MAX_PATTERN, apb->s1start, apb->s1len);
884   regex_text_len = apb->s1len;
885   if(regex_text_len == 0)
886   {
887     strcpy(regex_text, "[[:graph:]]+");
888     regex_text_len = strlen( regex_text );
889   }
890   regex_text[regex_text_len] = '\0';
891   regex_text_len = crm_nexpandvar (regex_text, regex_text_len, MAX_PATTERN);
892   if( crm_regcomp (&regee, regex_text, regex_text_len, REG_EXTENDED) )
893   {
894     nonfatalerror("Problem compiling this regex:", regex_text);
895     return 0;
896   }
897 
898   bychunk = apb->sflags & CRM_BYCHUNK;
899 
900   if (apb->sflags & CRM_REFUTE)
901   {
902     if(map_file(s, filename))
903       //we already nonfatalerrored
904       return 0;
905     if(tag[0]) {
906       for(i = s->header->n_documents; i >= 0; i--)
907         if(0 == strcmp(tag, s->document_tags[i]))
908           break;
909     }
910     else
911       i = find_closest_document(s, txtptr + txtstart, txtlen,
912                                       &regee, apb->sflags);
913     if(i < 0)
914     {
915       unmap_file(s);
916       return 0;
917     }
918     memmove(s->file_origin + s->document_offsets[i],
919             s->file_origin + s->document_offsets[i + 1],
920             (s->header->file_length - s->document_offsets[i + 1]) );
921     memmove(&s->document_tags[i],
922             &s->document_tags[i + 1],
923             sizeof(char) * DOCUMENT_TAG_LEN *(s->header->n_documents - i - 1 ));
924     memmove(&s->cluster_labels[i],
925             &s->cluster_labels[i + 1],
926             sizeof(char) * CLUSTER_LABEL_LEN*(s->header->n_documents - i - 1 ));
927     s->header->n_documents--;
928     j = s->document_offsets[i + 1] - s->document_offsets[i];
929     for(k = i; k < s->header->n_documents; k++)
930     {
931       s->document_offsets[k] = s->document_offsets[k + 1] - j;
932       s->cluster_assignments[k] = s->cluster_assignments[k + 1];
933     }
934     s->header->file_length -= j;
935     for(k = 0; k < s->header->n_documents; k++)
936       for(l = max(k + 1, i); l < s->header->n_documents; l++)
937         *aref_dist_mat(s->distance_matrix, k, l) =
938             *aref_dist_mat(s->distance_matrix, k, l + 1);
939     if(n_clusters > 0)
940     {
941       if(bychunk)
942         agglomerative_averaging_cluster(s, n_clusters);
943       else
944         agglomerative_nearest_cluster(s, n_clusters);
945     }
946     else if(n_clusters == 0)
947     {
948       if(bychunk)
949         thresholding_average_cluster(s);
950       else
951         thresholding_average_cluster(s);
952     }
953     l = s->header->file_length;
954 
955     unmap_file(s);
956     crm_force_munmap_filename(filename);
957     dontcare = truncate(filename, l);
958     return 0;
959   } else
960   { //LEARNIN'!
961     long n;
962     unsigned int feature_space[32768];
963     FILE *f;
964     if(stat(filename, &statbuf))
965       make_new_clumper_backing_file(filename);
966     if(txtlen == 0)
967     {
968       if(tag[0] && class[0]) //is not null
969       {
970         if(map_file(s, filename))
971         //we already nonfatalerrored
972           return 0;
973         for(i = s->header->n_documents - 1; i >= 0; i++)
974           if(0 == strcmp(tag, s->document_tags[i]))
975             break;
976         if(i >= 0)
977           assign_perma_cluster(s, i, class);
978         unmap_file(s);
979       }
980       return 0;
981     }
982 
983     n = eat_document
984         (       txtptr + txtstart, txtlen, &i,
985                 &regee,
986                 feature_space, 32768,
987                 apb->sflags );
988     crm_force_munmap_filename(filename);
989     f = fopen(filename, "ab+");
990     dontcare = fwrite(feature_space, n, sizeof(unsigned int), f);
991     fclose(f);
992     if(map_file(s, filename))
993       //we already nonfatalerrored
994       return 0;
995     if(s->header->n_documents >= s->header->max_documents)
996     {
997       nonfatalerror("This clump backing file is full and cannot"
998                     " assimilate new documents!", filename);
999       unmap_file(s);
1000       return 0;
1001     }
1002     i = s->header->n_documents++;
1003 
1004     s->document_offsets[i] = s->header->file_length;
1005     s->header->file_length += sizeof(unsigned int) * n;
1006     for(j = 0; j < i; j++)
1007       *aref_dist_mat(s->distance_matrix, j, i) = get_document_affinity(
1008             (unsigned int *)( s->file_origin + s->document_offsets[i]),
1009             (unsigned int *)( s->file_origin + s->document_offsets[j]) );
1010     strcpy(s->document_tags[i], tag);
1011     if(class[0])
1012       assign_perma_cluster(s, i, class);
1013     else
1014       s->cluster_assignments[i] = 0;
1015 
1016     if(n_clusters > 0)
1017     {
1018       if(bychunk)
1019         agglomerative_averaging_cluster(s, n_clusters);
1020       else
1021         agglomerative_nearest_cluster(s, n_clusters);
1022     }
1023     else if(n_clusters == 0)
1024     {
1025       if(bychunk)
1026         thresholding_average_cluster(s);
1027       else
1028         thresholding_average_cluster(s);
1029     }
1030     unmap_file(s);
1031     return 0;
1032   }
1033 }
1034 
sprint_lab(CLUMPER_STATE_STRUCT * s,char * b,int l)1035 int sprint_lab(CLUMPER_STATE_STRUCT *s, char *b, int l)
1036 {
1037   if(l == 0)
1038   {
1039     strcpy(b, "unassigned");
1040     return strlen("unassigned");
1041   }
1042   if(s->cluster_labels[l][0] != '\0')
1043     return sprintf(b, "%s", s->cluster_labels[l]);
1044   else
1045     return sprintf(b, "clump_#%d", l);
1046 }
1047 
sprint_tag(CLUMPER_STATE_STRUCT * s,char * b,int d)1048 int sprint_tag(CLUMPER_STATE_STRUCT *s, char *b, int d)
1049 {
1050   if(s->document_tags[d][0] != '\0')
1051     return sprintf(b, "%s", s->document_tags[d]);
1052   else
1053     return sprintf(b, "document_#%d", d);
1054 }
1055 
crm_expr_pmulc(CSL_CELL * csl,ARGPARSE_BLOCK * apb)1056 int crm_expr_pmulc(CSL_CELL *csl, ARGPARSE_BLOCK *apb)
1057 {
1058   char *txtptr;
1059   long txtstart;
1060   long txtlen;
1061   char htext[MAX_PATTERN];
1062   char filename[MAX_PATTERN];
1063   long htext_len;
1064 
1065   char regex_text[MAX_PATTERN];  //  the regex pattern
1066   long regex_text_len;
1067   unsigned long long bychunk;
1068 
1069   char out_var[MAX_PATTERN];
1070   long out_var_len;
1071 
1072   float A[MAX_CLUSTERS], T;
1073   long N[MAX_CLUSTERS];
1074   double p[MAX_CLUSTERS], pR[MAX_CLUSTERS];
1075 
1076   unsigned int feature_space[32768];
1077   long n;
1078 
1079   long closest_doc = -1;
1080   float closest_doc_affinity;
1081 
1082   long out_len = 0;
1083 
1084   CLUMPER_STATE_STRUCT S, *s = &S;
1085 
1086   regex_t regee;
1087 
1088   long i, j;
1089 
1090 
1091   if (crm_exec_box_restriction(csl, apb, &txtptr, &txtstart, &txtlen) != 0)
1092     return 0;
1093 
1094   crm_get_pgm_arg (htext, MAX_PATTERN, apb->p1start, apb->p1len);
1095   htext_len = apb->p1len;
1096   htext_len = crm_nexpandvar (htext, htext_len, MAX_PATTERN);
1097   i = 0;
1098   while (htext[i] < 0x021) i++;
1099   j = i;
1100   while (htext[j] >= 0x021) j++;
1101   htext[j] = '\000';
1102   strcpy (filename, &htext[i]);
1103 
1104   //grab output variable name
1105   crm_get_pgm_arg (out_var, MAX_PATTERN, apb->p2start, apb->p2len);
1106   out_var_len = apb->p2len;
1107   out_var_len = crm_nexpandvar (out_var, out_var_len, MAX_PATTERN);
1108 
1109   crm_get_pgm_arg (regex_text, MAX_PATTERN, apb->s1start, apb->s1len);
1110   regex_text_len = apb->s1len;
1111   if(regex_text_len == 0)
1112   {
1113     strcpy(regex_text, "[[:graph:]]+");
1114     regex_text_len = strlen( regex_text );
1115   }
1116   regex_text[regex_text_len] = '\0';
1117   regex_text_len = crm_nexpandvar (regex_text, regex_text_len, MAX_PATTERN);
1118   if( crm_regcomp (&regee, regex_text, regex_text_len, REG_EXTENDED) )
1119   {
1120     nonfatalerror("Problem compiling this regex:", regex_text);
1121     return 0;
1122   }
1123 
1124   bychunk = apb->sflags & CRM_BYCHUNK;
1125 
1126   if(map_file(s, filename))
1127   //we already nonfatalerrored
1128     return 0;
1129 
1130   if(txtlen == 0)
1131   {
1132     for(i = 0; i < s->header->n_documents; i++)
1133     {
1134       A[0] = 0.0;
1135       N[0] = 1;
1136       for(j = 0; j < s->header->n_documents; j++)
1137         if(i != j && s->cluster_assignments[i] == s->cluster_assignments[j])
1138         {
1139           N[0]++;
1140           if(bychunk)
1141             A[0] += *aref_dist_mat(s->distance_matrix, i, j);
1142           else
1143             if(*aref_dist_mat(s->distance_matrix, i, j) > A[0])
1144               A[0] = *aref_dist_mat(s->distance_matrix, i, j);
1145         }
1146       if(bychunk)
1147         A[0] /= (float)N[0];
1148       out_len += sprintf(outbuf + out_len, "%ld (", i);
1149       out_len += sprint_tag(s, outbuf + out_len, i);
1150       out_len += sprintf(outbuf + out_len,
1151         ") clump: %ld (", s->cluster_assignments[i]);
1152       out_len += sprint_lab(s, outbuf + out_len, s->cluster_assignments[i]);
1153       out_len += sprintf(outbuf + out_len, ") affinity: %0.4f\n", A[0]);
1154 
1155     }
1156     outbuf[out_len] = '\0';
1157     if(out_var_len)
1158       crm_destructive_alter_nvariable(out_var, out_var_len, outbuf, out_len);
1159     unmap_file(s);
1160     return 0;
1161   } else
1162   {
1163     if(joe_trace)
1164       fprintf(stderr, "pmulcing!\n");
1165     n = eat_document
1166         (       txtptr + txtstart, txtlen, &i,
1167                 &regee,
1168                 feature_space, 32768,
1169                 apb->sflags );
1170     closest_doc_affinity = -1.0;
1171     for(i = 0; i <= s->header->n_clusters; i++)
1172     {
1173       A[i] = 0.0;
1174       N[i] = 0;
1175     }
1176     if(bychunk)
1177     {
1178       for(i = 0; i < s->header->n_documents; i++)
1179       {
1180         j = s->cluster_assignments[i];
1181         if(j == 0)
1182           continue;
1183         T = get_document_affinity(feature_space, (void *)(s->file_origin +
1184                     s->document_offsets[i]));
1185         A[j] += T;
1186         if(T > closest_doc_affinity)
1187         {
1188           closest_doc = i;
1189           closest_doc_affinity = T;
1190         }
1191         N[j]++;
1192       }
1193       for(i = 1; i <= s->header->n_clusters; i++)
1194         T += A[i] /= N[i];
1195     } else
1196     {
1197       for(i = 0; i < s->header->n_documents; i++)
1198       {
1199         j = s->cluster_assignments[i];
1200         T = get_document_affinity(feature_space, (void *)(s->file_origin +
1201                     s->document_offsets[i]));
1202         if(T > A[j])
1203           A[j] = T;
1204         if(T > closest_doc_affinity)
1205         {
1206           closest_doc = i;
1207           closest_doc_affinity = T;
1208         }
1209         N[j]++;
1210       }
1211     }
1212     T = 0.0000001;
1213     j = 1;
1214     for(i = 1; i <= s->header->n_clusters; i++)
1215     {
1216       if(A[i] > A[j])
1217         j = i;
1218       if(A[i] == 0.0)
1219         p[i] = 0.0;
1220       else
1221         p[i] = normalized_gauss(1.0 / A[i] - 1.0, 0.5);
1222       //p[i] = A[i];
1223       T += p[i];
1224     }
1225     if(s->header->n_clusters < 2)
1226       p[j = 1] = 0.0;
1227     for(i = 1; i <= s->header->n_clusters; i++)
1228     {
1229       p[i] /= T;
1230       pR[i] = 10 * ( log10(0.0000001 + p[i])
1231                         - log10(1.0000001 - p[i]) );
1232     }
1233 
1234     if(joe_trace)
1235       fprintf(stderr, "generating output...\n");
1236 
1237     if(p[j] > 0.5)
1238       out_len += sprintf(outbuf + out_len,
1239           "PMULC succeeds; success probabilty: %0.4f pR: %0.4f\n", p[j], pR[j]);
1240     else
1241       out_len += sprintf(outbuf + out_len,
1242           "PMULC fails; success probabilty: %0.4f pR: %0.4f\n", p[j], pR[j]);
1243     out_len += sprintf(outbuf + out_len,
1244           "Best match to clump #%ld (", j);
1245     out_len += sprint_lab(s, outbuf + out_len, j);
1246     out_len += sprintf(outbuf + out_len,
1247           ") prob: %0.4f  pR: %0.4f\n", p[j], pR[j]);
1248     out_len += sprintf(outbuf + out_len,
1249           "Closest document: #%ld (", closest_doc);
1250     out_len += sprint_tag(s, outbuf + out_len, closest_doc);
1251     out_len += sprintf(outbuf + out_len,
1252           ") affinity: %0.4f\n", closest_doc_affinity);
1253     out_len += sprintf(outbuf + out_len,
1254           "Total features in input file: %ld\n", n);
1255     for(i = 1; i <= s->header->n_clusters; i++)
1256     {
1257       out_len += sprintf(outbuf + out_len,
1258           "%ld: (", i);
1259       out_len += sprint_lab(s, outbuf + out_len, i);
1260       out_len += sprintf(outbuf + out_len,
1261           "): documents: %ld  affinity: %0.4f  prob: %0.4f  pR: %0.4f\n",
1262           N[i], A[i], p[i], pR[i]);
1263     }
1264 
1265     if (p[j] > 0.5000)
1266     {
1267       if (user_trace)
1268         fprintf (stderr, "CLUMP was a SUCCESS, continuing execution.\n");
1269     }
1270     else
1271     {
1272       if (user_trace)
1273         fprintf (stderr, "CLUMP was a FAIL, skipping forward.\n");
1274       //    and do what we do for a FAIL here
1275       csl->cstmt = csl->mct[csl->cstmt]->fail_index - 1;
1276       csl->aliusstk [csl->mct[csl->cstmt]->nest_level] = -1;
1277     };
1278     outbuf[out_len] = '\0';
1279     if(joe_trace)
1280       fprintf(stderr, "JOE_TRACE:\n%s", outbuf);
1281     if(out_var_len)
1282       crm_destructive_alter_nvariable(out_var, out_var_len, outbuf, out_len);
1283     unmap_file(s);
1284     return 0;
1285   }
1286 
1287 }
1288