/*
 * Copyright (C) 2002 Laird Breyer
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * Author:   Laird Breyer <laird@lbreyer.com>
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <string.h>
#include <stdlib.h>
#include <math.h>

#include "util.h"
#include "dbacl.h"

extern options_t m_options;
extern charparser_t m_cp;
extern options_t u_options;

extern empirical_t empirical;

extern category_t cat[MAX_CAT];
extern category_count_t cat_count;

extern myregex_t re[MAX_RE];
extern regex_count_t regex_count;

extern char *extn;
extern long system_pagesize;

/***********************************************************
 * MISCELLANEOUS FUNCTIONS                                 *
 ***********************************************************/

char *sanitize_path(char *in, char *extension) {
  char *q;
  char *path;
  charbuf_len_t l;

  if( !extension ) { extension = ""; }
  /* this bit likely fails in DOS ;-) */
  path = getenv(DEFAULT_CATPATH);
  if( path && (*in != '/') && (*in != '.') ) {
    l = strlen(path);
    q = (char *)malloc(l + strlen(in) + 3 + strlen(extension));
    strcpy(q, path);
    if( q[l - 1] != '/' ) {
      q[l] = '/';
      q[l + 1] = 0;
    }
    strcat(q, in);
    strcat(q, extension);
  } else {
    q = (char *)malloc(strlen(in) + strlen(extension) + 1);
    strcpy(q, in);
    strcat(q, extension);
  }
  return q;
}
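
/* Usage sketch (the values here are hypothetical, not from the dbacl
   docs): with the environment variable named by DEFAULT_CATPATH set to
   "/home/user/.dbacl", sanitize_path("spam", ".onl") returns
   "/home/user/.dbacl/spam.onl", whereas sanitize_path("./spam", NULL)
   bypasses the catalogue path and returns "./spam". In both cases the
   caller owns the malloc()ed string. */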

#define MOPTION(x,opt) (((opt) & (1<<(x))) ? ("|" #x) : "")

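/* For illustration: MOPTION(M_OPTION_XML, opt) expands to the string
   "|M_OPTION_XML" when that bit is set in opt, and to "" otherwise, so
   the print_* helpers below build a "|"-separated list of the options
   that are switched on. */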

char *print_charparser_type(charparser_t mcp, char *buf) {
  strcpy(buf,
	 (mcp == CP_DEFAULT) ? "CP_DEFAULT" :
	 (mcp == CP_CHAR) ? "CP_CHAR" :
	 (mcp == CP_ALPHA) ? "CP_ALPHA" :
	 (mcp == CP_ALNUM) ? "CP_ALNUM" :
	 (mcp == CP_GRAPH) ? "CP_GRAPH" :
	 (mcp == CP_CEF) ? "CP_CEF" :
	 (mcp == CP_ADP) ? "CP_ADP" :
	 (mcp == CP_CEF2) ? "CP_CEF2" : "CP_UNKNOWN");
  return buf;
}


char *print_model_options(options_t opt, charparser_t mcp, char *buf) {
  print_charparser_type(mcp, buf);
  strcat(buf, MOPTION(M_OPTION_REFMODEL, opt));
  strcat(buf, MOPTION(M_OPTION_TEXT_FORMAT, opt));
  strcat(buf, MOPTION(M_OPTION_MBOX_FORMAT, opt));
  strcat(buf, MOPTION(M_OPTION_XML, opt));
  strcat(buf, MOPTION(M_OPTION_I18N, opt));
  strcat(buf, MOPTION(M_OPTION_CASEN, opt));
  strcat(buf, MOPTION(M_OPTION_CALCENTROPY, opt));
  strcat(buf, MOPTION(M_OPTION_MULTINOMIAL, opt));
  strcat(buf, MOPTION(M_OPTION_HEADERS, opt));
  strcat(buf, MOPTION(M_OPTION_PLAIN, opt));
  strcat(buf, MOPTION(M_OPTION_NOPLAIN, opt));
  strcat(buf, MOPTION(M_OPTION_SHOW_LINKS, opt));
  strcat(buf, MOPTION(M_OPTION_SHOW_ALT, opt));
  strcat(buf, MOPTION(M_OPTION_HTML, opt));
  strcat(buf, MOPTION(M_OPTION_XHEADERS, opt));
  strcat(buf, MOPTION(M_OPTION_THEADERS, opt));
  strcat(buf, MOPTION(M_OPTION_SHOW_SCRIPT, opt));
  strcat(buf, MOPTION(M_OPTION_SHOW_STYLE, opt));
  strcat(buf, MOPTION(M_OPTION_SHOW_HTML_COMMENTS, opt));
  strcat(buf, MOPTION(M_OPTION_USE_STDTOK, opt));
  strcat(buf, MOPTION(M_OPTION_ATTACHMENTS, opt));
  strcat(buf, MOPTION(M_OPTION_WARNING_BAD, opt));
  strcat(buf, MOPTION(M_OPTION_NGRAM_STRADDLE_NL, opt));
  return buf;
}

char *print_user_options(options_t opt, char *buf) {
  strcpy(buf, MOPTION(U_OPTION_CLASSIFY, opt));
  strcat(buf, MOPTION(U_OPTION_LEARN, opt));
  strcat(buf, MOPTION(U_OPTION_FASTEMP, opt));
  strcat(buf, MOPTION(U_OPTION_CUTOFF, opt));
  strcat(buf, MOPTION(U_OPTION_VERBOSE, opt));
  strcat(buf, MOPTION(U_OPTION_STDIN, opt));
  strcat(buf, MOPTION(U_OPTION_SCORES, opt));
  strcat(buf, MOPTION(U_OPTION_POSTERIOR, opt));
  strcat(buf, MOPTION(U_OPTION_FILTER, opt));
  strcat(buf, MOPTION(U_OPTION_DEBUG, opt));
  strcat(buf, MOPTION(U_OPTION_DUMP, opt));
  strcat(buf, MOPTION(U_OPTION_APPEND, opt));
  strcat(buf, MOPTION(U_OPTION_DECIMATE, opt));
  strcat(buf, MOPTION(U_OPTION_GROWHASH, opt));
  strcat(buf, MOPTION(U_OPTION_INDENTED, opt));
  strcat(buf, MOPTION(U_OPTION_NOZEROLEARN, opt));
  strcat(buf, MOPTION(U_OPTION_MMAP, opt));
  strcat(buf, MOPTION(U_OPTION_CONFIDENCE, opt));
  strcat(buf, MOPTION(U_OPTION_VAR, opt));
  return buf;
}

error_code_t sanitize_model_options(options_t *mopt, charparser_t *mcp,
				    category_t *cat) {
  options_t mask;

  /* things that always override mopt */
  mask =
    (1<<M_OPTION_WARNING_BAD)|
    (1<<M_OPTION_SHOW_HTML_COMMENTS)|
    (1<<M_OPTION_SHOW_SCRIPT)|
    (1<<M_OPTION_SHOW_STYLE)|
    (1<<M_OPTION_HEADERS)|
    (1<<M_OPTION_XHEADERS)|
    (1<<M_OPTION_THEADERS)|
    (1<<M_OPTION_SHOW_ALT)|
    (1<<M_OPTION_SHOW_FORMS)|
    (1<<M_OPTION_NOHEADERS)|
    (1<<M_OPTION_ATTACHMENTS)|
    (1<<M_OPTION_SHOW_LINKS)|
    (1<<M_OPTION_NGRAM_STRADDLE_NL)|
    (1<<M_OPTION_REFMODEL)|
    (1<<M_OPTION_I18N)|
    (1<<M_OPTION_CASEN)|
    (1<<M_OPTION_USE_STDTOK)|
    (1<<M_OPTION_CALCENTROPY)|
    (1<<M_OPTION_MULTINOMIAL);
  *mopt |= (cat->model.options & mask);

  /* things that are problematic */

  if( *mopt & (1<<M_OPTION_TEXT_FORMAT) ) {
    if( cat->model.options & (1<<M_OPTION_MBOX_FORMAT) ) {
      errormsg(E_WARNING,
	      "category %s was learned with -T email, forcing -T text\n",
	      cat->filename);
    }
  } else if( *mopt & (1<<M_OPTION_MBOX_FORMAT) ) {
    if( cat->model.options & (1<<M_OPTION_TEXT_FORMAT) ) {
      errormsg(E_WARNING,
	      "category %s was learned with -T text, forcing -T email\n",
	      cat->filename);
    }
  } else {
    *mopt |= (cat->model.options & ((1<<M_OPTION_TEXT_FORMAT)|(1<<M_OPTION_MBOX_FORMAT)));
  }

  if( *mopt & (1<<M_OPTION_XML) ) {
    if( cat->model.options & (1<<M_OPTION_HTML) ) {
      errormsg(E_WARNING,
	      "category %s was learned with -T html, forcing -T xml\n",
	      cat->filename);
    }
  } else if( *mopt & (1<<M_OPTION_HTML) ) {
    if( cat->model.options & (1<<M_OPTION_XML) ) {
      errormsg(E_WARNING,
	      "category %s was learned with -T xml, forcing -T html\n",
	      cat->filename);
    }
  } else {
    *mopt |= (cat->model.options & ((1<<M_OPTION_XML)|(1<<M_OPTION_HTML)));
  }

  if( *mopt & (1<<M_OPTION_PLAIN) ) {
    if( cat->model.options & (1<<M_OPTION_NOPLAIN) ) {
      errormsg(E_WARNING,
	      "category %s was learned with -T email:noplain, forcing -T email:plain\n",
	      cat->filename);
    }
  } else if( *mopt & (1<<M_OPTION_NOPLAIN) ) {
    if( cat->model.options & (1<<M_OPTION_PLAIN) ) {
      errormsg(E_WARNING,
	      "category %s was learned with -T email:plain, forcing -T email:noplain\n",
	      cat->filename);
    }
  } else {
    *mopt |= (cat->model.options & ((1<<M_OPTION_NOPLAIN)|(1<<M_OPTION_PLAIN)));
  }

/*   mask =  */
/*     (1<<M_OPTION_CHAR_ALPHA)| */
/*     (1<<M_OPTION_CHAR_ALNUM)| */
/*     (1<<M_OPTION_CHAR_CHAR)| */
/*     (1<<M_OPTION_CHAR_GRAPH)| */
/*     (1<<M_OPTION_CHAR_ADP)| */
/*     (1<<M_OPTION_CHAR_CEF); */

/*   if( (*mopt & mask) && ((cat->m_options & mask) != (*mopt & mask)) ) { */
/*     errormsg(E_FATAL, */
/* 	    "category %s has incompatible token set (check -e switch)\n", */
/* 	    cat->filename); */
/*     return 0; */
/*   } else { */
/*     *mopt |= (cat->m_options & mask); */
/*   } */
  if( (*mcp != CP_DEFAULT) && (*mcp != cat->model.cp) ) {
    errormsg(E_FATAL,
	    "category %s has incompatible token set (check -e switch)\n",
	    cat->filename);
    return 0;
  } else {
    *mcp = cat->model.cp;
  }

  return 1;
}
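
/* Illustration of the merge rule above (hypothetical invocation): if a
   category was learned with -T html but the current run asks for -T xml,
   a warning is printed and the command-line choice (-T xml) is kept; if
   the current run asks for neither, the category's own setting is simply
   copied into *mopt. */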

regex_count_t load_regex(char *buf) {
  char *p;
  regex_count_t c;

  /* set up the submatch bitmap */
  re[regex_count].submatches |= 0;
  re[regex_count].flags = 0;

  p = strrchr(buf, '|');
  if( p && ( *(--p) == '|') ) {
    /* assume string ends in ||12345, use as bitmap */
    *p = '\0';
    for(p += 2; *p; p++) {
      /* assume ascii number positions */
      if( (*p > '9') || (*p < '1')) {
	if( *p != '\n' ) {
	  errormsg(E_WARNING,
		  "could not decode || suffix for '%s'.\n", buf);
	}
      } else {
	re[regex_count].submatches |= (1<<(*p - '0'));
      }
    }
  } else { /* no bitmap specified */
    re[regex_count].submatches = ~0;
  }

  /* remove trailing newline */
  if( buf[strlen(buf) - 1] == '\n' ) {
    buf[strlen(buf) - 1] = '\0';
  }
  /* GNU regexes use ordinary strings */
  /* it's a regex - see if we've got it already */
  for(c = 0; c < regex_count; c++) {
    if( strcmp(re[c].string, buf) == 0 ) {
      break;
    }
  }
  if( c >= regex_count ) { /* not found */
    /* add it to our list */
    if( strchr(buf, '(') ) {
      re[regex_count].string = strdup(buf);
    } else {
      /* room for '(' + buf + ')' + NUL */
      char *dup = (char *)malloc(strlen(buf)+3);
      if( dup ) {
	sprintf(dup, "(%s)", buf);
	errormsg(E_WARNING,
		 "no captures found in regex, converting to '%s'\n",
		 dup);
      }
      re[regex_count].string = dup;
    }
    if( !re[regex_count].string ) {
      errormsg(E_FATAL,
	       "could not prepare regular expression '%s'.\n",
	       buf);
    }
    /* and compile the regex */
    if( regcomp(&re[regex_count].regex,
		re[regex_count].string, REG_EXTENDED) != 0 ) {
      errormsg(E_FATAL, /* not much point going on */
	      "could not compile regular expression '%s'.\n",
	      re[regex_count].string);
    } else {
      regex_count++;
      if( regex_count >= MAX_RE ) {
	errormsg(E_FATAL, "too many regular expressions\n");
	/* no point in going on */
      }
    }
  }
  return c;
}
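
/* Illustration (hypothetical pattern, not shipped with dbacl): a line
   such as "Subject: (.*)||1" is truncated at the "||", and the trailing
   "1" sets bit 1 of the submatch bitmap (which the tokenizer consults
   elsewhere); a line without a "||" suffix keeps every capture; and a
   pattern with no parentheses at all, e.g. "Subject: .*", is rewritten
   to "(Subject: .*)" with a warning before being compiled. */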

void free_all_regexes() {
  regex_count_t k;
  for(k = 0; k < regex_count; k++) {
    regfree(&re[k].regex);
  }
}


/***********************************************************
 * EMPIRICAL DISTRIBUTION OF TOKENS                        *
 ***********************************************************/

/* initialize global learner object */
void init_empirical(empirical_t *emp, hash_count_t dmt, hash_bit_count_t dmhb) {

    /* some constants */
    emp->max_tokens = dmt;
    emp->max_hash_bits = dmhb;
    emp->full_token_count = 0;
    emp->unique_token_count = 0;
    emp->track_features = 0;
    emp->feature_stack_top = 0;
    emp->hashfull_warning = 0;

    /* allocate room for hash */
    emp->hash = (h_item_t *)calloc(emp->max_tokens, sizeof(h_item_t));
    if( !emp->hash ) {
      errormsg(E_FATAL,
	       "not enough memory? I couldn't allocate %li bytes\n",
	       (sizeof(h_item_t) * ((long int)emp->max_tokens)));
    }

    MADVISE(emp->hash, sizeof(h_item_t) * emp->max_tokens, MADV_RANDOM);

}

void free_empirical(empirical_t *emp) {
  if( emp->hash ) {
    free(emp->hash);
  }
}

void clear_empirical(empirical_t *emp) {
    token_stack_t i;

    if( emp->track_features ) {
	/* this may actually be slower than a global memset */
	for(i = 0; i < emp->feature_stack_top; i++) {
	    memset(emp->feature_stack[i], 0, sizeof(h_item_t));
	}
    } else {
	memset(emp->hash, 0, sizeof(h_item_t) * emp->max_tokens);
    }

    emp->full_token_count = 0;
    emp->unique_token_count = 0;
    emp->feature_stack_top = 0;

    if( u_options & (1<<U_OPTION_FASTEMP) ) {
      emp->track_features = 1;
    }
}

h_item_t *find_in_empirical(empirical_t *emp, hash_value_t id) {
  register h_item_t *i, *loop;
  /* start at id */
  i = loop = &emp->hash[id & (emp->max_tokens - 1)];

  while( FILLEDP(i) ) {
    if( EQUALP(i->id,id) ) {
      return i; /* found id */
    } else {
      i++; /* not found */
      /* wrap around */
      i = (i >= &emp->hash[emp->max_tokens]) ? emp->hash : i;
      if( i == loop ) {
	return NULL; /* when hash table is full */
      }
    }
  }

  /* empty slot, so not found */

  return i;
}

/* calculates the entropy of the full empirical measure */
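/* In symbols, with c_j the count of token j and N = full_token_count,
   the loop below accumulates e = (1/N) * sum_j c_j*log(c_j) - log(N),
   and the function returns -e = log(N) - (1/N) * sum_j c_j*log(c_j),
   i.e. the entropy of the empirical distribution {c_j/N}. */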
score_t empirical_entropy(empirical_t *emp) {
  token_stack_t j;
  hash_count_t i;
  score_t e = 0.0;

  if( emp->track_features &&
      (emp->feature_stack_top < MAX_TOKEN_LINE_STACK) ) {
    for(j = 0; j < emp->feature_stack_top; j++) {
      e += ((score_t)emp->feature_stack[j]->count) *
	log((score_t)emp->feature_stack[j]->count);
    }
  } else {
    for(i = 0; i < emp->max_tokens; i++) {
      if( FILLEDP(&emp->hash[i]) ) {
	e += ((score_t)emp->hash[i].count) *
	  log((score_t)emp->hash[i].count);
      }
    }
  }
  e = e/emp->full_token_count - log((score_t)emp->full_token_count);

  return -e;
}

/***********************************************************
 * CATEGORY FUNCTIONS                                      *
 ***********************************************************/

/* initialize to zero. fullfilename specified elsewhere */
void init_category(category_t *cat) {
    char *p;

    cat->model_full_token_count = 0;
    cat->model_unique_token_count = 0;
    cat->score = 0.0;
    cat->score_div = 0.0;
    cat->score_s2 = 0.0;
    cat->score_shannon = 0.0;
    cat->shannon = 0.0;
    cat->shannon_s2 = 0.0;
    cat->alpha = 0.0;
    cat->beta = 0.0;
    cat->mu = 0.0;
    cat->s2 = 0.0;
    cat->prior = 0.0;
    cat->complexity = 0.0;
    cat->fcomplexity = 0;
    cat->fmiss = 0;
    cat->max_order = 0;
    memset(cat->mediacounts, 0, sizeof(token_count_t)*TOKEN_CLASS_MAX);
    p = strrchr(cat->fullfilename, '/');
    if( p ) {
      cat->filename = p + 1; /* only keep basename */
    } else {
      cat->filename = cat->fullfilename;
    }
    cat->filename = strdup(cat->filename);
    if( extn && *extn ) {
      p = strstr(cat->filename, extn);
      if( p ) { *p = '\0'; }
    }
    cat->retype = 0;
    cat->model.type = simple;
    cat->model.options = 0;
    cat->model.cp = 0;
    cat->model.dt = 0;
    cat->c_options = 0;
    cat->hash = NULL;
    cat->mmap_offset = 0;
    cat->mmap_start = NULL;
}

bool_t create_category_hash(category_t *cat, FILE *input, int protf) {
  hash_count_t i, j;

  if( u_options & (1<<U_OPTION_MMAP) ) {
    cat->mmap_offset = ftell(input);
    if( cat->mmap_offset > 0 ) {
      cat->mmap_start =
	(byte_t *)MMAP(0, sizeof(c_item_t) * cat->max_tokens +
		       cat->mmap_offset,
		       protf, MAP_SHARED, fileno(input), 0);
      if( cat->mmap_start == MAP_FAILED ) { cat->mmap_start = NULL; }
      if( cat->mmap_start ) {
	cat->hash = (c_item_t *)(cat->mmap_start + cat->mmap_offset);
	MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,
		MADV_SEQUENTIAL|MADV_WILLNEED);
	/* lock the pages to prevent swapping - on Linux, this
	   works without root privs so long as the user limits
	   are big enough - mine are unlimited ;-)
	   On other OSes, root may be necessary. If we can't
	   lock, it doesn't really matter, but cross validations
	   and multiple classifications are a _lot_ faster with locking. */
	MLOCK(cat->hash, sizeof(c_item_t) * cat->max_tokens);
	cat->c_options |= (1<<C_OPTION_MMAPPED_HASH);
      }
    }
  }

  if( !cat->hash ) {
    cat->c_options &= ~(1<<C_OPTION_MMAPPED_HASH);
    /* allocate hash table */
    cat->hash = (c_item_t *)malloc(sizeof(c_item_t) * cat->max_tokens);
    if( !cat->hash ) {
      errormsg(E_ERROR, "not enough memory for category %s\n",
	       cat->filename);
      return 0;
    }

    MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,
	    MADV_SEQUENTIAL);

    /* read in hash table */
    i = cat->max_tokens;
    j = 0;
    while(!ferror(input) && !feof(input) && (j < i) ) {
      j += fread(cat->hash + j, sizeof(c_item_t), i - j, input);
    }

    if( j < i ) {
      errormsg(E_ERROR, "corrupt category? %s\n",
	       cat->fullfilename);
      free(cat->hash);
      return 0;
    }

  }
  return 1;
}


void free_category_hash(category_t *cat) {
  if( cat->hash ) {
    if( cat->mmap_start != NULL ) {
      MUNMAP(cat->mmap_start, cat->max_tokens * sizeof(c_item_t) +
	     cat->mmap_offset);
      cat->mmap_start = NULL;
      cat->mmap_offset = 0;
      cat->hash = NULL;
    }
    if( cat->hash ) {
      free(cat->hash);
      cat->hash = NULL;
    }
  }
}

/* frees the resources associated with a category */
void free_category(category_t *cat) {
  free_category_hash(cat);
  if( cat->filename ) { free(cat->filename); }
  if( cat->fullfilename ) { free(cat->fullfilename); }
}

/* turns purely random text into a category of its own */
void init_purely_random_text_category(category_t *cat) {
  alphabet_size_t i, j;
  weight_t z = -log((double)(ASIZE - AMIN));

#if defined DIGITIZE_DIGRAMS
  digitized_weight_t zz = PACK_DIGRAMS(z);
#endif

  for(i = AMIN; i < ASIZE; i++) {
    for(j = AMIN; j < ASIZE; j++) {
#if defined DIGITIZE_DIGRAMS
      cat->dig[i][j] = zz;
#else
      cat->dig[i][j] = z;
#endif
    }
  }

  z = -log((double)(ASIZE - AMIN));
#if defined DIGITIZE_DIGRAMS
  zz = PACK_DIGRAMS(z);
#endif
  for(j = AMIN; j < ASIZE; j++) {
#if defined DIGITIZE_DIGRAMS
    cat->dig[(alphabet_size_t)DIAMOND][j] = zz;
#else
    cat->dig[(alphabet_size_t)DIAMOND][j] = z;
#endif
  }

  /* not needed: set DIAMOND-DIAMOND score for completeness only */

#if defined DIGITIZE_DIGRAMS
  cat->dig[(alphabet_size_t)DIAMOND][(alphabet_size_t)DIAMOND] = DIGITIZED_WEIGHT_MIN;
#else
  cat->dig[(alphabet_size_t)DIAMOND][(alphabet_size_t)DIAMOND] = log(0.0);
#endif

  cat->logZ = 0.0;
  cat->divergence = 0.0;
  cat->delta = 0.0;
  cat->renorm = 0.0;
  cat->hash = NULL;
  cat->mmap_start = NULL;
  cat->mmap_offset = 0;
  cat->model.type = simple;
  cat->max_order = 1;
  cat->model.options = 0;
  cat->model.cp = 0;
  cat->model.dt = 0;
}

c_item_t *find_in_category(category_t *cat, hash_value_t id) {
    register c_item_t *i, *loop;

    if( cat->hash ) {
	/* start at id */
	i = loop = &cat->hash[id & (cat->max_tokens - 1)];

	while( FILLEDP(i) ) {
	    if( EQUALP(NTOH_ID(i->id),id) ) {
		return i; /* found id */
	    } else {
		i++; /* not found */
		/* wrap around */
		i = (i >= &cat->hash[cat->max_tokens]) ? cat->hash : i;
		if( i == loop ) {
		    return NULL; /* when hash table is full */
		}
	    }
	}
	return i;
    } else {
	return NULL;
    }
}

/* for each loaded category, this calculates the score.
   Tokens have the format
   DIAMOND t1 DIAMOND t2 ... tn DIAMOND CLASSEP class NUL */
void score_word(char *tok, token_type_t tt, regex_count_t re) {
  category_count_t i = 0;
  weight_t multinomial_correction = 0.0;
  weight_t shannon_correction = 0.0;
  weight_t lambda, ref, oldscore;
  bool_t apply;
  alphabet_size_t pp, pc, len;
  hash_value_t id;
  char *q;
  register c_item_t *k = NULL;
  h_item_t *h = NULL;

  /* we skip "empty" tokens */
  for(q = tok; q && *q == DIAMOND; q++);
  if( q && (*q != EOTOKEN) ) {

    id = hash_full_token(tok);

    if( (m_options & (1<<M_OPTION_CALCENTROPY)) ) {
      /* add the token to the hash */

      h = find_in_empirical(&empirical, id);
      if( h ) {
	if( FILLEDP(h) ) {
	  if( h->count < K_TOKEN_COUNT_MAX ) {
	    h->count++;

#if defined SHANNON_STIRLING
	    shannon_correction = 1.0 + log((weight_t)h->count *
					   (weight_t)(h->count - 1))/2.0;
#else
	    shannon_correction += log((weight_t)h->count)*(weight_t)h->count
	      - log((weight_t)(h->count - 1))*(weight_t)(h->count - 1);
#endif
	  }
	} else {
	  if( /* !FILLEDP(i) && */
	     ((100 * empirical.unique_token_count) <
	      (HASH_FULL * empirical.max_tokens) )) {
	    /* fill the empirical hash */
	    SET(h->id,id);
	    empirical.unique_token_count +=
	      ( empirical.unique_token_count < K_TOKEN_COUNT_MAX ) ? 1 : 0;
	    h->count = 1;
#if defined SHANNON_STIRLING
	    shannon_correction = 1.0 - log(2.0 * M_PI)/2.0;
#else
	    shannon_correction = 0;
#endif
	  } else {
	    /* hash full */
	    h = NULL;
	    if( !empirical.hashfull_warning ) {
	      errormsg(E_WARNING,
		      "empirical hash full, calculation may be skewed. "
		      "Try option -h %d\n",
		      (empirical.max_hash_bits + 1));
	      empirical.hashfull_warning = 1;
	    }
	    return; /* pretend word doesn't exist */
	  }

	  if( empirical.track_features ) {
	    if( empirical.feature_stack_top < MAX_TOKEN_LINE_STACK ) {
	      empirical.feature_stack[empirical.feature_stack_top++] = h;
	    } else {
	      empirical.track_features = 0;
	      empirical.feature_stack_top = 0;
	    }
	  }
	}
	empirical.full_token_count +=
	  ( empirical.full_token_count < K_TOKEN_COUNT_MAX ) ? 1 : 0;

      }
    }

    /* now do scoring for all available categories */
    for(i = 0; i < cat_count; i++) {

      oldscore = cat[i].score;
      lambda = 0.0;
      ref = 0.0;

      /* see if this token is for us. The rule is: a category either
	 uses the standard tokenizer (in that case re = INVALID_RE),
	 or it uses only those regexes which are listed in the retype
	 bitmap. Since re = 0 is taken by the standard tokenizer,
	 this occurs when re > 0 and we have to subtract 1 to check
	 the bitmap. Simple, really ;-) */
      apply = ( ((re == INVALID_RE) &&
		 (tt.order <= cat[i].max_order) && !cat[i].retype) ||
		((re > 0) &&
		 (cat[i].retype & (1<<(re-1)))) );
      if( apply ) {

	/* if token found, add its lambda weight */
	k = find_in_category(&cat[i], id);
	if( k ) {
	  lambda = UNPACK_LAMBDA(NTOH_LAMBDA(k->lam));
	}

	if( tt.order == 1 ) {
	  /* now compute the reference weight from digram model,
	     (while duplicating the digitization error) */
	  pp = (unsigned char)*tok;
	  CLIP_ALPHABET(pp);
	  q = tok + 1;
	  len = 1;
	  while( *q != EOTOKEN ) {
	    if( *q == '\r' ) {
	      q++;
	      continue;
	    }
	    pc = (unsigned char)*q;
	    CLIP_ALPHABET(pc);
	    ref += UNPACK_DIGRAMS(cat[i].dig[pp][pc]);
	    pp = pc;
	    q++;
	    if( *q != DIAMOND ) { len++; }
	  }
	  ref += UNPACK_DIGRAMS(cat[i].dig[RESERVED_TOKLEN][len]) -
	    UNPACK_DIGRAMS(cat[i].dig[RESERVED_TOKLEN][0]);
	  ref = UNPACK_RWEIGHTS(PACK_RWEIGHTS(ref));

	}

	/* update the complexity */
	/* note: complexity has nothing to do with Kolmogorov's definition */
	/* this is actually very simple in hindsight, but took
	   me a long time to get right. Different versions of dbacl
	   compute the complexity in different ways, and I kept changing
	   the method because I wasn't happy.

	   In previous versions, complexity was an integer, which begs
	   the question "what does it count?".  For simple models
	   (max_order = 1) this is easy: we count the number of
	   tokens. But for max_order > 1, it's not obvious, because we
	   need to scale the count by 1/max_order asymptotically.

	   One way is to increment the complexity if we encounter a
	   token of order max_order. This is correct for Markovian
	   models and corresponds to the dbacl.ps writeup, but causes
	   trouble in some edge cases. For example, if we classify a
	   very short document, there might not be enough tokens for
	   the count to make sense.  This actually occurs when dbacl
	   must classify individual lines, and some lines contain only
	   one or two tokens.  Worse, dbacl used to renormalize at the
	   same time as updating the complexity, which increases the
	   likelihood of having a negative divergence score estimate
	   in the first few iterations - very bad.  Finally, the
	   complexity is nearly meaningless for models built up from
	   regular expressions, because both the start and the end of
	   each line contain incomplete n-grams (recall regexes can't
	   straddle newlines).

	   So to solve these problems, some previous versions of dbacl
	   always counted the order 1 tokens. Asymptotically, this
	   makes no difference, but again it fails on edge
	   cases. Firstly, doing this means that the complexity for a
	   simple model is the same as the complexity for an n-gram
	   model for any n, which makes it hard to compare mixed
	   models because n-gram model scores are consistently biased
	   for n > 1. Another problem is again with regexes, because
	   the incomplete n-gram tokens at the start and end of each
	   line add up to a pretty large error over thousands of
	   tokens.

	   The solution to the above problems is twofold: first, we
	   renormalize after each token, regardless of its order. Of
	   course this means we must divide logZ by the number of
	   tokens per complexity unit, i.e. renorm = delta * logZ with
	   delta = 1/max_order. Once I realized this it was obvious
	   that the complexity should also be incremented by delta for
	   every token. As a side effect, the complexity is now a real
	   number, and actually measures not just the max_order token
	   count, but also the fraction of incomplete n-grams. This
	   seems like the right way to go, especially for models based
	   on regexes, since now we also count the incomplete n-grams
	   at both ends of the line, which adds up to quite a bit over
	   many lines. */
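
	/* Worked illustration (the numbers are hypothetical, not from
	   the dbacl docs): with max_order = 3 and logZ = 6.0, we get
	   delta = 1/3 and renorm = delta * logZ = 2.0, so every token
	   scored against that category, whatever its order, adds 1/3
	   to the complexity and has 2.0 subtracted from its score;
	   three consecutive tokens thus account for one full 3-gram's
	   worth of logZ. */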

	cat[i].fcomplexity++; /* don't actually need this, but nice to have */
	cat[i].complexity += cat[i].delta;

	/* now adjust the score */
	switch(cat[i].model.type) {
	case simple:
	  multinomial_correction = h ?
	    (log((weight_t)cat[i].complexity) - log((weight_t)h->count)) : 0.0;
	  cat[i].score +=
	    lambda + multinomial_correction + ref - cat[i].renorm;
	  break;
	case sequential:
	default:
	  cat[i].score += lambda + ref - cat[i].renorm;
	  if( tt.order == cat[i].max_order ) {
	    cat[i].score_shannon += shannon_correction;
	  }
	  break;
	}

	if( !k || !NTOH_ID(k->id) ) {
	  /* missing data */
	  cat[i].fmiss++;
	}

	if( tt.order == 1 ) {
	  /* sample variance */
	  cat[i].score_s2 += (cat[i].score - oldscore) * (cat[i].score - oldscore);
	  /* only count medium for 1-grams */
	  cat[i].mediacounts[tt.cls]++;
	}

      }

      if( u_options & (1<<U_OPTION_DUMP) ) {
	if( u_options & (1<<U_OPTION_SCORES) ) {
	  fprintf(stdout, " %8.2f * %-6.1f\t",
		  -sample_mean(cat[i].score, cat[i].complexity),
		  cat[i].complexity);
	} else if( u_options & (1<<U_OPTION_POSTERIOR) ) {
	  fprintf(stdout, " %8.2f\t", oldscore - cat[i].score);
	} else if( u_options & (1<<U_OPTION_VAR) ) {
	  fprintf(stdout, " %8.2f * %-6.1f # %-8.2f\t",
		  -sample_mean(cat[i].score,cat[i].complexity),
		  cat[i].complexity,
		  sample_variance(cat[i].score_s2, cat[i].score,
				  cat[i].complexity)/cat[i].complexity);
	} else {
	  fprintf(stdout,
		  "%7.2f %7.2f %7.2f %7.2f %8lx\t",
		  lambda, ref, apply ? -cat[i].renorm : 0.0,
		  multinomial_correction,
		  (long unsigned int)((k && apply) ? NTOH_ID(k->id) : 0));
	}
      }

    }

    if( u_options & (1<<U_OPTION_DUMP) ) {
      print_token(stdout, tok);
      if( re > 0 ) {
	fprintf(stdout, "<re=%d>\n", re);
      } else {
	fprintf(stdout, "\n");
      }
    }

  }
}

/*
 * Returns 2 * min[ F(obs), 1 - F(obs) ], and calls it
 * the "confidence". In reality, this is a type of p-value,
 * which is zero when obs = +/- infinity, and 1 at the median.
 * Under the null hypothesis obs ~ Gamma(alpha, beta), this
 * p-value is uniformly distributed. This is chosen so that
 * we can combine our p-values with Fisher's chi^2 test, viz.
 * -2 \sum_{i=1}^n \log X_i ~ chi^2(2n), if all X_i are IID uniform.
 *
 */
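/* Reading the per-thousand result (a paraphrase of the formula above,
   not separately documented behaviour): values near 1000 mean obs lies
   close to the median of the fitted Gamma(alpha, beta), while values
   near 0 mean obs sits far out in either tail. */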
confidence_t gamma_pvalue(category_t *cat, double obs) {
  double m;
  if( (obs <= 0.0) || (cat->alpha <= 0.0) || (cat->beta <= 0.0) ) {
    return 0; /* bogus value */
  } else if( obs > 120.0 ) {
    /* large values can produce underflow in gamma_tail() */
    return 0; /* bogus value */
  }

  m = gamma_tail(cat->alpha, cat->beta, obs);

  /* uncomment below to get one sided confidence */
  /*   return (int)(1000.0 * m); */
  /* result is per-thousand */
  return (confidence_t)(2000.0 * ((m > 0.5) ? 1.0 - m : m));
}
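
/* A minimal sketch (not part of dbacl, compiled out) of how the
   per-thousand confidences could be combined with Fisher's method, as
   the comment above suggests: if the n values are independent and
   uniform under the null, -2 * sum(log X_i) is chi^2 with 2n degrees
   of freedom. The name fisher_statistic and the clipping constant are
   illustrative choices, not existing dbacl code. */
#if 0
static double fisher_statistic(const confidence_t *conf, int n) {
  double s = 0.0;
  int i;
  for(i = 0; i < n; i++) {
    double x = conf[i] / 1000.0;   /* back to a p-value in (0,1] */
    if( x <= 0.0 ) { x = 0.0005; } /* avoid log(0) for clipped values */
    s += log(x);
  }
  return -2.0 * s;                 /* compare against chi^2(2n) quantiles */
}
#endif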

/***********************************************************
 * FILE MANAGEMENT FUNCTIONS                               *
 ***********************************************************/
error_code_t load_category_header(FILE *input, category_t *cat) {
  char buf[MAGIC_BUFSIZE];
  char scratchbuf[MAGIC_BUFSIZE];
  short int shint_val, shint_val2;
  long int lint_val1, lint_val2, lint_val3;

  if( input ) {
    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	strncmp(buf, MAGIC1, MAGIC1_LEN) ) {
      errormsg(E_ERROR,
	       "not a dbacl " SIGNATURE " category file [%s]\n",
	       cat->fullfilename);
      return 0;
    }

    init_category(cat); /* changes filename */

    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	(sscanf(buf, MAGIC2_i, &cat->divergence, &cat->logZ,
		&shint_val, scratchbuf) < 4) ) {
      errormsg(E_ERROR, "bad category file [2]\n");
      return 0;
    }
    cat->max_order = (token_order_t)shint_val;
    cat->delta = 1.0/(score_t)(cat->max_order);
    cat->renorm = cat->delta * cat->logZ;
    if( scratchbuf[0] == 'm' ) {
      cat->model.type = simple;
    } else {
      cat->model.type = sequential;
    }

    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	(sscanf(buf, MAGIC3,
		&shint_val,
		&lint_val1,
		&lint_val2,
		&lint_val3) < 4) ) {
      errormsg(E_ERROR, "bad category file [3]\n");
      return 0;
    }
    cat->max_hash_bits = (token_order_t)shint_val;
    cat->model_full_token_count = (token_count_t)lint_val1;
    cat->model_unique_token_count = (token_count_t)lint_val2;
    cat->model_num_docs = (document_count_t)lint_val3;

    cat->max_tokens = (1<<cat->max_hash_bits);

    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	(sscanf(buf, MAGIC8_i,
		&cat->shannon, &cat->shannon_s2) < 2) ) {
      errormsg(E_ERROR, "bad category file [8]\n");
      return 0;
    }

    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	(sscanf(buf, MAGIC10_i,
		&cat->alpha, &cat->beta,
		&cat->mu, &cat->s2) < 4) ) {
      errormsg(E_ERROR, "bad category file [10]\n");
      return 0;
    }

    /* see if there are any regexes */
    while(fgets(buf, MAGIC_BUFSIZE, input)) {
      if( strncmp(buf, MAGIC6, 2) == 0 ) {
	break;
      } else if( strncmp(buf, MAGIC5_i, 8) == 0 ) {

	/* if regex can't be compiled, load_regex() exits */
	cat->retype |= (1<<load_regex(buf + RESTARTPOS));

      } else if( strncmp(buf, MAGIC4_i, 10) == 0) {
	if( sscanf(buf, MAGIC4_i, &lint_val1, &shint_val, &shint_val2, scratchbuf) == 4 ) {
	  cat->model.options = (options_t)lint_val1;
	  cat->model.cp = (charparser_t)shint_val;
	  cat->model.dt = (digtype_t)shint_val2;
	}
      }

      /* finished with current line, get next one */
    }

    /* if this category did not register a regex, it wants
       the default processing, so we flag this */
    if( !cat->retype ) {
      cat->model.options |= (1<<M_OPTION_USE_STDTOK);
    }

    /* if we haven't read a character class, use alpha */
    if( cat->model.cp == CP_DEFAULT ) {
      if( cat->model.options & (1<<M_OPTION_MBOX_FORMAT) ) {
	cat->model.cp = CP_ADP;
      } else {
	cat->model.cp = CP_ALPHA;
      }
    }

    /* if we're here, success! */
    return 1;
  }
  return 0;
}


error_code_t explicit_load_category(category_t *cat, char *openf, int protf) {
  hash_count_t i, j;

  FILE *input;

  /* this is needed in case we try to open with write permissions,
     which would otherwise create the file */

  input = fopen(cat->fullfilename, "rb");
  if( input && (strcmp(openf, "rb") != 0) ) {
    input = freopen(cat->fullfilename, openf, input);
  }

  if( input ) {

    if( !load_category_header(input, cat) ) {
      fclose(input);
      return 0;
    }

    /* read character frequencies */
    i = ASIZE * ASIZE;
    j = 0;
    while(!ferror(input) && !feof(input) && (j < i) ) {
      j += fread(cat->dig + j, SIZEOF_DIGRAMS, i - j, input);
    }
    if( j < i ) {
      errormsg(E_ERROR, "is this category corrupt: %s?\n",
	      cat->fullfilename);
      fclose(input);
      return 0;
    }

#if defined PORTABLE_CATS
    for(i = 0; i < ASIZE; i++) {
      for(j = 0; j < ASIZE; j++) {
	cat->dig[i][j] = NTOH_DIGRAM(cat->dig[i][j]);
      }
    }
#endif

    if( !create_category_hash(cat, input, protf) ) {
      fclose(input);
      return 0;
    }

    fclose(input);

    return 1;
  }

  return 0;
}


/* loads a category hash
   returns 0 on failure, you should free the category in that case */
error_code_t load_category(category_t *cat) {
  return explicit_load_category(cat, "rb", PROT_READ);
}

/* loads a category file for potential read/write */
error_code_t open_category(category_t *cat) {
  return explicit_load_category(cat, "r+b", PROT_READ|PROT_WRITE);
}

error_code_t reload_category(category_t *cat) {
  if( cat ) {
    /* free the hash, but keep the cat->fullfilename */
    free_category_hash(cat);
    return load_category(cat) &&
      sanitize_model_options(&m_options,&m_cp,cat);
  }
  return 0;
}

void reload_all_categories() {
  category_count_t c;
  for(c = 0; c < cat_count; c++) {
    if( !reload_category(&cat[c]) ) {
      errormsg(E_FATAL,
	      "could not reload %s, exiting\n", cat[c].fullfilename);
    }
  }
}