1 /*
2 * Copyright (C) 2002 Laird Breyer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 * Author: Laird Breyer <laird@lbreyer.com>
19 */
20
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24
25 #include <string.h>
26 #include <stdlib.h>
27 #include <math.h>
28
29 #include "util.h"
30 #include "dbacl.h"
31
32 extern options_t m_options;
33 extern charparser_t m_cp;
34 extern options_t u_options;
35
36 extern empirical_t empirical;
37
38 extern category_t cat[MAX_CAT];
39 extern category_count_t cat_count;
40
41 extern myregex_t re[MAX_RE];
42 extern regex_count_t regex_count;
43
44 extern char *extn;
45 extern long system_pagesize;
46
47 /***********************************************************
48 * MISCELLANEOUS FUNCTIONS *
49 ***********************************************************/
50
sanitize_path(char * in,char * extension)51 char *sanitize_path(char *in, char *extension) {
52 char *q;
53 char *path;
54 charbuf_len_t l;
55
56 if( !extension ) { extension = ""; }
57 /* this bit likely fails in DOS ;-) */
58 path = getenv(DEFAULT_CATPATH);
59 if( path && (*in != '/') && (*in != '.') ) {
60 l = strlen(path);
61 q = (char *)malloc(l + strlen(in) + 3 + strlen(extension));
62 strcpy(q, path);
63 if( q[l - 1] != '/' ) {
64 q[l] = '/';
65 q[l + 1] = 0;
66 }
67 strcat(q, in);
68 strcat(q, extension);
69 } else {
70 q = (char *)malloc(strlen(in) + strlen(extension) + 1);
71 strcpy(q, in);
72 strcat(q, extension);
73 }
74 return q;
75 }
76
77 #define MOPTION(x,opt) (((opt) & (1<<(x))) ? ("|" #x) : "")
78
79
/* Writes the symbolic name of the character parser mcp into buf
   and returns buf.  Unrecognized values map to "CP_UNKNOWN". */
char *print_charparser_type(charparser_t mcp, char *buf) {
  const char *name;

  switch(mcp) {
  case CP_DEFAULT: name = "CP_DEFAULT"; break;
  case CP_CHAR:    name = "CP_CHAR";    break;
  case CP_ALPHA:   name = "CP_ALPHA";   break;
  case CP_ALNUM:   name = "CP_ALNUM";   break;
  case CP_GRAPH:   name = "CP_GRAPH";   break;
  case CP_CEF:     name = "CP_CEF";     break;
  case CP_ADP:     name = "CP_ADP";     break;
  case CP_CEF2:    name = "CP_CEF2";    break;
  default:         name = "CP_UNKNOWN"; break;
  }
  strcpy(buf, name);
  return buf;
}
92
93
/* Renders the model options bitmap opt (plus the parser type mcp) as a
   human-readable string in buf, e.g. "CP_ALPHA|M_OPTION_HEADERS|...".
   The flags are emitted in a fixed order; returns buf. */
char *print_model_options(options_t opt, charparser_t mcp, char *buf) {
  /* one entry per flag, in the historical output order; the stringized
     name includes the leading '|' separator */
#define MODFLAG(x) { (x), "|" #x }
  static const struct { int bit; const char *name; } modflags[] = {
    MODFLAG(M_OPTION_REFMODEL),
    MODFLAG(M_OPTION_TEXT_FORMAT),
    MODFLAG(M_OPTION_MBOX_FORMAT),
    MODFLAG(M_OPTION_XML),
    MODFLAG(M_OPTION_I18N),
    MODFLAG(M_OPTION_CASEN),
    MODFLAG(M_OPTION_CALCENTROPY),
    MODFLAG(M_OPTION_MULTINOMIAL),
    MODFLAG(M_OPTION_HEADERS),
    MODFLAG(M_OPTION_PLAIN),
    MODFLAG(M_OPTION_NOPLAIN),
    MODFLAG(M_OPTION_SHOW_LINKS),
    MODFLAG(M_OPTION_SHOW_ALT),
    MODFLAG(M_OPTION_HTML),
    MODFLAG(M_OPTION_XHEADERS),
    MODFLAG(M_OPTION_THEADERS),
    MODFLAG(M_OPTION_SHOW_SCRIPT),
    MODFLAG(M_OPTION_SHOW_STYLE),
    MODFLAG(M_OPTION_SHOW_HTML_COMMENTS),
    MODFLAG(M_OPTION_USE_STDTOK),
    MODFLAG(M_OPTION_ATTACHMENTS),
    MODFLAG(M_OPTION_WARNING_BAD),
    MODFLAG(M_OPTION_NGRAM_STRADDLE_NL),
  };
#undef MODFLAG
  size_t n;

  print_charparser_type(mcp, buf);
  for(n = 0; n < sizeof(modflags)/sizeof(modflags[0]); n++) {
    if( opt & (1<<modflags[n].bit) ) {
      strcat(buf, modflags[n].name);
    }
  }
  return buf;
}
121
/* Renders the user options bitmap opt as a human-readable string in
   buf, e.g. "|U_OPTION_CLASSIFY|U_OPTION_VERBOSE".  The flags are
   emitted in a fixed order; returns buf. */
char *print_user_options(options_t opt, char *buf) {
  /* one entry per flag, in the historical output order; the stringized
     name includes the leading '|' separator */
#define USRFLAG(x) { (x), "|" #x }
  static const struct { int bit; const char *name; } usrflags[] = {
    USRFLAG(U_OPTION_CLASSIFY),
    USRFLAG(U_OPTION_LEARN),
    USRFLAG(U_OPTION_FASTEMP),
    USRFLAG(U_OPTION_CUTOFF),
    USRFLAG(U_OPTION_VERBOSE),
    USRFLAG(U_OPTION_STDIN),
    USRFLAG(U_OPTION_SCORES),
    USRFLAG(U_OPTION_POSTERIOR),
    USRFLAG(U_OPTION_FILTER),
    USRFLAG(U_OPTION_DEBUG),
    USRFLAG(U_OPTION_DUMP),
    USRFLAG(U_OPTION_APPEND),
    USRFLAG(U_OPTION_DECIMATE),
    USRFLAG(U_OPTION_GROWHASH),
    USRFLAG(U_OPTION_INDENTED),
    USRFLAG(U_OPTION_NOZEROLEARN),
    USRFLAG(U_OPTION_MMAP),
    USRFLAG(U_OPTION_CONFIDENCE),
    USRFLAG(U_OPTION_VAR),
  };
#undef USRFLAG
  size_t n;

  buf[0] = '\0';
  for(n = 0; n < sizeof(usrflags)/sizeof(usrflags[0]); n++) {
    if( opt & (1<<usrflags[n].bit) ) {
      strcat(buf, usrflags[n].name);
    }
  }
  return buf;
}
144
/* Reconciles the runtime model options/parser (*mopt, *mcp) with the
   options recorded in a loaded category.  Flags that merely describe
   how the category was built are OR-ed in unconditionally; mutually
   exclusive pairs (text/mbox, xml/html, plain/noplain) keep the
   runtime choice but warn when it contradicts the category.
   Returns 1 on success, 0 (after E_FATAL) on an incompatible token set. */
error_code_t sanitize_model_options(options_t *mopt, charparser_t *mcp,
				    category_t *cat) {
  options_t mask;

  /* things that always override mopt */
  mask =
    (1<<M_OPTION_WARNING_BAD)|
    (1<<M_OPTION_SHOW_HTML_COMMENTS)|
    (1<<M_OPTION_SHOW_SCRIPT)|
    (1<<M_OPTION_SHOW_STYLE)|
    (1<<M_OPTION_HEADERS)|
    (1<<M_OPTION_XHEADERS)|
    (1<<M_OPTION_THEADERS)|
    (1<<M_OPTION_SHOW_ALT)|
    (1<<M_OPTION_SHOW_FORMS)|
    (1<<M_OPTION_NOHEADERS)|
    (1<<M_OPTION_ATTACHMENTS)|
    (1<<M_OPTION_SHOW_LINKS)|
    (1<<M_OPTION_NGRAM_STRADDLE_NL)|
    (1<<M_OPTION_REFMODEL)|
    (1<<M_OPTION_I18N)|
    (1<<M_OPTION_CASEN)|
    (1<<M_OPTION_USE_STDTOK)|
    (1<<M_OPTION_CALCENTROPY)|
    (1<<M_OPTION_MULTINOMIAL);
  *mopt |= (cat->model.options & mask);

  /* things that are problematic: for each exclusive pair, the runtime
     setting wins (with a warning); if the runtime chose neither, the
     category's setting is adopted */

  if( *mopt & (1<<M_OPTION_TEXT_FORMAT) ) {
    if( cat->model.options & (1<<M_OPTION_MBOX_FORMAT) ) {
      errormsg(E_WARNING,
	       "category %s was learned with -T email, forcing -T text\n",
	       cat->filename);
    }
  } else if( *mopt & (1<<M_OPTION_MBOX_FORMAT) ) {
    if( cat->model.options & (1<<M_OPTION_TEXT_FORMAT) ) {
      errormsg(E_WARNING,
	       "category %s was learned with -T text, forcing -T email\n",
	       cat->filename);
    }
  } else {
    *mopt |= (cat->model.options & ((1<<M_OPTION_TEXT_FORMAT)|(1<<M_OPTION_MBOX_FORMAT)));
  }

  if( *mopt & (1<<M_OPTION_XML) ) {
    if( cat->model.options & (1<<M_OPTION_HTML) ) {
      errormsg(E_WARNING,
	       "category %s was learned with -T html, forcing -T xml\n",
	       cat->filename);
    }
  } else if( *mopt & (1<<M_OPTION_HTML) ) {
    if( cat->model.options & (1<<M_OPTION_XML) ) {
      errormsg(E_WARNING,
	       "category %s was learned with -T xml, forcing -T html\n",
	       cat->filename);
    }
  } else {
    *mopt |= (cat->model.options & ((1<<M_OPTION_XML)|(1<<M_OPTION_HTML)));
  }

  if( *mopt & (1<<M_OPTION_PLAIN) ) {
    if( cat->model.options & (1<<M_OPTION_NOPLAIN) ) {
      errormsg(E_WARNING,
	       "category %s was learned with -T email:noplain, forcing -T email:plain\n",
	       cat->filename);
    }
  } else if( *mopt & (1<<M_OPTION_NOPLAIN) ) {
    if( cat->model.options & (1<<M_OPTION_PLAIN) ) {
      errormsg(E_WARNING,
	       "category %s was learned with -T email:plain, forcing -T email:noplain\n",
	       cat->filename);
    }
  } else {
    *mopt |= (cat->model.options & ((1<<M_OPTION_NOPLAIN)|(1<<M_OPTION_PLAIN)));
  }

  /* historical check, superseded by the charparser comparison below */
  /* mask = */
  /*   (1<<M_OPTION_CHAR_ALPHA)| */
  /*   (1<<M_OPTION_CHAR_ALNUM)| */
  /*   (1<<M_OPTION_CHAR_CHAR)| */
  /*   (1<<M_OPTION_CHAR_GRAPH)| */
  /*   (1<<M_OPTION_CHAR_ADP)| */
  /*   (1<<M_OPTION_CHAR_CEF); */

  /* if( (*mopt & mask) && ((cat->m_options & mask) != (*mopt & mask)) ) { */
  /*   errormsg(E_FATAL, */
  /* 	       "category %s has incompatible token set (check -e switch)\n", */
  /* 	       cat->filename); */
  /*   return 0; */
  /* } else { */
  /*   *mopt |= (cat->m_options & mask); */
  /* } */

  /* the tokenizer character class must agree across all categories:
     CP_DEFAULT means "not chosen yet" and adopts the category's class */
  if( (*mcp != CP_DEFAULT) && (*mcp != cat->model.cp) ) {
    errormsg(E_FATAL,
	     "category %s has incompatible token set (check -e switch)\n",
	     cat->filename);
    return 0;
  } else {
    *mcp = cat->model.cp;
  }

  return 1;
}
249
load_regex(char * buf)250 regex_count_t load_regex(char *buf) {
251 char *p;
252 regex_count_t c;
253
254 /* set up the submatch bitmap */
255 re[regex_count].submatches |= 0;
256 re[regex_count].flags = 0;
257
258 p = strrchr(buf, '|');
259 if( p && ( *(--p) == '|') ) {
260 /* assume string ends in ||12345, use as bitmap */
261 *p = '\0';
262 for(p += 2; *p; p++) {
263 /* assume ascii number positions */
264 if( (*p > '9') || (*p < '1')) {
265 if( *p != '\n' ) {
266 errormsg(E_WARNING,
267 "could not decode || suffix for '%s'.\n", buf);
268 }
269 } else {
270 re[regex_count].submatches |= (1<<(*p - '0'));
271 }
272 }
273 } else { /* no bitmap specified */
274 re[regex_count].submatches = ~0;
275 }
276
277 /* remove trailing newline */
278 if( buf[strlen(buf) - 1] == '\n' ) {
279 buf[strlen(buf) - 1] = '\0';
280 }
281 /* GNU regexes use ordinary strings */
282 /* it's a regex - see if we've got it already */
283 for(c = 0; c < regex_count; c++) {
284 if( strcmp(re[c].string, buf) == 0 ) {
285 break;
286 }
287 }
288 if( c >= regex_count ) { /* not found */
289 /* add it to our list */
290 if( strchr(buf, '(') ) {
291 re[regex_count].string = strdup(buf);
292 } else {
293 char *dup = (char *)malloc(strlen(buf)+2);
294 if( dup ) {
295 sprintf(dup, "(%s)", buf);
296 errormsg(E_WARNING,
297 "no captures found in regex, converting to '%s'\n",
298 dup);
299 }
300 re[regex_count].string = dup;
301 }
302 if( !re[regex_count].string ) {
303 errormsg(E_FATAL,
304 "could not prepare regular expression '%s'.\n",
305 buf);
306 }
307 /* and compile the regex */
308 if( regcomp(&re[regex_count].regex,
309 re[regex_count].string, REG_EXTENDED) != 0 ) {
310 errormsg(E_FATAL, /* not much point going on */
311 "could not compile regular expression '%s'.\n",
312 re[regex_count].string);
313 } else {
314 regex_count++;
315 if( regex_count >= MAX_RE ) {
316 errormsg(E_FATAL, "too many regular expressions\n");
317 /* no point in going on */
318 }
319 }
320 }
321 return c;
322 }
323
free_all_regexes()324 void free_all_regexes() {
325 regex_count_t k;
326 for(k = 0; k < regex_count; k++) {
327 regfree(&re[k].regex);
328 }
329 }
330
331
332 /***********************************************************
333 * EMPIRICAL DISTRIBUTION OF TOKENS *
334 ***********************************************************/
335
336 /* initialize global learner object */
/* Initializes the global empirical-distribution object: records the
   table geometry (dmt slots, dmhb hash bits), zeroes all counters, and
   allocates a zero-filled hash table.  Exits via errormsg(E_FATAL,...)
   if the allocation fails. */
void init_empirical(empirical_t *emp, hash_count_t dmt, hash_bit_count_t dmhb) {

  /* fixed table geometry for this run */
  emp->max_tokens = dmt;
  emp->max_hash_bits = dmhb;

  /* all counters and flags start at zero */
  emp->full_token_count = 0;
  emp->unique_token_count = 0;
  emp->track_features = 0;
  emp->feature_stack_top = 0;
  emp->hashfull_warning = 0;

  /* calloc gives us an already-cleared hash table */
  emp->hash = (h_item_t *)calloc(emp->max_tokens, sizeof(h_item_t));
  if( emp->hash == NULL ) {
    errormsg(E_FATAL,
	    "not enough memory? I couldn't allocate %li bytes\n",
	    (sizeof(h_item_t) * ((long int)emp->max_tokens)));
  }

  /* accesses will be scattered, hint the VM accordingly */
  MADVISE(emp->hash, sizeof(h_item_t) * emp->max_tokens, MADV_RANDOM);
}
359
/* Releases the empirical hash table.  free(NULL) is a no-op, so no
   guard is needed; clearing the pointer protects against accidental
   double-free or use-after-free. */
void free_empirical(empirical_t *emp) {
  free(emp->hash);
  emp->hash = NULL;
}
365
/* Resets the empirical distribution to empty between documents.  When
   feature tracking was on, only the slots actually touched are wiped;
   otherwise the whole table is cleared.  Tracking is (re)enabled when
   the user asked for the fast-empirical optimization. */
void clear_empirical(empirical_t *emp) {
  token_stack_t n;

  if( !emp->track_features ) {
    memset(emp->hash, 0, sizeof(h_item_t) * emp->max_tokens);
  } else {
    /* this may actually be slower than a global memset */
    for(n = 0; n < emp->feature_stack_top; n++) {
      memset(emp->feature_stack[n], 0, sizeof(h_item_t));
    }
  }

  emp->feature_stack_top = 0;
  emp->full_token_count = 0;
  emp->unique_token_count = 0;

  if( u_options & (1<<U_OPTION_FASTEMP) ) {
    emp->track_features = 1;
  }
}
386
find_in_empirical(empirical_t * emp,hash_value_t id)387 h_item_t *find_in_empirical(empirical_t *emp, hash_value_t id) {
388 register h_item_t *i, *loop;
389 /* start at id */
390 i = loop = &emp->hash[id & (emp->max_tokens - 1)];
391
392 while( FILLEDP(i) ) {
393 if( EQUALP(i->id,id) ) {
394 return i; /* found id */
395 } else {
396 i++; /* not found */
397 /* wrap around */
398 i = (i >= &emp->hash[emp->max_tokens]) ? emp->hash : i;
399 if( i == loop ) {
400 return NULL; /* when hash table is full */
401 }
402 }
403 }
404
405 /* empty slot, so not found */
406
407 return i;
408 }
409
410 /* calculates the entropy of the full empirical measure */
/* calculates the entropy of the full empirical measure */
/* Computes -sum_t p(t) log p(t) with p(t) = count(t)/N via the
   rearrangement -(sum count*log(count))/N + log(N).  When feature
   tracking is active (and the stack didn't overflow) only the touched
   slots are visited; otherwise the whole table is scanned. */
score_t empirical_entropy(empirical_t *emp) {
  token_stack_t j;
  hash_count_t i;
  score_t e = 0.0;

  if( emp->track_features &&
      (emp->feature_stack_top < MAX_TOKEN_LINE_STACK) ) {
    /* fast path: iterate only over the tokens seen this round */
    for(j = 0; j < emp->feature_stack_top; j++) {
      e += ((score_t)emp->feature_stack[j]->count) *
	log((score_t)emp->feature_stack[j]->count);
    }
  } else {
    /* slow path: scan every filled slot in the table */
    for(i = 0; i < emp->max_tokens; i++) {
      if( FILLEDP(&emp->hash[i]) ) {
	e += ((score_t)emp->hash[i].count) *
	  log((score_t)emp->hash[i].count);
      }
    }
  }
  e = e/emp->full_token_count - log((score_t)emp->full_token_count);

  return -e;
}
434
435 /***********************************************************
436 * CATEGORY FUNCTIONS *
437 ***********************************************************/
438
439 /* initialize to zero. fullfilename specified elsewhere */
/* initialize to zero. fullfilename specified elsewhere */
/* Resets all scores, counters and model fields of cat, and derives
   cat->filename (a fresh strdup'd basename, with the category file
   extension stripped) from cat->fullfilename, which must already be
   set by the caller. */
void init_category(category_t *cat) {
  char *p;

  cat->model_full_token_count = 0;
  cat->model_unique_token_count = 0;
  cat->score = 0.0;
  cat->score_div = 0.0;
  cat->score_s2 = 0.0;
  cat->score_shannon = 0.0;
  cat->shannon = 0.0;
  cat->shannon_s2 = 0.0;
  cat->alpha = 0.0;
  cat->beta = 0.0;
  cat->mu = 0.0;
  cat->s2 = 0.0;
  cat->prior = 0.0;
  cat->complexity = 0.0;
  cat->fcomplexity = 0;
  cat->fmiss = 0;
  cat->max_order = 0;
  memset(cat->mediacounts, 0, sizeof(token_count_t)*TOKEN_CLASS_MAX);
  /* derive a display name from the path */
  p = strrchr(cat->fullfilename, '/');
  if( p ) {
    cat->filename = p + 1; /* only keep basename */
  } else {
    cat->filename = cat->fullfilename;
  }
  cat->filename = strdup(cat->filename);
  /* strip the category extension (if any) from the display name */
  if( extn && *extn ) {
    p = strstr(cat->filename, extn);
    if( p ) { *p = '\0'; }
  }
  cat->retype = 0;
  cat->model.type = simple;
  cat->model.options = 0;
  cat->model.cp = 0;
  cat->model.dt = 0;
  cat->c_options = 0;
  cat->hash = NULL;
  cat->mmap_offset = 0;
  cat->mmap_start = NULL;
}
482
/* Maps or loads the category's token hash table from input, whose file
   position must be at the start of the hash data.  With U_OPTION_MMAP
   the whole file is mmap'd (protection flags protf) and the hash
   pointer aimed past the header; otherwise the table is malloc'd and
   read in with fread.  Returns 1 on success, 0 on failure (after
   E_ERROR). */
bool_t create_category_hash(category_t *cat, FILE *input, int protf) {
  hash_count_t i, j;

  if( u_options & (1<<U_OPTION_MMAP) ) {
    /* remember where the hash data starts so we can map from offset 0 */
    cat->mmap_offset = ftell(input);
    if( cat->mmap_offset > 0 ) {
      cat->mmap_start =
	(byte_t *)MMAP(0, sizeof(c_item_t) * cat->max_tokens +
		       cat->mmap_offset,
		       protf, MAP_SHARED, fileno(input), 0);
      if( cat->mmap_start == MAP_FAILED ) { cat->mmap_start = NULL; }
      if( cat->mmap_start ) {
	cat->hash = (c_item_t *)(cat->mmap_start + cat->mmap_offset);
	MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,
		MADV_SEQUENTIAL|MADV_WILLNEED);
	/* lock the pages to prevent swapping - on Linux, this
	   works without root privs so long as the user limits
	   are big enough - mine are unlimited ;-)
	   On other OSes, root may me necessary. If we can't
	   lock, it doesn't really matter, but cross validations
	   and multiple classifications are a _lot_ faster with locking. */
	MLOCK(cat->hash, sizeof(c_item_t) * cat->max_tokens);
	cat->c_options |= (1<<C_OPTION_MMAPPED_HASH);
      }
    }
  }

  /* fall back to reading the table into heap memory if mmap was not
     requested or did not succeed */
  if( !cat->hash ) {
    cat->c_options &= ~(1<<C_OPTION_MMAPPED_HASH);
    /* allocate hash table */
    cat->hash = (c_item_t *)malloc(sizeof(c_item_t) * cat->max_tokens);
    if( !cat->hash ) {
      errormsg(E_ERROR, "not enough memory for category %s\n",
	      cat->filename);
      return 0;
    }

    MADVISE(cat->hash, sizeof(c_item_t) * cat->max_tokens,
	    MADV_SEQUENTIAL);

    /* read in hash table; loop because fread may return short counts */
    i = cat->max_tokens;
    j = 0;
    while(!ferror(input) && !feof(input) && (j < i) ) {
      j += fread(cat->hash + j, sizeof(c_item_t), i - j, input);
    }

    if( j < i ) {
      errormsg(E_ERROR, "corrupt category? %s\n",
	       cat->fullfilename);
      free(cat->hash);
      return 0;
    }

  }
  return 1;
}
540
541
/* Releases the category's hash table, unmapping it when it was mmap'd
   (the mapping covers the header too, hence the added mmap_offset) or
   freeing it when it was malloc'd.  Safe to call when no table is
   attached. */
void free_category_hash(category_t *cat) {
  if( !cat->hash ) {
    return;
  }
  if( cat->mmap_start != NULL ) {
    MUNMAP(cat->mmap_start, cat->max_tokens * sizeof(c_item_t) +
	   cat->mmap_offset);
    cat->mmap_start = NULL;
    cat->mmap_offset = 0;
  } else {
    free(cat->hash);
  }
  cat->hash = NULL;
}
557
558 /* frees the resrouces associated with a category */
/* frees the resrouces associated with a category */
/* Releases the hash table plus the two name strings.  free(NULL) is a
   no-op, so the former guards were redundant; nulling the pointers
   protects against double-free if the category struct is reused. */
void free_category(category_t *cat) {
  free_category_hash(cat);
  free(cat->filename);      /* strdup'd by init_category */
  cat->filename = NULL;
  free(cat->fullfilename);  /* allocated by the caller */
  cat->fullfilename = NULL;
}
564
565 /* turns purely random text into a category of its own */
/* turns purely random text into a category of its own */
/* Fills cat's digram table with the uniform weight -log(ASIZE-AMIN)
   (every character equally likely), sets the DIAMOND boundary rows,
   and zeroes all model parameters.  No hash table is attached.  With
   DIGITIZE_DIGRAMS the weights are packed into their digitized form. */
void init_purely_random_text_category(category_t *cat) {
  alphabet_size_t i, j;
  weight_t z = -log((double)(ASIZE - AMIN));

#if defined DIGITIZE_DIGRAMS
  digitized_weight_t zz = PACK_DIGRAMS(z);
#endif

  /* uniform weight for every ordinary character pair */
  for(i = AMIN; i < ASIZE; i++) {
    for(j = AMIN; j < ASIZE; j++) {
#if defined DIGITIZE_DIGRAMS
      cat->dig[i][j] = zz;
#else
      cat->dig[i][j] = z;
#endif
    }
  }

  /* same uniform weight for transitions out of the DIAMOND marker */
  z = -log((double)(ASIZE - AMIN));
#if defined DIGITIZE_DIGRAMS
  zz = PACK_DIGRAMS(z);
#endif
  for(j = AMIN; j < ASIZE; j++) {
#if defined DIGITIZE_DIGRAMS
    cat->dig[(alphabet_size_t)DIAMOND][j] = zz;
#else
    cat->dig[(alphabet_size_t)DIAMOND][j] = z;
#endif
  }

  /* not needed: set DIAMOND-DIAMOND score for completeness only */

#if defined DIGITIZE_DIGRAMS
  cat->dig[(alphabet_size_t)DIAMOND][(alphabet_size_t)DIAMOND] = DIGITIZED_WEIGHT_MIN;
#else
  cat->dig[(alphabet_size_t)DIAMOND][(alphabet_size_t)DIAMOND] = log(0.0);
#endif

  /* trivial model parameters: no normalization, single-order tokens */
  cat->logZ = 0.0;
  cat->divergence = 0.0;
  cat->delta = 0.0;
  cat->renorm = 0.0;
  cat->hash = NULL;
  cat->mmap_start = NULL;
  cat->mmap_offset = 0;
  cat->model.type = simple;
  cat->max_order = 1;
  cat->model.options = 0;
  cat->model.cp = 0;
  cat->model.dt = 0;
}
617
find_in_category(category_t * cat,hash_value_t id)618 c_item_t *find_in_category(category_t *cat, hash_value_t id) {
619 register c_item_t *i, *loop;
620
621 if( cat->hash ) {
622 /* start at id */
623 i = loop = &cat->hash[id & (cat->max_tokens - 1)];
624
625 while( FILLEDP(i) ) {
626 if( EQUALP(NTOH_ID(i->id),id) ) {
627 return i; /* found id */
628 } else {
629 i++; /* not found */
630 /* wrap around */
631 i = (i >= &cat->hash[cat->max_tokens]) ? cat->hash : i;
632 if( i == loop ) {
633 return NULL; /* when hash table is full */
634 }
635 }
636 }
637 return i;
638 } else {
639 return NULL;
640 }
641 }
642
643 /* for each loaded category, this calculates the score.
644 Tokens have the format
645 DIAMOND t1 DIAMOND t2 ... tn DIAMOND CLASSEP class NUL */
/* for each loaded category, this calculates the score.
   Tokens have the format
   DIAMOND t1 DIAMOND t2 ... tn DIAMOND CLASSEP class NUL */
/* tok: the token string, tt: its type/order info, re: the index of the
   regex that produced it (INVALID_RE for the standard tokenizer).
   Side effects: updates the global empirical hash (when entropy
   calculation is on) and every applicable category's running score,
   complexity, variance and media counters; with U_OPTION_DUMP, also
   prints per-category diagnostics to stdout. */
void score_word(char *tok, token_type_t tt, regex_count_t re) {
  category_count_t i = 0;
  weight_t multinomial_correction = 0.0;
  weight_t shannon_correction = 0.0;
  weight_t lambda, ref, oldscore;
  bool_t apply;
  alphabet_size_t pp, pc, len;
  hash_value_t id;
  char *q;
  register c_item_t *k = NULL;
  h_item_t *h = NULL;

  /* we skip "empty" tokens */
  for(q = tok; q && *q == DIAMOND; q++);
  if( q && (*q != EOTOKEN) ) {

    id = hash_full_token(tok);

    if( (m_options & (1<<M_OPTION_CALCENTROPY)) ) {
      /* add the token to the hash */

      h = find_in_empirical(&empirical, id);
      if( h ) {
	if( FILLEDP(h) ) {
	  /* token already seen: bump its count (saturating) and track
	     the incremental change in the Shannon entropy term */
	  if( h->count < K_TOKEN_COUNT_MAX ) {
	    h->count++;

#if defined SHANNON_STIRLING
	    shannon_correction = 1.0 + log((weight_t)h->count *
					   (weight_t)(h->count - 1))/2.0;
#else
	    shannon_correction += log((weight_t)h->count)*(weight_t)h->count
	      - log((weight_t)(h->count - 1))*(weight_t)(h->count - 1);
#endif
	  }
	} else {
	  if( /* !FILLEDP(i) && */
	     ((100 * empirical.unique_token_count) <
	      (HASH_FULL * empirical.max_tokens) )) {
	    /* fill the empirical hash */
	    SET(h->id,id);
	    empirical.unique_token_count +=
	      ( empirical.unique_token_count < K_TOKEN_COUNT_MAX ) ? 1 : 0;
	    h->count = 1;
#if defined SHANNON_STIRLING
	    shannon_correction = 1.0 - log(2.0 * M_PI)/2.0;
#else
	    shannon_correction = 0;
#endif
	  } else {
	    /* hash full */
	    h = NULL;
	    if( !empirical.hashfull_warning ) {
	      errormsg(E_WARNING,
		       "empirical hash full, calculation may be skewed. "
		       "Try option -h %d\n",
		       (empirical.max_hash_bits + 1));
	      empirical.hashfull_warning = 1;
	    }
	    return; /* pretend word doesn't exist */
	  }

	  /* remember the newly used slot for the fast clear path */
	  if( empirical.track_features ) {
	    if( empirical.feature_stack_top < MAX_TOKEN_LINE_STACK ) {
	      empirical.feature_stack[empirical.feature_stack_top++] = h;
	    } else {
	      /* stack overflowed: give up tracking for this document */
	      empirical.track_features = 0;
	      empirical.feature_stack_top = 0;
	    }
	  }
	}
	empirical.full_token_count +=
	  ( empirical.full_token_count < K_TOKEN_COUNT_MAX ) ? 1 : 0;

      }
    }

    /* now do scoring for all available categories */
    for(i = 0; i < cat_count; i++) {

      oldscore = cat[i].score;
      lambda = 0.0;
      ref = 0.0;

      /* see if this token is for us. The rule is: a category either
	 uses the standard tokenizer (in that case re = INVALID_RE),
	 or it uses only those regexes which are listed in the retype
	 bitmap. Since re = 0 is taken by the standard tokenizer,
	 this occurs when re > 0 and we have to subtract 1 to check
	 the bitmap. Simple, really ;-) */
      apply = ( ((re == INVALID_RE) &&
		 (tt.order <= cat[i].max_order) && !cat[i].retype) ||
		((re > 0) &&
		 (cat[i].retype & (1<<(re-1)))) );
      if( apply ) {

	/* if token found, add its lambda weight */
	k = find_in_category(&cat[i], id);
	if( k ) {
	  lambda = UNPACK_LAMBDA(NTOH_LAMBDA(k->lam));
	}

	if( tt.order == 1 ) {
	  /* now compute the reference weight from digram model,
	     (while duplicating the digitization error) */
	  pp = (unsigned char)*tok;
	  CLIP_ALPHABET(pp);
	  q = tok + 1;
	  len = 1;
	  while( *q != EOTOKEN ) {
	    if( *q == '\r' ) {
	      q++;
	      continue;
	    }
	    pc = (unsigned char)*q;
	    CLIP_ALPHABET(pc);
	    ref += UNPACK_DIGRAMS(cat[i].dig[pp][pc]);
	    pp = pc;
	    q++;
	    if( *q != DIAMOND ) { len++; }
	  }
	  /* length correction from the reserved row of the table */
	  ref += UNPACK_DIGRAMS(cat[i].dig[RESERVED_TOKLEN][len]) -
	    UNPACK_DIGRAMS(cat[i].dig[RESERVED_TOKLEN][0]);
	  ref = UNPACK_RWEIGHTS(PACK_RWEIGHTS(ref));

	}

	/* update the complexity */
	/* note: complexity has nothing to do with Kolmogorov's definition */
	/* this is actually very simple in hindsight, but took
	   me a long time to get right. Different versions of dbacl
	   compute the complexity in different ways, and I kept changing
	   the method because I wasn't happy.

	   In previous versions, complexity is an integer, which begs
	   the question "what does it count?". For simple models
	   (max_order = 1) this is easy: we count the number of
	   tokens. But for max_order > 1, it's not obvious, because we
	   need to divide by 1/max_order asymptotically.

	   One way is to increment the complexity if we encounter a
	   token of order max_order. This is correct for Markovian
	   models and corresponds to the dbacl.ps writeup, but causes
	   trouble in some edge cases. For example, if we classify a
	   very short document, there might not be enough tokens to
	   make sense. This actually occurs when dbacl must classify
	   individual lines, and some lines contain one or two tokens
	   only. Worse, dbacl used to renormalize at the same time as
	   updating the complexity, which increases the likelihood of
	   having a negative divergence score estimate in the first
	   few iterations - very bad. Finally, the complexity is
	   nearly meaningless for models built up from regular
	   expressions, because both the start and the end of each
	   line contains incomplete n-grams (recall regexes can't
	   straddle newlines).

	   So to solve these problems, some previous versions of dbacl
	   counted always the order 1 tokens. Asymptotically, this
	   makes no difference, but again it fails on edge
	   cases. Firstly, doing this means that the complexity for a
	   simple model is the same as the complexity for an n-gram
	   model for any n, so that makes it hard to compare mixed
	   models because n-gram model scores are consitently biased
	   for n > 1. Another problem is again with regexes, because
	   the incomplete n-gram tokens at the start and end of each
	   line add up to a pretty large error over thousands of
	   tokens.

	   The solution to the above problems is twofold: first, we
	   renormalize after each token, regardless of its order. Of
	   course this means we must divide logZ by the number of
	   tokens per complexity unit, ie renorm = delta * logZ with
	   delta = 1/max_order. Once I realized this it was obvious
	   that the complexity should be also incremented by delta for
	   every token. As a side effect, the complexity is now a real
	   number, and actually measures not just the max_order token
	   count, but also the fraction of incomplete n-grams. This
	   seems like the right way to go, especially for models based
	   on regexes, since now we also count the incomplete n-grams
	   at both ends of the line, which adds up to quite a bit over
	   many lines. */

	cat[i].fcomplexity++; /* don't actually need this, but nice to have */
	cat[i].complexity += cat[i].delta;

	/* now adjust the score */
	switch(cat[i].model.type) {
	case simple:
	  multinomial_correction = h ?
	    (log((weight_t)cat[i].complexity) - log((weight_t)h->count)) : 0.0;
	  cat[i].score +=
	    lambda + multinomial_correction + ref - cat[i].renorm;
	  break;
	case sequential:
	default:
	  cat[i].score += lambda + ref - cat[i].renorm;
	  if( tt.order == cat[i].max_order ) {
	    cat[i].score_shannon += shannon_correction;
	  }
	  break;
	}

	if( !k || !NTOH_ID(k->id) ) {
	  /* missing data */
	  cat[i].fmiss++;
	}

	if( tt.order == 1 ) {
	  /* sample variance */
	  cat[i].score_s2 += (cat[i].score - oldscore) * (cat[i].score - oldscore);
	  /* only count medium for 1-grams */
	  cat[i].mediacounts[tt.cls]++;
	}

      }

      /* diagnostic dump (one column group per category) */
      if( u_options & (1<<U_OPTION_DUMP) ) {
	if( u_options & (1<<U_OPTION_SCORES) ) {
	  fprintf(stdout, " %8.2f * %-6.1f\t",
		  -sample_mean(cat[i].score, cat[i].complexity),
		  cat[i].complexity);
	} else if( u_options & (1<<U_OPTION_POSTERIOR) ) {
	  fprintf(stdout, " %8.2f\t", oldscore - cat[i].score);
	} else if( u_options & (1<<U_OPTION_VAR) ) {
	  fprintf(stdout, " %8.2f * %-6.1f # %-8.2f\t",
		  -sample_mean(cat[i].score,cat[i].complexity),
		  cat[i].complexity,
		  sample_variance(cat[i].score_s2, cat[i].score,
				  cat[i].complexity)/cat[i].complexity);
	} else {
	  fprintf(stdout,
		  "%7.2f %7.2f %7.2f %7.2f %8lx\t",
		  lambda, ref, apply ? -cat[i].renorm : 0.0,
		  multinomial_correction,
		  (long unsigned int)((k && apply) ? NTOH_ID(k->id) : 0));
	}
      }

    }

    /* diagnostic dump: finish the line with the token itself */
    if( u_options & (1<<U_OPTION_DUMP) ) {
      print_token(stdout, tok);
      if( re > 0 ) {
	fprintf(stdout, "<re=%d>\n", re);
      } else {
	fprintf(stdout, "\n");
      }
    }

  }
}
897
898 /*
899 * Returns 2 * min[ F(obs), 1 - F(obs) ], and calls it
900 * the "confidence". In reality, this is a type of p-value,
901 * which is zero when obs = +/- infinity, and 1 at the median.
902 * Under the null hypothesis obs ~ Gamma(alpha, beta), this
903 * p-value is uniformly distributed. This is chosen so that
904 * we can combine our p-palues with Fisher's chi^2 test, viz.
905 * -2\sum_i=1^n \log X_i ~ chi^2(2n), if all X_i are IID uniform.
906 *
907 */
gamma_pvalue(category_t * cat,double obs)908 confidence_t gamma_pvalue(category_t *cat, double obs) {
909 double m;
910 if( (obs <= 0.0) || (cat->alpha <= 0.0) || (cat->beta <= 0.0) ) {
911 return 0; /* bogus value */
912 } else if( obs > 120.0 ) {
913 /* large values can produce underflow in gamma_tail() */
914 return 0; /* bogus value */
915 }
916
917 m = gamma_tail(cat->alpha, cat->beta, obs);
918
919 /* uncomment below to get one sided confidence */
920 /* return (int)(1000.0 * m); */
921 /* result is per-thousand */
922 return (confidence_t)(2000.0 * ((m > 0.5) ? 1.0 - m : m));
923 }
924
925 /***********************************************************
926 * FILE MANAGEMENT FUNCTIONS *
927 ***********************************************************/
/* Parses the textual header of a category file on input, filling in
   cat's model parameters (divergence, logZ, max_order, hash size,
   token counts, shannon/gamma statistics), loading any regexes it
   declares, and leaving the stream positioned at the binary data.
   cat->fullfilename must already be set.  Returns 1 on success,
   0 (after E_ERROR) on any malformed line. */
error_code_t load_category_header(FILE *input, category_t *cat) {
  char buf[MAGIC_BUFSIZE];
  char scratchbuf[MAGIC_BUFSIZE];
  short int shint_val, shint_val2;
  long int lint_val1, lint_val2, lint_val3;

  if( input ) {
    /* line 1: file signature */
    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	strncmp(buf, MAGIC1, MAGIC1_LEN) ) {
      errormsg(E_ERROR,
	       "not a dbacl " SIGNATURE " category file [%s]\n",
	       cat->fullfilename);
      return 0;
    }

    init_category(cat); /* changes filename */

    /* line 2: divergence, logZ, max_order, model type tag */
    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	(sscanf(buf, MAGIC2_i, &cat->divergence, &cat->logZ,
		&shint_val, scratchbuf) < 4) ) {
      errormsg(E_ERROR, "bad category file [2]\n");
      return 0;
    }
    cat->max_order = (token_order_t)shint_val;
    /* per-token renormalization: see the complexity discussion in
       score_word() */
    cat->delta = 1.0/(score_t)(cat->max_order);
    cat->renorm = cat->delta * cat->logZ;
    /* 'm' tags a multinomial (simple) model */
    if( scratchbuf[0] == 'm' ) {
      cat->model.type = simple;
    } else {
      cat->model.type = sequential;
    }

    /* line 3: hash bits and token/document counts */
    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	(sscanf(buf, MAGIC3,
		&shint_val,
		&lint_val1,
		&lint_val2,
		&lint_val3) < 4) ) {
      errormsg(E_ERROR, "bad category file [3]\n");
      return 0;
    }
    cat->max_hash_bits = (token_order_t)shint_val;
    cat->model_full_token_count = (token_count_t)lint_val1;
    cat->model_unique_token_count = (token_count_t)lint_val2;
    cat->model_num_docs = (document_count_t)lint_val3;

    cat->max_tokens = (1<<cat->max_hash_bits);

    /* line 4: shannon entropy statistics */
    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	(sscanf(buf, MAGIC8_i,
		&cat->shannon, &cat->shannon_s2) < 2) ) {
      errormsg(E_ERROR, "bad category file [8]\n");
      return 0;
    }

    /* line 5: gamma distribution parameters for the score */
    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	(sscanf(buf, MAGIC10_i,
		&cat->alpha, &cat->beta,
		&cat->mu, &cat->s2) < 4) ) {
      errormsg(E_ERROR, "bad category file [10]\n");
      return 0;
    }

    /* see if there are any regexes */
    while(fgets(buf, MAGIC_BUFSIZE, input)) {
      if( strncmp(buf, MAGIC6, 2) == 0 ) {
	/* end-of-header marker */
	break;
      } else if( strncmp(buf, MAGIC5_i, 8) == 0 ) {

	/* if regex can't be compiled, load_regex() exits */
	cat->retype |= (1<<load_regex(buf + RESTARTPOS));

      } else if( strncmp(buf, MAGIC4_i, 10) == 0) {
	/* options line: model options, charparser and digitization type */
	if( sscanf(buf, MAGIC4_i, &lint_val1, &shint_val, &shint_val2, scratchbuf) == 4 ) {
	  cat->model.options = (options_t)lint_val1;
	  cat->model.cp = (charparser_t)shint_val;
	  cat->model.dt = (digtype_t)shint_val2;
	}
      }

      /* finished with current line, get next one */
    }

    /* if this category did not register a regex, it wants
       the default processing, so we flag this */
    if( !cat->retype ) {
      cat->model.options |= (1<<M_OPTION_USE_STDTOK);
    }

    /* if we haven't read a character class, use alpha */
    if( cat->model.cp == CP_DEFAULT ) {
      if( cat->model.options & (1<<M_OPTION_MBOX_FORMAT) ) {
	cat->model.cp = CP_ADP;
      } else {
	cat->model.cp = CP_ALPHA;
      }
    }

    /* if we're here, success! */
    return 1;
  }
  return 0;
}
1031
1032
/* Opens cat->fullfilename with mode openf, reads the header, the
   digram table and the token hash (mmap'd with protection protf when
   U_OPTION_MMAP is set).  Returns 1 on success, 0 on failure; on
   failure the caller should free the category. */
error_code_t explicit_load_category(category_t *cat, char *openf, int protf) {
  hash_count_t i, j;

  FILE *input;

  /* this is needed in case we try to open with write permissions,
     which would otherwise create the file */

  input = fopen(cat->fullfilename, "rb");
  if( input && (strcmp(openf, "rb") != 0) ) {
    /* reopen in the requested (possibly writable) mode */
    input = freopen(cat->fullfilename, openf, input);
  }

  if( input ) {

    if( !load_category_header(input, cat) ) {
      fclose(input);
      return 0;
    }

    /* read character frequencies */
    i = ASIZE * ASIZE;
    j = 0;
    /* NOTE(review): cat->dig + j advances by *rows* of the digram
       table, not by single elements, so a resumed partial fread
       (j > 0) would write to the wrong offset — confirm against the
       declared type of cat->dig; a single complete fread (the common
       case) is unaffected. */
    while(!ferror(input) && !feof(input) && (j < i) ) {
      j += fread(cat->dig + j, SIZEOF_DIGRAMS, i - j, input);
    }
    if( j < i ) {
      errormsg(E_ERROR, "is this category corrupt: %s?\n",
	       cat->fullfilename);
      fclose(input);
      return 0;
    }

#if defined PORTABLE_CATS
    /* convert the digram table from network byte order */
    for(i = 0; i < ASIZE; i++) {
      for(j = 0; j < ASIZE; j++) {
	cat->dig[i][j] = NTOH_DIGRAM(cat->dig[i][j]);
      }
    }
#endif

    if( !create_category_hash(cat, input, protf) ) {
      fclose(input);
      return 0;
    }

    fclose(input);

    return 1;
  }

  return 0;
}
1086
1087
1088 /* loads a category hash
1089 returns 0 on failure, you should free the category in that case */
/* loads a category hash
   returns 0 on failure, you should free the category in that case */
/* Read-only variant: the hash may be mmap'd with PROT_READ only. */
error_code_t load_category(category_t *cat) {
  return explicit_load_category(cat, "rb", PROT_READ);
}
1093
1094 /* loads a category file for potential read/write */
/* loads a category file for potential read/write */
/* Read/write variant: used when the category will be updated in place. */
error_code_t open_category(category_t *cat) {
  return explicit_load_category(cat, "r+b", PROT_READ|PROT_WRITE);
}
1098
reload_category(category_t * cat)1099 error_code_t reload_category(category_t *cat) {
1100 if( cat ) {
1101 /* free the hash, but keep the cat->fullfilename */
1102 free_category_hash(cat);
1103 return load_category(cat) &&
1104 sanitize_model_options(&m_options,&m_cp,cat);
1105 }
1106 return 0;
1107 }
1108
reload_all_categories()1109 void reload_all_categories() {
1110 category_count_t c;
1111 for(c = 0; c < cat_count; c++) {
1112 if( !reload_category(&cat[c]) ) {
1113 errormsg(E_FATAL,
1114 "could not reload %s, exiting\n", cat[c].fullfilename);
1115 }
1116 }
1117 }
1118