1 /*
2 * Copyright (C) 2002 Laird Breyer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 * Author: Laird Breyer <laird@lbreyer.com>
19 */
20
21 /*
22 * Note to regenerate the Makefile and configure script:
23 * aclocal
24 * autoconf
25 * touch NEWS README AUTHORS ChangeLog
26 * automake -a
27 *
28 * You can also just run the bootstrap script.
29 */
30
31 #ifdef HAVE_CONFIG_H
32 #include "config.h"
33 #endif
34
35 #include <math.h>
36 #include <ctype.h>
37 #include <string.h>
38 #include <stdlib.h>
39 #include <sys/stat.h>
40 #include <fcntl.h>
41 #include <time.h>
42
43 #if defined HAVE_UNISTD_H
44 #include <unistd.h>
45 #include <sys/types.h>
46 #endif
47
48 #include <locale.h>
49
50 #if defined HAVE_LANGINFO_H
51 #include <langinfo.h>
52 #if !defined CODESET
53 /* on OpenBSD, CODESET doesn't seem to be defined -
54 we use 3, which should be US-ASCII, but it's not ideal... */
55 #define CODESET 3
56 #endif
57 #endif
58
59 #include "util.h"
60 #include "dbacl.h" /* make sure this is last */
61
62 /* global variables */
63
64 extern signal_cleanup_t cleanup;
65
66 extern hash_bit_count_t default_max_hash_bits;
67 extern hash_count_t default_max_tokens;
68
69 extern hash_bit_count_t default_max_grow_hash_bits;
70 extern hash_count_t default_max_grow_tokens;
71
72 hash_bit_count_t decimation;
73 int zthreshold = 0;
74
75 learner_t learner;
76 dirichlet_t dirichlet;
77 int filter[MAX_CAT];
78 category_count_t filter_count = 0;
79
80 extern category_t cat[MAX_CAT];
81 extern category_count_t cat_count;
82
83 extern myregex_t re[MAX_RE];
84 extern regex_count_t regex_count;
85
86 extern empirical_t empirical;
87
88 extern options_t u_options;
89 extern options_t m_options;
90 extern charparser_t m_cp;
91 extern digtype_t m_dt;
92 extern char *extn;
93
94 extern token_order_t ngram_order; /* defaults to 1 */
95
96 /* for counting emails */
97 bool_t not_header;
98 extern MBOX_State mbox;
99 extern XML_State xml;
100
101 /* for option processing */
102 extern char *optarg;
103 extern int optind, opterr, optopt;
104
105 char *title = "";
106 char *digtype = "";
107 char *online = "";
108 char *ronline[MAX_CAT];
109 category_count_t ronline_count = 0;
110 extern char *progname;
111 extern char *inputfile;
112 extern long inputline;
113
114 extern int cmd;
115 int exit_code = 0; /* default */
116
117 int overflow_warning = 0;
118 int digramic_overflow_warning = 0;
119 int skewed_constraints_warning = 0;
120
121 extern long system_pagesize;
122
123 extern void *in_iobuf;
124 extern void *out_iobuf;
125
126 /* tolerance for the error in divergence - this can be changed with -q.
127 More accuracy means slower learning. This should be the same order
128 of magnitude as the error due to digitization, but we tweak them
129 to obtain timely output */
130
131 int quality = 0;
132 double qtol_alpha = 0.0001;
133 double qtol_div = 0.01;
134 double qtol_lam = CLIP_LAMBDA_TOL(0.05);
135 double qtol_logz = 0.05;
136 bool_t qtol_multipass = 0;
137
138 /***********************************************************
139 * MISCELLANEOUS FUNCTIONS *
140 ***********************************************************/
141
static void usage(/*@unused@*/ char **argv) {
  /* Print the command line summary on stderr. The text is kept in a
     table so the three usage forms (classify, learn, version) read as
     one unit. */
  static const char *help_lines[] = {
    "\n",
    "dbacl [-vniNR] [-T type] -c CATEGORY [-c CATEGORY]...\n",
    " [-f KEEP]... [FILE]...\n",
    "\n",
    " classifies FILE or STDIN using CATEGORY, optionally\n",
    " removing all lines which don't fit the category KEEP.\n",
    "\n",
    "dbacl [-vnirNDL] [-h size] [-T type] -l CATEGORY \n",
    " [-g regex]... [FILE]...\n",
    "\n",
    " builds a maximum entropy model from the words in FILE or STDIN\n",
    " or concatenated regex submatches if using the -g option.\n",
    "\n",
    "dbacl -V\n",
    "\n",
    " prints program version.\n",
  };
  size_t k;

  for(k = 0; k < sizeof(help_lines)/sizeof(help_lines[0]); k++) {
    fprintf(stderr, "%s", help_lines[k]);
  }
}
176
177
178 /***********************************************************
179 * CATEGORY FUNCTIONS *
180 ***********************************************************/
181
reset_all_scores()182 void reset_all_scores() {
183 category_count_t i;
184 for(i = 0; i < cat_count; i++) {
185 cat[i].score = 0.0;
186 cat[i].score_s2 = 0.0;
187 cat[i].score_div = 0.0;
188 cat[i].score_shannon = 0.0;
189 cat[i].complexity = 0.0;
190 cat[i].fcomplexity = 0;
191 }
192 }
193
194 /* calculate the overlap probabilities (ie the probability that the
195 given score is the minimal score assuming independent Gaussian
196 distributions.)
197
198 NOTE: I also toyed with the idea of multiplying each uncertainty
199 score by the percentage of recognized tokens for that category. It
200 seems plausible that uncertainty should also be proportional to the
201 unknown token count, but in practice, a typical email has a
202 substantial fraction of unknown tokens, and this dominates the
203 uncertainty obtained from the Gaussian assumption. I've disabled
204 this code again, see ALTERNATIVE UNCERTAINTY code below
205 */
calc_uncertainty(int map)206 double calc_uncertainty(int map) {
207 double mu[MAX_CAT];
208 double sigma[MAX_CAT];
209 int i;
210 double p, u, t, pmax;
211
212 for(i = 0; i < cat_count; i++) {
213 mu[i] = -sample_mean(cat[i].score, cat[i].complexity);
214 sigma[i] = sqrt(cat[i].score_s2/cat[i].complexity);
215 }
216
217 /* even though p below is a true probability, it doesn't quite sum
218 to 1, because of numerical errors in the min_prob function, which
219 is only designed to do about 1% error for speed.
220 So we might get up to 1 + cat_count% total mass,
221 which means that u could be 102% say in spam filtering.
222 Doesn't look good. So we compute the normalizing constant t.
223 */
224 t = 0.0;
225 pmax = 1.0;
226 for(i = 0; i < cat_count; i++) {
227 p = min_prob(i, cat_count, mu, sigma);
228 if( i == map ) {
229 pmax = p;
230 }
231 t += p;
232 }
233
234 /* pmax is proportional to the MAP probability, which is guaranteed
235 to be >= 0.5. The value 0.5 occurs when the MAP is totally ambiguous,
236 ie the classification is completely uncertain. Since people prefer a range
237 0-100%, we stretch the MAP probability into the interval [0,1] here. */
238 u = 2.0 * pmax / t - 1.0;
239
240
241 /* ALTERNATIVE UNCERTAINTY : uncomment below to enable */
242 /* now multiply by the feature hit rate, which is also in [0,1] */
243 /* u *= ((cat[map].complexity - cat[map].fmiss) / cat[map].complexity); */
244
245 return u;
246 }
247
248 /* note: don't forget to flush after each line */
/* Scores a single input line against all loaded categories and prints
   the result in the format selected by u_options (posterior
   probabilities, raw scores, or filter/prune mode), then resets the
   per-category accumulators for the next line. textbuf is the raw line
   text; a NULL pointer is a no-op. */
void line_score_categories(char *textbuf) {
  category_count_t i;
  category_count_t map;
  score_t c, cmax;

  if( !textbuf ) { return; }

  /* find MAP, ie the category with the highest accumulated score */
  cmax = cat[0].score;
  map = 0;
  for(i = 0; i < cat_count; i++) {
    if(cmax < cat[i].score) {
      cmax = cat[i].score;
      map = i;
    }
    /* finish sample variance calculation */
    cat[i].score_s2 =
      sample_variance(cat[i].score_s2, cat[i].score, cat[i].complexity);
  }

  if( !(u_options & (1<<U_OPTION_DUMP)) ) {

    if(u_options & (1<<U_OPTION_POSTERIOR) ) {
      /* compute probabilities given exclusive choices; cmax is
	 subtracted inside exp() to avoid floating point overflow */
      c = 0.0;
      for(i = 0; i < cat_count; i++) {
	c += exp((cat[i].score - cmax));
      }

      if( *textbuf ) {
	for(i = 0; i < cat_count; i++) {
	  fprintf(stdout, "%s %6.2" FMT_printf_score_t "%% ",
		  cat[i].filename, 100 * exp((cat[i].score - cmax))/c);
	}
	fprintf(stdout, "%s", textbuf);
	fflush(stdout);
      }
    } else if( u_options & (1<<U_OPTION_SCORES) ) {
      /* display normalized divergence scores */
      if( *textbuf ) {
	for(i = 0; i < cat_count; i++) {
	  if( u_options & (1<<U_OPTION_VERBOSE) ) {
	    fprintf(stdout, "%s %6.2" FMT_printf_score_t " * %-4.1f ",
		    cat[i].filename,
		    -nats2bits(sample_mean(cat[i].score, cat[i].complexity)),
		    cat[i].complexity);
	  } else {
	    fprintf(stdout, "%s %6.2" FMT_printf_score_t " ",
		    cat[i].filename, -nats2bits(cat[i].score));
	  }
	}
	fprintf(stdout, "%s", textbuf);
	fflush(stdout);
      }
    } else {
      /* prune the text which doesn't fit: echo the line only if the
	 MAP category is one of the categories selected with -f */
      for(i = 0; i < filter_count; i++) {
	if( cat[map].score <= cat[filter[i]].score ) {
	  fprintf(stdout, "%s", textbuf);
	  fflush(stdout);
	  break;
	}
      }
    }

  }
  /* clean up for next line */
  reset_all_scores();

  if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
    clear_empirical(&empirical);
  }
}
322
/* Finalizes the per-category statistics for a whole document and prints
   the classification result in the format selected by u_options
   (posterior probabilities, scores, media counts, or just the winning
   category name). Sets exit_code to the 1-based index of the MAP
   category. */
void score_categories() {
  bool_t no_title;
  category_count_t i, j, map;
  score_t c, cmax, lam;
  score_t sumdocs, sumfeats;
  bool_t hasnum;

  /* finish computing sample entropies */
  if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
    for(i = 0; i < cat_count; i++) {
      cat[i].score_shannon =
	-( sample_mean(cat[i].score_shannon, cat[i].complexity) -
	   log((weight_t)cat[i].complexity) );
      cat[i].score_div =
	-( sample_mean(cat[i].score, cat[i].complexity) + cat[i].score_shannon);
    }
  }

  /* hasnum records whether every category knows its training document
     count; the Poisson prior below is only meaningful in that case */
  hasnum = 1;
  sumdocs = 0.0;
  sumfeats = 0.0;
  for(i = 0; i < cat_count; i++) {
    /* finish sample variance calculation */
    cat[i].score_s2 =
      sample_variance(cat[i].score_s2, cat[i].score, cat[i].complexity);
    /* compute some constants */
    hasnum = hasnum && (cat[i].model_num_docs > 0);
    sumdocs += cat[i].model_num_docs;
    sumfeats += cat[i].model_unique_token_count;
  }

  if( (u_options & (1<<U_OPTION_PRIOR_CORRECTION)) && hasnum ) {

    cmax = log(2.0*M_PI)/2.0;
    for(i = 0; i < cat_count; i++) {
      /* the prior is Poisson based on mean document length.
	 -lambda + x * log(lambda) - log(x!), which using Stirling's
	 approximation
	 log(x!) = x*log(x) - x + 0.5*log(2PI*x),
	 can be calculated as
	 -lambda - 0.5*log(2PI) + x * (1 + log(lambda/x)).
      */

      lam = (score_t)cat[i].model_full_token_count/cat[i].model_num_docs;
      cat[i].prior =
	-lam -cmax + cat[i].complexity * (1.0 + log(lam/cat[i].complexity));
      cat[i].score += cat[i].prior;
    }
  }


  /* find MAP, ie the category with the highest total score */
  cmax = cat[0].score;
  map = 0;
  for(i = 0; i < cat_count; i++) {
    if(cmax < cat[i].score) {
      cmax = cat[i].score;
      map = i;
    }
  }
  exit_code = (int)map;

  /* there are three cases: posterior, scores, nothing */

  if( u_options & (1<<U_OPTION_POSTERIOR) ) {

    if( u_options & (1<<U_OPTION_APPEND) ) {
      fprintf(stdout, "\n# probabilities ");
    }

    /* here we compute probabilities given exclusive choices;
       cmax is subtracted inside exp() to avoid overflow */
    c = 0.0;
    for(i = 0; i < cat_count; i++) {
      c += exp((cat[i].score - cmax));
    }

    for(i = 0; i < cat_count; i++) {
      fprintf(stdout, "%s %5.2" FMT_printf_score_t "%% ",
	      cat[i].filename, 100 * exp((cat[i].score - cmax))/c);
      if( u_options & (1<<U_OPTION_CONFIDENCE) ) {
	fprintf(stdout, "@ %5.1f%% ",
		(float)gamma_pvalue(&cat[i], cat[i].score_div)/10);
      }
    }
    fprintf(stdout, "\n");

  } else if( u_options & (1<<U_OPTION_SCORES) ) {

    if( u_options & (1<<U_OPTION_APPEND) ) {
      fprintf(stdout, "\n# scores ");
    }

    /* display logarithmic score, in bits, optionally with the sample
       standard deviation (-v -X) and confidence (-v -U) annotations */
    for(i = 0; i < cat_count; i++) {
      if( u_options & (1<<U_OPTION_VERBOSE) ) {
	if( u_options & (1<<U_OPTION_VAR) ) {
	  fprintf(stdout, "%s ( %5.2" FMT_printf_score_t
		  " # %5.2" FMT_printf_score_t " )* %-.1f ",
		  cat[i].filename,
		  -nats2bits(sample_mean(cat[i].score, cat[i].complexity)),
		  nats2bits(sqrt(cat[i].score_s2/cat[i].complexity)),
		  cat[i].complexity);
	} else {
	  fprintf(stdout, "%s %5.2" FMT_printf_score_t " * %-.1f ",
		  cat[i].filename,
		  -nats2bits(sample_mean(cat[i].score, cat[i].complexity)),
		  cat[i].complexity);
	}
	if( u_options & (1<<U_OPTION_CONFIDENCE) ) {
	  fprintf(stdout, "@ %5.1f%% ",
		  (float)gamma_pvalue(&cat[i], cat[i].score_div)/10);
	}
      } else {
	fprintf(stdout, "%s %5.2" FMT_printf_score_t " ",
		cat[i].filename,
		-nats2bits(cat[i].score));
      }
    }
    fprintf(stdout, "\n");

    if( u_options & (1<<U_OPTION_APPEND) ) {
      /* also append the mean training document complexity for each
	 category that knows its document count */
      no_title = (bool_t)1;
      for(i = 0; i < cat_count; i++) {
	if( cat[i].model_num_docs > 0 ) {
	  if( no_title ) {
	    fprintf(stdout, "# mean_complexity ");
	    no_title = (bool_t)0;
	  }
	  fprintf(stdout, "%s %5.2" FMT_printf_score_t " ",
		  cat[i].filename,
		  (double)cat[i].model_full_token_count/cat[i].model_num_docs);
	}
      }
      if( !no_title ) { fprintf(stdout, "\n"); }
    }

  } else if(u_options & (1<<U_OPTION_MEDIACOUNTS) ) {
    /* print the per-token-class media counts for each category */
    for(i = 0; i < cat_count; i++) {
      if( (u_options & (1<<U_OPTION_VERBOSE)) ) {
	fprintf(stdout, "%s ( %5.2" FMT_printf_score_t " M ",
		cat[i].filename,
		-nats2bits(sample_mean(cat[i].score, cat[i].complexity)));
	for(j = 0; j < TOKEN_CLASS_MAX; j++) {
	  fprintf(stdout, "%4" FMT_printf_integer_t " ", cat[i].mediacounts[j]);
	}
	fprintf(stdout, " )* %-.1f ",
		cat[i].complexity);
      } else {
	fprintf(stdout, "%s\tM ", cat[i].filename);
	for(j = 0; j < TOKEN_CLASS_MAX; j++) {
	  fprintf(stdout, "%4" FMT_printf_integer_t " ", cat[i].mediacounts[j]);
	}
	fprintf(stdout, "\n");
      }
    }
  } else {

    /* default output: just the name of the winning category, with
       extra detail in verbose and/or -X modes */
    if( u_options & (1<<U_OPTION_VERBOSE) ) {
      if( (u_options & (1<<U_OPTION_CONFIDENCE)) ||
	  (u_options & (1<<U_OPTION_VAR)) ) {
	for(i = 0; i < cat_count; i++) {
	  fprintf(stdout, "%s ( %5.2" FMT_printf_score_t " + "
		  "%-5.2" FMT_printf_score_t " ",
		  cat[i].filename,
		  nats2bits(cat[i].score_div),
		  nats2bits(cat[i].score_shannon));
	  if( u_options & (1<<U_OPTION_VAR) ) {
	    fprintf(stdout, "# %5.2" FMT_printf_score_t " ",
		    nats2bits(sqrt(cat[i].score_s2/cat[i].complexity)));
	  }
	  fprintf(stdout, ")* %-6.1f ", cat[i].complexity);
	  if( u_options & (1<<U_OPTION_CONFIDENCE) ) {
	    fprintf(stdout, "@ %5.1f%% ",
		    (float)gamma_pvalue(&cat[i], cat[i].score_div)/10);
	  } else {
	    /* percentage of tokens which were recognized */
	    fprintf(stdout, "H %.f%% ",
		    (100.0 * (1.0 - cat[i].fmiss/((score_t)cat[i].fcomplexity))));
	  }
	}
	fprintf(stdout, "\n");
      } else {
	if( u_options & (1<<U_OPTION_APPEND) ) {
	  fprintf(stdout, "\n# category ");
	}
	fprintf(stdout, "%s\n", cat[exit_code].filename);
      }
    } else {
      if( u_options & (1<<U_OPTION_VAR) ) {
	if( u_options & (1<<U_OPTION_APPEND) ) {
	  fprintf(stdout, "\n# category ");
	}

	cmax = calc_uncertainty(exit_code);

	/* cmax = 1.0; */
	/* for(i = 0; i < cat_count; i++) { */
	/*   if( (int)i != exit_code ) { */
	/*     /\* c is a standard normal variable *\/ */
	/*     c = (-sample_mean(cat[i].score, cat[i].complexity) - */
	/* 	   -sample_mean(cat[exit_code].score, cat[exit_code].complexity)) / */
	/* 	  sqrt(cat[i].score_s2/cat[i].complexity + */
	/* 	       cat[exit_code].score_s2/cat[exit_code].complexity); */
	/*     /\* c will always be positive, but let's be safe *\/ */
	/*     if( isnan(c) ) { */
	/* 	c = 0.0; */
	/*     } else if( c > 5.0 ) { */
	/* 	c = 1.0; */
	/*     } else if( c < -5.0 ) { */
	/* 	c = 0.0; */
	/*     } else { */
	/* 	/\* normal_cdf(c) has a range of 0.5 - 1.0, we convert to a full range *\/ */
	/* 	c = 2 * normal_cdf(c) - 1.0; */
	/*     } */
	/*     cmax = (cmax > c) ? c : cmax; */
	/*   } */
	/* } */
	fprintf(stdout, "%s # %d%%\n",
		cat[exit_code].filename, (int)ceil(cmax * 100.0));
      }
    }

  }

  exit_code++; /* make number between 1 and cat_count+1 */
}
549
file_score_categories(char * name)550 void file_score_categories(char *name) {
551 fprintf(stdout, "%s ", name);
552 score_categories();
553 /* clean up for next file */
554 reset_all_scores();
555 if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
556 clear_empirical(&empirical);
557 }
558 }
559
560 /***********************************************************
561 * FILE MANAGEMENT FUNCTIONS *
562 ***********************************************************/
563
check_magic_write(char * path,char * magic,size_t len)564 bool_t check_magic_write(char *path, char *magic, size_t len) {
565 FILE *output;
566 char buf[MAGIC_BUFSIZE];
567
568 output = fopen(path, "rb");
569 if( output ) {
570 /* output file exists already */
571 if( (!feof(output) && !fgets(buf, MAGIC_BUFSIZE, output)) ||
572 (strncmp(buf, magic, len) != 0) ) {
573 errormsg(E_ERROR,"the file %s is already used for something, "
574 "use another filename. Nothing written.\n", path);
575 fclose(output);
576 return (bool_t)0;
577 } else {
578 /* it's an existing category file */
579 fclose(output);
580 }
581 }
582 return (bool_t)1;
583 }
584
585 /* the standard tmpfile() call doesn't tell the filename,
586 the standard mkstemp() is unreliable,
587 and the moronic unix system doesn't remember filenames on open().
588 1) remember to free the tmpname when done.
589 2) the tmplate is appended with a pattern .tmp.xx,
590 3) if you want a particular directory, prepend it to tmplate.
591 4) file is opened for read/write, but truncated to zero.
592 5) The limit of 100 tempfiles is hardcoded. If dbacl needs more than
593 that then it's either a bug with dbacl or something wrong on your FS.
594 */
595 /*@null@*/
/* Creates and opens a temporary file whose name starts with tmplate.
   On success returns the stream opened "w+b" and stores the malloc'd
   file name in *tmpname (caller frees). On failure returns NULL and
   *tmpname is NULL.

   Fix: if ATOMIC_CREATE succeeded but fdopen failed, the descriptor
   (and the just-created file) used to leak; we now close and unlink. */
FILE *mytmpfile(const char *tmplate, /*@out@*/ char **tmpname) {
  FILE *result = NULL;
  size_t l;
  int i, fd;

  l = strlen(tmplate);
  /* 8 extra bytes is exactly enough for ".tmp.99" plus the NUL */
  *tmpname = (char *)malloc(sizeof(char)*(l + 8));
  if( *tmpname ) {
    strcpy(*tmpname, tmplate);
#if defined ATOMIC_CATSAVE
    for(i = 0; i < 100; i++) {
      snprintf(*tmpname + l, 8, ".tmp.%d", i);
      /* this may have problems on NFS systems? */
      fd = ATOMIC_CREATE(*tmpname);
      if( fd != -1 ) {
	result = fdopen(fd, "w+b");
	if( !result ) {
	  /* fdopen failed: release the descriptor and remove the
	     file we just created, or both would leak */
	  close(fd);
	  unlink(*tmpname);
	}
	/* fd is owned by result from here on, no need to close it */
	break;
      }
    }
#else
    snprintf(*tmpname + l, 8, ".tmp.%d", rand() % 100);
    result = fopen(*tmpname, "w+b");
#endif

    if( !result ) {
      free(*tmpname);
      *tmpname = NULL;
    }
  }
  return result;
}
628
myrename(const char * src,const char * dest)629 bool_t myrename(const char *src, const char *dest) {
630 #if defined ATOMIC_CATSAVE
631 /* the rename is atomic on posix */
632 return (bool_t)(rename(src, dest) == 0);
633 #else
634 return (bool_t)1; /* just pretend */
635 #endif
636 }
637
638 /* output -log (mediaprobs) */
/* Writes the learner's media probabilities as -log values on a single
   header line of the category file. */
void write_mediaprobs(FILE *out, learner_t *learner) {
  token_class_t tc;

  fprintf(out, MAGIC11);
  for(tc = 0; tc < TOKEN_CLASS_MAX; tc++) {
    fprintf(out, "%" FMT_printf_score_t " ", -log(learner->mediaprobs[tc]));
  }
  fprintf(out, "\n");
}
648
/* Writes the human-readable header section of a category file: magic
   lines with model statistics, media probabilities, the regexes used
   during learning, and the model options. Returns 1 if every fprintf
   reported progress, 0 as soon as one failed. */
bool_t write_category_headers(learner_t *learner, FILE *output) {
  regex_count_t c;
  char scratchbuf[MAGIC_BUFSIZE];
  char smb[MAX_SUBMATCH+1];
  token_order_t s;
  char *p;

  /* ok accumulates success across all the writes below */
  bool_t ok = (bool_t)1;

  /* print out standard category file headers */
  ok = ok &&
    (0 < fprintf(output, MAGIC1, learner->filename,
		 (m_options & (1<<M_OPTION_REFMODEL)) ? "(ref)" : ""));
  ok = ok &&
    (0 < fprintf(output,
		 MAGIC2_o, learner->divergence, learner->logZ,
		 (short int)learner->max_order,
		 (m_options & (1<<M_OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" ));
  ok = ok &&
    (0 < fprintf(output, MAGIC3,
		 (short int)learner->max_hash_bits,
		 (long int)learner->full_token_count,
		 (long int)learner->unique_token_count,
		 (long int)(learner->doc.count)));

  ok = ok &&
    (0 < fprintf(output, MAGIC8_o,
		 learner->shannon, learner->shannon2));

  ok = ok &&
    (0 < fprintf(output, MAGIC10_o,
		 learner->alpha, learner->beta,
		 learner->mu, learner->s2));

  /* NOTE(review): write_mediaprobs' own fprintf results are not folded
     into ok here */
  write_mediaprobs(output, learner);

  /* print out any regexes we might need */
  for(c = 0; c < regex_count; c++) {
    /* write the bitmap: one ASCII digit per submatch index that is
       tagged in re[c].submatches */
    smb[0] = '\0';
    for(p = smb, s = 1; s <= MAX_SUBMATCH; s++) {
      if( re[c].submatches & (1<<s) ) {
	*p++ = (char)s + '0';
      }
    }
    *p = '\0';

    ok = ok &&
      (0 < fprintf(output, MAGIC5_o, re[c].string, smb));
  }

  /* print options */
  ok = ok &&
    (0 < fprintf(output, MAGIC4_o, m_options, m_cp, m_dt,
		 print_model_options(m_options, m_cp, scratchbuf)));

  ok = ok &&
    (0 < fprintf(output, MAGIC6));
  return ok;
}
709
710 #if defined DIGITIZE_DIGRAMS
711 typedef digitized_weight_t myweight_t;
712 #else
713 typedef weight_t myweight_t;
714 #endif
715
716 /* writes the learner to a file for easily readable category */
717 /* the category file is first constructed as a temporary file,
718 then renamed if no problems occured. Because renames are
719 atomic on POSIX, this guarantees that a loaded category file
720 used for classifications is never internally corrupt */
save_learner(learner_t * learner,char * opath)721 error_code_t save_learner(learner_t *learner, char *opath) {
722
723 alphabet_size_t i, j;
724 hash_count_t t;
725 FILE *output;
726 char *tempname = NULL;
727
728 bool_t ok;
729 size_t n;
730 c_item_t ci;
731 c_item_t *ci_ptr;
732 myweight_t shval;
733 myweight_t *shval_ptr = NULL;
734
735 long mmap_offset = 0;
736 size_t mmap_length = 0;
737 byte_t *mmap_start = NULL;
738
739
740 if( u_options & (1<<U_OPTION_VERBOSE) ) {
741 fprintf(stdout, "saving category to file %s\n", learner->filename);
742 }
743
744 /* don't overwrite data files */
745 if( !check_magic_write(learner->filename, MAGIC1, 10) ) {
746 exit(1);
747 }
748
749 /* In case we have both the -m and -o switches we try to write the
750 data with mmap. We don't do this in general, because mmap can
751 cause corruption, but if -m and -o are used together, we assume the
752 user knows that a single process must read/write the file at a time.
753 Also, we don't try to create the file - if the file doesn't exist,
754 we won't gain much time by using mmap on that single occasion. */
755 if( opath && *opath && (u_options & (1<<U_OPTION_MMAP)) ) {
756 ok = (bool_t)0;
757 output = fopen(learner->filename, "r+b");
758 if( output ) {
759 ok = (bool_t)1;
760 if( out_iobuf ) {
761 setvbuf(output, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize));
762 }
763
764 ok = ok && write_category_headers(learner, output);
765 if( !ok ) {
766 goto skip_mmap;
767 }
768 /* now mmap the file and write out the arrays real quick */
769 mmap_offset = ftell(output);
770 if( mmap_offset == (long)-1 ) {
771 ok = 0;
772 goto skip_mmap;
773 }
774 mmap_length = (size_t)mmap_offset + (ASIZE * ASIZE * SIZEOF_DIGRAMS) +
775 learner->max_tokens * sizeof(c_item_t);
776
777 ok = ok && (-1 != ftruncate(fileno(output), (off_t)mmap_length));
778 if( !ok ) {
779 goto skip_mmap;
780 }
781
782 mmap_start = (byte_t *)MMAP(0, mmap_length,
783 PROT_READ|PROT_WRITE, MAP_SHARED, fileno(output), 0);
784 if( mmap_start == MAP_FAILED ) { mmap_start = NULL; }
785 if( !mmap_start ) {
786 ok = 0;
787 goto skip_mmap;
788 }
789
790 MLOCK(mmap_start, mmap_length);
791
792 MADVISE(mmap_start, mmap_length, MADV_SEQUENTIAL|MADV_WILLNEED);
793
794 /* character frequencies */
795 shval_ptr = (myweight_t *)(mmap_start + mmap_offset);
796 for(i = 0; i < ASIZE; i++) {
797 for(j = 0; j < ASIZE; j++) {
798 *shval_ptr = HTON_DIGRAM(PACK_DIGRAMS(learner->dig[i][j]));
799 shval_ptr++;
800 }
801 }
802
803 /* token/feature weights */
804 ci_ptr = (c_item_t *)(mmap_start + mmap_offset +
805 (ASIZE * ASIZE * SIZEOF_DIGRAMS));
806 for(t = 0; t < learner->max_tokens; t++) {
807 /* write each element so that it's easy to read back in a
808 c_item_t array */
809 SET(ci_ptr->id,learner->hash[t].id);
810 ci_ptr->lam = learner->hash[t].lam;
811
812 ci_ptr->id = HTON_ID(ci_ptr->id);
813 ci_ptr->lam = HTON_LAMBDA(ci_ptr->lam);
814 ci_ptr++;
815 }
816
817 skip_mmap:
818 fclose(output);
819
820 if( mmap_start ) {
821 MUNMAP(mmap_start, mmap_length);
822 mmap_start = NULL;
823 }
824 }
825
826 if( ok ) {
827
828 return 1; /* we're done */
829 } else {
830 errormsg(E_WARNING, "could not mmap %s, trying stdio.\n",
831 learner->filename);
832 /* oh oh, file is corrupt. We delete it and try again below
833 without mmap */
834 unlink(learner->filename);
835 }
836 }
837
838 /* this keeps track to see if writing is successful,
839 it's not foolproof, but probably good enough */
840 ok = (bool_t)1;
841
842 output = mytmpfile(learner->filename, &tempname);
843 if( output ) {
844
845 if( out_iobuf ) {
846 setvbuf(output, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize));
847 }
848
849 ok = ok && write_category_headers(learner, output);
850
851 /* end of readable stuff */
852 if( ok ) {
853 /* now write arrays: this is far(!) from efficient, but the bits
854 we need are not nicely tucked into a block of memory, and due
855 to the sizes involved when dealing with possibly millions of tokens, we
856 can't afford to malloc a brand new block just to pack everything
857 nicely. At least, the file buffering should mitigate the performance
858 hit, and we prefer to learn slowly and classify fast. */
859
860 /* character frequencies */
861 for(i = 0; i < ASIZE; i++) {
862 for(j = 0; j < ASIZE; j++) {
863 shval = HTON_DIGRAM(PACK_DIGRAMS(learner->dig[i][j]));
864 for(n = 0; n < 1; ) {
865 if( 0 > (n = fwrite(&shval, SIZEOF_DIGRAMS, (size_t)1, output)) ) {
866 ok = 0;
867 goto skip_remaining;
868 }
869 }
870 }
871 }
872
873 MADVISE(learner->hash, sizeof(l_item_t) * learner->max_tokens,
874 MADV_SEQUENTIAL|MADV_WILLNEED);
875
876 /* token/feature weights */
877 for(t = 0; t < learner->max_tokens; t++) {
878 /* write each element so that it's easy to read back in a c_item_t array */
879 SET(ci.id,learner->hash[t].id);
880 ci.lam = learner->hash[t].lam;
881
882 ci.id = HTON_ID(ci.id);
883 ci.lam = HTON_LAMBDA(ci.lam);
884
885 for(n = 0; n < 1; ) {
886 if( 0 > (n = fwrite(&ci, sizeof(ci), (size_t)1, output)) ) {
887 ok = 0;
888 goto skip_remaining;
889 }
890 }
891 }
892 }
893
894 skip_remaining:
895
896 fclose(output);
897
898 /* the rename is atomic on posix */
899 if( !ok || !myrename(tempname, learner->filename) ) {
900 errormsg(E_ERROR,
901 "due to a potential file corruption, category %s was not updated\n",
902 learner->filename);
903 unlink(tempname);
904 cleanup.tempfile = NULL;
905 }
906 free(tempname);
907 } else {
908 errormsg(E_ERROR, "cannot open tempfile for writing %s\n", learner->filename);
909 return 0;
910 }
911
912
913 return 1;
914 }
915
916 /* Instead of saving a full new category file, we try to
917 * overwrite the contents of an already open, memory mapped category.
918 * CAREFUL: THIS CAN CORRUPT THE CATEGORY!
919 * If the return value is 0, then you should assume the category no
920 * longer exists on disk.
921 */
fast_partial_save_learner(learner_t * learner,category_t * xcat)922 error_code_t fast_partial_save_learner(learner_t *learner, category_t *xcat) {
923 char *p, *q;
924 #define REPLBUF (3*128)
925 char buf[REPLBUF];
926 charbuf_len_t n, max;
927
928 alphabet_size_t i, j;
929 hash_count_t t;
930 c_item_t *ci_ptr;
931 myweight_t *shval_ptr;
932
933 if( xcat->mmap_start &&
934 (xcat->model.options == learner->model.options) &&
935 (xcat->max_order == learner->max_order) &&
936 (xcat->max_hash_bits == learner->max_hash_bits) ) {
937 max = 0;
938 buf[0] = '\0';
939 /* we only overwrite the header if there's exactly enough space */
940 /* we must overwrite 3 lines which all start with # */
941 q = p = strchr((char *)xcat->mmap_start + 1, '#');
942 if( p ) {
943 n = snprintf(buf, (size_t)(REPLBUF - max), MAGIC2_o,
944 learner->divergence, learner->logZ,
945 (short int)learner->max_order,
946 (m_options & (1<<M_OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" );
947 max += n;
948 }
949 q = strchr(q + 1, '#');
950 if( q && (max < REPLBUF) ) {
951 n = snprintf(buf + max, (size_t)(REPLBUF - max), MAGIC3,
952 (short int)learner->max_hash_bits,
953 (long int)learner->full_token_count,
954 (long int)learner->unique_token_count,
955 (long int)(learner->doc.count));
956 max += n;
957 }
958 q = strchr(q + 1, '#');
959 if( q && (max < REPLBUF) ) {
960 n = snprintf(buf + max, (size_t)(REPLBUF - max), MAGIC8_o,
961 learner->shannon, learner->shannon2);
962 max += n;
963 }
964 q = strchr(q + 1, '#');
965 if( q && (max < REPLBUF) ) {
966 n = snprintf(buf + max, (size_t)(REPLBUF - max), MAGIC10_o,
967 learner->alpha, learner->beta,
968 learner->mu, learner->s2);
969 max += n;
970 }
971 /* now check it fits into the existing space */
972 q = strchr(q + 1, '#');
973 if( (q - p) == max ) {
974 /* from here on, there's no turning back. Since we don't
975 overwrite everything, if there's a problem, we'll have to
976 unlink the category */
977
978 /* header */
979 memcpy(p, buf, (size_t)max);
980
981 /* character frequencies */
982 shval_ptr = (myweight_t *)(xcat->mmap_start + xcat->mmap_offset -
983 ASIZE * ASIZE * sizeof(myweight_t));
984 for(i = 0; i < ASIZE; i++) {
985 for(j = 0; j < ASIZE; j++) {
986 *shval_ptr = HTON_DIGRAM(PACK_DIGRAMS(learner->dig[i][j]));
987 shval_ptr++;
988 }
989 }
990 /* token/feature weights
991 (we assume cat->max_tokens <= learner->max_tokens, and
992 that filled hash values in cat are also filled in learner) */
993 ci_ptr = (c_item_t *)(xcat->mmap_start + xcat->mmap_offset);
994 for(t = 0; t < learner->max_tokens; t++) {
995 /* write each element so that it's easy to read back in a
996 c_item_t array */
997 SET(ci_ptr->id,learner->hash[t].id);
998 ci_ptr->lam = learner->hash[t].lam;
999
1000 ci_ptr->id = HTON_ID(ci_ptr->id);
1001 ci_ptr->lam = HTON_LAMBDA(ci_ptr->lam);
1002 ci_ptr++;
1003 }
1004
1005 if( u_options & (1<<U_OPTION_VERBOSE) ) {
1006 fprintf(stdout, "partially overwriting category file %s\n",
1007 learner->filename);
1008 }
1009 return 1;
1010 }
1011 }
1012 if( u_options & (1<<U_OPTION_VERBOSE) ) {
1013 fprintf(stdout, "can't make fast save\n");
1014 }
1015 return 0;
1016 }
1017
1018
tmp_seek_start(learner_t * learner)1019 bool_t tmp_seek_start(learner_t *learner) {
1020 if( learner->tmp.mmap_start ) {
1021 learner->tmp.mmap_cursor = learner->tmp.mmap_offset;
1022 return (bool_t)1;
1023 } else if( learner->tmp.file ) {
1024 clearerr(learner->tmp.file);
1025 return (fseek(learner->tmp.file, learner->tmp.offset, SEEK_SET) == 0);
1026 }
1027 return (bool_t)0;
1028 }
1029
tmp_seek_end(learner_t * learner)1030 bool_t tmp_seek_end(learner_t *learner) {
1031 if( learner->tmp.mmap_start ) {
1032 learner->tmp.mmap_cursor = learner->tmp.mmap_offset + learner->tmp.used;
1033 return (bool_t)1;
1034 } else if( learner->tmp.file ) {
1035 return (fseek(learner->tmp.file,
1036 learner->tmp.offset + learner->tmp.used, SEEK_SET) == 0);
1037 }
1038 return (bool_t)0;
1039 }
1040
/* Current position within the temporary token data, relative to where
   the data begins; 0 when neither a map nor a file is available. */
long tmp_get_pos(learner_t *learner) {
  if( learner->tmp.mmap_start ) {
    return learner->tmp.mmap_cursor - learner->tmp.mmap_offset;
  }
  if( learner->tmp.file ) {
    return ftell(learner->tmp.file) - learner->tmp.offset;
  }
  return 0L;
}
1049
/* Read up to bufsiz bytes from the temporary token stream. On return
   *startp points at the data (a window into the mmap, or buf when
   going through stdio) and the byte count actually available is
   returned; 0 on error or end of data. */
size_t tmp_read_block(learner_t *learner, byte_t *buf, size_t bufsiz,
		      const byte_t **startp) {
  long remaining = learner->tmp.used - tmp_get_pos(learner);

  /* clamp the request so we never read past the used region */
  if( bufsiz > (size_t)remaining ) {
    bufsiz = (remaining >= 0) ? (size_t)remaining : 0;
  }
  if( learner->tmp.mmap_start ) {
    /* no copy needed: expose the mapped bytes directly */
    *startp = learner->tmp.mmap_start + learner->tmp.mmap_cursor;
    learner->tmp.mmap_cursor += bufsiz;
    return bufsiz;
  }
  if( learner->tmp.file ) {
    if( ferror(learner->tmp.file) || feof(learner->tmp.file) ) {
      return 0;
    }
    *startp = buf;
    return fread(buf, 1, bufsiz, learner->tmp.file);
  }
  return 0;
}
1068
/* must unmap/ftruncate/remap if using mmap, don't touch mmap_cursor */
/* Grows the temporary token area by TOKEN_LIST_GROW bytes whenever
   fewer than 2*MAX_TOKEN_LEN bytes remain free, so that
   tmp_write_token() can always append one more token safely.  When
   the area is mmapped, the mapping must be torn down before
   ftruncate() and rebuilt afterwards; if the remap fails we fall back
   to plain stdio operations on the (still open) file, preserving the
   logical cursor position.  Returns true iff a resize took place. */
bool_t tmp_grow(learner_t *learner) {
  long offset;
  if( learner->tmp.file &&
      ((learner->tmp.used + 2 * MAX_TOKEN_LEN) >= learner->tmp.avail) ) {

    if( learner->tmp.mmap_start ) {
      MUNLOCK(learner->tmp.mmap_start, learner->tmp.mmap_length);
      MUNMAP(learner->tmp.mmap_start, learner->tmp.mmap_length);
    }

    learner->tmp.avail += TOKEN_LIST_GROW;

    if( -1 == ftruncate(fileno(learner->tmp.file),
			learner->tmp.offset + learner->tmp.avail) ) {
      errormsg(E_FATAL,
	       "could not resize the tempfile, unable to proceed.\n");
    }

    /* mmap_start is still old value, but invalid */
    if( learner->tmp.mmap_start ) {
      /* map from a page boundary at or below tmp.offset; the extra
	 page in mmap_length accounts for the alignment slack */
      offset = PAGEALIGN(learner->tmp.offset);
      learner->tmp.mmap_length = learner->tmp.avail + system_pagesize;
      learner->tmp.mmap_start =
	(byte_t *)MMAP(learner->tmp.mmap_start, learner->tmp.mmap_length,
		       PROT_READ|PROT_WRITE, MAP_SHARED,
		       fileno(learner->tmp.file), offset);
      if( learner->tmp.mmap_start == MAP_FAILED ) { learner->tmp.mmap_start = NULL; }
      if( !learner->tmp.mmap_start ) {
	if( u_options & (1<<U_OPTION_VERBOSE) ) {
	  errormsg(E_WARNING, "could not mmap token file after resize\n");
	}
	/* this isn't fatal, we just switch to ordinary file ops, since
	   the file is still open :) */
	if( -1 == fseek(learner->tmp.file,learner->tmp.offset +
			(learner->tmp.mmap_cursor - learner->tmp.mmap_offset),
			SEEK_SET) ) {
	  /* ok, this is fatal */
	  errormsg(E_FATAL,
		   "could not resize and seek in tempfile, unable to proceed.\n");
	}
	learner->tmp.mmap_length = 0;
	learner->tmp.mmap_offset = 0;
	learner->tmp.mmap_cursor = 0;
      } else {
	/* all is well, relock the pages */
	MLOCK(learner->tmp.mmap_start, learner->tmp.mmap_length);
	MADVISE(learner->tmp.mmap_start, learner->tmp.mmap_length,
		MADV_SEQUENTIAL|MADV_WILLNEED);
      }
      /* if there was a fatal error, we simply don't return */
      return (bool_t)1;
    }
  }
  return (bool_t)0;
}
1125
1126 /* assume we are at the end of the token list and there is enough room */
tmp_write_token(learner_t * learner,const char * tok)1127 void tmp_write_token(learner_t *learner, const char *tok) {
1128 byte_t *p;
1129 if( learner->tmp.mmap_start ) {
1130 p = learner->tmp.mmap_start + learner->tmp.mmap_cursor;
1131 while( *tok ) { *p++ = *tok++; }
1132 *p++ = TOKENSEP;
1133 learner->tmp.mmap_cursor = p - learner->tmp.mmap_start;
1134 learner->tmp.used = learner->tmp.mmap_cursor - learner->tmp.mmap_offset;
1135 } else if( learner->tmp.file ) {
1136 /* now save token to temporary file */
1137 if( (EOF == fputs(tok, learner->tmp.file)) ||
1138 (EOF == fputc(TOKENSEP, learner->tmp.file)) ) {
1139 errormsg(E_ERROR,
1140 "could not write a token to tempfile, calculations will be skewed.\n");
1141 } else {
1142 learner->tmp.used += (strlen(tok) + 1);
1143 }
1144 }
1145 }
1146
/* Releases all temporary token storage: unlocks and unmaps the mmap
   region if present, then closes the backing stream and, if we own a
   named tempfile, unlinks it and frees its name. */
void tmp_close(learner_t *learner) {
  if( learner->tmp.mmap_start ) {
    MUNLOCK(learner->tmp.mmap_start, learner->tmp.mmap_length);
    MUNMAP(learner->tmp.mmap_start, learner->tmp.mmap_length);
    learner->tmp.mmap_start = NULL;
  }
  if( !learner->tmp.file ) {
    return;
  }
  fclose(learner->tmp.file);
  learner->tmp.file = NULL;
  if( learner->tmp.filename ) {
    unlink(learner->tmp.filename);
    cleanup.tempfile = NULL;
    free(learner->tmp.filename);
    learner->tmp.filename = NULL;
  }
}
1164
1165 /***********************************************************
1166 * EMPLIST FUNCTIONS *
1167 ***********************************************************/
emplist_grow(emplist_t * emp)1168 bool_t emplist_grow(emplist_t *emp) {
1169 hash_count_t *n;
1170
1171 if( emp ) {
1172 if( u_options & (1<<U_OPTION_VERBOSE) ) {
1173 errormsg(E_WARNING, "growing emp.stack %" FMT_printf_integer_t "\n",
1174 emp->max * 2);
1175 }
1176
1177 n = (hash_value_t *)realloc(emp->stack,
1178 sizeof(hash_value_t *) * emp->max * 2);
1179 if( n != NULL ) {
1180 emp->max *= 2;
1181 emp->stack = n;
1182 return 1;
1183 }
1184 }
1185 errormsg(E_WARNING,
1186 "disabling emp.stack. Not enough memory? I couldn't allocate %li bytes\n",
1187 (sizeof(l_item_t *) * ((long int)emp->max) * 2));
1188 m_options &= ~(1<<M_OPTION_CALCENTROPY);
1189
1190 return 0;
1191 }
1192
/* Copies the empirical token list emp into reservoir slot n,
   reallocating the slot's stack (rounded up to a power of two) when
   the existing one is absent or too small.  On allocation failure the
   slot is left empty and a warning is emitted. */
void emplist_replace(emplist_t *reservoir, document_count_t n, emplist_t *emp) {
  emplist_t *slot = &reservoir[n];

  if( (slot->stack != NULL) && (emp->top > slot->max) ) {
    /* existing buffer too small: discard it */
    free(slot->stack);
    slot->stack = NULL;
  }
  if( slot->stack == NULL ) {
    slot->max = 1;
    while( emp->top > slot->max ) {
      slot->max *= 2;
    }
    slot->stack = (hash_value_t *)malloc(slot->max * sizeof(hash_value_t));
  }
  if( slot->stack != NULL ) {
    memcpy(slot->stack, emp->stack, emp->top * sizeof(hash_value_t));
    slot->top = emp->top;
    slot->shannon = emp->shannon;
  } else {
    errormsg(E_WARNING,
	     "couldn't replace document #%" FMT_printf_integer_t " in reservoir, ignoring\n", n);
  }
}
1212
1213 /*
1214 * Adds emp to the reservoir in such a way that we
1215 * end up with a uniform random sample of size RESERVOIR_SIZE.
1216 * This algorithm is a short form of reservoir sampling, and we use
1217 * it here because we don't know the total number of documents
1218 * until it's too late.
1219 */
1220
emplist_add_to_reservoir(emplist_t * reservoir,document_count_t docno,emplist_t * emp)1221 void emplist_add_to_reservoir(emplist_t *reservoir, document_count_t docno,
1222 emplist_t *emp) {
1223 double xi;
1224 if( docno < RESERVOIR_SIZE) {
1225 emplist_replace(reservoir, docno, emp);
1226 } else {
1227 xi = ((double)rand()/(RAND_MAX + 1.0));
1228 if( (docno + 1) * xi < RESERVOIR_SIZE ) {
1229 xi = ((double)rand()/(RAND_MAX + 1.0));
1230 emplist_replace(reservoir, (int)(RESERVOIR_SIZE * xi), emp);
1231 }
1232 }
1233 }
1234
1235 /***********************************************************
1236 * LEARNER FUNCTIONS *
1237 ***********************************************************/
free_learner_hash(learner_t * learner)1238 void free_learner_hash(learner_t *learner) {
1239 if( learner->hash ) {
1240 if( learner->mmap_start != NULL ) {
1241 MUNMAP(learner->mmap_start,
1242 learner->mmap_hash_offset + cat->max_tokens * sizeof(c_item_t));
1243 learner->mmap_start = NULL;
1244 learner->mmap_learner_offset = 0;
1245 learner->mmap_hash_offset = 0;
1246 learner->hash = NULL;
1247 }
1248 if( learner->hash ) {
1249 free(learner->hash);
1250 learner->hash = NULL;
1251 }
1252 }
1253 }
1254
/* Loads a learner_t and its token hash table from the open dump file
   `input`.  With U_OPTION_MMAP the fixed-size portion of the file is
   mmap()ed in two steps (see comment below); otherwise, or if mmap
   fails, the struct and the table are fread() into malloc()ed memory.
   Returns true iff learner->hash is usable afterwards.  On failure
   the learner may have been partially overwritten by file contents --
   callers treat failure as fatal for the learner. */
bool_t create_learner_hash(learner_t *learner, FILE *input, bool_t readonly) {
  size_t j, n;
  byte_t *mmap_start;
  long mmap_learner_offset;
  long mmap_hash_offset;

  if( learner->hash ) { free_learner_hash(learner); }

  if( u_options & (1<<U_OPTION_MMAP) ) {
    /* here we mmap the fixed size portion of the dump file. First
       we mmap the learner structure portion as a fast way of loading
       the learner variable, then we must undo/redo the mmap because
       we cannot know the hash table size until we've read the dumped learner_t.
    */

    mmap_learner_offset = strlen(MAGIC_ONLINE);
    mmap_hash_offset = mmap_learner_offset + sizeof(learner_t);

    mmap_start =
      (byte_t *)MMAP(0, mmap_hash_offset,
		     PROT_READ|(readonly ? 0 : PROT_WRITE),
		     MAP_SHARED, fileno(input), 0);
    if( mmap_start == MAP_FAILED ) { mmap_start = NULL; }
    if( mmap_start ) {
      /* first we overwrite the learner struct with the contents of
	 the mmapped region */
      memcpy(learner, mmap_start + mmap_learner_offset, sizeof(learner_t));

      MUNMAP(mmap_start, mmap_hash_offset);

      /* remap, now covering the full hash table of l_item_t entries */
      mmap_start =
	(byte_t *)MMAP(mmap_start,
		       mmap_hash_offset + sizeof(l_item_t) * learner->max_tokens,
		       PROT_READ|(readonly ? 0 : PROT_WRITE),
		       MAP_SHARED, fileno(input), 0);
      if( mmap_start == MAP_FAILED ) { mmap_start = NULL; }
      if( mmap_start ) {
	/* now fill some member variables */
	learner->mmap_start = mmap_start;
	learner->mmap_learner_offset = mmap_learner_offset;
	learner->mmap_hash_offset = mmap_hash_offset;
	learner->hash = (l_item_t *)(mmap_start + mmap_hash_offset);
	MADVISE(learner->mmap_start,
		learner->mmap_hash_offset + sizeof(l_item_t) * learner->max_tokens,
		MADV_SEQUENTIAL|MADV_WILLNEED);
	/* lock the pages to prevent swapping - on Linux, this
	   works without root privs so long as the user limits
	   are big enough - mine are unlimited ;-)
	   On other OSes, root may me necessary. */
	MLOCK(learner->mmap_start,
	      learner->mmap_hash_offset + sizeof(l_item_t) * learner->max_tokens);
      }
    }
  }

  if( !learner->hash ) {
    /* mmap disabled or failed: read the learner struct, retrying on
       short reads.  NOTE(review): a persistent zero-item read at
       clean EOF (no ferror) would loop forever here -- assumes the
       file always contains a complete learner_t; confirm */
    for(n = 0; n < 1; ) {
      n = fread(learner, sizeof(learner_t), 1, input);
      if( (n == 0) && ferror(input) ) {
	return 0;
      }
    }

    /* allocate hash table normally */
    learner->hash = (l_item_t *)malloc(learner->max_tokens * sizeof(l_item_t));
    if( !learner->hash ) {
      errormsg(E_WARNING, "failed to allocate %li bytes for learner hash.\n",
	       (sizeof(l_item_t) * ((long int)learner->max_tokens)));
      /* this error is not fatal, because we have another opportunity to
	 allocate the hash later */
      return 0;
    }

    MADVISE(learner->hash, sizeof(l_item_t) * learner->max_tokens,
	    MADV_SEQUENTIAL);

    /* read the table in a loop: fread may deliver short counts */
    for(n = j = 0; n < learner->max_tokens; n += j) {
      j = fread(&learner->hash[n], sizeof(l_item_t),
		learner->max_tokens - n, input);
      if( (j == 0) && ferror(input) ) {
	errormsg(E_WARNING, "could not read online learner dump, ignoring.\n");
	return 0;
      }
    }

  }
  return (learner->hash != NULL);
}
1344
/* returns true if the hash could be grown, false otherwise.
   When the hash is grown, the old values must be redistributed.
   To do this, we reuse the l_item.lam member as a flag, since this
   is only needed later during minimization */
/* Doubles the hash table (one extra hash bit) and rehashes all filled
   slots in place.  If the table is currently mmapped, it is first
   copied into a private malloc()ed buffer so realloc can be used.  If
   growing is not possible (bit limit reached) the GROWHASH option is
   cleared so we don't keep retrying.  Note: returns 1 even when the
   table was full and could not be grown -- only a disabled GROWHASH
   option returns 0. */
bool_t grow_learner_hash(learner_t *learner) {
  hash_count_t c, new_size;
  l_item_t *i, temp_item;

  if( !(u_options & (1<<U_OPTION_GROWHASH)) ) {
    return 0;
  } else {
    if( learner->max_hash_bits < default_max_grow_hash_bits ) {

      /*
	this warning is mostly useless and distracting

	errormsg(E_WARNING,
	"growing hash table, next time perhaps consider using -h %d\n",
	learner.max_hash_bits + 1);

      */

      /* there are two cases - if learner is mmapped, we must malloc a
       * new hash, otherwise we can realloc. An easy way to reduce the
       * first case to the second is to create a throwaway hash with
       * the old size and copy the mmapped contents to it.
       */
      if( learner->mmap_start ) {
	if( (i = (l_item_t *)malloc(learner->max_tokens *
				    sizeof(l_item_t))) == NULL ) {
	  errormsg(E_WARNING,
		   "failed to malloc hash table for growing.\n");
	  return 0;
	}
	memcpy(i, learner->hash, sizeof(l_item_t) * learner->max_tokens);
	free_learner_hash(learner);
	learner->hash = i;

	/* there's a MUNLOCK call below, so in theory we should
	   balance this out by calling MLOCK here, since the i pointed
	   region was never locked. However mlock/munlock calls do not nest,
	   so there's no need.
	*/
	/* if( u_options & (1<<U_OPTION_MMAP) ) { */
	/*   MLOCK(learner->hash, sizeof(l_item_t) * learner->max_tokens); */
	/* } */
      }


      if( u_options & (1<<U_OPTION_MMAP) ) {
	MUNLOCK(learner->hash, sizeof(l_item_t) * learner->max_tokens);
      }

      /* grow the memory around the hash */
      if( (i = (l_item_t *)realloc(learner->hash,
				   sizeof(l_item_t) *
				   (1<<(learner->max_hash_bits+1)))) == NULL ) {
	errormsg(E_WARNING,
		 "failed to grow hash table.\n");
	return 0;
      }
      /* we need the old value of learner->max_tokens for a while yet */
      learner->max_hash_bits++;

      if( u_options & (1<<U_OPTION_MMAP) ) {
	MLOCK(i, sizeof(l_item_t) * (1<<learner->max_hash_bits));
      }

      MADVISE(i, sizeof(l_item_t) * (1<<learner->max_hash_bits), MADV_SEQUENTIAL);

      /* realloc doesn't initialize the memory */
      memset(&((l_item_t *)i)[learner->max_tokens], 0,
	     ((1<<learner->max_hash_bits) - learner->max_tokens) *
	     sizeof(l_item_t));
      learner->hash = i;

      /* now mark every used slot */
      for(c = 0; c < learner->max_tokens; c++) {
	if( FILLEDP(&learner->hash[c]) ) {
	  SETMARK(&learner->hash[c]);
	}
      }

      MADVISE(i, sizeof(l_item_t) * (1<<learner->max_hash_bits), MADV_RANDOM);

      /* now relocate each marked slot and clear it */
      new_size = (1<<learner->max_hash_bits) - 1;
      for(c = 0; c < learner->max_tokens; c++) {
	while( MARKEDP(&learner->hash[c]) ) {
	  /* find where it belongs */
	  i = &learner->hash[learner->hash[c].id & new_size];
	  while( FILLEDP(i) && !MARKEDP(i) ) {
	    i++;
	    i = (i > &learner->hash[new_size]) ? learner->hash : i;
	  } /* guaranteed to exit since hash is larger than original */

	  /* clear the mark - this must happen after we look for i,
	     since it should be possible to find i == learner->hash[c] */
	  UNSETMARK(&learner->hash[c]);

	  /* now refile */
	  if( i != &learner->hash[c] ) {
	    if( MARKEDP(i) ) {
	      /* swap */
	      memcpy(&temp_item, i, sizeof(l_item_t));
	      memcpy(i, &learner->hash[c], sizeof(l_item_t));
	      memcpy(&learner->hash[c], &temp_item, sizeof(l_item_t));
	    } else {
	      /* copy and clear */
	      memcpy(i, &learner->hash[c], sizeof(l_item_t));
	      memset(&learner->hash[c], 0, sizeof(l_item_t));
	    }
	  }
	  /* now &learner->hash[c] is marked iff there was a swap */
	}
      }
      learner->max_tokens = (1<<learner->max_hash_bits);
    } else {
      u_options &= ~(1<<U_OPTION_GROWHASH); /* it's the law */
      errormsg(E_WARNING,
	       "the token hash table is nearly full, slowing down.\n");
    }
  }
  return 1;
}
1470
/* This code malloc()s and fread()s the learner structure. */
/* Loads an online memory dump from `path` into `learner`: magic
   check, learner struct + hash table (via create_learner_hash), then
   positions the trailing token list for appending, optionally
   mmapping it.  Returns true on success.  On any failure after the
   magic check it calls errormsg(E_FATAL, ...), which does not return
   despite the "learning from scratch" wording -- NOTE(review):
   message text and severity appear contradictory; confirm intent. */
bool_t read_online_learner_struct(learner_t *learner, char *path, bool_t readonly) {
  FILE *input;
  char buf[BUFSIZ+1]; /* must be greater than MAGIC_BUFSIZE */
  bool_t ok = 0;
  char *sav_filename;
  long offset;
  l_item_t *p, *e;

  /*
   * This code malloc()s and fread()s the learner structure, or alternatively
   * it mmap()s it for speed. Note we open for read/write, because the
   * file might be left open and reused later.
   */
  input = fopen(path, readonly ? "rb" : "r+b");
  if( input ) {

    /* if( out_iobuf ) { */
    /*   setvbuf(input, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize)); */
    /* } */

    /* NOTE(review): if fgets fails, buf is read uninitialized by the
       strncmp in the inner branch below -- confirm */
    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
	(strncmp(buf, MAGIC_ONLINE, strlen(MAGIC_ONLINE)) != 0) ) {
      /* the first 35 chars identify the format; a match there with a
	 full-magic mismatch means a version difference */
      if( strncmp(buf, MAGIC_ONLINE, 35) == 0 ) {
	errormsg(E_WARNING,
		 "the file %s has the wrong version number, it will be ignored\n",
		 path);
      } else {
	errormsg(E_WARNING,
		 "the file %s is not a dbacl online memory dump, it will be ignored\n",
		 path);
      }
    } else {
      /* from here on, if a problem arises then learner is hosed, so we exit */
      ok = 1;

      /* must save some prefilled members because learner is read from disk */
      sav_filename = learner->filename;

      if( !create_learner_hash(learner, input, readonly) ) {
	ok = 0;
	goto skip_read_online;
      }
      /* restore members */
      learner->filename = sav_filename;

      /* override options */
      if( (m_options != learner->model.options) ||
	  (zthreshold != learner->model.tmin) ||
	  (m_cp != learner->model.cp) ||
	  (m_dt != learner->model.dt) ) {
	/* we don't warn about changes in u_options, as they can happen
	   during computations */
	errormsg(E_WARNING,
		 "some command line options were changed when loading %s\n",
		 path);
	m_options = learner->model.options;
	m_cp = learner->model.cp;
	m_dt = learner->model.dt;
	/* u_options = learner->u_options; */
	zthreshold = learner->model.tmin;
      }

      /* last, there is a long list of token strings, we compute the
	 start offset, and seek to the end so we can append more
	 tokens */
      learner->tmp.file = input;
      learner->tmp.filename = NULL;
      learner->tmp.offset = strlen(MAGIC_ONLINE) + sizeof(learner_t) +
	sizeof(l_item_t) * learner->max_tokens;
      learner->tmp.iobuf = NULL;

      if( u_options & (1<<U_OPTION_MMAP) ) {
	offset = PAGEALIGN(learner->tmp.offset);
	/* since offset can be up to a page less than the true beginning,
	   we must allow an extra page in the mmap length */
	learner->tmp.mmap_length = learner->tmp.avail + system_pagesize;
	learner->tmp.mmap_start =
	  (byte_t *)MMAP(0, learner->tmp.mmap_length,
			 PROT_READ|(readonly ? 0 : PROT_WRITE), MAP_SHARED,
			 fileno(learner->tmp.file), offset);
	if( learner->tmp.mmap_start == MAP_FAILED ) { learner->tmp.mmap_start = NULL; }
	if( learner->tmp.mmap_start ) {
	  MLOCK(learner->tmp.mmap_start, learner->tmp.mmap_length);
	  MADVISE(learner->tmp.mmap_start, learner->tmp.mmap_length,
		  MADV_SEQUENTIAL|MADV_WILLNEED);
	  learner->tmp.mmap_offset = learner->tmp.offset - offset;
	  learner->tmp.mmap_cursor = 0;
	} else {
	  learner->tmp.mmap_length = 0;
	  if( u_options & (1<<U_OPTION_VERBOSE) ) {
	    errormsg(E_WARNING, "couldn't mmap temporary tokens\n");
	  }
	}
      }

      if( !tmp_seek_end(learner) ) {
	ok = 0;
	learner->tmp.file = NULL;
	learner->tmp.filename = NULL;
	goto skip_read_online;
      }
      /* consistency check */
      if( tmp_get_pos(learner) != learner->tmp.used ) {
	ok = 0;
	learner->tmp.file = NULL;
	learner->tmp.filename = NULL;
	goto skip_read_online;
      }
      /* printf("tmp.used = %ld tmp.avail = %ld tmp.offset = %ld\n", */
      /* 	 learner->tmp.used, learner->tmp.avail, learner->tmp.offset); */

      /* last but not least, hash may contain junk because mmapped, so
	 clear some data */
      e = learner->hash + learner->max_tokens;
      for(p = learner->hash; p < e; p++) {
	if( FILLEDP(p) ) {
	  p->tmp.read.eff = 0;
	}
      }

    }
  skip_read_online:
    /* if we're ok, we must leave the file open, even if it's mmapped,
       because we must be able to unmap/remap it on the fly */
    if( !ok ) {
      if( learner->tmp.mmap_start ) {
	MUNMAP(learner->tmp.mmap_start, learner->tmp.mmap_length);
	learner->tmp.mmap_start = 0;
	learner->tmp.mmap_length = 0;
	learner->tmp.mmap_offset = 0;
	learner->tmp.mmap_cursor = 0;
      }
      fclose(input);
      errormsg(E_FATAL,
	       "the file %s could not be read, learning from scratch.\n",
	       path);
    }
  }

  if( u_options & (1<<U_OPTION_VERBOSE) ) {
    fprintf(stdout, "%s %s\n", ok ? "loaded" : "couldn't load", path);
  }

  return ok;
}
1617
/* this is a straight memory dump */
/* Persists the learner to `path`.  If the learner is already mmapped
   onto the dump file, only the struct header is refreshed in place
   (the hash and tokens are already in the file via the mapping).
   Otherwise a complete dump -- magic, struct, hash table, token list
   -- is written to a tempfile and atomically rename()d over path. */
void write_online_learner_struct(learner_t *learner, char *path) {
  FILE *output;
  char *tempname = NULL;
  bool_t ok;
  byte_t buf[BUFSIZ+1];
  size_t t, j, n;
  learner_t *mml;
  long tokoff;
  const byte_t *sp;

  if( !check_magic_write(path, MAGIC_ONLINE, strlen(MAGIC_ONLINE)) ) {
    /* we simply ignore this. Note that check_magic_write() already
       notifies the user */
    return;
  }


  /* if the hash table is memory mapped, then we simply update the
     learner structure and exit */

  if( learner->mmap_start != NULL ) {
    mml = (learner_t *)(learner->mmap_start + learner->mmap_learner_offset);
    memcpy(mml, learner, sizeof(learner_t));
    /* clear some variables just to be safe: the on-disk copy must not
       carry live pointers or mapping offsets */
    mml->mmap_start = NULL;
    mml->mmap_learner_offset = 0;
    mml->mmap_hash_offset = 0;
    mml->hash = NULL;

    return;
  }


  /* if we're here, then we must write out a completely new memory dump,
   * and the mmap_ variables are all zero. */

  output = mytmpfile(path, &tempname);
  if( output ) {

    if( out_iobuf ) {
      setvbuf(output, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize));
    }

    ok = 1;
    /* NOTE(review): MAGIC_ONLINE is passed as the format string --
       assumes the macro contains no conversion specifiers; confirm */
    ok = ok &&
      (0 < fprintf(output, MAGIC_ONLINE));

    /* save current model options - don't want them to change next time we learn */
    learner->model.options = m_options;
    learner->model.cp = m_cp;
    learner->model.dt = m_dt;
    learner->u_options = u_options;

    /* make sure some stuff is zeroed out */
    /* but leave others untouched, eg doc.A, doc.S, doc.count for shannon */
    learner->mmap_start = NULL; /* assert mmap_start == NULL */
    learner->mmap_learner_offset = 0;
    learner->mmap_hash_offset = 0;
    learner->doc.emp.top = 0;
    learner->doc.emp.max = 0;
    learner->doc.emp.stack = NULL;
    memset(learner->doc.reservoir, 0, RESERVOIR_SIZE * sizeof(emplist_t));

    /* write learner */
    ok = ok &&
      (0 < fwrite(learner, sizeof(learner_t), 1, output));
    /* write the hash table, looping because fwrite may be short */
    for(t = j = 0; t < learner->max_tokens; t += j) {
      j = fwrite(&(learner->hash[t]), sizeof(l_item_t),
		 learner->max_tokens - t, output);
      if( (j == 0) && ferror(output) ) {
	ok = 0;
	goto skip_write_online;
      }
    }

    tokoff = ftell(output);
    /* extend the size ofthe file - this is needed mostly for mmapping,
       but it might also marginally speed up the repeated writes below */
    if( -1 == ftruncate(fileno(output), tokoff + learner->tmp.avail) ) {
      ok = 0;
      goto skip_write_online;
    }

    /* write temporary tokens */
    if( !tmp_seek_start(learner) ) {
      errormsg(E_ERROR, "cannot seek in online token file %s\n", tempname);
      ok = 0;
      goto skip_write_online;
    }
    n = 0;
    while( (n = tmp_read_block(learner, buf, BUFSIZ, &sp)) > 0 ) {
      for(t = j = 0; t < n; t += j) {
	j = fwrite(sp + t, 1, n - t, output);
	if( (j == 0) && ferror(output) ) {
	  ok = 0;
	  goto skip_write_online;
	}
      }
    }
    /* consistency check */
    if( (tokoff + learner->tmp.used) != ftell(output) ) {
      ok = 0;
      goto skip_write_online;
    }

  skip_write_online:
    fclose(output);

    if( u_options & (1<<U_OPTION_VERBOSE) ) {
      fprintf(stdout, "writing online memory dump: %s\n", ok ? "success" : "failed");
    }

    /* the rename is atomic on posix */
    if( !ok || (rename(tempname, path) < 0) ) {
      errormsg(E_ERROR,
	       "due to a potential file corruption, %s was not updated\n",
	       path);
      unlink(tempname);
    }

    free(tempname);
  }
}
1742
/* Appends to dest's temporary token list every token from src's list
   whose hash id is not already present (filled) in dest's hash table.
   Both lists are streams of tokens separated by TOKENSEP bytes.
   Returns true on success, false if either list cannot be seeked. */
bool_t merge_temp_tokens(learner_t *dest, learner_t *src) {
  hash_value_t id;
  byte_t buf[BUFSIZ+1];
  char tok[(MAX_TOKEN_LEN+1)*MAX_SUBMATCH+EXTRA_TOKEN_LEN];
  size_t n = 0;

  const byte_t *p;
  char *q;

  l_item_t *k;

  if( !tmp_seek_start(src) ) {
    errormsg(E_ERROR, "cannot seek in temporary token file [%s]\n",
	     src->filename);
    return 0;
  }
  if( !tmp_seek_end(dest) ) {
    errormsg(E_ERROR, "cannot seek in temporary token file [%s]\n",
	     dest->filename);
    return 0;
  }

  /* q accumulates the current token into tok across block boundaries.
     NOTE(review): no bounds check on q -- assumes every stored token
     fits in tok because tmp_write_token only ever stored bounded
     tokens; confirm */
  q = tok;
  while( (n = tmp_read_block(src, buf, BUFSIZ, &p)) > 0 ) {
    /* p = buf; */
    /* p[n] = '\0'; */
    while( n-- > 0 ) {
      if( *p != TOKENSEP) {
	*q++ = *p; /* copy into tok */
      } else { /* interword space */
	*q = 0; /* append NUL to tok */
	/* now lookup weight in hash */
	id = hash_full_token(tok);
	k = find_in_learner(dest, id);
	if( !k || !FILLEDP(k) ) {
	  tmp_grow(dest); /* just in case we're full */
	  tmp_write_token(dest, tok);
	}
	q = tok; /* reset q */
      }
      p++;
    }
  }


  return 1;
}
1790
/* Folds every filled slot of src's hash table into dest's: new ids
   are inserted (growing dest's hash once if it is nearly full),
   counts and per-order totals are accumulated, and the digram table
   is summed.  Always returns true. */
bool_t merge_hashes(learner_t *dest, learner_t *src) {
  register l_item_t *i, *j, *e;
  alphabet_size_t ci, cj;

  e = src->hash + src->max_tokens;
  for(j = src->hash ; j != e; j++) {
    if( FILLEDP(j) ) {
      i = find_in_learner(dest, j->id);
      /* if the slot is empty but the table is at the HASH_FULL
	 threshold, try to grow once and re-probe */
      if( i && !FILLEDP(i) &&
	  ((100 * dest->unique_token_count) >=
	   (HASH_FULL * dest->max_tokens)) && grow_learner_hash(dest) ) {
	i = find_in_learner(dest,j->id);
	/* new i, go through all tests again */
      }
      if( i ) {
	if( FILLEDP(i) ) {

	  /* nothing */

	} else if( /* !FILLEDP(i) && */
		  ((100 * dest->unique_token_count) <
		   (HASH_FULL * dest->max_tokens) ) ) {

	  /* insert the new token id and update uniqueness counters */
	  SET(i->id, j->id);

	  INCREMENT(dest->unique_token_count,
		    K_TOKEN_COUNT_MAX, overflow_warning);

	  i->typ = j->typ;

	  /* order accounting */
	  dest->max_order = MAXIMUM(dest->max_order,i->typ.order);

	  INCREMENT(dest->fixed_order_unique_token_count[i->typ.order],
		    K_TOKEN_COUNT_MAX, overflow_warning);

	}

	/* NOTE(review): these accumulations run even when the slot
	   could not be filled (hash full and grow failed), updating
	   an empty slot's count -- confirm this is intended */
	INCREASE(i->count, j->count,
		 K_TOKEN_COUNT_MAX, overflow_warning);

	dest->tmax = MAXIMUM(dest->tmax, i->count);

	INCREASE(dest->full_token_count, j->count,
		 K_TOKEN_COUNT_MAX, overflow_warning);

	INCREASE(dest->fixed_order_token_count[i->typ.order], j->count,
		 K_TOKEN_COUNT_MAX, skewed_constraints_warning);

      }
    }
  }

  /* sum the character digram transition counts */
  for(ci = 0; ci < ASIZE; ci++) {
    for(cj = 0; cj < ASIZE; cj++) {
      dest->dig[ci][cj] += src->dig[ci][cj];
    }
  }

  return 1;
}
1852
1853
/* Loads the category dump at `path` into a throwaway learner and
   merges its temp tokens, hash table and document statistics into
   `learner`.  Global option variables are saved around init_learner()
   because it may overwrite them.  Merging dumps that use regular
   expressions is not supported.  Returns true on success. */
bool_t merge_learner_struct(learner_t *learner, char *path) {
  learner_t dummy;
  bool_t ok = 0;
  options_t m_sav, u_sav;

  /* init_learner() can clobber the global option words; restore after */
  m_sav = m_options;
  u_sav = u_options;

  init_learner(&dummy, path, 1);

  m_options = m_sav;
  u_options = u_sav;

  if( dummy.retype != 0 ) {
    errormsg(E_WARNING,
	     "merging regular expressions is not supported, ignoring %s\n",
	     path);
    ok = 0;
  } else {
    /* in case these changed while initing dummy */
    learner->model.options = m_options;
    learner->model.cp = m_cp;
    learner->model.dt = m_dt;

    /* _before_ we fill the hash, we merge the temp tokens (we only
       add those thokens not found in learner->hash) */
    ok =
      merge_temp_tokens(learner, &dummy) &&
      merge_hashes(learner, &dummy);

    if( ok ) {
      /* accumulate document-level shannon statistics */
      learner->doc.A += dummy.doc.A;
      learner->doc.S += dummy.doc.S;
      learner->doc.count += dummy.doc.count;
      learner->doc.nullcount += dummy.doc.nullcount;
      /* can't use these when merging */
      learner->alpha = 0.0;
      learner->beta = 0.0;
      learner->mu = 0.0;
      learner->s2 = 0.0;
    }
  }

  free_learner(&dummy);

  if( !ok ) {
    errormsg(E_WARNING,
	     "%s could not be merged, the learned category might be corrupt\n",
	     path);
  }

  return ok;
}
1907
1908 /* returns an approximate binomial r.v.; if np < 10 and n > 20,
1909 * a Poisson approximation is used, else the variable is exact.
1910 */
/* Draws an approximate Binomial(n, p) random variate.  When n > 20
   and n*p < 10 a Poisson(n*p) approximation is sampled by summing
   exponential inter-arrival times; otherwise the variate is exact
   (sum of n Bernoulli(p) draws).  Uses rand(). */
token_count_t binomial(token_count_t n, double p) {
  token_count_t k, draws;
  double acc, lambda;

  draws = 0;
  if( (n > 20) && (n * p < 10) ) {
    /* approximate poisson has lambda = np */
    lambda = n * p;
    acc = -log( ((double)rand())/(RAND_MAX + 1.0) );
    while( (acc < lambda) && (draws < n) ) {
      draws++;
      acc += -log( ((double)rand())/(RAND_MAX + 1.0) );
    }
  } else {
    /* direct calculation of binomial */
    for(k = 0; k < n; k++) {
      draws += (int)(p + ((double)rand())/(RAND_MAX + 1.0));
    }
  }
  return draws;
}
1935
/* Finishes the per-document empirical entropy bookkeeping: computes
   the document's shannon entropy from the token frequencies staged in
   doc.emp, accumulates the A (sum) and S (sum of squares) partials,
   optionally feeds the document into the reservoir sample, then
   clears the staged counts and increments doc.count.
   NOTE(review): the `fulldoc` parameter is never read in this body --
   the comment below suggests it was meant to force counting; confirm
   against callers. */
void update_shannon_partials(learner_t *learner, bool_t fulldoc) {
  hash_count_t i;
  weight_t ell, lell;
  bool_t docount;

  docount = (learner->doc.emp.top > 0);
  /* if force is true, we have a document, else we check emp.top to
     see if this is a nonempty document */
  if( m_options & (1<<M_OPTION_CALCENTROPY) ) {

    learner->doc.emp.shannon = 0;
    if( learner->doc.emp.top > 0 ) {
      /* assume all marks are zero to start with */
      for(i = 0; i < learner->doc.emp.top; i++) {
	l_item_t *p = find_in_learner(learner, learner->doc.emp.stack[i]);
	if( p && !MARKEDP(p) ) {
	  /* ell is this token's empirical frequency in the document */
	  ell = ((weight_t)p->tmp.read.eff)/learner->doc.emp.top;

	  /* it would be nice to be able to digitize p->B, but ell is
	     often smaller than the smallest value, and if there are
	     many documents, there could simultaneously be overflow on the most
	     frequent features. So we need p->B to be a floating point type. */
	  p->B += ell;

	  /* the standard entropy convention is that 0*inf = 0. here, if
	   * ell is so small that log(ell) is infinite, we pretend ell
	   * == 0 this is slightly better than testing for ell == 0
	   * directly.
	   */

	  lell = -log(ell);
	  if( !isinf(lell) ) {
	    learner->doc.emp.shannon += ell * lell;
	  }

	  /* mark so duplicate stack entries are only counted once */
	  SETMARK(p);
	}
      }

      if( u_options & (1<<U_OPTION_CONFIDENCE) ) {
	emplist_add_to_reservoir(learner->doc.reservoir,
				 learner->doc.count,
				 &learner->doc.emp);
      }

      /* fprintf(stderr, "udate_shannon A +(%f) S +(%f)\n",  */
      /* 	      -learner->doc.emp.shannon, (learner->doc.emp.shannon * learner->doc.emp.shannon)); */
      learner->doc.A += -learner->doc.emp.shannon;
      learner->doc.S += (learner->doc.emp.shannon * learner->doc.emp.shannon);

      /* clear the empirical counts and marks */
      for(i = 0; i < learner->doc.emp.top; i++) {
	l_item_t *p = find_in_learner(learner, learner->doc.emp.stack[i]);
	if( p && MARKEDP(p) ) {
	  p->tmp.read.eff = 0;
	  UNSETMARK(p);
	}
      }

      /* now reset the empirical features */
      learner->doc.emp.top = 0;

    }
  }
  if( docount ) {
    learner->doc.count++;
  }
}
2004
/* Finalizes the learner's entropy estimates once all input has been
 * read.
 *
 * If per-document statistics were collected (M_OPTION_CALCENTROPY and
 * at least two documents), the mean entropy (shannon) and its sample
 * variance (shannon2) are derived from the accumulators doc.A (sum of
 * per-document entropies) and doc.S (sum of their squares).
 * Otherwise a single global entropy is computed from the counts of
 * max_order tokens in the hash.
 *
 * With U_OPTION_CONFIDENCE, also estimates the score mean mu and
 * variance s2 from the reservoir-sampled documents, and derives the
 * Gamma-style parameters alpha and beta used for confidence
 * reporting.
 */
void calc_shannon(learner_t *learner) {
  l_item_t *i, *e;
  document_count_t c, n;
  hash_count_t q;
  emplist_t *empl;
  score_t score;
  score_t effective_count = 0.0;
  double Lambda, jensen, s2;


  learner->mu = 0.0;
  learner->s2 = 0.0;
  learner->alpha = 0.0;
  learner->beta = 0.0;

  effective_count = learner->doc.count; /* cast to real value for readability */

  if( (m_options & (1<<M_OPTION_CALCENTROPY)) &&
      (learner->doc.count > 1) ) {
    /* sample mean and unbiased sample variance of the per-document
       entropies accumulated in doc.A and doc.S */
    learner->shannon += -(learner->doc.A/effective_count);
    learner->shannon2 =
      (effective_count * learner->doc.S - (learner->doc.A * learner->doc.A)) /
      (effective_count * (effective_count - 1.0));

  } else { /* no individual documents, so compute global entropy */

    learner->shannon = 0.0;
    learner->shannon2 = 0.0;

    e = learner->hash + learner->max_tokens;
    for(i = learner->hash; i != e; i++) {
      if( FILLEDP(i) &&
          (i->typ.order == learner->max_order) ) {
        learner->shannon += log((weight_t)i->count) * (weight_t)i->count;
      }
    }
    /* H = log(N) - (1/N) sum_c c*log(c) */
    learner->shannon =
      -( learner->shannon/learner->full_token_count -
         log((weight_t)learner->full_token_count) );
  }

  if( (u_options & (1<<U_OPTION_CONFIDENCE)) &&
      (learner->doc.count > 0) ) {

    /* shannon was computed during update */
    jensen = 0.0;
    e = learner->hash + learner->max_tokens;
    for(i = learner->hash; i != e; i++) {
      if( NOTNULL(i->lam) ) {
        Lambda = UNPACK_LAMBDA(i->lam);
        if( i->typ.order == 1 ) {
          Lambda += UNPACK_RWEIGHTS(i->tmp.min.dref) - learner->logZ;
        }
        learner->mu += (Lambda * i->B);
        jensen += (Lambda * Lambda * i->B);
      }
    }

    learner->mu /= effective_count;
    learner->mu = -(learner->shannon + learner->mu);

    /* empirical variance of the document scores over the reservoir
       sample; n counts one plus the empty reservoir slots */
    s2 = 0.0;
    n = 1;
    for(c = 0; c < RESERVOIR_SIZE; c++) {
      empl = &(learner->doc.reservoir[c]);
      if( !empl->stack ) {
        n++;
        continue;
      } else {
        score = 0;
        for(q = 0; q < empl->top; q++) {
          i = find_in_learner(learner, empl->stack[q]);
          if( i && NOTNULL(i->lam) ) {
            Lambda = UNPACK_LAMBDA(i->lam);
            if( i->typ.order == 1 ) {
              Lambda += UNPACK_RWEIGHTS(i->tmp.min.dref) - learner->logZ;
            }
            score += Lambda;
          }
        }

        score = score/empl->top + empl->shannon;
        if( u_options & (1<<U_OPTION_VERBOSE) &&
            u_options & (1<<U_OPTION_CONFIDENCE) ) {
          fprintf(stdout,
                  "%s ( %5.2" FMT_printf_score_t " + "
                  "%-5.2" FMT_printf_score_t " )* %-" FMT_printf_integer_t " \n",
                  "reservoir",
                  -nats2bits(score),
                  nats2bits(empl->shannon),
                  empl->top);
        }

        score = -score - learner->mu;
        s2 += (score)*(score);
      }
    }

    /* the reservoir doesn't contain enough extreme samples, so
       the variance is necessarily underestimated.
    */
    /* BUGFIX: test the freshly computed local s2, not learner->s2 --
       the latter was zeroed at the top of this function, so
       isinf(learner->s2) could never be true here. */
    if( (c <= n) || isinf(s2) ) {
      learner->s2 = 0.0;
      learner->alpha = 0.0;
      learner->beta = 0.0;
    } else {
      learner->s2 = s2/(c-n);
      learner->alpha = (learner->mu*learner->mu)/learner->s2;
      learner->beta = learner->s2/learner->mu;
    }


  } else {
    learner->alpha = 0.0;
    learner->beta = 0.0;
    learner->mu = 0.0;
    learner->s2 = 0.0;
  }

  if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
    clear_empirical(&empirical);
  }

  if( u_options & (1<<U_OPTION_VERBOSE) ) {
    fprintf(stdout,
            "documents %ld shannon %" FMT_printf_score_t
            " mu %" FMT_printf_score_t
            " s2 %" FMT_printf_score_t
            " alpha %" FMT_printf_score_t
            " beta %" FMT_printf_score_t "\n",
            (long int)effective_count,
            nats2bits(learner->shannon),
            nats2bits(learner->mu),
            nats2bits(learner->s2),
            nats2bits(learner->alpha),
            nats2bits(learner->beta));
  }

}
2147
/* Prepares the per-mbox document counters for a fresh learning run.
   The mbox parameter is part of the call signature but carries no
   state that needs resetting here. */
void reset_mbox_messages(learner_t *learner, MBOX_State *mbox) {
  learner->doc.skip = 0;
  learner->doc.nullcount = 0; /* first header doesn't count */
  learner->doc.count = 0;
  not_header = 1;
}
2154
/* Tracks message boundaries while learning from an mbox stream.
   A NULL textbuf flushes the pending per-document statistics without
   starting a new document; otherwise, the first line of a header
   region marks the beginning of a new message and (unless the
   document is being decimated away) opens a new statistics window. */
void count_mbox_messages(learner_t *learner, Mstate mbox_state, char *textbuf) {

  if( !textbuf ) {
    update_shannon_partials(learner, 0);
    /* don't count document */
    return;
  }

  if( mbox_state != msHEADER ) {
    not_header = 1;
    return;
  }

  /* header region: act only on the transition into it */
  if( not_header ) {
    if( u_options & (1<<U_OPTION_DECIMATE) ) {
      /* randomly skip whole documents to thin the corpus */
      learner->doc.skip = ( rand() > (int)(RAND_MAX>>decimation) );
    }
    if( !learner->doc.skip ) {
      if( m_options & (1<<M_OPTION_CALCENTROPY) &&
          (learner->doc.emp.top == 0) ) {
        learner->doc.nullcount++;
      }
      update_shannon_partials(learner, 1);
    }
  }
  not_header = 0;
}
2184
2185
find_in_learner(learner_t * learner,hash_value_t id)2186 l_item_t *find_in_learner(learner_t *learner, hash_value_t id) {
2187 register l_item_t *i, *loop;
2188 /* start at id */
2189 i = loop = &learner->hash[id & (learner->max_tokens - 1)];
2190
2191 while( FILLEDP(i) ) {
2192 if( EQUALP(i->id,id) ) {
2193 return i; /* found id */
2194 } else {
2195 i++; /* not found */
2196 /* wrap around */
2197 i = (i >= &learner->hash[learner->max_tokens]) ? learner->hash : i;
2198 if( i == loop ) {
2199 return NULL; /* when hash table is full */
2200 }
2201 }
2202 }
2203
2204 /* empty slot, so not found */
2205
2206 return i;
2207 }
2208
2209
2210 /* places the token in the global hash and writes the
2211 token to a temporary file for later, then updates
2212 the digram frequencies. Tokens have the format
2213 DIAMOND t1 DIAMOND t2 ... tn DIAMOND CLASSEP class NUL */
/* Places the token in the global hash and writes the
   token to a temporary file for later, then updates
   the digram frequencies. Tokens have the format
   DIAMOND t1 DIAMOND t2 ... tn DIAMOND CLASSEP class NUL.

   Counting stops quietly once overflow_warning is set so the model
   stays internally consistent; a warning is printed when the
   overflow is first detected. */
void hash_word_and_learn(learner_t *learner,
                         char *tok, token_type_t tt, regex_count_t re) {
  hash_value_t id;
  l_item_t *i;
  char *s;
  alphabet_size_t p,q,len;

  /* skip leading DIAMOND markers; an empty residue is ignored */
  for(s = tok; s && *s == DIAMOND; s++);
  if( s && (*s != EOTOKEN) ) {

    if( overflow_warning ) {
      return; /* for there be troubles ahead */
    }

    if( m_options & (1<<M_OPTION_MULTINOMIAL) ) { tt.order = 1; }

    /* decimation: drop whole documents in mbox mode, otherwise drop
       individual tokens at random */
    if( u_options & (1<<U_OPTION_DECIMATE) ) {
      if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
        if( learner->doc.skip ) {
          return;
        }
      } else {
        if( rand() > (int)(RAND_MAX>>decimation) ) {
          return;
        }
      }
    }

    id = hash_full_token(tok);
    i = find_in_learner(learner, id);

    /* if the table is getting full, try to grow it and relocate */
    if( i && !FILLEDP(i) &&
        ((100 * learner->unique_token_count) >=
         (HASH_FULL * learner->max_tokens)) && grow_learner_hash(learner) ) {
      i = find_in_learner(learner,id);
      /* new i, go through all tests again */
    }

    if( i ) {

      if( FILLEDP(i) ) {

        /* redundant :-) */

      } else if( /* !FILLEDP(i) && */
                ((100 * learner->unique_token_count) <
                 (HASH_FULL * learner->max_tokens) ) ) {

        /* fill the hash and write to file */

        SET(i->id,id);

        INCREMENT(learner->unique_token_count,
                  K_TOKEN_COUNT_MAX, overflow_warning);

        i->typ = tt;

        /* order accounting */
        learner->max_order = MAXIMUM(learner->max_order,i->typ.order);

        INCREMENT(learner->fixed_order_unique_token_count[i->typ.order],
                  K_TOKEN_COUNT_MAX, overflow_warning);

        if( (u_options & (1<<U_OPTION_DEBUG)) ) {
          fprintf(stdout, "match %8lx ", (long unsigned int)i->id);
          print_token(stdout, tok);
          fprintf(stdout, " %d\n", i->typ.order);
        }

        tmp_grow(learner); /* just in case we're full */
        tmp_write_token(learner, tok);
      }

      INCREMENT(i->count, K_TOKEN_COUNT_MAX, overflow_warning);
      learner->tmax = MAXIMUM(learner->tmax, i->count);

      if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
        if( (learner->doc.emp.top < learner->doc.emp.max) ||
            emplist_grow(&learner->doc.emp) ) {
          i->tmp.read.eff++; /* this can never overflow before i->count */
          UNSETMARK(i); /* needed for calculating shannon */
          learner->doc.emp.stack[learner->doc.emp.top++] = i->id;
        }
      }

      INCREMENT(learner->full_token_count,
                K_TOKEN_COUNT_MAX, overflow_warning);

      INCREMENT(learner->fixed_order_token_count[i->typ.order],
                K_TOKEN_COUNT_MAX, skewed_constraints_warning);

    }

    if( digramic_overflow_warning ) {
      return;
    }


    /* now update digram frequency counts */
    if( tt.order == 1 ) { /* count each token only once */
      /* BUGFIX: cast through unsigned char before indexing, exactly
         as done for q below and in calc_learner_digramic_excursion();
         plain char may be signed, making high-bit bytes negative. */
      p = (unsigned char)*tok++;
      CLIP_ALPHABET(p);
      len = 1;
      /* finally update character frequencies */
      while( *tok != EOTOKEN ) {
        q = (unsigned char)*tok;
        CLIP_ALPHABET(q);
        if( learner->dig[p][q] < K_DIGRAM_COUNT_MAX )
          { learner->dig[p][q]++; } else { digramic_overflow_warning = 1; }
        if( learner->dig[RESERVED_MARGINAL][q] < K_DIGRAM_COUNT_MAX )
          { learner->dig[RESERVED_MARGINAL][q]++; } /* no warning on overflow */

        p = q;
        tok++;
        if( q != DIAMOND ) { len++; }
      }
      learner->dig[RESERVED_TOKLEN][len]++;

      if( digramic_overflow_warning ) {
        errormsg(E_WARNING,
                "ran out of integers (too much data), "
                "reference measure may be skewed.\n");
        m_options |= (1<<M_OPTION_WARNING_BAD);
      }
    }

    if( overflow_warning ) {
      errormsg(E_WARNING,
              "ran out of integers (too much data), "
              "results may be skewed.\n");
      m_options |= (1<<M_OPTION_WARNING_BAD);
    }
  }
}
2348
2349 /* initialize global learner object */
/* Initializes the global learner object: zeroes all statistics and
   digram counts, then either restores state from an online learner
   file (opath) or allocates a fresh hash table and a temporary token
   file.  When entropy calculations are requested, also allocates the
   empirical feature stack (disabling M_OPTION_CALCENTROPY on
   allocation failure rather than aborting). */
void init_learner(learner_t *learner, char *opath, bool_t readonly) {
  alphabet_size_t i, j;
  regex_count_t c;
  token_order_t z;

  /* some constants */
  learner->max_tokens = default_max_tokens;
  learner->max_hash_bits = default_max_hash_bits;
  learner->full_token_count = 0;
  learner->unique_token_count = 0;
  learner->tmax = 0;
  learner->logZ = 0.0;
  learner->shannon = 0.0;
  learner->shannon2 = 0.0;
  learner->alpha = 0.0;
  learner->beta = 0.0;
  learner->mu = 0.0;
  learner->s2 = 0.0;
  learner->max_order = 0;
  memset(learner->mediaprobs, 0 , TOKEN_CLASS_MAX * sizeof(score_t));

  learner->doc.A = 0.0;
  learner->doc.S = 0.0;
  learner->doc.count = 0;
  learner->doc.nullcount = 0;
  learner->doc.emp.top = 0;
  learner->doc.emp.max = 0;
  learner->doc.emp.stack = NULL;
  memset(learner->doc.reservoir, 0, RESERVOIR_SIZE * sizeof(emplist_t));

  learner->model.options = m_options;
  learner->model.cp = m_cp;
  learner->model.dt = m_dt;
  learner->model.tmin = zthreshold;
  learner->u_options = u_options;

  learner->mmap_start = NULL;
  learner->mmap_learner_offset = 0;
  learner->mmap_hash_offset = 0;
  learner->hash = NULL;

  /* init character frequencies */
  for(i = 0; i < ASIZE; i++) {
    for(j = 0; j < ASIZE; j++) {
      learner->dig[i][j] = 0.0;
    }
  }

  for(c = 0; c < MAX_RE + 1; c++) {
    learner->regex_token_count[c] = 0;
  }

  for(z = 0; z < MAX_SUBMATCH; z++) {
    learner->fixed_order_token_count[z] = 0;
    learner->fixed_order_unique_token_count[z] = 0;
  }

  if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
    reset_mbox_messages(learner, &mbox);
  }

  if( !(opath && *opath &&
        read_online_learner_struct(learner, opath, readonly)) ) {
    /* if we're here, we must do what read_online_... didn't do */

    /* allocate and zero room for hash */
    learner->hash = (l_item_t *)calloc(learner->max_tokens, sizeof(l_item_t));
    if( !learner->hash ) {
      errormsg(E_FATAL,
               "not enough memory? I couldn't allocate %li bytes\n",
               (sizeof(l_item_t) * ((long int)learner->max_tokens)));
    }

    /* open temporary file for writing tokens */
    learner->tmp.file =
      mytmpfile(learner->filename ? learner->filename : progname,
                &learner->tmp.filename);
    /* set this: in case of a fatal signal, we can unlink the temprile */
    cleanup.tempfile = learner->tmp.filename;

    learner->tmp.iobuf = NULL; /* can't reuse in_iobuf or out_iobuf */
    learner->tmp.offset = 0;
    learner->tmp.avail = 0;
    learner->tmp.used = 0;
    learner->tmp.mmap_start = NULL;
    learner->tmp.mmap_offset = 0;
    learner->tmp.mmap_length = 0;
    learner->tmp.mmap_cursor = 0;

    if( learner->tmp.file ) {
#if defined HAVE_POSIX_MEMALIGN
      /* buffer must exist until after fclose() */
      if( 0 != posix_memalign(&learner->tmp.iobuf, system_pagesize,
                              BUFFER_MAG * system_pagesize) ) {
        learner->tmp.iobuf = NULL;
      }
#elif defined HAVE_MEMALIGN
      /* buffer can't be free()'d */
      /* BUGFIX: these fallback branches assigned learner->tmpiobuf,
         a field that doesn't exist; every other use says tmp.iobuf */
      learner->tmp.iobuf = (void *)memalign(system_pagesize,
                                            BUFFER_MAG * system_pagesize);
#elif defined HAVE_VALLOC
      /* buffer can't be free()'d */
      learner->tmp.iobuf = (void *)valloc(BUFFER_MAG * system_pagesize);
#endif
      if( learner->tmp.iobuf ) {
        setvbuf(learner->tmp.file, (char *)learner->tmp.iobuf, (int)_IOFBF,
                (size_t)(BUFFER_MAG * system_pagesize));
      }
    }

  }

  if( u_options & (1<<U_OPTION_MMAP) ) {
    MLOCK(learner->hash, sizeof(l_item_t) * learner->max_tokens);
  }

  MADVISE(learner->hash, sizeof(l_item_t) * learner->max_tokens,
          MADV_SEQUENTIAL);

  if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
    learner->doc.emp.max = system_pagesize/sizeof(hash_count_t);
    learner->doc.emp.stack =
      (hash_value_t *)malloc(learner->doc.emp.max * sizeof(hash_value_t));
    if( !learner->doc.emp.stack ) {
      /* BUGFIX: report the size actually requested (hash_value_t
         elements), not sizeof(hash_count_t *) */
      errormsg(E_WARNING,
               "disabling emp.stack. Not enough memory? I couldn't allocate %li bytes\n",
               sizeof(hash_value_t) * ((long int)learner->doc.emp.max));
      m_options &= ~(1<<M_OPTION_CALCENTROPY);
    }
  }

}
2482
/* Releases every dynamic resource owned by the learner: the token
   hash, the empirical feature stack, the reservoir samples and the
   temporary token file, then removes any leftover temp files. */
void free_learner(learner_t *learner) {
  document_count_t r;

  free_learner_hash(learner);

  /* free(NULL) is a no-op, so no guard is needed */
  free(learner->doc.emp.stack);
  learner->doc.emp.stack = NULL;

  for( r = 0; r < RESERVOIR_SIZE; r++ ) {
    free(learner->doc.reservoir[r].stack);
    learner->doc.reservoir[r].stack = NULL;
  }

  if( learner->tmp.file ) {
    fclose(learner->tmp.file);
    learner->tmp.file = NULL;
  }

  cleanup_tempfiles();
}
2507
2508 /* calculates the most probable Dirichlet parameters
2509 given digram counts. Method described in MacKay and Peto (1995)
2510 Since we don't count the DIAMOND-DIAMOND digram, including the prior will
2511 overestimate the DIAMOND-DIAMOND transition probability. However, for this
2512 transition as well as the others, the Dirichlet will have practically
2513 no effect when there's enough data in the corpus.
2514
2515 HOWEVER:
2516 * A property of the MP posterior estimates is that if character j
2517 * never appears as a transition *target*, then the corresponding
2518 * u[j] entry is null. Sometimes, this makes sense, as j isn't part
2519 * of the "vocabulary", but other times it's not sensible, becasue sooner
2520 * or later some spam message will make use of that character in
2521 * some combination with another character. When that happens, we're
2522 * in trouble.
2523 *
2524 * Previous versions of dbacl.c had a simple hack to prevent this
2525 * possibility: we pretend we saw a DIAMOND-j-DIAMOND
2526 * double transition, for each j. Equivalently, we simply increment
2527 * the count by 1 for each such transition, i.e.
2528 * learner->dig[i][(alphabet_size_t)DIAMOND]++;
2529 * learner->dig[(alphabet_size_t)DIAMOND][i]++;
2530 *
2531 * This hack has now been removed again, to prevent confusion.
2532 * It's not really needed, since the uniform measure can be used
2533 * for spam filtering.
2534 */
/* Estimates the most probable Dirichlet parameters (dirichlet.u[],
   dirichlet.alpha) from the digram counts by fixed-point iteration,
   then replaces the raw counts in learner->dig with the resulting
   (digitized) log transition probabilities.  See the comment block
   above and MacKay & Peto (1995) for the method. */
void make_dirichlet_digrams(learner_t *learner) {
  alphabet_size_t i, j;
  token_count_t k;
  float G[ASIZE];   /* per-target harmonic sums over counts */
  float H[ASIZE];   /* per-target sums of 1/k^2 */
  float V[ASIZE];   /* number of sources with a transition into j */
  float F[ASIZE];   /* total count of transitions into j */
  double prev_alpha;
  double K, t;

  /* initialize */
  dirichlet.alpha = 1.0;
  for(i = AMIN; i < ASIZE; i++) {
    dirichlet.u[i] = 1.0/((double)(ASIZE - AMIN));
  }

  /* precompute useful constants */
  for(j = AMIN; j < ASIZE; j++) {
    G[j] = 0.0;
    H[j] = 0.0;
    V[j] = 0.0;
    F[j] = 0.0;
    for(i = AMIN; i < ASIZE; i++) {
      /* NOTE(review): the inner sum starts at k = AMIN rather than
         k = 1 -- presumably intentional for this alphabet encoding,
         but it differs from the textbook harmonic sum; confirm. */
      for(k = AMIN; k < (token_count_t)learner->dig[i][j]; k++) {
        G[j] += 1.0/k;
        H[j] += 1.0/((double)k*k);
      }
      if( learner->dig[i][j] > 0 ) {
        V[j]++;
        F[j] += learner->dig[i][j];
      }
    }
    if( 0 && u_options & (1<<U_OPTION_DEBUG) ) {
      fprintf(stdout,
              "symbol %d has G = %g H = %g V = %g F = %g\n",
              j, G[j], H[j], V[j], F[j]);
    }
  }

  /* now optimize the dirichlet parameters */
#define MAX_DIRITER 300
  k = 0;
  do {
    /* one fixed-point step: recompute K from the current alpha, then
       update each u[j] and re-sum alpha */
    prev_alpha = dirichlet.alpha;
    K = 0.0;
    for(i = AMIN; i < ASIZE; i++) {
      K += log((F[i] + prev_alpha)/prev_alpha) +
        (0.5) * F[i]/(prev_alpha * (F[i] + prev_alpha));
    }
    dirichlet.alpha = 0.0;
    for(j = AMIN; j < ASIZE; j++) {
      t = K - G[j];
      dirichlet.u[j] = 2 * V[j] /(t + sqrt(t * t + 4 * H[j] * V[j]));
      dirichlet.alpha += dirichlet.u[j];
    }
    if( u_options & (1<<U_OPTION_VERBOSE) ) {
      fprintf(stdout,
              "dirichlet alpha %g --> %g\n",
              prev_alpha, dirichlet.alpha);
    }
  } while((k++ < MAX_DIRITER) &&
          (fabs(prev_alpha - dirichlet.alpha) > qtol_alpha) );

  if( k >= MAX_DIRITER ) {
    errormsg(E_WARNING, "dirichlet measure did not converge.\n");
  }

  if(u_options & (1<<U_OPTION_VERBOSE) ) {
    for(j = AMIN; j < ASIZE; j++) {
      fprintf(stdout,
              "dirichlet u[%d] = %g\n",
              j, dirichlet.u[j]);
    }
  }

  /* now make the probabilities */

  for(i = AMIN; i < ASIZE; i++) {
    /* t is the row total; posterior mean is (count + u[j])/(t + alpha) */
    t = 0.0;
    for(j = AMIN; j < ASIZE; j++) {
      t += learner->dig[i][j];
    }
    for(j = AMIN; j < ASIZE; j++) {
      /* note: simulate the effect of digitizing the digrams */
      learner->dig[i][j] =
        UNPACK_DIGRAMS(PACK_DIGRAMS(log(((weight_t)learner->dig[i][j] +
                                         dirichlet.u[j]) /
                                        (t + dirichlet.alpha))));
      if( 1 && u_options & (1<<U_OPTION_DEBUG) ) {
        fprintf(stdout,
                "learner->dig[%d][%d] = %f\n",
                i, j, learner->dig[i][j]);
      }
    }
  }
  /* clear other data */
  for(i = 0; i < AMIN; i++) {
    for(j = 0; j < ASIZE; j++) {
      learner->dig[i][j] = 0.0;
    }
  }
}
2637
/* Replaces the digram table with a uniform reference measure: every
   character transition gets probability 1/(ASIZE - AMIN), passed
   through the pack/unpack cycle to simulate digitization.  Reserved
   rows below AMIN are zeroed. */
void make_uniform_digrams(learner_t *learner) {
  alphabet_size_t r, c;
  /* note: simulate the effect of digitizing the digrams */
  weight_t u = UNPACK_DIGRAMS(PACK_DIGRAMS(-log(ASIZE - AMIN)));

  for(r = AMIN; r < ASIZE; r++) {
    for(c = AMIN; c < ASIZE; c++) {
      learner->dig[r][c] = u;
    }
  }

  /* clear the reserved rows */
  for(r = 0; r < AMIN; r++) {
    for(c = 0; c < ASIZE; c++) {
      learner->dig[r][c] = 0.0;
    }
  }
}
2658
2659
/* Builds a reference measure that depends only on the token length
   distribution: ordinary character transitions are zeroed, and the
   RESERVED_TOKLEN row is converted from length counts to (digitized)
   log probabilities, each discounted by length * log(alphabet size). */
void make_toklen_digrams(learner_t *learner) {
  alphabet_size_t i, j;
  weight_t lam[MAX_TOKEN_LEN+2];
  weight_t total;
  /* zero out usual digrams, we only use token length */
  {
    for(i = AMIN; i < ASIZE; i++) {
      for(j = AMIN; j < ASIZE; j++) {
        learner->dig[i][j] = 0.0;
      }
    }
    for(j = AMIN; j < ASIZE; j++) {
      learner->dig[RESERVED_MARGINAL][j] = 0.0;
    }
  }
  /* note: toklen counts the number of transitions, not number of chars */
  total = 0.0;
  for(i = 2; i < MAX_TOKEN_LEN+2; i++) {
    if( learner->dig[RESERVED_TOKLEN][i] <= 0.0 ) {
      /* don't want infinities */
      learner->dig[RESERVED_TOKLEN][i]++;
    }
    total += learner->dig[RESERVED_TOKLEN][i];
  }

  /* BUGFIX: lam[1] was left uninitialized but is read by the output
     loop below (which starts at i = 0); reading an indeterminate
     value is undefined behavior.  Zero both slots that the counting
     loop (i >= 2) never fills. */
  lam[0] = 0.0; /* normalizing constant */
  lam[1] = 0.0; /* slot unused by the counting loop above */
  for(i = 2; i < MAX_TOKEN_LEN+2; i++) {
    lam[i] = log(learner->dig[RESERVED_TOKLEN][i]/total) - i * log(ASIZE - AMIN);
  }
  for(i = 0; i < MAX_TOKEN_LEN+2; i++) {
    if(0 && u_options & (1<<U_OPTION_DEBUG) ) {
      fprintf(stdout,
              "learner->dig[%d][%d] = %f %f %f\n",
              RESERVED_TOKLEN, i, learner->dig[RESERVED_TOKLEN][i],
              -i * log(ASIZE - AMIN), lam[i]);
    }
    learner->dig[RESERVED_TOKLEN][i] = UNPACK_DIGRAMS(PACK_DIGRAMS(lam[i]));
  }
}
2699
/* Converts raw digram counts into (digitized) log transition
   probabilities using a lightly smoothed maximum likelihood
   estimate.  The empirical frequencies are mixed with a uniform
   component over the unseen transitions so that a never-seen
   character transition cannot produce a singularity during
   classification; the uniform part carries only about 1/total of the
   mass, so it vanishes quickly as data accumulates. */
void make_mle_digrams(learner_t *learner) {
  alphabet_size_t src, dst;
  double seen, unseen;

  for(src = AMIN; src < ASIZE; src++) {
    /* tally this source row: total observed mass and zero cells */
    seen = 0.0;
    unseen = 0.0;
    for(dst = AMIN; dst < ASIZE; dst++) {
      if( learner->dig[src][dst] > 0.0 ) {
        seen += learner->dig[src][dst];
      } else {
        unseen++;
      }
    }

    if( seen <= 0.0 ) {
      /* empty row: spread the mass uniformly over the unseen cells */
      for(dst = AMIN; dst < ASIZE; dst++) {
        learner->dig[src][dst] =
          UNPACK_DIGRAMS(PACK_DIGRAMS(-log(unseen)));
      }
    } else if( unseen > 0.0 ) {
      /* mix MLE with a uniform component over the unseen cells */
      unseen = 1.0/unseen;
      for(dst = AMIN; dst < ASIZE; dst++) {
        learner->dig[src][dst] =
          UNPACK_DIGRAMS(PACK_DIGRAMS(log((learner->dig[src][dst]/seen) * (1.0 - unseen) + unseen)));
      }
    } else {
      /* every transition observed: plain MLE */
      for(dst = AMIN; dst < ASIZE; dst++) {
        learner->dig[src][dst] =
          UNPACK_DIGRAMS(PACK_DIGRAMS(log(learner->dig[src][dst] / seen)));
      }
    }
  }

  /* clear the reserved rows */
  for(src = 0; src < AMIN; src++) {
    for(dst = 0; dst < ASIZE; dst++) {
      learner->dig[src][dst] = 0.0;
    }
  }
}
2749
/* Builds a reference measure that treats a token's characters as
   i.i.d. draws: only the marginal destination-character frequencies
   (row RESERVED_MARGINAL) are used, smoothed like the MLE case, and
   the resulting distribution is copied into every source row. */
void make_iid_digrams(learner_t *learner) {
  alphabet_size_t r, c;
  double seen, unseen;

  /* tally the marginal row: total observed mass and zero cells */
  seen = 0.0;
  unseen = 0.0;
  for(c = AMIN; c < ASIZE; c++) {
    if( learner->dig[RESERVED_MARGINAL][c] > 0.0 ) {
      seen += learner->dig[RESERVED_MARGINAL][c];
    } else {
      unseen++;
    }
  }

  if( seen <= 0.0 ) {
    /* nothing observed: uniform over the unseen characters */
    for(c = AMIN; c < ASIZE; c++) {
      learner->dig[RESERVED_MARGINAL][c] =
        UNPACK_DIGRAMS(PACK_DIGRAMS(-log(unseen)));
    }
  } else if( unseen > 0.0 ) {
    /* mix MLE with a uniform component over the unseen characters */
    unseen = 1.0/unseen;
    for(c = AMIN; c < ASIZE; c++) {
      learner->dig[RESERVED_MARGINAL][c] =
        UNPACK_DIGRAMS(PACK_DIGRAMS(log((learner->dig[RESERVED_MARGINAL][c]/seen) * (1.0 - unseen) + unseen)));
    }
  } else {
    /* every character observed: plain MLE */
    for(c = AMIN; c < ASIZE; c++) {
      learner->dig[RESERVED_MARGINAL][c] =
        UNPACK_DIGRAMS(PACK_DIGRAMS(log(learner->dig[RESERVED_MARGINAL][c] / seen)));
    }
  }

  /* now make all transitions identical */
  for(c = 0; c < ASIZE; c++) {
    for(r = AMIN; r < ASIZE; r++) {
      learner->dig[r][c] = learner->dig[RESERVED_MARGINAL][c];
    }
  }
  /* clear the reserved rows */
  for(r = 0; r < AMIN; r++) {
    for(c = 0; c < ASIZE; c++) {
      learner->dig[r][c] = 0.0;
    }
  }
}
2802
2803 /* use 0.0 for small datasets, 1.0 for huge datasets.
2804 * 0.6 works well generally. I don't understand this :-(
2805 */
/* Partially symmetrizes the digram matrix: each mirror pair is
   blended with its average as new = 0.6*old + 0.4*mean.  A factor of
   0.0 would symmetrize fully, 1.0 would leave the counts untouched;
   0.6 reportedly works well in practice (see comment above). */
void transpose_digrams(learner_t *learner) {
  register alphabet_size_t r, c;
  register weight_t avg;

  /* we skip transitions involving DIAMOND, TOKENSEP */
  /* code below uses fact that DIAMOND == 0x01 */
  /* code below uses fact that TOKENSEP == 0x02 */
  for(r = AMIN; r < ASIZE; r++) {
    for(c = AMIN; c < r; c++) {
      avg = (learner->dig[r][c] + learner->dig[c][r])/2.0;
      learner->dig[c][r] = 0.6 * learner->dig[c][r] + 0.4 * avg;
      learner->dig[r][c] = 0.6 * learner->dig[r][c] + 0.4 * avg;
    }
  }
}
2821
/* Recomputes the digramic log partition function (*plogzon) and the
   divergence (*pdiv) for source character i, given the current
   lambda weights lam[] and the row total Xi.  The max-shift maxlogz
   keeps the exponential sums numerically stable (log-sum-exp trick).
   Aborts fatally if the partition function evaluates to NaN. */
void recompute_ed(learner_t *learner, weight_t *plogzon, weight_t *pdiv,
                  int i, weight_t lam[ASIZE], weight_t Xi) {
  register alphabet_size_t j;
  weight_t logA = log((weight_t)(ASIZE - AMIN));
  weight_t logzon, div;
  weight_t maxlogz, tmp;

  /* find the largest exponent among the active (lam[j] > 0) terms,
     never below 0.0 */
  maxlogz = 0.0;
  for(j = AMIN; j < ASIZE; j++) {
    if( lam[j] > 0.0 ) {
      tmp = lam[j] + log(1.0 - exp(-lam[j])) - logA;
      if( maxlogz < tmp ) {
        maxlogz = tmp;
      }
    }
  }


  /* sum the shifted exponentials; start from exp(0 - maxlogz), the
     contribution of the baseline term */
  logzon = exp(0.0 - maxlogz);
  div = 0.0;
  for(j = AMIN; j < ASIZE; j++) {
    if( lam[j] > 0.0 ) {
      tmp = -logA - maxlogz;
      logzon += (exp( lam[j] + tmp) - exp(tmp));
      div += lam[j] * learner->dig[i][j];
    }
  }


  /* undo the shift: log Z = maxlogz + log(sum) */
  tmp = maxlogz + log(logzon);

  if( isnan(tmp) ) {
    errormsg(E_FATAL,"sorry, digramic partition function went kaboom.\n");
  }

  logzon = tmp;
  div = div/Xi - logzon;

  *plogzon = logzon;
  *pdiv = div;
}
2863
/* Builds a maximum-entropy style reference measure row by row: for
   each source character i, iteratively fits lambda weights so the
   model matches the observed transition frequencies (analogous to
   minimize_learner_divergence()), then converts the weights into
   (digitized) log transition probabilities.  Iteration stops when
   the divergence, the weight updates and the partition function all
   change by less than 0.001, or after 500 rounds. */
void make_entropic_digrams(learner_t *learner) {
  weight_t lam[ASIZE];
  weight_t lam_delta, old_lam, logt, maxlogt;
  weight_t Xi, logXi, logzon, div, old_logzon, old_div;
  weight_t logA = log((weight_t)(ASIZE - AMIN));
  register alphabet_size_t i, j;
  int itcount;

  for(i = AMIN; i < ASIZE; i++) {

    /* initializations: Xi is the row total, all weights start at 0 */
    Xi = 0.0;
    for(j = AMIN; j < ASIZE; j++) {
      Xi += learner->dig[i][j];
      lam[j] = 0.0;
    }
    logXi = log(Xi);

    recompute_ed(learner, &logzon, &div, i, lam, Xi);

    /* now iterate - this is like minimize_learner_divergence() */
    itcount = 0;
    do {
      itcount++;

      old_logzon = logzon;
      old_div = div;

      lam_delta = 0.0;
      for(j = AMIN; j < ASIZE; j++) {
        if( learner->dig[i][j] > 0.0 ) {

          old_lam = lam[j];
          lam[j] = log(learner->dig[i][j]) - logXi + logA + logzon;

          if( isnan(lam[j]) ) {

            /* numerical trouble: revert this weight and refresh the
               partition function before continuing */
            lam[j] = old_lam;
            recompute_ed(learner, &logzon, &div, i, lam, Xi);

          } else {

            /* clamp the step size to MAX_LAMBDA_JUMP per iteration */
            if( lam[j] > (old_lam + MAX_LAMBDA_JUMP) ) {
              lam[j] = (old_lam + MAX_LAMBDA_JUMP);
            } else if( lam[j] < (old_lam - MAX_LAMBDA_JUMP) ) {
              lam[j] = (old_lam - MAX_LAMBDA_JUMP);
            }

            /* don't want negative weights */
            if( lam[j] < 0.0 ) { lam[j] = 0.0; }

            /* track the largest single-weight change this round */
            if( lam_delta < fabs(lam[j] - old_lam) ) {
              lam_delta = fabs(lam[j] - old_lam);
            }

          }
        }
      }


      recompute_ed(learner, &logzon, &div, i, lam, Xi);

      if( u_options & (1<<U_OPTION_VERBOSE) ) {
        fprintf(stdout, "digramic (%d) entropy change %" FMT_printf_score_t \
                " --> %" FMT_printf_score_t " (%10f, %10f)\n", i,
                old_div, div, lam_delta, fabs(logzon - old_logzon));
      }


    } while( ((fabs(div - old_div) > 0.001) || (lam_delta > 0.001) ||
              (fabs(logzon - old_logzon) > 0.001)) && (itcount < 500) );


    /* now make the probabilities */

    /* transitions should already sum to 1, but better safe than sorry */
    maxlogt = log(0.0);
    for(j = AMIN; j < ASIZE; j++) {
      if( maxlogt < lam[j] ) {
        maxlogt = lam[j];
      }
    }
    maxlogt += -logA -logzon;
    logt = 0.0;
    for(j = AMIN; j < ASIZE; j++) {
      logt += exp(lam[j] - logA - logzon - maxlogt);
    }
    /* logt is the (max-shifted) log of the row's total probability;
       subtracting it below renormalizes the row exactly */
    logt = maxlogt + log(logt);

    for(j = AMIN; j < ASIZE; j++) {
      /* note: simulate the effect of digitizing the digrams */
      learner->dig[i][j] =
        UNPACK_DIGRAMS(PACK_DIGRAMS(lam[j] - logA - logzon - logt));

      if(0 && u_options & (1<<U_OPTION_DEBUG) ) {
        fprintf(stdout,
                "learner->dig[%d][%d] = %f\n",
                i, j, learner->dig[i][j]);
      }
    }
  }

  /* clear other data */
  for(i = 0; i < AMIN; i++) {
    for(j = 0; j < ASIZE; j++) {
      learner->dig[i][j] = 0.0;
    }
  }
}
2973
2974
/* Estimates an interpolation exponent lambda between an empirical
   distribution (count[]/N) and a model distribution p[] by 100
   rounds of fixed-step (0.1) gradient ascent.
   NOTE(review): lambda is a local variable and is neither returned
   nor stored anywhere, so this function currently has no observable
   effect on its caller -- presumably experimental/diagnostic code;
   confirm before relying on it. */
void interpolate_digrams(weight_t N, weight_t count[], weight_t p[]) {
  weight_t lambda = 0.5;
  weight_t Clam, Dlam, Zlam;
  alphabet_size_t i;
  int q = 0;

  for(q = 0; q < 100; q++) {
    Zlam = Dlam = Clam = 0.0;
    for(i = AMIN; i < ASIZE; i++) {
      /* Clam: cross term; Zlam, Dlam: normalizer and its derivative */
      Clam += log(p[i]) * (count[i]/N);
      Zlam += pow(p[i], lambda);
      Dlam += log(p[i]) * pow(p[i], lambda);
    }
    lambda += 0.1 * (Clam - Dlam/Zlam);
    /* if( lambda > 1.0 ) { lambda = 1.0; } */
    /* else if( lambda < 0.0 ) { lambda = 0.0; } */
    /* printf("Clam = %g Dlam = %g, Zlam = %g, lamdba = %g\n", Clam, Dlam, Zlam, lambda); */
  }

}
2995
2996
calc_learner_digramic_excursion(learner_t * learner,char * tok)2997 weight_t calc_learner_digramic_excursion(learner_t *learner, char *tok) {
2998 register alphabet_size_t p, q;
2999 register weight_t t = 0.0;
3000 register alphabet_size_t len;
3001 /* now update digram frequency counts */
3002 p = (unsigned char)*tok++;
3003 CLIP_ALPHABET(p);
3004 len = 1;
3005 /* finally update character frequencies */
3006 while( *tok != EOTOKEN ) {
3007 q = (unsigned char)*tok;
3008 CLIP_ALPHABET(q);
3009 t += learner->dig[p][q];
3010 p = q;
3011 tok++;
3012 if( q != DIAMOND ) { len++; }
3013 }
3014 t += learner->dig[RESERVED_TOKLEN][len] - learner->dig[RESERVED_TOKLEN][0];
3015 return t;
3016 }
3017
3018 /* calculates the rth-order divergence but needs normalizing constant
3019 note: this isn't the full divergence from digref, just the bits
3020 needed for the r-th optimization - fwd indicates the traversal
3021 direction in the hash (alternate between them to reduce numerical
3022 errors */
/* Calculates the r-th order contribution to the divergence from the
   reference measure (needs the normalizing constant logzonr computed
   elsewhere).  fwd selects the traversal direction through the hash;
   callers alternate directions to reduce accumulated floating-point
   error.  The loop stops early once all tokens of order r have been
   visited. */
score_t learner_divergence(learner_t *learner,
                           score_t logzonr, score_t Xi,
                           token_order_t r, bool_t fwd) {
  register l_item_t *i, *e;
  register token_count_t c = 0;
  score_t t = 0.0;

  /* e is one past the last slot (forward) or one before the first
     slot (backward) */
  e = fwd ? (learner->hash + learner->max_tokens) : learner->hash - 1;
  for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
      i != e; fwd ? i++ : i--) {
    if( FILLEDP(i) && (i->typ.order == r) ) {

      t += UNPACK_LAMBDA(i->lam) * (score_t)i->count;

      /* early exit: all order-r tokens have been seen */
      if( ++c >= learner->fixed_order_unique_token_count[r] ) {
        c = 0;
        break;
      }
    }
  }

  return -logzonr + t/Xi;
}
3046
3047 /* calculates the normalizing constant - fwd indicates the traversal
3048 direction in the hash (alternate between them to reduce numerical
3049 errors */
/* Calculates (1/r times) the log normalizing constant for order r,
   given the log of the unchanging part of the sum.  Uses the
   log-sum-exp trick: a first pass finds the maximal exponent
   maxlogz, a second pass accumulates the shifted exponentials.
   fwd selects the traversal direction through the hash; callers
   alternate directions to reduce accumulated floating-point error.
   Aborts fatally if the result is NaN. */
score_t learner_logZ(learner_t *learner, token_order_t r,
                     score_t log_unchanging_part, bool_t fwd) {
  register l_item_t *i, *e;
  register token_count_t c = 0;

  score_t maxlogz, tmp;
  score_t t =0.0;
  score_t R = (score_t)r;

  /* e is one past the last slot (forward) or one before the first
     slot (backward) */
  e = fwd ? (learner->hash + learner->max_tokens) : learner->hash - 1;

  /* first pass: find the largest exponent among the order-r terms */
  maxlogz = log_unchanging_part;
  for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
      i != e; fwd ? i++ : i--) {
    if( FILLEDP(i) && (i->typ.order == r) ) {

      tmp = R * UNPACK_LAMBDA(i->lam) + R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
        UNPACK_RWEIGHTS(i->tmp.min.dref);

      if( maxlogz < tmp ) {
        maxlogz = tmp;
      }

      /* early exit: all order-r tokens have been seen */
      if( ++c >= learner->fixed_order_unique_token_count[r] ) {
        c = 0;
        break;
      }
    }
  }

  /* second pass: accumulate the max-shifted exponentials */
  t = exp(log_unchanging_part - maxlogz);
  for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
      i != e; fwd ? i++ : i--) {
    if( FILLEDP(i) && (i->typ.order == r) ) {

      tmp = R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
        UNPACK_RWEIGHTS(i->tmp.min.dref) - maxlogz;
      t += (exp(R * UNPACK_LAMBDA(i->lam) + tmp) - exp(tmp));

      if( ++c >= learner->fixed_order_unique_token_count[r] ) {
        c = 0;
        break;
      }
    }
  }

  tmp = (maxlogz + log(t))/R;
  /* printf("t =%f maxlogz = %f logZ/R = %f\n", t, maxlogz, tmp); */

  /* investigate this someday */
  if( isnan(tmp) ) {
    errormsg(E_FATAL,"sorry, partition function went kaboom.\n");
  }

  return tmp;
}
3107
3108 /* reconstructs the order of a token (can be zero for empty token) */
get_token_order(char * tok)3109 token_order_t get_token_order(char *tok) {
3110 token_order_t o = 0;
3111 if( tok && *tok ) {
3112 while( *(++tok) ) {
3113 if( *tok == DIAMOND ) {
3114 o++;
3115 }
3116 }
3117 }
3118 return o;
3119 }
3120
3121
3122 /* fills the hash with partial calculations */
void fill_ref_vars(learner_t *learner, l_item_t *k, char *tok) {
  hash_value_t id;
  char *t, *e;
  l_item_t *l;

  /* weight of the r-th order excursion */
  k->tmp.min.dref = PACK_RWEIGHTS(calc_learner_digramic_excursion(learner,tok));

  /* for each suffix of tok, add its weight */
  k->tmp.min.ltrms = PACK_LWEIGHTS(0.0);

  if( tok && *tok ) {
    /* NOTE(review): assumes tok contains an EOTOKEN terminator; if
       strchr returns NULL the loop comparison below involves a NULL
       pointer - confirm callers always pass terminated tokens */
    e = strchr(tok + 1, EOTOKEN);
    for( t = tok + 1; t + 1 < e; t++ ) {
      if( *t == DIAMOND ) {
        /* look up the proper suffix starting at this separator and
           accumulate its lambda weight if it is a known feature */
        id = hash_partial_token(t, e - t, e);
        l = find_in_learner(learner, id);
        if( l ) {
          k->tmp.min.ltrms += PACK_LWEIGHTS(UNPACK_LAMBDA(l->lam));
        }
      }
    }

    /* extra smoothing doesn't seem that good */
#undef EXTRA_SMOOTHING
#if defined EXTRA_SMOOTHING
    /* rather than adding only suffixes, this adds all
     * the prefixes also, then divides by two.
     * NOTE(review): this disabled branch is stale - it calls
     * find_in_learner() with one argument and uses k->ltrms instead of
     * k->tmp.min.ltrms, so it would not compile if re-enabled */
    if( e ) {
      /* for each prefix of tok, add its weight */
      for(t = e - 2; t > tok; t--) {
        if( *t == DIAMOND ) {
          id = hash_partial_token(tok, t - tok + 1, e);
          l = find_in_learner(id);
          if( l ) {
            k->ltrms += PACK_LWEIGHTS(UNPACK_LAMBDA(l->lam));
          }
        }
      }
      /* now we have twice as many weights as we need */
      k->ltrms /= 2;
    }
#endif
  }

}
3169
3170 /* fills hash with partial calculations and returns the
3171 * log unchanging part of the normalizing constant, for all
3172 * orders < r (not r). On return, kappa contains the reference probability
3173 * mass of the set of all features <= r (including r).
3174 */
score_t recalculate_reference_measure(learner_t *learner, token_order_t r, score_t *kappa) {
  hash_value_t id;
  byte_t buf[BUFSIZ+1];
  char tok[(MAX_TOKEN_LEN+1)*MAX_SUBMATCH+EXTRA_TOKEN_LEN];
  size_t n = 0;

  const byte_t *p;
  char *q;

  l_item_t *k, *i;

  score_t tmp, lunch;
  score_t R = (score_t)r;
  score_t max = 0.0;
  score_t mykappa = 0.0;

  /* for r == 1, the partial_z is not modified, but
     we still have to go through the tokens and prebuild k-dref */

  /* now we calculate the logarithmic word weight
     from the digram model, for each token in the hash */
  if( !tmp_seek_start(learner) ) {
    errormsg(E_ERROR, "cannot seek in temporary token file, reference weights not calculated.\n");
  } else {
    q = tok;
    /* stream the saved tokens back in, reassembling them one at a time
       into tok (tokens are separated by TOKENSEP in the temp file) */
    while( (n = tmp_read_block(learner, buf, BUFSIZ, &p)) > 0 ) {
      /* p = buf; */
      /* p[n] = '\0'; */
      while( n-- > 0 ) {
        if( *p != TOKENSEP) {
          *q++ = *p; /* copy into tok */
        } else { /* interword space */
          *q = 0; /* append NUL to tok */
          /* now write weight in hash */
          id = hash_full_token(tok);
          k = find_in_learner(learner, id);
          if( k && (get_token_order(tok) == k->typ.order) ) {
            if( k->typ.order <= r) {
              if( k->typ.order == r ) {
                /* current order: (re)compute dref/ltrms for this token */
                fill_ref_vars(learner, k, tok);
              } else if(k->typ.order < r) {
                /* assume ref_vars were already filled */
                /* lower orders: track the largest exponent for the
                   log-sum-exp accumulation done below */
                if( NOTNULL(k->lam) ) {
                  tmp = R * UNPACK_LAMBDA(k->lam) +
                    R * UNPACK_LWEIGHTS(k->tmp.min.ltrms) +
                    UNPACK_RWEIGHTS(k->tmp.min.dref);
                  if( max < tmp ) {
                    max = tmp;
                  }
                }
              }
              /* accumulate the reference probability mass of all
                 features of order <= r */
              mykappa += exp(UNPACK_RWEIGHTS(k->tmp.min.dref));
            }
          }
          q = tok; /* reset q */
        }
        p++;
      }
    }
  }

  /* sum exp-terms for all orders < r, shifted by max to avoid overflow */
  lunch = 1.0;
  k = learner->hash + learner->max_tokens;
  for(i = learner->hash; i != k; i++) {
    if( FILLEDP(i) ) {
      if( i->typ.order < r ) {
        if( NOTNULL(i->lam) ) {
          tmp = -max + R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
            UNPACK_RWEIGHTS(i->tmp.min.dref);
          lunch += (exp(R * UNPACK_LAMBDA(i->lam) + tmp) - exp(tmp));
        }
      }
    }
  }
  /* undo the shift: lunch is the log unchanging part of Z */
  lunch = max + log(lunch);

  /* kappa is the reference mass of all features <= rth order. */
  *kappa = mykappa;

  return lunch;
}
3256
3257
3258 /* just for debugging */
/* just for debugging: prints the (approximate) log probability of the
   training data under the current weights, for all orders <= r.
   Writes two summary lines to stdout; does not modify the learner. */
void print_score(learner_t *learner, token_order_t r) {
  hash_count_t i;
  double logprob = 0.0;
  double lpapprox = 0.0;

  for(i = 0; i < learner->max_tokens; i++) {
    if( FILLEDP(&learner->hash[i]) &&
        (learner->hash[i].typ.order <= r) ) {
      /* NOTE(review): lam is used without UNPACK_LAMBDA here and below;
         presumably the packed and unpacked representations coincide in
         this build - confirm against PACK_LAMBDA's definition */
      logprob += learner->hash[i].count * (learner->hash[i].lam);
      if( learner->hash[i].typ.order == 1 ) {
        logprob += learner->hash[i].count *
          UNPACK_RWEIGHTS(learner->hash[i].tmp.min.dref)/((weight_t)r);
      }
    }
    if( FILLEDP(&learner->hash[i]) &&
        (learner->hash[i].typ.order == r) ) {
      lpapprox += learner->hash[i].count *
        ((learner->hash[i].lam) +
         UNPACK_LWEIGHTS(learner->hash[i].tmp.min.ltrms) +
         UNPACK_RWEIGHTS(learner->hash[i].tmp.min.dref)/((weight_t)r));
    }

  }

  fprintf(stdout,
          "*** logprob = %" FMT_printf_score_t " * %d (r = %" FMT_printf_integer_t ", logZ = %" FMT_printf_score_t ")\n",
          ((score_t)(logprob / learner->fixed_order_token_count[r])),
          learner->fixed_order_token_count[r],
          r, learner->logZ);
  /* BUGFIX: the second line previously used %ld and %d for
     fixed_order_token_count[r] and r; use the same FMT_* macros as the
     first line so the specifiers always match the typedef'd types
     (mismatched printf specifiers are undefined behavior) */
  fprintf(stdout,
          "*** lpapprox = %" FMT_printf_score_t " * %" FMT_printf_integer_t " (r = %" FMT_printf_integer_t ", logZ = %" FMT_printf_score_t ")\n",
          ((score_t)(lpapprox / learner->fixed_order_token_count[r])),
          learner->fixed_order_token_count[r],
          r, learner->logZ);
}
3294
3295
3296 /* experimental: this sounded like a good idea, but isn't ?? */
void theta_rescale(learner_t *learner, token_order_t r, score_t logupz, score_t Xi, bool_t fwd) {
  register l_item_t *i, *e;
  register token_count_t c = 0;

  score_t tmp;
  score_t theta =0.0;
  score_t R = (score_t)r;
  score_t mu;
  score_t sum1, sum2;
  score_t msum1, msum2;
  score_t s = 0;

  e = fwd ? (learner->hash + learner->max_tokens) : learner->hash - 1;

  /* msum1/msum2 are running maxima for two log-sum-exp accumulations;
     log(0.0) yields -HUGE_VAL, i.e. "minus infinity" as a neutral start */
  msum1 = msum2 = log(0.0);
  for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
      i != e; fwd ? i++ : i--) {
    if( FILLEDP(i) && (i->typ.order == r) ) {
      if( NOTNULL(i->lam) ) {
        /* s accumulates the total count of active order-r tokens */
        s += i->count;
        tmp = R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
          UNPACK_RWEIGHTS(i->tmp.min.dref);
        if( tmp > msum1 ) {
          msum1 = tmp;
        }
        tmp += R * UNPACK_LAMBDA(i->lam);
        if( tmp > msum2 ) {
          msum2 = tmp;
        }
      }
      if( ++c >= learner->fixed_order_unique_token_count[r] ) {
        c = 0;
        break;
      }
    }
  }

  /* mu is the empirical fraction of order-r tokens with active weights;
     s is recycled as the (log-scale) unchanging-part term */
  mu = ((score_t)s)/Xi;
  s = log(mu) - log(1 - mu) + logupz;
  if( s > msum1 ) {
    msum1 = s;
  }

  /* second pass: compute both shifted exponential sums */
  sum1 = exp(s - msum1);
  sum2 = 0.0;
  for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
      i != e; fwd ? i++ : i--) {
    if( FILLEDP(i) && (i->typ.order == r) ) {
      if( NOTNULL(i->lam) ) {
        tmp = R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
          UNPACK_RWEIGHTS(i->tmp.min.dref);
        sum1 += exp(tmp - msum1);
        tmp += R * UNPACK_LAMBDA(i->lam);
        sum2 += exp(tmp - msum2);
      }
      if( ++c >= learner->fixed_order_unique_token_count[r] ) {
        c = 0;
        break;
      }
    }
  }

  /* the common shift theta is the difference of the two log-sums / r */
  theta = ((msum1 + log(sum1)) - (msum2 + log(sum2)))/R;
  if( isnan(theta) ) {
    return; /* ignore */
  }
  if( u_options & (1<<U_OPTION_VERBOSE) ) {
    fprintf(stdout, "theta = %f\n", theta);
  }

  /* apply the shift to every active order-r lambda */
  for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
      i != e; fwd ? i++ : i--) {
    if( FILLEDP(i) && (i->typ.order == r) ) {
      if( NOTNULL(i->lam) ) {
        i->lam = PACK_LAMBDA(UNPACK_LAMBDA(i->lam) + theta);
      }
      if( ++c >= learner->fixed_order_unique_token_count[r] ) {
        c = 0;
        break;
      }
    }
  }

}
3381
3382 /* the mediaprobs are the token probabilities for each token class.
3383 (summing the mediaprobs) == 1. We put a uniform prior on the media. */
/* the mediaprobs are the token probabilities for each token class.
   Each class starts from a uniform prior, then every filled slot with a
   nonzero lambda adds its exponential contribution to its own class;
   finally everything is normalized by exp(-logZ). */
void compute_mediaprobs(learner_t *learner) {
  l_item_t *slot, *end;
  token_class_t cls;
  weight_t order, w;

  order = (weight_t)learner->max_order;

  /* uniform prior over the token classes (media) */
  for(cls = 0; cls < TOKEN_CLASS_MAX; cls++) {
    learner->mediaprobs[cls] = 1.0/TOKEN_CLASS_MAX;
  }

  /* accumulate each active feature's mass into its class bucket */
  end = learner->hash + learner->max_tokens;
  for(slot = learner->hash; slot != end; slot++) {
    if( FILLEDP(slot) && NOTNULL(slot->lam)) {
      w = order * UNPACK_LWEIGHTS(slot->tmp.min.ltrms) +
        UNPACK_RWEIGHTS(slot->tmp.min.dref);
      learner->mediaprobs[slot->typ.cls] +=
        exp(order * UNPACK_LAMBDA(slot->lam) + w) - exp(w);
    }
  }

  /* normalize by the partition function; the total should be 1.0 */
  {
    const weight_t zinv = exp(-learner->logZ);
    w = 0;
    for(cls = 0; cls < TOKEN_CLASS_MAX; cls++) {
      learner->mediaprobs[cls] *= zinv;
      w += learner->mediaprobs[cls];
    }
  }
  /* should have w == 1.0 */
}
3411
3412 /* minimizes the divergence by solving for lambda one
3413 component at a time. */
void minimize_learner_divergence(learner_t *learner) {
  l_item_t *i, *e;
  token_order_t r;
  token_count_t lzero, c = 0;
  token_count_t zcut = 0;
  int itcount, mcount;
  score_t d, dd, b;
  score_t lam_delta, old_lam, new_lam;
  score_t logzonr, old_logzonr;
  score_t logupz, div_extra_bits, kappa;
  score_t R, Xi, logXi;
  score_t mp_logz;
  bool_t fwd = 1;

  if( u_options & (1<<U_OPTION_VERBOSE) ) {
    fprintf(stdout, "now maximizing model entropy\n");
  }

  learner->logZ = 0.0;
  learner->divergence = 0.0;
  mp_logz = 0.0;

  /* disable multipass if we only have one order to play with */
  if( (m_options & (1<<M_OPTION_MULTINOMIAL)) ||
      (learner->max_order == 1) ) {
    qtol_multipass = 0;
  }

  /* zcut is the count threshold below which a token's weight is forced
     to zero; tmin may be negative (relative to tmax), and zcut itself
     is unsigned, hence the careful clamping */
  if( learner->model.tmin != 0 ) {
    /* ftreshold is unsigned */
    if( learner->model.tmin > 0 ) {
      zcut = learner->model.tmin;
    } else if (learner->model.tmin + learner->tmax > 0) {
      zcut = learner->model.tmin + learner->tmax;
    }
  }

  e = learner->hash + learner->max_tokens;
  /* outer multipass loop (at most 50 sweeps over all orders) */
  for(mcount = 0; mcount < (qtol_multipass ? 50 : 1); mcount++) {
    /* optimize one order r at a time, lowest first */
    for(r = 1;
        r <= ((m_options & (1<<M_OPTION_MULTINOMIAL)) ? 1 : learner->max_order);
        r++) {

      /* here we precalculate various bits and pieces
         which aren't going to change during this iteration */
      logupz = recalculate_reference_measure(learner, r, &kappa);

      R = (score_t)r;
      Xi = (score_t)learner->fixed_order_token_count[r];
      logXi = log(Xi);

      if( r == 1 ) {
        /* Xi /= kappa; */
        div_extra_bits = 0.0;
      } else {
        /* calculate extra bits for divergence score display */
        b = 0.0;

        for(i = learner->hash; i != e; i++) {
          if( FILLEDP(i) && (i->typ.order < r) ) {
            b += UNPACK_LAMBDA(i->lam) * (score_t)i->count;
          }
        }
        div_extra_bits = b/Xi;
      }

      /* with -refmodel, keep the reference weights only */
      if( m_options & (1<<M_OPTION_REFMODEL) ) {
        break;
      }

      if( u_options & (1<<U_OPTION_VERBOSE) ) {
        fprintf(stdout,
                "* optimizing order %" FMT_printf_integer_t " weights (%" FMT_printf_integer_t " tokens, %" FMT_printf_integer_t " unique, kappa = %f)\n",
                r, learner->fixed_order_token_count[r],
                learner->fixed_order_unique_token_count[r], kappa);
      }

      /* initial normalizer and divergence for this order; fwd flips on
         every evaluation to alternate hash traversal direction */
      logzonr = learner_logZ(learner, r, logupz, fwd);
      dd = learner_divergence(learner, logzonr, Xi, r, fwd);
      fwd = 1 - fwd;
      /* printf("logzonr = %f dd = %f logupz = %f Xi = %f\n", logzonr, dd, logupz, Xi); */
      itcount = 0;
      /* inner fixed-point iteration: update every order-r lambda via an
         iterative-scaling step, then re-evaluate logZ and the divergence,
         until all three convergence tolerances are met (max 50 rounds) */
      do {
        itcount++;
        lzero= 0;

        d = dd; /* save old divergence */
        old_logzonr = logzonr;

        lam_delta = 0.0;

        c = 0;
        for(i = learner->hash; i != e; i++) {
          if( FILLEDP(i) && (i->typ.order == r)
              /* && (rand() > RAND_MAX/2) */
              ) {

            old_lam = UNPACK_LAMBDA(i->lam);

            if( /* (i->typ.order == 1) || */
                (i->count > zcut) ) {
              /* "iterative scaling" lower bound */
              new_lam = (log((score_t)i->count) - logXi -
                         UNPACK_RWEIGHTS(i->tmp.min.dref))/R + logzonr -
                UNPACK_LWEIGHTS(i->tmp.min.ltrms);
            } else {
              /* below the count threshold: force the weight to zero */
              new_lam = 0.0;
            }

            if( isnan(new_lam) ) {
              /* precision problem, just ignore, don't change lambda */
              logzonr = learner_logZ(learner, r, logupz, fwd);
            } else {

              /* this code shouldn't be necessary, but is crucial:
                 clamp each update to MAX_LAMBDA_JUMP to keep the
                 iteration from diverging */
              if( new_lam > (old_lam + MAX_LAMBDA_JUMP) ) {
                new_lam = (old_lam + MAX_LAMBDA_JUMP);
              } else if( new_lam < (old_lam - MAX_LAMBDA_JUMP) ) {
                new_lam = (old_lam - MAX_LAMBDA_JUMP);
              }

              /* don't want negative weights */
              if( new_lam < 0.0 ) { new_lam = 0.0; lzero++; }

              /* track the largest single change for the stopping rule */
              if( lam_delta < fabs(new_lam - old_lam) ) {
                lam_delta = fabs(new_lam - old_lam);
              }
              i->lam = PACK_LAMBDA(new_lam);
            }

            if( ++c >= learner->fixed_order_unique_token_count[r] ) {
              c = 0;
              break;
            }
          }
        }

        /* theta_rescale(r, logupz, Xi, fwd); */

        /* update values */
        logzonr = learner_logZ(learner, r, logupz, fwd);
        dd = learner_divergence(learner, logzonr, Xi, r, fwd);
        fwd = 1 - fwd;

        if( u_options & (1<<U_OPTION_VERBOSE) ) {
          /* fprintf(stdout, "lzero = %ld\n", lzero); */
          fprintf(stdout, "entropy change %" FMT_printf_score_t \
                  " --> %" FMT_printf_score_t " (%10f, %10f)\n",
                  d + div_extra_bits,
                  dd + div_extra_bits,
                  lam_delta, fabs(logzonr - old_logzonr));
        }

        process_pending_signal(NULL);

      } while( ((fabs(d - dd) > qtol_div) || (lam_delta > qtol_lam) ||
                (fabs(logzonr - old_logzonr) > qtol_logz)) && (itcount < 50) );

      learner->logZ = logzonr;
      learner->divergence = dd + div_extra_bits;
    }
    /* for multipass, we wait until logZ stabilizes */
    if( fabs(1.0 - mp_logz/learner->logZ) < 0.01 ) {
      break;
    }
    mp_logz = learner->logZ;
  }

  /* compute the probability mass of each token class (medium) separately */
  compute_mediaprobs(learner);
}
3585
3586 /* dumps readable model weights to the output */
/* NOTE(review): the in parameter is not referenced in this body; the
   token stream is read back via tmp_read_block(learner, ...) instead -
   confirm whether in is kept only for interface symmetry */
void dump_model(learner_t *learner, FILE *out, FILE *in) {

  hash_value_t id;
  byte_t buf[BUFSIZ+1];
  char tok[(MAX_TOKEN_LEN+1)*MAX_SUBMATCH+EXTRA_TOKEN_LEN];
  char smb[MAX_SUBMATCH+1];
  token_order_t s;
  regex_count_t c;
  const byte_t *p;
  char *q;
  l_item_t *k;
  size_t n = 0;

  /* preamble - this is copied from save_learner */

  fprintf(out, MAGIC1, learner->filename,
          (m_options & (1<<M_OPTION_REFMODEL)) ? "(ref)" : "");
  fprintf(out, MAGIC2_o, learner->divergence, learner->logZ, learner->max_order,
          (m_options & (1<<M_OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" );
  fprintf(out, MAGIC3,
          (short int)learner->max_hash_bits,
          (long int)learner->full_token_count,
          (long int)learner->unique_token_count,
          (long int)learner->doc.count);

  fprintf(out, MAGIC8_o, learner->shannon, learner->shannon2);

  fprintf(out, MAGIC10_o,
          learner->alpha, learner->beta,
          learner->mu, learner->s2);

  write_mediaprobs(out, learner);

  fprintf(out, MAGIC9, (long int)learner->model.tmin, (long int)learner->tmax);

  /* print out any regexes we might need */
  for(c = 0; c < regex_count; c++) {
    /* write the bitmap: one digit per active submatch of this regex */
    for(q = smb, s = 1; s <= MAX_SUBMATCH; s++) {
      if( re[c].submatches & (1<<s) ) {
        *q++ = s + '0';
      }
    }
    *q = '\0';

    fprintf(out, MAGIC5_o, re[c].string, smb);
  }

  /* print options */
  fprintf(out, MAGIC4_o, m_options, m_cp, m_dt,
          print_model_options(m_options, m_cp, (char*)buf));

  fprintf(out, MAGIC6);

  fprintf(out, MAGIC_DUMP);


  /* now go through hash printing values: replay the temp token file,
     reassembling each TOKENSEP-delimited token into tok, then look it
     up and print its weights */
  if( !tmp_seek_start(learner) ) {
    errormsg(E_ERROR, "cannot seek in temporary token file.\n");
  } else {
    q = tok;
    while( (n = tmp_read_block(learner, buf, BUFSIZ, &p)) > 0 ) {
      /* p = buf; */
      /* p[n] = '\0'; */
      while( n-- > 0 ) {
        if( *p != TOKENSEP) {
          *q++ = *p; /* copy into tok */
        } else { /* interword space */
          *q = 0; /* append NUL to tok */
          /* now write weight in hash */
          id = hash_full_token(tok);
          /* NOTE(review): k is dereferenced without a NULL check on the
             stated guarantee that every streamed token is in the hash */
          k = find_in_learner(learner, id); /* guaranteed to be found */
          fprintf(out, MAGIC_DUMPTBL_o,
                  (weight_t)UNPACK_LAMBDA(k->lam),
                  UNPACK_RWEIGHTS(k->tmp.min.dref), k->count,
                  (long unsigned int)k->id);
          print_token(out, tok);
          fprintf(out, "\n");
          q = tok; /* reset q */
        }
        p++;
      }
    }
  }
}
3673
3674 /* This is a quick hack to
3675 1) speed up the optimization and
3676 2) stabilise the estimated category model over a sequence of learning runs.
3677
3678 Normally, if the dataset is nearly the same, then the optimal lambda weights
3679 should be fairly similar (This is not strictly true if the digramic
3680 measure varies too much).
3681
3682 By reading the old category and presetting the lambdas from it, we typically
3683 get close enough to the true weights that one or two optimization iterations
3684 suffice. Moreover in that case, most weights should stay close to their
3685 "previous" values in the category, so the user doesn't think it's weird.
3686
   A side effect is that there is entropy creep when relearning the same
3688 dataset over and over. Unless there's a bug I haven't noticed, this occurs
3689 because the entropy maximum we compute is affected by numerical precision
3690 and feature ordering within the hash. Each time we relearn the same dataset,
3691 we start off with a perturbation from the previous best weights, and obtain
3692 a small improvement.
3693
3694 WARNING: THIS USES cat[0], SO IS NOT COMPATIBLE WITH CLASSIFYING.
3695 */
void learner_prefill_lambdas(learner_t *learner, category_t **pxcat) {
  hash_count_t c;
  c_item_t *i, *e;
  l_item_t *k;
  category_t *xcat = &(cat[0]);

  *pxcat = NULL;
  xcat->fullfilename = strdup(learner->filename);
  if( open_category(cat) ) {
    if( xcat->model.options & (1<<M_OPTION_WARNING_BAD) ) {
      if( u_options & (1<<U_OPTION_VERBOSE) ) {
        errormsg(E_WARNING, "old category file %s may have bad data\n",
                 xcat->fullfilename);
      }
    } else if( (xcat->model.options == m_options) &&
               (xcat->retype == learner->retype) &&
               (fabs((xcat->model_unique_token_count/
                      (double)learner->unique_token_count) - 1.0) < 0.15) ) {
      /* the old category is compatible (same options/regexes) and its
         vocabulary size is within 15% of the current one: preload */
      c = xcat->model_unique_token_count;
      if( u_options & (1<<U_OPTION_VERBOSE) ) {
        fprintf(stdout, "preloading %ld token weights\n", (long int)c);
      }

      MADVISE(xcat->hash, sizeof(c_item_t) * xcat->max_tokens, MADV_SEQUENTIAL);

      /* copy each stored lambda onto the matching learner hash slot */
      e = xcat->hash + xcat->max_tokens;
      for(i = xcat->hash; i != e; i++) {
        if( FILLEDP(i) ) {
          k = find_in_learner(learner, NTOH_ID(i->id));
          if( k ) {
            k->lam = NTOH_LAMBDA(i->lam);
          }
          /* NOTE(review): c is unsigned, so --c <= 0 only triggers when
             c reaches exactly zero; if c started at 0 this decrement
             wraps - presumably model_unique_token_count > 0, confirm */
          if( --c <= 0 ) {
            break;
          }
        }
      }

      /* tweak default quality settings: a warm start converges fast,
         so looser tolerances suffice unless -q was given */
      if( !quality ) {
        qtol_div = 0.01;
        qtol_lam = CLIP_LAMBDA_TOL(0.01);
        qtol_logz = 0.05;
      }

    }
    /* we're done, but don't close the category yet, as we might want to
       overwrite it */
    *pxcat = xcat;
  } else {
    if( u_options & (1<<U_OPTION_VERBOSE) ) {
      errormsg(E_WARNING, "cannot open file for reading %s\n",
               xcat->fullfilename);
    }
    free(xcat->fullfilename);
  }
}
3753
3754
/* post-learning driver: sanity-checks the token hash, builds the digram
   reference model, optimizes the lambda weights, and saves the category.
   Exits the process early (successfully) if no tokens were learned. */
void optimize_and_save(learner_t *learner) {

  hash_count_t i;
  token_order_t c;
  category_t *opencat = NULL;

  if(100 * learner->unique_token_count >= HASH_FULL * learner->max_tokens) {
    errormsg(E_WARNING,
             "table full, some tokens ignored - "
             "try with option -h %i\n",
             learner->max_hash_bits + 1);
  } else if( learner->unique_token_count <= 0 ) {
    errormsg(E_WARNING,
             "no tokens matched - have I learned nothing?\n");
    tmp_close(learner);
    exit(0); /* exit code for success */
  }

  if( skewed_constraints_warning ) {
    errormsg(E_WARNING,
             "ran out of integers (too much data) constraints will be skewed.\n");
    m_options |= (1<<M_OPTION_WARNING_BAD);
  }

  if( u_options & (1<<U_OPTION_VERBOSE) ) {
    fprintf(stdout,
            "picked up %" FMT_printf_integer_t " (%" FMT_printf_integer_t " distinct) tokens\n",
            learner->full_token_count, learner->unique_token_count);
    fprintf(stdout,
            "calculating reference word weights\n");
  }

  /* persist the raw counts first if online learning was requested */
  if( *online ) {
    write_online_learner_struct(learner, online);
  }

  /* transposition smoothing */
  /* transpose_digrams(); */

  /* select the digram reference model named by -w (default: uniform) */
  if( *digtype && !strcmp(digtype, "uniform") ) {
    make_uniform_digrams(learner);
  } else if( *digtype && !strcmp(digtype, "dirichlet") ) {
    make_dirichlet_digrams(learner);
  } else if( *digtype && !strcmp(digtype, "maxent") ) {
    make_entropic_digrams(learner);
  } else if( *digtype && !strcmp(digtype, "toklen") ) {
    make_toklen_digrams(learner);
  } else if( *digtype && !strcmp(digtype, "mle") ) {
    make_mle_digrams(learner);
  } else if( *digtype && !strcmp(digtype, "iid") ) {
    make_iid_digrams(learner);
  } else {
    make_uniform_digrams(learner);
  }

  if( learner->fixed_order_token_count[1] == 0 ) {
    /* it's a higher order model but there are no first
       order tokens! We can't handle that! Go through the
       hash converting everything to first order.
       This will result in incorrect calculations, unless
       the higher order tokens don't overlap. Suitable only for
       geniuses and fools. */
    errormsg(E_WARNING,
             "\n"
             "  You have not defined any unigrams, so in this model\n"
             "  features will be treated independently, which is quite\n"
             "  likely incorrect. I hope you know what you're doing,\n"
             "  because I don't!\n\n");

    m_options |= (1<<M_OPTION_MULTINOMIAL);
    for(c = 2; c <= learner->max_order; c++) {
      learner->fixed_order_token_count[1] += learner->fixed_order_token_count[c];
      learner->fixed_order_unique_token_count[1] +=
        learner->fixed_order_unique_token_count[c];
    }
    for(i = 0; i < learner->max_tokens; i++) {
      if( FILLEDP(&learner->hash[i]) ) {
        learner->hash[i].typ.order = 1;
      }
    }

  }


  /* if the category already exists, we read its lambda values.
     this should speed up the minimization slightly */
  if( u_options & (1<<U_OPTION_NOZEROLEARN) ) {
    learner_prefill_lambdas(learner, &opencat);
  }

  minimize_learner_divergence(learner);

  calc_shannon(learner);

  if( u_options & (1<<U_OPTION_DUMP) ) {
    dump_model(learner, stdout, learner->tmp.file);
  }

  tmp_close(learner);
  /* we don't free learner->tmpiobuf, as we'll be exiting soon anyway */
#if defined HAVE_POSIX_MEMALIGN
  /* if( learner->tmpiobuf ) { free(learner->tmpiobuf); } */
#endif

  /* now save the model to a file; prefer a fast partial save into the
     still-open old category when prefill succeeded */
  if( !opencat || !fast_partial_save_learner(learner, opencat) ) {
    save_learner(learner, online);
  }
  if( opencat ) { free_category(opencat); }
}
3865
3866 /***********************************************************
3867 * MULTIBYTE FILE HANDLING FUNCTIONS *
3868 * this is suitable for any locale whose character set *
3869 * encoding doesn't include NUL bytes inside characters *
3870 ***********************************************************/
3871
/* this code is executed before processing each line of input.
   - handles indents and appends via -Aa switches */
handle_indents_and_appends(char * textbuf)3874 char *handle_indents_and_appends(char *textbuf) {
3875
3876 char *pptextbuf = textbuf; /* default */
3877
3878 if( u_options & (1<<U_OPTION_INDENTED) ) {
3879 if( textbuf[0] == ' ' ) {
3880 pptextbuf = textbuf + 1; /* processing should ignore indent */
3881 } else {
3882 if( u_options & (1<<U_OPTION_APPEND) ) {
3883 fprintf(stdout, "%s", textbuf);
3884 }
3885 pptextbuf = NULL; /* no further processing */
3886 }
3887 }
3888
3889 /* if appending, print the lines as they
3890 come in before they are processed */
3891 if( u_options & (1<<U_OPTION_APPEND) ) {
3892 fprintf(stdout, " %s", pptextbuf);
3893 }
3894
3895 return pptextbuf;
3896 }
3897
3898 /***********************************************************
3899 * WIDE CHARACTER FILE HANDLING FUNCTIONS *
3900 * this is needed for any locale whose character set *
3901 * encoding can include NUL bytes inside characters *
3902 ***********************************************************/
3903
3904
3905 /***********************************************************
3906 * MAIN FUNCTIONS *
3907 ***********************************************************/
learner_preprocess_fun()3908 void learner_preprocess_fun() {
3909 category_count_t r;
3910
3911 init_learner(&learner, online, 0);
3912
3913 for(r = 0; r < ronline_count; r++) {
3914 merge_learner_struct(&learner, ronline[r]);
3915 }
3916 }
3917
learner_post_line_fun(char * buf)3918 void learner_post_line_fun(char *buf) {
3919 /* only call this when buf is NULL, ie end of file */
3920 if( !buf && (m_options & (1<<M_OPTION_MBOX_FORMAT)) ) {
3921 count_mbox_messages(&learner, mbox.state, buf);
3922 }
3923 }
3924
/* after all input is consumed: optimize the weights and write the model */
void learner_postprocess_fun() {
  optimize_and_save(&learner);
}
3928
/* release the global learner's resources */
void learner_cleanup_fun() {
  free_learner(&learner);
}
3932
/* per-token callback during learning: hash the token and update counts */
void learner_word_fun(char *tok, token_type_t tt, regex_count_t re) {
  hash_word_and_learn(&learner, tok, tt, re);
}
3936
void classifier_preprocess_fun() {
  category_count_t c;
  /* no need to "load" the categories, this is done in set_option() */

  if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
    init_empirical(&empirical,
                   default_max_tokens,
                   default_max_hash_bits); /* sets cached to zero */
  }
  reset_all_scores();

  /* -d: print a header describing the columns of the dump output */
  if( u_options & (1<<U_OPTION_DUMP) ) {
    for(c = 0; c < cat_count; c++) {
      fprintf(stdout, "%s%10s ", (c ? "        " : "# categories: "), cat[c].filename);
    }
    fprintf(stdout, "\n# format: ");

    if( u_options & (1<<U_OPTION_POSTERIOR) ) {
      fprintf(stdout, "score_delta\n");
    } else if( u_options & (1<<U_OPTION_SCORES) ) {
      fprintf(stdout, "avg_score * complexity\n");
    } else if( u_options & (1<<U_OPTION_VAR) ) {
      fprintf(stdout, "avg_score * complexity # s2\n");
    } else {
      fprintf(stdout, "lambda digref -renorm multi shannon id\n");
    }
  } else if( u_options & (1<<U_OPTION_DEBUG) ) {
    /* in debug mode, optionally show the per-category priors */
    if( u_options & (1<<U_OPTION_PRIOR_CORRECTION) ) {
      for(c = 0; c < cat_count; c++) {
        fprintf(stdout, "%s%10s %5.2f ", (c ? "        " : "# prior: "),
                cat[c].filename, cat[c].prior);
      }
      fprintf(stdout, "\n");
    }
  }

}
3974
/* classifier teardown - intentionally a no-op (see comment below) */
void classifier_cleanup_fun() {
#undef GOODGUY
#if defined GOODGUY
  /* normally we should free everything nicely, but there's no
     point, since the kernel will free the process memory. It's actually
     faster to not free... */
  category_count_t c;
  for(c = 0; c < cat_count; c++) {
    free_category(&cat[c]);
  }
  free_empirical(&empirical);
#endif
}
3988
3989
int email_line_filter(MBOX_State *mbox, char *buf) {
  /* run the mbox filter first: it updates mbox->state, which the
     message counter below needs to inspect */
  int passed = mbox_line_filter(mbox, buf, &xml);
  count_mbox_messages(&learner, mbox->state, buf);
  return passed;
}
3995
#if defined HAVE_MBRTOWC
/* wide-character variant of the mbox line filter. Note: unlike
   email_line_filter() above, this does not call count_mbox_messages() */
int w_email_line_filter(MBOX_State *mbox, wchar_t *buf) {
  return w_mbox_line_filter(mbox, buf, &xml);
}
#endif
4001
set_option(int op,char * optarg)4002 int set_option(int op, char *optarg) {
4003 int c = 0;
4004 switch(op) {
4005 case '@':
4006 /* this is an official NOOP, it MUST be ignored (see spherecl) */
4007 break;
4008 case '0':
4009 u_options &= ~(1<<U_OPTION_NOZEROLEARN);
4010 break;
4011 case '1':
4012 u_options |= (1<<U_OPTION_NOZEROLEARN);
4013 break;
4014 case 'A':
4015 u_options |= (1<<U_OPTION_INDENTED);
4016 /* fall through */
4017 case 'a':
4018 u_options |= (1<<U_OPTION_APPEND);
4019 break;
4020 case 'e':
4021 if( !strcasecmp(optarg, "alnum") ) {
4022 m_cp = CP_ALNUM;
4023 } else if( !strcasecmp(optarg, "alpha") ) {
4024 m_cp = CP_ALPHA;
4025 } else if( !strcasecmp(optarg, "cef") ) {
4026 m_cp = CP_CEF;
4027 } else if( !strcasecmp(optarg, "graph") ) {
4028 m_cp = CP_GRAPH;
4029 } else if( !strcasecmp(optarg, "adp") ) {
4030 m_cp = CP_ADP;
4031 } else if( !strcasecmp(optarg, "cef2") ) {
4032 m_cp = CP_CEF2;
4033 } else if( !strcasecmp(optarg, "char") ) {
4034 m_cp = CP_CHAR;
4035 } else {
4036 errormsg(E_WARNING,
4037 "unrecognized option \"%s\", ignoring.\n",
4038 optarg);
4039 }
4040 c++;
4041 break;
4042 case 'D':
4043 u_options |= (1<<U_OPTION_DEBUG);
4044 break;
4045 case 'm':
4046 #if defined HAVE_MMAP
4047 u_options |= (1<<U_OPTION_MMAP);
4048 #endif
4049 break;
4050 case 'M':
4051 m_options |= (1<<M_OPTION_MULTINOMIAL);
4052 break;
4053 case 'N':
4054 u_options |= (1<<U_OPTION_POSTERIOR);
4055 break;
4056 case 'R':
4057 if( cat_count >= MAX_CAT ) {
4058 errormsg(E_WARNING,
4059 "maximum reached, random text category omitted\n");
4060 } else if( u_options & (1<<U_OPTION_LEARN) ) {
4061 errormsg(E_ERROR,"cannot use options -l and -R together\n");
4062 exit(1);
4063 } else {
4064 u_options |= (1<<U_OPTION_CLASSIFY);
4065
4066 cat[cat_count].fullfilename = "random";
4067
4068 init_category(&cat[cat_count]);
4069 init_purely_random_text_category(&cat[cat_count]);
4070
4071 cat_count++;
4072 }
4073 break;
4074 case 'T':
4075 if( !strncasecmp(optarg, "text", 4) ) {
4076 /* this is the default */
4077 /* below, make sure the comparisons are in the correct order */
4078 } else if( !strcasecmp(optarg, "email:noplain") ) {
4079 m_options |= (1<<M_OPTION_NOPLAIN);
4080 m_options |= (1<<M_OPTION_MBOX_FORMAT);
4081 } else if( !strcasecmp(optarg, "email:plain") ) {
4082 m_options |= (1<<M_OPTION_PLAIN);
4083 m_options |= (1<<M_OPTION_MBOX_FORMAT);
4084 } else if( !strcasecmp(optarg, "email:headers") ) {
4085 m_options |= (1<<M_OPTION_HEADERS);
4086 m_options |= (1<<M_OPTION_MBOX_FORMAT);
4087 } else if( !strcasecmp(optarg, "email:noheaders") ) {
4088 m_options |= (1<<M_OPTION_NOHEADERS);
4089 m_options |= (1<<M_OPTION_MBOX_FORMAT);
4090 } else if( !strcasecmp(optarg, "email:atts") ) {
4091 m_options |= (1<<M_OPTION_ATTACHMENTS);
4092 m_options |= (1<<M_OPTION_MBOX_FORMAT);
4093 } else if( !strcasecmp(optarg, "email:xheaders") ) {
4094 m_options |= (1<<M_OPTION_XHEADERS);
4095 m_options |= (1<<M_OPTION_MBOX_FORMAT);
4096 } else if( !strcasecmp(optarg, "email:theaders") ) {
4097 m_options |= (1<<M_OPTION_THEADERS);
4098 m_options |= (1<<M_OPTION_MBOX_FORMAT);
4099 } else if( !strcasecmp(optarg, "email") ) {
4100 m_options |= (1<<M_OPTION_MBOX_FORMAT);
4101 } else if( !strcasecmp(optarg, "html:links") ) {
4102 m_options |= (1<<M_OPTION_SHOW_LINKS);
4103 m_options |= (1<<M_OPTION_HTML);
4104 } else if( !strcasecmp(optarg, "html:alt") ) {
4105 m_options |= (1<<M_OPTION_SHOW_ALT);
4106 m_options |= (1<<M_OPTION_HTML);
4107 } else if( !strcasecmp(optarg, "html:forms") ) {
4108 m_options |= (1<<M_OPTION_SHOW_FORMS);
4109 m_options |= (1<<M_OPTION_HTML);
4110 } else if( !strcasecmp(optarg, "html:scripts") ) {
4111 m_options |= (1<<M_OPTION_SHOW_SCRIPT);
4112 m_options |= (1<<M_OPTION_HTML);
4113 } else if( !strcasecmp(optarg, "html:styles") ) {
4114 m_options |= (1<<M_OPTION_SHOW_STYLE);
4115 m_options |= (1<<M_OPTION_HTML);
4116 } else if( !strcasecmp(optarg, "html:comments") ) {
4117 m_options |= (1<<M_OPTION_SHOW_HTML_COMMENTS);
4118 m_options |= (1<<M_OPTION_HTML);
4119 } else if( !strcasecmp(optarg, "html") ) {
4120 m_options |= (1<<M_OPTION_HTML);
4121 } else if( !strcasecmp(optarg, "xml") ) {
4122 m_options |= (1<<M_OPTION_XML);
4123 } else { /* default */
4124 errormsg(E_WARNING,
4125 "unrecognized option -T %s, ignoring.\n",
4126 optarg);
4127 }
4128 c++;
4129 break;
4130 case 'U':
4131 u_options |= (1<<U_OPTION_VAR);
4132 m_options |= (1<<M_OPTION_CALCENTROPY);
4133 break;
4134 case 'V':
4135 fprintf(stdout, "dbacl version %s\n", VERSION);
4136 fprintf(stdout, COPYBLURB, "dbacl");
4137 #if defined __GNUC__
4138 fprintf(stdout, "Using GNU modern regexes.\n");
4139 #else
4140 fprintf(stdout, "Using system regexes.\n");
4141 #endif
4142 #if defined DIGITIZE_DIGRAMS
4143 fprintf(stdout, "Digrams are digitized.\n");
4144 #endif
4145 #if defined DIGITIZE_LAMBDA
4146 fprintf(stdout, "Lagrangian multipliers are digitized.\n");
4147 #endif
4148 #if defined DIGITIZE_LWEIGHTS
4149 fprintf(stdout, "Reference weights are digitized.\n");
4150 #endif
4151 #if defined PORTABLE_CATS
4152 fprintf(stdout, "Category files are portable to systems with other byte orders.\n");
4153 #else
4154 fprintf(stdout, "Category files are NOT portable to systems with other byte orders.\n");
4155 #endif
4156 #if ! defined HAVE_MMAP
4157 fprintf(stdout, "Fast category loading with -m switch unavailable.\n");
4158 #endif
4159 fprintf(stdout, "Feature memory requirements: %d bytes (classifying), %d bytes (learning)\n",
4160 (int)sizeof(c_item_t), (int)sizeof(l_item_t));
4161 fprintf(stdout, "To change these settings, recompile from source.\n");
4162 exit(0);
4163 break;
4164 case 'X':
4165 m_options |= (1<<M_OPTION_CALCENTROPY);
4166 u_options |= (1<<U_OPTION_CONFIDENCE);
4167 break;
4168 case 'd':
4169 u_options |= (1<<U_OPTION_DUMP);
4170 break;
4171 case 'h': /* select memory size in powers of 2 */
4172 default_max_hash_bits = atoi(optarg);
4173 if( default_max_hash_bits > MAX_HASH_BITS ) {
4174 errormsg(E_WARNING,
4175 "maximum hash size will be 2^%d\n",
4176 MAX_HASH_BITS);
4177 default_max_hash_bits = MAX_HASH_BITS;
4178 }
4179 default_max_tokens = (1<<default_max_hash_bits);
4180 c++;
4181 break;
4182 case 'H': /* select memory size in powers of 2 */
4183 default_max_grow_hash_bits = atoi(optarg);
4184 if( default_max_grow_hash_bits > MAX_HASH_BITS ) {
4185 errormsg(E_WARNING,
4186 "maximum hash size will be 2^%d\n",
4187 MAX_HASH_BITS);
4188 default_max_grow_hash_bits = MAX_HASH_BITS;
4189 }
4190 default_max_grow_tokens = (1<<default_max_grow_hash_bits);
4191 u_options |= (1<<U_OPTION_GROWHASH);
4192 c++;
4193 break;
4194 case 'j':
4195 m_options |= (1<<M_OPTION_CASEN);
4196 break;
4197 case 'n':
4198 u_options |= (1<<U_OPTION_SCORES);
4199 break;
4200 case 'c':
4201 if( cat_count >= MAX_CAT ) {
4202 errormsg(E_WARNING,
4203 "maximum reached, category ignored\n");
4204 } else if( u_options & (1<<U_OPTION_LEARN) ) {
4205 errormsg(E_ERROR, "cannot use options -l and -c together\n");
4206 exit(1);
4207 } else {
4208 u_options |= (1<<U_OPTION_CLASSIFY);
4209
4210 cat[cat_count].fullfilename = sanitize_path(optarg, extn);
4211
4212 if( !*optarg ) {
4213 errormsg(E_FATAL, "category needs a name\n");
4214 }
4215 if( !load_category(&cat[cat_count]) ) {
4216 errormsg(E_FATAL, "could not load category %s\n",
4217 cat[cat_count].fullfilename);
4218 }
4219 if( sanitize_model_options(&m_options,&m_cp,&cat[cat_count]) ) {
4220 ngram_order = (ngram_order < cat[cat_count].max_order) ?
4221 cat[cat_count].max_order : ngram_order;
4222 cat_count++;
4223 }
4224 }
4225 c++;
4226 break;
4227 case 'P':
4228 u_options |= (1<<U_OPTION_PRIOR_CORRECTION);
4229 break;
4230 case 'q':
4231 quality = atoi(optarg);
4232 switch(quality) {
4233 case 1:
4234 qtol_div = 0.01;
4235 qtol_lam = CLIP_LAMBDA_TOL(0.05);
4236 qtol_logz = 0.05;
4237 break;
4238 case 2:
4239 qtol_div = 0.01;
4240 qtol_lam = CLIP_LAMBDA_TOL(0.01);
4241 qtol_logz = 0.05;
4242 break;
4243 case 3:
4244 qtol_div = 0.01;
4245 qtol_lam = CLIP_LAMBDA_TOL(0.01);
4246 qtol_logz = 0.01;
4247 break;
4248 case 4:
4249 qtol_div = 0.01;
4250 qtol_lam = CLIP_LAMBDA_TOL(0.01);
4251 qtol_logz = 0.01;
4252 qtol_multipass = 1;
4253 break;
4254 default:
4255 errormsg(E_FATAL,
4256 "the -q switch needs a number between 1(fastest) and 4(best quality)\n");
4257 break;
4258 }
4259 c++;
4260 break;
4261 case 'r':
4262 m_options |= (1<<M_OPTION_REFMODEL);
4263 break;
4264 case 'w':
4265 ngram_order = atoi(optarg);
4266 if( !*optarg || (ngram_order < 1) || (ngram_order > 7) ) {
4267 errormsg(E_FATAL,
4268 "the -w switch needs a number between 1 and 7\n");
4269 }
4270 c++;
4271 break;
4272 case 'S':
4273 m_options |= (1<<M_OPTION_NGRAM_STRADDLE_NL);
4274 break;
4275 case 'x':
4276 decimation = atoi(optarg);
4277 if( !*optarg || (decimation < 1) || (decimation > MAX_HASH_BITS) ) {
4278 errormsg(E_WARNING,
4279 "option -x ignored, needs an integer between 1 and %d\n",
4280 MAX_HASH_BITS);
4281 } else {
4282 u_options |= (1<<U_OPTION_DECIMATE);
4283 srand(0);
4284 }
4285 c++;
4286 break;
4287 case 'f':
4288 if( filter_count >= MAX_CAT ) {
4289 errormsg(E_WARNING, "maximum reached, filter ignored\n");
4290 } else if( u_options & (1<<U_OPTION_LEARN) ) {
4291 errormsg(E_ERROR, "cannot use options -l and -f together\n");
4292 exit(1);
4293 } else if( !*optarg ) {
4294 errormsg(E_ERROR, "filter must be category name or number\n");
4295 exit(1);
4296 } else {
4297 u_options |= (1<<U_OPTION_FILTER);
4298 filter[filter_count] = -1;
4299
4300 /* see if it's a known category */
4301 for(c = 0; c < cat_count; c++) {
4302 if( !strcmp(cat[c].filename, optarg) ) {
4303 filter[filter_count] = c;
4304 break;
4305 }
4306 }
4307 /* if not recognized, see if it's a number */
4308 if( filter[filter_count] < 0 ) {
4309 filter[filter_count] = atoi(optarg) - 1;
4310 }
4311 if( filter[filter_count] < 0 ) { /* still not recognized */
4312 errormsg(E_ERROR, "unrecognized category in -f option [%s]\n",
4313 optarg);
4314 exit(1);
4315 }
4316 filter_count++;
4317 }
4318 c++;
4319 break;
4320 case 'F':
4321 u_options |= (1<<U_OPTION_CLASSIFY_MULTIFILE);
4322 break;
4323 case 'v':
4324 u_options |= (1<<U_OPTION_VERBOSE);
4325 break;
4326 case 'L':
4327 if( *optarg &&
4328 (!strcmp(optarg, "uniform") ||
4329 !strcmp(optarg, "dirichlet") ||
4330 !strcmp(optarg, "maxent") ||
4331 !strcmp(optarg, "mle") ||
4332 !strcmp(optarg, "iid") ||
4333 !strcmp(optarg, "toklen")) ) {
4334 digtype = optarg;
4335 } else {
4336 errormsg(E_FATAL, "-L option needs one of \"uniform\", \"dirichlet\", \"maxent\", \"toklen\", \"mle\"\n");
4337 }
4338 c++;
4339 break;
4340 case 'l':
4341 if( u_options & (1<<U_OPTION_CLASSIFY) ) {
4342 errormsg(E_ERROR,
4343 "cannot use options -l and -c together\n");
4344 exit(1);
4345 } else if( u_options & (1<<U_OPTION_LEARN) ) {
4346 errormsg(E_ERROR,
4347 "option -l can only occur once\n");
4348 exit(1);
4349 } else if( !*optarg ) {
4350 errormsg(E_ERROR, "category name must not be empty\n");
4351 } else {
4352 u_options |= (1<<U_OPTION_LEARN);
4353 learner.filename = sanitize_path(optarg, extn);
4354 if( !*learner.filename ) {
4355 errormsg(E_ERROR, "category needs a name\n");
4356 exit(1);
4357 }
4358 }
4359 c++;
4360 break;
4361 case 'o':
4362 if( !*optarg ) {
4363 errormsg(E_ERROR, "category name must not be empty in -o switch\n");
4364 } else if( !*online ) {
4365 online = sanitize_path(optarg, "");
4366 } else {
4367 errormsg(E_WARNING,
4368 "multiple -o switches not supported, ignoring all but the first\n");
4369 }
4370 c++;
4371 break;
4372 case 'O':
4373 if( !*optarg ) {
4374 errormsg(E_ERROR, "category name must not be empty in -o switch\n");
4375 } else {
4376 ronline[ronline_count++] = sanitize_path(optarg, "");
4377 }
4378 c++;
4379 break;
4380 case 'G':
4381 case 'g':
4382 /* if regex can't be compiled, load_regex() exits */
4383 learner.retype |= (1<<load_regex(optarg));
4384 c++;
4385 break;
4386 case 'i':
4387 m_options |= (1<<M_OPTION_I18N);
4388 #if defined HAVE_LANGINFO_H
4389 if( !strcmp(nl_langinfo(CODESET), "UTF-8") ) {
4390 errormsg(E_WARNING, "you have UTF-8, so -i is not needed.\n");
4391 }
4392 #endif
4393 #if !defined HAVE_MBRTOWC
4394 errormsg(E_WARNING, "this tool was compiled without wide character support. Full internationalization is disabled.\n");
4395 m_options &= ~(1<<M_OPTION_I18N);
4396 #endif
4397 break;
4398 case 'Y':
4399 u_options |= (1<<U_OPTION_MEDIACOUNTS);
4400 break;
4401 case 'z':
4402 zthreshold = atoi(optarg);
4403 c++;
4404 break;
4405 default:
4406 c--;
4407 break;
4408 }
4409 return c;
4410 }
4411
sanitize_options()4412 void sanitize_options() {
4413 category_count_t c;
4414
4415 /* consistency checks */
4416 if( ((u_options>>U_OPTION_CLASSIFY) & 1) +
4417 ((u_options>>U_OPTION_LEARN) & 1) != 1 ) {
4418 errormsg(E_ERROR, "please use either -c or -l option.\n");
4419 exit(1);
4420 }
4421
4422 if( (*online || (ronline_count > 0)) &&
4423 (u_options & (1<<U_OPTION_CONFIDENCE)) ) {
4424 /* errormsg(E_WARNING, */
4425 /* "-X switch cannot be used with -o switch, disabling -X.\n"); */
4426 m_options &= ~(1<<U_OPTION_CONFIDENCE);
4427 }
4428
4429 if( (u_options & (1<<U_OPTION_DECIMATE)) &&
4430 !(u_options & (1<<U_OPTION_LEARN)) ) {
4431 errormsg(E_WARNING,
4432 "option -x ignored, applies only when learning.\n");
4433 u_options &= ~(1<<U_OPTION_DECIMATE);
4434 }
4435
4436 if( u_options & (1<<U_OPTION_DUMP) ) {
4437 if( u_options & (1<<U_OPTION_VERBOSE) ) {
4438 u_options &= ~(1<<U_OPTION_VERBOSE); /* verbose writes garbage to stdout */
4439 u_options &= ~(1<<U_OPTION_DEBUG);
4440 }
4441 }
4442
4443 if( !(m_options & (1<<M_OPTION_TEXT_FORMAT)) &&
4444 !(m_options & (1<<M_OPTION_MBOX_FORMAT)) &&
4445 !(m_options & (1<<M_OPTION_HTML)) &&
4446 !(m_options & (1<<M_OPTION_XML)) ) {
4447 m_options |= (1<<M_OPTION_TEXT_FORMAT);
4448 }
4449
4450 if( ((m_options>>M_OPTION_TEXT_FORMAT) & 1) +
4451 ((m_options>>M_OPTION_MBOX_FORMAT) & 1) > 1 ) {
4452 errormsg(E_ERROR,
4453 "please use only one of either -T text or -T email options.\n");
4454 exit(1);
4455 }
4456
4457 if( ((m_options>>M_OPTION_XML) & 1) +
4458 ((m_options>>M_OPTION_HTML) & 1) > 1 ) {
4459 errormsg(E_ERROR,
4460 "please use only one of either -T xml or -T html options.\n");
4461 exit(1);
4462 }
4463
4464
4465 if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
4466 /* mbox format needs html unless user wants xml */
4467 if( !(m_options & (1<<M_OPTION_XML)) ) {
4468 m_options |= (1<<M_OPTION_HTML);
4469 }
4470
4471 /* for mboxes, only compute ngrams for each line individually */
4472 /* m_options &= ~(1<<M_OPTION_NGRAM_STRADDLE_NL); */
4473
4474 /* always calculate entropy statistics when learning */
4475 if( u_options & (1<<U_OPTION_LEARN) ) {
4476 m_options |= (1<<M_OPTION_CALCENTROPY);
4477 }
4478 }
4479
4480
4481 if( (u_options & (1<<U_OPTION_APPEND)) &&
4482 (u_options & (1<<U_OPTION_FILTER)) ) {
4483 u_options &= ~(1<<U_OPTION_APPEND);
4484 errormsg(E_WARNING,
4485 "disabling option -a, because it cannot be used with -f.\n");
4486 }
4487
4488 /* decide if we need some options */
4489
4490 if( u_options & (1<<U_OPTION_LEARN) ) {
4491 if( !regex_count ) {
4492 m_options |= (1<<M_OPTION_USE_STDTOK);
4493 } else {
4494 m_options &= ~(1<<M_OPTION_USE_STDTOK);
4495 }
4496 }
4497
4498 if( m_options & (1<<M_OPTION_I18N) ) {
4499 /* I've removed the internationalized regex code, because it makes
4500 the code too complex to handle both multibyte and wide char regexes
4501 simultaneously. */
4502 errormsg(E_WARNING,
4503 "this version of dbacl does not support -g and -i switches simultaneously, disabling -i.\n");
4504 m_options &= ~(1<<M_OPTION_I18N);
4505 }
4506
4507 if( m_options & (1<<M_OPTION_MULTINOMIAL) ) {
4508 if( cat_count == 1 ) {
4509 if( cat[0].model.type == simple ) {
4510 m_options |= (1<<M_OPTION_CALCENTROPY);
4511 }
4512 } else if( cat_count > 1 ) {
4513 for(c = 1; c < cat_count; c++) {
4514 if( cat[c].model.type == sequential ) { break; }
4515 if( cat[c].retype != cat[c-1].retype ) { break; }
4516 }
4517 if( c == cat_count ) {
4518 m_options |= (1<<M_OPTION_CALCENTROPY);
4519 }
4520 }
4521 if( !(m_options & (1<<M_OPTION_CALCENTROPY)) ) {
4522 errormsg(E_WARNING,
4523 "not all categories support multinomial calculations, disabling (-M switch)\n");
4524 m_options &= ~(1<<M_OPTION_MULTINOMIAL);
4525 }
4526 }
4527
4528 /* unless overridden, we use alpha character classes only */
4529 if( m_cp == CP_DEFAULT ) {
4530 if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
4531 m_cp = CP_ADP;
4532 } else {
4533 m_cp = CP_ALPHA;
4534 }
4535 }
4536
4537 if( !*digtype ) {
4538 if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
4539 digtype = "uniform";
4540 } else {
4541 digtype = "dirichlet";
4542 }
4543 }
4544 }
4545
4546
/* Entry point: parses options, selects the per-stage callbacks for either
 * learning (-l) or classifying (-c), then feeds every file named on the
 * command line (or stdin if none) through the processing pipeline. */
int main(int argc, char **argv) {

  FILE *input;
  signed char op;          /* current option character from getopt() */
  struct stat statinfo;    /* used below to detect directory arguments */

  /* The input loop is callback-driven so the same code path serves both
   * learning and classifying; the pointers are filled in after option
   * parsing, once the mode is known. */
  void (*preprocess_fun)(void) = NULL;
  void (*word_fun)(char *, token_type_t, regex_count_t) = NULL;
  char *(*pre_line_fun)(char *) = NULL;
  void (*post_line_fun)(char *) = NULL;
  void (*post_file_fun)(char *) = NULL;
  void (*postprocess_fun)(void) = NULL;
  void (*cleanup_fun)(void) = NULL;
  int (*line_filter)(MBOX_State *, char *) = NULL;
  void (*character_filter)(XML_State *, char *) = NULL;
#if defined HAVE_MBRTOWC
  /* wide-character variants, used when -i internationalization is active */
  int (*w_line_filter)(MBOX_State *, wchar_t *) = NULL;
  void (*w_character_filter)(XML_State *, wchar_t *) = NULL;
#endif

  progname = "dbacl";
  inputfile = "stdin";
  inputline = 0;

  learner.filename = NULL;
  learner.retype = 0;

  /* set up internationalization */
  if( !setlocale(LC_ALL, "") ) {
    errormsg(E_WARNING,
	     "could not set locale, internationalization disabled\n");
  } else {
    if( u_options & (1<<U_OPTION_VERBOSE) ) {
      errormsg(E_WARNING,
	       "international locales not supported\n");
    }
  }

#if defined(HAVE_GETPAGESIZE)
  system_pagesize = getpagesize();
#endif
  /* fall back to stdio's buffer size if the page size is unavailable */
  if( system_pagesize == -1 ) { system_pagesize = BUFSIZ; }

  init_signal_handling();

  /* parse the options */
  while( (op = getopt(argc, argv,
		      "01Aac:Dde:f:FG:g:H:h:ijL:l:mMNno:O:Ppq:RrST:UVvw:x:XYz:@")) > -1 ) {
    set_option(op, optarg);
  }

  /* end option processing */
  sanitize_options();

  /* set up callbacks */
  if( u_options & (1<<U_OPTION_CLASSIFY) ) {

    preprocess_fun = classifier_preprocess_fun;
    word_fun = score_word;
    if( u_options & (1<<U_OPTION_FILTER) ) {
      /* -f: score line by line so input can be filtered through */
      u_options |= (1<<U_OPTION_FASTEMP);
      empirical.track_features = 1;
      post_line_fun = line_score_categories;
      post_file_fun = NULL;
      postprocess_fun = NULL;
    } else if( u_options & (1<<U_OPTION_CLASSIFY_MULTIFILE) ) {
      /* -F: emit one score per input file */
      post_line_fun = NULL;
      post_file_fun = file_score_categories;
      postprocess_fun = NULL;
    } else {
      /* default: a single score for all the input combined */
      post_line_fun = NULL;
      post_file_fun = NULL;
      postprocess_fun = score_categories;
    }
    cleanup_fun = classifier_cleanup_fun;

  } else if( u_options & (1<<U_OPTION_LEARN) ) {
    /* category learning */

    preprocess_fun = learner_preprocess_fun;
    word_fun = learner_word_fun;
    post_line_fun = learner_post_line_fun;
    post_file_fun = NULL;
    postprocess_fun = learner_postprocess_fun;
    cleanup_fun = learner_cleanup_fun;

  } else { /* something wrong ? */
    usage(argv);
    exit(1);
  }

  /* handles some common filtering options */
  if( (u_options & (1<<U_OPTION_INDENTED)) ||
      (u_options & (1<<U_OPTION_APPEND)) ) {
    pre_line_fun = handle_indents_and_appends;
  }

  if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
    line_filter = email_line_filter;
  }

  if( (m_options & (1<<M_OPTION_XML)) ||
      (m_options & (1<<M_OPTION_HTML)) ) {
    character_filter = xml_character_filter;
  }

#if defined HAVE_MBRTOWC
  /* mirror the filter selection for the wide-character pipeline */
  if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
    w_line_filter = w_email_line_filter;
  }

  if( (m_options & (1<<M_OPTION_XML)) ||
      (m_options & (1<<M_OPTION_HTML)) ) {
    w_character_filter = w_xml_character_filter;
  }
#endif


  if( preprocess_fun ) { (*preprocess_fun)(); }


  init_file_handling();

  /* now process each file on the command line,
     or if none provided read stdin */
  while( (optind > -1) && *(argv + optind) && !(cmd & (1<<CMD_QUITNOW)) ) {
    /* if it's a filename, process it */
    input = fopen(argv[optind], "rb");
    if( input ) {
      inputfile = argv[optind];

      /* remember that we consumed explicit files, so the stdin
	 fallback below is skipped */
      u_options |= (1<<U_OPTION_STDIN);

      if( (u_options & (1<<U_OPTION_VERBOSE)) &&
	  !(u_options & (1<<U_OPTION_CLASSIFY))) {
	fprintf(stdout, "processing file %s\n", argv[optind]);
      }

      /* set some initial options */
      reset_xml_character_filter(&xml, xmlRESET);

      if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
	reset_mbox_line_filter(&mbox);
      }

      /* directories are walked recursively; everything else is
	 treated as a regular stream */
      if( fstat(fileno(input), &statinfo) == 0 ) {
	switch(statinfo.st_mode & S_IFMT) {
	case S_IFDIR:
	  if( !(m_options & (1<<M_OPTION_I18N)) ) {
	    process_directory(inputfile, line_filter, character_filter,
			      word_fun, pre_line_fun, post_line_fun, post_file_fun);
	  } else {
#if defined HAVE_MBRTOWC
	    w_process_directory(inputfile, w_line_filter, w_character_filter,
				word_fun, pre_line_fun, post_line_fun, post_file_fun);
#else
	    errormsg(E_ERROR, "international support not available (recompile).\n");
#endif
	  }
	  break;
	default:
	  if( !(m_options & (1<<M_OPTION_I18N)) ) {
	    process_file(input, line_filter, character_filter,
			 word_fun, pre_line_fun, post_line_fun);
	  } else {
#if defined HAVE_MBRTOWC
	    w_process_file(input, w_line_filter, w_character_filter,
			   word_fun, pre_line_fun, post_line_fun);
#else
	    errormsg(E_ERROR, "international support not available (recompile).\n");
#endif
	  }
	}
      }
      fclose(input);

      if( post_file_fun ) { (*post_file_fun)(inputfile); }

    } else { /* unrecognized file name */

      errormsg(E_FATAL, "couldn't open %s\n", argv[optind]);
      /* usage(argv); */
      /* exit(1); */

    }
    optind++;
  }
  /* in case no files were specified, get input from stdin,
   * but note we reopen it in binary mode
   */
  if( !(u_options & (1<<U_OPTION_STDIN)) ) {
    input = fdopen(fileno(stdin), "rb");
    if( input ) {

      if( (u_options & (1<<U_OPTION_VERBOSE)) &&
	  !(u_options & (1<<U_OPTION_CLASSIFY)) ) {
	fprintf(stdout, "taking input from stdin\n");
      }

      /* set some initial options */
      reset_xml_character_filter(&xml, xmlRESET);

      if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
	reset_mbox_line_filter(&mbox);
      }

      if( !(m_options & (1<<M_OPTION_I18N)) ) {
	process_file(input, line_filter, character_filter,
		     word_fun, pre_line_fun, post_line_fun);
      } else {
#if defined HAVE_MBRTOWC
	w_process_file(input, w_line_filter, w_character_filter,
		       word_fun, pre_line_fun, post_line_fun);
#else
	errormsg(E_ERROR, "international support not available (recompile).\n");
#endif
      }

      /* must close before freeing in_iobuf, in case setvbuf was called */
      fclose(input);

      if( post_file_fun ) { (*post_file_fun)(inputfile); }

    }
  }


  /* wrap up: final scoring/saving, then release resources in the
     reverse order of acquisition */
  if( postprocess_fun ) { (*postprocess_fun)(); }

  cleanup_file_handling();

  if( cleanup_fun ) { (*cleanup_fun)(); }

  free_all_regexes();

  cleanup_signal_handling();

  return exit_code;
}
4786
4787
4788