/*
 * Copyright (C) 2002 Laird Breyer
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
 *
 * Author:   Laird Breyer <laird@lbreyer.com>
 */

/*
 * Note to regenerate the Makefile and configure script:
 * aclocal
 * autoconf
 * touch NEWS README AUTHORS ChangeLog
 * automake -a
 *
 * You can also just run the bootstrap script.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <math.h>
#include <ctype.h>
#include <string.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>

#if defined HAVE_UNISTD_H
#include <unistd.h>
#include <sys/types.h>
#endif

#include <locale.h>

#if defined HAVE_LANGINFO_H
#include <langinfo.h>
#if !defined CODESET
/* on OpenBSD, CODESET doesn't seem to be defined -
   we use 3, which should be US-ASCII, but it's not ideal... */
#define CODESET 3
#endif
#endif

#include "util.h"
#include "dbacl.h" /* make sure this is last */

/* global variables */

extern signal_cleanup_t cleanup;

extern hash_bit_count_t default_max_hash_bits;
extern hash_count_t default_max_tokens;

extern hash_bit_count_t default_max_grow_hash_bits;
extern hash_count_t default_max_grow_tokens;

hash_bit_count_t decimation;
int zthreshold = 0;

learner_t learner;
dirichlet_t dirichlet;
int filter[MAX_CAT];
category_count_t filter_count = 0;

extern category_t cat[MAX_CAT];
extern category_count_t cat_count;

extern myregex_t re[MAX_RE];
extern regex_count_t regex_count;

extern empirical_t empirical;

extern options_t u_options;
extern options_t m_options;
extern charparser_t m_cp;
extern digtype_t m_dt;
extern char *extn;

extern token_order_t ngram_order; /* defaults to 1 */

/* for counting emails */
bool_t not_header;
extern MBOX_State mbox;
extern XML_State xml;

/* for option processing */
extern char *optarg;
extern int optind, opterr, optopt;

char *title = "";
char *digtype = "";
char *online = "";
char *ronline[MAX_CAT];
category_count_t ronline_count = 0;
extern char *progname;
extern char *inputfile;
extern long inputline;

extern int cmd;
int exit_code = 0; /* default */

int overflow_warning = 0;
int digramic_overflow_warning = 0;
int skewed_constraints_warning = 0;

extern long system_pagesize;

extern void *in_iobuf;
extern void *out_iobuf;

/* tolerance for the error in divergence - this can be changed with -q.
   More accuracy means slower learning. This should be the same order
   of magnitude as the error due to digitization, but we tweak them
   to obtain timely output */

int quality = 0;
double qtol_alpha = 0.0001;
double qtol_div = 0.01;
double qtol_lam = CLIP_LAMBDA_TOL(0.05);
double qtol_logz = 0.05;
bool_t qtol_multipass = 0;

/***********************************************************
 * MISCELLANEOUS FUNCTIONS                                 *
 ***********************************************************/

static void usage(/*@unused@*/ char **argv) {
  fprintf(stderr,
          "\n");
  fprintf(stderr,
          "dbacl [-vniNR] [-T type] -c CATEGORY [-c CATEGORY]...\n");
  fprintf(stderr,
          "      [-f KEEP]... [FILE]...\n");
  fprintf(stderr,
          "\n");
  fprintf(stderr,
          "      classifies FILE or STDIN using CATEGORY, optionally\n");
  fprintf(stderr,
          "      removing all lines which don't fit the category KEEP.\n");
  fprintf(stderr,
          "\n");
  fprintf(stderr,
          "dbacl [-vnirNDL] [-h size] [-T type] -l CATEGORY \n");
  fprintf(stderr,
          "      [-g regex]... [FILE]...\n");
  fprintf(stderr,
          "\n");
  fprintf(stderr,
          "      builds a maximum entropy model from the words in FILE or STDIN\n");
  fprintf(stderr,
          "      or concatenated regex submatches if using the -g option.\n");
  fprintf(stderr,
          "\n");
  fprintf(stderr,
          "dbacl -V\n");
  fprintf(stderr,
          "\n");
  fprintf(stderr,
          "      prints program version.\n");
}


/***********************************************************
 * CATEGORY FUNCTIONS                                      *
 ***********************************************************/

void reset_all_scores() {
  category_count_t i;
  for(i = 0; i < cat_count; i++) {
    cat[i].score = 0.0;
    cat[i].score_s2 = 0.0;
    cat[i].score_div = 0.0;
    cat[i].score_shannon = 0.0;
    cat[i].complexity = 0.0;
    cat[i].fcomplexity = 0;
  }
}

/* calculate the overlap probabilities (i.e. the probability that the
   given score is the minimal score, assuming independent Gaussian
   distributions).

   NOTE: I also toyed with the idea of multiplying each uncertainty
   score by the percentage of recognized tokens for that category.  It
   seems plausible that uncertainty should also be proportional to the
   unknown token count, but in practice, a typical email has a
   substantial fraction of unknown tokens, and this dominates the
   uncertainty obtained from the Gaussian assumption.  I've disabled
   this code again, see the ALTERNATIVE UNCERTAINTY code below.
*/
double calc_uncertainty(int map) {
  double mu[MAX_CAT];
  double sigma[MAX_CAT];
  int i;
  double p, u, t, pmax;

  for(i = 0; i < cat_count; i++) {
    mu[i] = -sample_mean(cat[i].score, cat[i].complexity);
    sigma[i] = sqrt(cat[i].score_s2/cat[i].complexity);
  }

  /* even though each p below is a true probability, the p's don't
     quite sum to 1, because of numerical errors in the min_prob
     function, which only aims for about 1% error in exchange for
     speed.  So we might get up to (100 + cat_count)% total mass,
     which means that u could be 102% say in spam filtering.
     Doesn't look good. So we compute the normalizing constant t.
  */
  t = 0.0;
  pmax = 1.0;
  for(i = 0; i < cat_count; i++) {
    p = min_prob(i, cat_count, mu, sigma);
    if( i == map ) {
      pmax = p;
    }
    t += p;
  }

  /* pmax is proportional to the MAP probability, which is guaranteed
     to be >= 0.5.  The value 0.5 occurs when the MAP is totally
     ambiguous, i.e. the classification is completely uncertain. Since
     people prefer a range 0-100%, we stretch the MAP probability into
     the interval [0,1] here. */
  u = 2.0 * pmax / t - 1.0;

  /* ALTERNATIVE UNCERTAINTY : uncomment below to enable */
  /* now multiply by the feature hit rate, which is also in [0,1] */
  /* u *= ((cat[map].complexity - cat[map].fmiss) / cat[map].complexity); */

  return u;
}
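
/* Worked example of the stretch above (illustrative numbers only):
   with two categories, suppose min_prob() returns pmax = 0.60 for the
   MAP category and 0.42 for the other.  The total mass is then
   t = 1.02 (the ~1% excess being min_prob()'s speed/accuracy
   tradeoff), and
       u = 2.0 * 0.60 / 1.02 - 1.0 ~= 0.176,
   so the classification is reported as roughly 18% certain, whereas
   the unnormalized value 2 * 0.60 - 1.0 would have claimed 20%. */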

/* note: don't forget to flush after each line */
void line_score_categories(char *textbuf) {
  category_count_t i;
  category_count_t map;
  score_t c, cmax;

  if( !textbuf ) { return; }

  /* find MAP */
  cmax = cat[0].score;
  map = 0;
  for(i = 0; i < cat_count; i++) {
    if(cmax < cat[i].score) {
      cmax = cat[i].score;
      map = i;
    }
    /* finish sample variance calculation */
    cat[i].score_s2 =
      sample_variance(cat[i].score_s2, cat[i].score, cat[i].complexity);
  }

  if( !(u_options & (1<<U_OPTION_DUMP)) ) {

    if( u_options & (1<<U_OPTION_POSTERIOR) ) {
      /* compute probabilities given exclusive choices */
      c = 0.0;
      for(i = 0; i < cat_count; i++) {
        c += exp((cat[i].score - cmax));
      }

      if( *textbuf ) {
        for(i = 0; i < cat_count; i++) {
          fprintf(stdout, "%s %6.2" FMT_printf_score_t "%% ",
                  cat[i].filename, 100 * exp((cat[i].score - cmax))/c);
        }
        fprintf(stdout, "%s", textbuf);
        fflush(stdout);
      }
    } else if( u_options & (1<<U_OPTION_SCORES) ) {
      /* display normalized divergence scores */
      if( *textbuf ) {
        for(i = 0; i < cat_count; i++) {
          if( u_options & (1<<U_OPTION_VERBOSE) ) {
            fprintf(stdout, "%s %6.2" FMT_printf_score_t " * %-4.1f ",
                    cat[i].filename,
                    -nats2bits(sample_mean(cat[i].score, cat[i].complexity)),
                    cat[i].complexity);
          } else {
            fprintf(stdout, "%s %6.2" FMT_printf_score_t " ",
                    cat[i].filename, -nats2bits(cat[i].score));
          }
        }
        fprintf(stdout, "%s", textbuf);
        fflush(stdout);
      }
    } else {
      /* prune the text which doesn't fit */
      for(i = 0; i < filter_count; i++) {
        if( cat[map].score <= cat[filter[i]].score ) {
          fprintf(stdout, "%s", textbuf);
          fflush(stdout);
          break;
        }
      }
    }

  }
  /* clean up for next line */
  reset_all_scores();

  if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
    clear_empirical(&empirical);
  }
}
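
/* Note on the posterior computation above (the same pattern recurs in
   score_categories() below): subtracting cmax before exponentiating
   is the standard log-sum-exp trick.  Each term exp(score - cmax)
   lies in (0,1], so the sum c cannot overflow, while the printed
   ratio exp(score_i - cmax)/c is mathematically identical to
   exp(score_i) / sum_j exp(score_j).  For example, with log scores
   -1000 and -1005, exp(-1000) underflows a double to 0, but
   exp(0)/(exp(0) + exp(-5)) ~= 0.9933 is computed without trouble. */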

void score_categories() {
  bool_t no_title;
  category_count_t i, j, map;
  score_t c, cmax, lam;
  score_t sumdocs, sumfeats;
  bool_t hasnum;

  /* finish computing sample entropies */
  if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
    for(i = 0; i < cat_count; i++) {
      cat[i].score_shannon =
        -( sample_mean(cat[i].score_shannon, cat[i].complexity) -
           log((weight_t)cat[i].complexity) );
      cat[i].score_div =
        -( sample_mean(cat[i].score, cat[i].complexity) + cat[i].score_shannon);
    }
  }

  hasnum = 1;
  sumdocs = 0.0;
  sumfeats = 0.0;
  for(i = 0; i < cat_count; i++) {
    /* finish sample variance calculation */
    cat[i].score_s2 =
      sample_variance(cat[i].score_s2, cat[i].score, cat[i].complexity);
    /* compute some constants */
    hasnum = hasnum && (cat[i].model_num_docs > 0);
    sumdocs += cat[i].model_num_docs;
    sumfeats += cat[i].model_unique_token_count;
  }

  if( (u_options & (1<<U_OPTION_PRIOR_CORRECTION)) && hasnum ) {

    cmax = log(2.0*M_PI)/2.0;
    for(i = 0; i < cat_count; i++) {
      /* the prior is Poisson based on mean document length:
         -lambda + x * log(lambda) - log(x!), which using Stirling's
         approximation
         log(x!) = x*log(x) - x + 0.5*log(2*PI*x),
         becomes
         -lambda - 0.5*log(2*PI) + x * (1 + log(lambda/x)) - 0.5*log(x).
         The final -0.5*log(x) term is dropped below: x is the current
         document's complexity, which is typically the same for every
         category scored on the same document, so it doesn't affect
         the MAP comparison.
      */

      lam = (score_t)cat[i].model_full_token_count/cat[i].model_num_docs;
      cat[i].prior =
        -lam -cmax + cat[i].complexity * (1.0 + log(lam/cat[i].complexity));
      cat[i].score += cat[i].prior;
    }
  }
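
    /* Illustrative numbers for the prior above: if a category's
       training documents average lam = 500 tokens and the current
       document has complexity x = 400, then
           prior = -500 - 0.5*log(2*pi) + 400*(1 + log(500/400))
                 ~= -500 - 0.92 + 489.26 ~= -11.7 nats,
       a mild penalty for being shorter than this category usually
       sees; at exactly x = lam = 500 the penalty reduces to the
       constant -0.92, which is the same for every category. */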


  /* find MAP */
  cmax = cat[0].score;
  map = 0;
  for(i = 0; i < cat_count; i++) {
    if(cmax < cat[i].score) {
      cmax = cat[i].score;
      map = i;
    }
  }
  exit_code = (int)map;

  /* there are three cases: posterior, scores, nothing */

  if( u_options & (1<<U_OPTION_POSTERIOR) ) {

    if( u_options & (1<<U_OPTION_APPEND) ) {
      fprintf(stdout, "\n# probabilities ");
    }

    /* here we compute probabilities given exclusive choices */
    c = 0.0;
    for(i = 0; i < cat_count; i++) {
      c += exp((cat[i].score - cmax));
    }

    for(i = 0; i < cat_count; i++) {
      fprintf(stdout, "%s %5.2" FMT_printf_score_t "%% ",
              cat[i].filename, 100 * exp((cat[i].score - cmax))/c);
      if( u_options & (1<<U_OPTION_CONFIDENCE) ) {
        fprintf(stdout, "@ %5.1f%% ",
                (float)gamma_pvalue(&cat[i], cat[i].score_div)/10);
      }
    }
    fprintf(stdout, "\n");

  } else if( u_options & (1<<U_OPTION_SCORES) ) {

    if( u_options & (1<<U_OPTION_APPEND) ) {
      fprintf(stdout, "\n# scores ");
    }

    /* display logarithmic score */
    for(i = 0; i < cat_count; i++) {
      if( u_options & (1<<U_OPTION_VERBOSE) ) {
        if( u_options & (1<<U_OPTION_VAR) ) {
          fprintf(stdout, "%s ( %5.2" FMT_printf_score_t
                  " # %5.2" FMT_printf_score_t " )* %-.1f ",
                  cat[i].filename,
                  -nats2bits(sample_mean(cat[i].score, cat[i].complexity)),
                  nats2bits(sqrt(cat[i].score_s2/cat[i].complexity)),
                  cat[i].complexity);
        } else {
          fprintf(stdout, "%s %5.2" FMT_printf_score_t " * %-.1f ",
                  cat[i].filename,
                  -nats2bits(sample_mean(cat[i].score, cat[i].complexity)),
                  cat[i].complexity);
        }
        if( u_options & (1<<U_OPTION_CONFIDENCE) ) {
          fprintf(stdout, "@ %5.1f%% ",
                  (float)gamma_pvalue(&cat[i], cat[i].score_div)/10);
        }
      } else {
        fprintf(stdout, "%s %5.2" FMT_printf_score_t " ",
                cat[i].filename,
                -nats2bits(cat[i].score));
      }
    }
    fprintf(stdout, "\n");

    if( u_options & (1<<U_OPTION_APPEND) ) {
      no_title = (bool_t)1;
      for(i = 0; i < cat_count; i++) {
        if( cat[i].model_num_docs > 0 ) {
          if( no_title ) {
            fprintf(stdout, "# mean_complexity ");
            no_title = (bool_t)0;
          }
          fprintf(stdout, "%s %5.2" FMT_printf_score_t " ",
                  cat[i].filename,
                  (double)cat[i].model_full_token_count/cat[i].model_num_docs);
        }
      }
      if( !no_title ) { fprintf(stdout, "\n"); }
    }

  } else if( u_options & (1<<U_OPTION_MEDIACOUNTS) ) {
    for(i = 0; i < cat_count; i++) {
      if( (u_options & (1<<U_OPTION_VERBOSE)) ) {
        fprintf(stdout, "%s ( %5.2" FMT_printf_score_t " M ",
                cat[i].filename,
                -nats2bits(sample_mean(cat[i].score, cat[i].complexity)));
        for(j = 0; j < TOKEN_CLASS_MAX; j++) {
          fprintf(stdout, "%4" FMT_printf_integer_t " ", cat[i].mediacounts[j]);
        }
        fprintf(stdout, " )* %-.1f ",
                cat[i].complexity);
      } else {
        fprintf(stdout, "%s\tM ", cat[i].filename);
        for(j = 0; j < TOKEN_CLASS_MAX; j++) {
          fprintf(stdout, "%4" FMT_printf_integer_t " ", cat[i].mediacounts[j]);
        }
        fprintf(stdout, "\n");
      }
    }
  } else {

    if( u_options & (1<<U_OPTION_VERBOSE) ) {
      if( (u_options & (1<<U_OPTION_CONFIDENCE)) ||
          (u_options & (1<<U_OPTION_VAR)) ) {
        for(i = 0; i < cat_count; i++) {
          fprintf(stdout, "%s ( %5.2" FMT_printf_score_t " + "
                  "%-5.2" FMT_printf_score_t " ",
                  cat[i].filename,
                  nats2bits(cat[i].score_div),
                  nats2bits(cat[i].score_shannon));
          if( u_options & (1<<U_OPTION_VAR) ) {
            fprintf(stdout, "# %5.2" FMT_printf_score_t " ",
                    nats2bits(sqrt(cat[i].score_s2/cat[i].complexity)));
          }
          fprintf(stdout, ")* %-6.1f ", cat[i].complexity);
          if( u_options & (1<<U_OPTION_CONFIDENCE) ) {
            fprintf(stdout, "@ %5.1f%% ",
                    (float)gamma_pvalue(&cat[i], cat[i].score_div)/10);
          } else {
            /* percentage of tokens which were recognized */
            fprintf(stdout, "H %.f%% ",
                    (100.0 * (1.0 - cat[i].fmiss/((score_t)cat[i].fcomplexity))));
          }
        }
        fprintf(stdout, "\n");
      } else {
        if( u_options & (1<<U_OPTION_APPEND) ) {
          fprintf(stdout, "\n# category ");
        }
        fprintf(stdout, "%s\n", cat[exit_code].filename);
      }
    } else {
      if( u_options & (1<<U_OPTION_VAR) ) {
        if( u_options & (1<<U_OPTION_APPEND) ) {
          fprintf(stdout, "\n# category ");
        }

        cmax = calc_uncertainty(exit_code);

/*      cmax = 1.0; */
/*      for(i = 0; i < cat_count; i++) { */
/*        if( (int)i != exit_code ) { */
/*          /\* c is a standard normal variable *\/ */
/*          c = (-sample_mean(cat[i].score, cat[i].complexity) -  */
/*               -sample_mean(cat[exit_code].score, cat[exit_code].complexity)) / */
/*            sqrt(cat[i].score_s2/cat[i].complexity +  */
/*                 cat[exit_code].score_s2/cat[exit_code].complexity); */
/*          /\* c will always be positive, but let's be safe *\/ */
/*          if( isnan(c) ) { */
/*            c = 0.0; */
/*          } else if( c > 5.0 ) { */
/*            c = 1.0; */
/*          } else if( c < -5.0 ) { */
/*            c = 0.0; */
/*          } else { */
/*            /\* normal_cdf(c) has a range of 0.5 - 1.0, we convert to a full range *\/ */
/*            c = 2 * normal_cdf(c) - 1.0; */
/*          } */
/*          cmax = (cmax > c) ? c : cmax;  */
/*        } */
/*      } */
        fprintf(stdout, "%s # %d%%\n",
                cat[exit_code].filename, (int)ceil(cmax * 100.0));
      }
    }

  }

  exit_code++; /* make the exit code a number between 1 and cat_count */
}

void file_score_categories(char *name) {
  fprintf(stdout, "%s ", name);
  score_categories();
  /* clean up for next file */
  reset_all_scores();
  if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
    clear_empirical(&empirical);
  }
}

/***********************************************************
 * FILE MANAGEMENT FUNCTIONS                               *
 ***********************************************************/

bool_t check_magic_write(char *path, char *magic, size_t len) {
  FILE *output;
  char buf[MAGIC_BUFSIZE];

  output = fopen(path, "rb");
  if( output ) {
    /* output file exists already */
    if( (!feof(output) && !fgets(buf, MAGIC_BUFSIZE, output)) ||
        (strncmp(buf, magic, len) != 0) ) {
      errormsg(E_ERROR,"the file %s is already used for something, "
               "use another filename. Nothing written.\n", path);
      fclose(output);
      return (bool_t)0;
    } else {
      /* it's an existing category file */
      fclose(output);
    }
  }
  return (bool_t)1;
}

/* the standard tmpfile() call doesn't tell the filename,
   the standard mkstemp() is unreliable,
   and the moronic unix system doesn't remember filenames on open().
   1) remember to free the tmpname when done.
   2) the tmplate is appended with a pattern .tmp.xx,
   3) if you want a particular directory, prepend it to tmplate.
   4) file is opened for read/write, but truncated to zero.
   5) The limit of 100 tempfiles is hardcoded. If dbacl needs more than
      that then it's either a bug with dbacl or something wrong on your FS.
*/
/*@null@*/
FILE *mytmpfile(const char *tmplate, /*@out@*/ char **tmpname) {
  FILE *result = NULL;
  size_t l;
  int i, fd;

  l = strlen(tmplate);
  *tmpname = (char *)malloc(sizeof(char)*(l + 8));
  if( *tmpname ) {
    strcpy(*tmpname, tmplate);
#if defined ATOMIC_CATSAVE
    for(i = 0; i < 100; i++) {
      snprintf(*tmpname + l, 8, ".tmp.%d", i);
      /* this may have problems on NFS systems? */
      fd = ATOMIC_CREATE(*tmpname);
      if( fd != -1 ) {
        result = fdopen(fd, "w+b");
        /* no need to close fd */
        break;
      }
    }
#else
    snprintf(*tmpname + l, 8, ".tmp.%d", rand() % 100);
    result = fopen(*tmpname, "w+b");
#endif

    if( !result ) {
      free(*tmpname);
      *tmpname = NULL;
    }
  }
  return result;
}

bool_t myrename(const char *src, const char *dest) {
#if defined ATOMIC_CATSAVE
  /* the rename is atomic on posix */
  return (bool_t)(rename(src, dest) == 0);
#else
  return (bool_t)1; /* just pretend */
#endif
}
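
/* Typical usage of the two helpers above (an illustrative sketch,
   error handling mostly elided - this is the pattern save_learner()
   follows below):

     char *tmpname;
     FILE *f = mytmpfile(path, &tmpname);
     if( f ) {
       ... write the new contents to f ...
       fclose(f);
       if( !myrename(tmpname, path) ) { unlink(tmpname); }
       free(tmpname);
     }

   Readers of path then see either the old file or the complete new
   one, never a half-written state. */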

/* output -log (mediaprobs) */
void write_mediaprobs(FILE *out, learner_t *learner) {
  token_class_t t;

  fprintf(out, MAGIC11);
  for(t = 0; t < TOKEN_CLASS_MAX; t++) {
    fprintf(out, "%" FMT_printf_score_t " ", -log(learner->mediaprobs[t]));
  }
  fprintf(out, "\n");
}

bool_t write_category_headers(learner_t *learner, FILE *output) {
  regex_count_t c;
  char scratchbuf[MAGIC_BUFSIZE];
  char smb[MAX_SUBMATCH+1];
  token_order_t s;
  char *p;

  bool_t ok = (bool_t)1;

  /* print out standard category file headers */
  ok = ok &&
    (0 < fprintf(output, MAGIC1, learner->filename,
                 (m_options & (1<<M_OPTION_REFMODEL)) ? "(ref)" : ""));
  ok = ok &&
    (0 < fprintf(output,
                 MAGIC2_o, learner->divergence, learner->logZ,
                 (short int)learner->max_order,
                 (m_options & (1<<M_OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" ));
  ok = ok &&
    (0 < fprintf(output, MAGIC3,
                 (short int)learner->max_hash_bits,
                 (long int)learner->full_token_count,
                 (long int)learner->unique_token_count,
                 (long int)(learner->doc.count)));

  ok = ok &&
    (0 < fprintf(output, MAGIC8_o,
                 learner->shannon, learner->shannon2));

  ok = ok &&
    (0 < fprintf(output, MAGIC10_o,
                 learner->alpha, learner->beta,
                 learner->mu, learner->s2));

  write_mediaprobs(output, learner);

  /* print out any regexes we might need */
  for(c = 0; c < regex_count; c++) {
    /* write the bitmap */
    smb[0] = '\0';
    for(p = smb, s = 1; s <= MAX_SUBMATCH; s++) {
      if( re[c].submatches & (1<<s) ) {
        *p++ = (char)s + '0';
      }
    }
    *p = '\0';

    ok = ok &&
      (0 < fprintf(output, MAGIC5_o, re[c].string, smb));
  }

  /* print options */
  ok = ok &&
    (0 < fprintf(output, MAGIC4_o, m_options, m_cp, m_dt,
                 print_model_options(m_options, m_cp, scratchbuf)));

  ok = ok &&
    (0 < fprintf(output, MAGIC6));
  return ok;
}
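
/* Example of the submatch bitmap written above: if re[c].submatches
   has bits 1 and 3 set, i.e. (1<<1)|(1<<3), the loop emits the string
   "13" after the regex, recording which parenthesized subexpressions
   contribute features.  Submatch 0 (the whole match) is never
   written, since s starts at 1. */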

#if defined DIGITIZE_DIGRAMS
typedef digitized_weight_t myweight_t;
#else
typedef weight_t myweight_t;
#endif

/* writes the learner to a file for easily readable category */
/* the category file is first constructed as a temporary file,
   then renamed if no problems occurred. Because renames are
   atomic on POSIX, this guarantees that a loaded category file
   used for classifications is never internally corrupt */
error_code_t save_learner(learner_t *learner, char *opath) {

  alphabet_size_t i, j;
  hash_count_t t;
  FILE *output;
  char *tempname = NULL;

  bool_t ok;
  size_t n;
  c_item_t ci;
  c_item_t *ci_ptr;
  myweight_t shval;
  myweight_t *shval_ptr = NULL;

  long mmap_offset = 0;
  size_t mmap_length = 0;
  byte_t *mmap_start = NULL;

  if( u_options & (1<<U_OPTION_VERBOSE) ) {
    fprintf(stdout, "saving category to file %s\n", learner->filename);
  }

  /* don't overwrite data files */
  if( !check_magic_write(learner->filename, MAGIC1, 10) ) {
    exit(1);
  }

  /* In case we have both the -m and -o switches we try to write the
     data with mmap. We don't do this in general, because mmap can
     cause corruption, but if -m and -o are used together, we assume the
     user knows that a single process must read/write the file at a time.
     Also, we don't try to create the file - if the file doesn't exist,
     we won't gain much time by using mmap on that single occasion. */
  if( opath && *opath && (u_options & (1<<U_OPTION_MMAP)) ) {
    ok = (bool_t)0;
    output = fopen(learner->filename, "r+b");
    if( output ) {
      ok = (bool_t)1;
      if( out_iobuf ) {
        setvbuf(output, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize));
      }

      ok = ok && write_category_headers(learner, output);
      if( !ok ) {
        goto skip_mmap;
      }
      /* now mmap the file and write out the arrays real quick */
      mmap_offset = ftell(output);
      if( mmap_offset == (long)-1 ) {
        ok = 0;
        goto skip_mmap;
      }
      mmap_length = (size_t)mmap_offset + (ASIZE * ASIZE * SIZEOF_DIGRAMS) +
        learner->max_tokens * sizeof(c_item_t);

      ok = ok && (-1 != ftruncate(fileno(output), (off_t)mmap_length));
      if( !ok ) {
        goto skip_mmap;
      }

      mmap_start = (byte_t *)MMAP(0, mmap_length,
                                  PROT_READ|PROT_WRITE, MAP_SHARED, fileno(output), 0);
      if( mmap_start == MAP_FAILED ) { mmap_start = NULL; }
      if( !mmap_start ) {
        ok = 0;
        goto skip_mmap;
      }

      MLOCK(mmap_start, mmap_length);

      MADVISE(mmap_start, mmap_length, MADV_SEQUENTIAL|MADV_WILLNEED);

      /* character frequencies */
      shval_ptr = (myweight_t *)(mmap_start + mmap_offset);
      for(i = 0; i < ASIZE; i++) {
        for(j = 0; j < ASIZE; j++) {
          *shval_ptr = HTON_DIGRAM(PACK_DIGRAMS(learner->dig[i][j]));
          shval_ptr++;
        }
      }

      /* token/feature weights */
      ci_ptr = (c_item_t *)(mmap_start + mmap_offset +
                            (ASIZE * ASIZE * SIZEOF_DIGRAMS));
      for(t = 0; t < learner->max_tokens; t++) {
        /* write each element so that it's easy to read back in a
           c_item_t array */
        SET(ci_ptr->id,learner->hash[t].id);
        ci_ptr->lam = learner->hash[t].lam;

        ci_ptr->id = HTON_ID(ci_ptr->id);
        ci_ptr->lam = HTON_LAMBDA(ci_ptr->lam);
        ci_ptr++;
      }

    skip_mmap:
      fclose(output);

      if( mmap_start ) {
        MUNMAP(mmap_start, mmap_length);
        mmap_start = NULL;
      }
    }

    if( ok ) {
      return 1; /* we're done */
    } else {
      errormsg(E_WARNING, "could not mmap %s, trying stdio.\n",
               learner->filename);
      /* uh oh, the file may be corrupt. We delete it and try again
         below without mmap */
      unlink(learner->filename);
    }
  }

  /* this keeps track to see if writing is successful,
     it's not foolproof, but probably good enough */
  ok = (bool_t)1;

  output = mytmpfile(learner->filename, &tempname);
  if( output ) {

    if( out_iobuf ) {
      setvbuf(output, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize));
    }

    ok = ok && write_category_headers(learner, output);

    /* end of readable stuff */
    if( ok ) {
      /* now write arrays: this is far(!) from efficient, but the bits
         we need are not nicely tucked into a block of memory, and due
         to the sizes involved when dealing with possibly millions of tokens, we
         can't afford to malloc a brand new block just to pack everything
         nicely. At least, the file buffering should mitigate the performance
         hit, and we prefer to learn slowly and classify fast. */

      /* character frequencies */
      for(i = 0; i < ASIZE; i++) {
        for(j = 0; j < ASIZE; j++) {
          shval = HTON_DIGRAM(PACK_DIGRAMS(learner->dig[i][j]));
          /* fwrite() returns the number of items written as a size_t,
             which is never negative; a count below 1 means the write
             failed */
          if( (n = fwrite(&shval, SIZEOF_DIGRAMS, (size_t)1, output)) < 1 ) {
            ok = 0;
            goto skip_remaining;
          }
        }
      }

      MADVISE(learner->hash, sizeof(l_item_t) * learner->max_tokens,
              MADV_SEQUENTIAL|MADV_WILLNEED);

      /* token/feature weights */
      for(t = 0; t < learner->max_tokens; t++) {
        /* write each element so that it's easy to read back in a c_item_t array */
        SET(ci.id,learner->hash[t].id);
        ci.lam = learner->hash[t].lam;

        ci.id = HTON_ID(ci.id);
        ci.lam = HTON_LAMBDA(ci.lam);

        if( (n = fwrite(&ci, sizeof(ci), (size_t)1, output)) < 1 ) {
          ok = 0;
          goto skip_remaining;
        }
      }
    }

  skip_remaining:

    fclose(output);

    /* the rename is atomic on posix */
    if( !ok || !myrename(tempname, learner->filename) ) {
      errormsg(E_ERROR,
               "due to a potential file corruption, category %s was not updated\n",
               learner->filename);
      unlink(tempname);
      cleanup.tempfile = NULL;
    }
    free(tempname);
  } else {
    errormsg(E_ERROR, "cannot open tempfile for writing %s\n", learner->filename);
    return 0;
  }

  return 1;
}
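
/* For reference, the category file layout produced above (the mmap
   and stdio paths write the same bytes):

     plain text headers   MAGIC1 ... MAGIC6, via write_category_headers()
     digram table         ASIZE * ASIZE packed weights, SIZEOF_DIGRAMS each
     token weights        max_tokens records of c_item_t {id, lam}

   All binary fields pass through HTON_* conversions, so the on-disk
   format does not depend on the host's byte order. */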

/* Instead of saving a full new category file, we try to
 * overwrite the contents of an already open, memory mapped category.
 * CAREFUL: THIS CAN CORRUPT THE CATEGORY!
 * If the return value is 0, then you should assume the category no
 * longer exists on disk.
 */
error_code_t fast_partial_save_learner(learner_t *learner, category_t *xcat) {
  char *p, *q;
#define REPLBUF (3*128)
  char buf[REPLBUF];
  charbuf_len_t n, max;

  alphabet_size_t i, j;
  hash_count_t t;
  c_item_t *ci_ptr;
  myweight_t *shval_ptr;

  if( xcat->mmap_start &&
      (xcat->model.options == learner->model.options) &&
      (xcat->max_order == learner->max_order) &&
      (xcat->max_hash_bits == learner->max_hash_bits) ) {
    max = 0;
    buf[0] = '\0';
    /* we only overwrite the header if there's exactly enough space */
    /* we must overwrite four lines which all start with # */
    q = p = strchr((char *)xcat->mmap_start + 1, '#');
    if( p ) {
      n = snprintf(buf, (size_t)(REPLBUF - max), MAGIC2_o,
                   learner->divergence, learner->logZ,
                   (short int)learner->max_order,
                   (m_options & (1<<M_OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" );
      max += n;
    }
    q = strchr(q + 1, '#');
    if( q && (max < REPLBUF) ) {
      n = snprintf(buf + max, (size_t)(REPLBUF - max), MAGIC3,
                   (short int)learner->max_hash_bits,
                   (long int)learner->full_token_count,
                   (long int)learner->unique_token_count,
                   (long int)(learner->doc.count));
      max += n;
    }
    q = strchr(q + 1, '#');
    if( q && (max < REPLBUF) ) {
      n = snprintf(buf + max, (size_t)(REPLBUF - max), MAGIC8_o,
                   learner->shannon, learner->shannon2);
      max += n;
    }
    q = strchr(q + 1, '#');
    if( q && (max < REPLBUF) ) {
      n = snprintf(buf + max, (size_t)(REPLBUF - max), MAGIC10_o,
                   learner->alpha, learner->beta,
                   learner->mu, learner->s2);
      max += n;
    }
    /* now check it fits into the existing space */
    q = strchr(q + 1, '#');
    if( (q - p) == max ) {
      /* from here on, there's no turning back. Since we don't
         overwrite everything, if there's a problem, we'll have to
         unlink the category */

      /* header */
      memcpy(p, buf, (size_t)max);

      /* character frequencies */
      shval_ptr = (myweight_t *)(xcat->mmap_start + xcat->mmap_offset -
                                 ASIZE * ASIZE * sizeof(myweight_t));
      for(i = 0; i < ASIZE; i++) {
        for(j = 0; j < ASIZE; j++) {
          *shval_ptr = HTON_DIGRAM(PACK_DIGRAMS(learner->dig[i][j]));
          shval_ptr++;
        }
      }
      /* token/feature weights
         (we assume cat->max_tokens <= learner->max_tokens, and
         that filled hash values in cat are also filled in learner) */
      ci_ptr = (c_item_t *)(xcat->mmap_start + xcat->mmap_offset);
      for(t = 0; t < learner->max_tokens; t++) {
        /* write each element so that it's easy to read back in a
           c_item_t array */
        SET(ci_ptr->id,learner->hash[t].id);
        ci_ptr->lam = learner->hash[t].lam;

        ci_ptr->id = HTON_ID(ci_ptr->id);
        ci_ptr->lam = HTON_LAMBDA(ci_ptr->lam);
        ci_ptr++;
      }

      if( u_options & (1<<U_OPTION_VERBOSE) ) {
        fprintf(stdout, "partially overwriting category file %s\n",
                learner->filename);
      }
      return 1;
    }
  }
  if( u_options & (1<<U_OPTION_VERBOSE) ) {
    fprintf(stdout, "can't make fast save\n");
  }
  return 0;
}

bool_t tmp_seek_start(learner_t *learner) {
  if( learner->tmp.mmap_start ) {
    learner->tmp.mmap_cursor = learner->tmp.mmap_offset;
    return (bool_t)1;
  } else if( learner->tmp.file ) {
    clearerr(learner->tmp.file);
    return (fseek(learner->tmp.file, learner->tmp.offset, SEEK_SET) == 0);
  }
  return (bool_t)0;
}

bool_t tmp_seek_end(learner_t *learner) {
  if( learner->tmp.mmap_start ) {
    learner->tmp.mmap_cursor = learner->tmp.mmap_offset + learner->tmp.used;
    return (bool_t)1;
  } else if( learner->tmp.file ) {
    return (fseek(learner->tmp.file,
                  learner->tmp.offset + learner->tmp.used, SEEK_SET) == 0);
  }
  return (bool_t)0;
}

long tmp_get_pos(learner_t *learner) {
  if( learner->tmp.mmap_start ) {
    return learner->tmp.mmap_cursor - learner->tmp.mmap_offset;
  } else if( learner->tmp.file ) {
    return ftell(learner->tmp.file) - learner->tmp.offset;
  }
  return 0L;
}

size_t tmp_read_block(learner_t *learner, byte_t *buf, size_t bufsiz,
                      const byte_t **startp) {
  long left = learner->tmp.used - tmp_get_pos(learner);
  if( bufsiz > (size_t)left ) { bufsiz = (left >= 0) ? (size_t)left : 0; }
  if( learner->tmp.mmap_start ) {
    /* memcpy(buf, learner->tmp.mmap_start + learner->tmp.mmap_cursor, bufsiz); */
    *startp = learner->tmp.mmap_start + learner->tmp.mmap_cursor;
    learner->tmp.mmap_cursor += bufsiz;
    return bufsiz;
  } else if( learner->tmp.file ) {
    if( ferror(learner->tmp.file) || feof(learner->tmp.file) ) {
      return 0;
    }
    *startp = buf;
    return fread(buf, 1, bufsiz, learner->tmp.file);
  }
  return 0;
}
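
/* Note the contract of tmp_read_block(): the caller always supplies
   buf, but *startp only points at buf when the stdio path is taken.
   In the mmap case nothing is copied at all - *startp points straight
   into the mapped window and buf is untouched.  Callers must
   therefore always read through *startp, never through buf. */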

/* must unmap/ftruncate/remap if using mmap, don't touch mmap_cursor */
bool_t tmp_grow(learner_t *learner) {
  long offset;
  if( learner->tmp.file &&
      ((learner->tmp.used + 2 * MAX_TOKEN_LEN) >= learner->tmp.avail) ) {

    if( learner->tmp.mmap_start ) {
      MUNLOCK(learner->tmp.mmap_start, learner->tmp.mmap_length);
      MUNMAP(learner->tmp.mmap_start, learner->tmp.mmap_length);
    }

    learner->tmp.avail += TOKEN_LIST_GROW;

    if( -1 == ftruncate(fileno(learner->tmp.file),
                        learner->tmp.offset + learner->tmp.avail) ) {
      errormsg(E_FATAL,
               "could not resize the tempfile, unable to proceed.\n");
    }

    /* mmap_start is still the old value, but invalid */
    if( learner->tmp.mmap_start ) {
      offset = PAGEALIGN(learner->tmp.offset);
      learner->tmp.mmap_length = learner->tmp.avail + system_pagesize;
      learner->tmp.mmap_start =
        (byte_t *)MMAP(learner->tmp.mmap_start, learner->tmp.mmap_length,
                       PROT_READ|PROT_WRITE, MAP_SHARED,
                       fileno(learner->tmp.file), offset);
      if( learner->tmp.mmap_start == MAP_FAILED ) { learner->tmp.mmap_start = NULL; }
      if( !learner->tmp.mmap_start ) {
        if( u_options & (1<<U_OPTION_VERBOSE) ) {
          errormsg(E_WARNING, "could not mmap token file after resize\n");
        }
        /* this isn't fatal, we just switch to ordinary file ops, since
           the file is still open :) */
        if( -1 == fseek(learner->tmp.file,learner->tmp.offset +
                        (learner->tmp.mmap_cursor - learner->tmp.mmap_offset),
                        SEEK_SET) ) {
          /* ok, this is fatal */
          errormsg(E_FATAL,
                   "could not resize and seek in tempfile, unable to proceed.\n");
        }
        learner->tmp.mmap_length = 0;
        learner->tmp.mmap_offset = 0;
        learner->tmp.mmap_cursor = 0;
      } else {
        /* all is well, relock the pages */
        MLOCK(learner->tmp.mmap_start, learner->tmp.mmap_length);
        MADVISE(learner->tmp.mmap_start, learner->tmp.mmap_length,
                MADV_SEQUENTIAL|MADV_WILLNEED);
      }
      /* if there was a fatal error, we simply don't return */
      return (bool_t)1;
    }
  }
  return (bool_t)0;
}

/* assume we are at the end of the token list and there is enough room */
void tmp_write_token(learner_t *learner, const char *tok) {
  byte_t *p;
  if( learner->tmp.mmap_start ) {
    p = learner->tmp.mmap_start + learner->tmp.mmap_cursor;
    while( *tok ) { *p++ = *tok++; }
    *p++ = TOKENSEP;
    learner->tmp.mmap_cursor = p - learner->tmp.mmap_start;
    learner->tmp.used = learner->tmp.mmap_cursor - learner->tmp.mmap_offset;
  } else if( learner->tmp.file ) {
    /* now save token to temporary file */
    if( (EOF == fputs(tok, learner->tmp.file)) ||
        (EOF == fputc(TOKENSEP, learner->tmp.file)) ) {
      errormsg(E_ERROR,
               "could not write a token to tempfile, calculations will be skewed.\n");
    } else {
      learner->tmp.used += (strlen(tok) + 1);
    }
  }
}
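
/* The temporary token list written above is simply the raw tokens
   separated by single TOKENSEP bytes:

     tok1 TOKENSEP tok2 TOKENSEP tok3 TOKENSEP ...

   Both branches account strlen(tok)+1 bytes per token in tmp.used,
   so tmp.used always equals the byte length of this list. */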

void tmp_close(learner_t *learner) {
  if( learner->tmp.mmap_start ) {
    MUNLOCK(learner->tmp.mmap_start, learner->tmp.mmap_length);
    MUNMAP(learner->tmp.mmap_start, learner->tmp.mmap_length);
    learner->tmp.mmap_start = NULL;
  }
  if( learner->tmp.file ) {
    fclose(learner->tmp.file);
    learner->tmp.file = NULL;
    if( learner->tmp.filename ) {
      unlink(learner->tmp.filename);
      cleanup.tempfile = NULL;
      free(learner->tmp.filename);
      learner->tmp.filename = NULL;
    }
  }
}

/***********************************************************
 * EMPLIST FUNCTIONS                                       *
 ***********************************************************/
bool_t emplist_grow(emplist_t *emp) {
  hash_value_t *n;

  if( emp ) {
    if( u_options & (1<<U_OPTION_VERBOSE) ) {
      errormsg(E_WARNING, "growing emp.stack %" FMT_printf_integer_t "\n",
               emp->max * 2);
    }

    /* the stack holds hash_value_t elements, so the reallocation is
       sized by sizeof(hash_value_t), not by a pointer size */
    n = (hash_value_t *)realloc(emp->stack,
                                sizeof(hash_value_t) * emp->max * 2);
    if( n != NULL ) {
      emp->max *= 2;
      emp->stack = n;
      return 1;
    }
  }
  errormsg(E_WARNING,
           "disabling emp.stack. Not enough memory? I couldn't allocate %li bytes\n",
           (emp ? (long int)(sizeof(hash_value_t) * emp->max * 2) : 0L));
  m_options &= ~(1<<M_OPTION_CALCENTROPY);

  return 0;
}

void emplist_replace(emplist_t *reservoir, document_count_t n, emplist_t *emp) {
  if( reservoir[n].stack && (emp->top > reservoir[n].max) ) {
    free(reservoir[n].stack);
    reservoir[n].stack = NULL;
  }
  if( !reservoir[n].stack ) {
    for(reservoir[n].max = 1; emp->top > reservoir[n].max;
        reservoir[n].max *= 2);
    reservoir[n].stack = (hash_value_t *)malloc(reservoir[n].max * sizeof(hash_value_t));
  }
  if( reservoir[n].stack ) {
    memcpy(reservoir[n].stack, emp->stack, emp->top * sizeof(hash_value_t));
    reservoir[n].top = emp->top;
    reservoir[n].shannon = emp->shannon;
  } else {
    errormsg(E_WARNING,
             "couldn't replace document #%" FMT_printf_integer_t " in reservoir, ignoring\n", n);
  }
}

/*
 * Adds emp to the reservoir in such a way that we
 * end up with a uniform random sample of size RESERVOIR_SIZE.
 * This algorithm is a short form of reservoir sampling, and we use
 * it here because we don't know the total number of documents
 * until it's too late.
 */
void emplist_add_to_reservoir(emplist_t *reservoir, document_count_t docno,
                              emplist_t *emp) {
  double xi;
  if( docno < RESERVOIR_SIZE ) {
    emplist_replace(reservoir, docno, emp);
  } else {
    xi = ((double)rand()/(RAND_MAX + 1.0));
    if( (docno + 1) * xi < RESERVOIR_SIZE ) {
      xi = ((double)rand()/(RAND_MAX + 1.0));
      emplist_replace(reservoir, (int)(RESERVOIR_SIZE * xi), emp);
    }
  }
}
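
/* Why this yields a uniform sample: the first RESERVOIR_SIZE
   documents fill the reservoir directly.  Document number docno
   (0-based) is then accepted with probability
       P(accept) = RESERVOIR_SIZE / (docno + 1),
   via the test (docno + 1) * xi < RESERVOIR_SIZE with xi uniform on
   [0,1), and evicts a uniformly chosen slot.  By induction, after n
   documents every one of them sits in the reservoir with the same
   probability RESERVOIR_SIZE/n. */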

/***********************************************************
 * LEARNER FUNCTIONS                                       *
 ***********************************************************/
void free_learner_hash(learner_t *learner) {
  if( learner->hash ) {
    if( learner->mmap_start != NULL ) {
      /* the mapping was created over the learner hash in
         create_learner_hash(), so its length is based on l_item_t
         and the learner's own token count */
      MUNMAP(learner->mmap_start,
             learner->mmap_hash_offset + learner->max_tokens * sizeof(l_item_t));
      learner->mmap_start = NULL;
      learner->mmap_learner_offset = 0;
      learner->mmap_hash_offset = 0;
      learner->hash = NULL;
    }
    if( learner->hash ) {
      free(learner->hash);
      learner->hash = NULL;
    }
  }
}

bool_t create_learner_hash(learner_t *learner, FILE *input, bool_t readonly) {
  size_t j, n;
  byte_t *mmap_start;
  long mmap_learner_offset;
  long mmap_hash_offset;

  if( learner->hash ) { free_learner_hash(learner); }

  if( u_options & (1<<U_OPTION_MMAP) ) {
    /* here we mmap the fixed size portion of the dump file. First
       we mmap the learner structure portion as a fast way of loading
       the learner variable, then we must undo/redo the mmap because
       we cannot know the hash table size until we've read the dumped learner_t.
    */

    mmap_learner_offset = strlen(MAGIC_ONLINE);
    mmap_hash_offset = mmap_learner_offset + sizeof(learner_t);

    mmap_start =
      (byte_t *)MMAP(0, mmap_hash_offset,
                     PROT_READ|(readonly ? 0 : PROT_WRITE),
                     MAP_SHARED, fileno(input), 0);
    if( mmap_start == MAP_FAILED ) { mmap_start = NULL; }
    if( mmap_start ) {
      /* first we overwrite the learner struct with the contents of
         the mmapped region */
      memcpy(learner, mmap_start + mmap_learner_offset, sizeof(learner_t));

      MUNMAP(mmap_start, mmap_hash_offset);

      mmap_start =
        (byte_t *)MMAP(mmap_start,
                       mmap_hash_offset + sizeof(l_item_t) * learner->max_tokens,
                       PROT_READ|(readonly ? 0 : PROT_WRITE),
                       MAP_SHARED, fileno(input), 0);
      if( mmap_start == MAP_FAILED ) { mmap_start = NULL; }
      if( mmap_start ) {
        /* now fill some member variables */
        learner->mmap_start = mmap_start;
        learner->mmap_learner_offset = mmap_learner_offset;
        learner->mmap_hash_offset = mmap_hash_offset;
        learner->hash = (l_item_t *)(mmap_start + mmap_hash_offset);
        MADVISE(learner->mmap_start,
                learner->mmap_hash_offset + sizeof(l_item_t) * learner->max_tokens,
                MADV_SEQUENTIAL|MADV_WILLNEED);
        /* lock the pages to prevent swapping - on Linux, this
           works without root privs so long as the user limits
           are big enough - mine are unlimited ;-)
           On other OSes, root may be necessary. */
        MLOCK(learner->mmap_start,
              learner->mmap_hash_offset + sizeof(l_item_t) * learner->max_tokens);
      }
    }
  }

  if( !learner->hash ) {

    for(n = 0; n < 1; ) {
      n = fread(learner, sizeof(learner_t), 1, input);
      /* a truncated dump sets eof without the error flag, so test
         both, otherwise this loop would never terminate */
      if( (n == 0) && (ferror(input) || feof(input)) ) {
        return 0;
      }
    }

    /* allocate hash table normally */
    learner->hash = (l_item_t *)malloc(learner->max_tokens * sizeof(l_item_t));
    if( !learner->hash ) {
      errormsg(E_WARNING, "failed to allocate %li bytes for learner hash.\n",
               (sizeof(l_item_t) * ((long int)learner->max_tokens)));
      /* this error is not fatal, because we have another opportunity to
         allocate the hash later */
      return 0;
    }

    MADVISE(learner->hash, sizeof(l_item_t) * learner->max_tokens,
            MADV_SEQUENTIAL);

    for(n = j = 0; n < learner->max_tokens; n += j) {
      j = fread(&learner->hash[n], sizeof(l_item_t),
                learner->max_tokens - n, input);
      if( (j == 0) && (ferror(input) || feof(input)) ) {
        errormsg(E_WARNING, "could not read online learner dump, ignoring.\n");
        return 0;
      }
    }

  }
  return (learner->hash != NULL);
}

/* returns true if the hash could be grown, false otherwise.
   When the hash is grown, the old values must be redistributed.
   To do this, we reuse the l_item.lam member as a flag, since this
   is only needed later during minimization */
bool_t grow_learner_hash(learner_t *learner) {
  hash_count_t c, new_size;
  l_item_t *i, temp_item;

  if( !(u_options & (1<<U_OPTION_GROWHASH)) ) {
    return 0;
  } else {
    if( learner->max_hash_bits < default_max_grow_hash_bits ) {

      /*
         this warning is mostly useless and distracting

         errormsg(E_WARNING,
         "growing hash table, next time perhaps consider using -h %d\n",
         learner.max_hash_bits + 1);
      */

      /* there are two cases - if learner is mmapped, we must malloc a
       * new hash, otherwise we can realloc. An easy way to reduce the
       * first case to the second is to create a throwaway hash with
       * the old size and copy the mmapped contents to it.
       */
      if( learner->mmap_start ) {
        if( (i = (l_item_t *)malloc(learner->max_tokens *
                                    sizeof(l_item_t))) == NULL ) {
          errormsg(E_WARNING,
                   "failed to malloc hash table for growing.\n");
          return 0;
        }
        memcpy(i, learner->hash, sizeof(l_item_t) * learner->max_tokens);
        free_learner_hash(learner);
        learner->hash = i;

        /* there's a MUNLOCK call below, so in theory we should
           balance this out by calling MLOCK here, since the i pointed
           region was never locked. However mlock/munlock calls do not nest,
           so there's no need.
        */
/*      if( u_options & (1<<U_OPTION_MMAP) ) { */
/*        MLOCK(learner->hash, sizeof(l_item_t) * learner->max_tokens); */
/*      } */
      }

      if( u_options & (1<<U_OPTION_MMAP) ) {
        MUNLOCK(learner->hash, sizeof(l_item_t) * learner->max_tokens);
      }

      /* grow the memory around the hash */
      if( (i = (l_item_t *)realloc(learner->hash,
                                   sizeof(l_item_t) *
                                   (1<<(learner->max_hash_bits+1)))) == NULL ) {
        errormsg(E_WARNING,
                 "failed to grow hash table.\n");
        return 0;
      }
      /* we need the old value of learner->max_tokens for a while yet */
      learner->max_hash_bits++;

      if( u_options & (1<<U_OPTION_MMAP) ) {
        MLOCK(i, sizeof(l_item_t) * (1<<learner->max_hash_bits));
      }

      MADVISE(i, sizeof(l_item_t) * (1<<learner->max_hash_bits), MADV_SEQUENTIAL);

      /* realloc doesn't initialize the memory */
      memset(&((l_item_t *)i)[learner->max_tokens], 0,
             ((1<<learner->max_hash_bits) - learner->max_tokens) *
             sizeof(l_item_t));
      learner->hash = i;

      /* now mark every used slot */
      for(c = 0; c < learner->max_tokens; c++) {
        if( FILLEDP(&learner->hash[c]) ) {
          SETMARK(&learner->hash[c]);
        }
      }

      MADVISE(i, sizeof(l_item_t) * (1<<learner->max_hash_bits), MADV_RANDOM);

      /* now relocate each marked slot and clear it */
      new_size = (1<<learner->max_hash_bits) - 1;
      for(c = 0; c < learner->max_tokens; c++) {
        while( MARKEDP(&learner->hash[c]) ) {
          /* find where it belongs */
          i = &learner->hash[learner->hash[c].id & new_size];
          while( FILLEDP(i) && !MARKEDP(i) ) {
            i++;
            i = (i > &learner->hash[new_size]) ? learner->hash : i;
          } /* guaranteed to exit since hash is larger than original */

          /* clear the mark - this must happen after we look for i,
             since it should be possible to find i == &learner->hash[c] */
          UNSETMARK(&learner->hash[c]);

          /* now refile */
          if( i != &learner->hash[c] ) {
            if( MARKEDP(i) ) {
              /* swap */
              memcpy(&temp_item, i, sizeof(l_item_t));
              memcpy(i, &learner->hash[c], sizeof(l_item_t));
              memcpy(&learner->hash[c], &temp_item, sizeof(l_item_t));
            } else {
              /* copy and clear */
              memcpy(i, &learner->hash[c], sizeof(l_item_t));
              memset(&learner->hash[c], 0, sizeof(l_item_t));
            }
          }
          /* now &learner->hash[c] is marked iff there was a swap */
        }
      }
      learner->max_tokens = (1<<learner->max_hash_bits);
    } else {
      u_options &= ~(1<<U_OPTION_GROWHASH); /* it's the law */
      errormsg(E_WARNING,
               "the token hash table is nearly full, slowing down.\n");
    }
  }
  return 1;
}
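
/* Small example of the mark-and-relocate rehash above: suppose the
   table grows from 4 to 8 slots (new_size mask 7) and slot 2 holds a
   marked item whose id now hashes to slot 6.  The scan at c == 2
   finds the item marked, probes from slot 6, unmarks slot 2, and
   since slot 6 is empty moves the item there with the copy-and-clear
   branch.  If slot 6 instead held another not-yet-processed marked
   item, the swap branch exchanges the two, slot 2 stays marked, and
   the while loop immediately reprocesses it - so no marked item is
   ever lost. */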

/* This code malloc()s and fread()s the learner structure. */
bool_t read_online_learner_struct(learner_t *learner, char *path, bool_t readonly) {
  FILE *input;
  char buf[BUFSIZ+1]; /* must be greater than MAGIC_BUFSIZE */
  bool_t ok = 0;
  char *sav_filename;
  long offset;
  l_item_t *p, *e;

  /*
   * This code malloc()s and fread()s the learner structure, or alternatively
   * it mmap()s it for speed. Note we open for read/write, because the
   * file might be left open and reused later.
   */
  input = fopen(path, readonly ? "rb" : "r+b");
  if( input ) {

/*     if( out_iobuf ) { */
/*       setvbuf(input, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize)); */
/*     } */

    buf[0] = '\0'; /* keep the strncmp tests safe even if fgets fails */
    if( !fgets(buf, MAGIC_BUFSIZE, input) ||
        (strncmp(buf, MAGIC_ONLINE, strlen(MAGIC_ONLINE)) != 0) ) {
      if( strncmp(buf, MAGIC_ONLINE, 35) == 0 ) {
        errormsg(E_WARNING,
                 "the file %s has the wrong version number, it will be ignored\n",
                 path);
      } else {
        errormsg(E_WARNING,
                 "the file %s is not a dbacl online memory dump, it will be ignored\n",
                 path);
      }
    } else {
      /* from here on, if a problem arises then learner is hosed, so we exit */
      ok = 1;

      /* must save some prefilled members because learner is read from disk */
      sav_filename = learner->filename;

      if( !create_learner_hash(learner, input, readonly) ) {
        ok = 0;
        goto skip_read_online;
      }
      /* restore members */
      learner->filename = sav_filename;

      /* override options */
      if( (m_options != learner->model.options) ||
          (zthreshold != learner->model.tmin) ||
          (m_cp != learner->model.cp) ||
          (m_dt != learner->model.dt) ) {
        /* we don't warn about changes in u_options, as they can happen
           during computations */
        errormsg(E_WARNING,
                 "some command line options were changed when loading %s\n",
                 path);
        m_options = learner->model.options;
        m_cp = learner->model.cp;
        m_dt = learner->model.dt;
/*      u_options = learner->u_options; */
        zthreshold = learner->model.tmin;
      }

      /* last, there is a long list of token strings; we compute the
         start offset, and seek to the end so we can append more
         tokens */
      learner->tmp.file = input;
      learner->tmp.filename = NULL;
      learner->tmp.offset = strlen(MAGIC_ONLINE) + sizeof(learner_t) +
        sizeof(l_item_t) * learner->max_tokens;
      learner->tmp.iobuf = NULL;

      if( u_options & (1<<U_OPTION_MMAP) ) {
        offset = PAGEALIGN(learner->tmp.offset);
        /* since offset can be up to a page less than the true beginning,
           we must allow an extra page in the mmap length */
        learner->tmp.mmap_length = learner->tmp.avail + system_pagesize;
        learner->tmp.mmap_start =
          (byte_t *)MMAP(0, learner->tmp.mmap_length,
                         PROT_READ|(readonly ? 0 : PROT_WRITE), MAP_SHARED,
                         fileno(learner->tmp.file), offset);
        if( learner->tmp.mmap_start == MAP_FAILED ) { learner->tmp.mmap_start = NULL; }
        if( learner->tmp.mmap_start ) {
          MLOCK(learner->tmp.mmap_start, learner->tmp.mmap_length);
          MADVISE(learner->tmp.mmap_start, learner->tmp.mmap_length,
                  MADV_SEQUENTIAL|MADV_WILLNEED);
          learner->tmp.mmap_offset = learner->tmp.offset - offset;
          learner->tmp.mmap_cursor = 0;
        } else {
          learner->tmp.mmap_length = 0;
          if( u_options & (1<<U_OPTION_VERBOSE) ) {
            errormsg(E_WARNING, "couldn't mmap temporary tokens\n");
          }
        }
      }

      if( !tmp_seek_end(learner) ) {
        ok = 0;
        learner->tmp.file = NULL;
        learner->tmp.filename = NULL;
        goto skip_read_online;
      }
      /* consistency check */
      if( tmp_get_pos(learner) != learner->tmp.used ) {
        ok = 0;
        learner->tmp.file = NULL;
        learner->tmp.filename = NULL;
        goto skip_read_online;
      }
/*       printf("tmp.used = %ld tmp.avail = %ld tmp.offset = %ld\n", */
/*              learner->tmp.used, learner->tmp.avail, learner->tmp.offset); */

      /* last but not least, the hash may contain junk because it was
         mmapped, so clear some data */
      e = learner->hash + learner->max_tokens;
      for(p = learner->hash; p < e; p++) {
        if( FILLEDP(p) ) {
          p->tmp.read.eff = 0;
        }
      }

    }
  skip_read_online:
    /* if we're ok, we must leave the file open, even if it's mmapped,
       because we must be able to unmap/remap it on the fly */
    if( !ok ) {
      if( learner->tmp.mmap_start ) {
        MUNMAP(learner->tmp.mmap_start, learner->tmp.mmap_length);
        learner->tmp.mmap_start = NULL;
        learner->tmp.mmap_length = 0;
        learner->tmp.mmap_offset = 0;
        learner->tmp.mmap_cursor = 0;
      }
      fclose(input);
      errormsg(E_FATAL,
               "the file %s could not be read, learning from scratch.\n",
               path);
    }
  }

  if( u_options & (1<<U_OPTION_VERBOSE) ) {
    fprintf(stdout, "%s %s\n", ok ? "loaded" : "couldn't load", path);
  }

  return ok;
}
1617 
1618 /* this is a straight memory dump */
1619 void write_online_learner_struct(learner_t *learner, char *path) {
1620   FILE *output;
1621   char *tempname = NULL;
1622   bool_t ok;
1623   byte_t buf[BUFSIZ+1];
1624   size_t t, j, n;
1625   learner_t *mml;
1626   long tokoff;
1627   const byte_t *sp;
1628 
1629   if( !check_magic_write(path, MAGIC_ONLINE, strlen(MAGIC_ONLINE)) ) {
1630     /* we simply ignore this. Note that check_magic_write() already
1631        notifies the user */
1632     return;
1633   }
1634 
1635 
1636   /* if the hash table is memory mapped, then we simply update the
1637      learner structure and exit */
1638 
1639   if( learner->mmap_start != NULL ) {
1640     mml = (learner_t *)(learner->mmap_start + learner->mmap_learner_offset);
1641     memcpy(mml, learner, sizeof(learner_t));
1642     /* clear some variables just to be safe */
1643     mml->mmap_start = NULL;
1644     mml->mmap_learner_offset = 0;
1645     mml->mmap_hash_offset = 0;
1646     mml->hash = NULL;
1647 
1648     return;
1649   }
1650 
1651 
1652   /* if we're here, then we must write out a completely new memory dump,
1653    * and the mmap_ variables are all zero. */
1654 
1655   output = mytmpfile(path, &tempname);
1656   if( output ) {
1657 
1658     if( out_iobuf ) {
1659       setvbuf(output, (char *)out_iobuf, (int)_IOFBF, (size_t)(BUFFER_MAG * system_pagesize));
1660     }
1661 
1662     ok = 1;
1663     ok = ok &&
1664       (0 < fprintf(output, MAGIC_ONLINE));
1665 
1666     /* save current model options - don't want them to change next time we learn */
1667     learner->model.options = m_options;
1668     learner->model.cp = m_cp;
1669     learner->model.dt = m_dt;
1670     learner->u_options = u_options;
1671 
1672     /* make sure some stuff is zeroed out */
1673     /* but leave others untouched, eg doc.A, doc.S, doc.count for shannon */
1674     learner->mmap_start = NULL; /* assert mmap_start == NULL */
1675     learner->mmap_learner_offset = 0;
1676     learner->mmap_hash_offset = 0;
1677     learner->doc.emp.top = 0;
1678     learner->doc.emp.max = 0;
1679     learner->doc.emp.stack = NULL;
1680     memset(learner->doc.reservoir, 0, RESERVOIR_SIZE * sizeof(emplist_t));
1681 
1682     /* write learner */
1683     ok = ok &&
1684       (0 < fwrite(learner, sizeof(learner_t), 1, output));
1685     for(t = j = 0; t < learner->max_tokens; t += j) {
1686       j = fwrite(&(learner->hash[t]), sizeof(l_item_t),
1687 		 learner->max_tokens - t, output);
1688       if( (j == 0) && ferror(output) ) {
1689 	ok = 0;
1690 	goto skip_write_online;
1691       }
1692     }
1693 
1694     tokoff = ftell(output);
1695     /* extend the size of the file - this is needed mostly for mmapping,
1696      but it might also marginally speed up the repeated writes below */
1697     if( -1 == ftruncate(fileno(output), tokoff + learner->tmp.avail) ) {
1698       ok = 0;
1699       goto skip_write_online;
1700     }
1701 
1702     /* write temporary tokens */
1703     if( !tmp_seek_start(learner) ) {
1704       errormsg(E_ERROR, "cannot seek in online token file %s\n", tempname);
1705       ok = 0;
1706       goto skip_write_online;
1707     }
1708     n = 0;
1709     while( (n = tmp_read_block(learner, buf, BUFSIZ, &sp)) > 0 ) {
1710       for(t = j = 0; t < n; t += j) {
1711 	j = fwrite(sp + t, 1, n - t, output);
1712 	if( (j == 0) && ferror(output) ) {
1713 	  ok = 0;
1714 	  goto skip_write_online;
1715 	}
1716       }
1717     }
1718     /* consistency check */
1719     if( (tokoff + learner->tmp.used) != ftell(output) ) {
1720       ok = 0;
1721       goto skip_write_online;
1722     }
1723 
1724   skip_write_online:
1725     fclose(output);
1726 
1727     if( u_options & (1<<U_OPTION_VERBOSE) ) {
1728       fprintf(stdout, "writing online memory dump: %s\n", ok ? "success" : "failed");
1729     }
1730 
1731     /* the rename is atomic on posix */
1732     if( !ok || (rename(tempname, path) < 0) ) {
1733       errormsg(E_ERROR,
1734 	       "due to a potential file corruption, %s was not updated\n",
1735 	       path);
1736       unlink(tempname);
1737     }
1738 
1739     free(tempname);
1740   }
1741 }
1742 
1743 bool_t merge_temp_tokens(learner_t *dest, learner_t *src) {
1744   hash_value_t id;
1745   byte_t buf[BUFSIZ+1];
1746   char tok[(MAX_TOKEN_LEN+1)*MAX_SUBMATCH+EXTRA_TOKEN_LEN];
1747   size_t n = 0;
1748 
1749   const byte_t *p;
1750   char *q;
1751 
1752   l_item_t *k;
1753 
1754   if( !tmp_seek_start(src) ) {
1755     errormsg(E_ERROR, "cannot seek in temporary token file [%s]\n",
1756 	     src->filename);
1757     return 0;
1758   }
1759   if( !tmp_seek_end(dest) ) {
1760     errormsg(E_ERROR, "cannot seek in temporary token file [%s]\n",
1761 	     dest->filename);
1762     return 0;
1763   }
1764 
1765   q = tok;
1766   while( (n = tmp_read_block(src, buf, BUFSIZ, &p)) > 0 ) {
1767     /*       p = buf; */
1768     /*       p[n] = '\0'; */
1769     while( n-- > 0 ) {
1770       if( *p != TOKENSEP) {
1771 	*q++ = *p; /* copy into tok */
1772       } else { /* interword space */
1773 	*q = 0; /* append NUL to tok */
1774 	/* now lookup weight in hash */
1775 	id = hash_full_token(tok);
1776 	k = find_in_learner(dest, id);
1777 	if( !k || !FILLEDP(k) ) {
1778 	  tmp_grow(dest); /* just in case we're full */
1779 	  tmp_write_token(dest, tok);
1780 	}
1781 	q = tok; /* reset q */
1782       }
1783       p++;
1784     }
1785   }
1786 
1787 
1788   return 1;
1789 }
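
/* Illustration (not part of the build): the loop above splits a
 * TOKENSEP-delimited byte stream into NUL-terminated tokens before
 * looking each one up.  A minimal standalone sketch of the same
 * parsing pattern, using a hypothetical separator and plain stdio
 * instead of the tmp_* machinery:
 */
#if 0
#include <stdio.h>

#define MY_TOKENSEP ' '   /* stand-in for the real TOKENSEP byte */

static void split_tokens(const char *stream, size_t n) {
  char tok[256];          /* the real code sizes this for MAX_TOKEN_LEN */
  char *q = tok;
  while( n-- > 0 ) {
    if( *stream == MY_TOKENSEP ) {
      *q = '\0';                             /* terminate one token */
      printf("token: [%s]\n", tok);
      q = tok;                               /* reset for the next one */
    } else if( q < tok + sizeof(tok) - 1 ) {
      *q++ = *stream;                        /* copy into tok */
    }
    stream++;
  }
}
/* split_tokens("foo bar baz ", 12) prints foo, bar, baz in turn */
#endif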
1790 
1791 bool_t merge_hashes(learner_t *dest, learner_t *src) {
1792   register l_item_t *i, *j, *e;
1793   alphabet_size_t ci, cj;
1794 
1795   e = src->hash + src->max_tokens;
1796   for(j = src->hash ; j != e; j++) {
1797     if( FILLEDP(j) ) {
1798       i = find_in_learner(dest, j->id);
1799       if( i && !FILLEDP(i) &&
1800 	((100 * dest->unique_token_count) >=
1801 	 (HASH_FULL * dest->max_tokens)) && grow_learner_hash(dest) ) {
1802 	i = find_in_learner(dest,j->id);
1803 	/* new i, go through all tests again */
1804       }
1805       if( i ) {
1806 	if( FILLEDP(i) ) {
1807 
1808 	  /* nothing */
1809 
1810 	} else if( /* !FILLEDP(i) && */
1811 		  ((100 * dest->unique_token_count) <
1812 		   (HASH_FULL * dest->max_tokens) ) ) {
1813 
1814 	  SET(i->id, j->id);
1815 
1816 	  INCREMENT(dest->unique_token_count,
1817 		    K_TOKEN_COUNT_MAX, overflow_warning);
1818 
1819 	  i->typ = j->typ;
1820 
1821 	  /* order accounting */
1822 	  dest->max_order = MAXIMUM(dest->max_order,i->typ.order);
1823 
1824 	  INCREMENT(dest->fixed_order_unique_token_count[i->typ.order],
1825 		    K_TOKEN_COUNT_MAX, overflow_warning);
1826 
1827 	}
1828 
1829 	INCREASE(i->count, j->count,
1830 		 K_TOKEN_COUNT_MAX, overflow_warning);
1831 
1832 	dest->tmax = MAXIMUM(dest->tmax, i->count);
1833 
1834 	INCREASE(dest->full_token_count, j->count,
1835 		 K_TOKEN_COUNT_MAX, overflow_warning);
1836 
1837 	INCREASE(dest->fixed_order_token_count[i->typ.order], j->count,
1838 		  K_TOKEN_COUNT_MAX, skewed_constraints_warning);
1839 
1840       }
1841     }
1842   }
1843 
1844   for(ci = 0; ci < ASIZE; ci++) {
1845     for(cj = 0; cj < ASIZE; cj++) {
1846       dest->dig[ci][cj] += src->dig[ci][cj];
1847     }
1848   }
1849 
1850   return 1;
1851 }
1852 
1853 
1854 bool_t merge_learner_struct(learner_t *learner, char *path) {
1855   learner_t dummy;
1856   bool_t ok = 0;
1857   options_t m_sav, u_sav;
1858 
1859   m_sav = m_options;
1860   u_sav = u_options;
1861 
1862   init_learner(&dummy, path, 1);
1863 
1864   m_options = m_sav;
1865   u_options = u_sav;
1866 
1867   if( dummy.retype != 0 ) {
1868     errormsg(E_WARNING,
1869 	     "merging regular expressions is not supported, ignoring %s\n",
1870 	     path);
1871     ok = 0;
1872   } else {
1873     /* in case these changed while initing dummy */
1874     learner->model.options = m_options;
1875     learner->model.cp = m_cp;
1876     learner->model.dt = m_dt;
1877 
1878     /* _before_ we fill the hash, we merge the temp tokens (we only
1879        add those tokens not found in learner->hash) */
1880     ok =
1881       merge_temp_tokens(learner, &dummy) &&
1882       merge_hashes(learner, &dummy);
1883 
1884     if( ok ) {
1885       learner->doc.A += dummy.doc.A;
1886       learner->doc.S += dummy.doc.S;
1887       learner->doc.count += dummy.doc.count;
1888       learner->doc.nullcount += dummy.doc.nullcount;
1889       /* can't use these when merging */
1890       learner->alpha = 0.0;
1891       learner->beta = 0.0;
1892       learner->mu = 0.0;
1893       learner->s2 = 0.0;
1894     }
1895   }
1896 
1897   free_learner(&dummy);
1898 
1899   if( !ok ) {
1900     errormsg(E_WARNING,
1901 	     "%s could not be merged, the learned category might be corrupt\n",
1902 	     path);
1903   }
1904 
1905   return ok;
1906 }
1907 
1908 /* returns an approximate binomial r.v.; if np < 10 and n > 20,
1909  * a Poisson approximation is used, else the variable is exact.
1910  */
1911 token_count_t binomial(token_count_t n, double p) {
1912   double sum, thresh;
1913   token_count_t i, x;
1914 
1915   x = 0;
1916   if( (n > 20) && (n * p < 10) ) {
1917     /* approximate poisson has lambda = np */
1918     sum = -log( ((double)rand())/(RAND_MAX + 1.0) );
1919     thresh = n * p;
1920     /* printf("poisson with sum = %g thresh = %g\n", sum, thresh);       */
1921     while( (sum < thresh) && (x < n) ) {
1922       x++;
1923       sum += -log( ((double)rand())/(RAND_MAX + 1.0) );
1924       /* printf("poisson with sum = %g thresh = %g, x = %d\n", sum, thresh, x); */
1925     }
1926   } else {
1927     /* direct calculation of binomial */
1928     for(i = 0; i < n; i++) {
1929       x += (int)(p + ((double)rand())/(RAND_MAX + 1.0));
1930     }
1931   }
1932 /*   if( x > 0 ) { printf("binomial x = %d, n = %d, p = %g, np = %g\n", x, n, p, n * p); } */
1933   return x;
1934 }
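
/* Usage sketch (illustrative only): assuming rand() was seeded
 * elsewhere with srand(), a call such as
 *
 *   token_count_t kept = binomial(1000, 0.25);  (about 250 on average)
 *
 * draws one Binomial(n, p) variate.  For n > 20 and np < 10 the
 * Poisson(np) branch above needs only about np calls to rand()
 * instead of the n calls used by the exact sum of Bernoulli(p) draws.
 */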
1935 
1936 void update_shannon_partials(learner_t *learner, bool_t fulldoc) {
1937   hash_count_t i;
1938   weight_t ell, lell;
1939   bool_t docount;
1940 
1941   docount = (learner->doc.emp.top > 0);
1942   /* if fulldoc is true, we have a document, else we check emp.top to
1943      see if this is a nonempty document */
1944   if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
1945 
1946     learner->doc.emp.shannon = 0;
1947     if( learner->doc.emp.top > 0 ) {
1948       /* assume all marks are zero to start with */
1949       for(i = 0; i < learner->doc.emp.top; i++) {
1950 	l_item_t *p = find_in_learner(learner, learner->doc.emp.stack[i]);
1951 	if( p && !MARKEDP(p) ) {
1952 	  ell = ((weight_t)p->tmp.read.eff)/learner->doc.emp.top;
1953 
1954 	  /* it would be nice to be able to digitize p->B, but ell is
1955 	     often smaller than the smallest value, and if there are
1956 	     many documents, there could simultaneously be overflow on the most
1957 	     frequent features. So we need p->B to be a floating point type. */
1958 	  p->B += ell;
1959 
1960 	  /* the standard entropy convention is that 0*inf = 0. here, if
1961 	   * ell is so small that log(ell) is infinite, we pretend ell
1962 	   * == 0 this is slightly better than testing for ell == 0
1963 	   * directly.
1964 	   */
1965 
1966 	  lell = -log(ell);
1967 	  if( !isinf(lell) ) {
1968 	    learner->doc.emp.shannon += ell * lell;
1969 	  }
1970 
1971 	  SETMARK(p);
1972 	}
1973       }
1974 
1975       if( u_options & (1<<U_OPTION_CONFIDENCE) ) {
1976 	emplist_add_to_reservoir(learner->doc.reservoir,
1977 				 learner->doc.count,
1978 				 &learner->doc.emp);
1979       }
1980 
1981 /*       fprintf(stderr, "udate_shannon A +(%f) S +(%f)\n",  */
1982 /* 	      -learner->doc.emp.shannon, (learner->doc.emp.shannon * learner->doc.emp.shannon)); */
1983       learner->doc.A += -learner->doc.emp.shannon;
1984       learner->doc.S += (learner->doc.emp.shannon * learner->doc.emp.shannon);
1985 
1986       /* clear the empirical counts and marks */
1987       for(i = 0; i < learner->doc.emp.top; i++) {
1988 	l_item_t *p = find_in_learner(learner, learner->doc.emp.stack[i]);
1989 	if( p && MARKEDP(p) ) {
1990 	  p->tmp.read.eff = 0;
1991 	  UNSETMARK(p);
1992 	}
1993       }
1994 
1995       /* now reset the empirical features */
1996       learner->doc.emp.top = 0;
1997 
1998     }
1999   }
2000   if( docount ) {
2001     learner->doc.count++;
2002   }
2003 }
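
/* Note on the running sums: writing H_d for the empirical entropy
 * (doc.emp.shannon) of document d, the code above accumulates
 *     doc.A = sum_d (-H_d)      and      doc.S = sum_d (H_d)^2,
 * so calc_shannon() can later recover, over N = doc.count documents,
 *     mean entropy    = -doc.A / N
 *     sample variance = (N * doc.S - doc.A^2) / (N * (N - 1)).
 */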
2004 
2005 void calc_shannon(learner_t *learner) {
2006   l_item_t *i, *e;
2007   document_count_t c, n;
2008   hash_count_t q;
2009   emplist_t *empl;
2010   score_t score;
2011   score_t effective_count = 0.0;
2012   double Lambda, jensen, s2;
2013 
2014 
2015   learner->mu = 0.0;
2016   learner->s2 = 0.0;
2017   learner->alpha = 0.0;
2018   learner->beta = 0.0;
2019 
2020   effective_count = learner->doc.count; /* cast to real value for readability */
2021 
2022   if( (m_options & (1<<M_OPTION_CALCENTROPY)) &&
2023       (learner->doc.count > 1) ) {
2024     learner->shannon += -(learner->doc.A/effective_count);
2025     learner->shannon2 =
2026       (effective_count * learner->doc.S - (learner->doc.A * learner->doc.A)) /
2027       (effective_count * (effective_count - 1.0));
2028 
2029 /*     fprintf(stderr, "[%f %f %f %f]\n", effective_count * learner->doc.S, (learner->doc.A * learner->doc.A), (effective_count * learner->doc.S - (learner->doc.A * learner->doc.A)), (effective_count * (effective_count - 1.0)) ); */
2030 /*     fprintf(stderr, "calc_entropy eff %f sha %f sha2 %f A %f S %f\n", effective_count, learner->shannon, learner->shannon2, learner->doc.A , learner->doc.S); */
2031 
2032   } else { /* no individual documents, so compute global entropy */
2033 
2034     learner->shannon = 0.0;
2035     learner->shannon2 = 0.0;
2036 
2037     e = learner->hash + learner->max_tokens;
2038     for(i = learner->hash; i != e; i++) {
2039       if( FILLEDP(i) &&
2040 	  (i->typ.order == learner->max_order) ) {
2041 	learner->shannon += log((weight_t)i->count) * (weight_t)i->count;
2042       }
2043     }
2044     learner->shannon =
2045       -( learner->shannon/learner->full_token_count -
2046 	 log((weight_t)learner->full_token_count) );
2047   }
2048 
2049   if( (u_options & (1<<U_OPTION_CONFIDENCE)) &&
2050       (learner->doc.count > 0) ) {
2051 
2052     /* shannon was computed during update */
2053     jensen = 0.0;
2054     e = learner->hash + learner->max_tokens;
2055     for(i = learner->hash; i != e; i++) {
2056       if( NOTNULL(i->lam) ) {
2057 	Lambda = UNPACK_LAMBDA(i->lam);
2058 	if( i->typ.order == 1 ) {
2059 	  Lambda += UNPACK_RWEIGHTS(i->tmp.min.dref) - learner->logZ;
2060 	}
2061 	learner->mu += (Lambda * i->B);
2062 	jensen += (Lambda * Lambda * i->B);
2063       }
2064     }
2065 
2066     learner->mu /= effective_count;
2067     learner->mu = -(learner->shannon + learner->mu);
2068 
2069     s2 = 0.0;
2070     n = 1;
2071     for(c = 0; c < RESERVOIR_SIZE; c++) {
2072       empl = &(learner->doc.reservoir[c]);
2073       if( !empl->stack  ) {
2074 	n++;
2075 	continue;
2076       } else {
2077 	score = 0;
2078 	for(q = 0; q < empl->top; q++) {
2079 	  i = find_in_learner(learner, empl->stack[q]);
2080 	  if( i && NOTNULL(i->lam) ) {
2081 	    Lambda = UNPACK_LAMBDA(i->lam);
2082 	    if( i->typ.order == 1 ) {
2083 	      Lambda += UNPACK_RWEIGHTS(i->tmp.min.dref) - learner->logZ;
2084 	    }
2085 	    score += Lambda;
2086 	  }
2087 	}
2088 
2089 	score = score/empl->top + empl->shannon;
2090 	if( u_options & (1<<U_OPTION_VERBOSE) &&
2091 	    u_options & (1<<U_OPTION_CONFIDENCE) ) {
2092 	  fprintf(stdout,
2093 		  "%s ( %5.2" FMT_printf_score_t " + "
2094 		  "%-5.2" FMT_printf_score_t " )* %-" FMT_printf_integer_t " \n",
2095 		  "reservoir",
2096 		  -nats2bits(score),
2097 		  nats2bits(empl->shannon),
2098 		  empl->top);
2099 	}
2100 
2101 	score = -score - learner->mu;
2102 	s2 += (score)*(score);
2103       }
2104     }
2105 
2106     /* the reservoir doesn't contain enough extreme samples, so
2107        the variance is necessarily underestimated.
2108     */
2109     if( (c <= n) || isinf(s2) ) {
2110       learner->s2 = 0.0;
2111       learner->alpha = 0.0;
2112       learner->beta = 0.0;
2113     } else {
2114       learner->s2 = s2/(c-n);
2115       learner->alpha = (learner->mu*learner->mu)/learner->s2;
2116       learner->beta = learner->s2/learner->mu;
2117     }
2118 
2119 
2120   } else {
2121     learner->alpha = 0.0;
2122     learner->beta = 0.0;
2123     learner->mu = 0.0;
2124     learner->s2 = 0.0;
2125   }
2126 
2127   if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
2128     clear_empirical(&empirical);
2129   }
2130 
2131   if( u_options & (1<<U_OPTION_VERBOSE) ) {
2132     fprintf(stdout,
2133 	    "documents %ld shannon %" FMT_printf_score_t
2134 	    " mu %" FMT_printf_score_t
2135 	    " s2 %" FMT_printf_score_t
2136 	    " alpha %" FMT_printf_score_t
2137 	    " beta %" FMT_printf_score_t "\n",
2138 	    (long int)effective_count,
2139 	    nats2bits(learner->shannon),
2140 	    nats2bits(learner->mu),
2141 	    nats2bits(learner->s2),
2142 	    nats2bits(learner->alpha),
2143 	    nats2bits(learner->beta));
2144   }
2145 
2146 }
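
/* Note: the alpha/beta computed above amount to a moment-matched
 * Gamma(alpha, beta) fit to the score distribution.  A Gamma variate
 * has mean alpha*beta and variance alpha*beta^2, so solving
 *     alpha * beta   = mu
 *     alpha * beta^2 = s2
 * gives alpha = mu^2/s2 and beta = s2/mu, which is exactly what the
 * confidence branch assigns.
 */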
2147 
2148 void reset_mbox_messages(learner_t *learner, MBOX_State *mbox) {
2149   not_header = 1;
2150   learner->doc.count = 0;
2151   learner->doc.nullcount = 0; /* first header doesn't count */
2152   learner->doc.skip = 0;
2153 }
2154 
2155 void count_mbox_messages(learner_t *learner, Mstate mbox_state, char *textbuf) {
2156 
2157   if( !textbuf ) {
2158     update_shannon_partials(learner, 0);
2159     /* don't count document */
2160   } else {
2161     switch(mbox_state) {
2162     case msHEADER:
2163       if(not_header) {
2164 	if( u_options & (1<<U_OPTION_DECIMATE) ) {
2165 	  learner->doc.skip = ( rand() > (int)(RAND_MAX>>decimation) );
2166 	}
2167 	if( !learner->doc.skip ) {
2168 	  if( m_options & (1<<M_OPTION_CALCENTROPY) &&
2169 	      (learner->doc.emp.top == 0) ) {
2170 	    learner->doc.nullcount++;
2171 	  }
2172 	  update_shannon_partials(learner, 1);
2173 	}
2174       }
2175       not_header = 0;
2176       break;
2177     default:
2178       not_header = 1;
2179       break;
2180     }
2181   }
2182 
2183 }
2184 
2185 
2186 l_item_t *find_in_learner(learner_t *learner, hash_value_t id) {
2187     register l_item_t *i, *loop;
2188     /* start at id */
2189     i = loop = &learner->hash[id & (learner->max_tokens - 1)];
2190 
2191     while( FILLEDP(i) ) {
2192 	if( EQUALP(i->id,id) ) {
2193 	    return i; /* found id */
2194 	} else {
2195 	    i++; /* not found */
2196 	    /* wrap around */
2197 	    i = (i >= &learner->hash[learner->max_tokens]) ? learner->hash : i;
2198 	    if( i == loop ) {
2199 		return NULL; /* when hash table is full */
2200 	    }
2201 	}
2202     }
2203 
2204     /* empty slot, so not found */
2205 
2206     return i;
2207 }
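
/* Illustration (not part of the build): find_in_learner() is open
 * addressing with linear probing over a power-of-two table, which is
 * why the start slot can be computed as id & (max_tokens - 1).  The
 * same lookup scheme with hypothetical slot/table types:
 */
#if 0
typedef struct { unsigned long id; int used; } slot_t;

/* returns the matching slot, the empty slot where id belongs,
   or NULL if the table is full and id is absent */
static slot_t *probe(slot_t *table, unsigned long size /* power of 2 */,
                     unsigned long id) {
  slot_t *i = &table[id & (size - 1)];
  slot_t *loop = i;
  while( i->used ) {
    if( i->id == id ) { return i; }          /* found */
    if( ++i >= table + size ) { i = table; } /* wrap around */
    if( i == loop ) { return NULL; }         /* table full */
  }
  return i; /* empty slot: id not present */
}
#endif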
2208 
2209 
2210 /* places the token in the global hash and writes the
2211    token to a temporary file for later, then updates
2212    the digram frequencies. Tokens have the format
2213    DIAMOND t1 DIAMOND t2 ... tn DIAMOND CLASSEP class NUL */
2214 void hash_word_and_learn(learner_t *learner,
2215 			 char *tok, token_type_t tt, regex_count_t re) {
2216   hash_value_t id;
2217   l_item_t *i;
2218   char *s;
2219   alphabet_size_t p,q,len;
2220 
2221   for(s = tok; s && *s == DIAMOND; s++);
2222   if( s && (*s != EOTOKEN) ) {
2223 
2224     if( overflow_warning ) {
2225       return; /* for there be troubles ahead */
2226     }
2227 
2228     if( m_options & (1<<M_OPTION_MULTINOMIAL) ) { tt.order = 1; }
2229 
2230     if( u_options & (1<<U_OPTION_DECIMATE) ) {
2231       if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
2232 	if( learner->doc.skip ) {
2233 	  return;
2234 	}
2235       } else {
2236 	if( rand() > (int)(RAND_MAX>>decimation) ) {
2237 	  return;
2238 	}
2239       }
2240     }
2241 
2242     id = hash_full_token(tok);
2243     i = find_in_learner(learner, id);
2244 
2245     if( i && !FILLEDP(i) &&
2246 	((100 * learner->unique_token_count) >=
2247 	 (HASH_FULL * learner->max_tokens)) && grow_learner_hash(learner) ) {
2248       i = find_in_learner(learner,id);
2249       /* new i, go through all tests again */
2250     }
2251 
2252     if( i ) {
2253 
2254       if( FILLEDP(i) ) {
2255 
2256 	/* redundant :-) */
2257 
2258       } else if( /* !FILLEDP(i) && */
2259 		((100 * learner->unique_token_count) <
2260 		 (HASH_FULL * learner->max_tokens) ) ) {
2261 
2262 	/* fill the hash and write to file */
2263 
2264 	SET(i->id,id);
2265 
2266 	INCREMENT(learner->unique_token_count,
2267 		  K_TOKEN_COUNT_MAX, overflow_warning);
2268 
2269 	i->typ = tt;
2270 
2271 	/* order accounting */
2272 	learner->max_order = MAXIMUM(learner->max_order,i->typ.order);
2273 
2274 	INCREMENT(learner->fixed_order_unique_token_count[i->typ.order],
2275 		  K_TOKEN_COUNT_MAX, overflow_warning);
2276 
2277      	if( (u_options & (1<<U_OPTION_DEBUG)) ) {
2278      	  fprintf(stdout, "match %8lx ", (long unsigned int)i->id);
2279 	  print_token(stdout, tok);
2280 	  fprintf(stdout, " %d\n", i->typ.order);
2281      	}
2282 
2283 	tmp_grow(learner); /* just in case we're full */
2284 	tmp_write_token(learner, tok);
2285       }
2286 
2287       INCREMENT(i->count, K_TOKEN_COUNT_MAX, overflow_warning);
2288       learner->tmax = MAXIMUM(learner->tmax, i->count);
2289 
2290       if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
2291 	if( (learner->doc.emp.top < learner->doc.emp.max) ||
2292 	    emplist_grow(&learner->doc.emp) ) {
2293 	  i->tmp.read.eff++; /* this can never overflow before i->count */
2294 	  UNSETMARK(i); /* needed for calculating shannon */
2295 	  learner->doc.emp.stack[learner->doc.emp.top++] = i->id;
2296 	}
2297       }
2298 
2299       INCREMENT(learner->full_token_count,
2300 		K_TOKEN_COUNT_MAX, overflow_warning);
2301 
2302       INCREMENT(learner->fixed_order_token_count[i->typ.order],
2303 		K_TOKEN_COUNT_MAX, skewed_constraints_warning);
2304 
2305     }
2306 
2307     if( digramic_overflow_warning ) {
2308       return;
2309     }
2310 
2311 
2312     /* now update digram frequency counts */
2313     if( tt.order == 1 ) { /* count each token only once */
2314       p = *tok++;
2315       CLIP_ALPHABET(p);
2316       len = 1;
2317       /* finally update character frequencies */
2318       while( *tok != EOTOKEN ) {
2319 	q = (unsigned char)*tok;
2320 	CLIP_ALPHABET(q);
2321 	if( learner->dig[p][q] < K_DIGRAM_COUNT_MAX )
2322 	  { learner->dig[p][q]++; } else { digramic_overflow_warning = 1; }
2323 	if( learner->dig[RESERVED_MARGINAL][q] < K_DIGRAM_COUNT_MAX )
2324 	  { learner->dig[RESERVED_MARGINAL][q]++; } /* no warning on overflow */
2325 
2326 	p = q;
2327 	tok++;
2328 	if( q != DIAMOND ) { len++; }
2329       }
2330       learner->dig[RESERVED_TOKLEN][len]++;
2331 
2332       if( digramic_overflow_warning ) {
2333 	errormsg(E_WARNING,
2334 		"ran out of integers (too much data), "
2335 		"reference measure may be skewed.\n");
2336 	m_options |= (1<<M_OPTION_WARNING_BAD);
2337       }
2338     }
2339 
2340     if( overflow_warning ) {
2341       errormsg(E_WARNING,
2342 	      "ran out of integers (too much data), "
2343 	      "results may be skewed.\n");
2344       m_options |= (1<<M_OPTION_WARNING_BAD);
2345     }
2346   }
2347 }
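
/* Note on decimation: rand() > (RAND_MAX >> decimation) holds with
 * probability about 1 - 2^-decimation, so the checks above keep
 * roughly one token (or one mbox message) in 2^decimation; e.g.
 * decimation == 3 learns from about 1/8 of the input.
 */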
2348 
2349 /* initialize global learner object */
2350 void init_learner(learner_t *learner, char *opath, bool_t readonly) {
2351   alphabet_size_t i, j;
2352   regex_count_t c;
2353   token_order_t z;
2354 
2355   /* some constants */
2356   learner->max_tokens = default_max_tokens;
2357   learner->max_hash_bits = default_max_hash_bits;
2358   learner->full_token_count = 0;
2359   learner->unique_token_count = 0;
2360   learner->tmax = 0;
2361   learner->logZ = 0.0;
2362   learner->shannon = 0.0;
2363   learner->shannon2 = 0.0;
2364   learner->alpha = 0.0;
2365   learner->beta = 0.0;
2366   learner->mu = 0.0;
2367   learner->s2 = 0.0;
2368   learner->max_order = 0;
2369   memset(learner->mediaprobs, 0 , TOKEN_CLASS_MAX * sizeof(score_t));
2370 
2371   learner->doc.A = 0.0;
2372   learner->doc.S = 0.0;
2373   learner->doc.count = 0;
2374   learner->doc.nullcount = 0;
2375   learner->doc.emp.top = 0;
2376   learner->doc.emp.max = 0;
2377   learner->doc.emp.stack = NULL;
2378   memset(learner->doc.reservoir, 0, RESERVOIR_SIZE * sizeof(emplist_t));
2379 
2380   learner->model.options = m_options;
2381   learner->model.cp = m_cp;
2382   learner->model.dt = m_dt;
2383   learner->model.tmin = zthreshold;
2384   learner->u_options = u_options;
2385 
2386   learner->mmap_start = NULL;
2387   learner->mmap_learner_offset = 0;
2388   learner->mmap_hash_offset = 0;
2389   learner->hash = NULL;
2390 
2391   /* init character frequencies */
2392   for(i = 0; i < ASIZE; i++) {
2393     for(j = 0; j < ASIZE; j++) {
2394       learner->dig[i][j] = 0.0;
2395     }
2396   }
2397 
2398   for(c = 0; c < MAX_RE + 1; c++) {
2399     learner->regex_token_count[c] = 0;
2400   }
2401 
2402   for(z = 0; z < MAX_SUBMATCH; z++) {
2403     learner->fixed_order_token_count[z] = 0;
2404     learner->fixed_order_unique_token_count[z] = 0;
2405   }
2406 
2407   if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
2408     reset_mbox_messages(learner, &mbox);
2409   }
2410 
2411   if( !(opath && *opath &&
2412 	read_online_learner_struct(learner, opath, readonly)) ) {
2413     /* if we're here, we must do what read_online_... didn't do */
2414 
2415     /* allocate and zero room for hash */
2416     learner->hash = (l_item_t *)calloc(learner->max_tokens, sizeof(l_item_t));
2417     if( !learner->hash ) {
2418       errormsg(E_FATAL,
2419 	       "not enough memory? I couldn't allocate %li bytes\n",
2420 	       (sizeof(l_item_t) * ((long int)learner->max_tokens)));
2421     }
2422 
2423     /* open temporary file for writing tokens */
2424     learner->tmp.file =
2425       mytmpfile(learner->filename ? learner->filename : progname,
2426 		&learner->tmp.filename);
2427     /* set this: in case of a fatal signal, we can unlink the tempfile */
2428     cleanup.tempfile = learner->tmp.filename;
2429 
2430     learner->tmp.iobuf = NULL;   /* can't reuse in_iobuf or out_iobuf */
2431     learner->tmp.offset = 0;
2432     learner->tmp.avail = 0;
2433     learner->tmp.used = 0;
2434     learner->tmp.mmap_start = NULL;
2435     learner->tmp.mmap_offset = 0;
2436     learner->tmp.mmap_length = 0;
2437     learner->tmp.mmap_cursor = 0;
2438 
2439     if( learner->tmp.file ) {
2440 #if defined HAVE_POSIX_MEMALIGN
2441       /* buffer must exist until after fclose() */
2442       if( 0 != posix_memalign(&learner->tmp.iobuf, system_pagesize,
2443 			      BUFFER_MAG * system_pagesize) ) {
2444 	learner->tmp.iobuf = NULL;
2445       }
2446 #elif defined HAVE_MEMALIGN
2447       /* buffer can't be free()'d */
2448       learner->tmp.iobuf = (void *)memalign(system_pagesize,
2449 					    BUFFER_MAG * system_pagesize);
2450 #elif defined HAVE_VALLOC
2451       /* buffer can't be free()'d */
2452       learner->tmp.iobuf = (void *)valloc(BUFFER_MAG * system_pagesize);
2453 #endif
2454       if( learner->tmp.iobuf ) {
2455 	setvbuf(learner->tmp.file, (char *)learner->tmp.iobuf, (int)_IOFBF,
2456 		(size_t)(BUFFER_MAG * system_pagesize));
2457       }
2458     }
2459 
2460   }
2461 
2462   if( u_options & (1<<U_OPTION_MMAP) ) {
2463     MLOCK(learner->hash, sizeof(l_item_t) * learner->max_tokens);
2464   }
2465 
2466   MADVISE(learner->hash, sizeof(l_item_t) * learner->max_tokens,
2467 	  MADV_SEQUENTIAL);
2468 
2469   if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
2470     learner->doc.emp.max = system_pagesize/sizeof(hash_count_t);
2471     learner->doc.emp.stack =
2472       (hash_value_t *)malloc(learner->doc.emp.max * sizeof(hash_value_t));
2473     if( !learner->doc.emp.stack ) {
2474       errormsg(E_WARNING,
2475 	       "disabling emp.stack. Not enough memory? I couldn't allocate %li bytes\n",
2476 	       sizeof(hash_count_t *) * ((long int)learner->doc.emp.max));
2477       m_options &= ~(1<<M_OPTION_CALCENTROPY);
2478     }
2479   }
2480 
2481 }
2482 
2483 void free_learner(learner_t *learner) {
2484   document_count_t i;
2485 
2486   free_learner_hash(learner);
2487 
2488   if( learner->doc.emp.stack ) {
2489     free(learner->doc.emp.stack);
2490     learner->doc.emp.stack = NULL;
2491   }
2492 
2493   for( i = 0; i < RESERVOIR_SIZE; i++ ) {
2494     if( learner->doc.reservoir[i].stack ) {
2495       free(learner->doc.reservoir[i].stack);
2496       learner->doc.reservoir[i].stack = NULL;
2497     }
2498   }
2499 
2500   if( learner->tmp.file ) {
2501     fclose(learner->tmp.file);
2502     learner->tmp.file = NULL;
2503   }
2504 
2505   cleanup_tempfiles();
2506 }
2507 
2508 /* calculates the most probable Dirichlet parameters
2509    given digram counts. Method described in MacKay and Peto (1995)
2510    Since we don't count the DIAMOND-DIAMOND digram, including the prior will
2511    overestimate the DIAMOND-DIAMOND transition probability. However, for this
2512    transition as well as the others, the Dirichlet will have practically
2513    no effect when there's enough data in the corpus.
2514 
2515    HOWEVER:
2516    * A property of the MP posterior estimates is that if character j
2517    * never appears as a transition *target*, then the corresponding
2518    * u[j] entry is null. Sometimes, this makes sense, as j isn't part
2519    * of the "vocabulary", but other times it's not sensible, because sooner
2520    * or later some spam message will make use of that character in
2521    * some combination with another character. When that happens, we're
2522    * in trouble.
2523    *
2524    * Previous versions of dbacl.c had a simple hack to prevent this
2525    * possibility: we pretend we saw a DIAMOND-j-DIAMOND
2526    * double transition, for each j. Equivalently, we simply increment
2527    * the count by 1 for each such transition, i.e.
2528    *     learner->dig[i][(alphabet_size_t)DIAMOND]++;
2529    *     learner->dig[(alphabet_size_t)DIAMOND][i]++;
2530    *
2531    * This hack has now been removed again, to prevent confusion.
2532    * It's not really needed, since the uniform measure can be used
2533    * for spam filtering.
2534 */
2535 void make_dirichlet_digrams(learner_t *learner) {
2536   alphabet_size_t i, j;
2537   token_count_t k;
2538   float G[ASIZE];
2539   float H[ASIZE];
2540   float V[ASIZE];
2541   float F[ASIZE];
2542   double prev_alpha;
2543   double K, t;
2544 
2545   /* initialize */
2546   dirichlet.alpha = 1.0;
2547   for(i = AMIN; i < ASIZE; i++) {
2548     dirichlet.u[i] = 1.0/((double)(ASIZE - AMIN));
2549   }
2550 
2551   /* precompute useful constants */
2552   for(j = AMIN; j < ASIZE; j++) {
2553     G[j] = 0.0;
2554     H[j] = 0.0;
2555     V[j] = 0.0;
2556     F[j] = 0.0;
2557     for(i = AMIN; i < ASIZE; i++) {
2558       for(k = AMIN; k < (token_count_t)learner->dig[i][j]; k++) {
2559 	G[j] += 1.0/k;
2560 	H[j] += 1.0/((double)k*k);
2561       }
2562       if( learner->dig[i][j] > 0 ) {
2563 	V[j]++;
2564 	F[j] += learner->dig[i][j];
2565       }
2566     }
2567     if( 0 && u_options & (1<<U_OPTION_DEBUG) ) {
2568       fprintf(stdout,
2569 	      "symbol %d has G = %g H = %g V = %g F = %g\n",
2570 	      j, G[j], H[j], V[j], F[j]);
2571     }
2572   }
2573 
2574   /* now optimize the dirichlet parameters */
2575 #define MAX_DIRITER 300
2576   k = 0;
2577   do {
2578     prev_alpha = dirichlet.alpha;
2579     K = 0.0;
2580     for(i = AMIN; i < ASIZE; i++) {
2581       K += log((F[i] + prev_alpha)/prev_alpha) +
2582 	(0.5) * F[i]/(prev_alpha * (F[i] + prev_alpha));
2583     }
2584     dirichlet.alpha = 0.0;
2585     for(j = AMIN; j < ASIZE; j++) {
2586       t = K - G[j];
2587       dirichlet.u[j] = 2 * V[j] /(t + sqrt(t * t + 4 * H[j] * V[j]));
2588       dirichlet.alpha += dirichlet.u[j];
2589     }
2590     if( u_options & (1<<U_OPTION_VERBOSE) ) {
2591       fprintf(stdout,
2592 	      "dirichlet alpha %g --> %g\n",
2593 	      prev_alpha, dirichlet.alpha);
2594     }
2595   } while((k++ < MAX_DIRITER) &&
2596 	  (fabs(prev_alpha - dirichlet.alpha) > qtol_alpha) );
2597 
2598   if( k >= MAX_DIRITER ) {
2599     errormsg(E_WARNING, "dirichlet measure did not converge.\n");
2600   }
2601 
2602   if(u_options & (1<<U_OPTION_VERBOSE) ) {
2603     for(j = AMIN; j < ASIZE; j++) {
2604       fprintf(stdout,
2605 	      "dirichlet u[%d] =  %g\n",
2606 	      j, dirichlet.u[j]);
2607     }
2608   }
2609 
2610   /* now make the probabilities */
2611 
2612   for(i = AMIN; i < ASIZE; i++) {
2613     t = 0.0;
2614     for(j = AMIN; j < ASIZE; j++) {
2615       t += learner->dig[i][j];
2616     }
2617     for(j = AMIN; j < ASIZE; j++) {
2618       /* note: simulate the effect of digitizing the digrams */
2619       learner->dig[i][j] =
2620 	UNPACK_DIGRAMS(PACK_DIGRAMS(log(((weight_t)learner->dig[i][j] +
2621 					 dirichlet.u[j]) /
2622 					(t + dirichlet.alpha))));
2623       if( 1 && u_options & (1<<U_OPTION_DEBUG) ) {
2624 	fprintf(stdout,
2625 		"learner->dig[%d][%d] = %f\n",
2626 		i, j, learner->dig[i][j]);
2627       }
2628     }
2629   }
2630   /* clear other data */
2631   for(i = 0; i < AMIN; i++) {
2632     for(j = 0; j < ASIZE; j++) {
2633       learner->dig[i][j] = 0.0;
2634     }
2635   }
2636 }
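
/* Note: the loop above is the MacKay-Peto fixed-point update for the
 * Dirichlet hyperparameters.  With G[j], H[j] the harmonic-style sums
 * over observed counts, V[j] the number of contexts in which j occurs
 * as a target, and t = K - G[j], each iteration sets
 *     u[j]  = 2 V[j] / (t + sqrt(t^2 + 4 H[j] V[j]))
 *     alpha = sum_j u[j]
 * then recomputes K from the new alpha, stopping once alpha moves by
 * less than qtol_alpha or MAX_DIRITER iterations are reached.
 */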
2637 
2638 void make_uniform_digrams(learner_t *learner) {
2639   alphabet_size_t i, j;
2640   for(i = AMIN; i < ASIZE; i++) {
2641     for(j = AMIN; j < ASIZE; j++) {
2642       /* note: simulate the effect of digitizing the digrams */
2643       learner->dig[i][j] = UNPACK_DIGRAMS(PACK_DIGRAMS(-log(ASIZE - AMIN)));
2644       if(0 && u_options & (1<<U_OPTION_DEBUG) ) {
2645 	fprintf(stdout,
2646 		"learner->dig[%d][%d] = %f\n",
2647 		i, j, learner->dig[i][j]);
2648       }
2649     }
2650   }
2651   /* clear other data */
2652   for(i = 0; i < AMIN; i++) {
2653     for(j = 0; j < ASIZE; j++) {
2654       learner->dig[i][j] = 0.0;
2655     }
2656   }
2657 }
2658 
2659 
2660 void make_toklen_digrams(learner_t *learner) {
2661   alphabet_size_t i, j;
2662   weight_t lam[MAX_TOKEN_LEN+2];
2663   weight_t total;
2664   /* zero out usual digrams, we only use token length */
2665   {
2666     for(i = AMIN; i < ASIZE; i++) {
2667       for(j = AMIN; j < ASIZE; j++) {
2668 	learner->dig[i][j] = 0.0;
2669       }
2670     }
2671     for(j = AMIN; j < ASIZE; j++) {
2672       learner->dig[RESERVED_MARGINAL][j] = 0.0;
2673     }
2674   }
2675   /* note: toklen counts the number of transitions, not number of chars */
2676   total = 0.0;
2677   for(i = 2; i < MAX_TOKEN_LEN+2; i++) {
2678     if( learner->dig[RESERVED_TOKLEN][i] <= 0.0 ) {
2679       /* don't want infinities */
2680       learner->dig[RESERVED_TOKLEN][i]++;
2681     }
2682     total += learner->dig[RESERVED_TOKLEN][i];
2683   }
2684 
2685   lam[0] = 0.0; /* normalizing constant */
2686   for(i = 2; i < MAX_TOKEN_LEN+2; i++) {
2687     lam[i] = log(learner->dig[RESERVED_TOKLEN][i]/total) - i * log(ASIZE - AMIN);
2688   }
2689   for(i = 0; i < MAX_TOKEN_LEN+2; i++) {
2690     if(0 && u_options & (1<<U_OPTION_DEBUG) ) {
2691       fprintf(stdout,
2692 	      "learner->dig[%d][%d] = %f %f %f\n",
2693 	      RESERVED_TOKLEN, i, learner->dig[RESERVED_TOKLEN][i],
2694 	      -i * log(ASIZE - AMIN), lam[i]);
2695     }
2696     learner->dig[RESERVED_TOKLEN][i] = UNPACK_DIGRAMS(PACK_DIGRAMS(lam[i]));
2697   }
2698 }
2699 
2700 void make_mle_digrams(learner_t *learner) {
2701   alphabet_size_t i, j;
2702   double total;
2703   double missing;
2704 
2705   /* for the mle we use a slightly modified empirical frequency histogram.
2706      This is necessary because the true mle causes a singularity in classification
2707      whenever a word contains a character transition which was never seen before.
2708      The modification is to mix the true mle with a uniform distribution, but
2709      the uniform only accounts for 1/total of the mass, so it disappears
2710      quickly.
2711   */
2712   for(i = AMIN; i < ASIZE; i++) {
2713     total = 0.0;
2714     missing = 0.0;
2715     for(j = AMIN; j < ASIZE; j++) {
2716       if( learner->dig[i][j] > 0.0 ) {
2717 	total += learner->dig[i][j];
2718       } else {
2719 	missing++;
2720       }
2721     }
2722     if( total > 0.0 ) {
2723       if( missing > 0.0 ) {
2724 	missing = 1.0/missing;
2725 	for(j = AMIN; j < ASIZE; j++) {
2726 	  learner->dig[i][j] =
2727 	    UNPACK_DIGRAMS(PACK_DIGRAMS(log((learner->dig[i][j]/total) * (1.0 - missing) + missing)));
2728 	}
2729       } else {
2730 	for(j = AMIN; j < ASIZE; j++) {
2731 	  learner->dig[i][j] =
2732 	    UNPACK_DIGRAMS(PACK_DIGRAMS(log(learner->dig[i][j] / total)));
2733 	}
2734       }
2735     } else {
2736       for(j = AMIN; j < ASIZE; j++) {
2737 	learner->dig[i][j] =
2738 	  UNPACK_DIGRAMS(PACK_DIGRAMS(-log(missing)));
2739       }
2740     }
2741   }
2742   /* clear other data */
2743   for(i = 0; i < AMIN; i++) {
2744     for(j = 0; j < ASIZE; j++) {
2745       learner->dig[i][j] = 0.0;
2746     }
2747   }
2748 }
2749 
2750 void make_iid_digrams(learner_t *learner) {
2751   alphabet_size_t i, j;
2752   double total;
2753   double missing;
2754 
2755   /* the digrams here are not character transitions, but merely destination
2756      character frequencies, ie the characters of a token are assumed
2757      independent and identically distributed. */
2758 
2759   total = 0.0;
2760   missing = 0.0;
2761 
2762   for(j = AMIN; j < ASIZE; j++) {
2763     if( learner->dig[RESERVED_MARGINAL][j] > 0.0 ) {
2764       total += learner->dig[RESERVED_MARGINAL][j];
2765     } else {
2766       missing++;
2767     }
2768   }
2769   if( total > 0.0 ) {
2770     if( missing > 0.0 ) {
2771       missing = 1.0/missing;
2772       for(j = AMIN; j < ASIZE; j++) {
2773 	learner->dig[RESERVED_MARGINAL][j] =
2774 	  UNPACK_DIGRAMS(PACK_DIGRAMS(log((learner->dig[RESERVED_MARGINAL][j]/total) * (1.0 - missing) + missing)));
2775       }
2776     } else {
2777       for(j = AMIN; j < ASIZE; j++) {
2778 	learner->dig[RESERVED_MARGINAL][j] =
2779 	  UNPACK_DIGRAMS(PACK_DIGRAMS(log(learner->dig[RESERVED_MARGINAL][j] / total)));
2780       }
2781     }
2782   } else {
2783     for(j = AMIN; j < ASIZE; j++) {
2784       learner->dig[RESERVED_MARGINAL][j] =
2785 	UNPACK_DIGRAMS(PACK_DIGRAMS(-log(missing)));
2786     }
2787   }
2788 
2789   /* now make all transitions identical */
2790   for(j = 0; j < ASIZE; j++) {
2791     for(i = AMIN; i < ASIZE; i++) {
2792       learner->dig[i][j] = learner->dig[RESERVED_MARGINAL][j];
2793     }
2794   }
2795   /* clear other data */
2796   for(i = 0; i < AMIN; i++) {
2797     for(j = 0; j < ASIZE; j++) {
2798       learner->dig[i][j] = 0.0;
2799     }
2800   }
2801 }
2802 
2803 /* use 0.0 for small datasets, 1.0 for huge datasets.
2804  * 0.6 works well generally. I don't understand this :-(
2805  */
2806 void transpose_digrams(learner_t *learner) {
2807   register alphabet_size_t i, j;
2808   register weight_t t;
2809 
2810   /* we skip transitions involving DIAMOND, TOKENSEP */
2811   /* code below uses fact that DIAMOND == 0x01 */
2812   /* code below uses fact that TOKENSEP == 0x02 */
2813   for(i = AMIN; i < ASIZE; i++) {
2814     for(j = AMIN; j < i; j++) {
2815       t = (learner->dig[i][j] + learner->dig[j][i])/2.0;
2816       learner->dig[j][i] =  0.6 * learner->dig[j][i] + 0.4 * t;
2817       learner->dig[i][j] =  0.6 * learner->dig[i][j] + 0.4 * t;
2818     }
2819   }
2820 }
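
/* Equivalently: with t = (dig[i][j] + dig[j][i])/2 the symmetrized
 * value, each pair is replaced by 0.6*dig + 0.4*t, a convex mix that
 * pulls the transition matrix 40% of the way towards its symmetric
 * part while keeping 60% of the directional information.
 */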
2821 
2822 void recompute_ed(learner_t *learner, weight_t *plogzon, weight_t *pdiv,
2823 		  int i, weight_t lam[ASIZE], weight_t Xi) {
2824   register alphabet_size_t j;
2825   weight_t logA = log((weight_t)(ASIZE - AMIN));
2826   weight_t logzon, div;
2827   weight_t maxlogz, tmp;
2828 
2829   maxlogz = 0.0;
2830   for(j = AMIN; j < ASIZE; j++) {
2831     if( lam[j] > 0.0 ) {
2832       tmp = lam[j] + log(1.0 - exp(-lam[j])) - logA;
2833       if( maxlogz < tmp ) {
2834 	maxlogz = tmp;
2835       }
2836     }
2837   }
2838 
2839 
2840   logzon = exp(0.0 - maxlogz);
2841   div = 0.0;
2842   for(j = AMIN; j < ASIZE; j++) {
2843     if( lam[j] > 0.0 ) {
2844       tmp = -logA - maxlogz;
2845       logzon += (exp( lam[j] + tmp) - exp(tmp));
2846       div += lam[j] * learner->dig[i][j];
2847     }
2848   }
2849 
2850 
2851   tmp = maxlogz + log(logzon);
2852 
2853   if( isnan(tmp) ) {
2854     errormsg(E_FATAL,"sorry, digramic partition function went kaboom.\n");
2855   }
2856 
2857   logzon = tmp;
2858   div = div/Xi - logzon;
2859 
2860   *plogzon = logzon;
2861   *pdiv = div;
2862 }
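
/* Note: the maxlogz bookkeeping above is the standard log-sum-exp
 * trick: to evaluate log( sum_k exp(x_k) ) without overflow, factor
 * out m = max_k x_k and compute m + log( sum_k exp(x_k - m) ).  Here
 * the x_k are the terms lam[j] + log(1 - exp(-lam[j])) - logA plus an
 * implicit 0.0 term, which is why logzon starts at exp(0.0 - maxlogz).
 */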
2863 
2864 void make_entropic_digrams(learner_t *learner) {
2865   weight_t lam[ASIZE];
2866   weight_t lam_delta, old_lam, logt, maxlogt;
2867   weight_t Xi, logXi, logzon, div, old_logzon, old_div;
2868   weight_t logA = log((weight_t)(ASIZE - AMIN));
2869   register alphabet_size_t i, j;
2870   int itcount;
2871 
2872   for(i = AMIN; i < ASIZE; i++) {
2873 
2874     /* initializations */
2875     Xi = 0.0;
2876     for(j = AMIN; j < ASIZE; j++) {
2877       Xi += learner->dig[i][j];
2878       lam[j] = 0.0;
2879     }
2880     logXi = log(Xi);
2881 
2882     recompute_ed(learner, &logzon, &div, i, lam, Xi);
2883 
2884     /* now iterate - this is like minimize_learner_divergence() */
2885     itcount = 0;
2886     do {
2887       itcount++;
2888 
2889       old_logzon = logzon;
2890       old_div = div;
2891 
2892       lam_delta = 0.0;
2893       for(j = AMIN; j < ASIZE; j++) {
2894 	if( learner->dig[i][j] > 0.0 ) {
2895 
2896 	  old_lam = lam[j];
2897 	  lam[j] = log(learner->dig[i][j]) - logXi + logA + logzon;
2898 
2899 	  if( isnan(lam[j]) ) {
2900 
2901 	    lam[j] = old_lam;
2902 	    recompute_ed(learner, &logzon, &div, i, lam, Xi);
2903 
2904 	  } else {
2905 
2906 	    if( lam[j] > (old_lam + MAX_LAMBDA_JUMP) ) {
2907 	      lam[j] = (old_lam + MAX_LAMBDA_JUMP);
2908 	    } else if( lam[j] < (old_lam - MAX_LAMBDA_JUMP) ) {
2909 	      lam[j] = (old_lam - MAX_LAMBDA_JUMP);
2910 	    }
2911 
2912 	    /* don't want negative weights */
2913 	    if( lam[j] < 0.0 ) { lam[j] = 0.0; }
2914 
2915 	    if( lam_delta < fabs(lam[j] - old_lam) ) {
2916 	      lam_delta = fabs(lam[j] - old_lam);
2917 	    }
2918 
2919 	  }
2920 	}
2921       }
2922 
2923 
2924       recompute_ed(learner, &logzon, &div, i, lam, Xi);
2925 
2926       if( u_options & (1<<U_OPTION_VERBOSE) ) {
2927 	fprintf(stdout, "digramic (%d) entropy change %" FMT_printf_score_t \
2928 		" --> %" FMT_printf_score_t " (%10f, %10f)\n", i,
2929 		old_div, div, lam_delta, fabs(logzon - old_logzon));
2930       }
2931 
2932 
2933     } while( ((fabs(div - old_div) > 0.001) || (lam_delta > 0.001) ||
2934 	      (fabs(logzon - old_logzon) > 0.001)) && (itcount < 500) );
2935 
2936 
2937     /* now make the probabilities */
2938 
2939     /* transitions should already sum to 1, but better safe than sorry */
2940     maxlogt = log(0.0);
2941     for(j = AMIN; j < ASIZE; j++) {
2942       if( maxlogt < lam[j] ) {
2943 	maxlogt = lam[j];
2944       }
2945     }
2946     maxlogt += -logA -logzon;
2947     logt = 0.0;
2948     for(j = AMIN; j < ASIZE; j++) {
2949       logt += exp(lam[j] - logA - logzon - maxlogt);
2950     }
2951     logt = maxlogt + log(logt);
2952 
2953     for(j = AMIN; j < ASIZE; j++) {
2954       /* note: simulate the effect of digitizing the digrams */
2955       learner->dig[i][j] =
2956 	UNPACK_DIGRAMS(PACK_DIGRAMS(lam[j] - logA - logzon - logt));
2957 
2958       if(0 && u_options & (1<<U_OPTION_DEBUG) ) {
2959 	fprintf(stdout,
2960 		"learner->dig[%d][%d] = %f\n",
2961 		i, j, learner->dig[i][j]);
2962       }
2963     }
2964   }
2965 
2966   /* clear other data */
2967   for(i = 0; i < AMIN; i++) {
2968     for(j = 0; j < ASIZE; j++) {
2969       learner->dig[i][j] = 0.0;
2970     }
2971   }
2972 }
2973 
2974 
2975 void interpolate_digrams(weight_t N, weight_t count[], weight_t p[]) {
2976   weight_t lambda = 0.5;
2977   weight_t Clam, Dlam, Zlam;
2978   alphabet_size_t i;
2979   int q = 0;
2980 
2981   for(q = 0; q < 100; q++) {
2982     Zlam = Dlam = Clam = 0.0;
2983     for(i = AMIN; i < ASIZE; i++) {
2984       Clam += log(p[i]) * (count[i]/N);
2985       Zlam += pow(p[i], lambda);
2986       Dlam += log(p[i]) * pow(p[i], lambda);
2987     }
2988     lambda += 0.1 * (Clam - Dlam/Zlam);
2989 /*     if( lambda > 1.0 ) { lambda = 1.0; } */
2990 /*     else if( lambda < 0.0 ) { lambda = 0.0; } */
2991 /*     printf("Clam = %g Dlam = %g, Zlam = %g, lamdba = %g\n", Clam, Dlam, Zlam, lambda); */
2992   }
2993 
2994 }
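
/* Note: the loop above is plain gradient ascent (fixed step 0.1,
 * fixed 100 iterations) on
 *     f(lambda) = lambda * sum_i (count[i]/N) log p[i]
 *                 - log( sum_i p[i]^lambda ),
 * whose derivative is exactly Clam - Dlam/Zlam; at the maximum, the
 * tilted distribution p[i]^lambda / Zlam has the same expected
 * log p[i] as the empirical distribution count[i]/N.
 */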
2995 
2996 
2997 weight_t calc_learner_digramic_excursion(learner_t *learner, char *tok) {
2998   register alphabet_size_t p, q;
2999   register weight_t t = 0.0;
3000   register alphabet_size_t len;
3001   /* now update digram frequency counts */
3002   p = (unsigned char)*tok++;
3003   CLIP_ALPHABET(p);
3004   len = 1;
3005   /* finally update character frequencies */
3006   while( *tok != EOTOKEN ) {
3007     q = (unsigned char)*tok;
3008     CLIP_ALPHABET(q);
3009     t += learner->dig[p][q];
3010     p = q;
3011     tok++;
3012     if( q != DIAMOND ) { len++; }
3013   }
3014   t += learner->dig[RESERVED_TOKLEN][len] - learner->dig[RESERVED_TOKLEN][0];
3015   return t;
3016 }
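
/* Note: the excursion weight of a token is therefore
 *     sum over consecutive character pairs (p,q) of dig[p][q]
 *     + dig[RESERVED_TOKLEN][len] - dig[RESERVED_TOKLEN][0],
 * i.e. the log-probability of the character path under the digramic
 * reference measure plus a token-length correction (slot 0 of the
 * RESERVED_TOKLEN row holds the normalizing constant, as set up in
 * make_toklen_digrams()).
 */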
3017 
3018 /* calculates the rth-order divergence but needs normalizing constant.
3019    note: this isn't the full divergence from digref, just the bits
3020    needed for the r-th optimization - fwd indicates the traversal
3021    direction in the hash (alternate between them to reduce numerical
3022    errors) */
3023 score_t learner_divergence(learner_t *learner,
3024 			   score_t logzonr, score_t Xi,
3025 			   token_order_t r, bool_t fwd) {
3026   register l_item_t *i, *e;
3027   register token_count_t c = 0;
3028   score_t t = 0.0;
3029 
3030   e = fwd ? (learner->hash + learner->max_tokens) : learner->hash - 1;
3031   for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
3032       i != e; fwd ? i++ : i--) {
3033     if( FILLEDP(i) && (i->typ.order == r) ) {
3034 
3035       t += UNPACK_LAMBDA(i->lam) * (score_t)i->count;
3036 
3037       if( ++c >= learner->fixed_order_unique_token_count[r] ) {
3038 	c = 0;
3039 	break;
3040       }
3041     }
3042   }
3043 
3044   return -logzonr + t/Xi;
3045 }
3046 
3047 /* calculates the normalizing constant - fwd indicates the traversal
3048    direction in the hash (alternate between them to reduce numerical
3049    errors) */
3050 score_t learner_logZ(learner_t *learner, token_order_t r,
3051 		     score_t log_unchanging_part, bool_t fwd) {
3052   register l_item_t *i, *e;
3053   register token_count_t c = 0;
3054 
3055   score_t maxlogz, tmp;
3056   score_t t =0.0;
3057   score_t R = (score_t)r;
3058 
3059 /*   printf("learner_logZ(%d, %f)\n", r, log_unchanging_part); */
3060 
3061   e = fwd ? (learner->hash + learner->max_tokens) : learner->hash - 1;
3062   maxlogz = log_unchanging_part;
3063   for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
3064       i != e; fwd ? i++ : i--) {
3065     if( FILLEDP(i) && (i->typ.order == r) ) {
3066 
3067       tmp = R * UNPACK_LAMBDA(i->lam) + R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
3068 	UNPACK_RWEIGHTS(i->tmp.min.dref);
3069 
3070       if( maxlogz < tmp ) {
3071 	maxlogz = tmp;
3072       }
3073 
3074       if( ++c >= learner->fixed_order_unique_token_count[r] ) {
3075 	c = 0;
3076 	break;
3077       }
3078     }
3079   }
3080 
3081   t =  exp(log_unchanging_part - maxlogz);
3082   for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
3083       i != e; fwd ? i++ : i--) {
3084     if( FILLEDP(i) && (i->typ.order == r) ) {
3085 
3086       tmp = R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
3087 	UNPACK_RWEIGHTS(i->tmp.min.dref) - maxlogz;
3088       t += (exp(R * UNPACK_LAMBDA(i->lam) + tmp) - exp(tmp));
3089 
3090       if( ++c >= learner->fixed_order_unique_token_count[r] ) {
3091 	c = 0;
3092 	break;
3093       }
3094     }
3095   }
3096 
3097   tmp =  (maxlogz + log(t))/R;
3098 /*   printf("t =%f maxlogz = %f logZ/R = %f\n", t, maxlogz, tmp); */
3099 
3100   /* investigate this someday */
3101   if( isnan(tmp) ) {
3102     errormsg(E_FATAL,"sorry, partition function went kaboom.\n");
3103   }
3104 
3105   return tmp;
3106 }
3107 
3108 /* reconstructs the order of a token (can be zero for empty token) */
3109 token_order_t get_token_order(char *tok) {
3110   token_order_t o = 0;
3111   if( tok && *tok ) {
3112     while( *(++tok) ) {
3113       if( *tok == DIAMOND ) {
3114 	o++;
3115       }
3116     }
3117   }
3118   return o;
3119 }
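
/* Example: with the token layout described before hash_word_and_learn(),
 * an order-n token carries n+1 DIAMOND markers.  The scan starts at
 * tok+1, so the leading DIAMOND is skipped and exactly n are counted:
 * a bigram token <D>foo<D>bar<D> yields order 2.
 */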
3120 
3121 
3122 /* fills the hash with partial calculations */
3123 void fill_ref_vars(learner_t *learner, l_item_t *k, char *tok) {
3124   hash_value_t id;
3125   char *t, *e;
3126   l_item_t *l;
3127 
3128   /* weight of the r-th order excursion */
3129   k->tmp.min.dref = PACK_RWEIGHTS(calc_learner_digramic_excursion(learner,tok));
3130 
3131   /* for each suffix of tok, add its weight */
3132   k->tmp.min.ltrms = PACK_LWEIGHTS(0.0);
3133 
3134   if( tok && *tok ) {
3135     e = strchr(tok + 1, EOTOKEN);
3136     for( t = tok + 1; t + 1 < e; t++ ) {
3137       if( *t == DIAMOND ) {
3138 	id = hash_partial_token(t, e - t, e);
3139 	l = find_in_learner(learner, id);
3140 	if( l ) {
3141 	  k->tmp.min.ltrms += PACK_LWEIGHTS(UNPACK_LAMBDA(l->lam));
3142 	}
3143       }
3144     }
3145 
3146     /* extra smoothing doesn't seem that good */
3147 #undef EXTRA_SMOOTHING
3148 #if defined EXTRA_SMOOTHING
3149     /* rather than adding only suffixes, this adds all
3150      * the prefixes also, then divides by two. */
3151     if( e ) {
3152       /* for each prefix of tok, add its weight */
3153       for(t = e - 2; t > tok; t--) {
3154 	if( *t == DIAMOND ) {
3155 	  id = hash_partial_token(tok, t - tok + 1, e);
3156 	  l = find_in_learner(id);
3157 	  if( l ) {
3158 	    k->ltrms += PACK_LWEIGHTS(UNPACK_LAMBDA(l->lam));
3159 	  }
3160 	}
3161       }
3162       /* now we have twice as many weights as we need */
3163       k->ltrms /= 2;
3164     }
3165 #endif
3166   }
3167 
3168 }
3169 
3170 /* fills hash with partial calculations and returns the
3171  * log unchanging part of the normalizing constant, for all
3172  * orders < r (not r). On return, kappa contains the reference probability
3173  * mass of the set of all features <= r (including r).
3174  */
3175 score_t recalculate_reference_measure(learner_t *learner, token_order_t r, score_t *kappa) {
3176   hash_value_t id;
3177   byte_t buf[BUFSIZ+1];
3178   char tok[(MAX_TOKEN_LEN+1)*MAX_SUBMATCH+EXTRA_TOKEN_LEN];
3179   size_t n = 0;
3180 
3181   const byte_t *p;
3182   char *q;
3183 
3184   l_item_t *k, *i;
3185 
3186   score_t tmp, lunch;
3187   score_t R = (score_t)r;
3188   score_t max = 0.0;
3189   score_t mykappa = 0.0;
3190 
3191   /* for r == 1, the partial_z is not modified, but we still have to
3192      go through the tokens and prebuild k->tmp.min.dref */
3193 
3194   /* now we calculate the logarithmic word weight
3195      from the digram model, for each token in the hash */
3196   if( !tmp_seek_start(learner) ) {
3197     errormsg(E_ERROR, "cannot seek in temporary token file, reference weights not calculated.\n");
3198   } else {
3199     q = tok;
3200     while( (n = tmp_read_block(learner, buf, BUFSIZ, &p)) > 0 ) {
3201 /*       p = buf; */
3202 /*       p[n] = '\0'; */
3203       while( n-- > 0 ) {
3204 	if( *p != TOKENSEP) {
3205 	  *q++ = *p; /* copy into tok */
3206 	} else { /* interword space */
3207 	  *q = 0; /* append NUL to tok */
3208 	  /* now write weight in hash */
3209 	  id = hash_full_token(tok);
3210 	  k = find_in_learner(learner, id);
3211 	  if( k && (get_token_order(tok) == k->typ.order) ) {
3212 	    if( k->typ.order <= r) {
3213 	      if( k->typ.order == r ) {
3214 		fill_ref_vars(learner, k, tok);
3215 	      } else if(k->typ.order < r) {
3216 		/* assume ref_vars were already filled */
3217 		if( NOTNULL(k->lam) ) {
3218 		  tmp = R * UNPACK_LAMBDA(k->lam) +
3219 		    R * UNPACK_LWEIGHTS(k->tmp.min.ltrms) +
3220 		    UNPACK_RWEIGHTS(k->tmp.min.dref);
3221 		  if( max < tmp ) {
3222 		    max = tmp;
3223 		  }
3224 		}
3225 	      }
3226 	      mykappa += exp(UNPACK_RWEIGHTS(k->tmp.min.dref));
3227 	    }
3228 	  }
3229 	  q = tok; /* reset q */
3230 	}
3231 	p++;
3232       }
3233     }
3234   }
3235 
3236   lunch = 1.0;
3237   k = learner->hash + learner->max_tokens;
3238   for(i = learner->hash; i != k; i++) {
3239     if( FILLEDP(i) ) {
3240       if( i->typ.order < r ) {
3241 	if( NOTNULL(i->lam) ) {
3242 	  tmp = -max + R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
3243 	    UNPACK_RWEIGHTS(i->tmp.min.dref);
3244 	  lunch += (exp(R * UNPACK_LAMBDA(i->lam) + tmp) - exp(tmp));
3245 	}
3246       }
3247     }
3248   }
3249   lunch = max + log(lunch);
3250 
3251   /* kappa is the reference mass of all features <= rth order. */
3252   *kappa = mykappa;
3253 
3254   return lunch;
3255 }
3256 
3257 
3258 /* just for debugging */
3259 void print_score(learner_t *learner, token_order_t r) {
3260   hash_count_t i;
3261   double logprob = 0.0;
3262   double lpapprox = 0.0;
3263 
3264   for(i = 0; i < learner->max_tokens; i++) {
3265     if( FILLEDP(&learner->hash[i]) &&
3266 	(learner->hash[i].typ.order <= r) ) {
3267       logprob += learner->hash[i].count * (learner->hash[i].lam);
3268       if( learner->hash[i].typ.order == 1 ) {
3269 	logprob += learner->hash[i].count *
3270 	  UNPACK_RWEIGHTS(learner->hash[i].tmp.min.dref)/((weight_t)r);
3271       }
3272     }
3273     if( FILLEDP(&learner->hash[i]) &&
3274 	(learner->hash[i].typ.order == r) ) {
3275       lpapprox += learner->hash[i].count *
3276 	((learner->hash[i].lam) +
3277 	 UNPACK_LWEIGHTS(learner->hash[i].tmp.min.ltrms) +
3278 	 UNPACK_RWEIGHTS(learner->hash[i].tmp.min.dref)/((weight_t)r));
3279     }
3280 
3281   }
3282 
3283   fprintf(stdout,
3284 	  "*** logprob = %" FMT_printf_score_t " * %d (r = %" FMT_printf_integer_t ", logZ = %" FMT_printf_score_t ")\n",
3285 	 ((score_t)(logprob / learner->fixed_order_token_count[r])),
3286 	 learner->fixed_order_token_count[r],
3287 	 r, learner->logZ);
3288   fprintf(stdout,
3289 	  "*** lpapprox = %" FMT_printf_score_t " * %ld (r = %d, logZ = %" FMT_printf_score_t ")\n",
3290 	 ((score_t)(lpapprox / learner->fixed_order_token_count[r])),
3291 	 learner->fixed_order_token_count[r],
3292 	 r, learner->logZ);
3293 }
3294 
3295 
3296 /* experimental: this sounded like a good idea, but isn't ?? */
3297 void theta_rescale(learner_t *learner, token_order_t r, score_t logupz, score_t Xi, bool_t fwd) {
3298   register l_item_t *i, *e;
3299   register token_count_t c = 0;
3300 
3301   score_t tmp;
3302   score_t theta =0.0;
3303   score_t R = (score_t)r;
3304   score_t mu;
3305   score_t sum1, sum2;
3306   score_t msum1, msum2;
3307   score_t s = 0;
3308 
3309   e = fwd ? (learner->hash + learner->max_tokens) : learner->hash - 1;
3310 
3311   msum1 = msum2 = log(0.0);
3312   for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
3313       i != e; fwd ? i++ : i--) {
3314     if( FILLEDP(i) && (i->typ.order == r) ) {
3315       if( NOTNULL(i->lam) ) {
3316 	s += i->count;
3317 	tmp = R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
3318 	  UNPACK_RWEIGHTS(i->tmp.min.dref);
3319 	if( tmp > msum1 ) {
3320 	  msum1 = tmp;
3321 	}
3322 	tmp += R * UNPACK_LAMBDA(i->lam);
3323 	if( tmp > msum2 ) {
3324 	  msum2 = tmp;
3325 	}
3326       }
3327       if( ++c >= learner->fixed_order_unique_token_count[r] ) {
3328 	c = 0;
3329 	break;
3330       }
3331     }
3332   }
3333 
3334   mu = ((score_t)s)/Xi;
3335   s = log(mu) - log(1 - mu) + logupz;
3336   if( s > msum1 ) {
3337     msum1 = s;
3338   }
3339 
3340   sum1 = exp(s - msum1);
3341   sum2 = 0.0;
3342   for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
3343       i != e; fwd ? i++ : i--) {
3344     if( FILLEDP(i) && (i->typ.order == r) ) {
3345       if( NOTNULL(i->lam) ) {
3346 	tmp = R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
3347 	  UNPACK_RWEIGHTS(i->tmp.min.dref);
3348 	sum1 += exp(tmp - msum1);
3349 	tmp += R * UNPACK_LAMBDA(i->lam);
3350 	sum2 += exp(tmp - msum2);
3351       }
3352       if( ++c >= learner->fixed_order_unique_token_count[r] ) {
3353 	c = 0;
3354 	break;
3355       }
3356     }
3357   }
3358 
3359   theta = ((msum1 + log(sum1)) - (msum2 + log(sum2)))/R;
3360   if( isnan(theta) ) {
3361     return; /* ignore */
3362   }
3363   if( u_options & (1<<U_OPTION_VERBOSE) ) {
3364     fprintf(stdout, "theta = %f\n", theta);
3365   }
3366 
3367   for(i = fwd ? learner->hash : learner->hash + learner->max_tokens - 1;
3368       i != e; fwd ? i++ : i--) {
3369     if( FILLEDP(i) && (i->typ.order == r) ) {
3370       if( NOTNULL(i->lam) ) {
3371 	i->lam = PACK_LAMBDA(UNPACK_LAMBDA(i->lam) + theta);
3372       }
3373       if( ++c >= learner->fixed_order_unique_token_count[r] ) {
3374 	c = 0;
3375 	break;
3376       }
3377     }
3378   }
3379 
3380 }
3381 
3382 /* the mediaprobs are the token probability masses for each token class;
3383    the mediaprobs sum to 1. We put a uniform prior on the media. */
3384 void compute_mediaprobs(learner_t *learner) {
3385   l_item_t *k, *i;
3386   token_class_t j;
3387   weight_t R, tmp;
3388 
3389   R = (weight_t)learner->max_order;
3390   for(j = 0; j < TOKEN_CLASS_MAX; j++) {
3391     learner->mediaprobs[j] = 1.0/TOKEN_CLASS_MAX;
3392   }
3393 
3394   k = learner->hash + learner->max_tokens;
3395   for(i = learner->hash; i != k; i++) {
3396     if( FILLEDP(i) && NOTNULL(i->lam)) {
3397       tmp = R * UNPACK_LWEIGHTS(i->tmp.min.ltrms) +
3398 	UNPACK_RWEIGHTS(i->tmp.min.dref);
3399       learner->mediaprobs[i->typ.cls] +=
3400 	exp(R * UNPACK_LAMBDA(i->lam) + tmp) - exp(tmp);
3401     }
3402   }
3403 
3404   tmp = 0;
3405   for(j = 0; j < TOKEN_CLASS_MAX; j++) {
3406     learner->mediaprobs[j] *= exp(-learner->logZ);
3407     tmp += learner->mediaprobs[j];
3408   }
3409   /* should have tmp == 1.0 */
3410 }
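
/* A sketch of the check hinted at by "should have tmp == 1.0" above:
   floating point sums are only approximately normalized, so test against
   a small tolerance rather than for equality.  Illustrative only (enable
   the #if to compile); it reads the same learner fields that
   compute_mediaprobs() fills in. */
#if 0
static int mediaprobs_normalized(const learner_t *learner) {
  weight_t sum = 0.0;
  token_class_t j;
  for(j = 0; j < TOKEN_CLASS_MAX; j++) {
    sum += learner->mediaprobs[j];
  }
  /* tolerate digitization and rounding error */
  return fabs(sum - 1.0) < 1e-6;
}
#endif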
3411 
3412 /* minimizes the divergence by solving for lambda one
3413    component at a time.  */
3414 void minimize_learner_divergence(learner_t *learner) {
3415   l_item_t *i, *e;
3416   token_order_t r;
3417   token_count_t lzero, c = 0;
3418   token_count_t zcut = 0;
3419   int itcount, mcount;
3420   score_t d, dd, b;
3421   score_t lam_delta, old_lam, new_lam;
3422   score_t logzonr, old_logzonr;
3423   score_t logupz, div_extra_bits, kappa;
3424   score_t R, Xi, logXi;
3425   score_t mp_logz;
3426   bool_t fwd = 1;
3427 
3428   if( u_options & (1<<U_OPTION_VERBOSE) ) {
3429     fprintf(stdout, "now maximizing model entropy\n");
3430   }
3431 
3432   learner->logZ = 0.0;
3433   learner->divergence = 0.0;
3434   mp_logz = 0.0;
3435 
3436   /* disable multipass if we only have one order to play with */
3437   if( (m_options & (1<<M_OPTION_MULTINOMIAL)) ||
3438       (learner->max_order == 1) ) {
3439     qtol_multipass = 0;
3440   }
3441 
3442   if( learner->model.tmin != 0 ) {
3443 	  /* the threshold zcut is unsigned */
3444     if( learner->model.tmin > 0 ) {
3445       zcut = learner->model.tmin;
3446     } else if (learner->model.tmin + learner->tmax > 0) {
3447       zcut = learner->model.tmin + learner->tmax;
3448     }
3449   }
3450 
3451   e = learner->hash + learner->max_tokens;
3452   for(mcount = 0; mcount < (qtol_multipass ? 50 : 1); mcount++) {
3453     for(r = 1;
3454 	r <= ((m_options & (1<<M_OPTION_MULTINOMIAL)) ? 1 : learner->max_order);
3455 	r++) {
3456 
3457       /* here we precalculate various bits and pieces
3458 	 which aren't going to change during this iteration */
3459       logupz = recalculate_reference_measure(learner, r, &kappa);
3460 
3461       R = (score_t)r;
3462       Xi = (score_t)learner->fixed_order_token_count[r];
3463       logXi = log(Xi);
3464 
3465       if( r == 1 ) {
3466 /*       Xi /= kappa; */
3467 	div_extra_bits = 0.0;
3468       } else {
3469       /* calculate extra bits for divergence score display */
3470 	b = 0.0;
3471 
3472 	for(i = learner->hash; i != e; i++) {
3473 	  if( FILLEDP(i) && (i->typ.order < r) ) {
3474 	    b += UNPACK_LAMBDA(i->lam) * (score_t)i->count;
3475 	  }
3476 	}
3477 	div_extra_bits = b/Xi;
3478       }
3479 
3480       if( m_options & (1<<M_OPTION_REFMODEL) ) {
3481 	break;
3482       }
3483 
3484       if( u_options & (1<<U_OPTION_VERBOSE) ) {
3485 	fprintf(stdout,
3486 		"* optimizing order %" FMT_printf_integer_t " weights (%" FMT_printf_integer_t " tokens, %" FMT_printf_integer_t " unique, kappa = %f)\n",
3487 		r, learner->fixed_order_token_count[r],
3488 		learner->fixed_order_unique_token_count[r], kappa);
3489       }
3490 
3491       logzonr = learner_logZ(learner, r, logupz, fwd);
3492       dd = learner_divergence(learner, logzonr, Xi, r, fwd);
3493       fwd = 1 - fwd;
3494 /*       printf("logzonr = %f dd = %f logupz = %f Xi = %f\n", logzonr, dd, logupz, Xi); */
3495       itcount = 0;
3496       do {
3497 	itcount++;
3498 	lzero= 0;
3499 
3500 	d = dd;     /* save old divergence */
3501 	old_logzonr = logzonr;
3502 
3503 	lam_delta = 0.0;
3504 
3505 	c = 0;
3506 	for(i = learner->hash; i != e; i++) {
3507 	  if( FILLEDP(i) && (i->typ.order == r)
3508 /* 	    && (rand() > RAND_MAX/2)  */
3509 	      ) {
3510 
3511 	    old_lam = UNPACK_LAMBDA(i->lam);
3512 
3513 	    if( /* (i->typ.order == 1) ||  */
3514 		(i->count > zcut) ) {
3515 	      /* "iterative scaling" lower bound */
3516 	      new_lam = (log((score_t)i->count) - logXi -
3517 			 UNPACK_RWEIGHTS(i->tmp.min.dref))/R + logzonr -
3518 		UNPACK_LWEIGHTS(i->tmp.min.ltrms);
3519 	    } else {
3520 	      new_lam = 0.0;
3521 	    }
3522 
3523 	    if( isnan(new_lam) ) {
3524 	      /* precision problem, just ignore, don't change lambda */
3525 	      logzonr = learner_logZ(learner, r, logupz, fwd);
3526 	    } else {
3527 
3528 	      /* this code shouldn't be necessary, but is crucial */
3529 	      if( new_lam > (old_lam + MAX_LAMBDA_JUMP) ) {
3530 		new_lam = (old_lam + MAX_LAMBDA_JUMP);
3531 	      } else if( new_lam < (old_lam - MAX_LAMBDA_JUMP) ) {
3532 		new_lam = (old_lam - MAX_LAMBDA_JUMP);
3533 	      }
3534 
3535 	      /* don't want negative weights */
3536 	      if( new_lam < 0.0 ) { new_lam = 0.0; lzero++; }
3537 
3538 	      if( lam_delta < fabs(new_lam - old_lam) ) {
3539 		lam_delta = fabs(new_lam - old_lam);
3540 	      }
3541 	      i->lam = PACK_LAMBDA(new_lam);
3542 	    }
3543 
3544 	    if( ++c >= learner->fixed_order_unique_token_count[r] ) {
3545 	      c = 0;
3546 	      break;
3547 	    }
3548 	  }
3549 	}
3550 
3551 /* 	theta_rescale(r, logupz, Xi, fwd); */
3552 
3553 	/* update values */
3554 	logzonr = learner_logZ(learner, r, logupz, fwd);
3555 	dd = learner_divergence(learner, logzonr, Xi, r, fwd);
3556 	fwd = 1 - fwd;
3557 
3558 	if( u_options & (1<<U_OPTION_VERBOSE) ) {
3559 /* 	fprintf(stdout, "lzero = %ld\n", lzero); */
3560 	  fprintf(stdout, "entropy change %" FMT_printf_score_t \
3561 		  " --> %" FMT_printf_score_t " (%10f, %10f)\n",
3562 		  d + div_extra_bits,
3563 		  dd + div_extra_bits,
3564 		  lam_delta, fabs(logzonr - old_logzonr));
3565 	}
3566 
3567 	process_pending_signal(NULL);
3568 
3569       } while( ((fabs(d - dd) > qtol_div) || (lam_delta > qtol_lam) ||
3570 		(fabs(logzonr - old_logzonr) > qtol_logz)) && (itcount < 50) );
3571 
3572       learner->logZ = logzonr;
3573       learner->divergence = dd + div_extra_bits;
3574     }
3575     /* for multipass, we wait until logZ stabilizes */
3576     if( fabs(1.0 - mp_logz/learner->logZ) < 0.01 ) {
3577       break;
3578     }
3579     mp_logz = learner->logZ;
3580   }
3581 
3582   /* compute the probability mass of each token class (medium) separately */
3583   compute_mediaprobs(learner);
3584 }
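
/* The optimization loop above, reduced to its skeleton: sweep over the
   components, apply a clipped update to each, and stop once the largest
   single-component move and the change in the objective both fall below
   their tolerances (or an iteration cap is hit).  A generic sketch with
   a caller-supplied proposal rule, not dbacl's actual estimator. */
#if 0
static void coordinate_sweeps(double *lam, int n,
                              double (*objective)(const double *, int),
                              double (*proposal)(const double *, int, int),
                              double tol_obj, double tol_lam, double max_jump) {
  double d, dd = objective(lam, n);
  int it = 0;
  do {
    double lam_delta = 0.0;
    int i;
    d = dd;                                    /* save old objective */
    for(i = 0; i < n; i++) {
      double new_lam = proposal(lam, n, i);    /* e.g. an iterative scaling bound */
      if( new_lam > lam[i] + max_jump ) { new_lam = lam[i] + max_jump; }
      if( new_lam < lam[i] - max_jump ) { new_lam = lam[i] - max_jump; }
      if( new_lam < 0.0 ) { new_lam = 0.0; }   /* no negative weights */
      if( fabs(new_lam - lam[i]) > lam_delta ) { lam_delta = fabs(new_lam - lam[i]); }
      lam[i] = new_lam;
    }
    dd = objective(lam, n);                    /* update values */
  } while( ((fabs(d - dd) > tol_obj) || (lam_delta > tol_lam)) && (++it < 50) );
}
#endif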
3585 
3586 /* dumps readable model weights to the output */
3587 void dump_model(learner_t *learner, FILE *out, FILE *in) {
3588 
3589   hash_value_t id;
3590   byte_t buf[BUFSIZ+1];
3591   char tok[(MAX_TOKEN_LEN+1)*MAX_SUBMATCH+EXTRA_TOKEN_LEN];
3592   char smb[MAX_SUBMATCH+1];
3593   token_order_t s;
3594   regex_count_t c;
3595   const byte_t *p;
3596   char *q;
3597   l_item_t *k;
3598   size_t n = 0;
3599 
3600   /* preamble - this is copied from save_learner */
3601 
3602   fprintf(out, MAGIC1, learner->filename,
3603 	  (m_options & (1<<M_OPTION_REFMODEL)) ? "(ref)" : "");
3604   fprintf(out, MAGIC2_o, learner->divergence, learner->logZ, learner->max_order,
3605 	  (m_options & (1<<M_OPTION_MULTINOMIAL)) ? "multinomial" : "hierarchical" );
3606   fprintf(out, MAGIC3,
3607 	  (short int)learner->max_hash_bits,
3608 	  (long int)learner->full_token_count,
3609 	  (long int)learner->unique_token_count,
3610 	  (long int)learner->doc.count);
3611 
3612   fprintf(out, MAGIC8_o, learner->shannon, learner->shannon2);
3613 
3614   fprintf(out, MAGIC10_o,
3615 	  learner->alpha, learner->beta,
3616 	  learner->mu, learner->s2);
3617 
3618   write_mediaprobs(out, learner);
3619 
3620   fprintf(out, MAGIC9, (long int)learner->model.tmin, (long int)learner->tmax);
3621 
3622   /* print out any regexes we might need */
3623   for(c = 0; c < regex_count; c++) {
3624     /* write the bitmap */
3625     for(q = smb, s = 1; s <= MAX_SUBMATCH; s++) {
3626       if( re[c].submatches & (1<<s) ) {
3627 	*q++ = s + '0';
3628       }
3629     }
3630     *q = '\0';
3631 
3632     fprintf(out, MAGIC5_o, re[c].string, smb);
3633   }
3634 
3635   /* print options */
3636   fprintf(out, MAGIC4_o, m_options, m_cp, m_dt,
3637 	  print_model_options(m_options, m_cp, (char*)buf));
3638 
3639   fprintf(out, MAGIC6);
3640 
3641   fprintf(out, MAGIC_DUMP);
3642 
3643 
3644   /* now go through hash printing values */
3645   if( !tmp_seek_start(learner) ) {
3646     errormsg(E_ERROR, "cannot seek in temporary token file.\n");
3647   } else {
3648     q = tok;
3649     while( (n = tmp_read_block(learner, buf, BUFSIZ, &p)) > 0 ) {
3650 /*       p = buf; */
3651 /*       p[n] = '\0'; */
3652       while( n-- > 0 ) {
3653 	if( *p != TOKENSEP) {
3654 	  *q++ = *p; /* copy into tok */
3655 	} else { /* interword space */
3656 	  *q = 0; /* append NUL to tok */
3657 	  /* now write weight in hash */
3658 	  id = hash_full_token(tok);
3659 	  k = find_in_learner(learner, id); /* guaranteed to be found */
3660 	  fprintf(out, MAGIC_DUMPTBL_o,
3661 		  (weight_t)UNPACK_LAMBDA(k->lam),
3662 		  UNPACK_RWEIGHTS(k->tmp.min.dref), k->count,
3663 		  (long unsigned int)k->id);
3664 	  print_token(out, tok);
3665 	  fprintf(out, "\n");
3666 	  q = tok; /* reset q */
3667 	}
3668 	p++;
3669       }
3670     }
3671   }
3672 }
3673 
3674 /* This is a quick hack to
3675    1) speed up the optimization and
3676    2) stabilise the estimated category model over a sequence of learning runs.
3677 
3678    Normally, if the dataset is nearly the same, then the optimal lambda weights
3679    should be fairly similar (This is not strictly true if the digramic
3680    measure varies too much).
3681 
3682    By reading the old category and presetting the lambdas from it, we typically
3683    get close enough to the true weights that one or two optimization iterations
3684    suffice. Moreover in that case, most weights should stay close to their
3685    "previous" values in the category, so the user doesn't think it's weird.
3686 
3687    A side effect is that there is entropy creep when relearning the same
3688    dataset over and over. Unless there's a bug I haven't noticed, this occurs
3689    because the entropy maximum we compute is affected by numerical precision
3690    and feature ordering within the hash. Each time we relearn the same dataset,
3691    we start off with a perturbation from the previous best weights, and obtain
3692    a small improvement.
3693 
3694    WARNING: THIS USES cat[0], SO IS NOT COMPATIBLE WITH CLASSIFYING.
3695  */
3696 void learner_prefill_lambdas(learner_t *learner, category_t **pxcat) {
3697   hash_count_t c;
3698   c_item_t *i, *e;
3699   l_item_t *k;
3700   category_t *xcat = &(cat[0]);
3701 
3702   *pxcat = NULL;
3703   xcat->fullfilename = strdup(learner->filename);
3704   if( open_category(cat) ) {
3705     if( xcat->model.options & (1<<M_OPTION_WARNING_BAD) ) {
3706       if( u_options & (1<<U_OPTION_VERBOSE) ) {
3707 	errormsg(E_WARNING, "old category file %s may have bad data\n",
3708 		xcat->fullfilename);
3709       }
3710     } else if( (xcat->model.options == m_options) &&
3711 	       (xcat->retype == learner->retype) &&
3712 	       (fabs((xcat->model_unique_token_count/
3713 		      (double)learner->unique_token_count) - 1.0) < 0.15) ) {
3714       c = xcat->model_unique_token_count;
3715       if( u_options & (1<<U_OPTION_VERBOSE) ) {
3716 	fprintf(stdout, "preloading %ld token weights\n", (long int)c);
3717       }
3718 
3719       MADVISE(xcat->hash, sizeof(c_item_t) * xcat->max_tokens, MADV_SEQUENTIAL);
3720 
3721       e = xcat->hash + xcat->max_tokens;
3722       for(i = xcat->hash; i != e; i++) {
3723 	if( FILLEDP(i) ) {
3724 	  k = find_in_learner(learner, NTOH_ID(i->id));
3725 	  if( k ) {
3726 	    k->lam = NTOH_LAMBDA(i->lam);
3727 	  }
3728 	  if( --c <= 0 ) {
3729 	    break;
3730 	  }
3731 	}
3732       }
3733 
3734       /* tweak default quality settings */
3735       if( !quality ) {
3736 	qtol_div = 0.01;
3737 	qtol_lam = CLIP_LAMBDA_TOL(0.01);
3738 	qtol_logz = 0.05;
3739       }
3740 
3741     }
3742     /* we're done, but don't close the category yet, as we might want to
3743        overwrite it */
3744     *pxcat = xcat;
3745   } else {
3746     if( u_options & (1<<U_OPTION_VERBOSE) ) {
3747       errormsg(E_WARNING, "cannot open file for reading %s\n",
3748 	      xcat->fullfilename);
3749     }
3750     free(xcat->fullfilename);
3751   }
3752 }
3753 
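/* Why prefilling helps, in miniature: an iterative solver started near
   the answer needs far fewer steps than one started cold.  A toy Newton
   iteration for sqrt(a), illustrative only and unrelated to dbacl's
   estimator; enable the #if to compile. */
#if 0
static int newton_sqrt_steps(double a, double x0, double tol) {
  double x = x0;
  int steps = 0;
  while( fabs(x*x - a) > tol && steps < 100 ) {
    x = 0.5 * (x + a/x);  /* one Newton step */
    steps++;
  }
  return steps; /* a warm x0 near sqrt(a) converges in one or two steps */
}
#endif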
3754 
3755 void optimize_and_save(learner_t *learner) {
3756 
3757   hash_count_t i;
3758   token_order_t c;
3759   category_t *opencat = NULL;
3760 
3761   if(100 * learner->unique_token_count >= HASH_FULL * learner->max_tokens) {
3762     errormsg(E_WARNING,
3763 	    "table full, some tokens ignored - "
3764 	    "try with option -h %i\n",
3765 	    learner->max_hash_bits + 1);
3766   } else if( learner->unique_token_count <= 0 ) {
3767     errormsg(E_WARNING,
3768 	    "no tokens matched - have I learned nothing?\n");
3769     tmp_close(learner);
3770     exit(0); /* exit code for success */
3771   }
3772 
3773   if( skewed_constraints_warning ) {
3774     errormsg(E_WARNING,
3775 	    "ran out of integers (too much data), constraints will be skewed.\n");
3776     m_options |= (1<<M_OPTION_WARNING_BAD);
3777   }
3778 
3779   if( u_options & (1<<U_OPTION_VERBOSE) ) {
3780     fprintf(stdout,
3781 	    "picked up %" FMT_printf_integer_t " (%" FMT_printf_integer_t " distinct) tokens\n",
3782 	    learner->full_token_count, learner->unique_token_count);
3783     fprintf(stdout,
3784 	    "calculating reference word weights\n");
3785   }
3786 
3787   if( *online ) {
3788     write_online_learner_struct(learner, online);
3789   }
3790 
3791   /* transposition smoothing */
3792 /*   transpose_digrams(); */
3793 
3794   if( *digtype && !strcmp(digtype, "uniform") ) {
3795     make_uniform_digrams(learner);
3796   } else if( *digtype && !strcmp(digtype, "dirichlet") ) {
3797     make_dirichlet_digrams(learner);
3798   } else if( *digtype && !strcmp(digtype, "maxent") ) {
3799     make_entropic_digrams(learner);
3800   } else if( *digtype && !strcmp(digtype, "toklen") ) {
3801     make_toklen_digrams(learner);
3802   } else if( *digtype && !strcmp(digtype, "mle") ) {
3803     make_mle_digrams(learner);
3804   } else if( *digtype && !strcmp(digtype, "iid") ) {
3805     make_iid_digrams(learner);
3806   } else {
3807     make_uniform_digrams(learner);
3808   }
3809 
3810   if( learner->fixed_order_token_count[1] == 0 ) {
3811     /* it's a higher order model but there are no first
3812        order tokens! We can't handle that! Go through the
3813        hash converting everything to first order.
3814        This will result in incorrect calculations, unless
3815        the higher order tokens don't overlap. Suitable only for
3816        geniuses and fools. */
3817     errormsg(E_WARNING,
3818 	    "\n"
3819 	    "         You have not defined any unigrams, so in this model\n"
3820 	    "         features will be treated independently, which is quite\n"
3821 	    "         likely incorrect. I hope you know what you're doing,\n"
3822 	    "         because I don't!\n\n");
3823 
3824     m_options |= (1<<M_OPTION_MULTINOMIAL);
3825     for(c = 2; c <= learner->max_order; c++) {
3826       learner->fixed_order_token_count[1] += learner->fixed_order_token_count[c];
3827       learner->fixed_order_unique_token_count[1] +=
3828 	learner->fixed_order_unique_token_count[c];
3829     }
3830     for(i = 0; i < learner->max_tokens; i++) {
3831       if( FILLEDP(&learner->hash[i]) ) {
3832 	learner->hash[i].typ.order = 1;
3833       }
3834     }
3835 
3836   }
3837 
3838 
3839   /* if the category already exists, we read its lambda values.
3840      this should speed up the minimization slightly */
3841   if( u_options & (1<<U_OPTION_NOZEROLEARN) ) {
3842     learner_prefill_lambdas(learner, &opencat);
3843   }
3844 
3845   minimize_learner_divergence(learner);
3846 
3847   calc_shannon(learner);
3848 
3849   if( u_options & (1<<U_OPTION_DUMP) ) {
3850     dump_model(learner, stdout, learner->tmp.file);
3851   }
3852 
3853   tmp_close(learner);
3854   /* we don't free learner->tmpiobuf, as we'll be exiting soon anyway */
3855 #if defined HAVE_POSIX_MEMALIGN
3856 /*   if( learner->tmpiobuf ) { free(learner->tmpiobuf); } */
3857 #endif
3858 
3859   /* now save the model to a file */
3860   if( !opencat || !fast_partial_save_learner(learner, opencat) ) {
3861     save_learner(learner, online);
3862   }
3863   if( opencat ) { free_category(opencat); }
3864 }
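
/* The strcmp() chain that picks the digram type above is a natural
   candidate for table-driven dispatch.  A sketch of that alternative
   (hypothetical helper; it assumes the make_*_digrams() routines return
   void, as their use above suggests): */
#if 0
typedef struct {
  const char *name;
  void (*make)(learner_t *);
} digram_maker_t;

static const digram_maker_t digram_makers[] = {
  { "uniform",   make_uniform_digrams },
  { "dirichlet", make_dirichlet_digrams },
  { "maxent",    make_entropic_digrams },
  { "toklen",    make_toklen_digrams },
  { "mle",       make_mle_digrams },
  { "iid",       make_iid_digrams },
};

static void make_digrams_by_name(learner_t *learner, const char *name) {
  size_t i;
  for(i = 0; i < sizeof(digram_makers)/sizeof(digram_makers[0]); i++) {
    if( name && !strcmp(name, digram_makers[i].name) ) {
      (*digram_makers[i].make)(learner);
      return;
    }
  }
  make_uniform_digrams(learner); /* same default as the chain above */
}
#endif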
3865 
3866 /***********************************************************
3867  * MULTIBYTE FILE HANDLING FUNCTIONS                       *
3868  * this is suitable for any locale whose character set     *
3869  * encoding doesn't include NUL bytes inside characters    *
3870  ***********************************************************/
3871 
3872 /* this code is executed before processing each line of input;
3873    it handles indents and appends via the -A and -a switches */
3874 char *handle_indents_and_appends(char *textbuf) {
3875 
3876   char *pptextbuf = textbuf; /* default */
3877 
3878   if( u_options & (1<<U_OPTION_INDENTED) ) {
3879     if( textbuf[0] == ' ' ) {
3880       pptextbuf = textbuf + 1; /* processing should ignore indent */
3881     } else {
3882       if( u_options & (1<<U_OPTION_APPEND) ) {
3883 	fprintf(stdout, "%s", textbuf);
3884       }
3885       pptextbuf = NULL; /* no further processing */
3886     }
3887   }
3888 
3889   /* if appending, print the lines as they
3890      come in before they are processed */
3891   if( pptextbuf && (u_options & (1<<U_OPTION_APPEND)) ) {
3892     fprintf(stdout, " %s", pptextbuf); /* NULL means the line was already echoed above */
3893   }
3894 
3895   return pptextbuf;
3896 }
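
/* How the -A protocol plays out line by line: indented lines have their
   indent dropped and are passed on for processing, unindented lines are
   echoed verbatim and skipped.  A hypothetical driver, for illustration
   only (the sample lines are made up): */
#if 0
static void demo_indent_protocol(void) {
  char line1[] = " From: alice";   /* indented: echoed and processed, indent dropped */
  char line2[] = "X-dbacl: seen";  /* not indented: echoed verbatim, returns NULL */
  u_options |= (1<<U_OPTION_INDENTED) | (1<<U_OPTION_APPEND);
  handle_indents_and_appends(line1); /* prints " From: alice", returns "From: alice" */
  handle_indents_and_appends(line2); /* prints "X-dbacl: seen", returns NULL */
}
#endif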
3897 
3898 /***********************************************************
3899  * WIDE CHARACTER FILE HANDLING FUNCTIONS                  *
3900  * this is needed for any locale whose character set       *
3901  * encoding can include NUL bytes inside characters        *
3902  ***********************************************************/
3903 
3904 
3905 /***********************************************************
3906  * MAIN FUNCTIONS                                          *
3907  ***********************************************************/
3908 void learner_preprocess_fun() {
3909   category_count_t r;
3910 
3911   init_learner(&learner, online, 0);
3912 
3913   for(r = 0; r < ronline_count; r++) {
3914     merge_learner_struct(&learner, ronline[r]);
3915   }
3916 }
3917 
3918 void learner_post_line_fun(char *buf) {
3919   /* called after each line; only acts when buf is NULL, i.e. at end of file */
3920   if( !buf && (m_options & (1<<M_OPTION_MBOX_FORMAT)) ) {
3921     count_mbox_messages(&learner, mbox.state, buf);
3922   }
3923 }
3924 
3925 void learner_postprocess_fun() {
3926   optimize_and_save(&learner);
3927 }
3928 
3929 void learner_cleanup_fun() {
3930   free_learner(&learner);
3931 }
3932 
3933 void learner_word_fun(char *tok, token_type_t tt, regex_count_t re) {
3934   hash_word_and_learn(&learner, tok, tt, re);
3935 }
3936 
3937 void classifier_preprocess_fun() {
3938   category_count_t c;
3939   /* no need to "load" the categories, this is done in set_option() */
3940 
3941   if( m_options & (1<<M_OPTION_CALCENTROPY) ) {
3942     init_empirical(&empirical,
3943 		   default_max_tokens,
3944 		   default_max_hash_bits); /* sets cached to zero */
3945   }
3946   reset_all_scores();
3947 
3948   if( u_options & (1<<U_OPTION_DUMP) ) {
3949     for(c = 0; c < cat_count; c++) {
3950       fprintf(stdout, "%s%10s ", (c ? " " : "# categories: "), cat[c].filename);
3951     }
3952     fprintf(stdout, "\n# format: ");
3953 
3954     if( u_options & (1<<U_OPTION_POSTERIOR) ) {
3955       fprintf(stdout, "score_delta\n");
3956     } else if( u_options & (1<<U_OPTION_SCORES) ) {
3957       fprintf(stdout, "avg_score * complexity\n");
3958     } else if( u_options & (1<<U_OPTION_VAR) ) {
3959       fprintf(stdout, "avg_score * complexity # s2\n");
3960     } else {
3961       fprintf(stdout, "lambda digref -renorm multi shannon id\n");
3962     }
3963   } else if( u_options & (1<<U_OPTION_DEBUG) ) {
3964     if( u_options & (1<<U_OPTION_PRIOR_CORRECTION) ) {
3965       for(c = 0; c < cat_count; c++) {
3966 	fprintf(stdout, "%s%10s %5.2f ", (c ? " " : "# prior: "),
3967 		cat[c].filename, cat[c].prior);
3968       }
3969       fprintf(stdout, "\n");
3970     }
3971   }
3972 
3973 }
3974 
3975 void classifier_cleanup_fun() {
3976 #undef GOODGUY
3977 #if defined GOODGUY
3978   /* normally we should free everything nicely, but there's no
3979      point, since the kernel will free the process memory. It's actually
3980      faster to not free... */
3981   category_count_t c;
3982   for(c = 0; c < cat_count; c++) {
3983     free_category(&cat[c]);
3984   }
3985   free_empirical(&empirical);
3986 #endif
3987 }
3988 
3989 
3990 int email_line_filter(MBOX_State *mbox, char *buf) {
3991   int retval = mbox_line_filter(mbox, buf, &xml);
3992   count_mbox_messages(&learner, mbox->state, buf);
3993   return retval;
3994 }
3995 
3996 #if defined HAVE_MBRTOWC
3997 int w_email_line_filter(MBOX_State *mbox, wchar_t *buf) {
3998   return w_mbox_line_filter(mbox, buf, &xml);
3999 }
4000 #endif
4001 
4002 int set_option(int op, char *optarg) {
4003   int c = 0;
4004   switch(op) {
4005   case '@':
4006     /* this is an official NOOP, it MUST be ignored (see spherecl) */
4007     break;
4008   case '0':
4009     u_options &= ~(1<<U_OPTION_NOZEROLEARN);
4010     break;
4011   case '1':
4012     u_options |= (1<<U_OPTION_NOZEROLEARN);
4013     break;
4014   case 'A':
4015     u_options |= (1<<U_OPTION_INDENTED);
4016     /* fall through */
4017   case 'a':
4018     u_options |= (1<<U_OPTION_APPEND);
4019     break;
4020   case 'e':
4021     if( !strcasecmp(optarg, "alnum") ) {
4022       m_cp = CP_ALNUM;
4023     } else if( !strcasecmp(optarg, "alpha") ) {
4024       m_cp = CP_ALPHA;
4025     } else if( !strcasecmp(optarg, "cef") ) {
4026       m_cp = CP_CEF;
4027     } else if( !strcasecmp(optarg, "graph") ) {
4028       m_cp = CP_GRAPH;
4029     } else if( !strcasecmp(optarg, "adp") ) {
4030       m_cp = CP_ADP;
4031     } else if( !strcasecmp(optarg, "cef2") ) {
4032       m_cp = CP_CEF2;
4033     } else if( !strcasecmp(optarg, "char") ) {
4034       m_cp = CP_CHAR;
4035     } else {
4036       errormsg(E_WARNING,
4037 	       "unrecognized option \"%s\", ignoring.\n",
4038 	       optarg);
4039     }
4040     c++;
4041     break;
4042   case 'D':
4043     u_options |= (1<<U_OPTION_DEBUG);
4044     break;
4045   case 'm':
4046 #if defined HAVE_MMAP
4047     u_options |= (1<<U_OPTION_MMAP);
4048 #endif
4049     break;
4050   case 'M':
4051     m_options |= (1<<M_OPTION_MULTINOMIAL);
4052     break;
4053   case 'N':
4054     u_options |= (1<<U_OPTION_POSTERIOR);
4055     break;
4056   case 'R':
4057     if( cat_count >= MAX_CAT ) {
4058       errormsg(E_WARNING,
4059 	       "maximum reached, random text category omitted\n");
4060     } else if( u_options & (1<<U_OPTION_LEARN) ) {
4061       errormsg(E_ERROR,"cannot use options -l and -R together\n");
4062       exit(1);
4063     } else {
4064       u_options |= (1<<U_OPTION_CLASSIFY);
4065 
4066       cat[cat_count].fullfilename = "random";
4067 
4068       init_category(&cat[cat_count]);
4069       init_purely_random_text_category(&cat[cat_count]);
4070 
4071       cat_count++;
4072     }
4073     break;
4074   case 'T':
4075     if( !strncasecmp(optarg, "text", 4) ) {
4076       /* this is the default */
4077       /* below, make sure the comparisons are in the correct order */
4078     } else if( !strcasecmp(optarg, "email:noplain") ) {
4079       m_options |= (1<<M_OPTION_NOPLAIN);
4080       m_options |= (1<<M_OPTION_MBOX_FORMAT);
4081     } else if( !strcasecmp(optarg, "email:plain") ) {
4082       m_options |= (1<<M_OPTION_PLAIN);
4083       m_options |= (1<<M_OPTION_MBOX_FORMAT);
4084     } else if( !strcasecmp(optarg, "email:headers") ) {
4085       m_options |= (1<<M_OPTION_HEADERS);
4086       m_options |= (1<<M_OPTION_MBOX_FORMAT);
4087     } else if( !strcasecmp(optarg, "email:noheaders") ) {
4088       m_options |= (1<<M_OPTION_NOHEADERS);
4089       m_options |= (1<<M_OPTION_MBOX_FORMAT);
4090     } else if( !strcasecmp(optarg, "email:atts") ) {
4091       m_options |= (1<<M_OPTION_ATTACHMENTS);
4092       m_options |= (1<<M_OPTION_MBOX_FORMAT);
4093     } else if( !strcasecmp(optarg, "email:xheaders") ) {
4094       m_options |= (1<<M_OPTION_XHEADERS);
4095       m_options |= (1<<M_OPTION_MBOX_FORMAT);
4096     } else if( !strcasecmp(optarg, "email:theaders") ) {
4097       m_options |= (1<<M_OPTION_THEADERS);
4098       m_options |= (1<<M_OPTION_MBOX_FORMAT);
4099     } else if( !strcasecmp(optarg, "email") ) {
4100       m_options |= (1<<M_OPTION_MBOX_FORMAT);
4101     } else if( !strcasecmp(optarg, "html:links") ) {
4102       m_options |= (1<<M_OPTION_SHOW_LINKS);
4103       m_options |= (1<<M_OPTION_HTML);
4104     } else if( !strcasecmp(optarg, "html:alt") ) {
4105       m_options |= (1<<M_OPTION_SHOW_ALT);
4106       m_options |= (1<<M_OPTION_HTML);
4107     } else if( !strcasecmp(optarg, "html:forms") ) {
4108       m_options |= (1<<M_OPTION_SHOW_FORMS);
4109       m_options |= (1<<M_OPTION_HTML);
4110     } else if( !strcasecmp(optarg, "html:scripts") ) {
4111       m_options |= (1<<M_OPTION_SHOW_SCRIPT);
4112       m_options |= (1<<M_OPTION_HTML);
4113     } else if( !strcasecmp(optarg, "html:styles") ) {
4114       m_options |= (1<<M_OPTION_SHOW_STYLE);
4115       m_options |= (1<<M_OPTION_HTML);
4116     } else if( !strcasecmp(optarg, "html:comments") ) {
4117       m_options |= (1<<M_OPTION_SHOW_HTML_COMMENTS);
4118       m_options |= (1<<M_OPTION_HTML);
4119     } else if( !strcasecmp(optarg, "html") ) {
4120       m_options |= (1<<M_OPTION_HTML);
4121     } else if( !strcasecmp(optarg, "xml") ) {
4122       m_options |= (1<<M_OPTION_XML);
4123     } else { /* default */
4124       errormsg(E_WARNING,
4125 	       "unrecognized option -T %s, ignoring.\n",
4126 	       optarg);
4127     }
4128     c++;
4129     break;
4130   case 'U':
4131     u_options |= (1<<U_OPTION_VAR);
4132     m_options |= (1<<M_OPTION_CALCENTROPY);
4133     break;
4134   case 'V':
4135     fprintf(stdout, "dbacl version %s\n", VERSION);
4136     fprintf(stdout, COPYBLURB, "dbacl");
4137 #if defined __GNUC__
4138     fprintf(stdout, "Using GNU modern regexes.\n");
4139 #else
4140     fprintf(stdout, "Using system regexes.\n");
4141 #endif
4142 #if defined DIGITIZE_DIGRAMS
4143     fprintf(stdout, "Digrams are digitized.\n");
4144 #endif
4145 #if defined DIGITIZE_LAMBDA
4146     fprintf(stdout, "Lagrangian multipliers are digitized.\n");
4147 #endif
4148 #if defined DIGITIZE_LWEIGHTS
4149     fprintf(stdout, "Reference weights are digitized.\n");
4150 #endif
4151 #if defined PORTABLE_CATS
4152     fprintf(stdout, "Category files are portable to systems with other byte orders.\n");
4153 #else
4154     fprintf(stdout, "Category files are NOT portable to systems with other byte orders.\n");
4155 #endif
4156 #if ! defined HAVE_MMAP
4157     fprintf(stdout, "Fast category loading with -m switch unavailable.\n");
4158 #endif
4159     fprintf(stdout, "Feature memory requirements: %d bytes (classifying), %d bytes (learning)\n",
4160 	    (int)sizeof(c_item_t), (int)sizeof(l_item_t));
4161     fprintf(stdout, "To change these settings, recompile from source.\n");
4162     exit(0);
4163     break;
4164   case 'X':
4165     m_options |= (1<<M_OPTION_CALCENTROPY);
4166     u_options |= (1<<U_OPTION_CONFIDENCE);
4167     break;
4168   case 'd':
4169     u_options |= (1<<U_OPTION_DUMP);
4170     break;
4171   case 'h': /* select memory size in powers of 2 */
4172     default_max_hash_bits = atoi(optarg);
4173     if( default_max_hash_bits > MAX_HASH_BITS ) {
4174       errormsg(E_WARNING,
4175 	       "maximum hash size will be 2^%d\n",
4176 	       MAX_HASH_BITS);
4177       default_max_hash_bits = MAX_HASH_BITS;
4178     }
4179     default_max_tokens = (1<<default_max_hash_bits);
4180     c++;
4181     break;
4182   case 'H': /* select memory size in powers of 2 */
4183     default_max_grow_hash_bits = atoi(optarg);
4184     if( default_max_grow_hash_bits > MAX_HASH_BITS ) {
4185       errormsg(E_WARNING,
4186 	       "maximum hash size will be 2^%d\n",
4187 	       MAX_HASH_BITS);
4188       default_max_grow_hash_bits = MAX_HASH_BITS;
4189     }
4190     default_max_grow_tokens = (1<<default_max_grow_hash_bits);
4191     u_options |= (1<<U_OPTION_GROWHASH);
4192     c++;
4193     break;
4194   case 'j':
4195     m_options |= (1<<M_OPTION_CASEN);
4196     break;
4197   case 'n':
4198     u_options |= (1<<U_OPTION_SCORES);
4199     break;
4200   case 'c':
4201     if( cat_count >= MAX_CAT ) {
4202       errormsg(E_WARNING,
4203 	       "maximum reached, category ignored\n");
4204     } else if( u_options & (1<<U_OPTION_LEARN) ) {
4205       errormsg(E_ERROR, "cannot use options -l and -c together\n");
4206       exit(1);
4207     } else {
4208       u_options |= (1<<U_OPTION_CLASSIFY);
4209 
4210       cat[cat_count].fullfilename = sanitize_path(optarg, extn);
4211 
4212       if( !*optarg ) {
4213 	errormsg(E_FATAL, "category needs a name\n");
4214       }
4215       if( !load_category(&cat[cat_count]) ) {
4216 	errormsg(E_FATAL, "could not load category %s\n",
4217 		 cat[cat_count].fullfilename);
4218       }
4219       if( sanitize_model_options(&m_options,&m_cp,&cat[cat_count]) ) {
4220 	ngram_order = (ngram_order < cat[cat_count].max_order) ?
4221 	  cat[cat_count].max_order : ngram_order;
4222 	cat_count++;
4223       }
4224     }
4225     c++;
4226     break;
4227   case 'P':
4228     u_options |= (1<<U_OPTION_PRIOR_CORRECTION);
4229     break;
4230   case 'q':
4231     quality = atoi(optarg);
4232     switch(quality) {
4233     case 1:
4234       qtol_div = 0.01;
4235       qtol_lam = CLIP_LAMBDA_TOL(0.05);
4236       qtol_logz = 0.05;
4237       break;
4238     case 2:
4239       qtol_div = 0.01;
4240       qtol_lam = CLIP_LAMBDA_TOL(0.01);
4241       qtol_logz = 0.05;
4242       break;
4243     case 3:
4244       qtol_div = 0.01;
4245       qtol_lam = CLIP_LAMBDA_TOL(0.01);
4246       qtol_logz = 0.01;
4247       break;
4248     case 4:
4249       qtol_div = 0.01;
4250       qtol_lam = CLIP_LAMBDA_TOL(0.01);
4251       qtol_logz = 0.01;
4252       qtol_multipass = 1;
4253       break;
4254     default:
4255       errormsg(E_FATAL,
4256 	       "the -q switch needs a number between 1(fastest) and 4(best quality)\n");
4257       break;
4258     }
4259     c++;
4260     break;
4261   case 'r':
4262     m_options |= (1<<M_OPTION_REFMODEL);
4263     break;
4264   case 'w':
4265     ngram_order = atoi(optarg);
4266     if( !*optarg || (ngram_order < 1) || (ngram_order > 7) ) {
4267       errormsg(E_FATAL,
4268 	       "the -w switch needs a number between 1 and 7\n");
4269     }
4270     c++;
4271     break;
4272   case 'S':
4273     m_options |= (1<<M_OPTION_NGRAM_STRADDLE_NL);
4274     break;
4275   case 'x':
4276     decimation = atoi(optarg);
4277     if( !*optarg || (decimation < 1) || (decimation > MAX_HASH_BITS) ) {
4278       errormsg(E_WARNING,
4279 	       "option -x ignored, needs an integer between 1 and %d\n",
4280 	       MAX_HASH_BITS);
4281     } else {
4282       u_options |= (1<<U_OPTION_DECIMATE);
4283       srand(0);
4284     }
4285     c++;
4286     break;
4287   case 'f':
4288     if( filter_count >= MAX_CAT ) {
4289       errormsg(E_WARNING, "maximum reached, filter ignored\n");
4290     } else if( u_options & (1<<U_OPTION_LEARN) ) {
4291       errormsg(E_ERROR, "cannot use options -l and -f together\n");
4292       exit(1);
4293     } else if( !*optarg ) {
4294       errormsg(E_ERROR, "filter must be category name or number\n");
4295       exit(1);
4296     } else {
4297       u_options |= (1<<U_OPTION_FILTER);
4298       filter[filter_count] = -1;
4299 
4300       /* see if it's a known category */
4301       for(c = 0; c < cat_count; c++) {
4302 	if( !strcmp(cat[c].filename, optarg) ) {
4303 	  filter[filter_count] = c;
4304 	  break;
4305 	}
4306       }
4307       /* if not recognized, see if it's a number */
4308       if( filter[filter_count] < 0 ) {
4309 	filter[filter_count] = atoi(optarg) - 1;
4310       }
4311       if( filter[filter_count] < 0 ) { /* still not recognized */
4312 	errormsg(E_ERROR, "unrecognized category in -f option [%s]\n",
4313 		 optarg);
4314 	exit(1);
4315       }
4316       filter_count++;
4317     }
4318     c++;
4319     break;
4320   case 'F':
4321     u_options |= (1<<U_OPTION_CLASSIFY_MULTIFILE);
4322     break;
4323   case 'v':
4324     u_options |= (1<<U_OPTION_VERBOSE);
4325     break;
4326   case 'L':
4327     if( *optarg &&
4328 	(!strcmp(optarg, "uniform") ||
4329 	 !strcmp(optarg, "dirichlet") ||
4330 	 !strcmp(optarg, "maxent") ||
4331 	 !strcmp(optarg, "mle") ||
4332 	 !strcmp(optarg, "iid") ||
4333 	 !strcmp(optarg, "toklen")) ) {
4334       digtype = optarg;
4335     } else {
4336       errormsg(E_FATAL, "-L option needs one of \"uniform\", \"dirichlet\", \"maxent\", \"toklen\", \"mle\", \"iid\"\n");
4337     }
4338     c++;
4339     break;
4340   case 'l':
4341     if( u_options & (1<<U_OPTION_CLASSIFY) ) {
4342       errormsg(E_ERROR,
4343 	       "cannot use options -l and -c together\n");
4344       exit(1);
4345     } else if( u_options & (1<<U_OPTION_LEARN) ) {
4346       errormsg(E_ERROR,
4347 	       "option -l can only occur once\n");
4348       exit(1);
4349     } else if( !*optarg ) {
4350       errormsg(E_ERROR, "category name must not be empty\n");
4351     } else {
4352       u_options |= (1<<U_OPTION_LEARN);
4353       learner.filename = sanitize_path(optarg, extn);
4354       if( !*learner.filename ) {
4355 	errormsg(E_ERROR, "category needs a name\n");
4356 	exit(1);
4357       }
4358     }
4359     c++;
4360     break;
4361   case 'o':
4362     if( !*optarg ) {
4363       errormsg(E_ERROR, "category name must not be empty in -o switch\n");
4364     } else if( !*online ) {
4365       online = sanitize_path(optarg, "");
4366     } else {
4367       errormsg(E_WARNING,
4368 	       "multiple -o switches not supported, ignoring all but the first\n");
4369     }
4370     c++;
4371     break;
4372   case 'O':
4373     if( !*optarg ) {
4374       errormsg(E_ERROR, "category name must not be empty in -O switch\n");
4375     } else {
4376       ronline[ronline_count++] = sanitize_path(optarg, "");
4377     }
4378     c++;
4379     break;
4380   case 'G':
4381   case 'g':
4382     /* if regex can't be compiled, load_regex() exits */
4383     learner.retype |= (1<<load_regex(optarg));
4384     c++;
4385     break;
4386   case 'i':
4387     m_options |= (1<<M_OPTION_I18N);
4388 #if defined HAVE_LANGINFO_H
4389     if( !strcmp(nl_langinfo(CODESET), "UTF-8") ) {
4390       errormsg(E_WARNING, "you have UTF-8, so -i is not needed.\n");
4391     }
4392 #endif
4393 #if !defined HAVE_MBRTOWC
4394     errormsg(E_WARNING, "this tool was compiled without wide character support. Full internationalization is disabled.\n");
4395     m_options &= ~(1<<M_OPTION_I18N);
4396 #endif
4397     break;
4398   case 'Y':
4399     u_options |= (1<<U_OPTION_MEDIACOUNTS);
4400     break;
4401   case 'z':
4402     zthreshold = atoi(optarg);
4403     c++;
4404     break;
4405   default:
4406     c--;
4407     break;
4408   }
4409   return c;
4410 }
4411 
4412 void sanitize_options() {
4413   category_count_t c;
4414 
4415   /* consistency checks */
4416   if( ((u_options>>U_OPTION_CLASSIFY) & 1) +
4417       ((u_options>>U_OPTION_LEARN) & 1) != 1 ) {
4418     errormsg(E_ERROR, "please use either -c or -l option.\n");
4419     exit(1);
4420   }
4421 
4422   if( (*online || (ronline_count > 0)) &&
4423       (u_options & (1<<U_OPTION_CONFIDENCE)) ) {
4424 /*     errormsg(E_WARNING,  */
4425 /* 	     "-X switch cannot be used with -o switch, disabling -X.\n"); */
4426     u_options &= ~(1<<U_OPTION_CONFIDENCE); /* U_OPTION_CONFIDENCE lives in u_options, not m_options */
4427   }
4428 
4429   if( (u_options & (1<<U_OPTION_DECIMATE)) &&
4430       !(u_options & (1<<U_OPTION_LEARN)) ) {
4431     errormsg(E_WARNING,
4432 	    "option -x ignored, applies only when learning.\n");
4433     u_options &= ~(1<<U_OPTION_DECIMATE);
4434   }
4435 
4436   if( u_options & (1<<U_OPTION_DUMP) ) {
4437     if( u_options & (1<<U_OPTION_VERBOSE) ) {
4438       u_options &= ~(1<<U_OPTION_VERBOSE); /* verbose writes garbage to stdout */
4439       u_options &= ~(1<<U_OPTION_DEBUG);
4440     }
4441   }
4442 
4443   if( !(m_options & (1<<M_OPTION_TEXT_FORMAT)) &&
4444       !(m_options & (1<<M_OPTION_MBOX_FORMAT)) &&
4445       !(m_options & (1<<M_OPTION_HTML)) &&
4446       !(m_options & (1<<M_OPTION_XML)) ) {
4447     m_options |= (1<<M_OPTION_TEXT_FORMAT);
4448   }
4449 
4450   if( ((m_options>>M_OPTION_TEXT_FORMAT) & 1) +
4451       ((m_options>>M_OPTION_MBOX_FORMAT) & 1) > 1 ) {
4452     errormsg(E_ERROR,
4453 	    "please use only one of either -T text or -T email options.\n");
4454     exit(1);
4455   }
4456 
4457   if( ((m_options>>M_OPTION_XML) & 1) +
4458       ((m_options>>M_OPTION_HTML) & 1) > 1 ) {
4459     errormsg(E_ERROR,
4460 	    "please use only one of either -T xml or -T html options.\n");
4461     exit(1);
4462   }
4463 
4464 
4465   if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
4466     /* mbox format needs html unless user wants xml */
4467     if( !(m_options & (1<<M_OPTION_XML)) ) {
4468       m_options |= (1<<M_OPTION_HTML);
4469     }
4470 
4471     /* for mboxes, only compute ngrams for each line individually */
4472 /*     m_options &= ~(1<<M_OPTION_NGRAM_STRADDLE_NL); */
4473 
4474     /* always calculate entropy statistics when learning */
4475     if( u_options & (1<<U_OPTION_LEARN) ) {
4476       m_options |= (1<<M_OPTION_CALCENTROPY);
4477     }
4478   }
4479 
4480 
4481   if( (u_options & (1<<U_OPTION_APPEND)) &&
4482       (u_options & (1<<U_OPTION_FILTER)) ) {
4483     u_options &= ~(1<<U_OPTION_APPEND);
4484     errormsg(E_WARNING,
4485 	    "disabling option -a, because it cannot be used with -f.\n");
4486   }
4487 
4488   /* decide if we need some options */
4489 
4490   if( u_options & (1<<U_OPTION_LEARN) ) {
4491     if( !regex_count ) {
4492       m_options |= (1<<M_OPTION_USE_STDTOK);
4493     } else {
4494       m_options &= ~(1<<M_OPTION_USE_STDTOK);
4495     }
4496   }
4497 
4498   if( m_options & (1<<M_OPTION_I18N) ) {
4499     /* I've removed the internationalized regex code, because it makes
4500        the code too complex to handle both multibyte and wide char regexes
4501        simultaneously. */
4502     errormsg(E_WARNING,
4503 	     "this version of dbacl does not support -g and -i switches simultaneously, disabling -i.\n");
4504     m_options &= ~(1<<M_OPTION_I18N);
4505   }
4506 
4507   if( m_options & (1<<M_OPTION_MULTINOMIAL) ) {
4508     if( cat_count == 1 ) {
4509       if( cat[0].model.type == simple ) {
4510 	m_options |= (1<<M_OPTION_CALCENTROPY);
4511       }
4512     } else if( cat_count > 1 ) {
4513       for(c = 1; c < cat_count; c++) {
4514 	if( cat[c].model.type == sequential ) { break; }
4515 	if( cat[c].retype != cat[c-1].retype ) { break; }
4516       }
4517       if( c == cat_count ) {
4518 	m_options |= (1<<M_OPTION_CALCENTROPY);
4519       }
4520     }
4521     if( !(m_options & (1<<M_OPTION_CALCENTROPY)) ) {
4522       errormsg(E_WARNING,
4523 	      "not all categories support multinomial calculations, disabling (-M switch)\n");
4524       m_options &= ~(1<<M_OPTION_MULTINOMIAL);
4525     }
4526   }
4527 
4528   /* unless overridden, we use alpha character classes only */
4529   if( m_cp == CP_DEFAULT ) {
4530     if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
4531       m_cp = CP_ADP;
4532     } else {
4533       m_cp = CP_ALPHA;
4534     }
4535   }
4536 
4537   if( !*digtype ) {
4538     if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
4539       digtype = "uniform";
4540     } else {
4541       digtype = "dirichlet";
4542     }
4543   }
4544 }
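
/* The consistency checks above lean on a small idiom: shifting a flag
   down to bit zero and masking with 1 turns set-ness into 0/1, so
   mutually exclusive switches can simply be counted.  A generic sketch
   (hypothetical helper, standard C): */
#if 0
static int exactly_one(unsigned int flags, int bit_a, int bit_b) {
  /* 1 iff precisely one of the two flag bits is set */
  return (((flags >> bit_a) & 1) + ((flags >> bit_b) & 1)) == 1;
}
/* e.g. exactly_one(u_options, U_OPTION_CLASSIFY, U_OPTION_LEARN) */
#endif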
4545 
4546 
4547 int main(int argc, char **argv) {
4548 
4549   FILE *input;
4550   signed char op;
4551   struct stat statinfo;
4552 
4553   void (*preprocess_fun)(void) = NULL;
4554   void (*word_fun)(char *, token_type_t, regex_count_t) = NULL;
4555   char *(*pre_line_fun)(char *) = NULL;
4556   void (*post_line_fun)(char *) = NULL;
4557   void (*post_file_fun)(char *) = NULL;
4558   void (*postprocess_fun)(void) = NULL;
4559   void (*cleanup_fun)(void) = NULL;
4560   int (*line_filter)(MBOX_State *, char *) = NULL;
4561   void (*character_filter)(XML_State *, char *) = NULL;
4562 #if defined HAVE_MBRTOWC
4563   int (*w_line_filter)(MBOX_State *, wchar_t *) = NULL;
4564   void (*w_character_filter)(XML_State *, wchar_t *) = NULL;
4565 #endif
4566 
4567   progname = "dbacl";
4568   inputfile = "stdin";
4569   inputline = 0;
4570 
4571   learner.filename = NULL;
4572   learner.retype = 0;
4573 
4574   /* set up internationalization */
4575   if( !setlocale(LC_ALL, "") ) {
4576     errormsg(E_WARNING,
4577 	    "could not set locale, internationalization disabled\n");
4578   } else {
4579     if( u_options & (1<<U_OPTION_VERBOSE) ) {
4580       errormsg(E_WARNING,
4581 	      "international locales not supported\n");
4582     }
4583   }
4584 
4585 #if defined(HAVE_GETPAGESIZE)
4586   system_pagesize = getpagesize();
4587 #endif
4588   if( system_pagesize == -1 ) { system_pagesize = BUFSIZ; }
4589 
4590   init_signal_handling();
4591 
4592   /* parse the options */
4593   while( (op = getopt(argc, argv,
4594 		      "01Aac:Dde:f:FG:g:H:h:ijL:l:mMNno:O:Ppq:RrST:UVvw:x:XYz:@")) > -1 ) {
4595     set_option(op, optarg);
4596   }
4597 
4598   /* end option processing */
4599   sanitize_options();
4600 
4601   /* set up callbacks */
4602   if( u_options & (1<<U_OPTION_CLASSIFY) ) {
4603 
4604     preprocess_fun = classifier_preprocess_fun;
4605     word_fun = score_word;
4606     if( u_options & (1<<U_OPTION_FILTER) ) {
4607       u_options |= (1<<U_OPTION_FASTEMP);
4608       empirical.track_features = 1;
4609       post_line_fun = line_score_categories;
4610       post_file_fun = NULL;
4611       postprocess_fun = NULL;
4612     } else if( u_options & (1<<U_OPTION_CLASSIFY_MULTIFILE) ) {
4613       post_line_fun = NULL;
4614       post_file_fun = file_score_categories;
4615       postprocess_fun = NULL;
4616     } else {
4617       post_line_fun = NULL;
4618       post_file_fun = NULL;
4619       postprocess_fun = score_categories;
4620     }
4621     cleanup_fun = classifier_cleanup_fun;
4622 
4623   } else if( u_options & (1<<U_OPTION_LEARN) ) {
4624     /* category learning */
4625 
4626     preprocess_fun = learner_preprocess_fun;
4627     word_fun = learner_word_fun;
4628     post_line_fun =  learner_post_line_fun;
4629     post_file_fun = NULL;
4630     postprocess_fun = learner_postprocess_fun;
4631     cleanup_fun = learner_cleanup_fun;
4632 
4633   } else { /* something wrong ? */
4634     usage(argv);
4635     exit(1);
4636   }
4637 
4638   /* handles some common filtering options */
4639   if( (u_options & (1<<U_OPTION_INDENTED)) ||
4640       (u_options & (1<<U_OPTION_APPEND)) ) {
4641     pre_line_fun = handle_indents_and_appends;
4642   }
4643 
4644   if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
4645     line_filter = email_line_filter;
4646   }
4647 
4648   if( (m_options & (1<<M_OPTION_XML)) ||
4649       (m_options & (1<<M_OPTION_HTML)) ) {
4650     character_filter = xml_character_filter;
4651   }
4652 
4653 #if defined HAVE_MBRTOWC
4654   if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
4655     w_line_filter = w_email_line_filter;
4656   }
4657 
4658   if( (m_options & (1<<M_OPTION_XML)) ||
4659       (m_options & (1<<M_OPTION_HTML)) ) {
4660     w_character_filter = w_xml_character_filter;
4661   }
4662 #endif
4663 
4664 
4665   if( preprocess_fun ) { (*preprocess_fun)(); }
4666 
4667 
4668   init_file_handling();
4669 
4670   /* now process each file on the command line,
4671      or if none provided read stdin */
4672   while( (optind > -1) && *(argv + optind) && !(cmd & (1<<CMD_QUITNOW)) ) {
4673     /* if it's a filename, process it */
4674     input = fopen(argv[optind], "rb");
4675     if( input ) {
4676       inputfile = argv[optind];
4677 
4678       u_options |= (1<<U_OPTION_STDIN);
4679 
4680       if( (u_options & (1<<U_OPTION_VERBOSE)) &&
4681 	  !(u_options & (1<<U_OPTION_CLASSIFY))) {
4682 	fprintf(stdout, "processing file %s\n", argv[optind]);
4683       }
4684 
4685       /* set some initial options */
4686       reset_xml_character_filter(&xml, xmlRESET);
4687 
4688       if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
4689 	reset_mbox_line_filter(&mbox);
4690       }
4691 
4692       if( fstat(fileno(input), &statinfo) == 0 ) {
4693 	switch(statinfo.st_mode & S_IFMT) {
4694 	case S_IFDIR:
4695 	  if( !(m_options & (1<<M_OPTION_I18N)) ) {
4696 	    process_directory(inputfile, line_filter, character_filter,
4697 			      word_fun, pre_line_fun, post_line_fun, post_file_fun);
4698 	  } else {
4699 #if defined HAVE_MBRTOWC
4700 	    w_process_directory(inputfile, w_line_filter, w_character_filter,
4701 				word_fun, pre_line_fun, post_line_fun, post_file_fun);
4702 #else
4703 	    errormsg(E_ERROR, "international support not available (recompile).\n");
4704 #endif
4705 	  }
4706 	  break;
4707 	default:
4708 	  if( !(m_options & (1<<M_OPTION_I18N)) ) {
4709 	    process_file(input, line_filter, character_filter,
4710 			 word_fun, pre_line_fun, post_line_fun);
4711 	  } else {
4712 #if defined HAVE_MBRTOWC
4713 	    w_process_file(input, w_line_filter, w_character_filter,
4714 			   word_fun, pre_line_fun, post_line_fun);
4715 #else
4716 	    errormsg(E_ERROR, "international support not available (recompile).\n");
4717 #endif
4718 	  }
4719 	}
4720       }
4721       fclose(input);
4722 
4723       if( post_file_fun ) { (*post_file_fun)(inputfile); }
4724 
4725     } else { /* unrecognized file name */
4726 
4727       errormsg(E_FATAL, "couldn't open %s\n", argv[optind]);
4728       /* usage(argv); */
4729       /* exit(1); */
4730 
4731     }
4732     optind++;
4733   }
4734   /* in case no files were specified, get input from stdin,
4735    * but note we reopen it in binary mode
4736    */
4737   if( !(u_options & (1<<U_OPTION_STDIN)) ) {
4738     input = fdopen(fileno(stdin), "rb");
4739     if( input ) {
4740 
4741       if( (u_options & (1<<U_OPTION_VERBOSE)) &&
4742 	  !(u_options & (1<<U_OPTION_CLASSIFY)) ) {
4743 	fprintf(stdout, "taking input from stdin\n");
4744       }
4745 
4746       /* set some initial options */
4747       reset_xml_character_filter(&xml, xmlRESET);
4748 
4749       if( m_options & (1<<M_OPTION_MBOX_FORMAT) ) {
4750 	reset_mbox_line_filter(&mbox);
4751       }
4752 
4753       if( !(m_options & (1<<M_OPTION_I18N)) ) {
4754 	process_file(input, line_filter, character_filter,
4755 		     word_fun, pre_line_fun, post_line_fun);
4756       } else {
4757 #if defined HAVE_MBRTOWC
4758 	w_process_file(input, w_line_filter, w_character_filter,
4759 		       word_fun, pre_line_fun, post_line_fun);
4760 #else
4761 	errormsg(E_ERROR, "international support not available (recompile).\n");
4762 #endif
4763       }
4764 
4765       /* must close before freeing in_iobuf, in case setvbuf was called */
4766       fclose(input);
4767 
4768       if( post_file_fun ) { (*post_file_fun)(inputfile); }
4769 
4770     }
4771   }
4772 
4773 
4774   if( postprocess_fun ) { (*postprocess_fun)(); }
4775 
4776   cleanup_file_handling();
4777 
4778   if( cleanup_fun ) { (*cleanup_fun)(); }
4779 
4780   free_all_regexes();
4781 
4782   cleanup_signal_handling();
4783 
4784   return exit_code;
4785 }
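
/* main() above is a function-pointer pipeline: learning and classifying
   plug different callbacks into the same generic file-processing loop.
   A stripped-down sketch of that wiring pattern (hypothetical names,
   standard C, illustrative only): */
#if 0
typedef struct {
  void (*preprocess)(void);
  void (*word)(char *tok);
  void (*postprocess)(void);
} pipeline_t;

static void run_pipeline(const pipeline_t *p, char **toks, int n) {
  int i;
  if( p->preprocess ) { (*p->preprocess)(); }
  for(i = 0; i < n; i++) {
    if( p->word ) { (*p->word)(toks[i]); }  /* per-token callback */
  }
  if( p->postprocess ) { (*p->postprocess)(); }
}
#endif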
4786 
4787 
4788