1 //	crm_osbf_maintenance.c  - OSBF microgrooming utilities
2 
3 // Copyright 2004 Fidelis Assis
4 // Copyright 2004-2009 William S. Yerazunis.
5 // This file is under GPLv3, as described in COPYING.
6 
7 //  OBS: CSS header structure and pruning method modified for OSBF classifier.
8 //       See functions crm_osbf_microgroom and crm_osbf_create_cssfile, below,
9 //       for details.  -- Fidelis Assis - 2004/10/20
10 
11 //  include some standard files
12 #include "crm114_sysincludes.h"
13 
14 //  include any local crm114 configuration file
15 #include "crm114_config.h"
16 
17 //  include the crm114 data structures file
18 #include "crm114_structs.h"
19 
20 //  and include the routine declarations file
21 #include "crm114.h"
22 
23 #include "crm114_osbf.h"
24 
25 long microgroom_chain_length;
26 long microgroom_stop_after;
27 
28 /* Version names */
29 char *CSS_version_name[] = {
30   "SBPH-Markovian",
31   "OSB-Bayes",
32   "Correlate",
33   "Neural",
34   "OSB-Winnow",
35   "OSBF-Bayes",
36   "Unknown"
37 };
38 
39 //    microgroom flag for osbf
40 static int osbf_microgroom;
41 // turn microgroom on (1) or off (0)
crm_osbf_set_microgroom(int value)42 void crm_osbf_set_microgroom(int value)
43 {
44   osbf_microgroom = value;
45 }
46 
47 
48 //
49 //     How to microgroom a .css file that's getting full
50 //
51 //     NOTA BENE NOTA BENE NOTA BENE NOTA BENE
52 //
53 //         This whole section of code is under intense develoment; right now
54 //         it "works" but not any better than nothing at all.  Be warned
55 //         that any patches issued on it may well never see the light of
56 //         day, as intense testing and comparison may show that the current
57 //         algorithms are, well, suckful.
58 //
59 //
60 //     There are two steps to microgrooming - first, since we know we're
61 //     already too full, we execute a 'zero unity bins'.
62 //
63 void
crm_osbf_microgroom(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hindex)64 crm_osbf_microgroom (OSBF_FEATURE_HEADER_STRUCT * header,
65 		     unsigned long hindex)
66 {
67   long i, j, k;
68   static long microgroom_count = 0;
69   long packstart;
70   long packlen;
71   long zeroed_countdown, max_zeroed_buckets;
72   long min_value, min_value_any, distance, max_distance;
73   int groom_any = 0;
74   OSBF_FEATUREBUCKET_STRUCT *h;
75 
76   // if not set by command line, use default
77   if (microgroom_chain_length == 0)
78     microgroom_chain_length = OSBF_MICROGROOM_CHAIN_LENGTH;
79   if (microgroom_stop_after == 0)
80     microgroom_stop_after = OSBF_MICROGROOM_STOP_AFTER;
81 
82 
83   // make h point to the first feature bucket
84   h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
85 
86   zeroed_countdown = microgroom_stop_after;
87   i = j = k = 0;
88   microgroom_count++;
89 
90   if (user_trace)
91     {
92       if (microgroom_count == 1)
93 	fprintf (stderr, "CSS file too full: microgrooming this css chain: ");
94       fprintf (stderr, " %ld ", microgroom_count);
95     };
96 
97 //   micropack - start at initial chain start, move to back of
98 //   chain that overflowed, then scale just that chain.
99 
100   i = j = hindex % header->buckets;
101   min_value = OSBF_FEATUREBUCKET_VALUE_MAX;
102   min_value_any = GET_BUCKET_VALUE (h[i]);
103   while (BUCKET_IN_CHAIN (h[i]))
104     {
105       if (GET_BUCKET_VALUE (h[i]) < min_value && !BUCKET_IS_LOCKED (h[i]))
106 	min_value = GET_BUCKET_VALUE (h[i]);
107       if (GET_BUCKET_VALUE (h[i]) < min_value_any)
108 	min_value_any = GET_BUCKET_VALUE (h[i]);
109       if (i == 0)
110 	i = header->buckets - 1;
111       else
112 	i--;
113       if (i == j)
114 	break;			// don't hang if we have a 100% full .css file
115       // fprintf (stderr, "-");
116     }
117 
118   if (min_value == OSBF_FEATUREBUCKET_VALUE_MAX)
119     {
120       /* no unlocked bucket avaiable so groom any */
121       groom_any = 1;
122       min_value = min_value_any;
123     }
124 
125   //     now, move our index to the first bucket in this chain.
126   i++;
127   if (i >= header->buckets)
128     i = 0;
129   packstart = i;
130 
131   /* i = j = hindex % header->buckets; */
132   while (BUCKET_IN_CHAIN (h[i]))
133     {
134       i++;
135       if (i == header->buckets)
136 	i = 0;
137       if (i == packstart)
138 	break;		// don't hang if we have a 100% full .cfc file
139     }
140   //     now, our index is right after the last bucket in this chain.
141 
142   /* if there was a wraparound, full .cfc file,    */
143   /* i == packstart and packlen == header->buckets */
144   if (i > packstart)
145   packlen = i - packstart;
146   else
147     packlen = header->buckets + i - packstart;
148 
149 //   This pruning method zeroes buckets with minimum count in the chain.
150 //   It tries first buckets with minimum distance to their right position,
151 //   to increase the chance of zeroing older buckets first. If none with
152 //   distance 0 is found, the distance is increased until at least one
153 //   bucket is zeroed.
154 //
155 //   We keep track of how many buckets we've zeroed and we stop
156 //   zeroing additional buckets after that point.   NO!  BUG!  That
157 //   messes up the tail length, and if we don't repack the tail, then
158 //   features in the tail can become permanently inaccessible!   Therefore,
159 //   we really can't stop in the middle of the tail (well, we could
160 //   stop zeroing, but we need to pass the full length of the tail in.
161 //
162 //   Note that we can't do this "adaptively" in packcss, because zeroes
163 //   there aren't necessarily overflow chain terminators (because -we-
164 //   might have inserted them here.
165 //
166 //   GROT GROT GROT  Note that the following algorithm does multiple
167 //   passes to find the lowest-valued features.  In fact, that's
168 //   actually rather slow; a better algorithm would keep track of
169 //   the N least-valued features in the chain in ONE pass and zero
170 //   those.
171 //
172 //   --
173 //   I'm not sure if it's worth working on a better algoritm for this:
174 //
175 //   This is a statistics report of microgroomings for 4147 messages
176 //   of the SpamAssassin corpus. It shows that 77% is done in a single
177 //   pass, 95.2% in 1 or 2 passes and 99% in at most 3 passes.
178 //
179 //   # microgrommings   passes   %    accum. %
180 //        232584           1    76.6   76.6
181 //         56396           2    18.6   95.2
182 //         11172           3     3.7   98.9
183 //          2502           4     0.8   99.7
184 //           726           5     0.2   99.9
185 //           ...
186 //   -----------
187 //        303773
188 //
189 //   If we consider only the last 100 microgroomings, when the css
190 //   file is full, we'll have the following numbers showing that most
191 //   microgroomings (61%) are still done in a single pass, almost 90%
192 //   is done in 1 or 2 passes and 97% are done in at most 3 passes:
193 //
194 //   # microgrommings   passes   %    accum. %
195 //          61             1    61      61
196 //          27             2    27      88
197 //           9             3     9      97
198 //           3             4     3     100
199 //         ---
200 //         100
201 //
202 //   So, it's not so slow. Anyway, a better algorithm could be
203 //   implemented using 2 additional arrays, with MICROGROOM_STOP_AFTER
204 //   positions each, to store the indexes of the candidate buckets
205 //   found with distance equal to 1 or 2 while we scan for distance 0.
206 //   Those with distance 0 are zeroed immediatelly. If none with
207 //   distance 0 is found, we'll zero the indexes stored in the first
208 //   array. Again, if none is found in the first array, we'll try the
209 //   second one. Finally, if none is found in both arrays, the loop
210 //   will continue until one bucket is zeroed.
211 //
212 //   But now comes the question: do the numbers above justify the
213 //   additional code/work? I'll try to find out the answer
214 //   implementing it :), but this has low priority for now.
215 //
216 //   -- Fidelis Assis
217 //
218 
219   // try features in their right place first
220   max_distance = 1;
221 
222   /* zero up to 50% of packlen */
223   /* max_zeroed_buckets = (long) (0.5 * packlen + 0.5); */
224   max_zeroed_buckets =  microgroom_stop_after;
225   zeroed_countdown = max_zeroed_buckets;
226 
227   /*fprintf(stderr, "packstart: %ld,  packlen: %ld, max_zeroed_buckets: %ld\n",
228       packstart, packlen, max_zeroed_buckets); */
229 
230   // while no bucket is zeroed...
231   while (zeroed_countdown == max_zeroed_buckets)
232     {
233       /*
234          fprintf(stderr, "Start: %ld, stop_after: %ld, max_distance: %ld\n", packstart, microgroom_stop_after, max_distance);
235        */
236       i = packstart;
237       while (BUCKET_IN_CHAIN (h[i]) && zeroed_countdown > 0)
238 	{
239 	  // check if it's a candidate
240 	  if (GET_BUCKET_VALUE (h[i]) == min_value &&
241 	      (!BUCKET_IS_LOCKED (h[i]) || (groom_any != 0)))
242 	    {
243 	      // if it is, check the distance
244 	      distance = i - BUCKET_HASH (h[i]) % header->buckets;
245 	      if (distance < 0)
246 	        distance += header->buckets;
247 	      if (distance < max_distance)
248 	        {
249 		  BUCKET_RAW_VALUE (h[i]) = 0;
250 	          zeroed_countdown--;
251 	        }
252 	    }
253 	  i++;
254 	  if (i >= header->buckets)
255 	    i = 0;
256 	}
257 
258 //  if none was zeroed, increase the allowed distance between the
259 //  candidade's position and its right place.
260 
261       if (zeroed_countdown == max_zeroed_buckets)
262 	max_distance++;
263     }
264 
265   /*
266      fprintf (stderr, "Leaving microgroom: %ld buckets zeroed at distance %ld\n", microgroom_stop_after - zeroed_countdown, max_distance - 1);
267    */
268 
269   //   now we pack the buckets
270   crm_osbf_packcss (header, packstart, packlen);
271 
272 }
273 
274 void
crm_osbf_packcss(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long packstart,unsigned long packlen)275 crm_osbf_packcss (OSBF_FEATURE_HEADER_STRUCT * header,
276 		  unsigned long packstart, unsigned long packlen)
277 {
278 
279 //    How we pack...
280 //
281 //    We look at each bucket, and attempt to reinsert it at the "best"
282 //    place.  We know at worst it will end up where it already is, and
283 //    at best it will end up lower (at a lower index) in the file, except
284 //    if it's in wraparound mode, in which case we know it will not get
285 //    back up past us (since the file must contain at least one empty)
286 //    and so it's still below us in the file.
287 
288 
289   OSBF_FEATUREBUCKET_STRUCT *h;
290 
291   // make h point to the first feature bucket
292   h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
293 
294   if (packstart + packlen <= header->buckets)	//  no wraparound in this case
295     {
296       crm_osbf_packseg (header, packstart, packlen);
297     }
298   else		//  wraparound mode - do it as two separate repacks
299     {
300       crm_osbf_packseg (header, packstart, (header->buckets - packstart));
301       crm_osbf_packseg (header, 0, (packlen - (header->buckets - packstart)));
302     };
303 }
304 
305 void
crm_osbf_packseg(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long packstart,unsigned long packlen)306 crm_osbf_packseg (OSBF_FEATURE_HEADER_STRUCT * header,
307 		  unsigned long packstart, unsigned long packlen)
308 {
309   unsigned long ifrom, ito;
310   unsigned long thash, tkey;
311   OSBF_FEATUREBUCKET_STRUCT *h;
312 
313   // make h point to the first feature bucket
314   h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
315 
316   if (internal_trace)
317     fprintf (stderr, " < %ld %ld >", packstart, packlen);
318 
319   // Our slot values are now somewhat in disorder because empty
320   // buckets may now have been inserted into a chain where there used
321   // to be placeholder buckets.  We need to re-insert slot data in a
322   // bucket where it will be found.
323 
324   for (ifrom = packstart; ifrom < packstart + packlen; ifrom++)
325     {
326       //    Now find the next bucket to place somewhere
327       thash = BUCKET_HASH (h[ifrom]);
328       tkey = BUCKET_KEY (h[ifrom]);
329 
330       if (GET_BUCKET_VALUE (h[ifrom]) == 0)
331 	{
332 	  if (internal_trace)
333 	    fprintf (stderr, "X");
334 	}
335       else
336 	{
337 	  ito = thash % header->buckets;
338 	  // fprintf (stderr, "a %ld", ito);
339 
340 	  while (BUCKET_IN_CHAIN (h[ito]) &&
341 		 !BUCKET_HASH_COMPARE (h[ito], thash, tkey))
342 	    {
343 	      ito++;
344 	      if (ito >= header->buckets)
345 	        ito = 0;
346             }
347 
348 	  //   found an empty slot, put this value there, and zero the
349 	  //   original one.  Sometimes this is a noop.  We don't care.
350 
351 	  if (ito != ifrom)
352 	    {
353 	      BUCKET_HASH (h[ito]) = thash;
354 	      BUCKET_KEY (h[ito]) = tkey;
355 	      // move value and lock together
356 	      BUCKET_RAW_VALUE (h[ito]) = BUCKET_RAW_VALUE (h[ifrom]);
357 
358 	      // clean "from" bucket
359 	      BUCKET_HASH (h[ifrom]) = 0;
360 	      BUCKET_KEY (h[ifrom]) = 0;
361 	      BUCKET_RAW_VALUE (h[ifrom]) = 0;
362 	    }
363 
364 	  if (internal_trace)
365 	    {
366 	      if (ifrom == ito)
367 		fprintf (stderr, "=");
368 	      if (ito < ifrom)
369 		fprintf (stderr, "<");
370 	      if (ito > ifrom)
371 		fprintf (stderr, ">");
372 	    };
373 
374 	};
375     };
376 }
377 
378 /* get next bucket index */
379 unsigned long
crm_osbf_next_bindex(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hindex)380 crm_osbf_next_bindex (OSBF_FEATURE_HEADER_STRUCT * header,
381 		      unsigned long hindex)
382 {
383   hindex++;
384   if (hindex >= header->buckets)
385     hindex = 0;
386   return hindex;
387 }
388 
389 /* get index of the last bucket in a chain */
390 unsigned long
crm_osbf_last_in_chain(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hindex)391 crm_osbf_last_in_chain (OSBF_FEATURE_HEADER_STRUCT * header,
392 			unsigned long hindex)
393 {
394   unsigned long wraparound;
395   OSBF_FEATUREBUCKET_STRUCT *hashes;
396 
397   hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
398 
399   /* if the bucket is not in a chain, return an index */
400   /* out of the buckets space, equal to the number of */
401   /* buckets in the file to indicate an empty chain */
402   if (!BUCKET_IN_CHAIN (hashes[hindex]))
403     return header->buckets;
404 
405   wraparound = hindex;
406   while (BUCKET_IN_CHAIN (hashes[hindex]))
407     {
408       hindex++;
409       if (hindex >= header->buckets)
410 	hindex = 0;
411 
412       /* if .cfc file is full return an index out of */
413       /* the buckets space, equal to number of buckets */
414       /* in the file, plus one */
415       if (hindex == wraparound)
416 	return header->buckets + 1;
417     }
418   hindex = crm_osbf_prev_bindex (header, hindex);
419   return hindex;
420 }
421 
422 
423 /* get previous bucket index */
424 unsigned long
crm_osbf_prev_bindex(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hindex)425 crm_osbf_prev_bindex (OSBF_FEATURE_HEADER_STRUCT * header,
426 		      unsigned long hindex)
427 {
428   if (hindex == 0)
429     hindex = header->buckets - 1;
430   else
431     hindex--;
432   return hindex;
433 }
434 
435 /* get index of the first bucket in a chain */
436 unsigned long
crm_osbf_first_in_chain(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hindex)437 crm_osbf_first_in_chain (OSBF_FEATURE_HEADER_STRUCT * header,
438 			 unsigned long hindex)
439 {
440   unsigned long wraparound;
441   OSBF_FEATUREBUCKET_STRUCT *hashes;
442 
443   hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
444 
445   /* if the bucket is not in a chain, return an index */
446   /* out of the buckets space, equal to the number of */
447   /* buckets in the file to indicate an empty chain */
448   if (!BUCKET_IN_CHAIN (hashes[hindex]))
449     return header->buckets;
450 
451   wraparound = hindex;
452   while (BUCKET_IN_CHAIN (hashes[hindex]))
453     {
454       if (hindex == 0)
455 	hindex = header->buckets - 1;
456       else
457 	hindex--;
458 
459       /* if .cfc file is full return an index out of */
460       /* the buckets space, equal to number of buckets */
461       /* in the file, plus one */
462       if (hindex == wraparound)
463 	return header->buckets + 1;
464     }
465   return crm_osbf_next_bindex (header, hindex);
466 }
467 
468 unsigned long
crm_osbf_find_bucket(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hash,unsigned long key)469 crm_osbf_find_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
470 		      unsigned long hash, unsigned long key)
471 {
472   OSBF_FEATUREBUCKET_STRUCT *hashes;
473   unsigned long hindex, start;
474 
475   hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
476   hindex = start = hash % header->buckets;
477   while (!BUCKET_HASH_COMPARE (hashes[hindex], hash, key)
478 	 && !EMPTY_BUCKET (hashes[hindex]))
479     {
480       hindex = crm_osbf_next_bindex (header, hindex);
481       /* if .cfc file is completely full return an index */
482       /* out of the buckets space, equal to number of buckets */
483       /* in the file, plus one */
484       if (hindex == start)
485 	return header->buckets + 1;
486     }
487 
488   /* return the index of the found bucket or, if not found,
489    * the index of a free bucket where it could be put */
490   return hindex;
491 }
492 
493 void
crm_osbf_update_bucket(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long bindex,int delta)494 crm_osbf_update_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
495 			unsigned long bindex, int delta)
496 {
497   OSBF_FEATUREBUCKET_STRUCT *hashes;
498   hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
499   /*
500      fprintf (stderr, "Bucket updated at %lu, hash: %lu, key: %lu, value: %d\n",
501      bindex, hashes[bindex].hash, hashes[bindex].key, delta);
502    */
503   if (delta > 0 && GET_BUCKET_VALUE (hashes[bindex]) +
504       delta >= OSBF_FEATUREBUCKET_VALUE_MAX - 1)
505     {
506       SETL_BUCKET_VALUE (hashes[bindex], OSBF_FEATUREBUCKET_VALUE_MAX - 1);
507     }
508   else if (delta < 0 && GET_BUCKET_VALUE (hashes[bindex]) <= -delta)
509     {
510       BUCKET_RAW_VALUE (hashes[bindex]) = 0;
511       BUCKET_HASH (hashes[bindex]) = 0;
512       BUCKET_KEY (hashes[bindex]) = 0;
513       /* pack chain */
514       {
515 	long i, j, packlen;
516 	i = crm_osbf_next_bindex (header, bindex);
517 	j = crm_osbf_last_in_chain (header, i);
518 
519 	/* if there's a valid chain tail starting at i, pack it */
520 	if (j < header->buckets)
521 	  {
522 	    if (j >= i)
523 	      packlen = j - i + 1;
524 	    else
525 	      packlen = header->buckets + 1 - (i - j);
526 	    crm_osbf_packcss (header, i, packlen);
527 	  }
528       }
529     }
530   else
531     {
532       SETL_BUCKET_VALUE (hashes[bindex],
533 			 GET_BUCKET_VALUE (hashes[bindex]) + delta);
534     }
535 }
536 
537 void
crm_osbf_insert_bucket(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long bindex,unsigned long hash,unsigned long key,int value)538 crm_osbf_insert_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
539 			unsigned long bindex, unsigned long hash,
540 			unsigned long key, int value)
541 {
542   unsigned long hindex, distance;
543   OSBF_FEATUREBUCKET_STRUCT *hashes;
544 
545   if (microgroom_chain_length == 0)
546     microgroom_chain_length = OSBF_MICROGROOM_CHAIN_LENGTH;
547 
548 
549   hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
550   /* "right" bucket position */
551   hindex = hash % header->buckets;
552   /* distance from right position to free position */
553   distance = (bindex >= hindex) ? bindex - hindex :
554     header->buckets - (hindex - bindex);
555   if ((osbf_microgroom != 0) && (value > 0))
556     while (distance > microgroom_chain_length)
557       {
558 	/*
559 	   fprintf (stderr, "hindex: %lu, bindex: %lu, distance: %lu\n",
560 	   hindex, bindex, distance);
561 	 */
562 	crm_osbf_microgroom (header, crm_osbf_prev_bindex (header, bindex));
563 	/* get new free bucket index */
564 	bindex = crm_osbf_find_bucket (header, hash, key);
565 	distance = (bindex >= hindex) ? bindex - hindex :
566 	  header->buckets - (hindex - bindex);
567       }
568 
569   /*
570      fprintf (stderr, "new bucket at %lu, hash: %lu, key: %lu, distance: %lu\n",
571      bindex, hash, key, distance);
572    */
573 
574   SETL_BUCKET_VALUE (hashes[bindex], value);
575   BUCKET_HASH (hashes[bindex]) = hash;
576   BUCKET_KEY (hashes[bindex]) = key;
577 }
578 
579 static OSBF_HEADER_UNION hu;
580 int
crm_osbf_create_cssfile(char * cssfile,unsigned long buckets,unsigned long major,unsigned long minor,unsigned long spectrum_start)581 crm_osbf_create_cssfile (char *cssfile, unsigned long buckets,
582 			 unsigned long major, unsigned long minor,
583 			 unsigned long spectrum_start)
584 {
585   FILE *f;
586   long i;
587   OSBF_FEATUREBUCKET_STRUCT feature = {
588     0, 0, 0
589   };
590   if (user_trace)
591     fprintf (stderr, "Opening file %s for read/write\n", cssfile);
592   f = fopen (cssfile, "wb");
593   if (!f)
594     {
595       fatalerror ("Couldn't open the new .cfc file for writing; file = ",
596                   cssfile);
597     };
598   // Set the header.
599   *((unsigned long *) hu.header.version) = major;	// quick hack for now...
600   hu.header.flags = minor;
601   hu.header.learnings = 0;
602   hu.header.buckets = buckets;
603   hu.header.buckets_start = OSBF_CSS_SPECTRA_START;
604   // Write header
605   if (fwrite (&hu, sizeof (hu), 1, f) != 1)
606     {
607       fatalerror (" Couldn't initialize the .cfc file header; file = ",
608                   cssfile);
609     }
610 
611   //  Initialize CSS hashes - zero all buckets
612   for (i = 0; i < buckets; i++)
613     {
614       // Write buckets
615       if (fwrite (&feature, sizeof (feature), 1, f) != 1)
616         {
617           fatalerror (" Couldn't initialize the .cfc buckets; file = ",
618                       cssfile);
619         }
620     }
621   fclose (f);
622   return (EXIT_SUCCESS);
623 }
624