1 // crm_osbf_maintenance.c - OSBF microgrooming utilities
2
3 // Copyright 2004 Fidelis Assis
4 // Copyright 2004-2009 William S. Yerazunis.
5 // This file is under GPLv3, as described in COPYING.
6
7 // OBS: CSS header structure and pruning method modified for OSBF classifier.
8 // See functions crm_osbf_microgroom and crm_osbf_create_cssfile, below,
9 // for details. -- Fidelis Assis - 2004/10/20
10
11 // include some standard files
12 #include "crm114_sysincludes.h"
13
14 // include any local crm114 configuration file
15 #include "crm114_config.h"
16
17 // include the crm114 data structures file
18 #include "crm114_structs.h"
19
20 // and include the routine declarations file
21 #include "crm114.h"
22
23 #include "crm114_osbf.h"
24
25 long microgroom_chain_length;
26 long microgroom_stop_after;
27
28 /* Version names */
29 char *CSS_version_name[] = {
30 "SBPH-Markovian",
31 "OSB-Bayes",
32 "Correlate",
33 "Neural",
34 "OSB-Winnow",
35 "OSBF-Bayes",
36 "Unknown"
37 };
38
39 // microgroom flag for osbf
40 static int osbf_microgroom;
41 // turn microgroom on (1) or off (0)
crm_osbf_set_microgroom(int value)42 void crm_osbf_set_microgroom(int value)
43 {
44 osbf_microgroom = value;
45 }
46
47
48 //
49 // How to microgroom a .css file that's getting full
50 //
51 // NOTA BENE NOTA BENE NOTA BENE NOTA BENE
52 //
53 // This whole section of code is under intense develoment; right now
54 // it "works" but not any better than nothing at all. Be warned
55 // that any patches issued on it may well never see the light of
56 // day, as intense testing and comparison may show that the current
57 // algorithms are, well, suckful.
58 //
59 //
60 // There are two steps to microgrooming - first, since we know we're
61 // already too full, we execute a 'zero unity bins'.
62 //
63 void
crm_osbf_microgroom(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hindex)64 crm_osbf_microgroom (OSBF_FEATURE_HEADER_STRUCT * header,
65 unsigned long hindex)
66 {
67 long i, j, k;
68 static long microgroom_count = 0;
69 long packstart;
70 long packlen;
71 long zeroed_countdown, max_zeroed_buckets;
72 long min_value, min_value_any, distance, max_distance;
73 int groom_any = 0;
74 OSBF_FEATUREBUCKET_STRUCT *h;
75
76 // if not set by command line, use default
77 if (microgroom_chain_length == 0)
78 microgroom_chain_length = OSBF_MICROGROOM_CHAIN_LENGTH;
79 if (microgroom_stop_after == 0)
80 microgroom_stop_after = OSBF_MICROGROOM_STOP_AFTER;
81
82
83 // make h point to the first feature bucket
84 h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
85
86 zeroed_countdown = microgroom_stop_after;
87 i = j = k = 0;
88 microgroom_count++;
89
90 if (user_trace)
91 {
92 if (microgroom_count == 1)
93 fprintf (stderr, "CSS file too full: microgrooming this css chain: ");
94 fprintf (stderr, " %ld ", microgroom_count);
95 };
96
97 // micropack - start at initial chain start, move to back of
98 // chain that overflowed, then scale just that chain.
99
100 i = j = hindex % header->buckets;
101 min_value = OSBF_FEATUREBUCKET_VALUE_MAX;
102 min_value_any = GET_BUCKET_VALUE (h[i]);
103 while (BUCKET_IN_CHAIN (h[i]))
104 {
105 if (GET_BUCKET_VALUE (h[i]) < min_value && !BUCKET_IS_LOCKED (h[i]))
106 min_value = GET_BUCKET_VALUE (h[i]);
107 if (GET_BUCKET_VALUE (h[i]) < min_value_any)
108 min_value_any = GET_BUCKET_VALUE (h[i]);
109 if (i == 0)
110 i = header->buckets - 1;
111 else
112 i--;
113 if (i == j)
114 break; // don't hang if we have a 100% full .css file
115 // fprintf (stderr, "-");
116 }
117
118 if (min_value == OSBF_FEATUREBUCKET_VALUE_MAX)
119 {
120 /* no unlocked bucket avaiable so groom any */
121 groom_any = 1;
122 min_value = min_value_any;
123 }
124
125 // now, move our index to the first bucket in this chain.
126 i++;
127 if (i >= header->buckets)
128 i = 0;
129 packstart = i;
130
131 /* i = j = hindex % header->buckets; */
132 while (BUCKET_IN_CHAIN (h[i]))
133 {
134 i++;
135 if (i == header->buckets)
136 i = 0;
137 if (i == packstart)
138 break; // don't hang if we have a 100% full .cfc file
139 }
140 // now, our index is right after the last bucket in this chain.
141
142 /* if there was a wraparound, full .cfc file, */
143 /* i == packstart and packlen == header->buckets */
144 if (i > packstart)
145 packlen = i - packstart;
146 else
147 packlen = header->buckets + i - packstart;
148
149 // This pruning method zeroes buckets with minimum count in the chain.
150 // It tries first buckets with minimum distance to their right position,
151 // to increase the chance of zeroing older buckets first. If none with
152 // distance 0 is found, the distance is increased until at least one
153 // bucket is zeroed.
154 //
155 // We keep track of how many buckets we've zeroed and we stop
156 // zeroing additional buckets after that point. NO! BUG! That
157 // messes up the tail length, and if we don't repack the tail, then
158 // features in the tail can become permanently inaccessible! Therefore,
159 // we really can't stop in the middle of the tail (well, we could
160 // stop zeroing, but we need to pass the full length of the tail in.
161 //
162 // Note that we can't do this "adaptively" in packcss, because zeroes
163 // there aren't necessarily overflow chain terminators (because -we-
164 // might have inserted them here.
165 //
166 // GROT GROT GROT Note that the following algorithm does multiple
167 // passes to find the lowest-valued features. In fact, that's
168 // actually rather slow; a better algorithm would keep track of
169 // the N least-valued features in the chain in ONE pass and zero
170 // those.
171 //
172 // --
173 // I'm not sure if it's worth working on a better algoritm for this:
174 //
175 // This is a statistics report of microgroomings for 4147 messages
176 // of the SpamAssassin corpus. It shows that 77% is done in a single
177 // pass, 95.2% in 1 or 2 passes and 99% in at most 3 passes.
178 //
179 // # microgrommings passes % accum. %
180 // 232584 1 76.6 76.6
181 // 56396 2 18.6 95.2
182 // 11172 3 3.7 98.9
183 // 2502 4 0.8 99.7
184 // 726 5 0.2 99.9
185 // ...
186 // -----------
187 // 303773
188 //
189 // If we consider only the last 100 microgroomings, when the css
190 // file is full, we'll have the following numbers showing that most
191 // microgroomings (61%) are still done in a single pass, almost 90%
192 // is done in 1 or 2 passes and 97% are done in at most 3 passes:
193 //
194 // # microgrommings passes % accum. %
195 // 61 1 61 61
196 // 27 2 27 88
197 // 9 3 9 97
198 // 3 4 3 100
199 // ---
200 // 100
201 //
202 // So, it's not so slow. Anyway, a better algorithm could be
203 // implemented using 2 additional arrays, with MICROGROOM_STOP_AFTER
204 // positions each, to store the indexes of the candidate buckets
205 // found with distance equal to 1 or 2 while we scan for distance 0.
206 // Those with distance 0 are zeroed immediatelly. If none with
207 // distance 0 is found, we'll zero the indexes stored in the first
208 // array. Again, if none is found in the first array, we'll try the
209 // second one. Finally, if none is found in both arrays, the loop
210 // will continue until one bucket is zeroed.
211 //
212 // But now comes the question: do the numbers above justify the
213 // additional code/work? I'll try to find out the answer
214 // implementing it :), but this has low priority for now.
215 //
216 // -- Fidelis Assis
217 //
218
219 // try features in their right place first
220 max_distance = 1;
221
222 /* zero up to 50% of packlen */
223 /* max_zeroed_buckets = (long) (0.5 * packlen + 0.5); */
224 max_zeroed_buckets = microgroom_stop_after;
225 zeroed_countdown = max_zeroed_buckets;
226
227 /*fprintf(stderr, "packstart: %ld, packlen: %ld, max_zeroed_buckets: %ld\n",
228 packstart, packlen, max_zeroed_buckets); */
229
230 // while no bucket is zeroed...
231 while (zeroed_countdown == max_zeroed_buckets)
232 {
233 /*
234 fprintf(stderr, "Start: %ld, stop_after: %ld, max_distance: %ld\n", packstart, microgroom_stop_after, max_distance);
235 */
236 i = packstart;
237 while (BUCKET_IN_CHAIN (h[i]) && zeroed_countdown > 0)
238 {
239 // check if it's a candidate
240 if (GET_BUCKET_VALUE (h[i]) == min_value &&
241 (!BUCKET_IS_LOCKED (h[i]) || (groom_any != 0)))
242 {
243 // if it is, check the distance
244 distance = i - BUCKET_HASH (h[i]) % header->buckets;
245 if (distance < 0)
246 distance += header->buckets;
247 if (distance < max_distance)
248 {
249 BUCKET_RAW_VALUE (h[i]) = 0;
250 zeroed_countdown--;
251 }
252 }
253 i++;
254 if (i >= header->buckets)
255 i = 0;
256 }
257
258 // if none was zeroed, increase the allowed distance between the
259 // candidade's position and its right place.
260
261 if (zeroed_countdown == max_zeroed_buckets)
262 max_distance++;
263 }
264
265 /*
266 fprintf (stderr, "Leaving microgroom: %ld buckets zeroed at distance %ld\n", microgroom_stop_after - zeroed_countdown, max_distance - 1);
267 */
268
269 // now we pack the buckets
270 crm_osbf_packcss (header, packstart, packlen);
271
272 }
273
274 void
crm_osbf_packcss(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long packstart,unsigned long packlen)275 crm_osbf_packcss (OSBF_FEATURE_HEADER_STRUCT * header,
276 unsigned long packstart, unsigned long packlen)
277 {
278
279 // How we pack...
280 //
281 // We look at each bucket, and attempt to reinsert it at the "best"
282 // place. We know at worst it will end up where it already is, and
283 // at best it will end up lower (at a lower index) in the file, except
284 // if it's in wraparound mode, in which case we know it will not get
285 // back up past us (since the file must contain at least one empty)
286 // and so it's still below us in the file.
287
288
289 OSBF_FEATUREBUCKET_STRUCT *h;
290
291 // make h point to the first feature bucket
292 h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
293
294 if (packstart + packlen <= header->buckets) // no wraparound in this case
295 {
296 crm_osbf_packseg (header, packstart, packlen);
297 }
298 else // wraparound mode - do it as two separate repacks
299 {
300 crm_osbf_packseg (header, packstart, (header->buckets - packstart));
301 crm_osbf_packseg (header, 0, (packlen - (header->buckets - packstart)));
302 };
303 }
304
305 void
crm_osbf_packseg(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long packstart,unsigned long packlen)306 crm_osbf_packseg (OSBF_FEATURE_HEADER_STRUCT * header,
307 unsigned long packstart, unsigned long packlen)
308 {
309 unsigned long ifrom, ito;
310 unsigned long thash, tkey;
311 OSBF_FEATUREBUCKET_STRUCT *h;
312
313 // make h point to the first feature bucket
314 h = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
315
316 if (internal_trace)
317 fprintf (stderr, " < %ld %ld >", packstart, packlen);
318
319 // Our slot values are now somewhat in disorder because empty
320 // buckets may now have been inserted into a chain where there used
321 // to be placeholder buckets. We need to re-insert slot data in a
322 // bucket where it will be found.
323
324 for (ifrom = packstart; ifrom < packstart + packlen; ifrom++)
325 {
326 // Now find the next bucket to place somewhere
327 thash = BUCKET_HASH (h[ifrom]);
328 tkey = BUCKET_KEY (h[ifrom]);
329
330 if (GET_BUCKET_VALUE (h[ifrom]) == 0)
331 {
332 if (internal_trace)
333 fprintf (stderr, "X");
334 }
335 else
336 {
337 ito = thash % header->buckets;
338 // fprintf (stderr, "a %ld", ito);
339
340 while (BUCKET_IN_CHAIN (h[ito]) &&
341 !BUCKET_HASH_COMPARE (h[ito], thash, tkey))
342 {
343 ito++;
344 if (ito >= header->buckets)
345 ito = 0;
346 }
347
348 // found an empty slot, put this value there, and zero the
349 // original one. Sometimes this is a noop. We don't care.
350
351 if (ito != ifrom)
352 {
353 BUCKET_HASH (h[ito]) = thash;
354 BUCKET_KEY (h[ito]) = tkey;
355 // move value and lock together
356 BUCKET_RAW_VALUE (h[ito]) = BUCKET_RAW_VALUE (h[ifrom]);
357
358 // clean "from" bucket
359 BUCKET_HASH (h[ifrom]) = 0;
360 BUCKET_KEY (h[ifrom]) = 0;
361 BUCKET_RAW_VALUE (h[ifrom]) = 0;
362 }
363
364 if (internal_trace)
365 {
366 if (ifrom == ito)
367 fprintf (stderr, "=");
368 if (ito < ifrom)
369 fprintf (stderr, "<");
370 if (ito > ifrom)
371 fprintf (stderr, ">");
372 };
373
374 };
375 };
376 }
377
378 /* get next bucket index */
379 unsigned long
crm_osbf_next_bindex(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hindex)380 crm_osbf_next_bindex (OSBF_FEATURE_HEADER_STRUCT * header,
381 unsigned long hindex)
382 {
383 hindex++;
384 if (hindex >= header->buckets)
385 hindex = 0;
386 return hindex;
387 }
388
389 /* get index of the last bucket in a chain */
390 unsigned long
crm_osbf_last_in_chain(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hindex)391 crm_osbf_last_in_chain (OSBF_FEATURE_HEADER_STRUCT * header,
392 unsigned long hindex)
393 {
394 unsigned long wraparound;
395 OSBF_FEATUREBUCKET_STRUCT *hashes;
396
397 hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
398
399 /* if the bucket is not in a chain, return an index */
400 /* out of the buckets space, equal to the number of */
401 /* buckets in the file to indicate an empty chain */
402 if (!BUCKET_IN_CHAIN (hashes[hindex]))
403 return header->buckets;
404
405 wraparound = hindex;
406 while (BUCKET_IN_CHAIN (hashes[hindex]))
407 {
408 hindex++;
409 if (hindex >= header->buckets)
410 hindex = 0;
411
412 /* if .cfc file is full return an index out of */
413 /* the buckets space, equal to number of buckets */
414 /* in the file, plus one */
415 if (hindex == wraparound)
416 return header->buckets + 1;
417 }
418 hindex = crm_osbf_prev_bindex (header, hindex);
419 return hindex;
420 }
421
422
423 /* get previous bucket index */
424 unsigned long
crm_osbf_prev_bindex(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hindex)425 crm_osbf_prev_bindex (OSBF_FEATURE_HEADER_STRUCT * header,
426 unsigned long hindex)
427 {
428 if (hindex == 0)
429 hindex = header->buckets - 1;
430 else
431 hindex--;
432 return hindex;
433 }
434
435 /* get index of the first bucket in a chain */
436 unsigned long
crm_osbf_first_in_chain(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hindex)437 crm_osbf_first_in_chain (OSBF_FEATURE_HEADER_STRUCT * header,
438 unsigned long hindex)
439 {
440 unsigned long wraparound;
441 OSBF_FEATUREBUCKET_STRUCT *hashes;
442
443 hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
444
445 /* if the bucket is not in a chain, return an index */
446 /* out of the buckets space, equal to the number of */
447 /* buckets in the file to indicate an empty chain */
448 if (!BUCKET_IN_CHAIN (hashes[hindex]))
449 return header->buckets;
450
451 wraparound = hindex;
452 while (BUCKET_IN_CHAIN (hashes[hindex]))
453 {
454 if (hindex == 0)
455 hindex = header->buckets - 1;
456 else
457 hindex--;
458
459 /* if .cfc file is full return an index out of */
460 /* the buckets space, equal to number of buckets */
461 /* in the file, plus one */
462 if (hindex == wraparound)
463 return header->buckets + 1;
464 }
465 return crm_osbf_next_bindex (header, hindex);
466 }
467
468 unsigned long
crm_osbf_find_bucket(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long hash,unsigned long key)469 crm_osbf_find_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
470 unsigned long hash, unsigned long key)
471 {
472 OSBF_FEATUREBUCKET_STRUCT *hashes;
473 unsigned long hindex, start;
474
475 hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
476 hindex = start = hash % header->buckets;
477 while (!BUCKET_HASH_COMPARE (hashes[hindex], hash, key)
478 && !EMPTY_BUCKET (hashes[hindex]))
479 {
480 hindex = crm_osbf_next_bindex (header, hindex);
481 /* if .cfc file is completely full return an index */
482 /* out of the buckets space, equal to number of buckets */
483 /* in the file, plus one */
484 if (hindex == start)
485 return header->buckets + 1;
486 }
487
488 /* return the index of the found bucket or, if not found,
489 * the index of a free bucket where it could be put */
490 return hindex;
491 }
492
493 void
crm_osbf_update_bucket(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long bindex,int delta)494 crm_osbf_update_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
495 unsigned long bindex, int delta)
496 {
497 OSBF_FEATUREBUCKET_STRUCT *hashes;
498 hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
499 /*
500 fprintf (stderr, "Bucket updated at %lu, hash: %lu, key: %lu, value: %d\n",
501 bindex, hashes[bindex].hash, hashes[bindex].key, delta);
502 */
503 if (delta > 0 && GET_BUCKET_VALUE (hashes[bindex]) +
504 delta >= OSBF_FEATUREBUCKET_VALUE_MAX - 1)
505 {
506 SETL_BUCKET_VALUE (hashes[bindex], OSBF_FEATUREBUCKET_VALUE_MAX - 1);
507 }
508 else if (delta < 0 && GET_BUCKET_VALUE (hashes[bindex]) <= -delta)
509 {
510 BUCKET_RAW_VALUE (hashes[bindex]) = 0;
511 BUCKET_HASH (hashes[bindex]) = 0;
512 BUCKET_KEY (hashes[bindex]) = 0;
513 /* pack chain */
514 {
515 long i, j, packlen;
516 i = crm_osbf_next_bindex (header, bindex);
517 j = crm_osbf_last_in_chain (header, i);
518
519 /* if there's a valid chain tail starting at i, pack it */
520 if (j < header->buckets)
521 {
522 if (j >= i)
523 packlen = j - i + 1;
524 else
525 packlen = header->buckets + 1 - (i - j);
526 crm_osbf_packcss (header, i, packlen);
527 }
528 }
529 }
530 else
531 {
532 SETL_BUCKET_VALUE (hashes[bindex],
533 GET_BUCKET_VALUE (hashes[bindex]) + delta);
534 }
535 }
536
537 void
crm_osbf_insert_bucket(OSBF_FEATURE_HEADER_STRUCT * header,unsigned long bindex,unsigned long hash,unsigned long key,int value)538 crm_osbf_insert_bucket (OSBF_FEATURE_HEADER_STRUCT * header,
539 unsigned long bindex, unsigned long hash,
540 unsigned long key, int value)
541 {
542 unsigned long hindex, distance;
543 OSBF_FEATUREBUCKET_STRUCT *hashes;
544
545 if (microgroom_chain_length == 0)
546 microgroom_chain_length = OSBF_MICROGROOM_CHAIN_LENGTH;
547
548
549 hashes = (OSBF_FEATUREBUCKET_STRUCT *) header + header->buckets_start;
550 /* "right" bucket position */
551 hindex = hash % header->buckets;
552 /* distance from right position to free position */
553 distance = (bindex >= hindex) ? bindex - hindex :
554 header->buckets - (hindex - bindex);
555 if ((osbf_microgroom != 0) && (value > 0))
556 while (distance > microgroom_chain_length)
557 {
558 /*
559 fprintf (stderr, "hindex: %lu, bindex: %lu, distance: %lu\n",
560 hindex, bindex, distance);
561 */
562 crm_osbf_microgroom (header, crm_osbf_prev_bindex (header, bindex));
563 /* get new free bucket index */
564 bindex = crm_osbf_find_bucket (header, hash, key);
565 distance = (bindex >= hindex) ? bindex - hindex :
566 header->buckets - (hindex - bindex);
567 }
568
569 /*
570 fprintf (stderr, "new bucket at %lu, hash: %lu, key: %lu, distance: %lu\n",
571 bindex, hash, key, distance);
572 */
573
574 SETL_BUCKET_VALUE (hashes[bindex], value);
575 BUCKET_HASH (hashes[bindex]) = hash;
576 BUCKET_KEY (hashes[bindex]) = key;
577 }
578
579 static OSBF_HEADER_UNION hu;
580 int
crm_osbf_create_cssfile(char * cssfile,unsigned long buckets,unsigned long major,unsigned long minor,unsigned long spectrum_start)581 crm_osbf_create_cssfile (char *cssfile, unsigned long buckets,
582 unsigned long major, unsigned long minor,
583 unsigned long spectrum_start)
584 {
585 FILE *f;
586 long i;
587 OSBF_FEATUREBUCKET_STRUCT feature = {
588 0, 0, 0
589 };
590 if (user_trace)
591 fprintf (stderr, "Opening file %s for read/write\n", cssfile);
592 f = fopen (cssfile, "wb");
593 if (!f)
594 {
595 fatalerror ("Couldn't open the new .cfc file for writing; file = ",
596 cssfile);
597 };
598 // Set the header.
599 *((unsigned long *) hu.header.version) = major; // quick hack for now...
600 hu.header.flags = minor;
601 hu.header.learnings = 0;
602 hu.header.buckets = buckets;
603 hu.header.buckets_start = OSBF_CSS_SPECTRA_START;
604 // Write header
605 if (fwrite (&hu, sizeof (hu), 1, f) != 1)
606 {
607 fatalerror (" Couldn't initialize the .cfc file header; file = ",
608 cssfile);
609 }
610
611 // Initialize CSS hashes - zero all buckets
612 for (i = 0; i < buckets; i++)
613 {
614 // Write buckets
615 if (fwrite (&feature, sizeof (feature), 1, f) != 1)
616 {
617 fatalerror (" Couldn't initialize the .cfc buckets; file = ",
618 cssfile);
619 }
620 }
621 fclose (f);
622 return (EXIT_SUCCESS);
623 }
624