1 /* $Id: tokenizer.c,v 1.301 2011/06/28 00:13:48 sbajic Exp $ */
2 
3 /*
4  DSPAM
5  COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6 
7  This program is free software: you can redistribute it and/or modify
8  it under the terms of the GNU Affero General Public License as
9  published by the Free Software Foundation, either version 3 of the
10  License, or (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  GNU Affero General Public License for more details.
16 
17  You should have received a copy of the GNU Affero General Public License
18  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 
20 */
21 
22 /*
23  * tokenizer.c - tokenizer functions
24  *
25  * DESCRIPTION
26  *   The tokenizer subroutines are responsible for decomposing a message into
27  *   its colloquial components. All components are stored collectively in
28  *   a diction object, passed into the function.
29  *
30  */
31 
32 #ifdef HAVE_CONFIG_H
33 #include <auto-config.h>
34 #endif
35 
36 #include <stdio.h>
37 #include <stdlib.h>
38 #include <math.h>
39 #include <ctype.h>
40 #include <errno.h>
41 #include <string.h>
42 #ifdef HAVE_UNISTD_H
43 #include <unistd.h>
44 #endif
45 #include <sys/types.h>
46 #include <sys/stat.h>
47 
48 #ifdef TIME_WITH_SYS_TIME
49 #   include <sys/time.h>
50 #   include <time.h>
51 #else
52 #   ifdef HAVE_SYS_TIME_H
53 #       include <sys/time.h>
54 #   else
55 #       include <time.h>
56 #   endif
57 #endif
58 
59 #include "config.h"
60 #include "tokenizer.h"
61 #include "util.h"
62 #include "libdspam.h"
63 #include "language.h"
64 
65 /*
66  * _ds_tokenize() - tokenize the message
67  *
68  * DESCRIPTION
69  *    tokenizes the supplied message
70  *
71  * INPUT ARGUMENTS
72  *     DSPAM_CTX *CTX    pointer to context
73  *     char *header      pointer to message header
74  *     char *body        pointer to message body
75  *     ds_diction_t      diction to store components
76  *
77  * RETURN VALUES
78  *   standard errors on failure
79  *   zero if successful
80  *
81  */
82 
83 int
_ds_tokenize(DSPAM_CTX * CTX,char * headers,char * body,ds_diction_t diction)84 _ds_tokenize (DSPAM_CTX * CTX, char *headers, char *body, ds_diction_t diction)
85 {
86   if (diction == NULL)
87     return EINVAL;
88 
89   if (CTX->tokenizer == DSZ_SBPH || CTX->tokenizer == DSZ_OSB)
90     return _ds_tokenize_sparse(CTX, headers, body, diction);
91   else
92     return _ds_tokenize_ngram(CTX, headers, body, diction);
93 }
94 
/*
 * _ds_tokenize_ngram() - word and chained-word tokenizer
 *
 * DESCRIPTION
 *   Splits the headers and the body into tokens and records them in the
 *   diction through _ds_process_header_token() and
 *   _ds_process_body_token(). When CTX->tokenizer is DSZ_CHAIN the previous
 *   token is handed to those helpers as well so they can record token pairs.
 *
 *   Both buffers are split in place with strtok_r(), so their contents are
 *   modified. A NULL headers or body pointer is treated as empty.
 *
 * INPUT ARGUMENTS
 *     DSPAM_CTX *CTX         pointer to context
 *     char *headers          message headers, one line per '\n'
 *     char *body             message body
 *     ds_diction_t diction   diction to store components
 *
 * RETURN VALUES
 *   EUNKNOWN if the header line list cannot be allocated
 *   zero otherwise
 */
int _ds_tokenize_ngram(
  DSPAM_CTX *CTX,
  char *headers,
  char *body,
  ds_diction_t diction)
{
  char *token;                          /* current token */
  char *previous_token = NULL;          /* used for bigrams (chained tokens) */
  char *line = NULL;                    /* header broken up into lines */
  char *ptrptr = NULL;                  /* strtok_r() save pointer */
  char heading[128];                    /* current heading */
  size_t l;
  int tokenizer = CTX->tokenizer;

  struct nt *header = NULL;
  struct nt_node *node_nt;
  struct nt_c c_nt;

  /* Tokenize URLs in message */

  if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on"))  {
    _ds_url_tokenize(diction, body, "http://");
    _ds_url_tokenize(diction, body, "www.");
    _ds_url_tokenize(diction, body, "href=");
  }

  /*
   * Header Tokenization
   */

  header = nt_create (NT_CHAR);
  if (header == NULL)
  {
    LOG (LOG_CRIT, ERR_MEM_ALLOC);
    return EUNKNOWN;
  }

  /* Guard the first call: strtok_r(NULL, ...) must never be the opening
     call because the save pointer has no valid state yet. */
  line = (headers != NULL) ? strtok_r (headers, "\n", &ptrptr) : NULL;
  while (line) {
    nt_add (header, line);
    line = strtok_r (NULL, "\n", &ptrptr);
  }

  node_nt = c_nt_first (header, &c_nt);
  heading[0] = 0;
  while (node_nt) {
    int multiline;
    size_t line_len;

#ifdef VERBOSE
    LOGDEBUG("processing line: %s", node_nt->ptr);
#endif

    line = node_nt->ptr;

    /* Remember the full length now: the strtok_r() call below overwrites
       the first ':' with a terminator. */
    line_len = strlen (line);

    /* A line that starts with a name containing no blanks opens a new
       header; anything else is handled as a continuation of the current
       heading. */
    token = strtok_r (line, ":", &ptrptr);
    if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " "))
    {
      multiline = 0;
      strlcpy (heading, token, sizeof (heading));
      previous_token = NULL;
    } else {
      multiline = 1;
    }

#ifdef VERBOSE
    LOGDEBUG ("Reading '%s' header from: '%s'", heading, line);
#endif

    /* Use the entire From: line for auto-whitelisting */

    if ((CTX->flags & DSF_WHITELIST) && !strcmp(heading, "From")) {
      char wl[256];
      unsigned long long whitelist_token;

      /* Skip the 5 bytes of "From:". Lines shorter than that (a bare
         "From", or a short continuation line) are clamped to their own
         terminator instead of being read past their end.
         NOTE(review): continuation lines under a From heading also reach
         this point and replace the whitelist token - confirm intended. */
      char *fromline = line + ((line_len > 5) ? 5 : line_len);

      if (fromline[0] == 32)
        fromline++;
      snprintf(wl, sizeof(wl), "%s*%s", heading, fromline);
      whitelist_token = _ds_getcrc64(wl);
      ds_diction_touch(diction, whitelist_token, wl, 0);
      diction->whitelist_token = whitelist_token;
    }

    /* Header text is split with DELIMITERS_HEADING rather than the body
       delimiter set (per the original note: to preserve things like ip
       addresses). A new header continues after its ':'; a continuation
       line is rescanned from its start. */

    token = strtok_r ((multiline) ? line : NULL, DELIMITERS_HEADING, &ptrptr);

    while (token)
    {
      l = strlen(token);

      if (l >= 1 && l < 50)
      {
#ifdef VERBOSE
        LOGDEBUG ("Processing '%s' token in '%s' header", token, heading);
#endif

        /* Process "current" token; it becomes the chain predecessor only
           when it was processed successfully */
        if (!_ds_process_header_token
            (CTX, token, previous_token, diction, heading) &&
            (tokenizer == DSZ_CHAIN))
        {
          previous_token = token;
        }
      }

      token = strtok_r (NULL, DELIMITERS_HEADING, &ptrptr);
    }

    /* Chains never span two header lines */
    previous_token = NULL;
    node_nt = c_nt_next (header, &c_nt);
  }

  nt_destroy (header);

  /*
   * Body Tokenization
   */

#ifdef VERBOSE
  LOGDEBUG("parsing message body");
#endif

  token = (body != NULL) ? strtok_r (body, DELIMITERS, &ptrptr) : NULL;
  while (token != NULL)
  {
    l = strlen (token);
    if (l >= 1 && l < 50)
    {
#ifdef VERBOSE
        LOGDEBUG ("Processing body token '%s'", token);
#endif

      /* Process "current" token */
      if ( !_ds_process_body_token(CTX, token, previous_token, diction)
        && tokenizer == DSZ_CHAIN)
      {
        previous_token = token;
      }
    }
    token = strtok_r (NULL, DELIMITERS, &ptrptr);
  }

#ifdef VERBOSE
  LOGDEBUG("Finished tokenizing (ngram) message");
#endif

  return 0;
}
247 
/*
 * _ds_tokenize_sparse() - sparse (SBPH / OSB) tokenizer
 *
 * DESCRIPTION
 *   Splits the headers and the body into tokens and feeds them, one at a
 *   time, through a sliding window of SPARSE_WINDOW_SIZE tokens using
 *   _ds_map_header_token() and _ds_map_body_token(). After each header line
 *   and after the body, SPARSE_WINDOW_SIZE NULL tokens are pushed to flush
 *   the window.
 *
 *   Both buffers are split in place with strtok_r(), so their contents are
 *   modified. A NULL headers or body pointer is treated as empty.
 *
 * INPUT ARGUMENTS
 *     DSPAM_CTX *CTX         pointer to context
 *     char *headers          message headers, one line per '\n'
 *     char *body             message body
 *     ds_diction_t diction   diction to store components
 *
 * RETURN VALUES
 *   EUNKNOWN if the bit pattern or the header line list cannot be allocated
 *   zero otherwise
 */
int _ds_tokenize_sparse(
  DSPAM_CTX *CTX,
  char *headers,
  char *body,
  ds_diction_t diction)
{
  int i;
  char *token;                                  /* current token */
  char *previous_tokens[SPARSE_WINDOW_SIZE];    /* sparse chain */

  char *line = NULL;                    /* header broken up into lines */
  char *ptrptr = NULL;                  /* strtok_r() save pointer */
  char *bitpattern;

  char heading[128];                    /* current heading */
  size_t l;

  struct nt *header = NULL;
  struct nt_node *node_nt;
  struct nt_c c_nt;

  _ds_sparse_clear(previous_tokens);

  /* The mapping helpers index this table directly, so an allocation
     failure has to be caught here. */
  bitpattern = _ds_generate_bitpattern(_ds_pow2(SPARSE_WINDOW_SIZE));
  if (bitpattern == NULL)
  {
    LOG (LOG_CRIT, ERR_MEM_ALLOC);
    return EUNKNOWN;
  }

  /* Tokenize URLs in message */

  if (_ds_match_attribute(CTX->config->attributes, "ProcessorURLContext", "on"))
  {
    _ds_url_tokenize(diction, body, "http://");
    _ds_url_tokenize(diction, body, "www.");
    _ds_url_tokenize(diction, body, "href=");
  }

  /*
   * Header Tokenization
   */

  header = nt_create (NT_CHAR);
  if (header == NULL)
  {
    LOG (LOG_CRIT, ERR_MEM_ALLOC);
    free(bitpattern);
    return EUNKNOWN;
  }

  /* Guard the first call: strtok_r(NULL, ...) must never be the opening
     call because the save pointer has no valid state yet. */
  line = (headers != NULL) ? strtok_r (headers, "\n", &ptrptr) : NULL;
  while (line) {
    nt_add (header, line);
    line = strtok_r (NULL, "\n", &ptrptr);
  }

  node_nt = c_nt_first (header, &c_nt);
  heading[0] = 0;
  while (node_nt) {
    int multiline;
    size_t line_len;

#ifdef VERBOSE
    LOGDEBUG("processing line: %s", node_nt->ptr);
#endif

    /* Every header line starts with an empty window */
    _ds_sparse_clear(previous_tokens);

    line = node_nt->ptr;

    /* Remember the full length now: the strtok_r() call below overwrites
       the first ':' with a terminator. */
    line_len = strlen (line);

    /* A line that starts with a name containing no blanks opens a new
       header; anything else is handled as a continuation of the current
       heading. */
    token = strtok_r (line, ":", &ptrptr);
    if (token && token[0] != 32 && token[0] != 9 && !strstr (token, " "))
    {
      multiline = 0;
      strlcpy (heading, token, sizeof (heading));
    } else {
      multiline = 1;
    }

#ifdef VERBOSE
    LOGDEBUG ("Reading '%s' header from: '%s'", heading, line);
#endif

    /* Use the entire From: line for auto-whitelisting */

    if ((CTX->flags & DSF_WHITELIST) && !strcmp(heading, "From")) {
      char wl[256];
      unsigned long long whitelist_token;

      /* Skip the 5 bytes of "From:". Lines shorter than that (a bare
         "From", or a short continuation line) are clamped to their own
         terminator instead of being read past their end.
         NOTE(review): continuation lines under a From heading also reach
         this point and replace the whitelist token - confirm intended. */
      char *fromline = line + ((line_len > 5) ? 5 : line_len);

      if (fromline[0] == 32)
        fromline++;
      snprintf(wl, sizeof(wl), "%s*%s", heading, fromline);
      whitelist_token = _ds_getcrc64(wl);
      ds_diction_touch(diction, whitelist_token, wl, 0);
      diction->whitelist_token = whitelist_token;
    }

    /* Header text is split with SPARSE_DELIMITERS_HEADING rather than the
       body delimiter set (per the original note: to preserve things like
       ip addresses). A new header continues after its ':'; a continuation
       line is rescanned from its start. */

    token = strtok_r ((multiline) ? line : NULL, SPARSE_DELIMITERS_HEADING, &ptrptr);

    while (token)
    {
      l = strlen(token);

      if (l > 0 && l < 50)
      {
#ifdef VERBOSE
        LOGDEBUG ("Processing '%s' token in '%s' header", token, heading);
#endif
        _ds_map_header_token (CTX, token, previous_tokens, diction, heading, bitpattern);
      }

      token = strtok_r (NULL, SPARSE_DELIMITERS_HEADING, &ptrptr);
    }

    /* Push NULL tokens so the tokens still in the window are shifted out */
    for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
      _ds_map_header_token(CTX, NULL, previous_tokens, diction, heading, bitpattern);
    }

    _ds_sparse_clear(previous_tokens);
    node_nt = c_nt_next (header, &c_nt);
  }
  nt_destroy (header);

  /*
   * Body Tokenization
   */

#ifdef VERBOSE
  LOGDEBUG("parsing message body");
#endif

  token = (body != NULL) ? strtok_r (body, SPARSE_DELIMITERS, &ptrptr) : NULL;
  while (token != NULL)
  {
    l = strlen (token);
    if (l > 0 && l < 50)
    {
#ifdef VERBOSE
        LOGDEBUG ("Processing body token '%s'", token);
#endif

      /* Process "current" token */
      _ds_map_body_token (CTX, token, previous_tokens, diction, bitpattern);
    }
    token = strtok_r (NULL, SPARSE_DELIMITERS, &ptrptr);
  }

  /* Push NULL tokens so the tokens still in the window are shifted out */
  for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
    _ds_map_body_token(CTX, NULL, previous_tokens, diction, bitpattern);
  }

  _ds_sparse_clear(previous_tokens);

  free(bitpattern);

#ifdef VERBOSE
  LOGDEBUG("Finished tokenizing (sparse) message");
#endif

  return 0;
}
411 
412 /*
413  * _ds_{process,map}_{header,body}_token()
414  *
415  * DESCRIPTION
416  *  Token processing and mapping functions
417  *    _ds_process_header_token
418  *    _ds_process_body_token
419  *    _ds_map_header_token
420  *    _ds_map_body_token
421  *
 *  These functions are responsible for converting the input words into
 *  full-blown tokens (with CRCs and probabilities) and for producing
 *  variants based on the tokenizer approach applied.
425  */
426 
427 int
_ds_process_header_token(DSPAM_CTX * CTX,char * token,const char * previous_token,ds_diction_t diction,const char * heading)428 _ds_process_header_token (DSPAM_CTX * CTX, char *token,
429                           const char *previous_token, ds_diction_t diction,
430                           const char *heading)
431 {
432   char combined_token[256];
433   unsigned long long crc;
434   char *tweaked_token;
435 
436   if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading))
437     return 0;
438 
439   if (!strncmp(heading, "X-DSPAM-", 8))
440     return 0;
441 
442   /* This is where we used to ignore certain headings */
443 
444   if (heading[0] != 0)
445     snprintf (combined_token, sizeof (combined_token),
446               "%s*%s", heading, token);
447   else
448     strlcpy (combined_token, token, sizeof (combined_token));
449 
450   tweaked_token = _ds_truncate_token(token);
451   if (tweaked_token == NULL)
452     return EUNKNOWN;
453 
454   snprintf(combined_token, sizeof(combined_token), "%s*%s", heading, tweaked_token);
455 
456   crc = _ds_getcrc64 (combined_token);
457 #ifdef VERBOSE
458   LOGDEBUG ("Token Hit: '%s'", combined_token);
459 #endif
460   ds_diction_touch(diction, crc, combined_token, 0);
461 
462   if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL)
463   {
464     char *tweaked_previous;
465 
466     tweaked_previous = _ds_truncate_token(previous_token);
467     if (tweaked_previous == NULL) {
468       free(tweaked_token);
469       return EUNKNOWN;
470     }
471 
472     snprintf (combined_token, sizeof (combined_token),
473               "%s*%s+%s", heading, tweaked_previous, tweaked_token);
474     crc = _ds_getcrc64 (combined_token);
475 
476     ds_diction_touch(diction, crc, combined_token, DSD_CHAINED);
477     free(tweaked_previous);
478   }
479 
480   free(tweaked_token);
481   return 0;
482 }
483 
484 int
_ds_process_body_token(DSPAM_CTX * CTX,char * token,const char * previous_token,ds_diction_t diction)485 _ds_process_body_token (DSPAM_CTX * CTX, char *token,
486                         const char *previous_token, ds_diction_t diction)
487 {
488   char combined_token[256];
489   unsigned long long crc;
490   char *tweaked_token;
491 
492   tweaked_token = _ds_truncate_token(token);
493   if (tweaked_token == NULL)
494     return EUNKNOWN;
495 
496   crc = _ds_getcrc64 (tweaked_token);
497 
498   ds_diction_touch(diction, crc, tweaked_token, DSD_CONTEXT);
499 
500   if (CTX->tokenizer == DSZ_CHAIN && previous_token != NULL)
501   {
502     char *tweaked_previous = _ds_truncate_token(previous_token);
503     if (tweaked_previous == NULL) {
504       free(tweaked_token);
505       return EUNKNOWN;
506     }
507 
508     snprintf (combined_token, sizeof (combined_token), "%s+%s",
509               tweaked_previous, tweaked_token);
510     crc = _ds_getcrc64 (combined_token);
511 
512     ds_diction_touch(diction, crc, combined_token, DSD_CHAINED | DSD_CONTEXT);
513     free(tweaked_previous);
514   }
515   free(tweaked_token);
516 
517   return 0;
518 }
519 
520 
521 int
_ds_map_header_token(DSPAM_CTX * CTX,char * token,char ** previous_tokens,ds_diction_t diction,const char * heading,const char * bitpattern)522 _ds_map_header_token (DSPAM_CTX * CTX, char *token,
523                       char **previous_tokens, ds_diction_t diction,
524                       const char *heading, const char *bitpattern)
525 {
526   int i, t, keylen, breadth;
527   u_int32_t mask;
528   unsigned long long crc;
529   char key[256];
530   int active = 0, top, tokenizer = CTX->tokenizer;
531 
532   if (_ds_match_attribute(CTX->config->attributes, "IgnoreHeader", heading))
533     return 0;
534 
535   if (!strncmp(heading, "X-DSPAM-", 8))
536     return 0;
537 
538   /* Shift all previous tokens up */
539   for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) {
540     previous_tokens[i] = previous_tokens[i+1];
541     if (previous_tokens[i])
542       active++;
543   }
544 
545   previous_tokens[SPARSE_WINDOW_SIZE-1] = token;
546 
547   if (token)
548     active++;
549 
550   breadth = _ds_pow2(active);
551 
552   /* Iterate and generate all keys necessary */
553   for (mask=0; mask < (u_int32_t)breadth; mask++) {
554     int terms = 0;
555 
556     key[0] = 0;
557     keylen = 0;
558     t = 0;
559     top = 1;
560 
561     /* Each Bit */
562     for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
563 
564       if (t) {
565         if ((size_t)keylen < (sizeof(key)-1)) {
566           key[keylen] = '+';
567           key[++keylen] = 0;
568         }
569       }
570 
571       if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) {
572         if (previous_tokens[i] == NULL || previous_tokens[i][0] == 0) {
573           if ((size_t)keylen < (sizeof(key)-1)) {
574             key[keylen] = '#';
575             key[++keylen] = 0;
576           }
577         }
578         else
579         {
580           int tl = strlen(previous_tokens[i]);
581           if ((size_t)(keylen + tl) < (sizeof(key)-1)) {
582             strcpy(key+keylen, previous_tokens[i]);
583             keylen += tl;
584           }
585           terms++;
586         }
587       } else {
588         if ((size_t)keylen < (sizeof(key)-1)) {
589           key[keylen] = '#';
590           key[++keylen] = 0;
591         }
592       }
593       t++;
594     }
595 
596     /* If the bucket has at least 1 literal, hit it */
597     if ((tokenizer == DSZ_SBPH && terms != 0) ||
598         (tokenizer == DSZ_OSB  && terms == 2))
599     {
600       char hkey[256];
601       char *k = key;
602       while(keylen>2 && !strcmp((key+keylen)-2, "+#")) {
603         key[keylen-2] = 0;
604         keylen -=2;
605       }
606       while(!strncmp(k, "#+", 2)) {
607         top = 0;
608         k+=2;
609         keylen -= 2;
610       }
611 
612       if (top) {
613         snprintf(hkey, sizeof(hkey), "%s*%s", heading, k);
614         crc = _ds_getcrc64(hkey);
615         ds_diction_touch(diction, crc, hkey, DSD_CONTEXT);
616       }
617     }
618   }
619 
620   return 0;
621 }
622 
623 int
_ds_map_body_token(DSPAM_CTX * CTX,char * token,char ** previous_tokens,ds_diction_t diction,const char * bitpattern)624 _ds_map_body_token (
625   DSPAM_CTX * CTX,
626   char *token,
627   char **previous_tokens,
628   ds_diction_t diction,
629   const char *bitpattern)
630 {
631   int i, t, keylen, breadth;
632   int top, tokenizer = CTX->tokenizer;
633   unsigned long long crc;
634   char key[256];
635   int active = 0;
636   u_int32_t mask;
637 
638   /* Shift all previous tokens up */
639   for(i=0;i<SPARSE_WINDOW_SIZE-1;i++) {
640     previous_tokens[i] = previous_tokens[i+1];
641     if (previous_tokens[i])
642       active++;
643   }
644 
645   previous_tokens[SPARSE_WINDOW_SIZE-1] = token;
646   if (token)
647     active++;
648 
649   breadth = _ds_pow2(active);
650 
651   /* Iterate and generate all keys necessary */
652 
653   for(mask=0;mask < (u_int32_t)breadth;mask++) {
654     int terms = 0;
655     t = 0;
656 
657     key[0] = 0;
658     keylen = 0;
659     top = 1;
660 
661     /* Each Bit */
662     for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
663       if (t) {
664         if ((size_t)keylen < (sizeof(key)-1)) {
665            key[keylen] = '+';
666            key[++keylen] = 0;
667         }
668       }
669       if (bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] == 1) {
670         if (previous_tokens[i] == NULL || previous_tokens[i][0] == 0) {
671           if ((size_t)keylen < (sizeof(key)-1)) {
672             key[keylen] = '#';
673             key[++keylen] = 0;
674           }
675         }
676         else
677         {
678           int tl = strlen(previous_tokens[i]);
679           if ((size_t)(keylen + tl) < (sizeof(key)-1)) {
680             strcpy(key+keylen, previous_tokens[i]);
681             keylen += tl;
682           }
683           terms++;
684         }
685       } else {
686         if ((size_t)keylen < (sizeof(key)-1)) {
687           key[keylen] = '#';
688           key[++keylen] = 0;
689         }
690       }
691       t++;
692     }
693 
694     /* If the bucket has at least 1 literal, hit it */
695     if ((tokenizer == DSZ_SBPH && terms != 0) ||
696         (tokenizer == DSZ_OSB  && terms == 2))
697     {
698       char *k = key;
699       while(keylen>2 && !strcmp((key+keylen)-2, "+#")) {
700         key[keylen-2] = 0;
701         keylen -=2;
702       }
703       while(!strncmp(k, "#+", 2)) {
704         top = 0;
705         k+=2;
706         keylen -=2;
707       }
708 
709       if (top) {
710         crc = _ds_getcrc64(k);
711         ds_diction_touch(diction, crc, k, DSD_CONTEXT);
712       }
713     }
714   }
715 
716   return 0;
717 }
718 
719 /*
720  *  _ds_degenerate_message()
721  *
722  * DESCRIPTION
723  *   Degenerate the message into headers, body and tokenizable pieces
724  *
725  *   This function is responsible for analyzing the actualized message and
726  *   degenerating it into only the components which are tokenizable.  This
727  *   process  effectively eliminates much HTML noise, special symbols,  or
728  *   other  non-tokenizable/non-desirable components. What is left  is the
729  *   bulk of  the message  and only  desired tags,  URLs, and other  data.
730  *
731  * INPUT ARGUMENTS
732  *      header    pointer to buffer containing headers
733  *      body      pointer to buffer containing message body
734  */
735 
_ds_degenerate_message(DSPAM_CTX * CTX,buffer * header,buffer * body)736 int _ds_degenerate_message(DSPAM_CTX *CTX, buffer * header, buffer * body)
737 {
738   char *decode = NULL;
739   struct nt_node *node_nt, *node_header;
740   struct nt_c c_nt, c_nt2;
741   int i = 0;
742   char heading[1024];
743 
744   if (! CTX->message)
745   {
746     LOG (LOG_WARNING, "_ds_degenerate_message() failed: CTX->message is NULL");
747     return EUNKNOWN;
748   }
749 
750   /* Iterate through each component and create large header/body buffers */
751 
752   node_nt = c_nt_first (CTX->message->components, &c_nt);
753   while (node_nt != NULL)
754   {
755     struct _ds_message_part *block = (struct _ds_message_part *) node_nt->ptr;
756 
757 #ifdef VERBOSE
758     LOGDEBUG ("Processing component %d", i);
759 #endif
760 
761     if (! block->headers || ! block->headers->items)
762     {
763 #ifdef VERBOSE
764       LOGDEBUG ("  : End of Message Identifier");
765 #endif
766     }
767 
768     else
769     {
770       struct _ds_header_field *current_header;
771 
772       /* Accumulate the headers */
773       node_header = c_nt_first (block->headers, &c_nt2);
774       while (node_header != NULL)
775       {
776         current_header = (struct _ds_header_field *) node_header->ptr;
777         snprintf (heading, sizeof (heading),
778                   "%s: %s\n", current_header->heading,
779                   current_header->data);
780         buffer_cat (header, heading);
781         node_header = c_nt_next (block->headers, &c_nt2);
782       }
783 
784       decode = block->body->data;
785 
786       if (block->media_type == MT_TEXT    ||
787                block->media_type == MT_MESSAGE ||
788                block->media_type == MT_UNKNOWN ||
789                (block->media_type == MT_MULTIPART && !i))
790       {
791         /* Accumulate the bodies, skip attachments */
792 
793         if (
794              (   block->encoding == EN_BASE64
795               || block->encoding == EN_QUOTED_PRINTABLE)
796             && ! block->original_signed_body)
797         {
798           if (block->content_disposition != PCD_ATTACHMENT)
799           {
800             LOGDEBUG ("decoding message block from encoding type %d",
801                       block->encoding);
802             decode = _ds_decode_block (block);
803           }
804         }
805 
806         /* We found a tokenizable body component, add prefilters */
807 
808         if (decode)
809         {
810           char *decode2 = NULL;
811           char *decode3 = NULL;
812 
813           /* -- PREFILTERS BEGIN -- */
814 
815           /* Hexadecimal 8-Bit Encodings */
816 
817           if (block->encoding == EN_8BIT) {
818             decode2 = _ds_decode_hex8bit(decode);
819           } else {
820             decode2 = strdup(decode);
821           }
822 
823           /* HTML-Specific Filters */
824 
825           if (decode2) {
826             if (block->media_subtype == MST_HTML) {
827               decode3 = _ds_strip_html(decode2);
828             } else {
829               decode3 = strdup(decode2);
830             }
831             free(decode2);
832           }
833 
834           /* -- PREFILTERS END -- */
835 
836           if (decode3) {
837             buffer_cat (body, decode3);
838             free(decode3);
839           }
840 
841           /* If we've decoded the body, save the original copy */
842           if (decode != block->body->data)
843           {
844             block->original_signed_body = block->body;
845             block->body = buffer_create (decode);
846             free (decode);
847           }
848         }
849       }
850     }
851 #ifdef VERBOSE
852     LOGDEBUG ("Getting next message component");
853 #endif
854     node_nt = c_nt_next (CTX->message->components, &c_nt);
855     i++;
856   } /* while (node_nt != NULL) */
857 
858   if (header->data == NULL)
859     buffer_cat (header, " ");
860 
861   if (body->data == NULL)
862     buffer_cat (body, " ");
863 
864   return 0;
865 }
866 
/*
 * _ds_url_tokenize() - extract URL tokens from the message body
 *
 * Searches body (case-insensitively) for every occurrence of key. Each
 * match is extended up to the next character that is not greater than 32,
 * a '>', or a quote character located more than strlen(key) bytes into the
 * match. That span is split with DELIMITERS and each piece is stored in
 * the diction as "Url*<piece>". The span is then overwritten with spaces
 * in body (presumably so that the body tokenizer does not see the URL
 * again).
 *
 * INPUT ARGUMENTS
 *     ds_diction_t diction   diction to store components
 *     char *body             message body; modified in place
 *     const char *key        URL marker to look for, e.g. "http://"
 *
 * RETURN VALUES
 *   EINVAL if body is NULL or key is NULL or empty
 *   zero otherwise
 */
int _ds_url_tokenize(ds_diction_t diction, char *body, const char *key)
{
  char *token, *url_ptr, *url_token, *ptr;
  char combined_token[256];
  unsigned long long crc;
  int key_len;

  /* An empty key matches everywhere without consuming input, so the scan
     below could never terminate. */
  if (!body || !key || key[0] == 0)
    return EINVAL;

  key_len = strlen(key);

#ifdef VERBOSE
  LOGDEBUG("scanning for urls: %s\n", key);
#endif
  url_ptr = body;

  token = strcasestr(url_ptr, key);
  while (token != NULL)
  {
    int i = 0, old;

    /* Find the end of the URL. Quotes are tolerated up to and including
       the position right after the key (e.g. href="...). */
    while(token[i]
       && token[i] > 32
       && token[i] != '>'
       && ((token[i] != '\"' && token[i] != '\'') || i <= key_len))
      i++;

    if (i == 0)
    {
      /* Nothing was consumed (the key itself starts with a terminating
         character); step over one byte so the search still advances. */
      url_ptr = token + 1;
      token = strcasestr(url_ptr, key);
      continue;
    }

    old = token[i];
    token[i] = 0; /* parse in place */

    /* Tokenize URL */
    url_token = strtok_r (token, DELIMITERS, &ptr);
    while (url_token != NULL)
    {
      snprintf (combined_token, sizeof (combined_token), "Url*%s", url_token);
      crc = _ds_getcrc64 (combined_token);
      ds_diction_touch(diction, crc, combined_token, 0);
      url_token = strtok_r (NULL, DELIMITERS, &ptr);
    }

    /* Blank out the URL and restore the character that followed it */
    memset (token, 32, i);
    token[i] = old;
    url_ptr = token + i;
    token = strcasestr(url_ptr, key);
  }
  return 0;
}
910 
911 /* Truncate tokens with EOT delimiters */
_ds_truncate_token(const char * token)912 char * _ds_truncate_token(const char *token) {
913   char *tweaked;
914   int i;
915 
916   if (token == NULL)
917     return NULL;
918 
919   tweaked = strdup(token);
920 
921   if (tweaked == NULL)
922     return NULL;
923 
924   i = strlen(tweaked);
925   while(i>1 && strspn(tweaked+i-2, DELIMITERS_EOT)) {
926     tweaked[i-1] = 0;
927     i--;
928   }
929 
930   return tweaked;
931 }
932 
933 /*
934  *  _ds_spbh_clear
935  *
936  * DESCRIPTION
937  *   Clears the SBPH stack
938  *
939  *   Clears and frees all of the tokens in the SBPH stack. Used when a
940  *   boundary has been crossed (such as a new message header) where
941  *   tokens from the previous boundary are no longer useful.
942  */
943 
_ds_sparse_clear(char ** previous_tokens)944 void _ds_sparse_clear(char **previous_tokens) {
945   int i;
946   for(i=0;i<SPARSE_WINDOW_SIZE;i++)
947     previous_tokens[i] = NULL;
948   return;
949 }
950 
951 /*
952  * _ds_generate_bitpattern
953  *
954  * DESCRIPTION
955  *   Generates a sparse bitpattern for SPARSE_WINDOW_SIZE
956  *
957  *   This pattern is then used to create token patterns when using SBPH or OSB
958  *
959  */
960 
_ds_generate_bitpattern(int breadth)961 char *_ds_generate_bitpattern(int breadth) {
962   char *bitpattern;
963   u_int32_t mask;
964   unsigned long exp;
965   int i;
966 
967   bitpattern = malloc(SPARSE_WINDOW_SIZE * breadth);
968 
969   for(mask=0;mask<(u_int32_t)breadth;mask++) {
970       for(i=0;i<SPARSE_WINDOW_SIZE;i++) {
971           exp = (i) ? _ds_pow2(i) : 1;
972           /* Reverse pos = SPARSE_WINDOW_SIZE - (i+1); */
973           if (mask & exp)
974           {
975               bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 1;
976           }
977           else
978           {
979               bitpattern[(mask*SPARSE_WINDOW_SIZE) + i] = 0;
980           }
981       }
982   }
983 
984   return bitpattern;
985 }
986 
987