1 %{
2 /*
3 * NAME
4 * lexer_v3.l -- bogofilter's lexical analyzer for message headers
5 *
6 * 01/01/2003 - split out of lexer.l
7 *
8 */
9
10 /*
11 * Our lexical analysis is different from Paul Graham's rules:
12 *
13 * We throw away headers that are readily identifiable as dates.
14 * We throw away all digit strings that don't look like IP address parts.
15 * We throw away lines beginning with <tab>id<space> -- mailer IDs.
16 *
17 * These are optimizations to keep the token lists from bloating.
18 * The big win is recognizing machine-generated unique IDs that
19 * we'll never see again and shouldn't store.
20 *
21 * We don't treat dot between two alphanumerics as a separator,
22 * because we want to keep domain names and IP addresses together as
23 * recognizable units.
24 *
25 * Having done the above, there isn't much need to recognize URLs.
26 * If a URL is a spam indicator, very likely any other URL from the
27 * same site is as well, so the hostname part should be an adequate
28 * statistical trigger.
29 *
30 * LEXED_TOKENS, which are found in "msg-count" files need a special pattern
31 * because they can be:
32 * 1 - normal bogofilter tokens
33 * 2 - url:xxx and subj: tokens
34 * 3 - mime boundaries
35 */
36
37 /* 12 May 2003
38 * Added Paul Graham's latest ideas on parsing.
39 * (From http://www.paulgraham.com/better.html)
40 *
41 * 1. Case is preserved.
42 *
43 * 2. Exclamation points are constituent characters.
44 *
45 * 3. Periods and commas are constituents if they occur between two
46 * digits. This lets me get ip addresses and prices intact.
47 *
48 * 4. A price range like $20-25 yields two tokens, $20 and $25.
49 *
50 * 5. Tokens that occur within the To, From, Subject, and Return-Path
51 * lines, or within urls, get marked accordingly.
52 * For example, "foo" in the Subject line becomes "subj:foo".
53 */
54
55 /* DR 08/29/2003:
56 **
57 ** With flex-2.5.31 and '%option never-interactive noreject', file
58 ** msg.dr.0118.base64 (in tests/bogofilter/inputs/split.d) parses
59 ** incorrectly because line 24 isn't base64 decoded.
60 */
61
62 #define YY_NO_INPUT
63
64 #include "common.h"
65
66 #include <ctype.h>
67 #include <stdlib.h>
68
69 #include "buff.h"
70 #include "charset.h"
71 #include "lexer.h"
72 #include "mime.h" /* for mime_*() */
73 #include "msgcounts.h"
74 #include "textblock.h"
75 #include "token.h"
76 #include "xmalloc.h"
77
78 #define YY_DECL token_t yylex(void)
79 YY_DECL; /* declare function */
80
81 #define YY_INPUT(buf,result,max_size) result = yyinput((byte *)buf, result, max_size)
82 #define YY_EXIT_FAILURE EX_ERROR
83
84 #undef stderr
85 #define stderr dbgout /* for debug & -D options */
86
87 static int lineno;
88
89 #define FLEX_VER(MAJ, MIN, SUB) ( MAJ * 1000 + MIN * 100 + SUB)
90
91 #ifndef YY_FLEX_SUBMINOR_VERSION
92 #define FLEX_VERSION_BF FLEX_VER(YY_FLEX_MAJOR_VERSION, YY_FLEX_MINOR_VERSION, 0)
93 #else
94 #define FLEX_VERSION_BF FLEX_VER(YY_FLEX_MAJOR_VERSION, YY_FLEX_MINOR_VERSION, YY_FLEX_SUBMINOR_VERSION)
95 #endif
96
97 #if FLEX_VERSION_BF < 2531
98 int yylineno;
99 #endif
100
101 /* Function Prototypes/Forward Declarations */
102
103 static word_t *yy_text(void);
104 static void html_char(void);
105 static void html_reorder(void);
106
107 static void url_char(void);
108
109 static void skip_to(char chr);
110 static void yy_unput(const byte *txt, uint len);
111
112 char yy_get_state(void);
113 void yy_set_state_initial(void);
114
115 static void header(void);
116
117 /* Function Definitions */
118
yy_text(void)119 static word_t *yy_text(void)
120 {
121 static word_t yyt;
122 yyt.u.text = (byte *)yytext;
123 yyt.leng = yyleng;
124 return &yyt;
125 }
126
127 %}
128
129 %option warn
130 %option nodebug debug
131 %option align caseless 8bit
132 /***********************************************************************
133 WARNING: The scanner must be interactive so as not to look ahead past
134 \n = EOH and other header/body delimiters, else it will fail when MIME
135 decoding, because part of the body has already been read ahead.
136 ***********************************************************************/
137 %option interactive always-interactive
138 %option noreject noyywrap
139
/*
 * Named patterns used by the rules below.
 *
 * UINT8         - decimal value 0-255 written with one to three digits
 * IPADDR        - four UINT8 fields joined by dots
 * BCHARSNOSPC   - characters accepted in a MIME boundary, space excluded
 * BCHARS        - the same set with space added
 * MIME_BOUNDARY - any run of BCHARS that ends in a BCHARSNOSPC, so a
 *                 boundary may contain spaces but cannot end with one
 * ID            - alphanumerics, dash and dot, optionally in angle brackets
 * CHARSET       - alphanumerics and dash
 * VERPID        - alphanumerics, hash and dash with at least one digit inside
 * MTYPE         - optional blanks followed by alphanumerics, slash and dash
 * NUM           - a run of digits
 * NUM_NUM       - space, NUM, space, NUM
 * MSG_COUNT     - at line start: a double quote, any one character, the
 *                 text MSG_COUNT and a closing double quote
 * FRONT_CHAR    - anything that is not blank, control, digit or punctuation
 * MID_CHAR      - anything except blank, control and the listed punctuation
 * BOGOLEX_CHAR  - as MID_CHAR, but colon, dollar and asterisk are allowed
 * BACK_CHAR     - as MID_CHAR, additionally excluding dot, underscore,
 *                 tilde, both quote marks and dash
 * TOKEN         - one FRONT_CHAR, optionally followed by MID_CHARs and a
 *                 final BACK_CHAR
 */
140 UINT8 ([01]?[0-9]?[0-9]|2([0-4][0-9]|5[0-5]))
141 IPADDR {UINT8}\.{UINT8}\.{UINT8}\.{UINT8}
142 BCHARSNOSPC [[:alnum:]()+_,-./:=?#\']
143 BCHARS [[:alnum:]()+_,-./:=?#\' ]
144 MIME_BOUNDARY {BCHARS}*{BCHARSNOSPC}
145
146 ID <?[[:alnum:]\-\.]+>?
147 CHARSET [[:alnum:]-]+
148 VERPID [[:alnum:]#-]+[[:digit:]]+[[:alnum:]#-]+
149 MTYPE [[:blank:]]*[[:alnum:]/-]*
150
151 NUM [[:digit:]]+
152 NUM_NUM \ {NUM}\ {NUM}
153 MSG_COUNT ^\".MSG_COUNT\"
154
155 FRONT_CHAR [^[:blank:][:cntrl:][:digit:][:punct:]]
156 MID_CHAR [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]]
157 BOGOLEX_CHAR [^[:blank:][:cntrl:] <>;=()&%#@+|/\\{}^\"?,\[\]]
158 BACK_CHAR [^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]._~\'\`\-]
159
160 TOKEN {FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?
161
162 /* RFC2047.2
163 encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
164 charset = token ; see section 3
165 encoding = token ; see section 4
166 token = 1*<Any CHAR except SPACE, CTLs, and especials>
167 especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "
168 <"> / "/" / "[" / "]" / "?" / "." / "="
169 encoded-text = 1*<Any printable ASCII character other than "?"
170 or SPACE>
171 ; (but see "Use of encoded-words in message
172 ; headers", section 5)
173 */
174
175 /* 09/01/03
176 Using "[^?]" in the pattern and validating the charset in 'C'
177 reduces executable size by approx 290k.
178 new: ENCODED_WORD =\?{CHARSET}\?[bq]\?[^?]*\?=
179 old: ENCODED_WORD =\?{CHARSET}\?(b\?{BASE64}\|q\?{QP})\?=
180
181 BASE64 [0-9a-zA-Z/+=]+
182 QP [!->@-~]+
183 */
184
/*
 * WHITESPACE    - a blank or a newline; NOTWHITESPACE is its complement
 * HTML_ENCODING - numeric character reference: ampersand, hash, optional x,
 *                 hex digits, semicolon
 * URL_ENCODING  - percent sign followed by exactly two hex digits
 * ENCODED_WORD  - RFC 2047 encoded-word with encoding b or q; the encoded
 *                 text may not contain a question mark or a newline
 * ENCODED_TOKEN - optional plain prefix (FRONT_CHAR then MID_CHARs) followed
 *                 by one or more ENCODED_WORDs separated by WHITESPACE
 */
185 WHITESPACE [[:blank:]\n]
186 NOTWHITESPACE [^[:blank:]\n]
187
188 HTML_ENCODING "&#"x?[[:xdigit:]]+";"
189 URL_ENCODING "%"[[:xdigit:]][[:xdigit:]]
190
191 ENCODED_WORD =\?{CHARSET}\?[bq]\?[^?\n]*\?=
192 ENCODED_TOKEN ({FRONT_CHAR}{MID_CHAR}*)?({ENCODED_WORD}{WHITESPACE}+)*{ENCODED_WORD}
193
194 /*
195 HTML_WI_COMMENTS "<"[^>]*">"
196 HTML_WO_COMMENTS "<"[^!][^>]*">"\|"<>"
197 */
198
199 HTMLTOKEN "<"[^>]*">"
200
201 /*
202 * Generally, there are some html tags that cause an "eyebreak" and some
203 * that do not. For example, the "P" tag or the "BR" tag cause a break,
204 * and can be interpreted in place, while, the B (bold) tag does not.
205 * No close tags seem to cause a break.
206 * Comments do not. This is an attempt to make an exhaustive list of
207 * tags that cause an "eyebreak". When the exit tag also causes a break,
208 * we include the /?. I believe this to be a complete list of tags that
209 * can cause a formatting break.
210 */
211
212 HBREAK p|br|li|h[1-6]|hr|title|table|center|dd|dt|iframe|img|input|select|td|textarea|th|\/?(div|blockquote|pre|dir|dl|fieldset|legend|form|menu|ol|ul)
213
214 BREAKHTML "<"({HBREAK}({WHITESPACE}[^>]*|""))">"
215
216 VERP {TOKEN}-{VERPID}-{TOKEN}={TOKEN}@{TOKEN}
217
218 %s TEXT HTML BOGO_LEX
219 %s HTOKEN HDISCARD SCOMMENT LCOMMENT
220 %s PGP_HEAD PGP_BODY
221
222 %%
223
224 <INITIAL,BOGO_LEX>{MSG_COUNT}{NUM_NUM} { if (lineno == 0) {
 /* Only while no newline has been counted yet: enter BOGO_LEX and
    hand the text after the first space to set_msg_counts_from_str().
    MSG_COUNT_LINE is returned whether or not this branch ran. */
225 BEGIN BOGO_LEX;
226 set_msg_counts_from_str(strchr(yytext, ' ') + 1);
227 }
228 return MSG_COUNT_LINE;
229 }
 /* a whole line made of a quoted BOGOLEX_CHAR run followed by two numbers */
230 <BOGO_LEX>^\"{BOGOLEX_CHAR}+\"{NUM_NUM}$ { return BOGO_LEX_LINE; }
231 <BOGO_LEX>\n { lineno += 1; }
232
233 <INITIAL>{ENCODED_TOKEN} { word_t *raw = yy_text();
 /* run the match through text_decode() and push the result back
    onto the input so that it is scanned again; no token is returned */
234 word_t *txt = text_decode(raw);
235 yy_unput(txt->u.text, txt->leng);
236 }
237
 /* these field names become the current tag via set_tag() */
238 <INITIAL>^(To|CC|From|Return-Path|Subject|Received): { set_tag(yytext); }
 /* skip_to(':') trims the match back to the text before the colon, so the
    returned TOKEN is the field name and the rest of the line is rescanned */
239 <INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE} { mime_content(yy_text()); skip_to(':'); header(); return TOKEN; }
240
241 <INITIAL>^Message-ID:.* { /* save token for logging */
 /* 11 is the length of the Message-ID: field name; whitespace after
    it is skipped before the remainder is passed to set_msg_id() */
242 unsigned long off = 11;
243 while(isspace((unsigned char)yytext[off]) && off < INT_MAX && off < (unsigned long)yyleng)
244 off++;
245 set_msg_id((unsigned char *)(yytext+off), yyleng-off);
246 header();
247 return HEADKEY;
248 }
 /* the next three patterns share one action: the whole line is
    returned as a single HEADKEY */
249 <INITIAL>^(Delivery-)?Date:.* |
250 <INITIAL>^Resent-Message-ID:.* |
251 <INITIAL>^(In-Reply-To|References):.* { header(); return HEADKEY; }
252
 /* boundary= and charset= parameters; the quotes are optional */
253 <INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"? { mime_boundary_set(yy_text()); }
254 <INITIAL>charset=\"?{CHARSET}\"? { got_charset(yytext); skip_to('='); header(); return TOKEN; }
255
256 <INITIAL>(file)?name=\"? /* ignore */
257 <INITIAL>[[:blank:]]id{WHITESPACE}+{ID} { return QUEUE_ID; }
258
259 /**********************************************************************
260 WARNING: Do NOT add header (<INITIAL>) rules that require characters
261 beyond a LF character (\n) - doing so will make the parser read ahead
262 parts of the body in the wrong MIME decoding mode and goof up
263 seriously.
264 **********************************************************************/
265
266 <INITIAL>^\n { enum mimetype type = get_content_type();
 /* An empty line ends the header.  The start condition for the body
    is picked from the content type: HTML for MIME_TEXT_HTML, TEXT by
    default.  For MIME_MESSAGE the scanner goes back to INITIAL through
    yy_set_state_initial(), which also sets msg_header to true again. */
267 have_body = true;
268 msg_header = false;
269 clr_tag();
270 switch (type) {
271 case MIME_TEXT_HTML: BEGIN HTML; break;
272 case MIME_MESSAGE: yy_set_state_initial(); break;
273 default: BEGIN TEXT;
274 }
275 if (DEBUG_LEXER(1))
276 fprintf(dbgout, "*** end of header\n");
277 return EOH;
278 }
 /* a token at the start of a header line resets the tag through header() */
279 <INITIAL>^{TOKEN} { header(); return TOKEN; }
280 <INITIAL>\n { lineno += 1; }
281
 /* skip_to('=') trims the match to the part before the equals sign */
282 <INITIAL>{VERP} { skip_to('='); return VERP; }
283
284 ^-----BEGIN\ PGP\ SIGNATURE-----$ { BEGIN PGP_HEAD;
 /* change state, then push the marker line back to be scanned again */
285 yy_unput((byte *)yytext, yyleng);
286 }
 /* an empty line moves from PGP_HEAD to PGP_BODY, where tokens are dropped */
287 <PGP_HEAD>^\n { BEGIN PGP_BODY; }
288 <PGP_BODY>{TOKEN} { /* ignore */ }
289 ^-----END\ PGP\ SIGNATURE-----$ { BEGIN TEXT;
 /* back to TEXT; the marker line is pushed back here as well */
290 yy_unput((byte *)yytext, yyleng);
291 }
292 ^--{MIME_BOUNDARY}(--)?$ { if (got_mime_boundary(yy_text())) {
 /* accepted by got_mime_boundary(): return to the INITIAL state */
293 yy_set_state_initial();
294 return BOUNDARY;
295 } else {
 /* not accepted: keep only the two leading dashes as the match and
    rescan the remainder; nothing is returned from this action */
296 yyless(2);
297 }
298 }
299
300 /* This has to match just as much or more than the below rules, so as to be the
301 controlling rule. */
302 <HTML>{TOKEN}({HTMLTOKEN}*{BREAKHTML}+{HTMLTOKEN}*.?|{HTMLTOKEN}+{WHITESPACE}) {
 /* give back everything from the first < onward, so that only the
    leading token text is returned and the tags are scanned next */
303 char *chr = (char *)memchr(yytext, '<', yyleng); /* find start of html tag */
304 size_t len = chr - yytext;
305 yyless(len);
306 return TOKEN;
307 }
308
 /* token followed by tags and then more non-blank text: html_reorder()
    pushes the tags back ahead of the token text */
309 <HTML>{TOKEN}{HTMLTOKEN}+/{NOTWHITESPACE} { html_reorder(); }
310
 /* tag and comment openers select the start condition used for the innards */
311 <HTML>"<!--" { BEGIN SCOMMENT; }
312 <HTML>"<!" { BEGIN LCOMMENT; }
313 <HTML>"<"(a|img|font){WHITESPACE} { BEGIN HTOKEN; }
314 <HTML>"<" { BEGIN HDISCARD; } /* unknown tag */
315
 /* only tokens inside a, img and font tags (HTOKEN) are returned */
316 <HTOKEN>{TOKEN} { return TOKEN; }
317 <HDISCARD,LCOMMENT,SCOMMENT>{TOKEN} { /* discard innards of html tokens and comments */ }
318
319 <HTOKEN,HDISCARD,LCOMMENT>">" { BEGIN HTML; } /* end of tag, loose comment; return to normal html processing */
320 <SCOMMENT>"-->" { BEGIN HTML; } /* end of strict comment; return to normal html processing */
 /* this rule has no start condition list: a DOCTYPE line switches to HTML */
321 "<"\!DOCTYPE\ HTML\ PUBLIC\ .*">" { BEGIN HTML; }
322
323 {IPADDR} { return IPADDR;}
 /* a dotted quad wrapped in square brackets */
324 "\["({IPADDR})"\]" { return MESSAGE_ADDR;}
325
326 {TOKEN} { return TOKEN;}
327
328 <HTML>{TOKEN}?{HTML_ENCODING} { html_char(); } /* process escaped chars, eg '&#97;' is 'a' */
329 <HTOKEN>"/"[^/[:blank:]\n%]*{URL_ENCODING}+ { url_char(); } /* process escaped chars, eg '%61' is 'a' */
330
331 \${NUM}(\.{NUM})? { return MONEY;} /* Dollars and cents */
332
333 . /* ignore character */
 /* a newline outside the more specific rules also clears the current tag */
334 \n { lineno += 1; clr_tag(); }
335 <<EOF>> { return NONE; }
336 %%
337
/* Make "Header" the current tag by passing it to set_tag(). */
static void header(void)
{
	set_tag("Header");
}
342
343 void lexer_v3_init(FILE *fp)
344 {
345 lineno = 0;
346 have_body = false;
347 yy_set_state_initial();
348 yyrestart(fp);
349 }
350
351 static void skip_to(char chr)
352 {
353 size_t len = strchr(yytext, chr) - yytext;
354 yyless(len);
355 }
356
357 static void html_reorder(void)
358 {
359 char *chr = (char *)memchr(yytext, '<', yyleng); /* find start of html tag */
360 size_t len = chr - yytext;
361 char *yycopy = (char *)xmalloc(yyleng + 1); /* +1 for NUL byte below */
362
363 memcpy(yycopy, yytext+len, yyleng-len); /* copy tag to start of buffer */
364 memcpy(yycopy+yyleng-len, yytext, len); /* copy leading text to end of buffer */
365 yycopy[yyleng] = '\0'; /* for debugging */
366
367 yy_unput((byte *)yycopy, yyleng);
368 xfree(yycopy);
369 }
370
/*
 * Convert a run of hexadecimal digits to an int.
 *
 * in  - text to convert; need not be NUL terminated
 * len - maximum number of characters to examine
 *
 * Conversion stops at the first character that is not a hex digit or
 * after len characters, whichever comes first.  Returns the value
 * converted so far, which is 0 when no digit was consumed.
 *
 * The length is tested before the character is read, so no byte beyond
 * in[len - 1] is ever touched.  Callers in this file pass at most four
 * digits; a len above seven could overflow the int result.
 */
static int xtoi(const char *in, size_t len)
{
	int val = 0;

	while (len > 0 && isxdigit((unsigned char)*in)) {
		unsigned char c = (unsigned char)*in++;
		int digit = isdigit(c) ? (c - '0') : (tolower(c) - 'a' + 10);

		val = (val << 4) + digit;
		len--;
	}
	return val;
}
383
384 static void html_char(void)
385 {
386 char *txt = strstr(yytext, "&#"); /* find decodable char */
387 size_t len = txt - yytext;
388 int val;
389 char *yycopy = NULL;
390
391 if (len != 0) {
392 yycopy = (char *)xmalloc(yyleng + 1); /* +1 for NUL byte below */
393 memcpy(yycopy, yytext, yyleng); /* copy tag to start of buffer */
394 yycopy[yyleng] = '\0'; /* for debugging */
395 }
396
397 txt += 2;
398 val = isdigit((byte) *txt) ? atoi(txt) : xtoi(txt+1, 4);
399
400 /* xtoi() limits conversion to 4 characters */
401 /* atoi() limits value to 0x7fffffff, i.e. 2147483647 */
402 /* no problem on linux */
403
404 if ((val > 0) && (val < 256) &&
405 isprint(val)) { /* use it if printable */
406 yyunput(val, yytext);
407 yyleng = len; /* adjust len to pre-char count */
408 }
409 else {
410 if (yycopy)
411 yycopy[yyleng-1] = ' '; /* prevents parsing loop */
412 }
413
414 if (yycopy != NULL) {
415 yy_unput((byte *)yycopy, yyleng);
416 xfree(yycopy);
417 }
418 }
419
420 static void url_char(void)
421 {
422 char *src, *dst;
423 src = dst = yytext;
424
425 while (src < yytext + yyleng) {
426 char c = *src++;
427 if (c == '%') {
428 c = xtoi(src, 2);
429 src += 2;
430 }
431 *dst++ = c;
432 }
433 while (dst > yytext) {
434 yyunput(*--dst, yytext);
435 }
436 }
437
438 static void yy_unput(const byte *txt, uint len)
439 {
440 while (len-- > 0)
441 yyunput(txt[len], yytext);
442 }
443
444 char yy_get_state()
445 {
446 switch (YYSTATE) {
447 case INITIAL: return 'i';
448 case TEXT: return 't';
449 case HTML:
450 case HTOKEN: return 'h';
451 case SCOMMENT: return 's';
452 case LCOMMENT: return 'l';
453 default: return 'o';
454 }
455 }
456
457 void yy_set_state_initial(void)
458 {
459 BEGIN INITIAL;
460 msg_header = true;
461 header();
462
463 if (DEBUG_LEXER(1))
464 fprintf(dbgout, "BEGIN INITIAL\n");
465
466 #ifdef FLEX_DEBUG
467 yy_flex_debug = BOGOTEST('L');
468 #endif
469 }
470
471 long lexer_v3_get_token(byte **output)
472 {
473 *output = (byte *)yytext;
474 return yyleng;
475 }
476
477 /*
478 * The following sets edit modes for GNU EMACS
479 * Local Variables:
480 * mode:c
481 * indent-tabs-mode:t
482 * End:
483 */
484