1 %{
2 /*
3  * NAME
4  *   lexer_v3.l -- bogofilter's lexical analyzer for message headers
5  *
6  *   01/01/2003 - split out of lexer.l
7  *
8 */
9 
10 /*
11  * Our lexical analysis is different from Paul Graham's rules:
12  *
13  * We throw away headers that are readily identifiable as dates.
14  * We throw away all digit strings that don't look like IP address parts.
15  * We throw away lines beginning with <tab>id<space> -- mailer IDs.
16  *
17  * These are optimizations to keep the token lists from bloating.
18  * The big win is recognizing machine-generated unique IDs that
19  * we'll never see again and shouldn't keep in the token lists.
20  *
21  * We don't treat dot between two alphanumerics as a separator,
22  * because we want to keep domain names and IP addresses together as
23  * recognizable units.
24  *
25  * Having done the above, there isn't much need to recognize URLs.
26  * If a URL is a spam indicator, very likely any other URL from the
27  * same site is as well, so the hostname part should be an adequate
28  * statistical trigger.
29  *
30  * LEXED_TOKENS, which are found in "msg-count" files need a special pattern
31  * because they can be:
32  *	1 - normal bogofilter tokens
33  *	2 - url:xxx and subj: tokens
34  *	3 - mime boundaries
35  */
36 
37 /* 12 May 2003
38  * Added Paul Graham's latest ideas on parsing.
39  * (From http://www.paulgraham.com/better.html)
40  *
41  * 1. Case is preserved.
42  *
43  * 2. Exclamation points are constituent characters.
44  *
45  * 3. Periods and commas are constituents if they occur between two
46  *    digits. This lets me get ip addresses and prices intact.
47  *
48  * 4. A price range like $20-25 yields two tokens, $20 and $25.
49  *
50  * 5. Tokens that occur within the To, From, Subject, and Return-Path
51  *    lines, or within urls, get marked accordingly.
52  *    For example, "foo" in the Subject line becomes "subj:foo".
53 */
54 
55 /* DR 08/29/2003:
56 **
57 ** With flex-2.5.31 and '%option never-interactive noreject', file
58 ** msg.dr.0118.base64 (in tests/bogofilter/inputs/split.d) parses
59 ** incorrectly because line 24 isn't base64 decoded.
60 */
61 
/* NOTE(review): presumably tells flex not to emit its input() helper, which
 * nothing visible in this file calls -- confirm against the flex version in use. */
62 #define YY_NO_INPUT
63 
64 #include "common.h"
65 
66 #include <ctype.h>
67 #include <stdlib.h>
68 
69 #include "buff.h"
70 #include "charset.h"
71 #include "lexer.h"
72 #include "mime.h"		/* for mime_*() */
73 #include "msgcounts.h"
74 #include "textblock.h"
75 #include "token.h"
76 #include "xmalloc.h"
77 
/* The generated scanner entry point takes no arguments and returns a token_t. */
78 #define YY_DECL token_t yylex(void)
79     YY_DECL;			/* declare function */
80 
/* Buffer refills go through yyinput() (declared elsewhere) instead of the
 * default stdio read; its return value becomes the byte count in `result`. */
81 #define YY_INPUT(buf,result,max_size) result = yyinput((byte *)buf, result, max_size)
/* NOTE(review): presumably the exit status used by the scanner's fatal-error
 * path; EX_ERROR comes from a project header -- confirm. */
82 #define YY_EXIT_FAILURE EX_ERROR
83 
/* Anything the generated code writes to "stderr" goes to dbgout instead. */
84 #undef	stderr
85 #define	stderr	dbgout		/* for debug & -D options */
86 
/* Count of newlines consumed by the rules below; reset in lexer_v3_init(). */
87 static int lineno;
88 
/* Pack a flex version into one integer: 2.5.31 becomes 2531. */
89 #define	FLEX_VER(MAJ, MIN, SUB) ( MAJ * 1000 + MIN * 100 + SUB)
90 
/* Versions that do not define a subminor number are treated as x.y.0. */
91 #ifndef	YY_FLEX_SUBMINOR_VERSION
92 #define	FLEX_VERSION_BF FLEX_VER(YY_FLEX_MAJOR_VERSION, YY_FLEX_MINOR_VERSION, 0)
93 #else
94 #define	FLEX_VERSION_BF FLEX_VER(YY_FLEX_MAJOR_VERSION, YY_FLEX_MINOR_VERSION, YY_FLEX_SUBMINOR_VERSION)
95 #endif
96 
/* For flex older than 2.5.31 this file supplies the yylineno definition itself. */
97 #if	FLEX_VERSION_BF < 2531
98 int yylineno;
99 #endif
100 
101 /* Function Prototypes/Forward Declarations */
102 
/* helpers private to this scanner */
103 static word_t *yy_text(void);
104 static void html_char(void);
105 static void html_reorder(void);
106 
107 static void url_char(void);
108 
109 static void skip_to(char chr);
110 static void yy_unput(const byte *txt, uint len);
111 
/* non-static: callable from other translation units */
112 char yy_get_state(void);
113 void yy_set_state_initial(void);
114 
115 static void header(void);
116 
117 /* Function Definitions */
118 
yy_text(void)119 static word_t *yy_text(void)
120 {
121     static word_t yyt;
122     yyt.u.text = (byte *)yytext;
123     yyt.leng = yyleng;
124     return &yyt;
125 }
126 
127 %}
128 
129 %option warn
/* NOTE(review): both "nodebug" and "debug" are given on the next line;
   presumably the later one wins -- confirm which setting is intended. */
130 %option nodebug debug
/* caseless: patterns match regardless of letter case; 8bit: accept bytes >= 0x80 */
131 %option align caseless 8bit
132 /***********************************************************************
133  WARNING: The scanner must be interactive so as not to look ahead past
134  \n = EOH and other header/body delimiters, else it will fail when MIME
135  decoding, because part of the body has already been read ahead.
136  ***********************************************************************/
137 %option interactive always-interactive
138 %option noreject noyywrap
139 
/* One decimal octet 0-255 (leading zeros allowed) and a dotted quad of four of them. */
140 UINT8		([01]?[0-9]?[0-9]|2([0-4][0-9]|5[0-5]))
141 IPADDR		{UINT8}\.{UINT8}\.{UINT8}\.{UINT8}
/* Characters accepted in a MIME boundary.  BCHARS also allows a space, so a
   boundary may contain spaces but its last character must be a non-space. */
142 BCHARSNOSPC	[[:alnum:]()+_,-./:=?#\']
143 BCHARS		[[:alnum:]()+_,-./:=?#\' ]
144 MIME_BOUNDARY	{BCHARS}*{BCHARSNOSPC}
145 
/* ID: letters, digits, '-' and '.', optionally wrapped in angle brackets. */
146 ID		<?[[:alnum:]\-\.]+>?
/* CHARSET: letters, digits and '-'. */
147 CHARSET		[[:alnum:]-]+
/* VERPID: run of alnum/'#'/'-' containing at least one digit with at least
   one character on either side of it. */
148 VERPID		[[:alnum:]#-]+[[:digit:]]+[[:alnum:]#-]+
/* MTYPE: optional blanks followed by letters, digits, '/' and '-'. */
149 MTYPE		[[:blank:]]*[[:alnum:]/-]*
150 
151 NUM		[[:digit:]]+
/* Two numbers, each preceded by a single space. */
152 NUM_NUM		\ {NUM}\ {NUM}
/* Start of line, a double quote, any one character, the text MSG_COUNT and a
   closing double quote (the dot is unescaped, so it matches any character). */
153 MSG_COUNT	^\".MSG_COUNT\"
154 
/* FRONT_CHAR: first character of a token -- no blank, control, digit or punctuation. */
155 FRONT_CHAR	[^[:blank:][:cntrl:][:digit:][:punct:]]
/* MID_CHAR: interior token character -- excludes blanks, controls and the listed separators. */
156 MID_CHAR	[^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]]
/* BOGOLEX_CHAR: like MID_CHAR, but ':', '$' and '*' are allowed. */
157 BOGOLEX_CHAR	[^[:blank:][:cntrl:]   <>;=()&%#@+|/\\{}^\"?,\[\]]
/* BACK_CHAR: last token character -- MID_CHAR minus . _ ~ ' ` and -. */
158 BACK_CHAR	[^[:blank:][:cntrl:]:$*<>;=()&%#@+|/\\{}^\"?,\[\]._~\'\`\-]
159 
/* A token is a FRONT_CHAR alone, or FRONT_CHAR, any MID_CHARs, then one BACK_CHAR. */
160 TOKEN		{FRONT_CHAR}({MID_CHAR}*{BACK_CHAR})?
161 
162 /*  RFC2047.2
163     encoded-word = "=?" charset "?" encoding "?" encoded-text "?="
164     charset = token    ; see section 3
165     encoding = token   ; see section 4
166     token = 1*<Any CHAR except SPACE, CTLs, and especials>
167     especials = "(" / ")" / "<" / ">" / "@" / "," / ";" / ":" / "
168 		<"> / "/" / "[" / "]" / "?" / "." / "="
169     encoded-text = 1*<Any printable ASCII character other than "?"
170 		      or SPACE>
171 		   ; (but see "Use of encoded-words in message
172 		   ; headers", section 5)
173 */
174 
175 /* 09/01/03
176   Using "[^?]" in the pattern and validating the charset in 'C'
177   reduces executable size by approx 290k.
178   new: ENCODED_WORD =\?{CHARSET}\?[bq]\?[^?]*\?=
179   old: ENCODED_WORD =\?{CHARSET}\?(b\?{BASE64}\|q\?{QP})\?=
180 
181   BASE64	[0-9a-zA-Z/+=]+
182   QP		[!->@-~]+
183 */
184 
/* Blank (space/tab) or newline, and its complement. */
185 WHITESPACE	[[:blank:]\n]
186 NOTWHITESPACE	[^[:blank:]\n]
187 
/* Numeric HTML character reference: "&#", optional x, hex digits, ";". */
188 HTML_ENCODING	"&#"x?[[:xdigit:]]+";"
/* Percent escape: "%" followed by exactly two hex digits. */
189 URL_ENCODING	"%"[[:xdigit:]][[:xdigit:]]
190 
/* RFC 2047 encoded-word; the encoded text may not contain '?' or a newline. */
191 ENCODED_WORD	=\?{CHARSET}\?[bq]\?[^?\n]*\?=
/* Optional leading plain text, then one or more encoded-words separated by whitespace. */
192 ENCODED_TOKEN	({FRONT_CHAR}{MID_CHAR}*)?({ENCODED_WORD}{WHITESPACE}+)*{ENCODED_WORD}
193 
194 /*
195 HTML_WI_COMMENTS	"<"[^>]*">"
196 HTML_WO_COMMENTS	"<"[^!][^>]*">"\|"<>"
197 */
198 
/* Any tag-like span: "<", anything but ">", then ">". */
199 HTMLTOKEN		"<"[^>]*">"
200 
201 /*
202  * Generally, there are some html tags that cause an "eyebreak" and some
203  * that do not. For example, the "P" tag or the "BR" tag cause a break,
204  * and can be interpreted in place, while, the B (bold) tag does not.
205  * No close tags seem to cause a break.
206  * Comments do not.  This is an attempt to make an exhaustive list of
207  * tags that cause an "eyebreak". When the exit tag also causes a break,
208  * we include the /?. I believe this to be a complete list of tags that
209  * can cause a formatting break.
210  */
211 
212 HBREAK		p|br|li|h[1-6]|hr|title|table|center|dd|dt|iframe|img|input|select|td|textarea|th|\/?(div|blockquote|pre|dir|dl|fieldset|legend|form|menu|ol|ul)
213 
/* A break tag, either bare or followed by whitespace and attributes. */
214 BREAKHTML	"<"({HBREAK}({WHITESPACE}[^>]*|""))">"
215 
/* Shape matched: token "-" verpid "-" token "=" token "@" token. */
216 VERP		{TOKEN}-{VERPID}-{TOKEN}={TOKEN}@{TOKEN}
217 
/* Inclusive start conditions (%s): rules without a <...> prefix stay active in all of them. */
/* body states, plus BOGO_LEX which is entered by the MSG_COUNT rule */
218 %s TEXT HTML BOGO_LEX
/* inside an html tag (kept / discarded) or an html comment (strict "<!--" / loose "<!") */
219 %s HTOKEN HDISCARD SCOMMENT LCOMMENT
/* inside a PGP signature block: before and after its blank line */
220 %s PGP_HEAD PGP_BODY
221 
222 %%
223 
224 <INITIAL,BOGO_LEX>{MSG_COUNT}{NUM_NUM}		{ if (lineno == 0) {
						      /* only on the first line: switch to BOGO_LEX and hand the
							 text after the first space (the two numbers) to msgcounts */
225 						      BEGIN BOGO_LEX;
226 						      set_msg_counts_from_str(strchr(yytext, ' ') + 1);
227 						  }
228 						  return MSG_COUNT_LINE;
229 						}
  /* a whole line of the form "token" <num> <num> */
230 <BOGO_LEX>^\"{BOGOLEX_CHAR}+\"{NUM_NUM}$	{ return BOGO_LEX_LINE; }
231 <BOGO_LEX>\n					{ lineno += 1; }
232 
233 <INITIAL>{ENCODED_TOKEN}			{ word_t *raw = yy_text();
						  /* text_decode() is defined elsewhere; its result is pushed
						     back onto the input so it gets scanned as ordinary text */
234 						  word_t *txt = text_decode(raw);
235 						  yy_unput(txt->u.text, txt->leng);
236 						}
237 
  /* these header names select the tag passed to set_tag() */
238 <INITIAL>^(To|CC|From|Return-Path|Subject|Received):	{ set_tag(yytext); }
  /* the full match goes to mime_content(); skip_to(':') then trims the token to the header name */
239 <INITIAL>^Content-(Transfer-Encoding|Type|Disposition):{MTYPE}	{ mime_content(yy_text()); skip_to(':'); header(); return TOKEN; }
240 
241 <INITIAL>^Message-ID:.*				{ /* save token for logging */
						  /* 11 == length of "Message-ID:"; skip whitespace after it */
242 						  unsigned long off = 11;
243 						  while(isspace((unsigned char)yytext[off]) && off < INT_MAX && off < (unsigned long)yyleng)
244 						      off++;
245 						  set_msg_id((unsigned char *)(yytext+off), yyleng-off);
246 						  header();
247 						  return HEADKEY;
248 						}
  /* the next three patterns share one action: the whole line is returned as a single HEADKEY */
249 <INITIAL>^(Delivery-)?Date:.*			|
250 <INITIAL>^Resent-Message-ID:.*			|
251 <INITIAL>^(In-Reply-To|References):.* 		{ header(); return HEADKEY; }
252 
253 <INITIAL>boundary=[ ]*\"?{MIME_BOUNDARY}\"?	{ mime_boundary_set(yy_text()); }
  /* got_charset() sees the full match; skip_to('=') trims the returned token to "charset" */
254 <INITIAL>charset=\"?{CHARSET}\"?		{ got_charset(yytext); skip_to('='); header(); return TOKEN; }
255 
256 <INITIAL>(file)?name=\"?			/* ignore */
257 <INITIAL>[[:blank:]]id{WHITESPACE}+{ID}		{ return QUEUE_ID; }
258 
259  /**********************************************************************
260  WARNING: Do NOT add header (<INITIAL>) rules that require characters
261  beyond a LF character (\n) - doing so will make the parser read ahead
262  parts of the body in the wrong MIME decoding mode and goof up
263  seriously.
264  **********************************************************************/
265 
266 <INITIAL>^\n					{ enum mimetype type = get_content_type();
						  /* empty line: the header is over; choose the body
						     start condition from the content type */
267 						  have_body = true;
268 						  msg_header = false;
269 						  clr_tag();
270 						  switch (type) {
271 						  case MIME_TEXT_HTML:	BEGIN HTML; break;
						  /* an embedded message starts with its own header */
272 						  case MIME_MESSAGE:	yy_set_state_initial(); break;
273 						  default:		BEGIN TEXT;
274 						  }
275 						  if (DEBUG_LEXER(1))
276 						      fprintf(dbgout, "*** end of header\n");
277 						  return EOH;
278 						}
  /* a token at the start of a header line gets the "Header" tag */
279 <INITIAL>^{TOKEN}				{ header(); return TOKEN; }
280 <INITIAL>\n					{ lineno += 1; }
281 
  /* only the part before the first '=' is returned; the rest is rescanned */
282 <INITIAL>{VERP} 				{ skip_to('='); return VERP; }
283 
284 ^-----BEGIN\ PGP\ SIGNATURE-----$		{ BEGIN PGP_HEAD;
						  /* the marker line itself is pushed back onto the input */
285 						  yy_unput((byte *)yytext, yyleng);
286 						}
  /* the empty line inside the signature block switches PGP_HEAD to PGP_BODY */
287 <PGP_HEAD>^\n					{ BEGIN PGP_BODY; }
288 <PGP_BODY>{TOKEN}				{ /* ignore */ }
289 ^-----END\ PGP\ SIGNATURE-----$ 		{ BEGIN TEXT;
290 						  yy_unput((byte *)yytext, yyleng);
291 						}
292 ^--{MIME_BOUNDARY}(--)?$			{ if (got_mime_boundary(yy_text())) {
						      /* accepted boundary: a new part header follows */
293 						      yy_set_state_initial();
294 						      return BOUNDARY;
295 						  } else {
						      /* not accepted: keep only the leading "--" and rescan the rest */
296 						      yyless(2);
297 						  }
298 						}
299 
300   /* This has to match just as much or more than the below rules, so as to be the
301      controlling rule. */
302 <HTML>{TOKEN}({HTMLTOKEN}*{BREAKHTML}+{HTMLTOKEN}*.?|{HTMLTOKEN}+{WHITESPACE})		{
			/* return only the text before the first '<'; the tags are rescanned */
303     			char *chr = (char *)memchr(yytext, '<', yyleng);	/* find start of html tag */
304 			size_t len = chr - yytext;
305 			yyless(len);
306 			return TOKEN;
307 			}
308 
  /* token followed by tags and then a non-whitespace character: html_reorder()
     pushes the tags back in front of the token text */
309 <HTML>{TOKEN}{HTMLTOKEN}+/{NOTWHITESPACE}	{ html_reorder(); }
310 
311 <HTML>"<!--"					{ BEGIN SCOMMENT; }
312 <HTML>"<!"					{ BEGIN LCOMMENT; }
  /* tokens inside a, img and font tags are kept (HTOKEN) */
313 <HTML>"<"(a|img|font){WHITESPACE}		{ BEGIN HTOKEN; }
314 <HTML>"<"					{ BEGIN HDISCARD; }	/* unknown tag */
315 
316 <HTOKEN>{TOKEN}					{ return TOKEN; }
317 <HDISCARD,LCOMMENT,SCOMMENT>{TOKEN}		{ /* discard innards of html tokens and comments */ }
318 
319 <HTOKEN,HDISCARD,LCOMMENT>">"			{ BEGIN HTML; }	/* end of tag, loose comment; return to normal html processing */
320 <SCOMMENT>"-->"					{ BEGIN HTML; }	/* end of strict comment; return to normal html processing */
  /* no start condition prefix: a DOCTYPE line switches to HTML from any inclusive state */
321 "<"\!DOCTYPE\ HTML\ PUBLIC\ .*">" 		{ BEGIN HTML; }
322 
323 {IPADDR}					{ return IPADDR;}	/* bare dotted quad */
324 "\["({IPADDR})"\]"				{ return MESSAGE_ADDR;}	/* dotted quad in square brackets */
325 
326 {TOKEN}						{ return TOKEN;}
327 
328 <HTML>{TOKEN}?{HTML_ENCODING}			{ html_char(); }	/* process escaped chars, eg '&#97;' is 'a' */
329 <HTOKEN>"/"[^/[:blank:]\n%]*{URL_ENCODING}+	{ url_char(); }		/* process escaped chars, eg '%61'    is 'a' */
330 
331 \${NUM}(\.{NUM})?				{ return MONEY;}	/* Dollars and cents */
332 
333 .						/* ignore character */
  /* every newline outside the INITIAL/BOGO_LEX rules also clears the current tag */
334 \n						{ lineno += 1; clr_tag(); }
335 <<EOF>>						{ return NONE; }
336 %%
337 
/*
 * header -- call set_tag() (defined elsewhere) with the literal "Header".
 * Invoked by the header-line rules and by yy_set_state_initial().
 */
338 static void header(void)
339 {
340     set_tag("Header");
341 }
342 
343 void lexer_v3_init(FILE *fp)
344 {
345     lineno = 0;
346     have_body = false;
347     yy_set_state_initial();
348     yyrestart(fp);
349 }
350 
351 static void skip_to(char chr)
352 {
353     size_t len = strchr(yytext, chr) - yytext;
354     yyless(len);
355 }
356 
357 static void html_reorder(void)
358 {
359     char *chr = (char *)memchr(yytext, '<', yyleng);	/* find start of html tag */
360     size_t len = chr - yytext;
361     char *yycopy = (char *)xmalloc(yyleng + 1); 	/* +1 for NUL byte below */
362 
363     memcpy(yycopy, yytext+len, yyleng-len);	/* copy tag to start of buffer */
364     memcpy(yycopy+yyleng-len, yytext, len);	/* copy leading text to end of buffer */
365     yycopy[yyleng] = '\0';			/* for debugging */
366 
367     yy_unput((byte *)yycopy, yyleng);
368     xfree(yycopy);
369 }
370 
/*
 * xtoi -- convert the leading hexadecimal digits of `in` to an int.
 *
 * At most len characters are examined; conversion stops early at the
 * first character that is not a hex digit.  Returns 0 when no digit is
 * consumed.  There is no overflow check: the callers in this file pass
 * len 2 or 4.
 */
static int xtoi(const char *in, size_t len)
{
    int val = 0;

    /* test the bound before looking at *in, so no byte past len is read */
    for (; len > 0 && isxdigit((unsigned char)*in); in++, len--) {
	int c = (unsigned char)*in;
	int digit = isdigit(c) ? (c - '0') : (tolower(c) - 'a' + 10);

	val = (val << 4) + digit;
    }
    return val;
}
383 
384 static void html_char(void)
385 {
386     char *txt = strstr(yytext, "&#");	/* find decodable char */
387     size_t len = txt - yytext;
388     int  val;
389     char *yycopy = NULL;
390 
391     if (len != 0) {
392 	yycopy = (char *)xmalloc(yyleng + 1);	/* +1 for NUL byte below */
393 	memcpy(yycopy, yytext, yyleng);	/* copy tag to start of buffer */
394 	yycopy[yyleng] = '\0';		/* for debugging */
395     }
396 
397     txt += 2;
398     val = isdigit((byte) *txt) ? atoi(txt) : xtoi(txt+1, 4);
399 
400     /* xtoi() limits conversion to 4 characters */
401     /* atoi() limits value to 0x7fffffff, i.e. 2147483647 */
402     /* no problem on linux */
403 
404     if ((val > 0) && (val < 256) &&
405 	isprint(val)) {			/* use it if printable */
406 	yyunput(val, yytext);
407 	yyleng = len;			/* adjust len to pre-char count */
408     }
409     else {
410 	if (yycopy)
411 	    yycopy[yyleng-1] = ' ';	/* prevents parsing loop */
412     }
413 
414     if (yycopy != NULL) {
415 	yy_unput((byte *)yycopy, yyleng);
416 	xfree(yycopy);
417     }
418 }
419 
420 static void url_char(void)
421 {
422     char *src, *dst;
423     src = dst = yytext;
424 
425     while (src < yytext + yyleng) {
426 	char c = *src++;
427 	if (c == '%') {
428 	    c = xtoi(src, 2);
429 	    src += 2;
430 	}
431 	*dst++ = c;
432     }
433     while (dst > yytext) {
434 	yyunput(*--dst, yytext);
435     }
436 }
437 
438 static void yy_unput(const byte *txt, uint len)
439 {
440     while (len-- > 0)
441 	yyunput(txt[len], yytext);
442 }
443 
444 char yy_get_state()
445 {
446     switch (YYSTATE) {
447     case INITIAL:  return 'i';
448     case TEXT:     return 't';
449     case HTML:
450     case HTOKEN:   return 'h';
451     case SCOMMENT: return 's';
452     case LCOMMENT: return 'l';
453     default:       return 'o';
454     }
455 }
456 
457 void yy_set_state_initial(void)
458 {
459     BEGIN INITIAL;
460     msg_header = true;
461     header();
462 
463     if (DEBUG_LEXER(1))
464 	fprintf(dbgout, "BEGIN INITIAL\n");
465 
466 #ifdef	FLEX_DEBUG
467     yy_flex_debug = BOGOTEST('L');
468 #endif
469 }
470 
471 long lexer_v3_get_token(byte **output)
472 {
473 	*output = (byte *)yytext;
474 	return yyleng;
475 }
476 
477 /*
478  * The following sets edit modes for GNU EMACS
479  * Local Variables:
480  * mode:c
481  * indent-tabs-mode:t
482  * End:
483  */
484