1 /*
2 * Copyright (C) 2002 Laird Breyer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 * Author: Laird Breyer <laird@lbreyer.com>
19 */
20
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24
25 #include <ctype.h>
26 #include <string.h>
27 #include <stdlib.h>
28
29 #include "dbacl.h"
30
31
32 /* most functions in this file are logically identical in wide char
33 * and multibyte versions, except that char is replaced by wchar_t
34 * etc. It's become unwieldy to keep two slightly different copies of
35 * all functions involved, so we use the preprocessor to build a poor
36 * man's template facility.
37 *
38 * The "template" macros work as follows:
39 * mbw_lit("abc") -> "abc" or L"abc"
40 * mbw_t -> char or wchar_t
41 * mbw_prefix(good_char)(x) -> good_char(x) or w_good_char(x)
42 *
43 * Once the template macros have done their work, we obtain ordinary
44 * functions named in a parallel fashion, where the wide character
45 * functions have a w_ prefix, and instances of char are substituted
46 * with instances of wchar_t.
47 *
 * The code below is split into uncommon code, where the
 * implementations of corresponding functions differ, and common
 * code, where the implementation is identical. Only identical code is
 * "templatized".
52 */
53
54 #include "mbw.h"
55
56 extern options_t u_options;
57 extern charparser_t m_cp;
58 extern options_t m_options;
59
60 extern myregex_t re[MAX_RE];
61 extern regex_count_t regex_count;
62
63 extern long system_pagesize;
64
65 /* uncommon code */
66
67 /***********************************************************
68 * UTILITY FUNCTIONS *
69 ***********************************************************/
70
71 #if defined HAVE_MBRTOWC && defined MBW_WIDE
72 /* compiler doesn't seem to know this function is in the
73 * library, so we define our own - bug or just plain weird? */
/*
 * Case-insensitive comparison of at most n wide characters.
 *
 * s1, s2: NUL terminated wide strings (must not be NULL).
 * n:      maximum number of characters to compare.
 *
 * Returns 0 if the first n characters (or the whole strings, when they
 * end earlier) are equal ignoring case, a negative value if s1 sorts
 * before s2 and a positive value otherwise.
 */
static __inline__
int mywcsncasecmp(const wchar_t *s1, const wchar_t *s2, size_t n) {
  size_t i;
  for(i = 0; i < n; i++) {
    const wint_t a = towlower((wint_t)s1[i]);
    const wint_t b = towlower((wint_t)s2[i]);
    if( a != b ) {
      /* wint_t may be unsigned, so report the sign explicitly */
      return (a < b) ? -1 : 1;
    }
    if( s1[i] == L'\0' ) {
      /* both strings ended here: never read past the terminator */
      break;
    }
  }
  return 0;
}
86 #endif
87
88
89 #if defined HAVE_MBRTOWC && defined MBW_WIDE
90
/*
 * Maps a wide character to its base64 value:
 * 'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61,
 * '+' -> 62, '/' -> 63, and the padding character '=' -> 64.
 * Any other character yields -1.
 */
static __inline__
int w_b64_code(wchar_t c) {
  if( (L'A' <= c) && (c <= L'Z') ) {
    return (c - L'A');
  }
  if( (L'a' <= c) && (c <= L'z') ) {
    return (c - L'a') + 26;
  }
  if( (L'0' <= c) && (c <= L'9') ) {
    return (c - L'0') + 52;
  }
  switch( c ) {
  case L'+':
    return 62;
  case L'/':
    return 63;
  case L'=':
    return 64;
  default:
    break;
  }
  return -1;
}
109
/*
 * Maps a wide character to the value of the hexadecimal digit it
 * represents in a quoted-printable escape. Only '0'-'9' and upper case
 * 'A'-'F' are accepted; everything else yields -1.
 */
static __inline__
int w_qp_code(wchar_t c) {
  if( (L'0' <= c) && (c <= L'9') ) {
    return (c - L'0');
  }
  if( (L'A' <= c) && (c <= L'F') ) {
    return (c - L'A') + 10;
  }
  return -1;
}
120
121
122 #else
123
/* warning: only use char here so we never have to bother about endianness */
/*
 * Lookup table giving the base64 value of every byte: 0..63 for the
 * alphabet, 64 for the padding character '=', and -1 for bytes which
 * are not part of the encoding. Each row covers ten consecutive bytes.
 */
static const signed char b64_code_table[256] = {
  /*   0 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /*  10 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /*  20 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /*  30 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /*  40 */ -1,-1,-1,62,-1,-1,-1,63,52,53,
  /*  50 */ 54,55,56,57,58,59,60,61,-1,-1,
  /*  60 */ -1,64,-1,-1,-1, 0, 1, 2, 3, 4,
  /*  70 */  5, 6, 7, 8, 9,10,11,12,13,14,
  /*  80 */ 15,16,17,18,19,20,21,22,23,24,
  /*  90 */ 25,-1,-1,-1,-1,-1,-1,26,27,28,
  /* 100 */ 29,30,31,32,33,34,35,36,37,38,
  /* 110 */ 39,40,41,42,43,44,45,46,47,48,
  /* 120 */ 49,50,51,-1,-1,-1,-1,-1,-1,-1,
  /* 130 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 140 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 150 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 160 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 170 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 180 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 190 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 200 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 210 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 220 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 230 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 240 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 250 */ -1,-1,-1,-1,-1,-1
};

/* The argument is reduced to an unsigned byte before indexing, so that
 * bytes >= 0x80 held in a signed char cannot produce a negative index. */
#define b64_code(c) ((int)b64_code_table[(unsigned char)(c)])
155
/*
 * Lookup table giving the value of the hexadecimal digit represented
 * by each byte in a quoted-printable escape: '0'-'9' -> 0..9 and upper
 * case 'A'-'F' -> 10..15. Every other byte, including all bytes
 * >= 0x80, maps to -1. Each row covers ten consecutive bytes; all 256
 * entries are spelled out so that none silently defaults to 0.
 */
static const signed char qp_code_table[256] = {
  /*   0 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /*  10 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /*  20 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /*  30 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /*  40 */ -1,-1,-1,-1,-1,-1,-1,-1, 0, 1,
  /*  50 */  2, 3, 4, 5, 6, 7, 8, 9,-1,-1,
  /*  60 */ -1,-1,-1,-1,-1,10,11,12,13,14,
  /*  70 */ 15,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /*  80 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /*  90 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 100 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 110 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 120 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 130 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 140 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 150 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 160 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 170 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 180 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 190 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 200 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 210 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 220 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 230 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 240 */ -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  /* 250 */ -1,-1,-1,-1,-1,-1
};

/* The argument is reduced to an unsigned byte before indexing, so that
 * bytes >= 0x80 held in a signed char cannot produce a negative index. */
#define qp_code(c) ((int)qp_code_table[(unsigned char)(c)])
176
177 #endif
178
179
180 /* common code */
181
182 #if defined MBW_MB || (defined MBW_WIDE && defined HAVE_MBRTOWC)
183
184
185 /* this is dbacl's idea of an empty line. Note that a single \n or
186 * a \r\n both constitute an empty line, contrary to RFC 2822, which
187 * doesn't allow single \n chars in headers. However, we might be
188 * reading the mail from a Unix mbox, where \r\n was replaced with \n.
189 * We don't accept single \r however.
190 */
/* evaluates to 1 for a NULL pointer, an empty string, or a string
 * starting with "\n" or "\r\n", and to 0 otherwise. The argument is
 * parenthesized everywhere so that expressions can be passed safely;
 * note that it is still evaluated more than once. */
#define MBW_EMPTYLINE(line) ((!(line) || \
                              ((line)[0] == mbw_lit('\0')) || \
                              ((line)[0] == mbw_lit('\n')) || \
                              (((line)[0] == mbw_lit('\r')) && \
                               ((line)[1] == mbw_lit('\n')))) ? 1 : 0)

/* true when the line starts with two dashes and the third character
 * is not whitespace. The argument is evaluated more than once. */
#define MBW_DOUBLEDASH(line) (((line)[0] == mbw_lit('-')) && \
                              ((line)[1] == mbw_lit('-')) && \
                              !mbw_isspace((line)[2]))
200
201
202 /***********************************************************
203 * TABLES *
204 ***********************************************************/
205 typedef struct {
206 const mbw_t *type_subtype;
207 MIME_Content_Type medium;
208 } mbw_prefix(MIME_Media);
209
210 /* Wildcards such as text are represented as "text/" and must
211 * be placed after all other text/xxx types.
212 * More generally, comparison uses mystrcasestr, so the smallest strings
213 * must come after the more detailed ones.
214 *
215 * For a description of official mime types, see
216 * http://www.iana.org/assignments/
217 */
218 static const mbw_prefix(MIME_Media) mbw_prefix(mime_media)[] = {
219 { mbw_lit("text/html"), ctTEXT_HTML },
220 { mbw_lit("text/xhtml"), ctTEXT_HTML },
221 { mbw_lit("text/plain"), ctTEXT_PLAIN },
222 { mbw_lit("text/richtext"), ctTEXT_RICH },
223 { mbw_lit("text/enriched"), ctTEXT_RICH },
224 { mbw_lit("text/rtf"), ctTEXT_PLAIN },
225 { mbw_lit("text/xml"), ctTEXT_XML },
226 { mbw_lit("text/sgml"), ctTEXT_SGML },
227 { mbw_lit("text/"), ctTEXT_PLAIN },
228
229 { mbw_lit("multipart/"), ctTEXT_PLAIN },
230
231 { mbw_lit("message/rfc822"), ctMESSAGE_RFC822 },
232 { mbw_lit("message/partial"), ctOTHER },
233 { mbw_lit("message/external-body"), ctMESSAGE_RFC822 },
234 { mbw_lit("message/news"), ctMESSAGE_RFC822 },
235 { mbw_lit("message/"), ctOCTET_STREAM },
236
237 { mbw_lit("application/sgml"), ctTEXT_PLAIN },
238 { mbw_lit("application/xml"), ctTEXT_PLAIN },
239 { mbw_lit("application/rtf"), ctTEXT_PLAIN },
240 { mbw_lit("application/news-transmission"), ctMESSAGE_RFC822 },
241 { mbw_lit("application/andrew-inset"), ctTEXT_PLAIN },
242 { mbw_lit("application/msword"), ctAPPLICATION_MSWORD },
243 { mbw_lit("application/"), ctOCTET_STREAM },
244
245 { mbw_lit("image/"), ctIMAGE },
246 { mbw_lit("audio/"), ctAUDIO },
247 { mbw_lit("video/"), ctVIDEO },
248 { mbw_lit("model/"), ctMODEL },
249 };
250 static int num_mime_media = sizeof(mbw_prefix(mime_media))/sizeof(mbw_prefix(MIME_Media));
251
252 static const mbw_t *mbw_prefix(armor_start)[] = {
253 mbw_lit("-----BEGIN PGP MESSAGE"),
254 mbw_lit("-----BEGIN PGP PUBLIC KEY BLOCK"),
255 mbw_lit("-----BEGIN PGP PRIVATE KEY BLOCK"),
256 mbw_lit("-----BEGIN PGP SIGNATURE"),
257 };
258 static int num_armor_start = sizeof(mbw_prefix(armor_start))/sizeof(mbw_t *);
259
260 static const mbw_t *mbw_prefix(armor_end)[] = {
261 mbw_lit("-----END PGP MESSAGE"),
262 mbw_lit("-----END PGP PUBLIC KEY BLOCK"),
263 mbw_lit("-----END PGP PRIVATE KEY BLOCK"),
264 mbw_lit("-----END PGP SIGNATURE"),
265 };
266 static int num_armor_end = sizeof(mbw_prefix(armor_end))/sizeof(mbw_t *);
267
268 /***********************************************************
269 * UTILITY FUNCTIONS *
270 ***********************************************************/
271
272 /* checks if the line is "binary", ie contains printable chars
273 but not too many extended ascii chars */
274 static
mbw_prefix(is_binline)275 bool_t mbw_prefix(is_binline)(const mbw_t *line) {
276 int numa = 0;
277 const mbw_t *p = line;
278 while( *p ) {
279 if( mbw_isspace(*p) ||
280 (mbw_isascii(*p) && !mbw_iscntrl(*p)) ) {
281 numa++;
282 } else if( !mbw_isprint(*p) ) {
283 return 1;
284 }
285 p++;
286 }
287 return (numa < (p - line)/2);
288 }
289
290 static
mbw_prefix(is_emptyspace)291 bool_t mbw_prefix(is_emptyspace)(const mbw_t *line) {
292 const mbw_t *p = line;
293 while( *p ) {
294 if( !mbw_isspace(*p) ) {
295 return 0;
296 }
297 p++;
298 }
299 return 1;
300 }
301
302
303 static
mbw_prefix(is_b64line)304 bool_t mbw_prefix(is_b64line)(const mbw_t *line) {
305 const mbw_t *p = line;
306 while( *p ) {
307 if( (mbw_prefix(b64_code)(*p) == -1) &&
308 !mbw_isspace(*p) ) {
309 return 0;
310 }
311 p++;
312 }
313 return 1;
314 }
315
316 static
mbw_prefix(is_uuline)317 int mbw_prefix(is_uuline)(const mbw_t *line) {
318 int count = 0;
319 const mbw_t *p = line;
320 int len = (int)(line[0] - mbw_lit(' '));
321 if( (len < 0) || (len > 63) ) {
322 return -1;
323 }
324 while(*p && (*p != mbw_lit('\r')) && (*p != mbw_lit('\n')) ) {
325 if( (*p > mbw_lit('`')) ||
326 (*p < mbw_lit(' ')) ) {
327 return -2;
328 } else {
329 count++;
330 }
331 p++;
332 }
333
334 return (abs(count - 4*(len/3)) <= 3);
335 }
336
337 /* detecting true yEnc lines is too hard, so we detect
338 nonprintable characters instead */
339 static
mbw_prefix(is_yencline)340 bool_t mbw_prefix(is_yencline)(const mbw_t *line) {
341 int nonprint = 0;
342 const mbw_t *p = line;
343 while( *p ) {
344 nonprint += !mbw_isprint(*p);
345 p++;
346 }
347 return (nonprint > 5);
348 }
349
350 /*
351 * this code generates mystrcasestr() and w_mystrcasestr()
352 * (similar to strstr, but case insensitive)
353 */
354 static __inline__
mbw_prefix(mystrcasestr)355 const mbw_t *mbw_prefix(mystrcasestr)(const mbw_t *haystack, const mbw_t *needle) {
356 const mbw_t *p, *q, *r;
357
358 for(p = haystack; *p; p++) {
359 q = needle; r = p;
360 while( *q && *r && ((mbw_tolower(*q) - mbw_tolower(*r)) == 0) ) {
361 q++; r++;
362 }
363 if( !*q ) {
364 return p;
365 }
366 }
367 return NULL;
368 }
369
370 static __inline__
mbw_prefix(mystrncasecmp)371 int mbw_prefix(mystrncasecmp)(const mbw_t *s1, const mbw_t *s2, size_t n) {
372 int s = -1;
373 if( s1 && s2 ) {
374 while(--n > 0) {
375 s = (mbw_tolower(*s1++) - mbw_tolower(*s2++));
376 if( (s != 0) || (s1 == mbw_lit('\0')) || (s2 == mbw_lit('\0')) ) {
377 break;
378 }
379 }
380 }
381 return s;
382 }
383
384 static __inline__
mbw_prefix(mystrncmp)385 int mbw_prefix(mystrncmp)(const mbw_t *s1, const mbw_t *s2, size_t n) {
386 int s = -1;
387 if( s1 && s2 ) {
388 while(--n > 0) {
389 s = (*s1++ - *s2++);
390 if( (s != 0) || (s1 == mbw_lit('\0')) || (s2 == mbw_lit('\0')) ) {
391 break;
392 }
393 }
394 }
395 return s;
396 }
397
398
399 /***********************************************************
400 * DECODING CACHE FUNCTIONS *
401 ***********************************************************/
402
403
404 static
mbw_prefix(init_dc)405 void mbw_prefix(init_dc)(mbw_prefix(decoding_cache) *dc, size_t len) {
406 if( !dc->cache ) {
407 dc->cache = (mbw_t *)malloc(len * sizeof(mbw_t));
408 dc->data_ptr = dc->cache;
409 dc->cache_len = dc->cache ? len : 0;
410 dc->max_line_len = len;
411 }
412 }
413
414
415 static
mbw_prefix(adjust_cache_size)416 void mbw_prefix(adjust_cache_size)(mbw_prefix(decoding_cache) *dc, size_t n) {
417 mbw_t *p;
418 size_t m = (dc->data_ptr - dc->cache);
419 while( dc->cache_len < m + n ) {
420 p = (mbw_t *)realloc(dc->cache, 2 * dc->cache_len * sizeof(mbw_t));
421 if( p ) {
422 dc->cache = p;
423 dc->data_ptr = p + m;
424 dc->cache_len *= 2;
425 } else {
426 break;
427 }
428 }
429 dc->max_line_len = (dc->max_line_len < n) ? n : dc->max_line_len;
430 }
431
432 static
mbw_prefix(flush_cache)433 bool_t mbw_prefix(flush_cache)(mbw_prefix(decoding_cache) *dc, mbw_t *line, bool_t all) {
434 mbw_t *q;
435 mbw_t *p;
436 int i;
437 if( dc->cache && (dc->data_ptr > dc->cache) ) {
438 /* never output more bytes than will fit on output_line */
439 p = (dc->data_ptr > dc->cache + dc->max_line_len) ?
440 (dc->cache + dc->max_line_len): dc->data_ptr;
441
442 if( !all ) {
443 /* we break the line at the last space, or ampersand, or seventy
444 chars (> b64/qp limit) from the end - there may well be
445 stretches longer than this, but we try to flush as much as
446 possible, so the limit should be small. Also, we don't want
447 to break entities if possible.
448 */
449 /* for(i = 25; !mbw_isspace(*p) && */
450 /* (p > dc->cache) && i; --p, --i); */
451 for(i = 70; !mbw_isspace(*p) &&
452 (*p != mbw_lit('&')) && (p > dc->cache) && i; --p, --i);
453 }
454
455 if( p > dc->cache ) {
456 for(q = dc->cache; q < p; q++) {
457 *line++ = *q;
458 }
459 *line = mbw_lit('\0');
460
461 dc->data_ptr = dc->cache;
462 if( !all ) {
463 /* now fold unused part back into cache. Note that
464 * b64_line_cache is always NUL terminated, so we don't
465 * need b64_cache_ptr to mark the end. */
466 while( *p ) {
467 *dc->data_ptr++ = *p++;
468 }
469 }
470 *dc->data_ptr = mbw_lit('\0');
471 return 1;
472 }
473 }
474 return 0;
475 }
476
477 /***********************************************************
478 * DECODING FUNCTIONS *
479 ***********************************************************/
480
481
482 #define REPNUL mbw_lit('\t')
483
484 /*
485 * this code generates b64_line_filter2() and w_b64_line_filter2()
486 * works ok so long as q <= line, or q >> line
487 * WARNING: it is assumed that the buffer at q can hold (at most) all of line
488 *
489 * The string which is written is always NUL terminated, but if NULs
490 * were decoded in the middle, those are replaced by tabs (we could
491 * also replace them with a more neutral char, but the cache flushing
492 * code breaks up lines on spaces, and we want to take advantage of that. See
493 * the REPNUL define)
494 */
mbw_prefix(b64_line_filter2)495 mbw_t *mbw_prefix(b64_line_filter2)(mbw_t *line, mbw_t *q) {
496 mbw_t *p = line;
497 mbw_t buf[4];
498 mbw_t *buf_start = buf;
499 mbw_t *buf_end = buf + 4;
500
501 if( q ) {
502 while( *p ) {
503 if( mbw_prefix(b64_code)(*p) > -1 ) {
504 *buf_start++ = *p;
505 if( buf_start == buf_end ) {
506 buf_start = buf;
507 *q = (mbw_prefix(b64_code)(buf[0])<<2) + (mbw_prefix(b64_code)(buf[1])>>4);
508 if( !*q ) { *q = REPNUL; }
509 q++;
510 if( buf[2] != mbw_lit('=') ) {
511 *q = (mbw_prefix(b64_code)(buf[1])<<4) + (mbw_prefix(b64_code)(buf[2])>>2);
512 if( !*q ) { *q = REPNUL; }
513 q++;
514 if( buf[3] != mbw_lit('=') ) {
515 *q = (mbw_prefix(b64_code)(buf[2])<<6) + mbw_prefix(b64_code)(buf[3]);
516 if( !*q ) { *q = REPNUL; }
517 q++;
518 } else {
519 break;
520 }
521 } else {
522 break;
523 }
524 }
525 }
526 p++;
527 }
528 *q = mbw_lit('\0');
529 }
530
531 return q;
532 }
533
534 /*
535 * this code generates b64_line_filter() and w_b64_line_filter()
536 * Decodes a base64 encoded line. The input line is overwritten.
537 *
538 * The b64 standard arbitrarily truncates lines to 57 characters, so
539 * here we place the chunks in a cache and only overwrite line when
540 * the cache is full. Unfortunately, malformed email messages may not
541 * follow the standard, so in practice all this means is that we get
542 * arbitrarily truncated input.
543 *
544 * Note that when we overwrite line with the cached data, we assume
545 * the line is big enough to hold all the cached data. This is guaranteed
546 * by registering the current line length with the cache.
547 */
mbw_prefix(b64_line_filter)548 bool_t mbw_prefix(b64_line_filter)(mbw_prefix(decoding_cache) *b64cache,
549 mbw_t *line) {
550 mbw_prefix(adjust_cache_size)(b64cache, mbw_strlen(line));
551 b64cache->data_ptr =
552 mbw_prefix(b64_line_filter2)(line, b64cache->data_ptr);
553 return mbw_prefix(flush_cache)(b64cache, line, 0);
554 }
555
556
557 /*
558 * this code generates qp_line_filter2() and w_qp_line_filter2()
559 * this works ok so long as q <= line, or q >> line
560 * WARNING: it is assumed that the buffer at q can hold (at most) all of line
561 */
mbw_prefix(qp_line_filter2)562 mbw_t *mbw_prefix(qp_line_filter2)(mbw_t *line, mbw_t *q) {
563 mbw_t *p = line;
564 if( q ) {
565 while( *p ) {
566 if( *p != mbw_lit('=') ) {
567 *q++ = *p++;
568 } else {
569 if( !*(++p) || mbw_isspace(*p) ) {
570 break;
571 } else {
572 /* if the equal sign isn't followed by */
573 /* an upper case hex number, something's wrong */
574 *q = mbw_prefix(qp_code)(*p);
575 if( ((signed char)*q < 0) || !p[1] || (mbw_prefix(qp_code)(p[1]) < 0) ) {
576 *q++ = p[-1];
577 } else {
578 *q = (*q << 4) + mbw_prefix(qp_code)(p[1]);
579 if( *q ) { q++; }
580 p += 2;
581 }
582 }
583 }
584 }
585 *q = mbw_lit('\0');
586 }
587 return q;
588 }
589
590 /*
591 * this code generates qp_line_filter() and w_qp_line_filter()
592 * Decodes a quoted-printable line. The input line is overwritten.
593 *
594 * The QP standard arbitrarily truncates lines to 76 characters, so
595 * here we place the chunks in a cache and only overwrite line when
596 * the cache is full. Unfortunately, malformed email messages may not
597 * follow the standard, so in practice all this means is that we get
598 * arbitrarily truncated input.
599 *
600 * Note that when we overwrite line with the cached data, we assume
601 * the line is big enough to hold all the cached data. This is guaranteed
602 * by registering the current line length with the cache.
603 */
mbw_prefix(qp_line_filter)604 bool_t mbw_prefix(qp_line_filter)(mbw_prefix(decoding_cache) *qpcache,
605 mbw_t *line) {
606 mbw_prefix(adjust_cache_size)(qpcache, mbw_strlen(line));
607 qpcache->data_ptr =
608 mbw_prefix(qp_line_filter2)(line, qpcache->data_ptr);
609 return mbw_prefix(flush_cache)(qpcache, line, 0);
610 }
611
612
613 /***********************************************************
614 * TOKENIZER FUNCTIONS *
615 ***********************************************************/
616 /* the following modules handle state transitions, you can mix and
617 * match them, or write new ones. The is_func() is called in the
618 * default state, and switches the internal state if necessary. If it
619 * can't recognize the current char, it should return gcUNDEF, not
620 * gcDISCARD, that way the next is_func() can look at the character.
621 * The handle_func() is similar to the is_func(), but is
622 * called when the state is not the default. It should return gcDISCARD
623 * and switch to the default state if it can't recognize the current char,
624 * otherwise it can switch states any way it wants. When it detects the
625 * end of the current token, it must switch back to the default state.
626 */
627
/* these are macros to save typing, modules below.
 *
 * In all of them c is a pointer to the current character. Every macro
 * parameter is parenthesized in the expansions, so that expressions
 * such as p+1 can be passed safely. Parameters may be evaluated more
 * than once, so arguments must not have side effects.
 */

/* character sets used by the modules below */
#define SET1(c) ( (*(c) == mbw_lit('\'')) || (*(c) == mbw_lit('-')) || (*(c) == mbw_lit('.')) )
/* #define SET1(c) ( (*(c) == mbw_lit('-')) || (*(c) == mbw_lit('+')) || (*(c) == mbw_lit('.')) || (*(c) == mbw_lit('_')) || (*(c) == mbw_lit(',')) || (*(c) >= 0xA0) ) */
/* #define SET1(c) ( (*(c) == mbw_lit('-')) || (*(c) == mbw_lit('+')) || (*(c) == mbw_lit('.')) || (*(c) == mbw_lit('_')) || (*(c) == mbw_lit(',')) || (*(c) == mbw_lit('$')) || (*(c) >= 0xA0) ) */
#define SET2(c) ( (*(c) == mbw_lit(',')) || (*(c) == mbw_lit('.')) )
#define SET3(c) ( (*(c) > mbw_lit(' ')) && (*(c) <= mbw_lit('~')) && (*(c) != mbw_lit('>')) )

/* bit patterns of UTF-8 bytes: IO matches a continuation byte
 * (10xxxxxx), InO matches the lead byte of an n byte sequence */
#define IO(c) ((*(c) & 0xC0) == 0x80)
#define I2O(c) ((*(c) & 0xE0) == 0xC0)
#define I3O(c) ((*(c) & 0xF0) == 0xE0)
#define I4O(c) ((*(c) & 0xF8) == 0xF0)
#define I5O(c) ((*(c) & 0xFC) == 0xF8)
#define I6O(c) ((*(c) & 0xFE) == 0xFC)

/* RANGE tests x <= *c <= y, DRANGE tests two such ranges.
 * NOTE(review): if mbw_t is a signed char, bytes >= 0x80 are negative
 * and cannot satisfy RANGE tests against bounds such as 0x81..0xFE -
 * confirm whether this is intended. */
#define RANGE(c,x,y) ((*(c) >= (x)) && (*(c) <= (y)))
#define DRANGE(c,x,y,u,v) (RANGE(c,x,y) || RANGE(c,u,v))

/* the xTEST macros evaluate their conditions in order, and only if all
 * of them hold do they switch char_filter_state to r (the value of the
 * whole expression is then nonzero, because no state is zero) */
#define DTEST(s,t,r) ((s) && (t) && (char_filter_state = (r)))
#define TTEST(s,t,u,r) ((s) && (t) && (u) && (char_filter_state = (r)))
#define QTEST(s,t,u,v,r) ((s) && (t) && (u) && (v) && (char_filter_state = (r)))
#define VTEST(s,t,u,v,w,r) ((s) && (t) && (u) && (v) && (w) && (char_filter_state = (r)))
#define STEST(s,t,u,v,w,x,r) ((s) && (t) && (u) && (v) && (w) && (x) && (char_filter_state = (r)))

/* multibyte character recognizers, one per encoding. Each switches
 * char_filter_state when a complete multibyte character starts at c. */
#define Shift_JIS(c) ( DTEST(DRANGE(c,0x81,0x9F,0xE0,0xFC),DRANGE((c)+1,0x40,0x7E,0x80,0xFC),fShift_JIS_1) )

#define EUC_Japanese(c) ( DTEST(RANGE(c,0xA1,0xFE),RANGE((c)+1,0xA1,0xFE),fEUC_Japanese_1) || DTEST((*(c) == 0x8E),RANGE((c)+1,0xA0,0xDF),fEUC_Japanese_1) || TTEST((*(c) == 0x8F),RANGE((c)+1,0xA1,0xFE),RANGE((c)+2,0xA1,0xFE),fEUC_Japanese_2) )

#define BIG5(c) ( DTEST(RANGE(c,0xA1,0xFE),DRANGE((c)+1,0x40,0x7E,0xA1,0xFE),fBIG5_1) )

#define BIG5P(c) ( DTEST(RANGE(c,0x81,0xFE),DRANGE((c)+1,0x40,0x7E,0x80,0xFE),fBIG5P_1) )

#define EUC_CN(c) ( DTEST(RANGE(c,0xA1,0xFE),RANGE((c)+1,0xA1,0xFE),fEUC_CN_1) )

#define EUC_TW(c) ( DTEST(RANGE(c,0xA1,0xFE),RANGE((c)+1,0xA1,0xFE),fEUC_TW_1) || QTEST((*(c) == 0x8E),RANGE((c)+1,0xA1,0xB0),RANGE((c)+2,0xA1,0xFE),RANGE((c)+3,0xA1,0xFE),fEUC_TW_3) )

#define Johab(c) ( DTEST(RANGE(c,0x84,0xD3),DRANGE((c)+1,0x41,0x7E,0x81,0xFE),fJohab_1) || DTEST(DRANGE(c,0xD8,0xDE,0xE0,0xF9),DRANGE((c)+1,0x31,0x7E,0x91,0xFE),fJohab_1) )

#define UTF8(c) ( DTEST(I2O(c),IO((c)+1),fUTF8_1) || TTEST(I3O(c),IO((c)+1),IO((c)+2),fUTF8_2) || QTEST(I4O(c),IO((c)+1),IO((c)+2),IO((c)+3),fUTF8_3) || VTEST(I5O(c),IO((c)+1),IO((c)+2),IO((c)+3),IO((c)+4),fUTF8_4) || STEST(I6O(c),IO((c)+1),IO((c)+2),IO((c)+3),IO((c)+4),IO((c)+5),fUTF8_5) )

#define ISO8859(c) ( RANGE(c,0xA1,0xFE) )
668
/* atom without slash: table indexed by character code, nonzero for the
 * characters accepted as part of an atom. Only the first 128 entries
 * are listed, the remaining ones are zero. Besides the slash, the
 * single quote is not accepted either (compare with the commented
 * row). */
static char rfc2822_atom[256] = {
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  0, '!', 0, '#', '$', '%', '&', 0, 0, 0, '*', '+', 0, '-', 0, 0,
  /* 0, '!', 0, '#', '$', '%', '&','\'', 0, 0, '*', '+', 0, '-', 0, '/', */
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0, 0, 0, '=', 0, '?',
  0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
  'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 0, 0, 0, '^', '_',
  '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 0,
};

/* In the macros below c is a pointer to the current character. The
 * parameter is parenthesized everywhere so that expressions can be
 * passed safely; it may be evaluated more than once. */

/* true if *c is an atom character; codes outside the table (including
 * negative ones, which become huge when cast) are rejected */
#define ATOM(c) (((unsigned int)(*(c)) < 255) && rfc2822_atom[(unsigned int)(*(c))])
/* an atom character, optionally preceded by a single dot or colon */
#define DOTTED_ATOM(c) ( (((c)[0] == mbw_lit('.')) && ATOM((c)+1)) || ATOM(c) )
#define COLON_ATOM(c) ( (((c)[0] == mbw_lit(':')) && ATOM((c)+1)) || ATOM(c) )
/* a digit, optionally preceded by a single dot or colon */
#define DOTTED_DIGITS(c) ( (((c)[0] == mbw_lit('.')) && mbw_isdigit((c)[1])) || mbw_isdigit((c)[0]) )
#define COLON_DIGITS(c) ( (((c)[0] == mbw_lit(':')) && mbw_isdigit((c)[1])) || mbw_isdigit((c)[0]) )
/* a letter, optionally preceded by a single dot */
#define DOTTED_ALPHA(c) ( (((c)[0] == mbw_lit('.')) && mbw_isalpha((c)[1])) || mbw_isalpha((c)[0]) )
689
/* warning: macros and modules work directly with this structure
 *
 * This is the state of the tokenizer. fDEF is the default state, in
 * which the is_func() modules are consulted; any other state belongs
 * to the module which recognized the token currently being read.
 * The numeric values start at 1 so that no state is zero.
 */
static enum {
  fDEF = 1,           /* default state */
  fANX,               /* (used by modules not shown here) */
  fNAX,               /* (used by modules not shown here) */
  fMUL,               /* multiple alpha tokens */
  fCUR,               /* currency */
  fADD,               /* embedded address */
  /* repeated characters */
  fSEP_1,
  fSEP_2,
  fSEP_3,
  /* UTF-8 sequence, the suffix counts the bytes which remain */
  fUTF8_1,
  fUTF8_2,
  fUTF8_3,
  fUTF8_4,
  fUTF8_5,
  /* asian multibyte characters, the suffix counts the bytes which remain */
  fShift_JIS_1,
  fEUC_Japanese_1,
  fEUC_Japanese_2,
  fBIG5_1,
  fBIG5P_1,
  fEUC_CN_1,
  fEUC_TW_1,
  fEUC_TW_2,
  fEUC_TW_3,
  fJohab_1,
  /* simple character classes */
  fALNUM,
  fALPHA,
  fNUMERIC,
  fSYMBOL,
  /* (used by modules not shown here) */
  fANSX_1,
  fANSX_2,
  fANSX_3,
  fCEF2_ATOM,
  fCEF2_DOTTED_ATOM,
  fCEF2_COLON_ATOM
} char_filter_state = fDEF;
714
715 /*
716 * asian character tokens
717 */
718
719 /* macro to be used in case statememt */
720 #define ASIAN_CASES fShift_JIS_1: case fBIG5_1: case fBIG5P_1: case fEUC_CN_1: case fJohab_1: case fEUC_TW_1: case fEUC_TW_2: case fEUC_TW_3: case fEUC_Japanese_1: case fEUC_Japanese_2
721
722 static __inline__
mbw_prefix(is_asian_case)723 good_char_t mbw_prefix(is_asian_case)(const mbw_t *c) {
724 return (Shift_JIS(c) || EUC_Japanese(c) ||
725 BIG5(c) || BIG5P(c) || EUC_CN(c) || EUC_TW(c) || Johab(c)) ? gcTOKEN : gcUNDEF;
726 }
727
728 static __inline__
mbw_prefix(handle_asian_case)729 good_char_t mbw_prefix(handle_asian_case)(const mbw_t *c) {
730 switch(char_filter_state) {
731 case fShift_JIS_1:
732 case fBIG5_1:
733 case fBIG5P_1:
734 case fEUC_CN_1:
735 case fJohab_1:
736 char_filter_state = fDEF;
737 return gcTOKEN_END;
738 case fEUC_TW_1:
739 char_filter_state = fDEF;
740 return gcTOKEN_END;
741 case fEUC_TW_2:
742 char_filter_state = fEUC_TW_1;
743 return gcTOKEN;
744 case fEUC_TW_3:
745 char_filter_state = fEUC_TW_2;
746 return gcTOKEN;
747 case fEUC_Japanese_1:
748 char_filter_state = fDEF;
749 return gcTOKEN_END;
750 case fEUC_Japanese_2:
751 char_filter_state = fEUC_Japanese_1;
752 return gcTOKEN;
753 default:
754 break;
755 }
756 char_filter_state = fDEF;
757 return gcDISCARD;
758 }
759
760 /*
761 * utf8 character tokens - only makes sense if parsing multibyte strings
762 */
763
764 /* macro to be used in case statememt */
765 #define UNICODE_CASES fUTF8_1: case fUTF8_2: case fUTF8_3: case fUTF8_4: case fUTF8_5
766
767 static __inline__
mbw_prefix(is_unicode_case)768 good_char_t mbw_prefix(is_unicode_case)(const mbw_t *c) {
769 return (UTF8(c)) ? gcTOKEN : gcUNDEF;
770 }
771
772 static __inline__
mbw_prefix(handle_unicode_case)773 good_char_t mbw_prefix(handle_unicode_case)(const mbw_t *c) {
774 switch(char_filter_state) {
775 case fUTF8_1:
776 char_filter_state = fDEF;
777 return gcTOKEN_END;
778 case fUTF8_2:
779 char_filter_state = fUTF8_1;
780 return gcTOKEN;
781 case fUTF8_3:
782 char_filter_state = fUTF8_2;
783 return gcTOKEN;
784 case fUTF8_4:
785 char_filter_state = fUTF8_3;
786 return gcTOKEN;
787 case fUTF8_5:
788 char_filter_state = fUTF8_4;
789 return gcTOKEN;
790 default:
791 break;
792 }
793 char_filter_state = fDEF;
794 return gcDISCARD;
795 }
796
797 /*
798 * alpha character tokens
799 */
800
801 #define ALPHA_CASES fALPHA
802
803 /* checks for alpha and switches to alphabetic state, or
804 returns gcUNDEF if unrecognized */
805 static __inline__
mbw_prefix(is_alpha_case)806 good_char_t mbw_prefix(is_alpha_case)(const mbw_t *c) {
807 if( mbw_isalpha(*c++) ) {
808 if( mbw_isalpha(*c) || (*c == mbw_lit('\0')) ) {
809 char_filter_state = fALPHA;
810 return gcTOKEN;
811 }
812 return gcTOKEN_END;
813 }
814 return gcUNDEF;
815 }
816
817 /* checks for alpha or discards, may switch back to default state */
818 static __inline__
mbw_prefix(handle_alpha_case)819 good_char_t mbw_prefix(handle_alpha_case)(const mbw_t *c) {
820 if( mbw_isalpha(*c++) ) {
821 if( mbw_isalpha(*c) || (*c == mbw_lit('\0')) ) {
822 return gcTOKEN;
823 }
824 char_filter_state = fDEF;
825 return gcTOKEN_END;
826 }
827 char_filter_state = fDEF;
828 return gcDISCARD;
829 }
830
831 /*
832 * alphanumeric character tokens
833 */
834
835 #define ALNUM_CASES fALNUM
836
837 /* checks for alnum and switches to alphanumeric state, or
838 returns gcUNDEF if unrecognized */
839 static __inline__
mbw_prefix(is_alnum_case)840 good_char_t mbw_prefix(is_alnum_case)(const mbw_t *c) {
841 if( mbw_isalnum(*c++) ) {
842 if( mbw_isalnum(*c) || (*c == mbw_lit('\0')) ) {
843 char_filter_state = fALNUM;
844 return gcTOKEN;
845 } else {
846 return gcTOKEN_END;
847 }
848 }
849 return gcUNDEF;
850 }
851
852 /* checks for alnum or discards, may switch back to default state */
853 static __inline__
mbw_prefix(handle_alnum_case)854 good_char_t mbw_prefix(handle_alnum_case)(const mbw_t *c) {
855 if( mbw_isalnum(*c) ) {
856 if( mbw_isalnum(*(++c)) || (*c == mbw_lit('\0')) ) {
857 return gcTOKEN;
858 }
859 char_filter_state = fDEF;
860 return gcTOKEN_END;
861 }
862 char_filter_state = fDEF;
863 return gcDISCARD;
864 }
865
866 /*
867 * numeric tokens
868 */
869
870 #define NUMERIC_CASES fNUMERIC
871
872 /* checks for digit and switches to numeric state, or
873 returns gcUNDEF if unrecognized */
874 static __inline__
mbw_prefix(is_numeric_case)875 good_char_t mbw_prefix(is_numeric_case)(const mbw_t *c) {
876 if( mbw_isdigit(*c++) ) {
877 if( mbw_isdigit(*c) || (*c == mbw_lit('\0')) ) {
878 char_filter_state = fNUMERIC;
879 return gcTOKEN;
880 } else {
881 return gcTOKEN_END;
882 }
883 }
884 return gcUNDEF;
885 }
886
887 /* checks for numeric or discards, may switch back to default state */
888 static __inline__
mbw_prefix(handle_numeric_case)889 good_char_t mbw_prefix(handle_numeric_case)(const mbw_t *c) {
890 if( mbw_isdigit(*c++) ) {
891 if( mbw_isdigit(*c) || (*c == mbw_lit('\0')) ) {
892 return gcTOKEN;
893 }
894 char_filter_state = fDEF;
895 return gcTOKEN_END;
896 }
897 char_filter_state = fDEF;
898 return gcDISCARD;
899 }
900
901 /*
902 * punctuation tokens
903 */
904
905 #define SYMBOLIC_CASES fSYMBOL
906
907 static __inline__
mbw_prefix(is_symbolic_case)908 good_char_t mbw_prefix(is_symbolic_case)(const mbw_t *c) {
909 if( mbw_ispunct(*c++) ) {
910 if( mbw_ispunct(*c) || (*c == mbw_lit('\0')) ) {
911 char_filter_state = fSYMBOL;
912 return gcTOKEN;
913 } else {
914 return gcTOKEN_END;
915 }
916 }
917 return gcUNDEF;
918 }
919
920 static __inline__
mbw_prefix(handle_symbolic_case)921 good_char_t mbw_prefix(handle_symbolic_case)(const mbw_t *c) {
922 if( mbw_ispunct(*c++) ) {
923 if( mbw_ispunct(*c) || (*c == mbw_lit('\0')) ) {
924 if( (c[-1] == c[0]) && (c[0] == c[1]) ) {
925 return gcIGNORE;
926 }
927 return gcTOKEN;
928 }
929 char_filter_state = fDEF;
930 return gcTOKEN_END;
931 }
932 char_filter_state = fDEF;
933 return gcDISCARD;
934 }
935
936 /*
937 * repeated character tokens, squeezed down to 3.
938 */
939
940 #define REPEAT_CASES fSEP_1: case fSEP_2: case fSEP_3
941
942 /* checks for repeated char, replaces with 3 copies only, or
943 returns gcUNDEF if unrecognized */
944 static __inline__
mbw_prefix(is_repeat_case)945 good_char_t mbw_prefix(is_repeat_case)(const mbw_t *c) {
946 if( (c[1] == c[0]) ) {
947 if( (c[2] == c[0]) ) {
948 char_filter_state = fSEP_3;
949 return gcTOKEN;
950 } else {
951 char_filter_state = fSEP_2;
952 return gcTOKEN;
953 }
954 }
955 return gcUNDEF;
956 }
957
958 /* checks for repeated char, replaces with 3 copies only */
959 static __inline__
mbw_prefix(handle_repeat_case)960 good_char_t mbw_prefix(handle_repeat_case)(const mbw_t *c) {
961 switch(char_filter_state) {
962 case fSEP_1:
963 if( c[0] != c[1] ) {
964 char_filter_state = fDEF;
965 }
966 return gcDISCARD;
967 case fSEP_2:
968 char_filter_state = fSEP_1;
969 return gcTOKEN;
970 case fSEP_3:
971 char_filter_state = fSEP_2;
972 return gcTOKEN;
973 default:
974 break;
975 }
976 char_filter_state = fDEF;
977 return gcDISCARD;
978 }
979
980 /*
981 * currency tokens, very simple and naive, not localized
982 */
983
984 #define CURRENCY_CASES fCUR
985
986 /* checks for currency, or
987 returns gcUNDEF if unrecognized */
988 static __inline__
mbw_prefix(is_currency_case)989 good_char_t mbw_prefix(is_currency_case)(const mbw_t *c) {
990 /* this should be done properly (locale) sometime ... */
991 if( (*c == mbw_lit('$')) || (*c == mbw_lit('\xa3')) ) {
992 if( mbw_isdigit(c[1]) &&
993 (!mbw_isdigit(c[2]) || !mbw_isdigit(c[3]) || !mbw_isdigit(c[4])) ) {
994 char_filter_state = fCUR;
995 return gcTOKEN;
996 }
997 }
998 return gcUNDEF;
999 }
1000
1001 /* checks for currency */
1002 static __inline__
mbw_prefix(handle_currency_case)1003 good_char_t mbw_prefix(handle_currency_case)(const mbw_t *c) {
1004 if( mbw_isdigit(c[1]) ) {
1005 return gcTOKEN;
1006 } else if( SET2(c+1) && mbw_isdigit(c[2]) ) {
1007 char_filter_state = fCUR;
1008 return gcTOKEN;
1009 }
1010 char_filter_state = fDEF;
1011 return gcTOKEN_END;
1012 }
1013
1014 /*
1015 * internet embedded address
1016 */
1017
1018 #define ADDRESS_CASES fADD
1019
1020 static __inline__
mbw_prefix(is_address_case)1021 good_char_t mbw_prefix(is_address_case)(const mbw_t *c) {
1022 if( *c == mbw_lit('<') ) {
1023 for(c++; SET3(c); c++);
1024 if( *c == mbw_lit('>') ) {
1025 char_filter_state = fADD;
1026 }
1027 }
1028 return gcUNDEF;
1029 }
1030
1031 static __inline__
mbw_prefix(handle_address_case)1032 good_char_t mbw_prefix(handle_address_case)(const mbw_t *c) {
1033 switch(*c) {
1034 case mbw_lit('@'):
1035 /* case mbw_lit('#'): */
1036 /* case mbw_lit('?'): */
1037 /* case mbw_lit('&'): */
1038 /* case mbw_lit(':'): */
1039 /* case mbw_lit('/'): */
1040 return gcDISCARD;
1041 case mbw_lit('>'):
1042 char_filter_state = fDEF;
1043 return gcDISCARD;
1044 default:
1045 break;
1046 }
1047 return gcTOKEN;
1048 }
1049
1050 /*
1051 * multiple alpha tokens separated by punctuation
1052 */
1053
1054 #define MULTI_ALPHA_CASES fMUL
1055
1056 /* checks for alpha and switches to alphabetic state, or
1057 returns gcUNDEF if unrecognized */
1058 static __inline__
mbw_prefix(is_multi_alpha_case)1059 good_char_t mbw_prefix(is_multi_alpha_case)(const mbw_t *c) {
1060 /* don't increment c in SET1 */
1061 if( mbw_isalpha(*c++) && SET1(c) && mbw_isalpha(*(++c)) ) {
1062 char_filter_state = fMUL;
1063 return gcTOKEN;
1064 }
1065 return gcUNDEF;
1066 }
1067
1068 /* checks for alpha or discards, may switch back to default state */
1069 static __inline__
mbw_prefix(handle_multi_alpha_case)1070 good_char_t mbw_prefix(handle_multi_alpha_case)(const mbw_t *c) {
1071 if( mbw_isalpha(c[1]) ) {
1072 return gcTOKEN;
1073 } else if( SET1(c+1) && mbw_isalpha(c[2]) ) {
1074 return gcTOKEN;
1075 }
1076 char_filter_state = fDEF;
1077 return gcTOKEN_END;
1078 }
1079
1080 /*
1081 * xxx123 identifiers
1082 */
1083
1084 #define ALPHA_NUMBER_CASES fANX
1085
1086 static __inline__
mbw_prefix(is_alpha_number_case)1087 good_char_t mbw_prefix(is_alpha_number_case)(const mbw_t *c) {
1088 if( mbw_isalpha(*c++) && mbw_isdigit(*c) ) {
1089 char_filter_state = fANX;
1090 return gcTOKEN;
1091 }
1092 return gcUNDEF;
1093 }
1094
1095 static __inline__
mbw_prefix(handle_alpha_number_case)1096 good_char_t mbw_prefix(handle_alpha_number_case)(const mbw_t *c) {
1097 if( mbw_isdigit(*c++) ) {
1098 if( mbw_isdigit(*c) ) {
1099 return gcTOKEN;
1100 }
1101 char_filter_state = fDEF;
1102 return gcTOKEN_END;
1103 }
1104 char_filter_state = fDEF;
1105 return gcDISCARD;
1106 }
1107
1108 /*
1109 * 123xxx identifiers
1110 */
1111
1112 #define NUMBER_ALPHA_CASES fNAX
1113
1114 static __inline__
mbw_prefix(is_number_alpha_case)1115 good_char_t mbw_prefix(is_number_alpha_case)(const mbw_t *c) {
1116 if( mbw_isdigit(*c++) && mbw_isalpha(*c) ) {
1117 char_filter_state = fNAX;
1118 return gcTOKEN;
1119 }
1120 return gcUNDEF;
1121 }
1122
1123 static __inline__
mbw_prefix(handle_number_alpha_case)1124 good_char_t mbw_prefix(handle_number_alpha_case)(const mbw_t *c) {
1125 if( mbw_isalpha(*c++) ) {
1126 if( mbw_isalpha(*c) ) {
1127 return gcTOKEN;
1128 }
1129 char_filter_state = fDEF;
1130 return gcTOKEN_END;
1131 }
1132 char_filter_state = fDEF;
1133 return gcDISCARD;
1134 }
1135
1136 /*
1137 * xxx123% identifiers
1138 */
1139
1140 #define ALPHA_NUMBER_SYMBOL_CASES fANSX_1: case fANSX_2: case fANSX_3
1141
1142 static __inline__
mbw_prefix(is_alpha_number_symbol_case)1143 good_char_t mbw_prefix(is_alpha_number_symbol_case)(const mbw_t *c) {
1144 if( mbw_isalpha(*c++) ) {
1145 if( mbw_isdigit(*c) ) {
1146 char_filter_state = fANSX_2;
1147 return gcTOKEN;
1148 } else if( mbw_ispunct(*c) ) {
1149 char_filter_state = fANSX_1;
1150 return gcTOKEN;
1151 } else if( !mbw_isalpha(*c) || (*c == mbw_lit('\0')) ) {
1152 return gcTOKEN_END;
1153 }
1154 char_filter_state = fANSX_3;
1155 return gcTOKEN;
1156 }
1157 return gcUNDEF;
1158 }
1159
1160 static __inline__
mbw_prefix(is_number_symbol_case)1161 good_char_t mbw_prefix(is_number_symbol_case)(const mbw_t *c) {
1162 if( mbw_isdigit(*c++) ) {
1163 if( mbw_ispunct(*c) ) {
1164 char_filter_state = fANSX_1;
1165 return gcTOKEN;
1166 } else if( !mbw_isdigit(*c) || (*c == mbw_lit('\0')) ) {
1167 return gcTOKEN_END;
1168 }
1169 char_filter_state = fANSX_2;
1170 return gcTOKEN;
1171 }
1172 return gcUNDEF;
1173 }
1174
1175 static __inline__
mbw_prefix(handle_alpha_number_symbol_case)1176 good_char_t mbw_prefix(handle_alpha_number_symbol_case)(const mbw_t *c) {
1177 if( *(c++) == mbw_lit('\0') ) {
1178 return gcTOKEN;
1179 }
1180 switch(char_filter_state) {
1181 case fANSX_1:
1182 if( !mbw_ispunct(*c) ) {
1183 char_filter_state = fDEF;
1184 return gcTOKEN_END;
1185 } else if( (c[-1] == c[0]) && (c[0] == c[1]) ) {
1186 return gcIGNORE;
1187 }
1188 return gcTOKEN;
1189 case fANSX_2:
1190 if( mbw_ispunct(*c) ) {
1191 char_filter_state = fANSX_1;
1192 return gcTOKEN;
1193 } else if( mbw_isalpha(*c) || ISO8859(c) ) {
1194 char_filter_state = fANSX_3;
1195 return gcTOKEN;
1196 } else if( !mbw_isdigit(*c) ) {
1197 char_filter_state = fDEF;
1198 return gcTOKEN_END;
1199 }
1200 return gcTOKEN;
1201 case fANSX_3:
1202 if( mbw_isdigit(*c) ) {
1203 char_filter_state = fANSX_2;
1204 return gcTOKEN;
1205 } else if( mbw_ispunct(*c) ) {
1206 char_filter_state = fANSX_1;
1207 return gcTOKEN;
1208 } else if( !mbw_isalpha(*c) && !ISO8859(c) ) {
1209 char_filter_state = fDEF;
1210 return gcTOKEN_END;
1211 } else if( (c[-1] == c[0]) && (c[0] == c[1]) ) {
1212 return gcIGNORE;
1213 }
1214 return gcTOKEN;
1215 default:
1216 /* ignore */
1217 break;
1218 }
1219 char_filter_state = fDEF;
1220 return gcDISCARD;
1221 }
1222
1223
/*
 * This is the CEF (common encoding formats) tokenizer.
 * It was the first attempt at a specialized email tokenizer.
 *
 * Classifies the character at c according to char_filter_state.
 * In the default state (fDEF) the is_*_case() recognizers are tried
 * in turn; each one may switch char_filter_state so that following
 * characters are routed to the matching handle_*_case() function.
 * In every other state the call is forwarded to the handler which
 * owns that state. An unknown state resets to fDEF and discards.
 *
 * The "(retval = ...) || ..." chains stop at the first recognizer
 * whose result converts to true; this presumably relies on gcUNDEF
 * being zero -- confirm against the definition of good_char_t.
 */

static __inline__
good_char_t mbw_prefix(is_cef_char)(const mbw_t *c) {
  good_char_t retval;
  switch(char_filter_state) {
  case fDEF:

#if defined MBW_MB
    /* this doesn't make sense for wide characters */
    /* bytes with the high bit set: try the unicode and asian
       recognizers first; unlike the ADP and CEF2 tokenizers, the
       recognizer's own return value is not passed on here */
    if(*c & 0x80) {
      if( mbw_prefix(is_unicode_case)(c) ||
	  mbw_prefix(is_asian_case)(c) ) {
	return gcTOKEN;
      } else if( *c < 0xa0 ) {
	/* NOTE(review): if mbw_t is a signed char, *c is negative
	   here and this comparison always holds -- confirm */
	return gcDISCARD;
      }
    }
#endif

    if( mbw_isalpha(*c) ) {
      /* an alphabetic character is always a token character, the
	 recognizers only decide which state follows */
      if( (retval = mbw_prefix(is_alpha_number_case)(c)) ||
	  (retval = mbw_prefix(is_multi_alpha_case)(c)) ) {
	return retval;
      }
      return gcTOKEN;
    } else if( mbw_ispunct(*c) ) {
      /* is_address_case() always returns gcUNDEF, so a '<' falls
	 through to gcDISCARD even when it switches the state */
      if( (retval = mbw_prefix(is_repeat_case)(c)) ||
	  (retval = mbw_prefix(is_currency_case)(c)) ||
	  (retval = mbw_prefix(is_address_case)(c)) ) {
	return retval;
      }
    } else if( mbw_isdigit(*c) ) {
      if( (retval = mbw_prefix(is_number_alpha_case)(c)) ) {
	return retval;
      }
    }
    return gcDISCARD;
  case ALPHA_CASES:
    retval = mbw_prefix(handle_alpha_case)(c);
    /* when the alpha handler wants to end the token, give the
       multi-alpha and alpha-number recognizers a chance to carry on */
    if( retval == gcTOKEN_END ) {
      if( (retval = mbw_prefix(is_multi_alpha_case)(c)) ||
	  (retval = mbw_prefix(is_alpha_number_case)(c)) ) {
	return retval;
      }
      /* NOTE(review): if both recognizers decline, retval now holds
	 the second recognizer's result rather than gcTOKEN_END --
	 confirm this is intended */
    }
    return retval;
  case ALPHA_NUMBER_CASES:
    return mbw_prefix(handle_alpha_number_case)(c);
  case NUMBER_ALPHA_CASES:
    return mbw_prefix(handle_number_alpha_case)(c);
  case MULTI_ALPHA_CASES:
    return mbw_prefix(handle_multi_alpha_case)(c);
  case ADDRESS_CASES:
    return mbw_prefix(handle_address_case)(c);
  case CURRENCY_CASES:
    return mbw_prefix(handle_currency_case)(c);
  case REPEAT_CASES:
    return mbw_prefix(handle_repeat_case)(c);
  case ASIAN_CASES:
    return mbw_prefix(handle_asian_case)(c);
  case UNICODE_CASES:
    return mbw_prefix(handle_unicode_case)(c);
  default:
    /* nothing */
    break;
  }
  char_filter_state = fDEF;
  return gcDISCARD; /* otherwise compiler complains */
}
1297
1298
/*
 * This is the ADP (alpha digit punctuation) tokenizer.
 * It was the second attempt at a specialized email tokenizer.
 *
 * Classifies the character at c according to char_filter_state.
 * In the default state (fDEF) the recognizers are tried in turn and
 * the first result that converts to true is returned (presumably
 * gcUNDEF is zero -- confirm against good_char_t). In any other known
 * state the call is forwarded to the handler owning that state; an
 * unknown state resets to fDEF and discards the character.
 */

static __inline__
good_char_t mbw_prefix(is_adp_char)(const mbw_t *c) {
  good_char_t retval;
  switch(char_filter_state) {
  case fDEF:
#if defined MBW_MB
    /* this doesn't make sense for wide characters */
    /* bytes with the high bit set: unicode/asian recognizers first */
    if(*c & 0x80) {
      if( (retval = mbw_prefix(is_unicode_case)(c)) ||
	  (retval = mbw_prefix(is_asian_case)(c)) ) {
	return retval;
      } else if( *c < 0xa0 ) {
	/* NOTE(review): if mbw_t is a signed char, *c is negative
	   here and this comparison always holds -- confirm */
	return gcDISCARD;
      }
    }
#endif
    /* order matters: alphabetic start, then digit start, then
       symbolic start */
    if( (retval = mbw_prefix(is_alpha_number_symbol_case)(c)) ||
	(retval = mbw_prefix(is_number_symbol_case)(c)) ||
	(retval = mbw_prefix(is_symbolic_case)(c)) ) {
      return retval;
    }
    return gcDISCARD;

  case ALPHA_NUMBER_SYMBOL_CASES:
    return mbw_prefix(handle_alpha_number_symbol_case)(c);
  case SYMBOLIC_CASES:
    return mbw_prefix(handle_symbolic_case)(c);
  case ASIAN_CASES:
    return mbw_prefix(handle_asian_case)(c);
  case UNICODE_CASES:
    return mbw_prefix(handle_unicode_case)(c);
  default:
    /* nothing */
    break;
  }
  char_filter_state = fDEF;
  return gcDISCARD;
}
1342
1343 /*
1344 * This is the CEF2 (common email format v2) tokenizer.
1345 */
1346
1347 static __inline__
mbw_prefix(is_cef2_special_case)1348 good_char_t mbw_prefix(is_cef2_special_case)(const mbw_t *c) {
1349 return gcUNDEF;
1350 }
1351
1352 static __inline__
mbw_prefix(is_cef2_atom_case)1353 good_char_t mbw_prefix(is_cef2_atom_case)(const mbw_t *c) {
1354 if( ATOM(c) ) {
1355 char_filter_state = fCEF2_ATOM;
1356 return gcTOKEN;
1357 }
1358 return gcUNDEF;
1359 }
1360
1361 static __inline__
mbw_prefix(handle_cef2_atom_case)1362 good_char_t mbw_prefix(handle_cef2_atom_case)(const mbw_t *c) {
1363 if( ATOM(c) ) {
1364 return gcTOKEN;
1365 } else if( DOTTED_ATOM(c) ) {
1366 char_filter_state = fCEF2_DOTTED_ATOM;
1367 return gcTOKEN;
1368 } else if( COLON_ATOM(c) ) {
1369 char_filter_state = fCEF2_COLON_ATOM;
1370 return gcTOKEN;
1371 }
1372 char_filter_state = fDEF;
1373 return gcDISCARD;
1374 }
1375
1376 static __inline__
mbw_prefix(handle_cef2_dotted_atom_case)1377 good_char_t mbw_prefix(handle_cef2_dotted_atom_case)(const mbw_t *c) {
1378 if( DOTTED_ATOM(c) ) {
1379 return gcTOKEN;
1380 }
1381 char_filter_state = fDEF;
1382 return gcDISCARD;
1383 }
1384
1385 static __inline__
mbw_prefix(handle_cef2_colon_atom_case)1386 good_char_t mbw_prefix(handle_cef2_colon_atom_case)(const mbw_t *c) {
1387 if( COLON_ATOM(c) ) {
1388 return gcTOKEN;
1389 }
1390 char_filter_state = fDEF;
1391 return gcDISCARD;
1392 }
1393
/* Top level dispatcher of the CEF2 tokenizer.
 * In the default state (fDEF) the recognizers are tried in turn and
 * the first result that converts to true is returned (presumably
 * gcUNDEF is zero -- confirm against good_char_t). In one of the
 * CEF2, asian or unicode states the call is forwarded to the handler
 * owning that state; an unknown state resets to fDEF and discards the
 * character.
 */
static __inline__
good_char_t mbw_prefix(is_cef2_char)(const mbw_t *c) {
  good_char_t retval;
  switch(char_filter_state) {
  case fDEF:
#if defined MBW_MB
    /* this doesn't make sense for wide characters */
    /* bytes with the high bit set: unicode/asian recognizers first */
    if(*c & 0x80) {
      if( (retval = mbw_prefix(is_unicode_case)(c)) ||
	  (retval = mbw_prefix(is_asian_case)(c)) ) {
	return retval;
      } else if( *c < 0xa0 ) {
	/* NOTE(review): if mbw_t is a signed char, *c is negative
	   here and this comparison always holds -- confirm */
	return gcDISCARD;
      }
    }
#endif
    /* is_cef2_special_case() currently always returns gcUNDEF */
    if( (retval = mbw_prefix(is_cef2_special_case)(c)) ||
	(retval = mbw_prefix(is_cef2_atom_case)(c)) ) {
      return retval;
    }
    return gcDISCARD;
  case fCEF2_ATOM:
    return mbw_prefix(handle_cef2_atom_case)(c);
  case fCEF2_DOTTED_ATOM:
    return mbw_prefix(handle_cef2_dotted_atom_case)(c);
  case fCEF2_COLON_ATOM:
    return mbw_prefix(handle_cef2_colon_atom_case)(c);
  case ASIAN_CASES:
    return mbw_prefix(handle_asian_case)(c);
  case UNICODE_CASES:
    return mbw_prefix(handle_unicode_case)(c);
  default:
    /* nothing */
    break;
  }
  char_filter_state = fDEF;
  return gcDISCARD;
}
1432
1433
1434
1435 static __inline__
mbw_prefix(is_char_char)1436 good_char_t mbw_prefix(is_char_char)(const mbw_t *c) {
1437 /* return (mbw_isgraph(*c) ? gcTOKEN_END : */
1438 /* (mbw_isspace(*c) ? (mbw_isspace(c[1]) ? gcDISCARD : gcTOKEN_END) : */
1439 /* gcDISCARD)); */
1440 return mbw_isgraph(*c) ? gcTOKEN_END : gcDISCARD;
1441 }
1442
1443 /*
1444 * this code generates good_char() and w_good_char()
1445 * returns true if the character is part of a token
1446 *
1447 * gcTOKEN: character should be part of a token
1448 * gcTOKEN_END: like gcTOKEN, but token must end immediately
1449 * gcDISCARD: character is not part of a token
1450 * gcIGNORE: pretend there is no character here
1451 *
1452 * gcDISCARD is also returned if the line is empty
1453 */
mbw_prefix(good_char)1454 good_char_t mbw_prefix(good_char)(mbw_t *c) {
1455 if( c && (*c != mbw_lit('\0')) ) {
1456 if( !(m_options & (1<<M_OPTION_CASEN)) ) {
1457 *c = mbw_tolower(*c);
1458 }
1459 switch(m_cp) {
1460 case CP_ADP:
1461 return mbw_prefix(is_adp_char)(c);
1462 case CP_CEF2:
1463 return mbw_prefix(is_cef2_char)(c);
1464 case CP_CHAR:
1465 return mbw_prefix(is_char_char)(c);
1466 case CP_ALPHA:
1467 return mbw_isalpha(*c) ? gcTOKEN : gcDISCARD;
1468 case CP_CEF:
1469 return mbw_prefix(is_cef_char)(c);
1470 case CP_ALNUM:
1471 return mbw_isalnum(*c) ? gcTOKEN : gcDISCARD;
1472 case CP_GRAPH:
1473 return mbw_isgraph(*c) ? gcTOKEN : gcDISCARD;
1474 case CP_DEFAULT:
1475 break;
1476 }
1477 }
1478 return gcDISCARD;
1479 }
1480
/*
 * The regex tokenizer operates on single lines only, ie regexes cannot
 * straddle lines. This makes the code much simpler.
 *
 * p:        the line to scan (NUL terminated)
 * i:        index of the regular expression in re[]
 * word_fun: called once per match with the assembled token, its type
 *           and the regex number i + 1
 * get_tt:   maps the number of transcribed submatches to a token type
 *
 * For every match, the selected submatches are concatenated into a
 * single token of the form DIAMOND sub1 DIAMOND sub2 DIAMOND ...,
 * followed by CLASSEP and a class character.
 */
void mbw_prefix(regex_tokenizer)(mbw_t *p, int i,
				 void (*word_fun)(char *, token_type_t, regex_count_t),
				 token_type_t (*get_tt)(token_order_t)) {
  char *q, *cq;
  charbuf_len_t k,l, j;
  int eflag = 0;
  token_type_t tt;
  token_order_t z, order;
  /* room for MAX_SUBMATCH truncated submatches plus delimiters */
  char tok[(MAX_TOKEN_LEN+1)*MAX_SUBMATCH+EXTRA_TOKEN_LEN];
  regmatch_t pmatch[MAX_SUBMATCH];

  k = 0; /* offset in p where the next search starts */
  l = mbw_strlen(p);
  /* see if a match */
  while( (k < l) && (mbw_regexec(&re[i].regex, p + k, 
				 MAX_SUBMATCH, pmatch, eflag) == 0) ) {
    /* all the submatches (delimited by brackets in the regex)
       get concatenated and the result gets word_fun'd */
    q = tok;
    *q++ = DIAMOND;
    /* pmatch[0] is the whole match, so submatches start at z = 1;
       the scan stops at the first submatch slot with rm_so == -1 */
    for(order = 0, z = 1; 
	(z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) {
      /* only submatches selected in the re[i].submatches bitmask
	 are transcribed and counted in order */
      if( !(re[i].submatches & (1<<z)) ) 
	{ continue; } else { order++; }
      /* transcribe the submatch into tok */
      /* at most MAX_TOKEN_LEN characters per submatch; offsets in
	 pmatch are relative to p + k. mbw_copychar presumably
	 appends at q and advances it -- confirm in mbw.h */
      for(j = pmatch[z].rm_so; 
	  ((j < (charbuf_len_t)pmatch[z].rm_eo) && 
	   (j < (charbuf_len_t)pmatch[z].rm_so + MAX_TOKEN_LEN)); j++) {
	if( m_options & (1<<M_OPTION_CASEN) ) {
	  mbw_copychar(q,p[k + j]);
	} else {
	  mbw_copychar(q,mbw_tolower(p[k+j]));
	}
      }
      *q++ = DIAMOND;
    }

    tt = (*get_tt)(order);

    /* append the class suffix and terminate the token */
    cq = q;
    *cq++ = CLASSEP;
    *cq++ = (char)(AMIN + tt.cls);
    *cq = '\0';

    /* now let each category process the token */
    (*word_fun)(tok, tt, i + 1); /* +1 because i = 0 means INVALID_RE */

    /* restart one character after the beginning of this match, so
       overlapping matches are found as well */
    k += pmatch[0].rm_so + 1; /* advance string and repeat */
    eflag = REG_NOTBOL; /* p + k is no longer the start of the line */
  }
}
1536
1537 /*
1538 * The standard tokenizer converts each acceptable token into a char
1539 * string, and passes it to word_fun(). To construct a token, the
1540 * good_char() function is called on each successive input character,
1541 * obtaining a code as follows:
1542 *
1543 * gcTOKEN: The character belongs to an acceptable token, and is
1544 * copied to the holding buffer at the position pointed by nq, unless
1545 * the token length would exceed MAX_TOKEN_LEN, in which case we
1546 * pretend we have a gcTOKEN_END code.
1547 *
1548 * gcTOKEN_END: The character belongs to an acceptable token, but the
1549 * token must be terminated immediately. In this case, we fall through
1550 * to the gcDISCARD case.
1551 *
1552 * gcDISCARD: The character does not belong to a token. In this case,
1553 * we check that the holding buffer contains usable data, to which we
1554 * apply the word_fun(). The holding buffer is then reset.
1555 *
1556 * If p is NULL, we simply flush the token. Tokens normally straddle
1557 * newlines, but if M_OPTION_NGRAM_STRADDLE_NL is set, then each
1558 * newline is flushes the current token also.
1559 */
mbw_prefix(std_tokenizer)1560 void mbw_prefix(std_tokenizer)(mbw_t *p, char **pq, char *hbuf,
1561 token_order_t *hbuf_order, token_order_t max_order,
1562 void (*word_fun)(char *, token_type_t, regex_count_t),
1563 token_type_t (*get_tt)(token_order_t)) {
1564 token_type_t tt;
1565 token_order_t n, o;
1566 char *q;
1567 char *tstart, *qq, *cq;
1568 bool_t reset;
1569
1570 if( p && (p[0] == mbw_lit('\0')) ) {
1571 /* waste of time */
1572 return;
1573 }
1574
1575 q = *pq;
1576 o = *hbuf_order;
1577
1578 if( !q ||
1579 (q < hbuf) ||
1580 (q > hbuf + (MAX_TOKEN_LEN+1)*MAX_SUBMATCH+EXTRA_TOKEN_LEN) ) {
1581 q = hbuf;
1582 *q++ = DIAMOND;
1583 #if defined MBW_WIDE
1584 memset(©char_shiftstate, 0, sizeof(mbstate_t));
1585 #endif
1586 }
1587 for(tstart = q - 1; *tstart != DIAMOND; --tstart);
1588
1589 /* p[0] at least is nonzero */
1590 do {
1591 switch( mbw_prefix(good_char)(p) ) {
1592 case gcIGNORE:
1593 /* pretend there is no character here */
1594 break;
1595 case gcTOKEN:
1596 if( p && q < tstart + MAX_TOKEN_LEN ) {
1597 mbw_copychar(q,*p);
1598 break;
1599 }
1600 /* if we're here, fall through */
1601 case gcTOKEN_END:
1602 if( p && q < tstart + MAX_TOKEN_LEN + 1) {
1603 mbw_copychar(q,*p);
1604 }
1605 /* don't break, always fall through */
1606 case gcUNDEF:
1607 case gcDISCARD:
1608 reset = ( !(m_options & (1<<M_OPTION_NGRAM_STRADDLE_NL)) &&
1609 p && ( (p[0] == mbw_lit('\0')) ||
1610 (p[0] == mbw_lit('\n')) ) );
1611
1612 if( (p == NULL) || reset || (q[-1] != DIAMOND) ) {
1613 tstart = q;
1614 *q++ = DIAMOND;
1615 *q = '\0';
1616
1617 if( max_order == 1 ) {
1618 tt = (*get_tt)(1);
1619 cq = q;
1620 *cq++ = CLASSEP;
1621 *cq++ = (char)(AMIN + tt.cls);
1622 *cq = '\0';
1623 /* let each category process the token */
1624 (*word_fun)(hbuf, tt, INVALID_RE);
1625 tstart = q = hbuf;
1626 *q++ = DIAMOND;
1627 } else if( p ) {
1628 /* do this only if we have a line to work with */
1629 if( ++o > max_order ) {
1630 o--;
1631 /* move all tokens down by one */
1632 for(q = hbuf + 1; *q != DIAMOND; q++) {};
1633 for(q++, qq = hbuf + 1; *q; *qq++ = *q++) {};
1634 *qq = '\0';
1635 tstart = q = qq;
1636 }
1637
1638 tt = (*get_tt)(o);
1639
1640 cq = q;
1641 *cq++ = CLASSEP;
1642 *cq++ = (char)(AMIN + tt.cls);
1643 *cq = '\0';
1644
1645 qq = hbuf;
1646 for(n = o; n > 0; n--) {
1647 /* let each category process the token */
1648 tt.order = n;
1649 (*word_fun)(qq, tt, INVALID_RE);
1650 qq++;
1651 /* skip to next token and repeat */
1652 while(*qq != DIAMOND ) { qq++; }
1653 }
1654 }
1655 if( reset ) {
1656 /* reset the current ngrams to zero */
1657 tstart = hbuf;
1658 q = hbuf + 1;
1659 o = 0;
1660 }
1661 }
1662
1663 }
1664 } while( p && (*(p++) != mbw_lit('\0')) );
1665
1666 *pq = q;
1667 *hbuf_order = o;
1668 }
1669
1670
1671
1672 /***********************************************************
1673 * FILTERING FUNCTIONS *
1674 ***********************************************************/
1675
1676 /*
1677 * this code generates mhe_line_filter() and w_mhe_line_filter()
1678 * translates a MIME message header extension encoded
1679 * token into its equivalent byte sequence.
1680 */
mbw_prefix(mhe_line_filter)1681 bool_t mbw_prefix(mhe_line_filter)(mbw_t *line) {
1682 mbw_t *p = line;
1683 mbw_t *q = line;
1684 mbw_t *r;
1685
1686 while( *p ) {
1687 if( (p[0] == mbw_lit('=')) && (p[1] == mbw_lit('?')) ) {
1688 r = p + 2;
1689 while( *r && (*r != mbw_lit('?'))) { r++; }
1690 r++;
1691 /* I think lower case is illegal */
1692 if( (*r == mbw_lit('Q')) || (*r == mbw_lit('q')) ) {
1693 if( *(++r) == mbw_lit('?') ) {
1694 r++;
1695 /* we are now committed. find end marker and replace with NUL */
1696 for(p = r; *p; p++) {
1697 if( *p == mbw_lit('_') ) {
1698 *p = ' ';
1699 } else if( (p[0] == mbw_lit('?')) && (p[1] == mbw_lit('=')) ) {
1700 *p = mbw_lit('\0');
1701 break;
1702 }
1703 }
1704 q = mbw_prefix(qp_line_filter2)(r, q);
1705 p += 2;
1706 } else {
1707 /* malformed encoding */
1708 *q++ = *p++;
1709 }
1710 } else if( (*r == mbw_lit('B')) || (*r == mbw_lit('b')) ) {
1711 /* I think lower case is illegal, but we're lenient */
1712 if( *(++r) == mbw_lit('?') ) {
1713 r++;
1714 /* we are now committed. find end marker and replace with NUL */
1715 for(p = r; *p; p++) {
1716 if( (p[0] == mbw_lit('?')) && (p[1] == mbw_lit('=')) ) {
1717 *p = mbw_lit('\0');
1718 break;
1719 }
1720 }
1721 q = mbw_prefix(b64_line_filter2)(r, q);
1722 p += 2;
1723 } else {
1724 /* malformed encoding */
1725 *q++ = *p++;
1726 }
1727 } else {
1728 /* malformed encoding */
1729 *q++ = *p++;
1730 }
1731 } else {
1732 *q++ = *p++;
1733 }
1734 }
1735 *q = '\0';
1736 return 1;
1737 }
1738
mbw_prefix(extract_header_label)1739 int mbw_prefix(extract_header_label)(MBOX_State *mbox, mbw_t *line) {
1740 mbw_t *p = line;
1741
1742 if( m_options & (1<<M_OPTION_XHEADERS) ) {
1743 if( (mbw_strncasecmp(p, mbw_lit("X-DBACL"),7) == 0) ||
1744 (mbw_strncasecmp(p, mbw_lit("Date:"),4) == 0) ||
1745 (mbw_strncasecmp(p, mbw_lit("Path:"),4) == 0) ||
1746 (mbw_strncasecmp(p, mbw_lit("Posted:"),6) == 0) ||
1747 (mbw_strncasecmp(p, mbw_lit("Expires:"),7) == 0) ||
1748 (mbw_strncasecmp(p, mbw_lit("Received:"),8) == 0) ||
1749 (mbw_strncasecmp(p, mbw_lit("Resent-Date:"),11) == 0) ||
1750 (mbw_strncasecmp(p, mbw_lit("Delivery-Date:"),13) == 0) ||
1751 (mbw_isspace(line[0]) && mbox->skip_header) ) {
1752 mbox->skip_header = 1;
1753 return 0;
1754 } else {
1755 mbox->skip_header = 0;
1756 return mbw_prefix(mhe_line_filter)(line);
1757 }
1758 }
1759
1760 return 0;
1761 }
1762
1763 /***********************************************************
1764 * MBOX PARSING FUNCTIONS *
1765 ***********************************************************/
1766
1767 /*
1768 * this code generates extract_mime_boundary() and w_extract_mime_boundary()
1769 * retrieves the MIME boundary if one is found. Doesn't cope with rfc2184
1770 */
mbw_prefix(extract_mime_boundary)1771 bool_t mbw_prefix(extract_mime_boundary)(MBOX_State *mbox,mbw_t *line) {
1772 const mbw_t *q;
1773 mbw_t *r;
1774 int size;
1775 bool_t quoted = 0; /* used both in calsulation and for return value */
1776
1777 q = mbw_prefix(mystrcasestr)(line, mbw_lit("boundary="));
1778
1779 if( q ) {
1780 /* we skip white space after = sign, even though it is not allowed */
1781 for(q += 9; mbw_isspace(*q); q++);
1782 if( *q ) {
1783 quoted = (*q == mbw_lit('"'));
1784 r = mbox->boundary.mbw_prefix(identifier)[mbox->boundary.index];
1785 size = 0;
1786 if( quoted ) {
1787 for(q++; *q && (*q != mbw_lit('"')) &&
1788 (size < MAX_BOUNDARY_BUFSIZE); q++, size++) {
1789 *r++ = *q;
1790 }
1791 } else {
1792 for(; *q && !mbw_isspace(*q) &&
1793 (size < MAX_BOUNDARY_BUFSIZE); q++, size++) {
1794 *r++ = *q;
1795 }
1796 }
1797 mbox->boundary.size[mbox->boundary.index] = size;
1798 if( ++mbox->boundary.index >= MAX_BOUNDARIES ) { mbox->boundary.index = 0; }
1799 quoted = (size > 0) ? 1 : 0;
1800 } else {
1801 /* this is bad */
1802 quoted = 0;
1803 }
1804 }
1805
1806 /* MIME messages look like this: head-preamble-sec1-...-secN-postamble,
1807 * and the RFCs recommend that preambles/postambles be ignored.
1808 * However, this introduces a loophole for spammers, who can define a
1809 * boundary, but then never cite it. "Robust" MUAs will show the
1810 * contents of the preamble, but we would not see it. To ignore preambles,
1811 * define the symbol IGNORE_MIME_PREAMBLE below.
1812 */
1813 #undef IGNORE_MIME_PREAMBLE
1814
1815 #if defined(IGNORE_MIME_PREAMBLE)
1816 return quoted;
1817 #else
1818 return 0;
1819 #endif
1820 }
1821
1822
1823 static
mbw_prefix(check_old_style_digest)1824 bool_t mbw_prefix(check_old_style_digest)(const mbw_t *line) {
1825
1826 #define THIRTYDASHES mbw_lit("------------------------------")
1827 #define SEVENTYDASHES mbw_lit("----------------------------------------------------------------------")
1828
1829 /* messages are separated by either exactly 30 or exactly 70 dashes */
1830 return ( ((mbw_strncmp(line, THIRTYDASHES, 30) == 0) &&
1831 (line[30] == mbw_lit('\r') || line[30] == mbw_lit('\n'))) ||
1832 ((mbw_strncmp(line, SEVENTYDASHES, 70) == 0) &&
1833 (line[70] == mbw_lit('\r') || line[70] == mbw_lit('\n'))) );
1834 }
1835
1836 /* static */
1837 /* bool_t mbw_prefix(outlook_message_announce)(const mbw_t *line) { */
1838 /* #define OLDASHES mbw_lit("-----Original Message-----") */
1839 /* #define OEDASHES mbw_lit("----- Original Message -----") */
1840 /* return ( ((mbw_strncmp(line, OLDASHES, 26) == 0) && */
1841 /* (line[26] == mbw_lit('\r') || line[26] == mbw_lit('\n'))) || */
1842 /* ((mbw_strncmp(line, OEDASHES, 28) == 0) && */
1843 /* (line[28] == mbw_lit('\r') || line[28] == mbw_lit('\n'))) ); */
1844 /* } */
1845
/* 
 * this code generates check_mime_boundary() and w_check_mime_boundary()
 * The check is only approximate.
 *
 * Compares line, starting at its third character (the first two are
 * presumably the leading "--" which the caller has already seen --
 * confirm), against the stored boundaries, beginning with the most
 * recently stored one and moving on to older slots on a mismatch.
 *
 * Returns 1 if a stored boundary was matched over its full length,
 * and 0 otherwise. mbox->boundary.was_end is set to 1 if the matched
 * boundary is directly followed by "--", and to 0 in all other cases.
 */
bool_t mbw_prefix(check_mime_boundary)(MBOX_State *mbox, const mbw_t *line) {
  /* c: slot being compared, starts at the slot written last */
  int c = (mbox->boundary.index > 0) ? 
    (mbox->boundary.index - 1) : (MAX_BOUNDARIES - 1);
  /* k: number of characters matched so far */
  int k = 0;
  const mbw_t *p = mbox->boundary.mbw_prefix(identifier)[c];
  const mbw_t *q = line + 2;
  while(*q) {
    if( (k >= mbox->boundary.size[c]) || (*q != *p) ) {
      /* mismatch: move to the previous slot (with wrap around) and
	 carry on at the same offset k, without rechecking the first
	 k characters -- this is what makes the check approximate */
      c--;
      if( c < 0 ) { c = MAX_BOUNDARIES - 1; }
      p = &mbox->boundary.mbw_prefix(identifier)[c][k];
      /* once we are back at the slot which would be written next,
	 all slots have been tried */
      if( c == mbox->boundary.index ) {
	if( (k >= mbox->boundary.size[c]) || (*q != *p) ) {
	  mbox->boundary.was_end = 0;
	  return 0;
	}
      }
    } else if( k == mbox->boundary.size[c] - 1) {
      /* the last character of the boundary matched; a trailing "--"
	 marks the final boundary */
      if((q[1] == mbw_lit('-')) && (q[2] == mbw_lit('-'))) {
	mbox->boundary.was_end = 1;
      } else {
	mbox->boundary.was_end = 0;
      }
      return 1;
    } else {
      /* normally, a space isn't allowed in the boundary, but we're lenient */
      q++;
      p++;
      k++;
    }
  }
  /* line ended before a boundary was matched completely */
  mbox->boundary.was_end = 0;
  return 0;
}
1884
1885 static
mbw_prefix(check_armor_start)1886 bool_t mbw_prefix(check_armor_start)(const mbw_t *line) {
1887 int i;
1888 for(i = 0; i < num_armor_start; i++) {
1889 if( mbw_strncmp(line, mbw_prefix(armor_start)[i],
1890 mbw_strlen(mbw_prefix(armor_start)[i])) == 0 ) {
1891 return 1;
1892 }
1893 }
1894 if( (mbw_strncmp(line, mbw_lit("begin "), 6) == 0) &&
1895 ISOCT(line[6]) && ISOCT(line[7]) && ISOCT(line[8]) ) {
1896 /* uuencoded */
1897 return 1;
1898 } else if( (mbw_strncmp(line, mbw_lit("=ybegin"), 7) == 0) &&
1899 ((line[7] == mbw_lit(' ')) || (line[7] == mbw_lit('2'))) &&
1900 mbw_prefix(mystrcasestr)(line + 8, mbw_lit("line=")) &&
1901 mbw_prefix(mystrcasestr)(line + 8, mbw_lit("size=")) &&
1902 mbw_prefix(mystrcasestr)(line + 8, mbw_lit("name=")) ) {
1903 /* yEnc */
1904 return 1;
1905 }
1906 return 0;
1907 }
1908
1909 static
mbw_prefix(check_armor_end)1910 bool_t mbw_prefix(check_armor_end)(const mbw_t *line) {
1911 int i;
1912 for(i = 0; i < num_armor_end; i++) {
1913 if( mbw_strncmp(line, mbw_prefix(armor_end)[i],
1914 mbw_strlen(mbw_prefix(armor_end)[i])) == 0 ) {
1915 return 1;
1916 }
1917 }
1918 if( (mbw_strncmp(line, mbw_lit("end"),3) == 0) &&
1919 (!line[3] || (line[3] == mbw_lit('\n')) || (line[3] == mbw_lit('\r'))) ) {
1920 /* uuencoded */
1921 return 1;
1922 } else if( (mbw_strncmp(line, mbw_lit("=yend "), 6) == 0) &&
1923 mbw_prefix(mystrcasestr)(line + 8, mbw_lit("size=")) ) {
1924 /* yEnc */
1925 return 1;
1926 }
1927 return 0;
1928 }
1929
1930 /* return true if line should be shown, false otherwise */
1931 static
mbw_prefix(armor_filter)1932 bool_t mbw_prefix(armor_filter)(const mbw_t *line) {
1933 if( (mbw_prefix(is_b64line)(line) == 1) ||
1934 (mbw_prefix(is_uuline)(line) == 1) ||
1935 (mbw_prefix(is_yencline)(line) == 1) ) {
1936 return 0;
1937 }
1938 return 1;
1939 }
1940
1941 /*
1942 * this code generates extract_mime_types() and w_extract_mime_types()
1943 */
1944 static
mbw_prefix(extract_mime_types)1945 void mbw_prefix(extract_mime_types)(mbw_t *line, MIME_Struct *ms) {
1946 int i;
1947 if( !mbw_strncasecmp(line, mbw_lit("Content-Type:"), 13) ) {
1948 line += 13;
1949 for(i = 0; i < num_mime_media; i++) {
1950 if( mbw_prefix(mystrcasestr)(line,
1951 mbw_prefix(mime_media)[i].type_subtype) ) {
1952 ms->type = mbw_prefix(mime_media)[i].medium;
1953 return;
1954 }
1955 }
1956
1957 ms->type = ctOTHER;
1958
1959 } else if( !mbw_strncasecmp(line, mbw_lit("Content-Transfer-Encoding:"),
1960 26) ) {
1961 line += 26;
1962 if( mbw_prefix(mystrcasestr)(line, mbw_lit("base64")) ) {
1963 ms->encoding = ceB64;
1964 } else if( mbw_prefix(mystrcasestr)(line, mbw_lit("quoted-printable")) ) {
1965 ms->encoding = ceQP;
1966 } else if( mbw_prefix(mystrcasestr)(line, mbw_lit("binary")) ) {
1967 ms->encoding = ceBIN;
1968 } else if( mbw_prefix(mystrcasestr)(line, mbw_lit("7bit")) ) {
1969 ms->encoding = ceSEVEN;
1970 } else {
1971 ms->encoding = ceID;
1972 }
1973
1974 }
1975 }
1976
1977 /* scans the line for the character strip_header_char, and truncates
1978 * from that point on, and switches strip_header_char to the special value 1.
1979 * If special value 1, truncates line from second char onwards.
1980 * If special value 0, does nothing.
1981 */
1982 static __inline__
mbw_prefix(strip_from_char)1983 void mbw_prefix(strip_from_char)(MBOX_State *mbox, mbw_t *q) {
1984 if( mbox->mbw_prefix(strip_header_char) == mbw_lit('\x01') ) {
1985 if( *q++ ) {
1986 *q = mbw_lit('\0');
1987 }
1988 } else if( mbox->mbw_prefix(strip_header_char) ) {
1989 while(*q++) {
1990 if( *q == mbox->mbw_prefix(strip_header_char) ) {
1991 *q++ = mbw_lit('\n');
1992 *q = mbw_lit('\0');
1993 mbox->mbw_prefix(strip_header_char) = mbw_lit('\x01');
1994 break;
1995 }
1996 }
1997 }
1998 }
1999
2000 static
mbw_prefix(identify_header)2001 Mheaderid mbw_prefix(identify_header)(mbw_t *line) {
2002 #define HDRID(s,l,h) !mbw_strncasecmp(line, s,l) ? h
2003 if( mbw_isspace(*line) ) {
2004 return hidCONTINUATION;
2005 }
2006 switch(mbw_tolower(line[0])) {
2007 case mbw_lit('b'):
2008 return
2009 HDRID(mbw_lit("BCC:"),4,hidBCC) :
2010 hidUNDEF;
2011 case mbw_lit('c'):
2012 return
2013 HDRID(mbw_lit("Content-"),8,hidCONTENT_) :
2014 HDRID(mbw_lit("CC:"),3,hidCC) :
2015 HDRID(mbw_lit("Categor"),7,hidCATEGORY) :
2016 HDRID(mbw_lit("Comments:"),9,hidCOMMENTS) :
2017 hidUNDEF;
2018 case mbw_lit('f'):
2019 return
2020 HDRID(mbw_lit("From:"),5,hidFROM) :
2021 hidUNDEF;
2022 case mbw_lit('i'):
2023 return
2024 HDRID(mbw_lit("In-Reply-To:"),12,hidIN_REPLY_TO) :
2025 HDRID(mbw_lit("Importance:"),11,hidIMPORTANCE) :
2026 hidUNDEF;
2027 case mbw_lit('k'):
2028 return
2029 HDRID(mbw_lit("Keywords:"),9,hidKEYWORDS) :
2030 hidUNDEF;
2031 case mbw_lit('l'):
2032 return
2033 HDRID(mbw_lit("List-"),5,hidLIST_) :
2034 hidUNDEF;
2035 case mbw_lit('m'):
2036 return
2037 HDRID(mbw_lit("Message-ID:"),11,hidMESSAGE_ID) :
2038 HDRID(mbw_lit("MIME-Version:"),13,hidMIME_VERSION) :
2039 hidUNDEF;
2040 case mbw_lit('n'):
2041 return
2042 HDRID(mbw_lit("Notes:"),6,hidNOTE) :
2043 hidUNDEF;
2044 case mbw_lit('o'):
2045 return
2046 HDRID(mbw_lit("Original-"),8,hidORIGINAL_) :
2047 hidUNDEF;
2048 case mbw_lit('p'):
2049 return
2050 HDRID(mbw_lit("Priority:"),9,hidPRIORITY) :
2051 hidUNDEF;
2052 case mbw_lit('r'):
2053 return
2054 HDRID(mbw_lit("Received:"),9,hidRECEIVED) :
2055 HDRID(mbw_lit("Return-Path:"),12,hidRETURN_PATH) :
2056 HDRID(mbw_lit("References:"),11,hidREFERENCES) :
2057 HDRID(mbw_lit("Return-Receipt-To:"),18,hidRETURN_RECEIPT_TO) :
2058 HDRID(mbw_lit("Reply-To:"),9,hidREPLY_TO) :
2059 HDRID(mbw_lit("Resent-"),7,hidRESENT_) :
2060 hidUNDEF;
2061 case mbw_lit('s'):
2062 return
2063 HDRID(mbw_lit("Subject:"),8,hidSUBJECT) :
2064 HDRID(mbw_lit("Sent:"),5,hidSENT) :
2065 HDRID(mbw_lit("Sender:"),7,hidSENDER) :
2066 hidUNDEF;
2067 case mbw_lit('t'):
2068 return
2069 HDRID(mbw_lit("To:"),3,hidTO) :
2070 HDRID(mbw_lit("Thread-"),7,hidTHREAD_) :
2071 hidUNDEF;
2072 case mbw_lit('x'):
2073 return
2074 HDRID(mbw_lit("X-MS"),4,hidX_MS) :
2075 HDRID(mbw_lit("X-"),2,hidX_) :
2076 hidUNDEF;
2077 case mbw_lit('u'):
2078 return
2079 HDRID(mbw_lit("User-Agent:"),11,hidUSER_AGENT) :
2080 hidUNDEF;
2081 }
2082 return hidUNDEF;
2083 }
2084
2085
2086 static
mbw_prefix(scan_header_type)2087 HEADER_Type mbw_prefix(scan_header_type)(MBOX_State *mbox, mbw_t *line) {
2088
2089 #define STRIP(q) {while(*q++) { if( *q == mbw_lit(';') ) { *q++ = mbw_lit('\n'); *q = mbw_lit('\0'); break; }}}
2090 #define HDRIDCHK(x,y) ((mbox->hid == x) && (mbox->hstate = y))
2091 Mheaderid hid = mbw_prefix(identify_header)(line);
2092
2093 if( hid == hidCONTINUATION ) {
2094 /* we don't update mbox->hid */
2095 mbw_prefix(strip_from_char)(mbox, line);
2096 return htCONT;
2097 }
2098
2099 mbox->hid = hid;
2100 if( HDRIDCHK(hidFROM,mhsFROM) ||
2101 HDRIDCHK(hidTO,mhsTO) ||
2102 HDRIDCHK(hidMESSAGE_ID,mhsUNDEF) ||
2103 HDRIDCHK(hidIN_REPLY_TO,mhsUNDEF) ||
2104 HDRIDCHK(hidSUBJECT,mhsSUBJECT) ) {
2105 mbox->mbw_prefix(strip_header_char) = mbw_lit('\0');
2106 mbw_prefix(strip_from_char)(mbox, line);
2107 return htSTANDARD;
2108 } else if( HDRIDCHK(hidRETURN_PATH,mhsTRACE) ||
2109 HDRIDCHK(hidRECEIVED,mhsTRACE) ) {
2110 mbox->mbw_prefix(strip_header_char) = mbw_lit(';');
2111 mbw_prefix(strip_from_char)(mbox, line);
2112 return htTRACE;
2113 } else if( (mbox->hid == hidCONTENT_) &&
2114 mbw_strchr(line + 8, mbw_lit(':')) ) {
2115 mbox->hstate = mhsMIME;
2116 mbox->mbw_prefix(strip_header_char) = mbw_lit('\0');
2117 return htMIME;
2118 } else if( HDRIDCHK(hidSENDER,mhsUNDEF) ||
2119 HDRIDCHK(hidREPLY_TO,mhsUNDEF) ||
2120 HDRIDCHK(hidBCC,mhsUNDEF) ||
2121 HDRIDCHK(hidCC,mhsUNDEF) ||
2122 HDRIDCHK(hidREFERENCES,mhsUNDEF) ) {
2123 mbox->mbw_prefix(strip_header_char) = mbw_lit('\0');
2124 mbw_prefix(strip_from_char)(mbox, line);
2125 return htEXTENDED;
2126 } else {
2127 /* if the line starts with a word missing a :, then
2128 it could be a malformed continuation line */
2129 while( *line && !mbw_isspace(*line) && (*line != mbw_lit(':')) ) { line++; }
2130 if( *line == mbw_lit(':') ) {
2131 mbox->hstate = mhsUNDEF;
2132 mbox->mbw_prefix(strip_header_char) = mbw_lit('\0');
2133 return htUNDEF;
2134 } else {
2135 return htCONT;
2136 }
2137 }
2138 }
2139
2140 /* static */
2141 /* HEADER_Type mbw_prefix(scan_header_type)(MBOX_State *mbox, mbw_t *line) { */
2142
2143 /* #define STRIP(q) {while(*q++) { if( *q == mbw_lit(';') ) { *q++ = mbw_lit('\n'); *q = mbw_lit('\0'); break; }}} */
2144
2145 /* #define HDRCHK(s,l,h) (!mbw_strncasecmp(line, s,l) && (mbox->hstate = h)) */
2146
2147 /* mbox->mm = mbw_prefix(identify_header)(line); */
2148
2149 /* if( mbw_isspace(*line) ) { */
2150 /* mbw_prefix(strip_from_char)(mbox, line); */
2151 /* return htCONT; */
2152 /* } else if( HDRCHK(mbw_lit("From:"),5,mhsFROM) || */
2153 /* HDRCHK(mbw_lit("To:"),3,mhsTO) || */
2154 /* HDRCHK(mbw_lit("Message-ID:"),11,mhsUNDEF) || */
2155 /* HDRCHK(mbw_lit("In-Reply-To:"),12,mhsUNDEF) || */
2156 /* HDRCHK(mbw_lit("Subject:"),8,mhsSUBJECT) ) { */
2157 /* mbox->mbw_prefix(strip_header_char) = mbw_lit('\0'); */
2158 /* mbw_prefix(strip_from_char)(mbox, line); */
2159 /* return htSTANDARD; */
2160 /* } else if( HDRCHK(mbw_lit("Return-Path:"),12,mhsTRACE) || */
2161 /* HDRCHK(mbw_lit("Received:"),9,mhsTRACE) ) { */
2162 /* mbox->mbw_prefix(strip_header_char) = mbw_lit(';'); */
2163 /* mbw_prefix(strip_from_char)(mbox, line); */
2164 /* return htTRACE; */
2165 /* } else if( !mbw_strncasecmp(line, mbw_lit("Content-"),8) && */
2166 /* mbw_strchr(line + 8, mbw_lit(':')) ) { */
2167 /* mbox->hstate = mhsMIME; */
2168 /* mbox->mbw_prefix(strip_header_char) = mbw_lit('\0'); */
2169 /* return htMIME; */
2170 /* } else if( HDRCHK(mbw_lit("Sender:"),7,mhsUNDEF) || */
2171 /* HDRCHK(mbw_lit("Reply-To:"),9,mhsUNDEF) || */
2172 /* HDRCHK(mbw_lit("Bcc:"),4,mhsUNDEF) || */
2173 /* HDRCHK(mbw_lit("Cc:"),3,mhsUNDEF) || */
2174 /* HDRCHK(mbw_lit("References:"),11,mhsUNDEF) ) { */
2175 /* mbox->mbw_prefix(strip_header_char) = mbw_lit('\0'); */
2176 /* mbw_prefix(strip_from_char)(mbox, line); */
2177 /* return htEXTENDED; */
2178 /* } else { */
2179 /* /\* if the line starts with a word missing a :, then */
2180 /* it could be a malformed continuation line *\/ */
2181 /* while( *line && !mbw_isspace(*line) && (*line != mbw_lit(':')) ) { line++; } */
2182 /* if( *line == mbw_lit(':') ) { */
2183 /* mbox->hstate = mhsUNDEF; */
2184 /* mbox->mbw_prefix(strip_header_char) = mbw_lit('\0'); */
2185 /* return htUNDEF; */
2186 /* } else { */
2187 /* return htCONT; */
2188 /* } */
2189 /* } */
2190 /* } */
2191
2192 static
mbw_prefix(extract_mime_label)2193 int mbw_prefix(extract_mime_label)(mbw_t *line) {
2194 mbw_t *q;
2195 if( m_options & (1<<M_OPTION_HEADERS) ) {
2196 if( !mbw_strncasecmp(line, mbw_lit("Content-"),8) ) {
2197 line += 8;
2198 if( !mbw_strncasecmp(line, mbw_lit("Type:"),5) ) {
2199 /* we want both the mime type and the file name */
2200 q = (mbw_t *)mbw_prefix(mystrcasestr)(line, mbw_lit("name="));
2201 if( q ) { STRIP(q); } else { STRIP(line); }
2202 return 1;
2203 } else if( !mbw_strncasecmp(line, mbw_lit("Disposition:"),12) ) {
2204 STRIP(line);
2205 return 1;
2206 } else if( !mbw_strncasecmp(line, mbw_lit("ID:"),3) ||
2207 !mbw_strncasecmp(line, mbw_lit("Description:"),12) ) {
2208 /* note: we only get first line of description */
2209 return 1;
2210 }
2211 } else if( mbw_isspace(*line) ) {
2212 q = (mbw_t *)mbw_prefix(mystrcasestr)(line, mbw_lit("name="));
2213 if( q ) {
2214 STRIP(q);
2215 return 1;
2216 }
2217 }
2218 }
2219 return 0;
2220 }
2221
2222 /*
2223 * this code generates mbox_line_filter() and w_mbox_line_filter()
2224 *
2225 * returns true if the line should be processed further
2226 * depends on global mbox state
2227 */
/*
 * Per-line driver of the mbox state machine.
 *
 * mbox - parser state; read and updated in place (state, substate, hid,
 *        hstate, armor, skip_until_boundary, corruption_check,
 *        prev_line_empty, skip_header, plainstate and the header/body
 *        type and encoding).
 * line - the current input line; it is passed as a writable buffer to the
 *        filters below, some of which modify it (see the "modifies line"
 *        remarks).
 * xml  - only handed to reset_xml_character_filter() in STEP 3.
 *
 * The work is done in three steps: (1) state transitions decided from the
 * current line, (2) per state/substate cleanup which computes
 * process_line, (3) cache flushing and selection of the xml character
 * filter.
 *
 * Returns nonzero if process_line ended up set or if the line is empty.
 */
mbw_prefix(mbox_line_filter)2228 bool_t mbw_prefix(mbox_line_filter)(MBOX_State *mbox, mbw_t *line,
2229 XML_State *xml) {
2230 bool_t line_empty = 0;
2231 bool_t doubledash = 0;
2232 bool_t process_line = 0; /* by default we skip the line */
2233 XML_Reset force_filter = xmlUNDEF;
2234 bool_t octet_stream = 0;
2235
2236 line_empty = MBW_EMPTYLINE(line);
2237 doubledash = MBW_DOUBLEDASH(line);
2238
2239 /* STEP 1: first perform state transitions */
2240 switch(mbox->state) {
2241 case msUNDEF:
2242 /* wait until we see the first nonempty line */
2243 if( !line_empty ) {
2244 mbox->state = msHEADER;
2245 mbox->substate = msuUNDEF;
2246 mbox->hid = hidUNDEF;
2247 mbox->hstate = mhsUNDEF;
2248 mbox->armor = maUNDEF;
2249 mbox->skip_until_boundary = 0;
2250 }
2251 break;
2252 case msHEADER:
/* an empty line ends the header block */
2253 if( line_empty ) {
2254 mbox->state = msBODY;
2255 mbox->substate = msuUNDEF;
2256 mbox->hid = hidUNDEF;
2257 mbox->hstate = mhsUNDEF;
2258 mbox->armor = maUNDEF;
2259 /* don't reset skip_until_boundary */
/* the counter is consumed by the corruption_check branch of msBODY */
2260 mbox->corruption_check = 5;
2261 }
2262 break;
2263 case msBODY:
/* the alternatives below are tried in the order written: mime boundary,
   old style digest separator, armor end, armor start, then the checks
   that depend on the previous line having been empty, and finally the
   corruption_check countdown */
2264 if( doubledash && mbw_prefix(check_mime_boundary)(mbox, line) ) {
2265 mbox->state = msATTACH;
2266 mbox->substate = msuUNDEF;
2267 mbox->hid = hidUNDEF;
2268 mbox->hstate = mhsUNDEF;
2269 mbox->armor = maUNDEF;
2270 mbox->skip_until_boundary = mbox->boundary.was_end;
2271 mbox->corruption_check = 0;
2272 } else if( doubledash &&
2273 /* mbw_prefix(outlook_message_announce)(line) || */
2274 mbw_prefix(check_old_style_digest)(line) ) {
2275 mbox->state = msATTACH;
2276 mbox->substate = msuTRACK;
2277 mbox->hid = hidUNDEF;
2278 mbox->hstate = mhsUNDEF;
2279 mbox->armor = maUNDEF;
2280 mbox->skip_until_boundary = mbox->boundary.was_end;
2281 mbox->corruption_check = 0;
2282 /* since there are no mime headers, we impose a content type */
2283 /* note: we only try to detect digests because we
2284 want to remove the Date: headers */
2285 mbox->body.type = ctMESSAGE_RFC822;
2286 } else if( doubledash &&
2287 (mbox->substate == msuARMOR) &&
2288 mbw_prefix(check_armor_end)(line) ) {
2289 mbox->substate = msuTRACK;
2290 mbox->hid = hidUNDEF;
2291 mbox->hstate = mhsUNDEF;
2292 mbox->armor = maUNDEF;
2293 mbox->skip_until_boundary = mbox->boundary.was_end;
2294 mbox->corruption_check = 0;
2295 } else if( doubledash &&
2296 (mbox->substate != msuARMOR) &&
2297 mbw_prefix(check_armor_start)(line) ) {
2298 mbox->substate = msuARMOR;
2299 mbox->armor = maENABLED;
2300 } else if( mbox->prev_line_empty ) {
2301 if( doubledash && !(m_options & (1<<M_OPTION_PLAIN)) ) {
2302 /* could be a corrupted boundary - note
2303 * previous empty line is not required, but often true
2304 */
2305 mbox->corruption_check = 5;
2306 } else if( !mbw_strncasecmp(line, mbw_lit("Content-"), 8) ) {
2307 mbw_prefix(mhe_line_filter)(line);
2308 switch(mbw_prefix(scan_header_type)(mbox, line)) {
2309 case htMIME:
2310 mbox->state = msATTACH;
2311 mbox->substate = msuMIME;
2312 mbox->hstate = mhsUNDEF;
2313 mbox->armor = maUNDEF;
2314 mbox->skip_until_boundary = 0;
2315 mbox->corruption_check = 0;
2316 break;
2317 default:
2318 /* do nothing - so far so good */
2319 break;
2320 }
2321 } else if( !mbw_strncmp(line, mbw_lit("From "), 5) ) {
/* a "From " line right after an empty line starts a new header block */
2322 mbox->state = msHEADER;
2323 mbox->hid = hidUNDEF;
2324 mbox->substate = msuUNDEF;
2325 mbox->hstate = mhsUNDEF;
2326 mbox->armor = maUNDEF;
2327 mbox->skip_until_boundary = 0;
2328 mbox->corruption_check = 0;
2329 }
2330 } else if( mbox->corruption_check > 0 ) {
2331 mbox->corruption_check--;
2332 /* we filter out mail header extension codings - shouldn't do any harm */
2333 mbw_prefix(mhe_line_filter)(line);
2334 switch(mbw_prefix(scan_header_type)(mbox, line)) {
2335 case htMIME:
2336 mbox->state = msATTACH;
2337 mbox->substate = msuMIME;
2338 mbox->hstate = mhsUNDEF;
2339 mbox->armor = maUNDEF;
2340 mbox->skip_until_boundary = 0;
2341 mbox->corruption_check = 0;
2342 break;
2343 default:
2344 /* do nothing - so far so good */
2345 break;
2346 }
2347 }
2348 break;
2349 case msATTACH:
/* an empty line ends the attachment headers; the next state depends on
   the content type recorded in mbox->body */
2350 if( line_empty ) {
2351 switch(mbox->body.type) {
2352 case ctMESSAGE_RFC822:
2353 /* our mime parse isn't recursive - instead we start a
2354 new message and associate with it all later attachments */
2355 mbox->state = msHEADER;
2356 break;
2357 case ctAPPLICATION_MSWORD:
2358 mbox->state = msBODY;
2359 /* override encoding if undefined */
2360 if( mbox->body.encoding == ceUNDEF ) {
2361 mbox->body.encoding = ceB64;
2362 }
2363 break;
2364 default:
2365 mbox->state = msBODY;
2366 break;
2367 }
2368 mbox->substate = msuUNDEF;
2369 mbox->hid = hidUNDEF;
2370 mbox->hstate = mhsUNDEF;
2371 mbox->armor = maUNDEF;
2372 mbox->skip_until_boundary = 0;
2373 mbox->corruption_check = 0;
2374 }
2375 break;
2376 }
2377
2378 mbox->prev_line_empty = line_empty; /* for next time */
2379
2380 /* STEP 2: now clean up and prepare the line according to current state
2381 * and substate.
2382 * After cleanup, the variable process_line indicates if the line
2383 * should be ignored.
2384 * The substate can evolve while the current state is unchanging.
2385 */
2386 switch(mbox->state) {
2387 case msUNDEF:
2388 /* line is not processed */
2389 break;
2390 case msHEADER:
2391 switch(mbox->substate) {
2392 case msuUNDEF:
2393 /* flush caches */
/* note: || short-circuits, the qp cache is only flushed when the b64
   flush returned zero */
2394 process_line =
2395 mbw_prefix(flush_cache)(&(mbox->mbw_prefix(b64_dc)), line, 1) ||
2396 mbw_prefix(flush_cache)(&(mbox->mbw_prefix(qp_dc)), line, 1);
2397 if( process_line ) {
2398 /* we still remember previous type/encoding, decide if we need filter */
2399 force_filter = select_xml_defaults(&mbox->body);
2400 octet_stream = (mbox->body.type == ctOCTET_STREAM) ||
2401 (mbox->body.type == ctAPPLICATION_MSWORD);
2402 }
2403 /* there are no default mime types for headers */
2404 mbox->header.type = mbox->body.type = ctUNDEF;
2405 mbox->header.encoding = mbox->body.encoding = ceUNDEF;
2406 /* switch to normal state next time */
2407 mbox->substate = msuOTHER;
2408 mbox->corruption_check = 0;
2409 mbox->skip_header = 0;
2410 mbox->plainstate = psPLAIN;
2411 /* don't break, as the current line could contain
2412 interesting headers already */
2413
2414 default:
2415 /* switch substate if necessary */
2416 switch(mbw_prefix(scan_header_type)(mbox, line)) {
2417 case htSTANDARD:
/* with M_OPTION_NOHEADERS only the Subject header is tracked */
2418 if( m_options & (1<<M_OPTION_NOHEADERS) ) {
2419 mbox->substate = (mbox->hstate == mhsSUBJECT) ? msuTRACK : msuOTHER;
2420 } else {
2421 mbox->substate = msuTRACK;
2422 }
2423 break;
2424 case htEXTENDED:
2425 mbox->substate = (m_options & (1<<M_OPTION_HEADERS)) ? msuTRACK : msuOTHER;
2426 break;
2427 case htTRACE:
2428 mbox->substate = (m_options & (1<<M_OPTION_THEADERS)) ? msuTRACK : msuOTHER;
2429 break;
2430 case htMIME:
2431 mbox->substate = msuMIME;
2432 break;
2433 case htCONT:
2434 /* nothing */
2435 break;
2436 case htUNDEF:
2437 mbox->substate = msuOTHER;
2438 break;
2439 }
2440
2441 /* process substate */
2442
2443 switch(mbox->substate) {
2444 case msuTRACK:
2445 process_line = mbw_prefix(mhe_line_filter)(line);
2446 break;
2447 case msuMIME:
2448 mbw_prefix(mhe_line_filter)(line);
2449 mbw_prefix(extract_mime_types)(line, &mbox->header);
/* a previously set skip_until_boundary is kept */
2450 mbox->skip_until_boundary =
2451 mbw_prefix(extract_mime_boundary)(mbox, line) || mbox->skip_until_boundary;
2452 /* this comes last, modifies line */
2453 process_line = mbw_prefix(extract_mime_label)(line);
2454 break;
2455 case msuUNDEF:
2456 case msuOTHER:
2457 mbox->hstate = mhsXHEADER;
/* fall through: the label is extracted exactly as for msuARMOR */
2458 case msuARMOR:
2459 process_line = mbw_prefix(extract_header_label)(mbox, line);
2460 break;
2461 }
2462 }
2463 break;
2464 case msBODY:
2465 switch(mbox->substate) {
2466 case msuUNDEF:
2467 /* flush caches */
2468 process_line =
2469 mbw_prefix(flush_cache)(&(mbox->mbw_prefix(b64_dc)), line, 1) ||
2470 mbw_prefix(flush_cache)(&(mbox->mbw_prefix(qp_dc)), line, 1);
2471 if( process_line ) {
2472 /* we still remember previous type/encoding, decide if we need filter */
2473 force_filter = select_xml_defaults(&mbox->body);
2474 octet_stream = (mbox->body.type == ctOCTET_STREAM) ||
2475 (mbox->body.type == ctAPPLICATION_MSWORD);
2476 }
2477 /* bodies by default inherit the header mime types */
2478 if( mbox->body.type == ctUNDEF )
2479 { mbox->body.type = mbox->header.type; }
2480 if( mbox->body.encoding == ceUNDEF )
2481 { mbox->body.encoding = mbox->header.encoding; }
2482
2483 /* switch to normal state next time */
2484 mbox->substate = msuTRACK;
2485 mbox->plainstate = psPLAIN;
2486 break;
2487 case msuARMOR:
2488 switch(mbox->armor) {
2489 case maUNDEF:
2490 process_line = 1;
2491 break;
2492 case maENABLED:
2493 process_line = mbw_prefix(armor_filter)(line);
2494 break;
2495 }
2496
2497 break;
2498 default:
2499 if( mbox->skip_until_boundary ) {
2500 process_line = 0;
2501 } else {
/* the outer switch selects by content type, the inner switches by
   transfer encoding */
2502 switch(mbox->body.type) {
2503 case ctOCTET_STREAM:
2504 case ctAPPLICATION_MSWORD:
2505 if( !(m_options & (1<<M_OPTION_ATTACHMENTS)) ) {
2506 process_line = 0;
2507 break;
2508 } else {
2509 /* otherwise fall through */
2510 octet_stream = 1;
2511 }
2512 case ctUNDEF: /* the header didn't say, so we must assume text */
2513 case ctMESSAGE_RFC822:
2514 case ctTEXT_PLAIN:
2515 switch(mbox->body.encoding) {
2516 case ceBIN:
2517 process_line = 1;
2518 break;
2519 case ceUNDEF:
2520 case ceSEVEN:
2521 case ceID:
2522 process_line = ((m_options & (1<<M_OPTION_PLAIN)) ?
2523 1 : mbw_prefix(plain_text_filter)(mbox, line));
2524 break;
2525 case ceQP:
2526 process_line =
2527 mbw_prefix(qp_line_filter)(&(mbox->mbw_prefix(qp_dc)), line);
2528 break;
2529 case ceB64:
2530 process_line =
2531 mbw_prefix(b64_line_filter)(&(mbox->mbw_prefix(b64_dc)), line);
2532 break;
2533 }
2534 break;
2535 case ctTEXT_RICH:
2536 case ctTEXT_HTML:
2537 case ctTEXT_XML:
2538 case ctTEXT_SGML:
2539 case ctTEXT_UNKNOWN:
2540 switch(mbox->body.encoding) {
2541 case ceBIN:
2542 case ceUNDEF:
2543 case ceSEVEN:
2544 case ceID:
2545 process_line = 1;
2546 break;
2547 case ceQP:
2548 process_line =
2549 mbw_prefix(qp_line_filter)(&(mbox->mbw_prefix(qp_dc)), line);
2550 break;
2551 case ceB64:
2552 process_line =
2553 mbw_prefix(b64_line_filter)(&(mbox->mbw_prefix(b64_dc)), line);
2554 break;
2555 }
2556 break;
2557 case ctIMAGE:
2558 case ctAUDIO:
2559 case ctVIDEO:
2560 case ctMODEL:
2561 case ctOTHER:
2562 process_line = 0;
2563 break;
2564 }
2565 }
2566 break;
2567 }
2568 break;
2569 case msATTACH:
2570 switch(mbox->substate) {
2571 case msuUNDEF:
2572 /* flush caches */
2573 process_line =
2574 mbw_prefix(flush_cache)(&(mbox->mbw_prefix(b64_dc)), line, 1) ||
2575 mbw_prefix(flush_cache)(&(mbox->mbw_prefix(qp_dc)), line, 1);
2576 if( process_line ) {
2577 /* we still remember previous type/encoding, decide if we need filter */
2578 force_filter = select_xml_defaults(&mbox->body);
2579 octet_stream = (mbox->body.type == ctOCTET_STREAM) ||
2580 (mbox->body.type == ctAPPLICATION_MSWORD);
2581 }
2582 /* attachments by default inherit the header mime types */
2583 mbox->body.type = mbox->header.type;
2584 mbox->body.encoding = mbox->header.encoding;
2585 /* switch to normal state next time */
2586 /* this has a nice side-effect: if the first line is a htCONT,
2587 then it gets displayed and that's the right thing to do,
2588 because if the first line is a htCONT, then the ATTACH header
2589 is not a header at all, ie the paragraph was misidentified. */
2590 mbox->substate = msuTRACK;
2591 mbox->plainstate = psPLAIN;
2592 break;
2593 default:
2594 /* switch substate if necessary */
/* unlike the msHEADER case, htSTANDARD is always tracked here */
2595 switch(mbw_prefix(scan_header_type)(mbox, line)) {
2596 case htSTANDARD:
2597 mbox->substate = msuTRACK;
2598 break;
2599 case htEXTENDED:
2600 mbox->substate = (m_options & (1<<M_OPTION_HEADERS)) ? msuTRACK : msuOTHER;
2601 break;
2602 case htTRACE:
2603 mbox->substate = (m_options & (1<<M_OPTION_THEADERS)) ? msuTRACK : msuOTHER;
2604 break;
2605 case htMIME:
2606 mbox->substate = msuMIME;
2607 break;
2608 case htCONT:
2609 /* nothing */
2610 break;
2611 case htUNDEF:
2612 mbox->substate = msuOTHER;
2613 break;
2614 }
2615 /* process substate */
2616 switch(mbox->substate) {
2617 case msuTRACK:
2618 process_line = 1;
2619 break;
2620 case msuUNDEF:
2621 case msuOTHER:
2622 case msuARMOR:
2623 process_line = 0;
2624 break;
2625 case msuMIME:
2626 mbw_prefix(mhe_line_filter)(line);
/* attachment headers update mbox->body, not mbox->header */
2627 mbw_prefix(extract_mime_types)(line, &mbox->body);
2628 mbox->skip_until_boundary =
2629 mbw_prefix(extract_mime_boundary)(mbox, line) || mbox->skip_until_boundary;
2630 /* this comes last, modifies line */
2631 process_line = mbw_prefix(extract_mime_label)(line);
2632 break;
2633 }
2634 break;
2635 }
2636 break;
2637 }
2638
2639 /* STEP 3: activate filters */
2640
/* when octet_stream is set, the result of strings1_filter() replaces
   whatever process_line was computed in STEP 2 */
2641 if( octet_stream ) {
2642 process_line = mbw_prefix(strings1_filter)(line);
2643 }
2644
2645 if( !process_line && line_empty ) {
2646 /* don't touch this: the end of file is artificially marked by an
2647 empty line */
2648 process_line =
2649 mbw_prefix(flush_cache)(&(mbox->mbw_prefix(b64_dc)), line, 1) ||
2650 mbw_prefix(flush_cache)(&(mbox->mbw_prefix(qp_dc)), line, 1);
2651 }
2652
/* force_filter is only set in STEP 2 when a cache flush at a substate
   change produced output; it was computed from mbox->body as it stood at
   the time of the flush */
2653 if( force_filter != xmlUNDEF ) {
2654 reset_xml_character_filter(xml, force_filter);
2655 } else {
2656 if( mbox->state == msBODY ) {
2657 if( mbox->skip_until_boundary ) {
2658 process_line = 0;
2659 } else {
2660 reset_xml_character_filter(xml, select_xml_defaults(&(mbox->body)));
2661 }
2662 } else {
2663 reset_xml_character_filter(xml, xmlDISABLE);
2664 }
2665 }
2666
2667 /* we also process empty lines, as they can be helpful for n-gram boundaries */
2668 return process_line || line_empty;
2669 }
2670
2671
2672
2673 /***********************************************************
2674 * HTML PARSING FUNCTIONS *
2675 ***********************************************************/
2676
2677 /*
2678 * this code generates decode_html_entity() and w_decode_html_entity()
2679 * (this is ugly, but I am _not_ building a string hash, sheesh).
2680 *
2681 * note: the conversion from unicode to multibyte depends on the current
2682 * locale, but also assumes that wchar_t *is* unicode internally. Both assumptions
2683 * can be false on weird compilers. In case the locale is incapable of doing the job,
2684 * we convert based on the hex code.
2685 *
2686 * note2: the conversion is not always faithful, even so. We try not to convert
2687 * characters which could look like control codes to the html parser later.
2688 *
2689 * note3: upon successful conversion, *qq is incremented by the
2690 * character, and *lline is incremented by the entity length - 1, so
2691 * that you still need to increment *lline by one to obtain the next
2692 * parseable input. If conversion is unsuccessful, the pointers are not
2693 * modified. Check the return value for success or failure.
2694 *
2695 * note4: this function really needs to be reworked a bit. It ought to be possible
2696 * to do the right thing even for machines with missing wchar_t.
2697 */
2698 static
mbw_prefix(decode_html_entity)2699 bool_t mbw_prefix(decode_html_entity)(mbw_t **lline, mbw_t **qq) {
2700 bool_t retval = 0;
2701 mbw_t *line = *lline;
2702 mbw_t *q = *qq;
2703 /* printf("\nline = %p q = %p (line - q) = %d\n", line, q, line - q); */
2704 /* printf("[[[%s]]]\n", line); */
2705 #if defined HAVE_MBRTOWC
2706
2707 mbw_t *r = NULL;
2708 #if defined MBW_MB
2709 int s,t;
2710 mbw_t scratch[16]; /* C compiler complains about MB_CUR_MAX */
2711 #endif
2712 wchar_t c = 0; /* this must always be wchar_t */
2713
2714 switch(line[1]) {
2715 case mbw_lit('#'):
2716 if( (line[2] == mbw_lit('x')) || (line[2] == mbw_lit('X')) ) {
2717 #if defined MBW_MB || (defined MBW_WIDE && defined HAVE_WCSTOL)
2718 c = (wchar_t)mbw_strtol(line + 3, &r, 16);
2719 #else
2720 /* can't convert, but skip the payload anyway */
2721 for(r = line + 3; isxdigit(*r); r++);
2722 #endif
2723 } else {
2724 #if defined MBW_MB || (defined MBW_WIDE && defined HAVE_WCSTOL)
2725 c = (wchar_t)mbw_strtol(line + 2, &r, 10);
2726 #else
2727 /* can't convert, but skip the payload anyway */
2728 for(r = line + 2; isdigit(*r); r++);
2729 #endif
2730 }
2731 break;
2732
2733 #define ENTITY(x,y,z) if( !mbw_strncmp((line + 3), (x + 2), (y - 2)) ) \
2734 { c = (z); r = line + (y) + 1; }
2735
2736 case mbw_lit('a'):
2737 switch(line[2]) {
2738 case mbw_lit('a'):
2739 ENTITY(mbw_lit("aacute"),6,0xe1);
2740 break;
2741 case mbw_lit('c'):
2742 ENTITY(mbw_lit("acute"),5,0xb4) else ENTITY(mbw_lit("acirc"),4,0xe2);
2743 break;
2744 case mbw_lit('e'):
2745 ENTITY(mbw_lit("aelig"),5,0xe6);
2746 break;
2747 case mbw_lit('g'):
2748 ENTITY(mbw_lit("agrave"),6,0xe0);
2749 break;
2750 case mbw_lit('l'):
2751 ENTITY(mbw_lit("alpha"),5,0x03b1) else ENTITY(mbw_lit("alefsym"),7,0x2135);
2752 break;
2753 case mbw_lit('m'):
2754 ENTITY(mbw_lit("amp"),3,0x26);
2755 break;
2756 case mbw_lit('n'):
2757 ENTITY(mbw_lit("ang"),3,0x2220) else ENTITY(mbw_lit("and"),3,0x2227);
2758 break;
2759 case mbw_lit('r'):
2760 ENTITY(mbw_lit("aring"),5,0xe5);
2761 break;
2762 case mbw_lit('s'):
2763 ENTITY(mbw_lit("asymp"),5,0x2248);
2764 break;
2765 case mbw_lit('t'):
2766 ENTITY(mbw_lit("atilde"),6,0xe3);
2767 break;
2768 case mbw_lit('u'):
2769 ENTITY(mbw_lit("auml"),4,0xe4);
2770 break;
2771 }
2772 break;
2773
2774 case mbw_lit('A'):
2775 switch(line[2]) {
2776 case mbw_lit('a'):
2777 ENTITY(mbw_lit("Aacute"),6,0xc1);
2778 break;
2779 case mbw_lit('c'):
2780 ENTITY(mbw_lit("Acirc"),5,0xc2);
2781 break;
2782 case mbw_lit('E'):
2783 ENTITY(mbw_lit("AElig"),5,0xc6);
2784 break;
2785 case mbw_lit('g'):
2786 ENTITY(mbw_lit("Agrave"),6,0xc0);
2787 break;
2788 case mbw_lit('l'):
2789 ENTITY(mbw_lit("Alpha"),5,0x0391);
2790 break;
2791 case mbw_lit('r'):
2792 ENTITY(mbw_lit("Aring"),5,0xc5);
2793 break;
2794 case mbw_lit('t'):
2795 ENTITY(mbw_lit("Atilde"),6,0xc3);
2796 break;
2797 case mbw_lit('u'):
2798 ENTITY(mbw_lit("Auml"),4,0xc4);
2799 break;
2800 }
2801 break;
2802
2803 case mbw_lit('b'):
2804 switch(line[2]) {
2805 case mbw_lit('d'):
2806 ENTITY(mbw_lit("bdquo"),5,0x201e);
2807 break;
2808 case mbw_lit('e'):
2809 ENTITY(mbw_lit("beta"),4,0x03b2);
2810 break;
2811 case mbw_lit('u'):
2812 ENTITY(mbw_lit("bull"),4,0x2022);
2813 break;
2814 }
2815 break;
2816
2817 case mbw_lit('B'):
2818 switch(line[2]) {
2819 case mbw_lit('e'):
2820 ENTITY(mbw_lit("Beta"),4,0x0392);
2821 break;
2822 case mbw_lit('r'):
2823 ENTITY(mbw_lit("Brvbar"),6,0xa6);
2824 break;
2825 }
2826 break;
2827
2828 case mbw_lit('c'):
2829 switch(line[2]) {
2830 case mbw_lit('a'):
2831 ENTITY(mbw_lit("cap"),3,0x2229);
2832 break;
2833 case mbw_lit('c'):
2834 ENTITY(mbw_lit("ccedil"),6,0xe7);
2835 break;
2836 case mbw_lit('e'):
2837 ENTITY(mbw_lit("cent"),4,0xa2) else ENTITY(mbw_lit("cedil"),5,0xb8);
2838 break;
2839 case mbw_lit('h'):
2840 ENTITY(mbw_lit("chi"),3,0x03c7);
2841 break;
2842 case mbw_lit('i'):
2843 ENTITY(mbw_lit("circ"),4,0x02c6);
2844 break;
2845 case mbw_lit('l'):
2846 ENTITY(mbw_lit("clubs"),5,0x2663);
2847 break;
2848 case mbw_lit('o'):
2849 ENTITY(mbw_lit("copy"),4,0xa9) else ENTITY(mbw_lit("cong"),4,0x2245);
2850 break;
2851 case mbw_lit('r'):
2852 ENTITY(mbw_lit("crarr"),5,0x21b5);
2853 break;
2854 case mbw_lit('u'):
2855 ENTITY(mbw_lit("curren"),6,0xa4) else ENTITY(mbw_lit("cup"),3,0x222a);
2856 break;
2857 }
2858 break;
2859
2860 case mbw_lit('C'):
2861 switch(line[2]) {
2862 case mbw_lit('c'):
2863 ENTITY(mbw_lit("Ccedil"),6,0xc7);
2864 break;
2865 case mbw_lit('h'):
2866 ENTITY(mbw_lit("Chi"),3,0x03a7);
2867 break;
2868 }
2869 break;
2870
2871 case mbw_lit('d'):
2872 switch(line[2]) {
2873 case mbw_lit('a'):
2874 ENTITY(mbw_lit("darr"),4,0x2193) else ENTITY(mbw_lit("dagger"),6,0x2020);
2875 break;
2876 case mbw_lit('A'):
2877 ENTITY(mbw_lit("dArr"),4,0x21d3);
2878 break;
2879 case mbw_lit('e'):
2880 ENTITY(mbw_lit("delta"),5,0x03b4);
2881 break;
2882 case mbw_lit('i'):
2883 ENTITY(mbw_lit("divide"),6,0xf7) else ENTITY(mbw_lit("diams"),5,0x2666);
2884 break;
2885 }
2886 break;
2887
2888 case mbw_lit('D'):
2889 switch(line[2]) {
2890 case mbw_lit('a'):
2891 ENTITY(mbw_lit("Dagger"),6,0x2021);
2892 break;
2893 case mbw_lit('e'):
2894 ENTITY(mbw_lit("Deg"),3,0xb0) else ENTITY(mbw_lit("Delta"),5,0x0394);
2895 break;
2896 }
2897 break;
2898
2899 case mbw_lit('e'):
2900 switch(line[2]) {
2901 case mbw_lit('a'):
2902 ENTITY(mbw_lit("eacute"),6,0xe9);
2903 break;
2904 case mbw_lit('c'):
2905 ENTITY(mbw_lit("ecirc"),5,0xea);
2906 break;
2907 case mbw_lit('g'):
2908 ENTITY(mbw_lit("egrave"),6,0xe8);
2909 break;
2910 case mbw_lit('m'):
2911 ENTITY(mbw_lit("empty"),5,0x2205) else ENTITY(mbw_lit("emsp"),4,0x2003);
2912 break;
2913 case mbw_lit('n'):
2914 ENTITY(mbw_lit("ensp"),4,0x2002);
2915 break;
2916 case mbw_lit('p'):
2917 ENTITY(mbw_lit("epsilon"),7,0x03b5);
2918 break;
2919 case mbw_lit('q'):
2920 ENTITY(mbw_lit("equiv"),5,0x2261);
2921 break;
2922 case mbw_lit('t'):
2923 ENTITY(mbw_lit("eth"),3,0xf0) else ENTITY(mbw_lit("eta"),3,0x03b7);
2924 break;
2925 case mbw_lit('u'):
2926 ENTITY(mbw_lit("euml"),4,0xeb) else ENTITY(mbw_lit("euro"),4,0x20ac);
2927 break;
2928 case mbw_lit('x'):
2929 ENTITY(mbw_lit("exist"),5,0x2203);
2930 break;
2931 }
2932 break;
2933
2934 case mbw_lit('E'):
2935 switch(line[2]) {
2936 case mbw_lit('a'):
2937 ENTITY(mbw_lit("Eacute"),6,0xc9);
2938 break;
2939 case mbw_lit('c'):
2940 ENTITY(mbw_lit("Ecirc"),5,0xca);
2941 break;
2942 case mbw_lit('g'):
2943 ENTITY(mbw_lit("Egrave"),6,0xc8);
2944 break;
2945 case mbw_lit('p'):
2946 ENTITY(mbw_lit("Epsilon"),7,0x0395);
2947 break;
2948 case mbw_lit('T'):
2949 ENTITY(mbw_lit("ETH"),3,0xd0);
2950 break;
2951 case mbw_lit('t'):
2952 ENTITY(mbw_lit("Eta"),3,0x0397);
2953 break;
2954 case mbw_lit('u'):
2955 ENTITY(mbw_lit("Euml"),4,0xcb);
2956 break;
2957 }
2958 break;
2959
2960 case mbw_lit('f'):
2961 switch(line[2]) {
2962 case mbw_lit('n'):
2963 ENTITY(mbw_lit("fnof"),4,0x0192);
2964 break;
2965 case mbw_lit('o'):
2966 ENTITY(mbw_lit("forall"),6,0x2200);
2967 break;
2968 case mbw_lit('r'):
2969 ENTITY(mbw_lit("frac14"),6,0xbc) else ENTITY(mbw_lit("frac12"),6,0xbd) else
2970 ENTITY(mbw_lit("frac34"),6,0xbe) else ENTITY(mbw_lit("frasl"),5,0x2044);
2971 break;
2972 }
2973 break;
2974
2975 case mbw_lit('F'):
2976 /* nothing */
2977 break;
2978
2979 case mbw_lit('g'):
2980 switch(line[2]) {
2981 case mbw_lit('a'):
2982 ENTITY(mbw_lit("gamma"),5,0x3b3);
2983 break;
2984 case mbw_lit('e'):
2985 ENTITY(mbw_lit("ge"),2,0x2265);
2986 break;
2987 case mbw_lit('t'):
2988 ENTITY(mbw_lit("gt"),2,0x3e);
2989 break;
2990 }
2991 break;
2992
2993 case mbw_lit('G'):
2994 switch(line[2]) {
2995 case mbw_lit('a'):
2996 ENTITY(mbw_lit("Gamma"),5,0x0393);
2997 break;
2998 }
2999 break;
3000
3001 case mbw_lit('h'):
3002 switch(line[2]) {
3003 case mbw_lit('a'):
3004 ENTITY(mbw_lit("harr"),4,0x2194);
3005 break;
3006 case mbw_lit('A'):
3007 ENTITY(mbw_lit("hArr"),4,0x21d4);
3008 break;
3009 case mbw_lit('e'):
3010 ENTITY(mbw_lit("hearts"),6,0x2665) else ENTITY(mbw_lit("hellip"),6,0x2026);
3011 break;
3012 }
3013 break;
3014
3015 case mbw_lit('H'):
3016 /* nothing */
3017 break;
3018
3019 case mbw_lit('i'):
3020 switch(line[2]) {
3021 case mbw_lit('a'):
3022 ENTITY(mbw_lit("iacute"),6,0xed);
3023 break;
3024 case mbw_lit('c'):
3025 ENTITY(mbw_lit("icirc"),5,0xee);
3026 break;
3027 case mbw_lit('e'):
3028 ENTITY(mbw_lit("iexcl"),5,0xa1);
3029 break;
3030 case mbw_lit('g'):
3031 ENTITY(mbw_lit("igrave"),6,0xec);
3032 break;
3033 case mbw_lit('m'):
3034 ENTITY(mbw_lit("image"),5,0x2111);
3035 break;
3036 case mbw_lit('n'):
3037 ENTITY(mbw_lit("infin"),5,0x221e) else ENTITY(mbw_lit("int"),3,0x222b);
3038 break;
3039 case mbw_lit('o'):
3040 ENTITY(mbw_lit("iota"),4,0x03b9);
3041 break;
3042 case mbw_lit('q'):
3043 ENTITY(mbw_lit("iquest"),6,0xbf);
3044 break;
3045 case mbw_lit('s'):
3046 ENTITY(mbw_lit("isin"),4,0x2208);
3047 break;
3048 case mbw_lit('u'):
3049 ENTITY(mbw_lit("iuml"),4,0xef);
3050 break;
3051 }
3052 break;
3053
3054 case mbw_lit('I'):
3055 switch(line[2]) {
3056 case mbw_lit('a'):
3057 ENTITY(mbw_lit("Iacute"),6,0xcd);
3058 break;
3059 case mbw_lit('c'):
3060 ENTITY(mbw_lit("Icirc"),5,0xce);
3061 break;
3062 case mbw_lit('g'):
3063 ENTITY(mbw_lit("Igrave"),6,0xcc);
3064 break;
3065 case mbw_lit('o'):
3066 ENTITY(mbw_lit("Iota"),4,0x0399);
3067 break;
3068 case mbw_lit('u'):
3069 ENTITY(mbw_lit("Iuml"),4,0xcf);
3070 break;
3071 }
3072 break;
3073
3074 case mbw_lit('j'):
3075 /* nothing */
3076 break;
3077
3078 case mbw_lit('J'):
3079 /* nothing */
3080 break;
3081
3082 case mbw_lit('k'):
3083 switch(line[2]) {
3084 case mbw_lit('a'):
3085 ENTITY(mbw_lit("kappa"),5,0x03ba);
3086 break;
3087 }
3088 break;
3089
3090 case mbw_lit('K'):
3091 switch(line[2]) {
3092 case mbw_lit('a'):
3093 ENTITY(mbw_lit("Kappa"),5,0x039a);
3094 break;
3095 }
3096 break;
3097
3098 case mbw_lit('l'):
3099 switch(line[2]) {
3100 case mbw_lit('a'):
3101 ENTITY(mbw_lit("lambda"),6,0x03bb) else ENTITY(mbw_lit("lang"),4,0x2329);
3102 break;
3103 case mbw_lit('A'):
3104 ENTITY(mbw_lit("lArr"),4,0x21d0);
3105 break;
3106 case mbw_lit('c'):
3107 ENTITY(mbw_lit("lceil"),5,0x2308);
3108 break;
3109 case mbw_lit('d'):
3110 ENTITY(mbw_lit("ldquo"),5,0x201c);
3111 break;
3112 case mbw_lit('e'):
3113 ENTITY(mbw_lit("le"),2,0x2264);
3114 break;
3115 case mbw_lit('f'):
3116 ENTITY(mbw_lit("lfloor"),6,0x2309);
3117 break;
3118 case mbw_lit('o'):
3119 ENTITY(mbw_lit("lowast"),6,0x2217) else ENTITY(mbw_lit("loz"),3,0x25ca);
3120 break;
3121 case mbw_lit('r'):
3122 ENTITY(mbw_lit("lrm"),3,0x200e);
3123 break;
3124 case mbw_lit('s'):
3125 ENTITY(mbw_lit("lsquo"),5,0x2018) else ENTITY(mbw_lit("lsaquo"),6,0x2039);
3126 break;
3127 case mbw_lit('t'):
3128 ENTITY(mbw_lit("lt"),2,0x3c);
3129 break;
3130 }
3131 break;
3132
3133 case mbw_lit('L'):
3134 switch(line[2]) {
3135 case mbw_lit('a'):
3136 ENTITY(mbw_lit("Laquo"),5,0xab) else ENTITY(mbw_lit("Lambda"),6,0x039b) else
3137 ENTITY(mbw_lit("Larr"),4,0x2190);
3138 break;
3139 }
3140 break;
3141
3142 case mbw_lit('m'):
3143 switch(line[2]) {
3144 case mbw_lit('d'):
3145 ENTITY(mbw_lit("mdash"),5,0x2014);
3146 break;
3147 case mbw_lit('i'):
3148 ENTITY(mbw_lit("minus"),5,0x2212);
3149 break;
3150 case mbw_lit('u'):
3151 ENTITY(mbw_lit("mu"),2,0x03bc);
3152 break;
3153 }
3154 break;
3155
3156 case mbw_lit('M'):
3157 switch(line[2]) {
3158 case mbw_lit('a'):
3159 ENTITY(mbw_lit("Macr"),4,0xaf);
3160 break;
3161 case mbw_lit('i'):
3162 ENTITY(mbw_lit("Micro"),5,0xb5) else ENTITY(mbw_lit("Middot"),6,0xb7);
3163 break;
3164 case mbw_lit('u'):
3165 ENTITY(mbw_lit("Mu"),2,0x039c);
3166 break;
3167 }
3168 break;
3169
3170 case mbw_lit('n'):
3171 switch(line[2]) {
3172 case mbw_lit('a'):
3173 ENTITY(mbw_lit("nabla"),5,0x2207);
3174 break;
3175 case mbw_lit('b'):
3176 ENTITY(mbw_lit("nbsp"),4,0xa0);
3177 break;
3178 case mbw_lit('d'):
3179 ENTITY(mbw_lit("ndash"),5,0x2013);
3180 break;
3181 case mbw_lit('e'):
3182 ENTITY(mbw_lit("ne"),2,0x2260);
3183 break;
3184 case mbw_lit('i'):
3185 ENTITY(mbw_lit("ni"),2,0x220b);
3186 break;
3187 case mbw_lit('o'):
3188 ENTITY(mbw_lit("not"),3,0xac) else ENTITY(mbw_lit("notin"),5,0x2209);
3189 break;
3190 case mbw_lit('s'):
3191 ENTITY(mbw_lit("nsub"),4,0x2284);
3192 break;
3193 case mbw_lit('t'):
3194 ENTITY(mbw_lit("ntilde"),6,0xf1);
3195 break;
3196 case mbw_lit('u'):
3197 ENTITY(mbw_lit("nu"),2,0x03bd);
3198 break;
3199 }
3200 break;
3201
3202 case mbw_lit('N'):
3203 switch(line[2]) {
3204 case mbw_lit('t'):
3205 ENTITY(mbw_lit("Ntilde"),6,0xd1);
3206 break;
3207 case mbw_lit('u'):
3208 ENTITY(mbw_lit("Nu"),2,0x039d);
3209 break;
3210 }
3211 break;
3212
3213 case mbw_lit('o'):
3214 switch(line[2]) {
3215 case mbw_lit('a'):
3216 ENTITY(mbw_lit("oacute"),6,0xf3);
3217 break;
3218 case mbw_lit('c'):
3219 ENTITY(mbw_lit("ocirc"),5,0xf4);
3220 break;
3221 case mbw_lit('e'):
3222 ENTITY(mbw_lit("oelig"),5,0x0153);
3223 break;
3224 case mbw_lit('g'):
3225 ENTITY(mbw_lit("ograve"),6,0xf2);
3226 break;
3227 case mbw_lit('l'):
3228 ENTITY(mbw_lit("oline"),5,0x203e);
3229 break;
3230 case mbw_lit('m'):
3231 ENTITY(mbw_lit("omicron"),7,0x03bf) else ENTITY(mbw_lit("omega"),5,0x03c9);
3232 break;
3233 case mbw_lit('p'):
3234 ENTITY(mbw_lit("oplus"),5,0x2295);
3235 break;
3236 case mbw_lit('r'):
3237 ENTITY(mbw_lit("ordf"),4,0xaa) else ENTITY(mbw_lit("ordm"),4,0xba) else
3238 ENTITY(mbw_lit("or"),2,0x2228);
3239 break;
3240 case mbw_lit('s'):
3241 ENTITY(mbw_lit("oslash"),6,0xf8);
3242 break;
3243 case mbw_lit('t'):
3244 ENTITY(mbw_lit("otilde"),6,0xf5) else ENTITY(mbw_lit("otimes"),6,0x2297);
3245 break;
3246 case mbw_lit('u'):
3247 ENTITY(mbw_lit("ouml"),4,0xf6);
3248 break;
3249 }
3250 break;
3251
3252 case mbw_lit('O'):
3253 switch(line[2]) {
3254 case mbw_lit('a'):
3255 ENTITY(mbw_lit("Oacute"),6,0xd3);
3256 break;
3257 case mbw_lit('c'):
3258 ENTITY(mbw_lit("Ocirc"),5,0xd4);
3259 break;
3260 case mbw_lit('E'):
3261 ENTITY(mbw_lit("OElig"),5,0x0152);
3262 break;
3263 case mbw_lit('m'):
3264 ENTITY(mbw_lit("Omicron"),7,0x039f) else ENTITY(mbw_lit("Omega"),5,0x03a9);
3265 break;
3266 case mbw_lit('g'):
3267 ENTITY(mbw_lit("Ograve"),6,0xd2);
3268 break;
3269 case mbw_lit('s'):
3270 ENTITY(mbw_lit("Oslash"),6,0xd8);
3271 break;
3272 case mbw_lit('t'):
3273 ENTITY(mbw_lit("Otilde"),6,0xd5);
3274 break;
3275 case mbw_lit('u'):
3276 ENTITY(mbw_lit("Ouml"),4,0xd6);
3277 break;
3278 }
3279 break;
3280
3281 case mbw_lit('p'):
3282 switch(line[2]) {
3283 case mbw_lit('a'):
3284 ENTITY(mbw_lit("part"),4,0x2202);
3285 break;
3286 case mbw_lit('e'):
3287 ENTITY(mbw_lit("perp"),4,0x22a5) else ENTITY(mbw_lit("permil"),6,0x2030);
3288 break;
3289 case mbw_lit('h'):
3290 ENTITY(mbw_lit("phi"),3,0x03c6);
3291 break;
3292 case mbw_lit('i'):
3293 ENTITY(mbw_lit("pi"),2,0x03c0) else ENTITY(mbw_lit("piv"),3,0x03d6);
3294 break;
3295 case mbw_lit('r'):
3296 ENTITY(mbw_lit("prime"),5,0x2032) else ENTITY(mbw_lit("prod"),4,0x220f);
3297 break;
3298 case mbw_lit('s'):
3299 ENTITY(mbw_lit("psi"),3,0x03c8);
3300 break;
3301 }
3302 break;
3303
3304 case mbw_lit('P'):
3305 switch(line[2]) {
3306 case mbw_lit('a'):
3307 ENTITY(mbw_lit("Para"),4,0xb6);
3308 break;
3309 case mbw_lit('h'):
3310 ENTITY(mbw_lit("Phi"),3,0x03a6);
3311 break;
3312 case mbw_lit('i'):
3313 ENTITY(mbw_lit("Pi"),2,0x03a0);
3314 break;
3315 case mbw_lit('l'):
3316 ENTITY(mbw_lit("Plusmn"),6,0xb1);
3317 break;
3318 case mbw_lit('o'):
3319 ENTITY(mbw_lit("Pound"),5,0xa3);
3320 break;
3321 case mbw_lit('r'):
3322 ENTITY(mbw_lit("Prime"),5,0x2033) else ENTITY(mbw_lit("Prop"),4,0x221d);
3323 break;
3324 case mbw_lit('s'):
3325 ENTITY(mbw_lit("Psi"),3,0x03a8);
3326 break;
3327 }
3328 break;
3329
3330 case mbw_lit('q'):
3331 switch(line[2]) {
3332 case mbw_lit('u'):
3333 ENTITY(mbw_lit("quot"),4,0x22);
3334 break;
3335 }
3336 break;
3337
3338 case mbw_lit('Q'):
3339 /* nothing */
3340 break;
3341
3342 case mbw_lit('r'):
3343 switch(line[2]) {
3344 case mbw_lit('a'):
3345 ENTITY(mbw_lit("rarr"),4,0x2192) else ENTITY(mbw_lit("radic"),5,0x221a) else
3346 ENTITY(mbw_lit("rang"),4,0x232a);
3347 break;
3348 case mbw_lit('A'):
3349 ENTITY(mbw_lit("rArr"),4,0x21d2);
3350 break;
3351 case mbw_lit('c'):
3352 ENTITY(mbw_lit("rceil"),5,0x2309);
3353 break;
3354 case mbw_lit('d'):
3355 ENTITY(mbw_lit("rdquo"),5,0x201d);
3356 break;
3357 case mbw_lit('e'):
3358 ENTITY(mbw_lit("real"),4,0x211C) else ENTITY(mbw_lit("reg"),3,0xae);
3359 break;
3360 case mbw_lit('f'):
3361 ENTITY(mbw_lit("rfloor"),6,0x230a);
3362 break;
3363 case mbw_lit('h'):
3364 ENTITY(mbw_lit("rho"),3,0x03c1);
3365 break;
3366 case mbw_lit('l'):
3367 ENTITY(mbw_lit("rlm"),3,0x200f);
3368 break;
3369 case mbw_lit('s'):
3370 ENTITY(mbw_lit("rsquo"),5,0x2019) else ENTITY(mbw_lit("rsaquo"),6,0x203a);
3371 break;
3372 }
3373 break;
3374
3375 case mbw_lit('R'):
3376 switch(line[2]) {
3377 case mbw_lit('a'):
3378 ENTITY(mbw_lit("Raquo"),5,0xbb);
3379 break;
3380 case mbw_lit('e'):
3381 ENTITY(mbw_lit("Reg"),3,0xae);
3382 break;
3383 case mbw_lit('h'):
3384 ENTITY(mbw_lit("Rho"),3,0x03a1);
3385 break;
3386 }
3387 break;
3388
3389 case mbw_lit('s'):
3390 switch(line[2]) {
3391 case mbw_lit('b'):
3392 ENTITY(mbw_lit("sbquo"),5,0x201a);
3393 break;
3394 case mbw_lit('c'):
3395 ENTITY(mbw_lit("scaron"),6,0x0161);
3396 break;
3397 case mbw_lit('d'):
3398 ENTITY(mbw_lit("sdot"),4,0x22c5);
3399 break;
3400 case mbw_lit('e'):
3401 ENTITY(mbw_lit("sect"),4,0xa7);
3402 break;
3403 case mbw_lit('h'):
3404 ENTITY(mbw_lit("shy"),3,0xad);
3405 break;
3406 case mbw_lit('i'):
3407 ENTITY(mbw_lit("sigmaf"),6,0x03c2) else ENTITY(mbw_lit("sigma"),5,0x03c3) else
3408 ENTITY(mbw_lit("sim"),3,0x223c);
3409 break;
3410 case mbw_lit('p'):
3411 ENTITY(mbw_lit("spades"),6,0x2660);
3412 break;
3413 case mbw_lit('u'):
3414 ENTITY(mbw_lit("sup2"),4,0xb2) else ENTITY(mbw_lit("sup3"),4,0xb3) else
3415 ENTITY(mbw_lit("sup1"),4,0xb9) else ENTITY(mbw_lit("sum"),3,0x2211) else
3416 ENTITY(mbw_lit("sub"),3,0x2282) else ENTITY(mbw_lit("sup"),3,0x2283) else
3417 ENTITY(mbw_lit("sube"),4,0x2286) else ENTITY(mbw_lit("supe"),4,0x2287);
3418 break;
3419 case mbw_lit('z'):
3420 ENTITY(mbw_lit("szlig"),5,0xdf);
3421 break;
3422 }
3423 break;
3424
3425 case mbw_lit('S'):
3426 switch(line[2]) {
3427 case mbw_lit('c'):
3428 ENTITY(mbw_lit("Scaron"),6,0x0160);
3429 break;
3430 case mbw_lit('i'):
3431 ENTITY(mbw_lit("Sigma"),5,0x03a3);
3432 break;
3433 }
3434 break;
3435
3436 case mbw_lit('t'):
3437 switch(line[2]) {
3438 case mbw_lit('a'):
3439 ENTITY(mbw_lit("tau"),3,0x03c4);
3440 break;
3441 case mbw_lit('h'):
3442 ENTITY(mbw_lit("thorn"),5,0xfe) else ENTITY(mbw_lit("theta"),5,0x03b8) else
3443 ENTITY(mbw_lit("thetasym"),8,0x03d1) else ENTITY(mbw_lit("there4"),6,0x2234) else
3444 ENTITY(mbw_lit("thinsp"),6,0x2009);
3445 break;
3446 case mbw_lit('i'):
3447 ENTITY(mbw_lit("times"),5,0xd7) else ENTITY(mbw_lit("tilde"),5,0x02dc);
3448 break;
3449 case mbw_lit('r'):
3450 ENTITY(mbw_lit("trade"),5,0x2122);
3451 break;
3452 }
3453 break;
3454
3455 case mbw_lit('T'):
3456 switch(line[2]) {
3457 case mbw_lit('a'):
3458 ENTITY(mbw_lit("Tau"),3,0x03a4);
3459 break;
3460 case mbw_lit('h'):
3461 ENTITY(mbw_lit("Theta"),5,0x0398);
3462 break;
3463 case mbw_lit('H'):
3464 ENTITY(mbw_lit("THORN"),5,0xde);
3465 break;
3466 }
3467 break;
3468
3469 case mbw_lit('u'):
3470 switch(line[2]) {
3471 case mbw_lit('a'):
3472 ENTITY(mbw_lit("uacute"),6,0xfa) else ENTITY(mbw_lit("uarr"),4,0x2191);
3473 break;
3474 case mbw_lit('A'):
3475 ENTITY(mbw_lit("uArr"),4,0x21d1);
3476 break;
3477 case mbw_lit('c'):
3478 ENTITY(mbw_lit("ucirc"),5,0xfb);
3479 break;
3480 case mbw_lit('g'):
3481 ENTITY(mbw_lit("ugrave"),6,0xf9);
3482 break;
3483 case mbw_lit('m'):
3484 ENTITY(mbw_lit("uml"),3,0xa8);
3485 break;
3486 case mbw_lit('p'):
3487 ENTITY(mbw_lit("upsilon"),7,0xc5) else ENTITY(mbw_lit("upsih"),5,0x03d2);
3488 break;
3489 case mbw_lit('u'):
3490 ENTITY(mbw_lit("uuml"),4,0xfc);
3491 break;
3492 }
3493 break;
3494
3495 case mbw_lit('U'):
3496 switch(line[2]) {
3497 case mbw_lit('a'):
3498 ENTITY(mbw_lit("Uacute"),6,0xda);
3499 break;
3500 case mbw_lit('c'):
3501 ENTITY(mbw_lit("Ucirc"),5,0xdb);
3502 break;
3503 case mbw_lit('g'):
3504 ENTITY(mbw_lit("Ugrave"),6,0xd9);
3505 break;
3506 case mbw_lit('p'):
3507 ENTITY(mbw_lit("Upsilon"),7,0xa5);
3508 break;
3509 case mbw_lit('u'):
3510 ENTITY(mbw_lit("Uuml"),4,0xdc);
3511 break;
3512 }
3513 break;
3514
3515 case mbw_lit('v'):
3516 /* nothing */
3517 break;
3518
3519 case mbw_lit('V'):
3520 /* nothing */
3521 break;
3522
3523 case mbw_lit('w'):
3524 switch(line[2]) {
3525 case mbw_lit('e'):
3526 ENTITY(mbw_lit("weierp"),6,0x2118);
3527 break;
3528 }
3529 break;
3530
3531 case mbw_lit('W'):
3532 /* nothing */
3533 break;
3534
3535 case mbw_lit('x'):
3536 switch(line[2]) {
3537 case mbw_lit('i'):
3538 ENTITY(mbw_lit("xi"),2,0x03be);
3539 break;
3540 }
3541 break;
3542
3543 case mbw_lit('X'):
3544 switch(line[2]) {
3545 case mbw_lit('i'):
3546 ENTITY(mbw_lit("Xi"),2,0x039e);
3547 break;
3548 }
3549 break;
3550
3551 case mbw_lit('y'):
3552 switch(line[2]) {
3553 case mbw_lit('a'):
3554 ENTITY(mbw_lit("yacute"),6,0xfd);
3555 break;
3556 case mbw_lit('e'):
3557 ENTITY(mbw_lit("yen"),3,0xa5);
3558 break;
3559 case mbw_lit('u'):
3560 ENTITY(mbw_lit("yuml"),4,0xff);
3561 break;
3562 }
3563 break;
3564
3565 case mbw_lit('Y'):
3566 switch(line[2]) {
3567 case mbw_lit('a'):
3568 ENTITY(mbw_lit("Yacute"),6,0xdd);
3569 break;
3570 case mbw_lit('u'):
3571 ENTITY(mbw_lit("Yuml"),4,0x0178);
3572 break;
3573 }
3574 break;
3575
3576 case mbw_lit('z'):
3577 switch(line[2]) {
3578 case mbw_lit('e'):
3579 ENTITY(mbw_lit("zeta"),4,0x03b6);
3580 break;
3581 case mbw_lit('w'):
3582 ENTITY(mbw_lit("zwnj"),4,0x200c) else ENTITY(mbw_lit("zwj"),3,0x200d);
3583 break;
3584 }
3585 break;
3586
3587 case mbw_lit('Z'):
3588 switch(line[2]) {
3589 case mbw_lit('e'):
3590 ENTITY(mbw_lit("Zeta"),4,0x0396);
3591 break;
3592 }
3593 break;
3594
3595 default:
3596 break;
3597 }
3598
3599 /* some values of c are not allowed, because they interfere with the
3600 html parser */
3601 switch(c) {
3602 case L'\0':
3603 case L'\001': /* TOKENSEP */
3604 case L'\002': /* CLASSEP */
3605 case L'\003': /* DIAMOND */
3606 /* reserved control codes */
3607 c = L' '; break;
3608 case L'<': c = L'('; break;
3609 case L'>': c = L')'; break;
3610 default: break;
3611 }
3612
3613 /* normally, entities end with ';', which will be skipped
3614 after we exit this function. However, we're lenient: if
3615 we don't point to ';', then we back up by one so that later
3616 we don't skip this character. Note this is safe, because
3617 ENTITY makes r point to at least line +1, and otherwise r is NULL */
3618 if( r && (*r != mbw_lit(';')) ) { r--; }
3619
3620 #if defined MBW_WIDE
3621
3622 if( c && r ) {
3623 *q++ = c;
3624 line = r;
3625 retval = 1;
3626 } else {
3627 /* do nothing */
3628 }
3629 #else
3630
3631 /* now if c is nonzero, then we found the entity */
3632 if( c && r ) {
3633 if( c == 0xa0 ) {
3634 /* shortcut for &nbsp; (0xa0): emit a plain space instead */
3635 *q++ = mbw_lit(' ');
3636 } else {
3637 s = wcrtomb(scratch,c,NULL);
3638 if( (s > -1) && (q + s <= r) ) {
3639 for(t = 0; t < s; t++) { *q++ = scratch[t]; }
3640 } else {
3641 /* locale doesn't recognize this char */
3642 s = c;
3643 if( s < 0xFF ) { *q++ = (mbw_t)s; }
3644 }
3645 }
3646 line = r;
3647 retval = 1;
3648 } else {
3649 /* do nothing */
3650 }
3651 #endif
3652
3653 #else /* HAVE_MBRTOWC is not defined */
3654
3655 /* do nothing */
3656
3657 #endif
3658
3659 /* reminder: if no conversion is possible, we do nothing -
3660 I always forget this and try to update q and line, thereby
3661 introducing bugs */
3662 *lline = line;
3663 *qq = q;
3664 return retval;
3665 }
3666
3667 /*
3668 * this code generates decode_escaped_uri_character() and
3669 * w_decode_escaped_uri_character().
3670 *
3671 * NOTE: this doesn't cope correctly with the case that the
3672 * URI encoded character is itself encoded as html entities.
3673 * For example, %20 can itself be encoded as %20
3674 * Since the first char is decoded as '%' (otherwise we wouldn't
3675 * be inside decode_uri_character(), the function effectively
3676 * tries to decode %20 and fails. This is harmless, as
3677 * the ultimately decode line will be %20 instead of ' '.
3678 */
mbw_prefix(decode_uri_character)3679 void mbw_prefix(decode_uri_character)(mbw_t **lline, mbw_t **qq) {
3680 mbw_t *line = *lline;
3681 mbw_t *q = *qq;
3682 mbw_t scratch[3];
3683 mbw_t c = 0;
3684 mbw_t *r;
3685
3686 if( *line == mbw_lit('%') ) {
3687 #if defined MBW_MB || (defined MBW_WIDE && defined HAVE_WCSTOL)
3688 /* check that the next two chars are hex */
3689 for(r = line + 1; mbw_isspace(*r); r++);
3690 if( mbw_isxdigit(*r) ) {
3691 scratch[0] = *r;
3692 for(r++; mbw_isspace(*r); r++);
3693 if( mbw_isxdigit(*r) ) {
3694 scratch[1] = *r;
3695 scratch[2] = mbw_lit('\0');
3696 c = (mbw_t)mbw_strtol(scratch, NULL, 16);
3697 }
3698 }
3699 #endif
3700 if( c ) {
3701 *q++ = c;
3702 line = r;
3703 } else {
3704 *q++ = *line;
3705 }
3706
3707 } else if( mbw_isspace(*line) ) {
3708 /* ignore */
3709 } else {
3710 /* not an escaped character */
3711 *q++ = *line;
3712 }
3713
3714 *lline = line;
3715 *qq = q;
3716 }
3717
3718 /* this reads one or more characters from line and outputs
3719 * zero or one character at q.
3720 * The characters output depend on the xml.attribute state, but
3721 * the function never outputs more than it reads. Thus if q <= line
3722 * to begin with, this is preserved. This is necessary as the function
3723 * is normally used to modify line in-place.
3724 *
3725 * For URL type attributes, this function assumes the standard URI
3726 * form scheme:netloc/extra, and only prints the netloc part.
3727 *
3728 * A special case is the URI javascript:xxxxx, which is treated differently.
3729 *
3730 * CAUTION: if you increment qq, don't also call decode_uri_character().
3731 * Just do one or the other.
3732 */
3733 static
mbw_prefix(xml_attribute_filter)3734 void mbw_prefix(xml_attribute_filter)(XML_State *xml, mbw_t **lline, mbw_t **qq) {
3735
3736 if( (xml->attribute != UNDEF) && (*(*lline) == mbw_lit('&')) ) {
3737 if( mbw_prefix(decode_html_entity)(lline, qq) ) {
3738 (*lline)++;
3739 }
3740 }
3741
3742 switch(xml->attribute) {
3743 case SRC:
3744 if( *(*lline) == mbw_lit(':') ) {
3745 xml->attribute = SRC_NETLOC_PREFIX;
3746 *(*qq)++ = *(*lline);
3747 } else if( mbw_strncasecmp(*lline, mbw_lit("javascript:"), 11) == 0 ) {
3748 /* scripts are dealt differently */
3749 xml->attribute = JSCRIPT;
3750 if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
3751 *(*qq)++ = *(*lline);
3752 }
3753 } else {
3754 *(*qq)++ = *(*lline);
3755 }
3756 break;
3757 case SRC_NETLOC_PREFIX:
3758 switch(*(*lline)) {
3759 case mbw_lit('/'):
3760 *(*qq)++ = *(*lline);
3761 break;
3762 case mbw_lit('?'):
3763 case mbw_lit(';'):
3764 case mbw_lit('#'):
3765 case mbw_lit('&'):
3766 xml->attribute = SRC_NETLOC_SUFFIX;
3767 *(*qq)++ = *(*lline);
3768 break;
3769 default:
3770 mbw_prefix(decode_uri_character)(lline,qq);
3771 xml->attribute = SRC_NETLOC;
3772 break;
3773 }
3774 break;
3775 case SRC_NETLOC:
3776 switch(*(*lline)) {
3777 case mbw_lit('/'):
3778 *(*qq)++ = *(*lline);
3779 xml->attribute = SRC_NETLOC_PATH;
3780 break;
3781 case mbw_lit('?'):
3782 case mbw_lit(';'):
3783 case mbw_lit('#'):
3784 case mbw_lit('&'):
3785 xml->attribute = SRC_NETLOC_SUFFIX;
3786 break;
3787 default:
3788 mbw_prefix(decode_uri_character)(lline,qq);
3789 break;
3790 }
3791 break;
3792 case SRC_NETLOC_PATH:
3793 switch(*(*lline)) {
3794 case mbw_lit('.'):
3795 *(*qq)++ = *(*lline);
3796 break;
3797 case mbw_lit('?'):
3798 case mbw_lit(';'):
3799 case mbw_lit('#'):
3800 case mbw_lit('&'):
3801 xml->attribute = SRC_NETLOC_SUFFIX;
3802 *(*qq)++ = *(*lline);
3803 break;
3804 default:
3805 mbw_prefix(decode_uri_character)(lline,qq);
3806 break;
3807 }
3808 break;
3809 case SRC_NETLOC_SUFFIX:
3810 /* *(*qq)++ = mbw_lit(' '); */
3811 *(*qq)++ = *(*lline);
3812 break;
3813 case ALT:
3814 *(*qq)++ = *(*lline);
3815 break;
3816 case UNDEF:
3817 /* nothing */
3818 break;
3819 case JSCRIPT:
3820 if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
3821 *(*qq)++ = *(*lline);
3822 }
3823 break;
3824 case ASTYLE:
3825 if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
3826 *(*qq)++ = *(*lline);
3827 }
3828 break;
3829 }
3830 }
3831
3832 /* Removes tags in the string - modifies in place
3833 * the name of this function is a misnomer, since it doesn't
3834 * parse xml properly.
3835 *
3836 * The filter can be called in several "modes" selected by the
3837 * xml.parser variable.
3838 *
3839 * The simplest parsing is xpDUMB mode, which simply skips XML like
3840 * tags without looking inside them.
3841 *
3842 * For HTML parsing, there is xpHTML mode, and its counterpart xpSMART
3843 * mode. xpHTML looks inside common HTML tags and can print the contents
3844 * of attributes. To explain xpSMART mode, remember that HTML documents
3845 * should normally be written inside <html> and </html> tags. If these
3846 * tags are found, then everything outside them is handled by xpSMART mode.
3847 *
3848 * Thus, in particular, a new document should be started in xpSMART mode.
3849 * For text documents, this ensures that any preambles are not rendered,
3850 * until true HTML is encountered.
3851 *
3852 * However, there is a small problem, namely the <html> tags are
3853 * optional. For text documents, missing <html> tags are rare, but
3854 * email often contains only fragments with <html> missing. To cope
3855 * with this, in mail mode, xpSMART scans the current line and
3856 * switches immediately to xpHTML mode if the line is printable.
3857 *
3858 * This turns out to be an extremely important function, because
3859 * spammers don't always label attachments correctly. So it's possible
3860 * to get a binary stream labeled as text/html, and of course lots of
3861 * junk tokens. If xpSMART mode detects binary, then it does NOT
3862 * switch to xpHTML mode immediately, and nothing gets printed. If
3863 * and when a valid <html> tag is found later, HTML will be enabled as
3864 * necessary. I think this is a robust partial solution to an
3865 * intractable problem.
3866 */
mbw_prefix(xml_character_filter)3867 void mbw_prefix(xml_character_filter)(XML_State *xml, mbw_t *line) {
3868 mbw_t *q;
3869 q = line;
3870 /* int k; */
3871
3872 /* don't call this with y < 1 */
3873 #define TAGMATCH(x,y) (!mbw_strncasecmp(line + 1, x + 1, y - 1) && (mbw_isspace(line[y]) || (line[y] == mbw_lit('>')) || (line[y] == mbw_lit('\0'))) && (line += (y - 1)))
3874
3875 #define ATTRMATCH(x,y) (!mbw_strncasecmp(line + 1, x + 1, y - 1) && (mbw_isspace(line[y]) || (line[y] == mbw_lit('=')) || (line[y] == mbw_lit('\0'))) && (line += (y - 1)))
3876
3877 /* this is convenient for debugging */
3878 #define PDEBUG(x) printf(#x"{%c%c%c%c%c%c%c%c%c%c}\n", line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7], line[8], line[9])
3879 #define PDEBUG2(x,y) {printf(#x"2{"); for(k = 0; k < y; k++) printf("%c",line[k]); printf("}\n"); }
3880
3881 /* this is important - read comments above */
3882 if( (m_options & (1<<M_OPTION_MBOX_FORMAT)) &&
3883 (xml->parser == xpSMART) &&
3884 !mbw_prefix(is_binline)(line) &&
3885 !mbw_prefix(is_emptyspace)(line) ) {
3886 xml->parser = xpHTML;
3887 }
3888
3889 while( *line ) {
3890 /* printf("%d %d ->", xml->state, xml->attribute); */
3891 /* PDEBUG(LINE); */
3892 switch(xml->state) {
3893 case TEXT:
3894 switch(line[0]) {
3895 case mbw_lit('<'):
3896 /* does it look like <x where x is either alpha or punctuation? */
3897 if( mbw_isalpha(line[1]) ) {
3898 line++;
3899 /* tags aren't mined, xtags are */
3900 xml->state = TAG;
3901 xml->attribute = UNDEF;
3902 switch(mbw_tolower(line[0])) {
3903 case mbw_lit('a'):
3904 if( (line[1] == mbw_lit('\0')) || mbw_isspace(line[1]) ||
3905 TAGMATCH(mbw_lit("area"),4) ||
3906 TAGMATCH(mbw_lit("applet"),6) ) {
3907 xml->state = XTAG;
3908 }
3909 break;
3910 case mbw_lit('b'):
3911 if( TAGMATCH(mbw_lit("base"),4) ||
3912 TAGMATCH(mbw_lit("bgsound"),7) ) {
3913 xml->state = XTAG;
3914 } else if( TAGMATCH(mbw_lit("br"),2) ) {
3915 *q++ = mbw_lit('\n');
3916 xml->state = TAG;
3917 } else if( TAGMATCH(mbw_lit("body"),4) ) {
3918 xml->hide = VISIBLE;
3919 if( xml->parser == xpSMART ) {
3920 xml->parser = xpHTML;
3921 }
3922 xml->state = XTAG;
3923 }
3924 break;
3925 case mbw_lit('c'):
3926 if( TAGMATCH(mbw_lit("comment"),7) ) {
3927 if( xml->parser == xpHTML ) {
3928 xml->hide = COMMENT;
3929 }
3930 }
3931 break;
3932 case mbw_lit('d'):
3933 if( TAGMATCH(mbw_lit("div"),3) ) {
3934 xml->state = XTAG;
3935 }
3936 break;
3937 case mbw_lit('e'):
3938 if( TAGMATCH(mbw_lit("embed"),5) ) {
3939 xml->state = XTAG;
3940 }
3941 break;
3942 case mbw_lit('f'):
3943 if( TAGMATCH(mbw_lit("frame"),5) ||
3944 TAGMATCH(mbw_lit("form"),4) ) {
3945 xml->state = XTAG;
3946 }
3947 break;
3948 case mbw_lit('h'):
3949 if( TAGMATCH(mbw_lit("html"),4) || TAGMATCH(mbw_lit("head"),4) ) {
3950 xml->hide = VISIBLE;
3951 if( xml->parser == xpSMART ) {
3952 xml->parser = xpHTML;
3953 }
3954 } else if( TAGMATCH(mbw_lit("hr"),2) ) {
3955 *q++ = mbw_lit('\n');
3956 xml->state = TAG;
3957 }
3958 break;
3959 case mbw_lit('i'):
3960 if( TAGMATCH(mbw_lit("img"),3) ||
3961 TAGMATCH(mbw_lit("iframe"),6) ||
3962 TAGMATCH(mbw_lit("ilayer"),6) ||
3963 TAGMATCH(mbw_lit("input"),5) ) {
3964 xml->state = XTAG;
3965 }
3966 break;
3967 case mbw_lit('l'):
3968 if( TAGMATCH(mbw_lit("layer"),5) ||
3969 TAGMATCH(mbw_lit("link"),4) ) {
3970 xml->state = XTAG;
3971 }
3972 break;
3973 case mbw_lit('n'):
3974 if( (TAGMATCH(mbw_lit("noframes"),8) && (xml->hide = NOFRAMES)) ||
3975 (TAGMATCH(mbw_lit("nolayer"),7) && (xml->hide = NOLAYER)) ||
3976 (TAGMATCH(mbw_lit("noscript"),8) && (xml->hide = NOSCRIPT)) ||
3977 (TAGMATCH(mbw_lit("noembed"),7) && (xml->hide = NOEMBED)) ) {
3978 if( xml->parser == xpHTML ) {
3979 if( (m_options & (1<<M_OPTION_SHOW_ALT)) ) {
3980 xml->hide = VISIBLE;
3981 }
3982 }
3983 }
3984 break;
3985 case mbw_lit('o'):
3986 if( TAGMATCH(mbw_lit("object"),6) ) {
3987 xml->state = XTAG;
3988 }
3989 break;
3990 case mbw_lit('s'):
3991 if( TAGMATCH(mbw_lit("span"),4) ) {
3992 xml->state = XTAG;
3993 } else if( TAGMATCH(mbw_lit("script"),6) ) {
3994 xml->hide = SCRIPT;
3995 } else if( TAGMATCH(mbw_lit("style"),5) ) {
3996 xml->hide = STYLE;
3997 }
3998 break;
3999 case mbw_lit('t'):
4000 if( TAGMATCH(mbw_lit("title"),5) ) {
4001 xml->hide = TITLE;
4002 }
4003 break;
4004 default:
4005 /* ignore, ie it's a TAG */
4006 break;
4007 }
4008 } else if( line[1] == mbw_lit('/') ) {
4009 line++;
4010 /* tags aren't mined, xtags are */
4011 xml->state = TAG;
4012 xml->attribute = UNDEF;
4013 switch(mbw_tolower(line[1])) {
4014 case mbw_lit('b'):
4015 if( TAGMATCH(mbw_lit("/body"),5) ) {
4016 if( xml->parser == xpHTML ) {
4017 xml->parser = xpSMART;
4018 }
4019 xml->hide = VISIBLE;
4020 }
4021 break;
4022 case mbw_lit('c'):
4023 if( (xml->hide == COMMENT) && TAGMATCH(mbw_lit("/comment"),8) ) {
4024 /* nothing */
4025 }
4026 break;
4027 case mbw_lit('h'):
4028 if( TAGMATCH(mbw_lit("/html"),5) || TAGMATCH(mbw_lit("/head"),5) ) {
4029 if( xml->parser == xpHTML ) {
4030 xml->parser = xpSMART;
4031 }
4032 xml->hide = VISIBLE;
4033 }
4034 break;
4035 case mbw_lit('n'):
4036 if( ((xml->hide == NOFRAMES) && TAGMATCH(mbw_lit("/noframes"),9)) ||
4037 ((xml->hide == NOSCRIPT) && TAGMATCH(mbw_lit("/noscript"),9)) ||
4038 ((xml->hide == NOLAYER) && TAGMATCH(mbw_lit("/nolayer"),8)) ||
4039 ((xml->hide == NOEMBED) && TAGMATCH(mbw_lit("/noembed"),8)) ) {
4040 xml->hide = VISIBLE;
4041 }
4042 break;
4043 case mbw_lit('s'):
4044 if( ((xml->hide == SCRIPT) && TAGMATCH(mbw_lit("/script"),7)) ||
4045 ((xml->hide == STYLE) && TAGMATCH(mbw_lit("/style"),6)) ) {
4046 xml->hide = VISIBLE;
4047 }
4048 break;
4049 case mbw_lit('t'):
4050 if( TAGMATCH(mbw_lit("/title"),6) ) {
4051 xml->hide = VISIBLE;
4052 }
4053 break;
4054 default:
4055 /* ignore, ie it's a TAG */
4056 break;
4057 }
4058 } else { /* second char is not alpha or slash */
4059 if( mbw_strncmp(line + 1, mbw_lit("!--"), 3) == 0 ) {
4060 if( line[4] == mbw_lit('>') ) {
4061 /* buggy MSHTML accepts <!--> as a comment, so ignore this */
4062 line += 4;
4063 } else { /* real comment */
4064 xml->state = CMNT;
4065 line += 3;
4066 }
4067 } else if( line[1] == mbw_lit('<') ) {
4068 /* stay in TEXT state */
4069 if( xml->parser == xpDUMB ) {
4070 line++;
4071 } else {
4072 switch(xml->hide) {
4073 case VISIBLE:
4074 case TITLE:
4075 *q++ = *line;
4076 break;
4077 case SCRIPT:
4078 if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
4079 *q++ = *line;
4080 }
4081 break;
4082 case STYLE:
4083 if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4084 *q++ = *line;
4085 }
4086 break;
4087 default:
4088 break;
4089 }
4090 }
4091 } else {
4092 /* bogus tag? */
4093 xml->state = TAG;
4094 line++;
4095 }
4096 }
4097 break;
4098 case mbw_lit('&'):
4099 if( (xml->parser == xpHTML) || (xml->parser == xpDUMB) ) {
4100 switch(xml->hide) {
4101 case VISIBLE:
4102 case TITLE:
4103 mbw_prefix(decode_html_entity)(&line, &q);
4104 break;
4105 case SCRIPT:
4106 if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
4107 mbw_prefix(decode_html_entity)(&line, &q);
4108 }
4109 break;
4110 case STYLE:
4111 if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4112 mbw_prefix(decode_html_entity)(&line, &q);
4113 }
4114 break;
4115 default:
4116 break;
4117 }
4118 }
4119 break;
4120 default:
4121 if( (xml->parser == xpHTML) || (xml->parser == xpDUMB) ) {
4122 switch(xml->hide) {
4123 case VISIBLE:
4124 case TITLE:
4125 *q++ = *line;
4126 break;
4127 case SCRIPT:
4128 if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
4129 *q++ = *line;
4130 }
4131 break;
4132 case STYLE:
4133 if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4134 *q++ = *line;
4135 }
4136 break;
4137 default:
4138 break;
4139 }
4140 }
4141 break;
4142 }
4143 break;
4144 case TAG:
4145 if( line[0] == mbw_lit('>') ) {
4146 xml->state = TEXT;
4147 } else if( line[0] == mbw_lit('=') ) {
4148 xml->state = TAGPREQ;
4149 }
4150 break;
4151 case TAGPREQ:
4152 if( line[0] == mbw_lit('\'') ) {
4153 xml->state = TAGQUOTE;
4154 } else if( line[0] == mbw_lit('"') ) {
4155 xml->state = TAGDQUOTE;
4156 } else if( !mbw_isspace(line[0]) ) {
4157 xml->state = TAG;
4158 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4159 }
4160 break;
4161 case TAGQUOTE:
4162 if( (line[0] == '\\') && (line[1] == mbw_lit('\'')) ) {
4163 line++;
4164 } else if( line[0] == mbw_lit('\'') ) {
4165 xml->state = TAG;
4166 }
4167 break;
4168 case TAGDQUOTE:
4169 if( (line[0] == '\\') && (line[1] == mbw_lit('"')) ) {
4170 line++;
4171 } else if( line[0] == mbw_lit('"') ) {
4172 xml->state = TAG;
4173 }
4174 break;
4175 case XTAG:
4176 if( xml->parser == xpSMART ) {
4177 /* we've recognized an HTML tag */
4178 xml->parser = xpHTML;
4179 }
4180 if( (xml->attribute != UNDEF) &&
4181 mbw_isspace(line[0]) &&
4182 !MBW_EMPTYLINE(line) ) {
4183 xml->attribute = UNDEF;
4184 *q++ = ATTRIBSEP;
4185 } else
4186 switch(mbw_tolower(line[0])) {
4187 case mbw_lit('>'):
4188 xml->state = TEXT;
4189 if( xml->attribute != UNDEF ) {
4190 xml->attribute = UNDEF;
4191 *q++ = ATTRIBSEP;
4192 }
4193 break;
4194 case mbw_lit('='):
4195 xml->state = XTAGPREQ;
4196 break;
4197 case mbw_lit('a'):
4198 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4199 if( m_options & (1<<M_OPTION_SHOW_ALT) ) {
4200 if( !mbw_strncasecmp(line, mbw_lit("alt"), 3) ) {
4201 if( xml->attribute != UNDEF ) {
4202 *q++ = ATTRIBSEP;
4203 }
4204 xml->attribute = ALT;
4205 line += 2;
4206 *q++ = ATTRIBSEP;
4207 }
4208 }
4209 if( m_options & (1<<M_OPTION_SHOW_FORMS) ) {
4210 if( !mbw_strncasecmp(line, mbw_lit("action"), 6) ) {
4211 if( xml->attribute != UNDEF ) {
4212 *q++ = ATTRIBSEP;
4213 }
4214 xml->attribute = SRC;
4215 line += 5;
4216 *q++ = ATTRIBSEP;
4217 }
4218 }
4219 break;
4220 case mbw_lit('c'):
4221 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4222 if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4223 if( !mbw_strncasecmp(line, mbw_lit("class"), 5) ) {
4224 if( xml->attribute != UNDEF ) {
4225 *q++ = ATTRIBSEP;
4226 }
4227 xml->attribute = ASTYLE;
4228 line += 4;
4229 *q++ = ATTRIBSEP;
4230 }
4231 }
4232 break;
4233 case mbw_lit('d'):
4234 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4235 if( m_options & (1<<M_OPTION_SHOW_LINKS) ) {
4236 if( !mbw_strncasecmp(line, mbw_lit("data"), 4) ) {
4237 if( xml->attribute != UNDEF ) {
4238 *q++ = ATTRIBSEP;
4239 }
4240 xml->attribute = SRC;
4241 line += 3;
4242 *q++ = ATTRIBSEP;
4243 }
4244 }
4245 break;
4246 case mbw_lit('h'):
4247 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4248 if( m_options & (1<<M_OPTION_SHOW_LINKS) ) {
4249 if( !mbw_strncasecmp(line, mbw_lit("href"), 4) ) {
4250 if( xml->attribute != UNDEF ) {
4251 *q++ = ATTRIBSEP;
4252 }
4253 xml->attribute = SRC;
4254 line += 3;
4255 *q++ = ATTRIBSEP;
4256 }
4257 }
4258 break;
4259 case mbw_lit('o'):
4260 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4261 if( mbw_tolower(line[1]) != mbw_lit('n') ) {
4262 /* false alarm */
4263 } else if( ATTRMATCH(mbw_lit("onmousedown"), 11) ||
4264 ATTRMATCH(mbw_lit("onmousemove"), 11) ||
4265 ATTRMATCH(mbw_lit("onmouseout"), 10) ||
4266 ATTRMATCH(mbw_lit("onmouseover"), 11) ||
4267 ATTRMATCH(mbw_lit("onmouseup"), 9) ||
4268
4269 ATTRMATCH(mbw_lit("onclick"), 7) ||
4270 ATTRMATCH(mbw_lit("ondblclick"), 10) ||
4271 ATTRMATCH(mbw_lit("onfocus"), 7) ||
4272
4273 ATTRMATCH(mbw_lit("onkeydown"), 9) ||
4274 ATTRMATCH(mbw_lit("onkeypress"), 10) ||
4275 ATTRMATCH(mbw_lit("onkeyup"), 7) ||
4276
4277 ATTRMATCH(mbw_lit("ondataavailable"), 15) ||
4278 ATTRMATCH(mbw_lit("ondatasetchanged"), 16) ||
4279 ATTRMATCH(mbw_lit("ondatasetcomplete"), 17) ||
4280
4281 ATTRMATCH(mbw_lit("onabort"), 7) ||
4282 ATTRMATCH(mbw_lit("onload"), 6) ||
4283 ATTRMATCH(mbw_lit("onunload"), 8) ||
4284 ATTRMATCH(mbw_lit("onmove"), 6) ||
4285 ATTRMATCH(mbw_lit("onresize"), 8) ||
4286 ATTRMATCH(mbw_lit("onsubmit"), 8) ) {
4287 if( xml->attribute != UNDEF ) {
4288 *q++ = ATTRIBSEP;
4289 }
4290
4291 xml->attribute = JSCRIPT;
4292 *q++ = ATTRIBSEP;
4293 /* line is already updated by ATTRMATCH */
4294 }
4295 break;
4296 case mbw_lit('s'):
4297 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4298 if( m_options & (1<<M_OPTION_SHOW_LINKS) ) {
4299 if( !mbw_strncasecmp(line, mbw_lit("src"), 3) ) {
4300 if( xml->attribute != UNDEF ) {
4301 *q++ = ATTRIBSEP;
4302 }
4303 xml->attribute = SRC;
4304 line += 2;
4305 *q++ = ATTRIBSEP;
4306 }
4307 }
4308 if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4309 if( !mbw_strncasecmp(line, mbw_lit("style"), 5) ) {
4310 if( xml->attribute != UNDEF ) {
4311 *q++ = ATTRIBSEP;
4312 }
4313 xml->attribute = ASTYLE;
4314 line += 4;
4315 *q++ = ATTRIBSEP;
4316 }
4317 }
4318 break;
4319 case mbw_lit('t'):
4320 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4321 if( m_options & (1<<M_OPTION_SHOW_ALT) ) {
4322 if( !mbw_strncasecmp(line, mbw_lit("title"), 5) ) {
4323 if( xml->attribute != UNDEF ) {
4324 *q++ = ATTRIBSEP;
4325 }
4326 xml->attribute = ALT;
4327 line += 4;
4328 *q++ = ATTRIBSEP;
4329 }
4330 }
4331 break;
4332 case mbw_lit('u'):
4333 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4334 if( m_options & (1<<M_OPTION_SHOW_LINKS) ) {
4335 if( !mbw_strncasecmp(line, mbw_lit("urn"), 3) ) {
4336 if( xml->attribute != UNDEF ) {
4337 *q++ = ATTRIBSEP;
4338 }
4339 xml->attribute = SRC;
4340 line += 2;
4341 *q++ = ATTRIBSEP;
4342 }
4343 }
4344 break;
4345 case mbw_lit('v'):
4346 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4347 if( m_options & (1<<M_OPTION_SHOW_FORMS) ) {
4348 if( !mbw_strncasecmp(line, mbw_lit("value"), 5) ) {
4349 if( xml->attribute != UNDEF ) {
4350 *q++ = ATTRIBSEP;
4351 }
4352 xml->attribute = SRC;
4353 line += 4;
4354 *q++ = ATTRIBSEP;
4355 }
4356 }
4357 break;
4358 default:
4359 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4360 break;
4361 }
4362 break;
4363 case XTAGPREQ:
4364 if( line[0] == mbw_lit('\'') ) {
4365 xml->state = XTAGQUOTE;
4366 } else if( line[0] == mbw_lit('"') ) {
4367 xml->state = XTAGDQUOTE;
4368 } else if( !mbw_isspace(line[0]) ) {
4369 xml->state = XTAG;
4370 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4371 }
4372 break;
4373 case XTAGQUOTE:
4374 if( line[0] == mbw_lit('\'') ) {
4375 xml->state = XTAG;
4376 if( xml->attribute != UNDEF ) {
4377 *q++ = ATTRIBSEP;
4378 }
4379 xml->attribute = UNDEF;
4380 } else {
4381 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4382 if( (line[0] == mbw_lit('\\')) && (line[1] == mbw_lit('\'')) ) {
4383 line++;
4384 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4385 }
4386 }
4387 break;
4388 case XTAGDQUOTE:
4389 if( line[0] == mbw_lit('"') ) {
4390 xml->state = XTAG;
4391 if( xml->attribute != UNDEF ) {
4392 *q++ = ATTRIBSEP;
4393 }
4394 xml->attribute = UNDEF;
4395 } else {
4396 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4397 if( (line[0] == mbw_lit('\\')) && (line[1] == mbw_lit('"')) ) {
4398 line++;
4399 mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4400 }
4401 }
4402 break;
4403 case CMNT:
4404 if( (line[0] == mbw_lit('-')) && (line[1] == mbw_lit('-')) ) {
4405 xml->state = TAG;
4406 } else {
4407 /* ignore comments in some circumstances */
4408 if( m_options & (1<<M_OPTION_SHOW_HTML_COMMENTS) ) {
4409 *q++ = *line;
4410 } else switch(xml->hide) {
4411 case SCRIPT:
4412 if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
4413 *q++ = *line;
4414 }
4415 break;
4416 case STYLE:
4417 if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4418 *q++ = *line;
4419 }
4420 break;
4421 default:
4422 break;
4423 }
4424 }
4425 break;
4426 case DISABLED:
4427 /* don't modify the line at all */
4428 return;
4429 }
4430 line++;
4431 }
4432 *q = mbw_lit('\0'); /* mark the end of the clean text string */
4433
4434 }
4435
4436
4437 /***********************************************************
4438 * TEXT PARSING FUNCTIONS *
4439 ***********************************************************/
4440
mbw_prefix(plain_text_filter)4441 bool_t mbw_prefix(plain_text_filter)(MBOX_State *mbox, mbw_t *line) {
4442 mbw_t *q;
4443 bool_t url = 0;
4444 bool_t censor = 0;
4445
4446
4447 switch(mbox->plainstate) {
4448 case psPLAIN:
4449 if( (line[0] == mbw_lit('b')) &&
4450 !mbw_strncmp(line, mbw_lit("begin "),6) &&
4451 ISOCT(line[6]) && ISOCT(line[7]) && ISOCT(line[8]) ) {
4452 mbox->plainstate = psUUENCODE;
4453 return 1;
4454 }
4455 break;
4456 case psUUENCODE:
4457 switch(mbw_prefix(is_uuline)(line)) {
4458 case -1:
4459 return 1;
4460 case -2:
4461 mbox->plainstate = psPLAIN;
4462 break;
4463 default:
4464 return 0;
4465 }
4466 break;
4467 }
4468
4469 /* now assume psPLAIN */
4470
4471 q = line;
4472 while(*line) {
4473 switch(*line) {
4474 case mbw_lit('%'):
4475 if( !censor ) {
4476 mbw_prefix(decode_uri_character)(&line, &q);
4477 }
4478 break;
4479 case mbw_lit('&'):
4480 if( !censor ) {
4481 if( !url ) {
4482 mbw_prefix(decode_html_entity)(&line, &q);
4483 } else {
4484 censor = 1;
4485 }
4486 }
4487 break;
4488 case mbw_lit('H'):
4489 case mbw_lit('h'):
4490 if( !mbw_strncasecmp(line, mbw_lit("http://"), 7) ) {
4491 censor = 0;
4492 url = 1;
4493 }
4494 if( !censor ) { *q++ = *line; }
4495 break;
4496 case mbw_lit('?'):
4497 case mbw_lit(';'):
4498 case mbw_lit('#'):
4499 if( url ) {
4500 censor = 1;
4501 } else {
4502 *q++ = *line;
4503 }
4504 break;
4505 case mbw_lit(' '):
4506 case mbw_lit('\t'):
4507 case mbw_lit('>'):
4508 case mbw_lit('\''):
4509 case mbw_lit('"'):
4510 if( censor && url ) { censor = 0; url = 0; }
4511 *q++ = *line;
4512 break;
4513 default:
4514 if( !censor ) { *q++ = *line; }
4515 break;
4516 }
4517 line++;
4518 }
4519 *q = mbw_lit('\0');
4520
4521 return 1;
4522 }
4523
4524
4525 /* assume string is binary with embedded NULs replaced by FFs,
4526 the strings that are found are separated by spaces */
4527 /* note: this doesn't work like strings(1) yet, but eventually it will ;-) */
mbw_prefix(strings1_filter)4528 bool_t mbw_prefix(strings1_filter)(mbw_t *line) {
4529 size_t c;
4530 mbw_t *q;
4531
4532 #define MIN_STRING_SIZE 4
4533 for(q = line, c = 0; *line; line++) {
4534 /* if( mbw_isalnum(*line) || */
4535 /* mbw_ispunct(*line) || */
4536 /* (*line == mbw_lit(' ')) ) { */
4537 if( mbw_isprint(*line) && (*line != mbw_lit('\t')) ) {
4538 *q++ = *line;
4539 c++;
4540 } else if( c >= MIN_STRING_SIZE ) {
4541 *q++ = mbw_lit(' ');
4542 c = 0;
4543 } else if( c > 0 ) {
4544 q -= c;
4545 c = 0;
4546 }
4547 }
4548 *q = mbw_lit('\0');
4549
4550 return 1;
4551 }
4552
4553 /***********************************************************
4554 * CALLED OUTSIDE THIS SOURCE FILE *
4555 ***********************************************************/
4556
mbw_prefix(init_decoding_caches)4557 void mbw_prefix(init_decoding_caches)(MBOX_State *mbox) {
4558 mbw_prefix(init_dc)(&(mbox->mbw_prefix(b64_dc)),system_pagesize);
4559 mbw_prefix(init_dc)(&(mbox->mbw_prefix(qp_dc)),system_pagesize);
4560 }
4561
mbw_prefix(free_decoding_caches)4562 void mbw_prefix(free_decoding_caches)(MBOX_State *mbox) {
4563 if( mbox->mbw_prefix(b64_dc).cache ) {
4564 free(mbox->mbw_prefix(b64_dc).cache);
4565 mbox->mbw_prefix(b64_dc).cache = NULL;
4566 }
4567 if( mbox->mbw_prefix(qp_dc).cache ) {
4568 free(mbox->mbw_prefix(qp_dc).cache);
4569 mbox->mbw_prefix(qp_dc).cache = NULL;
4570 }
4571 }
4572
4573 #endif
4574
4575
4576