1 /*
2  * Copyright (C) 2002 Laird Breyer
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
17  *
18  * Author:   Laird Breyer <laird@lbreyer.com>
19  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
25 #include <ctype.h>
26 #include <string.h>
27 #include <stdlib.h>
28 
29 #include "dbacl.h"
30 
31 
32 /* most functions in this file are logically identical in wide char
33  * and multibyte versions, except that char is replaced by wchar_t
34  * etc. It's become unwieldy to keep two slightly different copies of
35  * all functions involved, so we use the preprocessor to build a poor
36  * man's template facility.
37  *
38  * The "template" macros work as follows:
39  * mbw_lit("abc")             -> "abc" or L"abc"
40  * mbw_t                      -> char  or wchar_t
41  * mbw_prefix(good_char)(x)   -> good_char(x) or w_good_char(x)
42  *
43  * Once the template macros have done their work, we obtain ordinary
44  * functions named in a parallel fashion, where the wide character
45  * functions have a w_ prefix, and instances of char are substituted
46  * with instances of wchar_t.
47  *
 * The code below is split into uncommon code, where the
 * implementations of corresponding functions are different, and common
 * code, where the implementation is identical. Only identical code is
 * "templatized".
52  */
53 
54 #include "mbw.h"
55 
/* Objects defined in other translation units; this file only declares
 * them. */
extern options_t u_options;
extern charparser_t m_cp;
extern options_t m_options;

extern myregex_t re[MAX_RE];
extern regex_count_t regex_count;

extern long system_pagesize;
64 
65 /* uncommon code */
66 
67 /***********************************************************
68  * UTILITY FUNCTIONS                                       *
69  ***********************************************************/
70 
71 #if defined HAVE_MBRTOWC && defined MBW_WIDE
/* compiler doesn't seem to know wcsncasecmp() is in the library, so
 * we define our own.
 *
 * Compares at most n wide characters of s1 and s2, ignoring case.
 * The comparison also stops at the terminating NUL, so strings
 * shorter than n are never read past their end.
 *
 * Returns 0 if the compared characters are equal, otherwise the
 * difference between the first pair of lowercased characters that
 * differ. */
static __inline__
int mywcsncasecmp(const wchar_t *s1, const wchar_t *s2, size_t n) {
  size_t i;
  int d;
  for(i = 0; i < n; i++) {
    d = (int)towlower(s1[i]) - (int)towlower(s2[i]);
    if( d != 0 ) {
      return d;
    }
    if( s1[i] == L'\0' ) {
      /* both strings ended at the same position */
      break;
    }
  }
  return 0;
}
86 #endif
87 
88 
89 #if defined HAVE_MBRTOWC && defined MBW_WIDE
90 
/* Maps a wide character to its base64 value: 0-25 for 'A'-'Z',
 * 26-51 for 'a'-'z', 52-61 for '0'-'9', 62 for '+', 63 for '/',
 * 64 for the '=' padding character, and -1 for anything else. */
static __inline__
int w_b64_code(wchar_t c) {
  switch( c ) {
  case L'+':
    return 62;
  case L'/':
    return 63;
  case L'=':
    return 64;
  default:
    break;
  }
  if( (L'A' <= c) && (c <= L'Z') ) {
    return (c - L'A');
  }
  if( (L'a' <= c) && (c <= L'z') ) {
    return 26 + (c - L'a');
  }
  if( (L'0' <= c) && (c <= L'9') ) {
    return 52 + (c - L'0');
  }
  return -1;
}
109 
/* Maps a wide character to the value of the upper case hexadecimal
 * digit it represents (0-15), or -1 if it is not one of
 * '0'-'9' / 'A'-'F'. Lower case letters are not accepted. */
static __inline__
int w_qp_code(wchar_t c) {
  if( (L'A' <= c) && (c <= L'F') ) {
    return 10 + (c - L'A');
  }
  if( (L'0' <= c) && (c <= L'9') ) {
    return (c - L'0');
  }
  return -1;
}
120 
121 
122 #else
123 
/* warning: only use char here so we never have to bother about endianness */
/* Lookup table indexed by byte value: 0-63 for the base64 alphabet,
 * 64 for the '=' padding character, -1 for every other byte. */
static const signed char b64_code_table[256] = {
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,62,-1,-1,-1,63,52,53,
  54,55,56,57,58,59,60,61,-1,-1,
  -1,64,-1,-1,-1, 0, 1, 2, 3, 4,
   5, 6, 7, 8, 9,10,11,12,13,14,
  15,16,17,18,19,20,21,22,23,24,
  25,-1,-1,-1,-1,-1,-1,26,27,28,
  29,30,31,32,33,34,35,36,37,38,
  39,40,41,42,43,44,45,46,47,48,
  49,50,51,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1
};

/* The argument is reduced to an unsigned char before indexing, so a
 * byte >= 0x80 held in a signed char cannot produce a negative index. */
#define b64_code(c) ((int)b64_code_table[(unsigned char)(c)])
155 
/* Lookup table indexed by byte value: the value (0-15) of the upper
 * case hexadecimal digit '0'-'9' / 'A'-'F', and -1 for every other
 * byte. All 256 entries are spelled out: omitted entries would be
 * zero-initialized and so be mistaken for the digit 0. */
static const signed char qp_code_table[256] = {
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1, 0, 1,
   2, 3, 4, 5, 6, 7, 8, 9,-1,-1,
  -1,-1,-1,-1,-1,10,11,12,13,14,
  15,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
  -1,-1,-1,-1,-1,-1
};

/* The argument is reduced to an unsigned char before indexing, so a
 * byte >= 0x80 held in a signed char cannot produce a negative index. */
#define qp_code(c) ((int)qp_code_table[(unsigned char)(c)])
176 
177 #endif
178 
179 
180 /* common code */
181 
182 #if defined MBW_MB || (defined MBW_WIDE && defined HAVE_MBRTOWC)
183 
184 
/* this is dbacl's idea of an empty line. Note that a single \n or
 * a \r\n both constitute an empty line, contrary to RFC 2822, which
 * doesn't allow single \n chars in headers. However, we might be
 * reading the mail from a Unix mbox, where \r\n was replaced with \n.
 * We don't accept single \r however.
 *
 * Evaluates to 1 for a NULL pointer, an empty string, or a string
 * starting with "\n" or "\r\n", and to 0 otherwise. The argument is
 * evaluated more than once, so it must not have side effects.
 */
#define MBW_EMPTYLINE(line) ((!(line) || \
                              ((line)[0] == mbw_lit('\0')) || \
                              ((line)[0] == mbw_lit('\n')) || \
                              (((line)[0] == mbw_lit('\r')) && \
                               ((line)[1] == mbw_lit('\n')))) ? 1 : 0)
196 
/* True when line starts with "--" immediately followed by a character
 * that is not white space (a NUL terminator counts as not white space).
 * The argument is evaluated more than once, so it must not have side
 * effects. */
#define MBW_DOUBLEDASH(line) (((line)[0] == mbw_lit('-')) && \
                              ((line)[1] == mbw_lit('-')) && \
                              !mbw_isspace((line)[2]))
200 
201 
202 /***********************************************************
203  * TABLES                                                  *
204  ***********************************************************/
/* One row of the media type table below. */
typedef struct {
  const mbw_t *type_subtype;  /* "type/subtype" string, or "type/" as a wildcard */
  MIME_Content_Type medium;   /* content type category assigned to the entry */
} mbw_prefix(MIME_Media);

/* Wildcards such as text are represented as "text/" and must
 * be placed after all other text/xxx types.
 * More generally, comparison uses mystrcasestr, so the smallest strings
 * must come after the more detailed ones.
 *
 * For a description of official mime types, see
 * http://www.iana.org/assignments/
 */
static const mbw_prefix(MIME_Media) mbw_prefix(mime_media)[] = {
  { mbw_lit("text/html"), ctTEXT_HTML },
  { mbw_lit("text/xhtml"), ctTEXT_HTML },
  { mbw_lit("text/plain"), ctTEXT_PLAIN },
  { mbw_lit("text/richtext"), ctTEXT_RICH },
  { mbw_lit("text/enriched"), ctTEXT_RICH },
  { mbw_lit("text/rtf"), ctTEXT_PLAIN },
  { mbw_lit("text/xml"), ctTEXT_XML },
  { mbw_lit("text/sgml"), ctTEXT_SGML },
  { mbw_lit("text/"), ctTEXT_PLAIN },

  { mbw_lit("multipart/"), ctTEXT_PLAIN },

  { mbw_lit("message/rfc822"), ctMESSAGE_RFC822 },
  { mbw_lit("message/partial"), ctOTHER },
  { mbw_lit("message/external-body"), ctMESSAGE_RFC822 },
  { mbw_lit("message/news"), ctMESSAGE_RFC822 },
  { mbw_lit("message/"), ctOCTET_STREAM },

  { mbw_lit("application/sgml"), ctTEXT_PLAIN },
  { mbw_lit("application/xml"), ctTEXT_PLAIN },
  { mbw_lit("application/rtf"), ctTEXT_PLAIN },
  { mbw_lit("application/news-transmission"), ctMESSAGE_RFC822 },
  { mbw_lit("application/andrew-inset"), ctTEXT_PLAIN },
  { mbw_lit("application/msword"), ctAPPLICATION_MSWORD },
  { mbw_lit("application/"), ctOCTET_STREAM },

  { mbw_lit("image/"), ctIMAGE },
  { mbw_lit("audio/"), ctAUDIO },
  { mbw_lit("video/"), ctVIDEO },
  { mbw_lit("model/"), ctMODEL },
};
/* number of rows in the table above */
static int num_mime_media = sizeof(mbw_prefix(mime_media))/sizeof(mbw_prefix(MIME_Media));
251 
252 static const mbw_t *mbw_prefix(armor_start)[] = {
253   mbw_lit("-----BEGIN PGP MESSAGE"),
254   mbw_lit("-----BEGIN PGP PUBLIC KEY BLOCK"),
255   mbw_lit("-----BEGIN PGP PRIVATE KEY BLOCK"),
256   mbw_lit("-----BEGIN PGP SIGNATURE"),
257 };
258 static int num_armor_start = sizeof(mbw_prefix(armor_start))/sizeof(mbw_t *);
259 
260 static const mbw_t *mbw_prefix(armor_end)[] = {
261   mbw_lit("-----END PGP MESSAGE"),
262   mbw_lit("-----END PGP PUBLIC KEY BLOCK"),
263   mbw_lit("-----END PGP PRIVATE KEY BLOCK"),
264   mbw_lit("-----END PGP SIGNATURE"),
265 };
266 static int num_armor_end = sizeof(mbw_prefix(armor_end))/sizeof(mbw_t *);
267 
268 /***********************************************************
269  * UTILITY FUNCTIONS                                       *
270  ***********************************************************/
271 
272 /* checks if the line is "binary", ie contains printable chars
273    but not too many extended ascii chars */
274 static
mbw_prefix(is_binline)275 bool_t mbw_prefix(is_binline)(const mbw_t *line) {
276   int numa = 0;
277   const mbw_t *p = line;
278   while( *p ) {
279     if( mbw_isspace(*p) ||
280 	(mbw_isascii(*p) && !mbw_iscntrl(*p)) ) {
281       numa++;
282     } else if( !mbw_isprint(*p) ) {
283       return 1;
284     }
285     p++;
286   }
287   return (numa < (p - line)/2);
288 }
289 
290 static
mbw_prefix(is_emptyspace)291 bool_t mbw_prefix(is_emptyspace)(const mbw_t *line) {
292   const mbw_t *p = line;
293   while( *p ) {
294     if( !mbw_isspace(*p) ) {
295       return 0;
296     }
297     p++;
298   }
299   return 1;
300 }
301 
302 
303 static
mbw_prefix(is_b64line)304 bool_t mbw_prefix(is_b64line)(const mbw_t *line) {
305   const mbw_t *p = line;
306   while( *p ) {
307     if( (mbw_prefix(b64_code)(*p) == -1) &&
308 	!mbw_isspace(*p) ) {
309       return 0;
310     }
311     p++;
312   }
313   return 1;
314 }
315 
316 static
mbw_prefix(is_uuline)317 int mbw_prefix(is_uuline)(const mbw_t *line) {
318   int count = 0;
319   const mbw_t *p = line;
320   int len = (int)(line[0] - mbw_lit(' '));
321   if( (len < 0) || (len > 63) ) {
322     return -1;
323   }
324   while(*p && (*p != mbw_lit('\r')) && (*p != mbw_lit('\n')) ) {
325     if( (*p > mbw_lit('`')) ||
326 	(*p < mbw_lit(' ')) ) {
327       return -2;
328     } else {
329       count++;
330     }
331     p++;
332   }
333 
334   return (abs(count - 4*(len/3)) <= 3);
335 }
336 
337 /* detecting true yEnc lines is too hard, so we detect
338    nonprintable characters instead */
339 static
mbw_prefix(is_yencline)340 bool_t mbw_prefix(is_yencline)(const mbw_t *line) {
341   int nonprint = 0;
342   const mbw_t *p = line;
343   while( *p ) {
344     nonprint += !mbw_isprint(*p);
345     p++;
346   }
347   return (nonprint > 5);
348 }
349 
350 /*
351  * this code generates mystrcasestr() and w_mystrcasestr()
352  * (similar to strstr, but case insensitive)
353  */
354 static __inline__
mbw_prefix(mystrcasestr)355 const mbw_t *mbw_prefix(mystrcasestr)(const mbw_t *haystack, const mbw_t *needle) {
356   const mbw_t *p, *q, *r;
357 
358   for(p = haystack; *p; p++) {
359     q = needle; r = p;
360     while( *q && *r && ((mbw_tolower(*q) - mbw_tolower(*r)) == 0) ) {
361       q++; r++;
362     }
363     if( !*q ) {
364       return p;
365     }
366   }
367   return NULL;
368 }
369 
370 static __inline__
mbw_prefix(mystrncasecmp)371 int mbw_prefix(mystrncasecmp)(const mbw_t *s1, const mbw_t *s2, size_t n) {
372   int s = -1;
373   if( s1 && s2 ) {
374     while(--n > 0) {
375       s = (mbw_tolower(*s1++) - mbw_tolower(*s2++));
376       if( (s != 0) || (s1 == mbw_lit('\0')) || (s2 == mbw_lit('\0')) ) {
377 	break;
378       }
379     }
380   }
381   return s;
382 }
383 
384 static __inline__
mbw_prefix(mystrncmp)385 int mbw_prefix(mystrncmp)(const mbw_t *s1, const mbw_t *s2, size_t n) {
386   int s = -1;
387   if( s1 && s2 ) {
388     while(--n > 0) {
389       s = (*s1++ - *s2++);
390       if( (s != 0) || (s1 == mbw_lit('\0')) || (s2 == mbw_lit('\0')) ) {
391 	break;
392       }
393     }
394   }
395   return s;
396 }
397 
398 
399 /***********************************************************
400  * DECODING CACHE FUNCTIONS                                *
401  ***********************************************************/
402 
403 
404 static
mbw_prefix(init_dc)405 void mbw_prefix(init_dc)(mbw_prefix(decoding_cache) *dc, size_t len) {
406   if( !dc->cache ) {
407     dc->cache = (mbw_t *)malloc(len * sizeof(mbw_t));
408     dc->data_ptr = dc->cache;
409     dc->cache_len = dc->cache ? len : 0;
410     dc->max_line_len = len;
411   }
412 }
413 
414 
415 static
mbw_prefix(adjust_cache_size)416 void mbw_prefix(adjust_cache_size)(mbw_prefix(decoding_cache) *dc, size_t n) {
417   mbw_t *p;
418   size_t m = (dc->data_ptr - dc->cache);
419   while( dc->cache_len < m + n ) {
420     p = (mbw_t *)realloc(dc->cache, 2 * dc->cache_len * sizeof(mbw_t));
421     if( p ) {
422       dc->cache = p;
423       dc->data_ptr = p + m;
424       dc->cache_len *= 2;
425     } else {
426       break;
427     }
428   }
429   dc->max_line_len = (dc->max_line_len < n) ? n : dc->max_line_len;
430 }
431 
/*
 * Copies pending data from the start of the cache into line and NUL
 * terminates it.
 *
 * dc   - decoding cache; nothing happens unless it holds data
 * line - output buffer, overwritten; at most dc->max_line_len
 *        characters plus the terminator are written
 * all  - if nonzero, everything up to that limit is copied and the
 *        cache is left empty (pending data beyond the limit is
 *        dropped); if zero, the copy ends at a break point chosen
 *        below and the rest is moved to the front of the cache
 *
 * Returns 1 if line was overwritten, 0 if nothing was copied.
 */
static
bool_t mbw_prefix(flush_cache)(mbw_prefix(decoding_cache) *dc, mbw_t *line, bool_t all) {
  mbw_t *q;
  mbw_t *p;
  int i;
  if( dc->cache && (dc->data_ptr > dc->cache) ) {
    /* never output more bytes than will fit on output_line */
    p = (dc->data_ptr > dc->cache + dc->max_line_len) ?
      (dc->cache + dc->max_line_len): dc->data_ptr;

    if( !all ) {
      /* we break the line at the last space, or ampersand, or seventy
	 chars (> b64/qp limit) from the end - there may well be
	 stretches longer than this, but we try to flush as much as
	 possible, so the limit should be small. Also, we don't want
	 to break entities if possible.
      */
/*       for(i = 25; !mbw_isspace(*p) &&  */
/* 	    (p > dc->cache) && i; --p, --i); */
      /* the scan also stops when it reaches the start of the cache,
	 in which case nothing is flushed below */
      for(i = 70; !mbw_isspace(*p) &&
	    (*p != mbw_lit('&')) && (p > dc->cache) && i; --p, --i);
    }

    if( p > dc->cache ) {
      /* copy everything before the break point */
      for(q = dc->cache; q < p; q++) {
	*line++ = *q;
      }
      *line = mbw_lit('\0');

      dc->data_ptr = dc->cache;
      if( !all ) {
	/* now fold unused part back into cache. Note that
	 * the cached data is always NUL terminated, so we don't
	 * need the old data_ptr to mark the end. */
	while( *p ) {
	  *dc->data_ptr++ = *p++;
	}
      }
      *dc->data_ptr = mbw_lit('\0');
      return 1;
    }
  }
  return 0;
}
476 
477 /***********************************************************
478  * DECODING FUNCTIONS                                      *
479  ***********************************************************/
480 
481 
482 #define REPNUL mbw_lit('\t')
483 
484 /*
485  * this code generates b64_line_filter2() and w_b64_line_filter2()
486  * works ok so long as q <= line, or q >> line
487  * WARNING: it is assumed that the buffer at q can hold (at most) all of line
488  *
489  * The string which is written is always NUL terminated, but if NULs
490  * were decoded in the middle, those are replaced by tabs (we could
491  * also replace them with a more neutral char, but the cache flushing
492  * code breaks up lines on spaces, and we want to take advantage of that. See
493  * the REPNUL define)
494  */
mbw_prefix(b64_line_filter2)495 mbw_t *mbw_prefix(b64_line_filter2)(mbw_t *line, mbw_t *q) {
496   mbw_t *p = line;
497   mbw_t buf[4];
498   mbw_t *buf_start = buf;
499   mbw_t *buf_end = buf + 4;
500 
501   if( q ) {
502     while( *p ) {
503       if( mbw_prefix(b64_code)(*p) > -1 ) {
504 	*buf_start++ = *p;
505 	if( buf_start == buf_end ) {
506 	  buf_start = buf;
507 	  *q = (mbw_prefix(b64_code)(buf[0])<<2) + (mbw_prefix(b64_code)(buf[1])>>4);
508 	  if( !*q ) { *q = REPNUL; }
509 	  q++;
510 	  if( buf[2] != mbw_lit('=') ) {
511 	    *q = (mbw_prefix(b64_code)(buf[1])<<4) + (mbw_prefix(b64_code)(buf[2])>>2);
512 	    if( !*q ) { *q = REPNUL; }
513 	    q++;
514 	    if( buf[3] != mbw_lit('=') ) {
515 	      *q = (mbw_prefix(b64_code)(buf[2])<<6) + mbw_prefix(b64_code)(buf[3]);
516 	      if( !*q ) { *q = REPNUL; }
517 	      q++;
518 	    } else {
519 	      break;
520 	    }
521 	  } else {
522 	    break;
523 	  }
524 	}
525       }
526       p++;
527     }
528     *q = mbw_lit('\0');
529   }
530 
531   return q;
532 }
533 
534 /*
535  * this code generates b64_line_filter() and w_b64_line_filter()
536  * Decodes a base64 encoded line. The input line is overwritten.
537  *
538  * The b64 standard arbitrarily truncates lines to 57 characters, so
539  * here we place the chunks in a cache and only overwrite line when
540  * the cache is full. Unfortunately, malformed email messages may not
541  * follow the standard, so in practice all this means is that we get
542  * arbitrarily truncated input.
543  *
544  * Note that when we overwrite line with the cached data, we assume
545  * the line is big enough to hold all the cached data. This is guaranteed
546  * by registering the current line length with the cache.
547  */
mbw_prefix(b64_line_filter)548 bool_t mbw_prefix(b64_line_filter)(mbw_prefix(decoding_cache) *b64cache,
549 				   mbw_t *line) {
550   mbw_prefix(adjust_cache_size)(b64cache, mbw_strlen(line));
551   b64cache->data_ptr =
552     mbw_prefix(b64_line_filter2)(line, b64cache->data_ptr);
553   return mbw_prefix(flush_cache)(b64cache, line, 0);
554 }
555 
556 
557 /*
558  * this code generates qp_line_filter2() and w_qp_line_filter2()
559  * this works ok so long as q <= line, or q >> line
560  * WARNING: it is assumed that the buffer at q can hold (at most) all of line
561  */
mbw_prefix(qp_line_filter2)562 mbw_t *mbw_prefix(qp_line_filter2)(mbw_t *line, mbw_t *q) {
563   mbw_t *p = line;
564   if( q ) {
565     while( *p ) {
566       if( *p != mbw_lit('=') ) {
567 	*q++ = *p++;
568       } else {
569 	if( !*(++p) || mbw_isspace(*p) ) {
570 	  break;
571 	} else {
572  	  /* if the equal sign isn't followed by  */
573           /* an upper case hex number, something's wrong */
574 	  *q = mbw_prefix(qp_code)(*p);
575 	  if( ((signed char)*q < 0) || !p[1] || (mbw_prefix(qp_code)(p[1]) < 0) ) {
576 	    *q++ = p[-1];
577 	  } else {
578 	    *q = (*q << 4) + mbw_prefix(qp_code)(p[1]);
579 	    if( *q ) { q++; }
580 	    p += 2;
581 	  }
582 	}
583       }
584     }
585     *q = mbw_lit('\0');
586   }
587   return q;
588 }
589 
590 /*
591  * this code generates qp_line_filter() and w_qp_line_filter()
592  * Decodes a quoted-printable line. The input line is overwritten.
593  *
594  * The QP standard arbitrarily truncates lines to 76 characters, so
595  * here we place the chunks in a cache and only overwrite line when
596  * the cache is full. Unfortunately, malformed email messages may not
597  * follow the standard, so in practice all this means is that we get
598  * arbitrarily truncated input.
599  *
600  * Note that when we overwrite line with the cached data, we assume
601  * the line is big enough to hold all the cached data. This is guaranteed
602  * by registering the current line length with the cache.
603  */
mbw_prefix(qp_line_filter)604 bool_t mbw_prefix(qp_line_filter)(mbw_prefix(decoding_cache) *qpcache,
605 				  mbw_t *line) {
606   mbw_prefix(adjust_cache_size)(qpcache, mbw_strlen(line));
607   qpcache->data_ptr =
608     mbw_prefix(qp_line_filter2)(line, qpcache->data_ptr);
609   return mbw_prefix(flush_cache)(qpcache, line, 0);
610 }
611 
612 
613 /***********************************************************
614  * TOKENIZER FUNCTIONS                                     *
615  ***********************************************************/
616 /* the following modules handle state transitions, you can mix and
617  * match them, or write new ones. The is_func() is called in the
618  * default state, and switches the internal state if necessary. If it
619  * can't recognize the current char, it should return gcUNDEF, not
620  * gcDISCARD, that way the next is_func() can look at the character.
621  * The handle_func() is similar to the is_func(), but is
622  * called when the state is not the default. It should return gcDISCARD
623  * and switch to the default state if it can't recognize the current char,
624  * otherwise it can switch states any way it wants. When it detects the
625  * end of the current token, it must switch back to the default state.
626  */
627 
628 /* these are macros to save typing, modules below */
629 
/* Character class tests; c is a pointer to the character to test. */
/* SET1: apostrophe, dash or period */
#define SET1(c) ( (*(c) == mbw_lit('\'')) || (*(c) == mbw_lit('-')) || (*(c) == mbw_lit('.')) )
/* #define SET1(c) ( (*(c) == mbw_lit('-')) || (*(c) == mbw_lit('+')) || (*(c) == mbw_lit('.')) || (*(c) == mbw_lit('_')) || (*(c) == mbw_lit(',')) || (*(c) >= 0xA0) ) */
/* #define SET1(c) ( (*(c) == mbw_lit('-')) || (*(c) == mbw_lit('+')) || (*(c) == mbw_lit('.')) || (*(c) == mbw_lit('_')) || (*(c) == mbw_lit(',')) || (*(c) == mbw_lit('$')) || (*(c) >= 0xA0) ) */
/* SET2: comma or period */
#define SET2(c) ( (*(c) == mbw_lit(',')) || (*(c) == mbw_lit('.')) )
/* SET3: any character after ' ' and up to '~', except '>' */
#define SET3(c) ( (*(c) > mbw_lit(' ')) && (*(c) <= mbw_lit('~')) && (*(c) != mbw_lit('>')) )
635 
/* Bit pattern tests on the value at c:
 * IO  matches 10xxxxxx,
 * I2O matches 110xxxxx, I3O 1110xxxx, I4O 11110xxx,
 * I5O 111110xx and I6O 1111110x. */
#define IO(c) ((*(c) & 0xC0) == 0x80)
#define I2O(c) ((*(c) & 0xE0) == 0xC0)
#define I3O(c) ((*(c) & 0xF0) == 0xE0)
#define I4O(c) ((*(c) & 0xF8) == 0xF0)
#define I5O(c) ((*(c) & 0xFC) == 0xF8)
#define I6O(c) ((*(c) & 0xFE) == 0xFC)

/* RANGE: the value at c lies in [x,y]; DRANGE: it lies in [x,y] or [u,v] */
#define RANGE(c,x,y) ((*(c) >= x) && (*(c) <= y))
#define DRANGE(c,x,y,u,v) (RANGE(c,x,y) || RANGE(c,u,v))
/* The xTEST macros evaluate their conditions left to right and, only
 * if all of them hold, set char_filter_state to r as a side effect.
 * The assignment is the last && operand; its value is the new state,
 * and every state is nonzero because the enum starts at fDEF = 1. */
#define DTEST(s,t,r) (s && t && (char_filter_state = r))
#define TTEST(s,t,u,r) (s && t && u && (char_filter_state = r))
#define QTEST(s,t,u,v,r) (s && t && u && v && (char_filter_state = r))
#define VTEST(s,t,u,v,w,r) (s && t && u && v && w && (char_filter_state = r))
#define STEST(s,t,u,v,w,x,r) (s && t && u && v && w && x && (char_filter_state = r))
650 
/* Multibyte sequence detectors. Each one tests the value at c and
 * the values that follow it against fixed ranges and, on a match,
 * sets char_filter_state (through the xTEST macros) to a state
 * whose numeric suffix is the number of characters of the sequence
 * which follow c. The && chains stop at the first failing test, so
 * nothing is read beyond a character that doesn't match. */
#define Shift_JIS(c) ( DTEST(DRANGE(c,0x81,0x9F,0xE0,0xFC),DRANGE(c+1,0x40,0x7E,0x80,0xFC),fShift_JIS_1) )

#define EUC_Japanese(c) ( DTEST(RANGE(c,0xA1,0xFE),RANGE(c+1,0xA1,0xFE),fEUC_Japanese_1) || DTEST((*c == 0x8E),RANGE(c+1,0xA0,0xDF),fEUC_Japanese_1) || TTEST((*c == 0x8F),RANGE(c+1,0xA1,0xFE),RANGE(c+2,0xA1,0xFE),fEUC_Japanese_2) )

#define BIG5(c) ( DTEST(RANGE(c,0xA1,0xFE),DRANGE(c+1,0x40,0x7E,0xA1,0xFE),fBIG5_1) )

#define BIG5P(c) ( DTEST(RANGE(c,0x81,0xFE),DRANGE(c+1,0x40,0x7E,0x80,0xFE),fBIG5P_1) )

#define EUC_CN(c) ( DTEST(RANGE(c,0xA1,0xFE),RANGE(c+1,0xA1,0xFE),fEUC_CN_1) )

#define EUC_TW(c) ( DTEST(RANGE(c,0xA1,0xFE),RANGE(c+1,0xA1,0xFE),fEUC_TW_1) || QTEST((*c == 0x8E),RANGE(c+1,0xA1,0xB0),RANGE(c+2,0xA1,0xFE),RANGE(c+3,0xA1,0xFE),fEUC_TW_3) )

#define Johab(c) ( DTEST(RANGE(c,0x84,0xD3),DRANGE(c+1,0x41,0x7E,0x81,0xFE),fJohab_1) || DTEST(DRANGE(c,0xD8,0xDE,0xE0,0xF9),DRANGE(c+1,0x31,0x7E,0x91,0xFE),fJohab_1) )

/* a lead byte followed by one to five bytes of the form 10xxxxxx */
#define UTF8(c) ( DTEST(I2O(c),IO(c+1),fUTF8_1) || TTEST(I3O(c),IO(c+1),IO(c+2),fUTF8_2) ||  QTEST(I4O(c),IO(c+1),IO(c+2),IO(c+3),fUTF8_3) || VTEST(I5O(c),IO(c+1),IO(c+2),IO(c+3),IO(c+4),fUTF8_4) || STEST(I6O(c),IO(c+1),IO(c+2),IO(c+3),IO(c+4),IO(c+5),fUTF8_5) )

/* single character test; does not change char_filter_state */
#define ISO8859(c) ( RANGE(c,0xA1,0xFE) )
668 
/* atom without slash */
/* Indexed by character code: a nonzero entry (the character itself)
 * marks a character accepted by ATOM(). Only the first 128 entries
 * are listed; the rest of the array is zero-initialized. */
static char rfc2822_atom[256] = {
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
  0, '!',   0, '#', '$', '%', '&',   0,   0,   0, '*', '+',   0, '-',   0,   0,
/*   0, '!',   0, '#', '$', '%', '&','\'',   0,   0, '*', '+',   0, '-',   0, '/', */
  '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',   0,   0,   0, '=',   0, '?',
  0, 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
  'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',   0,   0,   0, '^', '_',
  '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
  'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~',  0,
};

/* ATOM: the value at c is below 255 (as an unsigned int, so negative
 * values are rejected too) and has a nonzero table entry */
#define ATOM(c) (((unsigned int)(*(c)) < 255) && rfc2822_atom[(unsigned int)(*(c))])
/* the tests below accept either a character of the given class at c,
 * or a '.' (resp. ':') at c directly followed by such a character */
#define DOTTED_ATOM(c) ( ((c[0] == mbw_lit('.')) && ATOM((c+1))) || ATOM(c) )
#define COLON_ATOM(c) ( ((c[0] == mbw_lit(':')) && ATOM((c+1))) || ATOM(c) )
#define DOTTED_DIGITS(c) ( ((c[0] == mbw_lit('.')) && mbw_isdigit(c[1])) || mbw_isdigit(c[0]) )
#define COLON_DIGITS(c) ( ((c[0] == mbw_lit(':')) && mbw_isdigit(c[1])) || mbw_isdigit(c[0]) )
#define DOTTED_ALPHA(c) ( ((c[0] == mbw_lit('.')) && mbw_isalpha(c[1])) || mbw_isalpha(c[0]) )
688 
689 
/* warning: macros and modules work directly with this structure */
/* Current state of the tokenizer modules, shared by all of them.
 * The numbering starts at 1, so every state is nonzero; the xTEST
 * macros rely on this when they use an assignment to this variable
 * as a condition. */
static enum {
  fDEF = 1,   /* default state */
  fANX,
  fNAX,
  fMUL,
  fCUR,
  fADD,
  fSEP_1, fSEP_2, fSEP_3,   /* repeated character module */
  fUTF8_1, fUTF8_2, fUTF8_3, fUTF8_4, fUTF8_5,   /* unicode module */
  /* asian character module */
  fShift_JIS_1,
  fEUC_Japanese_1, fEUC_Japanese_2,
  fBIG5_1,
  fBIG5P_1,
  fEUC_CN_1,
  fEUC_TW_1,fEUC_TW_2,fEUC_TW_3,
  fJohab_1,
  fALNUM,     /* alphanumeric module */
  fALPHA,     /* alpha module */
  fNUMERIC,   /* numeric module */
  fSYMBOL,    /* punctuation module */
  fANSX_1, fANSX_2, fANSX_3,
  fCEF2_ATOM, fCEF2_DOTTED_ATOM, fCEF2_COLON_ATOM
} char_filter_state = fDEF;
714 
715 /*
716  * asian character tokens
717  */
718 
/* macro to be used in a case statement: lists the states of the asian
 * character module. The first label omits the `case' keyword and the
 * last one omits the colon, i.e. it is written as
 *   case ASIAN_CASES:
 */
#define ASIAN_CASES fShift_JIS_1: case fBIG5_1: case fBIG5P_1: case fEUC_CN_1: case fJohab_1: case fEUC_TW_1: case fEUC_TW_2: case fEUC_TW_3: case fEUC_Japanese_1: case fEUC_Japanese_2
721 
722 static __inline__
mbw_prefix(is_asian_case)723 good_char_t mbw_prefix(is_asian_case)(const mbw_t *c) {
724   return (Shift_JIS(c) || EUC_Japanese(c) ||
725 	  BIG5(c) || BIG5P(c) || EUC_CN(c) || EUC_TW(c) || Johab(c)) ? gcTOKEN : gcUNDEF;
726 }
727 
728 static __inline__
mbw_prefix(handle_asian_case)729 good_char_t mbw_prefix(handle_asian_case)(const mbw_t *c) {
730   switch(char_filter_state) {
731   case fShift_JIS_1:
732   case fBIG5_1:
733   case fBIG5P_1:
734   case fEUC_CN_1:
735   case fJohab_1:
736     char_filter_state = fDEF;
737     return gcTOKEN_END;
738   case fEUC_TW_1:
739     char_filter_state = fDEF;
740     return gcTOKEN_END;
741   case fEUC_TW_2:
742     char_filter_state = fEUC_TW_1;
743     return gcTOKEN;
744   case fEUC_TW_3:
745     char_filter_state = fEUC_TW_2;
746     return gcTOKEN;
747   case fEUC_Japanese_1:
748     char_filter_state = fDEF;
749     return gcTOKEN_END;
750   case fEUC_Japanese_2:
751     char_filter_state = fEUC_Japanese_1;
752     return gcTOKEN;
753   default:
754     break;
755   }
756   char_filter_state = fDEF;
757   return gcDISCARD;
758 }
759 
760 /*
761  * utf8 character tokens - only makes sense if parsing multibyte strings
762  */
763 
/* macro to be used in a case statement: lists the states of the
 * unicode module. The first label omits the `case' keyword and the
 * last one omits the colon, i.e. it is written as
 *   case UNICODE_CASES:
 */
#define UNICODE_CASES fUTF8_1: case fUTF8_2: case fUTF8_3: case fUTF8_4: case fUTF8_5
766 
767 static __inline__
mbw_prefix(is_unicode_case)768 good_char_t mbw_prefix(is_unicode_case)(const mbw_t *c) {
769   return (UTF8(c)) ? gcTOKEN : gcUNDEF;
770 }
771 
772 static __inline__
mbw_prefix(handle_unicode_case)773 good_char_t mbw_prefix(handle_unicode_case)(const mbw_t *c) {
774   switch(char_filter_state) {
775   case fUTF8_1:
776     char_filter_state = fDEF;
777     return gcTOKEN_END;
778   case fUTF8_2:
779     char_filter_state = fUTF8_1;
780     return gcTOKEN;
781   case fUTF8_3:
782     char_filter_state = fUTF8_2;
783     return gcTOKEN;
784   case fUTF8_4:
785     char_filter_state = fUTF8_3;
786     return gcTOKEN;
787   case fUTF8_5:
788     char_filter_state = fUTF8_4;
789     return gcTOKEN;
790   default:
791     break;
792   }
793   char_filter_state = fDEF;
794   return gcDISCARD;
795 }
796 
797 /*
798  * alpha character tokens
799  */
800 
/* the state used by the alpha module; presumably meant to be used
 * as a case label, like the multi-state *_CASES macros */
#define ALPHA_CASES fALPHA
802 
803 /* checks for alpha and switches to alphabetic state, or
804    returns gcUNDEF if unrecognized */
805 static __inline__
mbw_prefix(is_alpha_case)806 good_char_t mbw_prefix(is_alpha_case)(const mbw_t *c) {
807   if( mbw_isalpha(*c++) ) {
808     if( mbw_isalpha(*c) || (*c == mbw_lit('\0')) ) {
809       char_filter_state = fALPHA;
810       return gcTOKEN;
811     }
812     return gcTOKEN_END;
813   }
814   return gcUNDEF;
815 }
816 
817 /* checks for alpha or discards, may switch back to default state */
818 static __inline__
mbw_prefix(handle_alpha_case)819 good_char_t mbw_prefix(handle_alpha_case)(const mbw_t *c) {
820   if( mbw_isalpha(*c++) ) {
821     if( mbw_isalpha(*c) || (*c == mbw_lit('\0')) ) {
822       return gcTOKEN;
823     }
824     char_filter_state = fDEF;
825     return gcTOKEN_END;
826   }
827   char_filter_state = fDEF;
828   return gcDISCARD;
829 }
830 
831 /*
832  * alphanumeric character tokens
833  */
834 
/* the state used by the alphanumeric module; presumably meant to be
 * used as a case label, like the multi-state *_CASES macros */
#define ALNUM_CASES fALNUM
836 
837 /* checks for alnum and switches to alphanumeric state, or
838    returns gcUNDEF if unrecognized */
839 static __inline__
mbw_prefix(is_alnum_case)840 good_char_t mbw_prefix(is_alnum_case)(const mbw_t *c) {
841   if( mbw_isalnum(*c++) ) {
842     if( mbw_isalnum(*c) || (*c == mbw_lit('\0')) ) {
843       char_filter_state = fALNUM;
844       return gcTOKEN;
845     } else {
846       return gcTOKEN_END;
847     }
848   }
849   return gcUNDEF;
850 }
851 
852 /* checks for alnum or discards, may switch back to default state */
853 static __inline__
mbw_prefix(handle_alnum_case)854 good_char_t mbw_prefix(handle_alnum_case)(const mbw_t *c) {
855   if( mbw_isalnum(*c) ) {
856     if( mbw_isalnum(*(++c)) || (*c == mbw_lit('\0')) ) {
857       return gcTOKEN;
858     }
859     char_filter_state = fDEF;
860     return gcTOKEN_END;
861   }
862   char_filter_state = fDEF;
863   return gcDISCARD;
864 }
865 
866 /*
867  * numeric tokens
868  */
869 
/* the state used by the numeric module; presumably meant to be used
 * as a case label, like the multi-state *_CASES macros */
#define NUMERIC_CASES fNUMERIC
871 
872 /* checks for digit and switches to numeric state, or
873    returns gcUNDEF if unrecognized */
874 static __inline__
mbw_prefix(is_numeric_case)875 good_char_t mbw_prefix(is_numeric_case)(const mbw_t *c) {
876   if( mbw_isdigit(*c++) ) {
877     if( mbw_isdigit(*c) || (*c == mbw_lit('\0')) ) {
878       char_filter_state = fNUMERIC;
879       return gcTOKEN;
880     } else {
881       return gcTOKEN_END;
882     }
883   }
884   return gcUNDEF;
885 }
886 
887 /* checks for numeric or discards, may switch back to default state */
888 static __inline__
mbw_prefix(handle_numeric_case)889 good_char_t mbw_prefix(handle_numeric_case)(const mbw_t *c) {
890   if( mbw_isdigit(*c++) ) {
891     if( mbw_isdigit(*c) || (*c == mbw_lit('\0')) ) {
892       return gcTOKEN;
893     }
894     char_filter_state = fDEF;
895     return gcTOKEN_END;
896   }
897   char_filter_state = fDEF;
898   return gcDISCARD;
899 }
900 
901 /*
902  * punctuation tokens
903  */
904 
/* the state used by the punctuation module; presumably meant to be
 * used as a case label, like the multi-state *_CASES macros */
#define SYMBOLIC_CASES fSYMBOL
906 
907 static __inline__
mbw_prefix(is_symbolic_case)908 good_char_t mbw_prefix(is_symbolic_case)(const mbw_t *c) {
909   if( mbw_ispunct(*c++) ) {
910     if( mbw_ispunct(*c) || (*c == mbw_lit('\0')) ) {
911       char_filter_state = fSYMBOL;
912       return gcTOKEN;
913     } else {
914       return gcTOKEN_END;
915     }
916   }
917   return gcUNDEF;
918 }
919 
920 static __inline__
mbw_prefix(handle_symbolic_case)921 good_char_t mbw_prefix(handle_symbolic_case)(const mbw_t *c) {
922   if( mbw_ispunct(*c++) ) {
923     if( mbw_ispunct(*c) || (*c == mbw_lit('\0')) ) {
924       if( (c[-1] == c[0]) && (c[0] == c[1]) ) {
925 	return gcIGNORE;
926       }
927       return gcTOKEN;
928     }
929     char_filter_state = fDEF;
930     return gcTOKEN_END;
931   }
932   char_filter_state = fDEF;
933   return gcDISCARD;
934 }
935 
936 /*
937  * repeated character tokens, squeezed down to 3.
938  */
939 
940 #define REPEAT_CASES fSEP_1: case fSEP_2: case fSEP_3
941 
942 /* checks for repeated char, replaces with 3 copies only, or
943    returns gcUNDEF if unrecognized */
944 static __inline__
mbw_prefix(is_repeat_case)945 good_char_t mbw_prefix(is_repeat_case)(const mbw_t *c) {
946   if( (c[1] == c[0]) ) {
947     if( (c[2] == c[0]) ) {
948       char_filter_state = fSEP_3;
949       return gcTOKEN;
950     } else {
951       char_filter_state = fSEP_2;
952       return gcTOKEN;
953     }
954   }
955   return gcUNDEF;
956 }
957 
958 /* checks for repeated char, replaces with 3 copies only */
959 static __inline__
mbw_prefix(handle_repeat_case)960 good_char_t mbw_prefix(handle_repeat_case)(const mbw_t *c) {
961   switch(char_filter_state) {
962   case fSEP_1:
963     if( c[0] != c[1] ) {
964       char_filter_state = fDEF;
965     }
966     return gcDISCARD;
967   case fSEP_2:
968     char_filter_state = fSEP_1;
969     return gcTOKEN;
970   case fSEP_3:
971     char_filter_state = fSEP_2;
972     return gcTOKEN;
973   default:
974     break;
975   }
976   char_filter_state = fDEF;
977   return gcDISCARD;
978 }
979 
980 /*
981  * currency tokens, very simple and naive, not localized
982  */
983 
984 #define CURRENCY_CASES fCUR
985 
986 /* checks for currency, or
987    returns gcUNDEF if unrecognized */
988 static __inline__
mbw_prefix(is_currency_case)989 good_char_t mbw_prefix(is_currency_case)(const mbw_t *c) {
990   /* this should be done properly (locale) sometime ... */
991   if( (*c == mbw_lit('$')) || (*c == mbw_lit('\xa3')) ) {
992     if( mbw_isdigit(c[1]) &&
993 	(!mbw_isdigit(c[2]) || !mbw_isdigit(c[3]) || !mbw_isdigit(c[4])) ) {
994       char_filter_state = fCUR;
995       return gcTOKEN;
996     }
997   }
998   return gcUNDEF;
999 }
1000 
1001 /* checks for currency */
1002 static __inline__
mbw_prefix(handle_currency_case)1003 good_char_t mbw_prefix(handle_currency_case)(const mbw_t *c) {
1004   if( mbw_isdigit(c[1]) ) {
1005     return gcTOKEN;
1006   } else if( SET2(c+1) && mbw_isdigit(c[2]) ) {
1007     char_filter_state = fCUR;
1008     return gcTOKEN;
1009   }
1010   char_filter_state = fDEF;
1011   return gcTOKEN_END;
1012 }
1013 
1014 /*
1015  * internet embedded address
1016  */
1017 
1018 #define ADDRESS_CASES fADD
1019 
1020 static __inline__
mbw_prefix(is_address_case)1021 good_char_t mbw_prefix(is_address_case)(const mbw_t *c) {
1022   if( *c == mbw_lit('<') ) {
1023     for(c++; SET3(c); c++);
1024     if( *c == mbw_lit('>') ) {
1025       char_filter_state = fADD;
1026     }
1027   }
1028   return gcUNDEF;
1029 }
1030 
1031 static __inline__
mbw_prefix(handle_address_case)1032 good_char_t mbw_prefix(handle_address_case)(const mbw_t *c) {
1033   switch(*c) {
1034   case mbw_lit('@'):
1035     /*     case mbw_lit('#'): */
1036     /*     case mbw_lit('?'): */
1037     /*     case mbw_lit('&'): */
1038     /*     case mbw_lit(':'): */
1039     /*     case mbw_lit('/'): */
1040     return gcDISCARD;
1041   case mbw_lit('>'):
1042     char_filter_state = fDEF;
1043     return gcDISCARD;
1044   default:
1045     break;
1046   }
1047   return gcTOKEN;
1048 }
1049 
1050 /*
1051  * multiple alpha tokens separated by punctuation
1052  */
1053 
1054 #define MULTI_ALPHA_CASES fMUL
1055 
1056 /* checks for alpha and switches to alphabetic state, or
1057    returns gcUNDEF if unrecognized */
1058 static __inline__
mbw_prefix(is_multi_alpha_case)1059 good_char_t mbw_prefix(is_multi_alpha_case)(const mbw_t *c) {
1060   /* don't increment c in SET1 */
1061   if( mbw_isalpha(*c++) && SET1(c) && mbw_isalpha(*(++c)) ) {
1062     char_filter_state = fMUL;
1063     return gcTOKEN;
1064   }
1065   return gcUNDEF;
1066 }
1067 
1068 /* checks for alpha or discards, may switch back to default state */
1069 static __inline__
mbw_prefix(handle_multi_alpha_case)1070 good_char_t mbw_prefix(handle_multi_alpha_case)(const mbw_t *c) {
1071   if( mbw_isalpha(c[1]) ) {
1072     return gcTOKEN;
1073   } else if( SET1(c+1) && mbw_isalpha(c[2]) ) {
1074     return gcTOKEN;
1075   }
1076   char_filter_state = fDEF;
1077   return gcTOKEN_END;
1078 }
1079 
1080 /*
1081  * xxx123 identifiers
1082  */
1083 
1084 #define ALPHA_NUMBER_CASES fANX
1085 
1086 static __inline__
mbw_prefix(is_alpha_number_case)1087 good_char_t mbw_prefix(is_alpha_number_case)(const mbw_t *c) {
1088   if( mbw_isalpha(*c++) && mbw_isdigit(*c) ) {
1089     char_filter_state = fANX;
1090     return gcTOKEN;
1091   }
1092   return gcUNDEF;
1093 }
1094 
1095 static __inline__
mbw_prefix(handle_alpha_number_case)1096 good_char_t mbw_prefix(handle_alpha_number_case)(const mbw_t *c) {
1097   if( mbw_isdigit(*c++) ) {
1098     if( mbw_isdigit(*c) ) {
1099       return gcTOKEN;
1100     }
1101     char_filter_state = fDEF;
1102     return gcTOKEN_END;
1103   }
1104   char_filter_state = fDEF;
1105   return gcDISCARD;
1106 }
1107 
1108 /*
1109  * 123xxx identifiers
1110  */
1111 
1112 #define NUMBER_ALPHA_CASES fNAX
1113 
1114 static __inline__
mbw_prefix(is_number_alpha_case)1115 good_char_t mbw_prefix(is_number_alpha_case)(const mbw_t *c) {
1116   if( mbw_isdigit(*c++) && mbw_isalpha(*c) ) {
1117     char_filter_state = fNAX;
1118     return gcTOKEN;
1119   }
1120   return gcUNDEF;
1121 }
1122 
1123 static __inline__
mbw_prefix(handle_number_alpha_case)1124 good_char_t mbw_prefix(handle_number_alpha_case)(const mbw_t *c) {
1125   if( mbw_isalpha(*c++) ) {
1126     if( mbw_isalpha(*c) ) {
1127       return gcTOKEN;
1128     }
1129     char_filter_state = fDEF;
1130     return gcTOKEN_END;
1131   }
1132   char_filter_state = fDEF;
1133   return gcDISCARD;
1134 }
1135 
1136 /*
1137  * xxx123% identifiers
1138  */
1139 
1140 #define ALPHA_NUMBER_SYMBOL_CASES fANSX_1: case fANSX_2: case fANSX_3
1141 
1142 static __inline__
mbw_prefix(is_alpha_number_symbol_case)1143 good_char_t mbw_prefix(is_alpha_number_symbol_case)(const mbw_t *c) {
1144   if( mbw_isalpha(*c++) ) {
1145     if( mbw_isdigit(*c) ) {
1146       char_filter_state = fANSX_2;
1147       return gcTOKEN;
1148     } else if( mbw_ispunct(*c) ) {
1149       char_filter_state = fANSX_1;
1150       return gcTOKEN;
1151     } else if( !mbw_isalpha(*c) || (*c == mbw_lit('\0')) ) {
1152       return gcTOKEN_END;
1153     }
1154     char_filter_state = fANSX_3;
1155     return gcTOKEN;
1156   }
1157   return gcUNDEF;
1158 }
1159 
1160 static __inline__
mbw_prefix(is_number_symbol_case)1161 good_char_t mbw_prefix(is_number_symbol_case)(const mbw_t *c) {
1162   if( mbw_isdigit(*c++) ) {
1163     if( mbw_ispunct(*c) ) {
1164       char_filter_state = fANSX_1;
1165       return gcTOKEN;
1166     } else if( !mbw_isdigit(*c) || (*c == mbw_lit('\0')) ) {
1167       return gcTOKEN_END;
1168     }
1169     char_filter_state = fANSX_2;
1170     return gcTOKEN;
1171   }
1172   return gcUNDEF;
1173 }
1174 
1175 static __inline__
mbw_prefix(handle_alpha_number_symbol_case)1176 good_char_t mbw_prefix(handle_alpha_number_symbol_case)(const mbw_t *c) {
1177   if( *(c++) == mbw_lit('\0') ) {
1178     return gcTOKEN;
1179   }
1180   switch(char_filter_state) {
1181   case fANSX_1:
1182     if( !mbw_ispunct(*c) ) {
1183       char_filter_state = fDEF;
1184       return gcTOKEN_END;
1185     } else if( (c[-1] == c[0]) && (c[0] == c[1]) ) {
1186       return gcIGNORE;
1187     }
1188     return gcTOKEN;
1189   case fANSX_2:
1190     if( mbw_ispunct(*c) ) {
1191       char_filter_state = fANSX_1;
1192       return gcTOKEN;
1193     } else if( mbw_isalpha(*c) || ISO8859(c) ) {
1194       char_filter_state = fANSX_3;
1195       return gcTOKEN;
1196     } else if( !mbw_isdigit(*c) ) {
1197       char_filter_state = fDEF;
1198       return gcTOKEN_END;
1199     }
1200     return gcTOKEN;
1201   case fANSX_3:
1202     if( mbw_isdigit(*c) ) {
1203       char_filter_state = fANSX_2;
1204       return gcTOKEN;
1205     } else if( mbw_ispunct(*c) ) {
1206       char_filter_state = fANSX_1;
1207       return gcTOKEN;
1208     } else if( !mbw_isalpha(*c) && !ISO8859(c) ) {
1209       char_filter_state = fDEF;
1210       return gcTOKEN_END;
1211     } else if( (c[-1] == c[0]) && (c[0] == c[1]) ) {
1212       return gcIGNORE;
1213     }
1214     return gcTOKEN;
1215   default:
1216     /* ignore */
1217     break;
1218   }
1219   char_filter_state = fDEF;
1220   return gcDISCARD;
1221 }
1222 
1223 
1224 /*
1225  * This is the CEF (common encoding formats) tokenizer.
1226  * It was the first attempt at a specialized email tokenizer.
1227  */
1228 
1229 static __inline__
mbw_prefix(is_cef_char)1230 good_char_t mbw_prefix(is_cef_char)(const mbw_t *c) {
1231   good_char_t retval;
1232   switch(char_filter_state) {
1233   case fDEF:
1234 
1235 #if defined MBW_MB
1236     /* this doesn't make sense for wide characters */
1237     if(*c & 0x80) {
1238       if( mbw_prefix(is_unicode_case)(c) ||
1239 	  mbw_prefix(is_asian_case)(c) ) {
1240 	return gcTOKEN;
1241       } else if( *c < 0xa0 ) {
1242 	return gcDISCARD;
1243       }
1244     }
1245 #endif
1246 
1247     if( mbw_isalpha(*c) ) {
1248       if( (retval = mbw_prefix(is_alpha_number_case)(c)) ||
1249 	  (retval = mbw_prefix(is_multi_alpha_case)(c)) ) {
1250 	return retval;
1251       }
1252       return gcTOKEN;
1253     } else if( mbw_ispunct(*c) ) {
1254       if( (retval = mbw_prefix(is_repeat_case)(c)) ||
1255 	  (retval = mbw_prefix(is_currency_case)(c)) ||
1256 	  (retval = mbw_prefix(is_address_case)(c)) ) {
1257 	return retval;
1258       }
1259     } else if( mbw_isdigit(*c) ) {
1260       if( (retval = mbw_prefix(is_number_alpha_case)(c)) ) {
1261 	return retval;
1262       }
1263     }
1264     return gcDISCARD;
1265   case ALPHA_CASES:
1266     retval = mbw_prefix(handle_alpha_case)(c);
1267     if( retval == gcTOKEN_END ) {
1268       if( (retval = mbw_prefix(is_multi_alpha_case)(c)) ||
1269 	  (retval = mbw_prefix(is_alpha_number_case)(c)) ) {
1270 	return retval;
1271       }
1272     }
1273     return retval;
1274   case ALPHA_NUMBER_CASES:
1275     return mbw_prefix(handle_alpha_number_case)(c);
1276   case NUMBER_ALPHA_CASES:
1277     return mbw_prefix(handle_number_alpha_case)(c);
1278   case MULTI_ALPHA_CASES:
1279     return mbw_prefix(handle_multi_alpha_case)(c);
1280   case ADDRESS_CASES:
1281     return mbw_prefix(handle_address_case)(c);
1282   case CURRENCY_CASES:
1283     return mbw_prefix(handle_currency_case)(c);
1284   case REPEAT_CASES:
1285     return mbw_prefix(handle_repeat_case)(c);
1286   case ASIAN_CASES:
1287     return mbw_prefix(handle_asian_case)(c);
1288   case UNICODE_CASES:
1289     return mbw_prefix(handle_unicode_case)(c);
1290   default:
1291     /* nothing */
1292     break;
1293   }
1294   char_filter_state = fDEF;
1295   return gcDISCARD; /* otherwise compiler complains */
1296 }
1297 
1298 
1299 /*
1300  * This is the ADP (alpha digit punctuation) tokenizer.
1301  * It was the second attempt at a specialized email tokenizer.
1302  */
1303 
1304 static __inline__
mbw_prefix(is_adp_char)1305 good_char_t mbw_prefix(is_adp_char)(const mbw_t *c) {
1306   good_char_t retval;
1307   switch(char_filter_state) {
1308   case fDEF:
1309 #if defined MBW_MB
1310     /* this doesn't make sense for wide characters */
1311     if(*c & 0x80) {
1312       if( (retval = mbw_prefix(is_unicode_case)(c)) ||
1313 	  (retval = mbw_prefix(is_asian_case)(c)) ) {
1314 	return retval;
1315       } else if( *c < 0xa0 ) {
1316 	return gcDISCARD;
1317       }
1318     }
1319 #endif
1320     if( (retval = mbw_prefix(is_alpha_number_symbol_case)(c)) ||
1321 	(retval = mbw_prefix(is_number_symbol_case)(c)) ||
1322 	(retval = mbw_prefix(is_symbolic_case)(c)) ) {
1323       return retval;
1324     }
1325     return gcDISCARD;
1326 
1327   case ALPHA_NUMBER_SYMBOL_CASES:
1328     return mbw_prefix(handle_alpha_number_symbol_case)(c);
1329   case SYMBOLIC_CASES:
1330     return mbw_prefix(handle_symbolic_case)(c);
1331   case ASIAN_CASES:
1332     return mbw_prefix(handle_asian_case)(c);
1333   case UNICODE_CASES:
1334     return mbw_prefix(handle_unicode_case)(c);
1335   default:
1336     /* nothing */
1337     break;
1338   }
1339   char_filter_state = fDEF;
1340   return gcDISCARD;
1341 }
1342 
1343 /*
1344  * This is the CEF2 (common email format v2) tokenizer.
1345  */
1346 
1347 static __inline__
mbw_prefix(is_cef2_special_case)1348 good_char_t mbw_prefix(is_cef2_special_case)(const mbw_t *c) {
1349   return gcUNDEF;
1350 }
1351 
1352 static __inline__
mbw_prefix(is_cef2_atom_case)1353 good_char_t mbw_prefix(is_cef2_atom_case)(const mbw_t *c) {
1354   if( ATOM(c) ) {
1355     char_filter_state = fCEF2_ATOM;
1356     return gcTOKEN;
1357   }
1358   return gcUNDEF;
1359 }
1360 
1361 static __inline__
mbw_prefix(handle_cef2_atom_case)1362 good_char_t mbw_prefix(handle_cef2_atom_case)(const mbw_t *c) {
1363   if( ATOM(c) ) {
1364     return gcTOKEN;
1365   } else if( DOTTED_ATOM(c) ) {
1366     char_filter_state = fCEF2_DOTTED_ATOM;
1367     return gcTOKEN;
1368   } else if( COLON_ATOM(c) ) {
1369     char_filter_state = fCEF2_COLON_ATOM;
1370     return gcTOKEN;
1371   }
1372   char_filter_state = fDEF;
1373   return gcDISCARD;
1374 }
1375 
1376 static __inline__
mbw_prefix(handle_cef2_dotted_atom_case)1377 good_char_t mbw_prefix(handle_cef2_dotted_atom_case)(const mbw_t *c) {
1378   if( DOTTED_ATOM(c) ) {
1379     return gcTOKEN;
1380   }
1381   char_filter_state = fDEF;
1382   return gcDISCARD;
1383 }
1384 
1385 static __inline__
mbw_prefix(handle_cef2_colon_atom_case)1386 good_char_t mbw_prefix(handle_cef2_colon_atom_case)(const mbw_t *c) {
1387   if( COLON_ATOM(c) ) {
1388     return gcTOKEN;
1389   }
1390   char_filter_state = fDEF;
1391   return gcDISCARD;
1392 }
1393 
1394 static __inline__
mbw_prefix(is_cef2_char)1395 good_char_t mbw_prefix(is_cef2_char)(const mbw_t *c) {
1396   good_char_t retval;
1397   switch(char_filter_state) {
1398   case fDEF:
1399 #if defined MBW_MB
1400     /* this doesn't make sense for wide characters */
1401     if(*c & 0x80) {
1402       if( (retval = mbw_prefix(is_unicode_case)(c)) ||
1403 	  (retval = mbw_prefix(is_asian_case)(c)) ) {
1404 	return retval;
1405       } else if( *c < 0xa0 ) {
1406 	return gcDISCARD;
1407       }
1408     }
1409 #endif
1410     if( (retval = mbw_prefix(is_cef2_special_case)(c)) ||
1411 	(retval = mbw_prefix(is_cef2_atom_case)(c)) ) {
1412       return retval;
1413     }
1414     return gcDISCARD;
1415   case fCEF2_ATOM:
1416     return mbw_prefix(handle_cef2_atom_case)(c);
1417   case fCEF2_DOTTED_ATOM:
1418     return mbw_prefix(handle_cef2_dotted_atom_case)(c);
1419   case fCEF2_COLON_ATOM:
1420     return mbw_prefix(handle_cef2_colon_atom_case)(c);
1421   case ASIAN_CASES:
1422     return mbw_prefix(handle_asian_case)(c);
1423   case UNICODE_CASES:
1424     return mbw_prefix(handle_unicode_case)(c);
1425   default:
1426     /* nothing */
1427     break;
1428   }
1429   char_filter_state = fDEF;
1430   return gcDISCARD;
1431 }
1432 
1433 
1434 
1435 static __inline__
mbw_prefix(is_char_char)1436 good_char_t mbw_prefix(is_char_char)(const mbw_t *c) {
1437 /*   return (mbw_isgraph(*c) ? gcTOKEN_END :  */
1438 /* 	  (mbw_isspace(*c) ? (mbw_isspace(c[1]) ? gcDISCARD : gcTOKEN_END) :  */
1439 /* 	   gcDISCARD)); */
1440   return mbw_isgraph(*c) ? gcTOKEN_END : gcDISCARD;
1441 }
1442 
1443 /*
1444  * this code generates good_char() and w_good_char()
1445  * returns true if the character is part of a token
1446  *
1447  * gcTOKEN: character should be part of a token
1448  * gcTOKEN_END: like gcTOKEN, but token must end immediately
1449  * gcDISCARD: character is not part of a token
1450  * gcIGNORE: pretend there is no character here
1451  *
1452  * gcDISCARD is also returned if the line is empty
1453  */
mbw_prefix(good_char)1454 good_char_t mbw_prefix(good_char)(mbw_t *c) {
1455   if( c && (*c != mbw_lit('\0')) ) {
1456     if( !(m_options & (1<<M_OPTION_CASEN)) ) {
1457       *c = mbw_tolower(*c);
1458     }
1459     switch(m_cp) {
1460     case CP_ADP:
1461       return mbw_prefix(is_adp_char)(c);
1462     case CP_CEF2:
1463       return mbw_prefix(is_cef2_char)(c);
1464     case CP_CHAR:
1465       return mbw_prefix(is_char_char)(c);
1466     case CP_ALPHA:
1467       return mbw_isalpha(*c) ? gcTOKEN : gcDISCARD;
1468     case CP_CEF:
1469       return mbw_prefix(is_cef_char)(c);
1470     case CP_ALNUM:
1471       return mbw_isalnum(*c) ? gcTOKEN : gcDISCARD;
1472     case CP_GRAPH:
1473       return mbw_isgraph(*c) ? gcTOKEN : gcDISCARD;
1474     case CP_DEFAULT:
1475       break;
1476     }
1477   }
1478   return gcDISCARD;
1479 }
1480 
/*
 * The regex tokenizer operates on single lines only, ie regexes cannot
 * straddle lines. This makes the code much simpler.
 *
 * p        - the line to scan (NUL terminated)
 * i        - index of the compiled regex in the re[] table
 * word_fun - called once per match with the assembled token, its
 *            token type and the regex number i + 1
 * get_tt   - maps the number of transcribed submatches to a token type
 *
 * For every match, the selected submatches are concatenated into a
 * token of the form
 *   DIAMOND sub1 DIAMOND sub2 DIAMOND ... CLASSEP <class char>
 * and handed to word_fun.  The search then restarts one character
 * after the start of the previous match.
 */
void mbw_prefix(regex_tokenizer)(mbw_t *p, int i,
				 void (*word_fun)(char *, token_type_t, regex_count_t),
				 token_type_t (*get_tt)(token_order_t)) {
  char *q, *cq;
  charbuf_len_t k,l, j;
  int eflag = 0;
  token_type_t tt;
  token_order_t z, order;
  char tok[(MAX_TOKEN_LEN+1)*MAX_SUBMATCH+EXTRA_TOKEN_LEN];
  regmatch_t pmatch[MAX_SUBMATCH];

  /* k is the offset in p where the next search starts, l the line length */
  k = 0;
  l = mbw_strlen(p);
  /* see if a match */
  while( (k < l) && (mbw_regexec(&re[i].regex, p + k,
				 MAX_SUBMATCH, pmatch, eflag) == 0) ) {
    /* all the submatches (delimited by brackets in the regex)
       get concatenated and the result gets word_fun'd */
    q = tok;
    *q++ = DIAMOND;
    /* submatch 0 (the whole match) is skipped; the scan stops at the
       first submatch that did not participate (rm_so == -1) */
    for(order = 0, z = 1;
	(z < MAX_SUBMATCH) && (pmatch[z].rm_so > -1); z++) {
      /* only submatches enabled in the regex's bitmask are used */
      if( !(re[i].submatches & (1<<z)) )
	{ continue; } else { order++; }
      /* transcribe the submatch into tok */
      /* offsets in pmatch are relative to p + k; at most MAX_TOKEN_LEN
	 characters of each submatch are transcribed */
      for(j = pmatch[z].rm_so;
	  ((j < (charbuf_len_t)pmatch[z].rm_eo) &&
	   (j < (charbuf_len_t)pmatch[z].rm_so + MAX_TOKEN_LEN)); j++) {
	/* mbw_copychar is defined elsewhere; presumably it appends the
	   character at q and advances q -- TODO confirm */
	if( m_options & (1<<M_OPTION_CASEN) ) {
	  mbw_copychar(q,p[k + j]);
	} else {
	  mbw_copychar(q,mbw_tolower(p[k+j]));
	}
      }
      *q++ = DIAMOND;
    }

    tt = (*get_tt)(order);

    /* append the class suffix and terminate the token */
    cq = q;
    *cq++ = CLASSEP;
    *cq++ = (char)(AMIN + tt.cls);
    *cq = '\0';

    /* now let each category process the token */
    (*word_fun)(tok, tt, i + 1); /* +1 because i = 0 means INVALID_RE */

    k += pmatch[0].rm_so + 1; /* advance string and repeat */
    /* later searches no longer start at the beginning of the line */
    eflag = REG_NOTBOL;
  }
}
1536 
/*
 * The standard tokenizer converts each acceptable token into a char
 * string, and passes it to word_fun().  To construct a token, the
 * good_char() function is called on each successive input character,
 * obtaining a code as follows:
 *
 * gcTOKEN: The character belongs to an acceptable token, and is
 * copied to the holding buffer at the position pointed by nq, unless
 * the token length would exceed MAX_TOKEN_LEN, in which case we
 * pretend we have a gcTOKEN_END code.
 *
 * gcTOKEN_END: The character belongs to an acceptable token, but the
 * token must be terminated immediately. In this case, we fall through
 * to the gcDISCARD case.
 *
 * gcDISCARD: The character does not belong to a token. In this case,
 * we check that the holding buffer contains usable data, to which we
 * apply the word_fun(). The holding buffer is then reset.
 *
 * If p is NULL, we simply flush the token. The held n-gram is reset
 * at every newline and at the end of the input string, unless
 * M_OPTION_NGRAM_STRADDLE_NL is set, in which case tokens and n-grams
 * may straddle newlines.
 *
 * Parameters:
 * p          - input string, or NULL to flush; good_char() may lower
 *              case its characters in place
 * pq         - in/out: current write position inside hbuf; a NULL or
 *              out of range value restarts with an empty buffer
 * hbuf       - holding buffer, a sequence of DIAMOND separated tokens
 * hbuf_order - in/out: number of tokens currently held in hbuf
 * max_order  - largest n-gram order to generate
 * word_fun   - receives each finished token/n-gram
 * get_tt     - maps an n-gram order to its token type
 */
void mbw_prefix(std_tokenizer)(mbw_t *p, char **pq, char *hbuf,
			       token_order_t *hbuf_order, token_order_t max_order,
			       void (*word_fun)(char *, token_type_t, regex_count_t),
			       token_type_t (*get_tt)(token_order_t)) {
  token_type_t tt;
  token_order_t n, o;
  char *q;
  char *tstart, *qq, *cq;
  bool_t reset;

  if( p && (p[0] == mbw_lit('\0')) ) {
    /* waste of time */
    return;
  }

  /* resume where the previous call left off */
  q = *pq;
  o = *hbuf_order;

  /* an unusable write position means we start with an empty buffer,
     which consists of a single DIAMOND */
  if( !q ||
      (q < hbuf) ||
      (q > hbuf + (MAX_TOKEN_LEN+1)*MAX_SUBMATCH+EXTRA_TOKEN_LEN) ) {
    q = hbuf;
    *q++ = DIAMOND;
#if defined MBW_WIDE
    memset(&copychar_shiftstate, 0, sizeof(mbstate_t));
#endif
  }
  /* find the DIAMOND which opens the token currently being assembled */
  for(tstart = q - 1; *tstart != DIAMOND; --tstart);

  /* p[0] at least is nonzero */
  /* the loop body also runs once for the terminating NUL, for which
     good_char() returns gcDISCARD, so a pending token is closed at the
     end of the string */
  do {
    switch( mbw_prefix(good_char)(p) ) {
    case gcIGNORE:
      /* pretend there is no character here */
      break;
    case gcTOKEN:
      /* append, as long as the token is shorter than MAX_TOKEN_LEN */
      if( p && q < tstart + MAX_TOKEN_LEN ) {
	mbw_copychar(q,*p);
	break;
      }
      /* if we're here, fall through */
    case gcTOKEN_END:
      /* the final character is appended only if it still fits */
      if( p && q < tstart + MAX_TOKEN_LEN + 1) {
	mbw_copychar(q,*p);
      }
      /* don't break, always fall through */
    case gcUNDEF:
    case gcDISCARD:
      /* reset the n-gram at a newline or at the end of the string,
	 unless n-grams are allowed to straddle newlines */
      reset = ( !(m_options & (1<<M_OPTION_NGRAM_STRADDLE_NL)) &&
		p && ( (p[0] == mbw_lit('\0')) ||
		       (p[0] == mbw_lit('\n')) ) );

      /* there is work to do if we are flushing, resetting, or if the
	 last thing written was not a separator, ie a token is pending */
      if( (p == NULL) || reset || (q[-1] != DIAMOND) ) {
	/* close the pending token */
	tstart = q;
	*q++ = DIAMOND;
	*q = '\0';

	if( max_order == 1 ) {
	  /* unigrams only: emit the token and empty the buffer */
	  tt = (*get_tt)(1);
	  cq = q;
	  *cq++ = CLASSEP;
	  *cq++ = (char)(AMIN + tt.cls);
	  *cq = '\0';
	  /* let each category process the token */
	  (*word_fun)(hbuf, tt, INVALID_RE);
	  tstart = q = hbuf;
	  *q++ = DIAMOND;
	} else if( p ) {
	  /* do this only if we have a line to work with */
	  if( ++o > max_order ) {
	    o--;
	    /* move all tokens down by one */
	    /* ie find the end of the oldest token, then copy the rest
	       of the buffer over it */
	    for(q = hbuf + 1; *q != DIAMOND; q++) {};
	    for(q++, qq = hbuf + 1; *q; *qq++ = *q++) {};
	    *qq = '\0';
	    tstart = q = qq;
	  }

	  tt = (*get_tt)(o);

	  /* append the class suffix after the last token */
	  cq = q;
	  *cq++ = CLASSEP;
	  *cq++ = (char)(AMIN + tt.cls);
	  *cq = '\0';

	  /* emit the full n-gram first, then each shorter n-gram
	     obtained by dropping the leading token */
	  qq = hbuf;
	  for(n = o; n > 0; n--) {
	    /* let each category process the token */
	    tt.order = n;
	    (*word_fun)(qq, tt, INVALID_RE);
	    qq++;
	    /* skip to next token and repeat */
	    while(*qq != DIAMOND ) { qq++; }
	  }
	}
	if( reset ) {
	  /* reset the current ngrams to zero */
	  tstart = hbuf;
	  q = hbuf + 1;
	  o = 0;
	}
      }

    }
  } while( p && (*(p++) != mbw_lit('\0')) );

  /* hand the buffer position and held token count back to the caller */
  *pq = q;
  *hbuf_order = o;
}
1669 
1670 
1671 
1672 /***********************************************************
1673  * FILTERING FUNCTIONS                                     *
1674  ***********************************************************/
1675 
1676 /*
1677  * this code generates mhe_line_filter() and w_mhe_line_filter()
1678  * translates a MIME message header extension encoded
1679  * token into its equivalent byte sequence.
1680  */
mbw_prefix(mhe_line_filter)1681 bool_t mbw_prefix(mhe_line_filter)(mbw_t *line) {
1682   mbw_t *p = line;
1683   mbw_t *q = line;
1684   mbw_t *r;
1685 
1686   while( *p ) {
1687     if( (p[0] == mbw_lit('=')) && (p[1] == mbw_lit('?')) ) {
1688       r = p + 2;
1689       while( *r && (*r != mbw_lit('?'))) { r++; }
1690       r++;
1691       /* I think lower case is illegal */
1692       if( (*r == mbw_lit('Q')) || (*r == mbw_lit('q')) ) {
1693 	if( *(++r) == mbw_lit('?') ) {
1694 	  r++;
1695 	  /* we are now committed. find end marker and replace with NUL */
1696 	  for(p = r; *p; p++) {
1697 	    if( *p == mbw_lit('_') ) {
1698 	      *p = ' ';
1699 	    } else if( (p[0] == mbw_lit('?')) && (p[1] == mbw_lit('=')) ) {
1700 	      *p = mbw_lit('\0');
1701 	      break;
1702 	    }
1703 	  }
1704 	  q = mbw_prefix(qp_line_filter2)(r, q);
1705 	  p += 2;
1706 	} else {
1707 	  /* malformed encoding */
1708 	  *q++ = *p++;
1709 	}
1710       } else if( (*r == mbw_lit('B')) || (*r == mbw_lit('b')) ) {
1711 	/* I think lower case is illegal, but we're lenient */
1712 	if( *(++r) == mbw_lit('?') ) {
1713 	  r++;
1714 	  /* we are now committed. find end marker and replace with NUL */
1715 	  for(p = r; *p; p++) {
1716 	    if( (p[0] == mbw_lit('?')) && (p[1] == mbw_lit('=')) ) {
1717 	      *p = mbw_lit('\0');
1718 	      break;
1719 	    }
1720 	  }
1721 	  q = mbw_prefix(b64_line_filter2)(r, q);
1722 	  p += 2;
1723 	} else {
1724 	  /* malformed encoding */
1725 	  *q++ = *p++;
1726 	}
1727       } else {
1728 	/* malformed encoding */
1729 	*q++ = *p++;
1730       }
1731     } else {
1732       *q++ = *p++;
1733     }
1734   }
1735   *q = '\0';
1736   return 1;
1737 }
1738 
mbw_prefix(extract_header_label)1739 int mbw_prefix(extract_header_label)(MBOX_State *mbox, mbw_t *line) {
1740   mbw_t *p = line;
1741 
1742   if( m_options & (1<<M_OPTION_XHEADERS) ) {
1743     if( (mbw_strncasecmp(p, mbw_lit("X-DBACL"),7) == 0) ||
1744 	(mbw_strncasecmp(p, mbw_lit("Date:"),4) == 0) ||
1745 	(mbw_strncasecmp(p, mbw_lit("Path:"),4) == 0) ||
1746 	(mbw_strncasecmp(p, mbw_lit("Posted:"),6) == 0) ||
1747 	(mbw_strncasecmp(p, mbw_lit("Expires:"),7) == 0) ||
1748 	(mbw_strncasecmp(p, mbw_lit("Received:"),8) == 0) ||
1749 	(mbw_strncasecmp(p, mbw_lit("Resent-Date:"),11) == 0) ||
1750 	(mbw_strncasecmp(p, mbw_lit("Delivery-Date:"),13) == 0) ||
1751 	(mbw_isspace(line[0]) && mbox->skip_header) ) {
1752       mbox->skip_header = 1;
1753       return 0;
1754     } else {
1755       mbox->skip_header = 0;
1756       return mbw_prefix(mhe_line_filter)(line);
1757     }
1758   }
1759 
1760   return 0;
1761 }
1762 
1763 /***********************************************************
1764  * MBOX PARSING FUNCTIONS                                  *
1765  ***********************************************************/
1766 
1767 /*
1768  * this code generates extract_mime_boundary() and w_extract_mime_boundary()
1769  * retrieves the MIME boundary if one is found. Doesn't cope with rfc2184
1770  */
mbw_prefix(extract_mime_boundary)1771 bool_t mbw_prefix(extract_mime_boundary)(MBOX_State *mbox,mbw_t *line) {
1772   const mbw_t *q;
1773   mbw_t *r;
1774   int size;
1775   bool_t quoted = 0; /* used both in calsulation and for return value */
1776 
1777   q = mbw_prefix(mystrcasestr)(line, mbw_lit("boundary="));
1778 
1779   if( q ) {
1780     /* we skip white space after = sign, even though it is not allowed */
1781     for(q += 9; mbw_isspace(*q); q++);
1782     if( *q ) {
1783       quoted = (*q == mbw_lit('"'));
1784       r = mbox->boundary.mbw_prefix(identifier)[mbox->boundary.index];
1785       size = 0;
1786       if( quoted ) {
1787 	for(q++; *q && (*q != mbw_lit('"')) &&
1788 	      (size < MAX_BOUNDARY_BUFSIZE); q++, size++) {
1789 	  *r++ = *q;
1790 	}
1791       } else {
1792 	for(; *q && !mbw_isspace(*q) &&
1793 	      (size < MAX_BOUNDARY_BUFSIZE); q++, size++) {
1794 	  *r++ = *q;
1795 	}
1796       }
1797       mbox->boundary.size[mbox->boundary.index] = size;
1798       if( ++mbox->boundary.index >= MAX_BOUNDARIES ) { mbox->boundary.index = 0; }
1799       quoted = (size > 0) ? 1 : 0;
1800     } else {
1801       /* this is bad */
1802       quoted = 0;
1803     }
1804   }
1805 
1806   /* MIME messages look like this: head-preamble-sec1-...-secN-postamble,
1807    * and the RFCs recommend that preambles/postambles be ignored.
1808    * However, this introduces a loophole for spammers, who can define a
1809    * boundary, but then never cite it. "Robust" MUAs will show the
1810    * contents of the preamble, but we would not see it. To ignore preambles,
1811    * define the symbol IGNORE_MIME_PREAMBLE below.
1812    */
1813 #undef IGNORE_MIME_PREAMBLE
1814 
1815 #if defined(IGNORE_MIME_PREAMBLE)
1816   return quoted;
1817 #else
1818   return 0;
1819 #endif
1820 }
1821 
1822 
1823 static
mbw_prefix(check_old_style_digest)1824 bool_t mbw_prefix(check_old_style_digest)(const mbw_t *line) {
1825 
1826 #define THIRTYDASHES mbw_lit("------------------------------")
1827 #define SEVENTYDASHES mbw_lit("----------------------------------------------------------------------")
1828 
1829   /* messages are separated by either exactly 30 or exactly 70 dashes */
1830   return ( ((mbw_strncmp(line, THIRTYDASHES, 30) == 0) &&
1831 	    (line[30] == mbw_lit('\r') || line[30] == mbw_lit('\n'))) ||
1832 	   ((mbw_strncmp(line, SEVENTYDASHES, 70) == 0) &&
1833 	    (line[70] == mbw_lit('\r') || line[70] == mbw_lit('\n'))) );
1834 }
1835 
1836 /* static */
1837 /* bool_t mbw_prefix(outlook_message_announce)(const mbw_t *line) { */
1838 /* #define OLDASHES mbw_lit("-----Original Message-----") */
1839 /* #define OEDASHES mbw_lit("----- Original Message -----") */
1840 /*   return ( ((mbw_strncmp(line, OLDASHES, 26) == 0) && */
1841 /* 	    (line[26] == mbw_lit('\r') || line[26] == mbw_lit('\n'))) || */
1842 /* 	   ((mbw_strncmp(line, OEDASHES, 28) == 0) && */
1843 /* 	    (line[28] == mbw_lit('\r') || line[28] == mbw_lit('\n'))) ); */
1844 /* } */
1845 
/*
 * this code generates check_mime_boundary() and w_check_mime_boundary()
 * The check is only approximate.
 *
 * Compares the text of line, starting at its third character, with the
 * boundaries stored in mbox->boundary, beginning with the most recently
 * stored one.  Returns 1 if a stored boundary is matched completely and
 * 0 otherwise.  mbox->boundary.was_end is set to 1 when the matched
 * boundary is directly followed by "--", and to 0 in all other cases.
 *
 * NOTE(review): the first two characters of line are skipped without
 * being looked at; presumably the caller has already checked that the
 * line starts with "--" -- confirm.
 */
bool_t mbw_prefix(check_mime_boundary)(MBOX_State *mbox, const mbw_t *line) {
  /* c is the candidate slot: the one filled last, ie the slot before
     boundary.index with wrap around */
  int c = (mbox->boundary.index > 0) ?
    (mbox->boundary.index - 1) : (MAX_BOUNDARIES - 1);
  /* k counts the characters matched so far */
  int k = 0;
  const mbw_t *p = mbox->boundary.mbw_prefix(identifier)[c];
  const mbw_t *q = line + 2;
  while(*q) {
    if( (k >= mbox->boundary.size[c]) || (*q != *p) ) {
      /* the candidate failed at offset k: switch to the previous slot
	 and carry on at the same offset.  The first k characters are
	 not compared again, which is why the check is approximate. */
      c--;
      if( c < 0 ) { c = MAX_BOUNDARIES - 1; }
      p = &mbox->boundary.mbw_prefix(identifier)[c][k];
      /* once the candidates have wrapped around to slot
	 boundary.index and that one fails as well, give up */
      if( c == mbox->boundary.index ) {
	if( (k >= mbox->boundary.size[c]) || (*q != *p) ) {
	  mbox->boundary.was_end = 0;
	  return 0;
	}
      }
    } else if( k == mbox->boundary.size[c] - 1) {
      /* the last character of the candidate matched; a following "--"
	 marks the final boundary */
      if((q[1] == mbw_lit('-')) && (q[2] == mbw_lit('-'))) {
	mbox->boundary.was_end = 1;
      } else {
	mbox->boundary.was_end = 0;
      }
      return 1;
    } else {
      /* normally, a space isn't allowed in the boundary, but we're lenient */
      q++;
      p++;
      k++;
    }
  }
  /* the line ended before a whole boundary was matched */
  mbox->boundary.was_end = 0;
  return 0;
}
1884 
1885 static
mbw_prefix(check_armor_start)1886 bool_t mbw_prefix(check_armor_start)(const mbw_t *line) {
1887   int i;
1888   for(i = 0; i < num_armor_start; i++) {
1889     if( mbw_strncmp(line, mbw_prefix(armor_start)[i],
1890 		    mbw_strlen(mbw_prefix(armor_start)[i])) == 0 ) {
1891       return 1;
1892     }
1893   }
1894   if( (mbw_strncmp(line, mbw_lit("begin "), 6) == 0) &&
1895       ISOCT(line[6]) && ISOCT(line[7]) && ISOCT(line[8]) ) {
1896     /* uuencoded */
1897     return 1;
1898   } else if( (mbw_strncmp(line, mbw_lit("=ybegin"), 7) == 0) &&
1899 	     ((line[7] == mbw_lit(' ')) || (line[7] == mbw_lit('2'))) &&
1900 	     mbw_prefix(mystrcasestr)(line + 8, mbw_lit("line=")) &&
1901 	     mbw_prefix(mystrcasestr)(line + 8, mbw_lit("size=")) &&
1902 	     mbw_prefix(mystrcasestr)(line + 8, mbw_lit("name=")) ) {
1903     /* yEnc */
1904     return 1;
1905   }
1906   return 0;
1907 }
1908 
1909 static
mbw_prefix(check_armor_end)1910 bool_t mbw_prefix(check_armor_end)(const mbw_t *line) {
1911   int i;
1912   for(i = 0; i < num_armor_end; i++) {
1913     if( mbw_strncmp(line, mbw_prefix(armor_end)[i],
1914 		    mbw_strlen(mbw_prefix(armor_end)[i])) == 0 ) {
1915       return 1;
1916     }
1917   }
1918   if( (mbw_strncmp(line, mbw_lit("end"),3) == 0) &&
1919       (!line[3] || (line[3] == mbw_lit('\n')) || (line[3] == mbw_lit('\r'))) ) {
1920     /* uuencoded */
1921     return 1;
1922   } else if( (mbw_strncmp(line, mbw_lit("=yend "), 6) == 0) &&
1923 	     mbw_prefix(mystrcasestr)(line + 8, mbw_lit("size=")) ) {
1924     /* yEnc */
1925     return 1;
1926   }
1927   return 0;
1928 }
1929 
1930 /* return true if line should be shown, false otherwise */
1931 static
mbw_prefix(armor_filter)1932 bool_t mbw_prefix(armor_filter)(const mbw_t *line) {
1933   if( (mbw_prefix(is_b64line)(line) == 1) ||
1934       (mbw_prefix(is_uuline)(line) == 1) ||
1935       (mbw_prefix(is_yencline)(line) == 1) ) {
1936     return 0;
1937   }
1938   return 1;
1939 }
1940 
1941 /*
1942  * this code generates extract_mime_types() and w_extract_mime_types()
1943  */
1944 static
mbw_prefix(extract_mime_types)1945 void mbw_prefix(extract_mime_types)(mbw_t *line, MIME_Struct *ms) {
1946   int i;
1947   if( !mbw_strncasecmp(line, mbw_lit("Content-Type:"), 13) ) {
1948     line += 13;
1949     for(i = 0; i < num_mime_media; i++) {
1950       if( mbw_prefix(mystrcasestr)(line,
1951 				   mbw_prefix(mime_media)[i].type_subtype) ) {
1952 	ms->type = mbw_prefix(mime_media)[i].medium;
1953 	return;
1954       }
1955     }
1956 
1957     ms->type = ctOTHER;
1958 
1959   } else if( !mbw_strncasecmp(line, mbw_lit("Content-Transfer-Encoding:"),
1960 			      26) ) {
1961     line += 26;
1962     if( mbw_prefix(mystrcasestr)(line, mbw_lit("base64")) ) {
1963       ms->encoding = ceB64;
1964     } else if( mbw_prefix(mystrcasestr)(line, mbw_lit("quoted-printable")) ) {
1965       ms->encoding = ceQP;
1966     } else if( mbw_prefix(mystrcasestr)(line, mbw_lit("binary")) ) {
1967       ms->encoding = ceBIN;
1968     } else if( mbw_prefix(mystrcasestr)(line, mbw_lit("7bit")) ) {
1969       ms->encoding = ceSEVEN;
1970     } else {
1971       ms->encoding = ceID;
1972     }
1973 
1974   }
1975 }
1976 
1977 /* scans the line for the character strip_header_char, and truncates
1978  * from that point on, and switches strip_header_char to the special value 1.
1979  * If special value 1, truncates line from second char onwards.
1980  * If special value 0, does nothing.
1981  */
1982 static __inline__
mbw_prefix(strip_from_char)1983 void mbw_prefix(strip_from_char)(MBOX_State *mbox, mbw_t *q) {
1984   if( mbox->mbw_prefix(strip_header_char) == mbw_lit('\x01') ) {
1985     if( *q++ ) {
1986       *q = mbw_lit('\0');
1987     }
1988   } else if( mbox->mbw_prefix(strip_header_char) ) {
1989     while(*q++) {
1990       if( *q == mbox->mbw_prefix(strip_header_char) ) {
1991 	*q++ = mbw_lit('\n');
1992 	*q = mbw_lit('\0');
1993 	mbox->mbw_prefix(strip_header_char) = mbw_lit('\x01');
1994 	break;
1995       }
1996     }
1997   }
1998 }
1999 
2000 static
mbw_prefix(identify_header)2001 Mheaderid mbw_prefix(identify_header)(mbw_t *line) {
2002 #define HDRID(s,l,h) !mbw_strncasecmp(line, s,l) ? h
2003   if( mbw_isspace(*line) ) {
2004     return hidCONTINUATION;
2005   }
2006   switch(mbw_tolower(line[0])) {
2007   case mbw_lit('b'):
2008     return
2009       HDRID(mbw_lit("BCC:"),4,hidBCC) :
2010       hidUNDEF;
2011   case mbw_lit('c'):
2012     return
2013       HDRID(mbw_lit("Content-"),8,hidCONTENT_) :
2014       HDRID(mbw_lit("CC:"),3,hidCC) :
2015       HDRID(mbw_lit("Categor"),7,hidCATEGORY) :
2016       HDRID(mbw_lit("Comments:"),9,hidCOMMENTS) :
2017       hidUNDEF;
2018   case mbw_lit('f'):
2019     return
2020       HDRID(mbw_lit("From:"),5,hidFROM) :
2021       hidUNDEF;
2022   case mbw_lit('i'):
2023     return
2024       HDRID(mbw_lit("In-Reply-To:"),12,hidIN_REPLY_TO) :
2025       HDRID(mbw_lit("Importance:"),11,hidIMPORTANCE) :
2026       hidUNDEF;
2027   case mbw_lit('k'):
2028     return
2029       HDRID(mbw_lit("Keywords:"),9,hidKEYWORDS) :
2030       hidUNDEF;
2031   case mbw_lit('l'):
2032     return
2033       HDRID(mbw_lit("List-"),5,hidLIST_) :
2034       hidUNDEF;
2035   case mbw_lit('m'):
2036     return
2037       HDRID(mbw_lit("Message-ID:"),11,hidMESSAGE_ID) :
2038       HDRID(mbw_lit("MIME-Version:"),13,hidMIME_VERSION) :
2039       hidUNDEF;
2040   case mbw_lit('n'):
2041     return
2042       HDRID(mbw_lit("Notes:"),6,hidNOTE) :
2043       hidUNDEF;
2044   case mbw_lit('o'):
2045     return
2046       HDRID(mbw_lit("Original-"),8,hidORIGINAL_) :
2047       hidUNDEF;
2048   case mbw_lit('p'):
2049     return
2050       HDRID(mbw_lit("Priority:"),9,hidPRIORITY) :
2051       hidUNDEF;
2052   case mbw_lit('r'):
2053     return
2054       HDRID(mbw_lit("Received:"),9,hidRECEIVED) :
2055       HDRID(mbw_lit("Return-Path:"),12,hidRETURN_PATH) :
2056       HDRID(mbw_lit("References:"),11,hidREFERENCES) :
2057       HDRID(mbw_lit("Return-Receipt-To:"),18,hidRETURN_RECEIPT_TO) :
2058       HDRID(mbw_lit("Reply-To:"),9,hidREPLY_TO) :
2059       HDRID(mbw_lit("Resent-"),7,hidRESENT_) :
2060       hidUNDEF;
2061   case mbw_lit('s'):
2062     return
2063       HDRID(mbw_lit("Subject:"),8,hidSUBJECT) :
2064       HDRID(mbw_lit("Sent:"),5,hidSENT) :
2065       HDRID(mbw_lit("Sender:"),7,hidSENDER) :
2066       hidUNDEF;
2067   case mbw_lit('t'):
2068     return
2069       HDRID(mbw_lit("To:"),3,hidTO) :
2070       HDRID(mbw_lit("Thread-"),7,hidTHREAD_) :
2071       hidUNDEF;
2072   case mbw_lit('x'):
2073     return
2074       HDRID(mbw_lit("X-MS"),4,hidX_MS) :
2075       HDRID(mbw_lit("X-"),2,hidX_) :
2076       hidUNDEF;
2077   case mbw_lit('u'):
2078     return
2079       HDRID(mbw_lit("User-Agent:"),11,hidUSER_AGENT) :
2080       hidUNDEF;
2081   }
2082   return hidUNDEF;
2083 }
2084 
2085 
2086 static
mbw_prefix(scan_header_type)2087 HEADER_Type mbw_prefix(scan_header_type)(MBOX_State *mbox, mbw_t *line) {
2088 
2089 #define STRIP(q) {while(*q++) { if( *q == mbw_lit(';') ) { *q++ = mbw_lit('\n'); *q = mbw_lit('\0'); break; }}}
2090 #define HDRIDCHK(x,y) ((mbox->hid == x) && (mbox->hstate = y))
2091   Mheaderid hid = mbw_prefix(identify_header)(line);
2092 
2093   if( hid == hidCONTINUATION ) {
2094     /* we don't update mbox->hid */
2095     mbw_prefix(strip_from_char)(mbox, line);
2096     return htCONT;
2097   }
2098 
2099   mbox->hid = hid;
2100   if( HDRIDCHK(hidFROM,mhsFROM) ||
2101       HDRIDCHK(hidTO,mhsTO) ||
2102       HDRIDCHK(hidMESSAGE_ID,mhsUNDEF) ||
2103       HDRIDCHK(hidIN_REPLY_TO,mhsUNDEF) ||
2104       HDRIDCHK(hidSUBJECT,mhsSUBJECT) ) {
2105     mbox->mbw_prefix(strip_header_char) = mbw_lit('\0');
2106     mbw_prefix(strip_from_char)(mbox, line);
2107     return htSTANDARD;
2108   } else if( HDRIDCHK(hidRETURN_PATH,mhsTRACE) ||
2109 	     HDRIDCHK(hidRECEIVED,mhsTRACE) ) {
2110     mbox->mbw_prefix(strip_header_char) = mbw_lit(';');
2111     mbw_prefix(strip_from_char)(mbox, line);
2112     return htTRACE;
2113   } else if( (mbox->hid == hidCONTENT_) &&
2114 	     mbw_strchr(line + 8, mbw_lit(':')) ) {
2115     mbox->hstate = mhsMIME;
2116     mbox->mbw_prefix(strip_header_char) = mbw_lit('\0');
2117     return htMIME;
2118   } else if( HDRIDCHK(hidSENDER,mhsUNDEF) ||
2119 	     HDRIDCHK(hidREPLY_TO,mhsUNDEF) ||
2120 	     HDRIDCHK(hidBCC,mhsUNDEF) ||
2121 	     HDRIDCHK(hidCC,mhsUNDEF) ||
2122 	     HDRIDCHK(hidREFERENCES,mhsUNDEF) ) {
2123     mbox->mbw_prefix(strip_header_char) = mbw_lit('\0');
2124     mbw_prefix(strip_from_char)(mbox, line);
2125     return htEXTENDED;
2126   } else {
2127     /* if the line starts with a word missing a :, then
2128        it could be a malformed continuation line */
2129     while( *line && !mbw_isspace(*line) && (*line != mbw_lit(':')) ) { line++; }
2130     if( *line == mbw_lit(':') ) {
2131       mbox->hstate = mhsUNDEF;
2132       mbox->mbw_prefix(strip_header_char) = mbw_lit('\0');
2133       return htUNDEF;
2134     } else {
2135       return htCONT;
2136     }
2137   }
2138 }
2139 
2140 /* static */
2141 /* HEADER_Type mbw_prefix(scan_header_type)(MBOX_State *mbox, mbw_t *line) { */
2142 
2143 /* #define STRIP(q) {while(*q++) { if( *q == mbw_lit(';') ) { *q++ = mbw_lit('\n'); *q = mbw_lit('\0'); break; }}} */
2144 
2145 /* #define HDRCHK(s,l,h) (!mbw_strncasecmp(line, s,l) && (mbox->hstate = h)) */
2146 
2147 /*   mbox->mm = mbw_prefix(identify_header)(line); */
2148 
2149 /*   if( mbw_isspace(*line) ) { */
2150 /*     mbw_prefix(strip_from_char)(mbox, line); */
2151 /*     return htCONT; */
2152 /*   } else if( HDRCHK(mbw_lit("From:"),5,mhsFROM) ||  */
2153 /* 	     HDRCHK(mbw_lit("To:"),3,mhsTO) || */
2154 /* 	     HDRCHK(mbw_lit("Message-ID:"),11,mhsUNDEF) || */
2155 /* 	     HDRCHK(mbw_lit("In-Reply-To:"),12,mhsUNDEF) || */
2156 /* 	     HDRCHK(mbw_lit("Subject:"),8,mhsSUBJECT) ) { */
2157 /*     mbox->mbw_prefix(strip_header_char) = mbw_lit('\0'); */
2158 /*     mbw_prefix(strip_from_char)(mbox, line); */
2159 /*     return htSTANDARD; */
2160 /*   } else if( HDRCHK(mbw_lit("Return-Path:"),12,mhsTRACE) ||  */
2161 /* 	     HDRCHK(mbw_lit("Received:"),9,mhsTRACE) ) { */
2162 /*     mbox->mbw_prefix(strip_header_char) = mbw_lit(';'); */
2163 /*     mbw_prefix(strip_from_char)(mbox, line); */
2164 /*     return htTRACE; */
2165 /*   } else if( !mbw_strncasecmp(line, mbw_lit("Content-"),8) && */
2166 /* 	     mbw_strchr(line + 8, mbw_lit(':')) ) { */
2167 /*     mbox->hstate = mhsMIME; */
2168 /*     mbox->mbw_prefix(strip_header_char) = mbw_lit('\0'); */
2169 /*     return htMIME; */
2170 /*   } else if( HDRCHK(mbw_lit("Sender:"),7,mhsUNDEF) || */
2171 /* 	     HDRCHK(mbw_lit("Reply-To:"),9,mhsUNDEF) || */
2172 /* 	     HDRCHK(mbw_lit("Bcc:"),4,mhsUNDEF) || */
2173 /* 	     HDRCHK(mbw_lit("Cc:"),3,mhsUNDEF) || */
2174 /* 	     HDRCHK(mbw_lit("References:"),11,mhsUNDEF) ) { */
2175 /*     mbox->mbw_prefix(strip_header_char) = mbw_lit('\0'); */
2176 /*     mbw_prefix(strip_from_char)(mbox, line); */
2177 /*     return htEXTENDED; */
2178 /*   } else { */
2179 /*     /\* if the line starts with a word missing a :, then  */
2180 /*        it could be a malformed continuation line *\/ */
2181 /*     while( *line && !mbw_isspace(*line) && (*line != mbw_lit(':')) ) { line++; } */
2182 /*     if( *line == mbw_lit(':') ) { */
2183 /*       mbox->hstate = mhsUNDEF; */
2184 /*       mbox->mbw_prefix(strip_header_char) = mbw_lit('\0'); */
2185 /*       return htUNDEF; */
2186 /*     } else { */
2187 /*       return htCONT; */
2188 /*     } */
2189 /*   } */
2190 /* } */
2191 
2192 static
mbw_prefix(extract_mime_label)2193 int mbw_prefix(extract_mime_label)(mbw_t *line) {
2194   mbw_t *q;
2195   if( m_options & (1<<M_OPTION_HEADERS) ) {
2196     if( !mbw_strncasecmp(line, mbw_lit("Content-"),8) ) {
2197       line += 8;
2198       if( !mbw_strncasecmp(line, mbw_lit("Type:"),5) ) {
2199 	/* we want both the mime type and the file name */
2200 	q = (mbw_t *)mbw_prefix(mystrcasestr)(line, mbw_lit("name="));
2201 	if( q ) { STRIP(q); } else { STRIP(line); }
2202 	return 1;
2203       } else if( !mbw_strncasecmp(line, mbw_lit("Disposition:"),12) ) {
2204 	STRIP(line);
2205 	return 1;
2206       } else if( !mbw_strncasecmp(line, mbw_lit("ID:"),3) ||
2207 		 !mbw_strncasecmp(line, mbw_lit("Description:"),12) ) {
2208 	/* note: we only get first line of description */
2209 	return 1;
2210       }
2211     } else if( mbw_isspace(*line) ) {
2212       q = (mbw_t *)mbw_prefix(mystrcasestr)(line, mbw_lit("name="));
2213       if( q ) {
2214 	STRIP(q);
2215 	return 1;
2216       }
2217     }
2218   }
2219   return 0;
2220 }
2221 
/*
 * this code generates mbox_line_filter() and w_mbox_line_filter()
 *
 * Per line driver of the mbox parser. Each call goes through three steps:
 *   STEP 1 moves mbox->state between msUNDEF, msHEADER, msBODY and
 *          msATTACH, based on the current state and on what the line
 *          looks like (empty, starts with a double dash, ...).
 *   STEP 2 cleans up the line according to the (possibly new) state
 *          and substate, and decides if it is worth processing.
 *   STEP 3 applies the strings filter if needed, flushes the decoding
 *          caches on an empty line, and selects the xml character filter.
 *
 * mbox: the parser state, both read and updated here.
 * line: the current line; it can be modified in place by the filters
 *       called here (eg extract_mime_label).
 * xml:  only passed on to reset_xml_character_filter().
 *
 * Also reads the global m_options.
 *
 * returns true if the line should be processed further
 * (this is always the case for an empty line)
 * depends on global mbox state
 */
bool_t mbw_prefix(mbox_line_filter)(MBOX_State *mbox, mbw_t *line,
				    XML_State *xml) {
  bool_t line_empty = 0;
  bool_t doubledash = 0;
  bool_t process_line = 0; /* by default we skip the line */
  /* xmlUNDEF means the xml filter is chosen from the state in STEP 3 */
  XML_Reset force_filter = xmlUNDEF;
  /* if set, the line goes through strings1_filter in STEP 3 */
  bool_t octet_stream = 0;

  line_empty = MBW_EMPTYLINE(line);
  doubledash = MBW_DOUBLEDASH(line);

  /* STEP 1: first perform state transitions */
  switch(mbox->state) {
  case msUNDEF:
    /* wait until we see the first nonempty line */
    if( !line_empty ) {
      mbox->state = msHEADER;
      mbox->substate = msuUNDEF;
      mbox->hid = hidUNDEF;
      mbox->hstate = mhsUNDEF;
      mbox->armor = maUNDEF;
      mbox->skip_until_boundary = 0;
    }
    break;
  case msHEADER:
    /* an empty line ends the header */
    if( line_empty ) {
      mbox->state = msBODY;
      mbox->substate = msuUNDEF;
      mbox->hid = hidUNDEF;
      mbox->hstate = mhsUNDEF;
      mbox->armor = maUNDEF;
      /* don't reset skip_until_boundary */
      /* lets the corruption_check branch of msBODY below look at up to
	 5 lines for a stray MIME header */
      mbox->corruption_check = 5;
    }
    break;
  case msBODY:
    /* the tests below are tried in order, only the first hit applies */
    if( doubledash && mbw_prefix(check_mime_boundary)(mbox, line) ) {
      mbox->state = msATTACH;
      mbox->substate = msuUNDEF;
      mbox->hid = hidUNDEF;
      mbox->hstate = mhsUNDEF;
      mbox->armor = maUNDEF;
      /* was_end has just been set by check_mime_boundary() */
      mbox->skip_until_boundary = mbox->boundary.was_end;
      mbox->corruption_check = 0;
    } else if( doubledash &&
/* 	       mbw_prefix(outlook_message_announce)(line) || */
	       mbw_prefix(check_old_style_digest)(line) ) {
      mbox->state = msATTACH;
      mbox->substate = msuTRACK;
      mbox->hid = hidUNDEF;
      mbox->hstate = mhsUNDEF;
      mbox->armor = maUNDEF;
      mbox->skip_until_boundary = mbox->boundary.was_end;
      mbox->corruption_check = 0;
      /* since there are no mime headers, we impose a content type */
      /* note: we only try to detect digests because we
	 want to remove the Date: headers */
      mbox->body.type = ctMESSAGE_RFC822;
    } else if( doubledash &&
	       (mbox->substate == msuARMOR) &&
	       mbw_prefix(check_armor_end)(line) ) {
      /* end of an armored section, the state stays msBODY */
      mbox->substate = msuTRACK;
      mbox->hid = hidUNDEF;
      mbox->hstate = mhsUNDEF;
      mbox->armor = maUNDEF;
      mbox->skip_until_boundary = mbox->boundary.was_end;
      mbox->corruption_check = 0;
    } else if( doubledash &&
	       (mbox->substate != msuARMOR) &&
	       mbw_prefix(check_armor_start)(line) ) {
      /* start of an armored section, the state stays msBODY */
      mbox->substate = msuARMOR;
      mbox->armor = maENABLED;
    } else if( mbox->prev_line_empty ) {
      if( doubledash && !(m_options & (1<<M_OPTION_PLAIN)) ) {
	/* could be a corrupted boundary - note
	 * previous empty line is not required, but often true
	 */
	mbox->corruption_check = 5;
      } else if( !mbw_strncasecmp(line, mbw_lit("Content-"), 8) ) {
	mbw_prefix(mhe_line_filter)(line);
	switch(mbw_prefix(scan_header_type)(mbox, line)) {
	case htMIME:
	  mbox->state = msATTACH;
	  mbox->substate = msuMIME;
	  mbox->hstate = mhsUNDEF;
	  mbox->armor = maUNDEF;
	  mbox->skip_until_boundary = 0;
	  mbox->corruption_check = 0;
	  break;
	default:
	  /* do nothing - so far so good */
	  break;
	}
      } else if( !mbw_strncmp(line, mbw_lit("From "), 5) ) {
	/* a "From " line after an empty line starts a new message */
	mbox->state = msHEADER;
	mbox->hid = hidUNDEF;
	mbox->substate = msuUNDEF;
	mbox->hstate = mhsUNDEF;
	mbox->armor = maUNDEF;
	mbox->skip_until_boundary = 0;
	mbox->corruption_check = 0;
      }
    } else if( mbox->corruption_check > 0 ) {
      mbox->corruption_check--;
      /* we filter out mail header extension codings - shouldn't do any harm */
      mbw_prefix(mhe_line_filter)(line);
      switch(mbw_prefix(scan_header_type)(mbox, line)) {
      case htMIME:
	mbox->state = msATTACH;
	mbox->substate = msuMIME;
	mbox->hstate = mhsUNDEF;
	mbox->armor = maUNDEF;
	mbox->skip_until_boundary = 0;
	mbox->corruption_check = 0;
	break;
      default:
	/* do nothing - so far so good */
	break;
      }
    }
    break;
  case msATTACH:
    /* an empty line ends the attachment header; where we go next
       depends on the content type found in that header */
    if( line_empty ) {
      switch(mbox->body.type) {
      case ctMESSAGE_RFC822:
	/* our mime parse isn't recursive - instead we start a
	   new message and associate with it all later attachments */
	mbox->state = msHEADER;
	break;
      case ctAPPLICATION_MSWORD:
	mbox->state = msBODY;
	/* override encoding if undefined */
	if( mbox->body.encoding == ceUNDEF ) {
	  mbox->body.encoding = ceB64;
	}
	break;
      default:
	mbox->state = msBODY;
	break;
      }
      mbox->substate = msuUNDEF;
      mbox->hid = hidUNDEF;
      mbox->hstate = mhsUNDEF;
      mbox->armor = maUNDEF;
      mbox->skip_until_boundary = 0;
      mbox->corruption_check = 0;
    }
    break;
  }

  mbox->prev_line_empty = line_empty; /* for next time */

  /* STEP 2: now clean up and prepare the line according to current state
   * and substate.
   * After cleanup, the variable process_line indicates if the line
   * should be ignored.
   * The substate can evolve while the current state is unchanging.
   */
  switch(mbox->state) {
  case msUNDEF:
    /* line is not processed */
    break;
  case msHEADER:
    switch(mbox->substate) {
    case msuUNDEF:
      /* substate msuUNDEF marks the first line seen in this state */
      /* flush caches */
      /* note: because of the ||, the qp cache is only flushed if
	 flushing the b64 cache returned false */
      process_line =
	mbw_prefix(flush_cache)(&(mbox->mbw_prefix(b64_dc)), line, 1) ||
	mbw_prefix(flush_cache)(&(mbox->mbw_prefix(qp_dc)), line, 1);
      if( process_line ) {
	/* we still remember previous type/encoding, decide if we need filter */
	force_filter = select_xml_defaults(&mbox->body);
	octet_stream = (mbox->body.type == ctOCTET_STREAM) ||
	  (mbox->body.type == ctAPPLICATION_MSWORD);
      }
      /* there are no default mime types for headers */
      mbox->header.type = mbox->body.type = ctUNDEF;
      mbox->header.encoding = mbox->body.encoding = ceUNDEF;
      /* switch to normal state next time */
      mbox->substate = msuOTHER;
      mbox->corruption_check = 0;
      mbox->skip_header = 0;
      mbox->plainstate = psPLAIN;
      /* don't break, as the current line could contain
	 interesting headers already */
      /* fall through */

    default:
      /* switch substate if necessary */
      switch(mbw_prefix(scan_header_type)(mbox, line)) {
      case htSTANDARD:
	if( m_options & (1<<M_OPTION_NOHEADERS) ) {
	  mbox->substate = (mbox->hstate == mhsSUBJECT) ? msuTRACK : msuOTHER;
	} else {
	  mbox->substate = msuTRACK;
	}
	break;
      case htEXTENDED:
	mbox->substate = (m_options & (1<<M_OPTION_HEADERS)) ? msuTRACK : msuOTHER;
	break;
      case htTRACE:
	mbox->substate = (m_options & (1<<M_OPTION_THEADERS)) ? msuTRACK : msuOTHER;
	break;
      case htMIME:
	mbox->substate = msuMIME;
	break;
      case htCONT:
	/* nothing */
	/* a continuation line keeps the substate of the previous line */
	break;
      case htUNDEF:
	mbox->substate = msuOTHER;
	break;
      }

      /* process substate */

      switch(mbox->substate) {
      case msuTRACK:
	process_line = mbw_prefix(mhe_line_filter)(line);
	break;
      case msuMIME:
	mbw_prefix(mhe_line_filter)(line);
	mbw_prefix(extract_mime_types)(line, &mbox->header);
	/* once set, skip_until_boundary stays set here */
	mbox->skip_until_boundary =
	  mbw_prefix(extract_mime_boundary)(mbox, line) || mbox->skip_until_boundary;
	/* this comes last, modifies line */
	process_line = mbw_prefix(extract_mime_label)(line);
	break;
      case msuUNDEF:
      case msuOTHER:
	mbox->hstate = mhsXHEADER;
	/* fall through */
      case msuARMOR:
	process_line = mbw_prefix(extract_header_label)(mbox, line);
	break;
      }
    }
    break;
  case msBODY:
    switch(mbox->substate) {
    case msuUNDEF:
      /* substate msuUNDEF marks the first line seen in this state */
      /* flush caches */
      process_line =
	mbw_prefix(flush_cache)(&(mbox->mbw_prefix(b64_dc)), line, 1) ||
	mbw_prefix(flush_cache)(&(mbox->mbw_prefix(qp_dc)), line, 1);
      if( process_line ) {
	/* we still remember previous type/encoding, decide if we need filter */
	force_filter = select_xml_defaults(&mbox->body);
	octet_stream = (mbox->body.type == ctOCTET_STREAM) ||
	  (mbox->body.type == ctAPPLICATION_MSWORD);
      }
      /* bodies by default inherit the header mime types */
      if( mbox->body.type == ctUNDEF )
	{ mbox->body.type = mbox->header.type; }
      if( mbox->body.encoding == ceUNDEF )
	{ mbox->body.encoding = mbox->header.encoding; }

      /* switch to normal state next time */
      mbox->substate = msuTRACK;
      mbox->plainstate = psPLAIN;
      break;
    case msuARMOR:
      switch(mbox->armor) {
      case maUNDEF:
	process_line = 1;
	break;
      case maENABLED:
	process_line = mbw_prefix(armor_filter)(line);
	break;
      }

      break;
    default:
      if( mbox->skip_until_boundary ) {
	process_line = 0;
      } else {
	/* the content type picks the group of cases, the transfer
	   encoding picks the decoder within the group */
	switch(mbox->body.type) {
	case ctOCTET_STREAM:
	case ctAPPLICATION_MSWORD:
	  if( !(m_options & (1<<M_OPTION_ATTACHMENTS)) ) {
	    process_line = 0;
	    break;
	  } else {
	    /* otherwise fall through */
	    octet_stream = 1;
	  }
	  /* fall through */
	case ctUNDEF:  /* the header didn't say, so we must assume text */
	case ctMESSAGE_RFC822:
	case ctTEXT_PLAIN:
	  switch(mbox->body.encoding) {
	  case ceBIN:
	    process_line = 1;
	    break;
	  case ceUNDEF:
	  case ceSEVEN:
	  case ceID:
	    process_line = ((m_options & (1<<M_OPTION_PLAIN)) ?
			    1 : mbw_prefix(plain_text_filter)(mbox, line));
	    break;
	  case ceQP:
 	    process_line =
	      mbw_prefix(qp_line_filter)(&(mbox->mbw_prefix(qp_dc)), line);
	    break;
	  case ceB64:
	    process_line =
	      mbw_prefix(b64_line_filter)(&(mbox->mbw_prefix(b64_dc)), line);
	    break;
	  }
	  break;
	case ctTEXT_RICH:
	case ctTEXT_HTML:
	case ctTEXT_XML:
	case ctTEXT_SGML:
	case ctTEXT_UNKNOWN:
	  /* like the group above, except that plain_text_filter is
	     never applied */
	  switch(mbox->body.encoding) {
	  case ceBIN:
	  case ceUNDEF:
	  case ceSEVEN:
	  case ceID:
	    process_line = 1;
	    break;
	  case ceQP:
 	    process_line =
	      mbw_prefix(qp_line_filter)(&(mbox->mbw_prefix(qp_dc)), line);
	    break;
	  case ceB64:
	    process_line =
	      mbw_prefix(b64_line_filter)(&(mbox->mbw_prefix(b64_dc)), line);
	    break;
	  }
	  break;
	case ctIMAGE:
	case ctAUDIO:
	case ctVIDEO:
	case ctMODEL:
	case ctOTHER:
	  process_line = 0;
	  break;
	}
      }
      break;
    }
    break;
  case msATTACH:
    switch(mbox->substate) {
    case msuUNDEF:
      /* substate msuUNDEF marks the first line seen in this state */
      /* flush caches */
      process_line =
	mbw_prefix(flush_cache)(&(mbox->mbw_prefix(b64_dc)), line, 1) ||
	mbw_prefix(flush_cache)(&(mbox->mbw_prefix(qp_dc)), line, 1);
      if( process_line ) {
	/* we still remember previous type/encoding, decide if we need filter */
	force_filter = select_xml_defaults(&mbox->body);
	octet_stream = (mbox->body.type == ctOCTET_STREAM) ||
	  (mbox->body.type == ctAPPLICATION_MSWORD);
      }
      /* attachments by default inherit the header mime types */
      mbox->body.type = mbox->header.type;
      mbox->body.encoding = mbox->header.encoding;
      /* switch to normal state next time */
      /* this has a nice side-effect: if the first line is a htCONT,
	 then it gets displayed and that's the right thing to do,
	 because if the first line is a htCONT, then the ATTACH header
	 is not a header at all, ie the paragraph was misidentified. */
      mbox->substate = msuTRACK;
      mbox->plainstate = psPLAIN;
      break;
    default:
      /* switch substate if necessary */
      switch(mbw_prefix(scan_header_type)(mbox, line)) {
      case htSTANDARD:
	mbox->substate = msuTRACK;
	break;
      case htEXTENDED:
	mbox->substate = (m_options & (1<<M_OPTION_HEADERS)) ? msuTRACK : msuOTHER;
	break;
      case htTRACE:
	mbox->substate = (m_options & (1<<M_OPTION_THEADERS)) ? msuTRACK : msuOTHER;
	break;
      case htMIME:
	mbox->substate = msuMIME;
	break;
      case htCONT:
	/* nothing */
	break;
      case htUNDEF:
	mbox->substate = msuOTHER;
	break;
      }
      /* process substate */
      switch(mbox->substate) {
      case msuTRACK:
	process_line = 1;
	break;
      case msuUNDEF:
      case msuOTHER:
      case msuARMOR:
	process_line = 0;
	break;
      case msuMIME:
	mbw_prefix(mhe_line_filter)(line);
	/* unlike in msHEADER, the types go to mbox->body here */
	mbw_prefix(extract_mime_types)(line, &mbox->body);
	mbox->skip_until_boundary =
	  mbw_prefix(extract_mime_boundary)(mbox, line) || mbox->skip_until_boundary;
	/* this comes last, modifies line */
	process_line = mbw_prefix(extract_mime_label)(line);
	break;
      }
      break;
    }
    break;
  }

  /* STEP 3: activate filters */

  /* this replaces whatever STEP 2 decided for the line */
  if( octet_stream ) {
    process_line = mbw_prefix(strings1_filter)(line);
  }

  if( !process_line && line_empty ) {
    /* don't touch this: the end of file is artificially marked by an
       empty line */
    process_line =
      mbw_prefix(flush_cache)(&(mbox->mbw_prefix(b64_dc)), line, 1) ||
      mbw_prefix(flush_cache)(&(mbox->mbw_prefix(qp_dc)), line, 1);
  }

  if( force_filter != xmlUNDEF ) {
    /* a cache was flushed on entering a new state: use the filter that
       was selected for the previous type/encoding */
    reset_xml_character_filter(xml, force_filter);
  } else {
    if( mbox->state == msBODY ) {
      if( mbox->skip_until_boundary ) {
	process_line = 0;
      } else {
	reset_xml_character_filter(xml, select_xml_defaults(&(mbox->body)));
      }
    } else {
      reset_xml_character_filter(xml, xmlDISABLE);
    }
  }

  /* we also process empty lines, as they can be helpful for n-gram boundaries */
  return process_line || line_empty;
}
2670 
2671 
2672 
2673 /***********************************************************
2674  * HTML PARSING FUNCTIONS                                  *
2675  ***********************************************************/
2676 
2677 /*
2678  * this code generates decode_html_entity() and w_decode_html_entity()
2679  * (this is ugly, but I am _not_ building a string hash, sheesh).
2680  *
2681  * note: the conversion from unicode to multibyte depends on the current
2682  * locale, but also assumes that wchar_t *is* unicode internally. Both assumptions
2683  * can be false on weird compilers. In case the locale is incapable of doing the job,
2684  * we convert based on the hex code.
2685  *
2686  * note2: the conversion is not always faithful, even so. We try not to convert
2687  * characters which could look like control codes to the html parser later.
2688  *
2689  * note3: upon successful conversion, *qq is incremented by the
2690  * character, and *lline is incremented by the entity length - 1, so
2691  * that you still need to increment *lline by one to obtain the next
2692  * parseable input. If conversion is unsuccessful, the pointers are not
2693  * modified. Check the return value for success or failure.
2694  *
2695  * note4: this function really needs to be reworked a bit. It ought to be possible
2696  * to do the right thing even for machines with missing wchar_t.
2697  */
2698 static
mbw_prefix(decode_html_entity)2699 bool_t mbw_prefix(decode_html_entity)(mbw_t **lline, mbw_t **qq) {
2700   bool_t retval = 0;
2701   mbw_t *line = *lline;
2702   mbw_t *q = *qq;
2703 /*   printf("\nline = %p q = %p (line - q) = %d\n", line, q, line - q); */
2704 /*   printf("[[[%s]]]\n", line); */
2705 #if defined HAVE_MBRTOWC
2706 
2707   mbw_t *r = NULL;
2708 #if defined MBW_MB
2709   int s,t;
2710   mbw_t scratch[16]; /* C compiler complains  about MB_CUR_MAX */
2711 #endif
2712   wchar_t c = 0; /* this must always be wchar_t */
2713 
2714   switch(line[1]) {
2715   case mbw_lit('#'):
2716     if( (line[2] == mbw_lit('x')) || (line[2] == mbw_lit('X')) ) {
2717 #if defined MBW_MB || (defined MBW_WIDE && defined HAVE_WCSTOL)
2718       c = (wchar_t)mbw_strtol(line + 3, &r, 16);
2719 #else
2720       /* can't convert, but skip the payload anyway */
2721       for(r = line + 3; isxdigit(*r); r++);
2722 #endif
2723     } else {
2724 #if defined MBW_MB || (defined MBW_WIDE && defined HAVE_WCSTOL)
2725       c = (wchar_t)mbw_strtol(line + 2, &r, 10);
2726 #else
2727       /* can't convert, but skip the payload anyway */
2728       for(r = line + 2; isdigit(*r); r++);
2729 #endif
2730     }
2731     break;
2732 
2733 #define ENTITY(x,y,z) if( !mbw_strncmp((line + 3), (x + 2), (y - 2)) ) \
2734                            { c = (z); r = line + (y) + 1; }
2735 
2736   case mbw_lit('a'):
2737     switch(line[2]) {
2738     case mbw_lit('a'):
2739       ENTITY(mbw_lit("aacute"),6,0xe1);
2740       break;
2741     case mbw_lit('c'):
2742       ENTITY(mbw_lit("acute"),5,0xb4) else ENTITY(mbw_lit("acirc"),4,0xe2);
2743       break;
2744     case mbw_lit('e'):
2745       ENTITY(mbw_lit("aelig"),5,0xe6);
2746       break;
2747     case mbw_lit('g'):
2748       ENTITY(mbw_lit("agrave"),6,0xe0);
2749       break;
2750     case mbw_lit('l'):
2751       ENTITY(mbw_lit("alpha"),5,0x03b1) else ENTITY(mbw_lit("alefsym"),7,0x2135);
2752       break;
2753     case mbw_lit('m'):
2754       ENTITY(mbw_lit("amp"),3,0x26);
2755       break;
2756     case mbw_lit('n'):
2757       ENTITY(mbw_lit("ang"),3,0x2220) else ENTITY(mbw_lit("and"),3,0x2227);
2758       break;
2759     case mbw_lit('r'):
2760       ENTITY(mbw_lit("aring"),5,0xe5);
2761       break;
2762     case mbw_lit('s'):
2763       ENTITY(mbw_lit("asymp"),5,0x2248);
2764       break;
2765     case mbw_lit('t'):
2766       ENTITY(mbw_lit("atilde"),6,0xe3);
2767       break;
2768     case mbw_lit('u'):
2769       ENTITY(mbw_lit("auml"),4,0xe4);
2770       break;
2771     }
2772     break;
2773 
2774   case mbw_lit('A'):
2775     switch(line[2]) {
2776     case mbw_lit('a'):
2777       ENTITY(mbw_lit("Aacute"),6,0xc1);
2778       break;
2779     case mbw_lit('c'):
2780       ENTITY(mbw_lit("Acirc"),5,0xc2);
2781       break;
2782     case mbw_lit('E'):
2783       ENTITY(mbw_lit("AElig"),5,0xc6);
2784       break;
2785     case mbw_lit('g'):
2786       ENTITY(mbw_lit("Agrave"),6,0xc0);
2787       break;
2788     case mbw_lit('l'):
2789       ENTITY(mbw_lit("Alpha"),5,0x0391);
2790       break;
2791     case mbw_lit('r'):
2792       ENTITY(mbw_lit("Aring"),5,0xc5);
2793       break;
2794     case mbw_lit('t'):
2795       ENTITY(mbw_lit("Atilde"),6,0xc3);
2796       break;
2797     case mbw_lit('u'):
2798       ENTITY(mbw_lit("Auml"),4,0xc4);
2799       break;
2800     }
2801     break;
2802 
2803   case mbw_lit('b'):
2804     switch(line[2]) {
2805     case mbw_lit('d'):
2806       ENTITY(mbw_lit("bdquo"),5,0x201e);
2807       break;
2808     case mbw_lit('e'):
2809       ENTITY(mbw_lit("beta"),4,0x03b2);
2810       break;
2811     case mbw_lit('u'):
2812       ENTITY(mbw_lit("bull"),4,0x2022);
2813       break;
2814     }
2815     break;
2816 
2817   case mbw_lit('B'):
2818     switch(line[2]) {
2819     case mbw_lit('e'):
2820       ENTITY(mbw_lit("Beta"),4,0x0392);
2821       break;
2822     case mbw_lit('r'):
2823       ENTITY(mbw_lit("Brvbar"),6,0xa6);
2824       break;
2825     }
2826     break;
2827 
2828   case mbw_lit('c'):
2829     switch(line[2]) {
2830     case mbw_lit('a'):
2831       ENTITY(mbw_lit("cap"),3,0x2229);
2832       break;
2833     case mbw_lit('c'):
2834       ENTITY(mbw_lit("ccedil"),6,0xe7);
2835       break;
2836     case mbw_lit('e'):
2837       ENTITY(mbw_lit("cent"),4,0xa2) else ENTITY(mbw_lit("cedil"),5,0xb8);
2838       break;
2839     case mbw_lit('h'):
2840       ENTITY(mbw_lit("chi"),3,0x03c7);
2841       break;
2842     case mbw_lit('i'):
2843       ENTITY(mbw_lit("circ"),4,0x02c6);
2844       break;
2845     case mbw_lit('l'):
2846       ENTITY(mbw_lit("clubs"),5,0x2663);
2847       break;
2848     case mbw_lit('o'):
2849       ENTITY(mbw_lit("copy"),4,0xa9) else ENTITY(mbw_lit("cong"),4,0x2245);
2850       break;
2851     case mbw_lit('r'):
2852       ENTITY(mbw_lit("crarr"),5,0x21b5);
2853       break;
2854     case mbw_lit('u'):
2855       ENTITY(mbw_lit("curren"),6,0xa4) else ENTITY(mbw_lit("cup"),3,0x222a);
2856       break;
2857     }
2858     break;
2859 
2860   case mbw_lit('C'):
2861     switch(line[2]) {
2862     case mbw_lit('c'):
2863       ENTITY(mbw_lit("Ccedil"),6,0xc7);
2864       break;
2865     case mbw_lit('h'):
2866       ENTITY(mbw_lit("Chi"),3,0x03a7);
2867       break;
2868     }
2869     break;
2870 
2871   case mbw_lit('d'):
2872     switch(line[2]) {
2873     case mbw_lit('a'):
2874       ENTITY(mbw_lit("darr"),4,0x2193) else ENTITY(mbw_lit("dagger"),6,0x2020);
2875       break;
2876     case mbw_lit('A'):
2877       ENTITY(mbw_lit("dArr"),4,0x21d3);
2878       break;
2879     case mbw_lit('e'):
2880       ENTITY(mbw_lit("delta"),5,0x03b4);
2881       break;
2882     case mbw_lit('i'):
2883       ENTITY(mbw_lit("divide"),6,0xf7) else ENTITY(mbw_lit("diams"),5,0x2666);
2884       break;
2885     }
2886     break;
2887 
2888   case mbw_lit('D'):
2889     switch(line[2]) {
2890     case mbw_lit('a'):
2891       ENTITY(mbw_lit("Dagger"),6,0x2021);
2892       break;
2893     case mbw_lit('e'):
2894       ENTITY(mbw_lit("Deg"),3,0xb0) else ENTITY(mbw_lit("Delta"),5,0x0394);
2895       break;
2896     }
2897     break;
2898 
2899   case mbw_lit('e'):
2900     switch(line[2]) {
2901     case mbw_lit('a'):
2902       ENTITY(mbw_lit("eacute"),6,0xe9);
2903       break;
2904     case mbw_lit('c'):
2905       ENTITY(mbw_lit("ecirc"),5,0xea);
2906       break;
2907     case mbw_lit('g'):
2908       ENTITY(mbw_lit("egrave"),6,0xe8);
2909       break;
2910     case mbw_lit('m'):
2911       ENTITY(mbw_lit("empty"),5,0x2205) else ENTITY(mbw_lit("emsp"),4,0x2003);
2912       break;
2913     case mbw_lit('n'):
2914       ENTITY(mbw_lit("ensp"),4,0x2002);
2915       break;
2916     case mbw_lit('p'):
2917       ENTITY(mbw_lit("epsilon"),7,0x03b5);
2918       break;
2919     case mbw_lit('q'):
2920       ENTITY(mbw_lit("equiv"),5,0x2261);
2921       break;
2922     case mbw_lit('t'):
2923       ENTITY(mbw_lit("eth"),3,0xf0) else ENTITY(mbw_lit("eta"),3,0x03b7);
2924       break;
2925     case mbw_lit('u'):
2926       ENTITY(mbw_lit("euml"),4,0xeb) else ENTITY(mbw_lit("euro"),4,0x20ac);
2927       break;
2928     case mbw_lit('x'):
2929       ENTITY(mbw_lit("exist"),5,0x2203);
2930       break;
2931     }
2932     break;
2933 
2934   case mbw_lit('E'):
2935     switch(line[2]) {
2936     case mbw_lit('a'):
2937       ENTITY(mbw_lit("Eacute"),6,0xc9);
2938       break;
2939     case mbw_lit('c'):
2940       ENTITY(mbw_lit("Ecirc"),5,0xca);
2941       break;
2942     case mbw_lit('g'):
2943       ENTITY(mbw_lit("Egrave"),6,0xc8);
2944       break;
2945     case mbw_lit('p'):
2946       ENTITY(mbw_lit("Epsilon"),7,0x0395);
2947       break;
2948     case mbw_lit('T'):
2949       ENTITY(mbw_lit("ETH"),3,0xd0);
2950       break;
2951     case mbw_lit('t'):
2952       ENTITY(mbw_lit("Eta"),3,0x0397);
2953       break;
2954     case mbw_lit('u'):
2955       ENTITY(mbw_lit("Euml"),4,0xcb);
2956       break;
2957     }
2958     break;
2959 
2960   case mbw_lit('f'):
2961     switch(line[2]) {
2962     case mbw_lit('n'):
2963       ENTITY(mbw_lit("fnof"),4,0x0192);
2964       break;
2965     case mbw_lit('o'):
2966       ENTITY(mbw_lit("forall"),6,0x2200);
2967       break;
2968     case mbw_lit('r'):
2969       ENTITY(mbw_lit("frac14"),6,0xbc) else ENTITY(mbw_lit("frac12"),6,0xbd) else
2970 	ENTITY(mbw_lit("frac34"),6,0xbe) else ENTITY(mbw_lit("frasl"),5,0x2044);
2971       break;
2972     }
2973     break;
2974 
2975   case mbw_lit('F'):
2976     /* nothing */
2977     break;
2978 
2979   case mbw_lit('g'):
2980     switch(line[2]) {
2981     case mbw_lit('a'):
2982       ENTITY(mbw_lit("gamma"),5,0x3b3);
2983       break;
2984     case mbw_lit('e'):
2985       ENTITY(mbw_lit("ge"),2,0x2265);
2986       break;
2987     case mbw_lit('t'):
2988       ENTITY(mbw_lit("gt"),2,0x3e);
2989       break;
2990     }
2991     break;
2992 
2993   case mbw_lit('G'):
2994     switch(line[2]) {
2995     case mbw_lit('a'):
2996       ENTITY(mbw_lit("Gamma"),5,0x0393);
2997       break;
2998     }
2999     break;
3000 
3001   case mbw_lit('h'):
3002     switch(line[2]) {
3003     case mbw_lit('a'):
3004       ENTITY(mbw_lit("harr"),4,0x2194);
3005       break;
3006     case mbw_lit('A'):
3007       ENTITY(mbw_lit("hArr"),4,0x21d4);
3008       break;
3009     case mbw_lit('e'):
3010       ENTITY(mbw_lit("hearts"),6,0x2665) else ENTITY(mbw_lit("hellip"),6,0x2026);
3011       break;
3012     }
3013     break;
3014 
3015   case mbw_lit('H'):
3016     /* nothing */
3017     break;
3018 
3019   case mbw_lit('i'):
3020     switch(line[2]) {
3021     case mbw_lit('a'):
3022       ENTITY(mbw_lit("iacute"),6,0xed);
3023       break;
3024     case mbw_lit('c'):
3025       ENTITY(mbw_lit("icirc"),5,0xee);
3026       break;
3027     case mbw_lit('e'):
3028       ENTITY(mbw_lit("iexcl"),5,0xa1);
3029       break;
3030     case mbw_lit('g'):
3031       ENTITY(mbw_lit("igrave"),6,0xec);
3032       break;
3033     case mbw_lit('m'):
3034       ENTITY(mbw_lit("image"),5,0x2111);
3035       break;
3036     case mbw_lit('n'):
3037       ENTITY(mbw_lit("infin"),5,0x221e) else ENTITY(mbw_lit("int"),3,0x222b);
3038       break;
3039     case mbw_lit('o'):
3040       ENTITY(mbw_lit("iota"),4,0x03b9);
3041       break;
3042     case mbw_lit('q'):
3043       ENTITY(mbw_lit("iquest"),6,0xbf);
3044       break;
3045     case mbw_lit('s'):
3046       ENTITY(mbw_lit("isin"),4,0x2208);
3047       break;
3048     case mbw_lit('u'):
3049       ENTITY(mbw_lit("iuml"),4,0xef);
3050       break;
3051     }
3052     break;
3053 
3054   case mbw_lit('I'):
3055     switch(line[2]) {
3056     case mbw_lit('a'):
3057       ENTITY(mbw_lit("Iacute"),6,0xcd);
3058       break;
3059     case mbw_lit('c'):
3060       ENTITY(mbw_lit("Icirc"),5,0xce);
3061       break;
3062     case mbw_lit('g'):
3063       ENTITY(mbw_lit("Igrave"),6,0xcc);
3064       break;
3065     case mbw_lit('o'):
3066       ENTITY(mbw_lit("Iota"),4,0x0399);
3067       break;
3068     case mbw_lit('u'):
3069       ENTITY(mbw_lit("Iuml"),4,0xcf);
3070       break;
3071     }
3072     break;
3073 
3074   case mbw_lit('j'):
3075     /* nothing */
3076     break;
3077 
3078   case mbw_lit('J'):
3079     /* nothing */
3080     break;
3081 
3082   case mbw_lit('k'):
3083     switch(line[2]) {
3084     case mbw_lit('a'):
3085       ENTITY(mbw_lit("kappa"),5,0x03ba);
3086       break;
3087     }
3088     break;
3089 
3090   case mbw_lit('K'):
3091     switch(line[2]) {
3092     case mbw_lit('a'):
3093       ENTITY(mbw_lit("Kappa"),5,0x039a);
3094       break;
3095     }
3096     break;
3097 
3098   case mbw_lit('l'):
3099     switch(line[2]) {
3100     case mbw_lit('a'):
3101       ENTITY(mbw_lit("lambda"),6,0x03bb) else ENTITY(mbw_lit("lang"),4,0x2329);
3102       break;
3103     case mbw_lit('A'):
3104       ENTITY(mbw_lit("lArr"),4,0x21d0);
3105       break;
3106     case mbw_lit('c'):
3107       ENTITY(mbw_lit("lceil"),5,0x2308);
3108       break;
3109     case mbw_lit('d'):
3110       ENTITY(mbw_lit("ldquo"),5,0x201c);
3111       break;
3112     case mbw_lit('e'):
3113       ENTITY(mbw_lit("le"),2,0x2264);
3114       break;
3115     case mbw_lit('f'):
3116       ENTITY(mbw_lit("lfloor"),6,0x2309);
3117       break;
3118     case mbw_lit('o'):
3119       ENTITY(mbw_lit("lowast"),6,0x2217) else ENTITY(mbw_lit("loz"),3,0x25ca);
3120       break;
3121     case mbw_lit('r'):
3122       ENTITY(mbw_lit("lrm"),3,0x200e);
3123       break;
3124     case mbw_lit('s'):
3125       ENTITY(mbw_lit("lsquo"),5,0x2018) else ENTITY(mbw_lit("lsaquo"),6,0x2039);
3126       break;
3127     case mbw_lit('t'):
3128       ENTITY(mbw_lit("lt"),2,0x3c);
3129       break;
3130     }
3131     break;
3132 
3133   case mbw_lit('L'):
3134     switch(line[2]) {
3135     case mbw_lit('a'):
3136       ENTITY(mbw_lit("Laquo"),5,0xab) else ENTITY(mbw_lit("Lambda"),6,0x039b) else
3137 	ENTITY(mbw_lit("Larr"),4,0x2190);
3138       break;
3139     }
3140     break;
3141 
3142   case mbw_lit('m'):
3143     switch(line[2]) {
3144     case mbw_lit('d'):
3145       ENTITY(mbw_lit("mdash"),5,0x2014);
3146       break;
3147     case mbw_lit('i'):
3148       ENTITY(mbw_lit("minus"),5,0x2212);
3149       break;
3150     case mbw_lit('u'):
3151       ENTITY(mbw_lit("mu"),2,0x03bc);
3152       break;
3153     }
3154     break;
3155 
3156   case mbw_lit('M'):
3157     switch(line[2]) {
3158     case mbw_lit('a'):
3159       ENTITY(mbw_lit("Macr"),4,0xaf);
3160       break;
3161     case mbw_lit('i'):
3162       ENTITY(mbw_lit("Micro"),5,0xb5) else ENTITY(mbw_lit("Middot"),6,0xb7);
3163       break;
3164     case mbw_lit('u'):
3165       ENTITY(mbw_lit("Mu"),2,0x039c);
3166       break;
3167     }
3168     break;
3169 
3170   case mbw_lit('n'):
3171     switch(line[2]) {
3172     case mbw_lit('a'):
3173       ENTITY(mbw_lit("nabla"),5,0x2207);
3174       break;
3175     case mbw_lit('b'):
3176       ENTITY(mbw_lit("nbsp"),4,0xa0);
3177       break;
3178     case mbw_lit('d'):
3179       ENTITY(mbw_lit("ndash"),5,0x2013);
3180       break;
3181     case mbw_lit('e'):
3182       ENTITY(mbw_lit("ne"),2,0x2260);
3183       break;
3184     case mbw_lit('i'):
3185       ENTITY(mbw_lit("ni"),2,0x220b);
3186       break;
3187     case mbw_lit('o'):
3188       ENTITY(mbw_lit("not"),3,0xac) else ENTITY(mbw_lit("notin"),5,0x2209);
3189       break;
3190     case mbw_lit('s'):
3191       ENTITY(mbw_lit("nsub"),4,0x2284);
3192       break;
3193     case mbw_lit('t'):
3194       ENTITY(mbw_lit("ntilde"),6,0xf1);
3195       break;
3196     case mbw_lit('u'):
3197       ENTITY(mbw_lit("nu"),2,0x03bd);
3198       break;
3199     }
3200     break;
3201 
3202   case mbw_lit('N'):
3203     switch(line[2]) {
3204     case mbw_lit('t'):
3205       ENTITY(mbw_lit("Ntilde"),6,0xd1);
3206       break;
3207     case mbw_lit('u'):
3208       ENTITY(mbw_lit("Nu"),2,0x039d);
3209       break;
3210     }
3211     break;
3212 
3213   case mbw_lit('o'):
3214     switch(line[2]) {
3215     case mbw_lit('a'):
3216       ENTITY(mbw_lit("oacute"),6,0xf3);
3217       break;
3218     case mbw_lit('c'):
3219       ENTITY(mbw_lit("ocirc"),5,0xf4);
3220       break;
3221     case mbw_lit('e'):
3222       ENTITY(mbw_lit("oelig"),5,0x0153);
3223       break;
3224     case mbw_lit('g'):
3225       ENTITY(mbw_lit("ograve"),6,0xf2);
3226       break;
3227     case mbw_lit('l'):
3228       ENTITY(mbw_lit("oline"),5,0x203e);
3229       break;
3230     case mbw_lit('m'):
3231       ENTITY(mbw_lit("omicron"),7,0x03bf) else ENTITY(mbw_lit("omega"),5,0x03c9);
3232       break;
3233     case mbw_lit('p'):
3234       ENTITY(mbw_lit("oplus"),5,0x2295);
3235       break;
3236     case mbw_lit('r'):
3237       ENTITY(mbw_lit("ordf"),4,0xaa) else ENTITY(mbw_lit("ordm"),4,0xba) else
3238 	ENTITY(mbw_lit("or"),2,0x2228);
3239       break;
3240     case mbw_lit('s'):
3241       ENTITY(mbw_lit("oslash"),6,0xf8);
3242       break;
3243     case mbw_lit('t'):
3244       ENTITY(mbw_lit("otilde"),6,0xf5) else ENTITY(mbw_lit("otimes"),6,0x2297);
3245       break;
3246     case mbw_lit('u'):
3247       ENTITY(mbw_lit("ouml"),4,0xf6);
3248       break;
3249     }
3250     break;
3251 
3252   case mbw_lit('O'):
3253     switch(line[2]) {
3254     case mbw_lit('a'):
3255       ENTITY(mbw_lit("Oacute"),6,0xd3);
3256       break;
3257     case mbw_lit('c'):
3258       ENTITY(mbw_lit("Ocirc"),5,0xd4);
3259       break;
3260     case mbw_lit('E'):
3261       ENTITY(mbw_lit("OElig"),5,0x0152);
3262       break;
3263     case mbw_lit('m'):
3264       ENTITY(mbw_lit("Omicron"),7,0x039f) else ENTITY(mbw_lit("Omega"),5,0x03a9);
3265       break;
3266     case mbw_lit('g'):
3267       ENTITY(mbw_lit("Ograve"),6,0xd2);
3268       break;
3269     case mbw_lit('s'):
3270       ENTITY(mbw_lit("Oslash"),6,0xd8);
3271       break;
3272     case mbw_lit('t'):
3273       ENTITY(mbw_lit("Otilde"),6,0xd5);
3274       break;
3275     case mbw_lit('u'):
3276       ENTITY(mbw_lit("Ouml"),4,0xd6);
3277       break;
3278     }
3279     break;
3280 
3281   case mbw_lit('p'):
3282     switch(line[2]) {
3283     case mbw_lit('a'):
3284       ENTITY(mbw_lit("part"),4,0x2202);
3285       break;
3286     case mbw_lit('e'):
3287       ENTITY(mbw_lit("perp"),4,0x22a5) else ENTITY(mbw_lit("permil"),6,0x2030);
3288       break;
3289     case mbw_lit('h'):
3290       ENTITY(mbw_lit("phi"),3,0x03c6);
3291       break;
3292     case mbw_lit('i'):
3293       ENTITY(mbw_lit("pi"),2,0x03c0) else ENTITY(mbw_lit("piv"),3,0x03d6);
3294       break;
3295     case mbw_lit('r'):
3296       ENTITY(mbw_lit("prime"),5,0x2032) else	ENTITY(mbw_lit("prod"),4,0x220f);
3297       break;
3298     case mbw_lit('s'):
3299       ENTITY(mbw_lit("psi"),3,0x03c8);
3300       break;
3301     }
3302     break;
3303 
3304   case mbw_lit('P'):
3305     switch(line[2]) {
3306     case mbw_lit('a'):
3307       ENTITY(mbw_lit("Para"),4,0xb6);
3308       break;
3309     case mbw_lit('h'):
3310       ENTITY(mbw_lit("Phi"),3,0x03a6);
3311       break;
3312     case mbw_lit('i'):
3313       ENTITY(mbw_lit("Pi"),2,0x03a0);
3314       break;
3315     case mbw_lit('l'):
3316       ENTITY(mbw_lit("Plusmn"),6,0xb1);
3317       break;
3318     case mbw_lit('o'):
3319       ENTITY(mbw_lit("Pound"),5,0xa3);
3320       break;
3321     case mbw_lit('r'):
3322       ENTITY(mbw_lit("Prime"),5,0x2033) else	ENTITY(mbw_lit("Prop"),4,0x221d);
3323       break;
3324     case mbw_lit('s'):
3325       ENTITY(mbw_lit("Psi"),3,0x03a8);
3326       break;
3327     }
3328     break;
3329 
3330   case mbw_lit('q'):
3331     switch(line[2]) {
3332     case mbw_lit('u'):
3333       ENTITY(mbw_lit("quot"),4,0x22);
3334       break;
3335     }
3336     break;
3337 
3338   case mbw_lit('Q'):
3339     /* nothing */
3340     break;
3341 
3342   case mbw_lit('r'):
3343     switch(line[2]) {
3344     case mbw_lit('a'):
3345       ENTITY(mbw_lit("rarr"),4,0x2192) else ENTITY(mbw_lit("radic"),5,0x221a) else
3346 	ENTITY(mbw_lit("rang"),4,0x232a);
3347       break;
3348     case mbw_lit('A'):
3349       ENTITY(mbw_lit("rArr"),4,0x21d2);
3350       break;
3351     case mbw_lit('c'):
3352       ENTITY(mbw_lit("rceil"),5,0x2309);
3353       break;
3354     case mbw_lit('d'):
3355       ENTITY(mbw_lit("rdquo"),5,0x201d);
3356       break;
3357     case mbw_lit('e'):
3358       ENTITY(mbw_lit("real"),4,0x211C) else ENTITY(mbw_lit("reg"),3,0xae);
3359       break;
3360     case mbw_lit('f'):
3361       ENTITY(mbw_lit("rfloor"),6,0x230a);
3362       break;
3363     case mbw_lit('h'):
3364       ENTITY(mbw_lit("rho"),3,0x03c1);
3365       break;
3366     case mbw_lit('l'):
3367       ENTITY(mbw_lit("rlm"),3,0x200f);
3368       break;
3369     case mbw_lit('s'):
3370       ENTITY(mbw_lit("rsquo"),5,0x2019) else	ENTITY(mbw_lit("rsaquo"),6,0x203a);
3371       break;
3372     }
3373     break;
3374 
3375   case mbw_lit('R'):
3376     switch(line[2]) {
3377     case mbw_lit('a'):
3378       ENTITY(mbw_lit("Raquo"),5,0xbb);
3379       break;
3380     case mbw_lit('e'):
3381       ENTITY(mbw_lit("Reg"),3,0xae);
3382       break;
3383     case mbw_lit('h'):
3384       ENTITY(mbw_lit("Rho"),3,0x03a1);
3385       break;
3386     }
3387     break;
3388 
3389   case mbw_lit('s'):
3390     switch(line[2]) {
3391     case mbw_lit('b'):
3392       ENTITY(mbw_lit("sbquo"),5,0x201a);
3393       break;
3394     case mbw_lit('c'):
3395       ENTITY(mbw_lit("scaron"),6,0x0161);
3396       break;
3397     case mbw_lit('d'):
3398       ENTITY(mbw_lit("sdot"),4,0x22c5);
3399       break;
3400     case mbw_lit('e'):
3401       ENTITY(mbw_lit("sect"),4,0xa7);
3402       break;
3403     case mbw_lit('h'):
3404       ENTITY(mbw_lit("shy"),3,0xad);
3405       break;
3406     case mbw_lit('i'):
3407       ENTITY(mbw_lit("sigmaf"),6,0x03c2) else ENTITY(mbw_lit("sigma"),5,0x03c3) else
3408 	ENTITY(mbw_lit("sim"),3,0x223c);
3409       break;
3410     case mbw_lit('p'):
3411       ENTITY(mbw_lit("spades"),6,0x2660);
3412       break;
3413     case mbw_lit('u'):
3414       ENTITY(mbw_lit("sup2"),4,0xb2) else ENTITY(mbw_lit("sup3"),4,0xb3) else
3415 	ENTITY(mbw_lit("sup1"),4,0xb9) else ENTITY(mbw_lit("sum"),3,0x2211) else
3416 	  ENTITY(mbw_lit("sub"),3,0x2282) else ENTITY(mbw_lit("sup"),3,0x2283) else
3417 	    ENTITY(mbw_lit("sube"),4,0x2286) else ENTITY(mbw_lit("supe"),4,0x2287);
3418       break;
3419     case mbw_lit('z'):
3420       ENTITY(mbw_lit("szlig"),5,0xdf);
3421       break;
3422     }
3423     break;
3424 
3425   case mbw_lit('S'):
3426     switch(line[2]) {
3427     case mbw_lit('c'):
3428       ENTITY(mbw_lit("Scaron"),6,0x0160);
3429       break;
3430     case mbw_lit('i'):
3431       ENTITY(mbw_lit("Sigma"),5,0x03a3);
3432       break;
3433     }
3434     break;
3435 
3436   case mbw_lit('t'):
3437     switch(line[2]) {
3438     case mbw_lit('a'):
3439       ENTITY(mbw_lit("tau"),3,0x03c4);
3440       break;
3441     case mbw_lit('h'):
3442       ENTITY(mbw_lit("thorn"),5,0xfe) else ENTITY(mbw_lit("theta"),5,0x03b8) else
3443 	ENTITY(mbw_lit("thetasym"),8,0x03d1) else ENTITY(mbw_lit("there4"),6,0x2234) else
3444 	  ENTITY(mbw_lit("thinsp"),6,0x2009);
3445       break;
3446     case mbw_lit('i'):
3447       ENTITY(mbw_lit("times"),5,0xd7) else ENTITY(mbw_lit("tilde"),5,0x02dc);
3448       break;
3449     case mbw_lit('r'):
3450       ENTITY(mbw_lit("trade"),5,0x2122);
3451       break;
3452     }
3453     break;
3454 
3455   case mbw_lit('T'):
3456     switch(line[2]) {
3457     case mbw_lit('a'):
3458       ENTITY(mbw_lit("Tau"),3,0x03a4);
3459       break;
3460     case mbw_lit('h'):
3461       ENTITY(mbw_lit("Theta"),5,0x0398);
3462       break;
3463     case mbw_lit('H'):
3464       ENTITY(mbw_lit("THORN"),5,0xde);
3465       break;
3466     }
3467     break;
3468 
3469   case mbw_lit('u'):
3470     switch(line[2]) {
3471     case mbw_lit('a'):
3472       ENTITY(mbw_lit("uacute"),6,0xfa) else ENTITY(mbw_lit("uarr"),4,0x2191);
3473       break;
3474     case mbw_lit('A'):
3475       ENTITY(mbw_lit("uArr"),4,0x21d1);
3476       break;
3477     case mbw_lit('c'):
3478       ENTITY(mbw_lit("ucirc"),5,0xfb);
3479       break;
3480     case mbw_lit('g'):
3481       ENTITY(mbw_lit("ugrave"),6,0xf9);
3482       break;
3483     case mbw_lit('m'):
3484       ENTITY(mbw_lit("uml"),3,0xa8);
3485       break;
3486     case mbw_lit('p'):
3487       ENTITY(mbw_lit("upsilon"),7,0xc5) else ENTITY(mbw_lit("upsih"),5,0x03d2);
3488       break;
3489     case mbw_lit('u'):
3490       ENTITY(mbw_lit("uuml"),4,0xfc);
3491       break;
3492     }
3493     break;
3494 
3495   case mbw_lit('U'):
3496     switch(line[2]) {
3497     case mbw_lit('a'):
3498       ENTITY(mbw_lit("Uacute"),6,0xda);
3499       break;
3500     case mbw_lit('c'):
3501       ENTITY(mbw_lit("Ucirc"),5,0xdb);
3502       break;
3503     case mbw_lit('g'):
3504       ENTITY(mbw_lit("Ugrave"),6,0xd9);
3505       break;
3506     case mbw_lit('p'):
3507       ENTITY(mbw_lit("Upsilon"),7,0xa5);
3508       break;
3509     case mbw_lit('u'):
3510       ENTITY(mbw_lit("Uuml"),4,0xdc);
3511       break;
3512     }
3513     break;
3514 
3515   case mbw_lit('v'):
3516     /* nothing */
3517     break;
3518 
3519   case mbw_lit('V'):
3520     /* nothing */
3521     break;
3522 
3523   case mbw_lit('w'):
3524     switch(line[2]) {
3525     case mbw_lit('e'):
3526       ENTITY(mbw_lit("weierp"),6,0x2118);
3527       break;
3528     }
3529     break;
3530 
3531   case mbw_lit('W'):
3532     /* nothing */
3533     break;
3534 
3535   case mbw_lit('x'):
3536     switch(line[2]) {
3537     case mbw_lit('i'):
3538       ENTITY(mbw_lit("xi"),2,0x03be);
3539       break;
3540     }
3541     break;
3542 
3543   case mbw_lit('X'):
3544     switch(line[2]) {
3545     case mbw_lit('i'):
3546       ENTITY(mbw_lit("Xi"),2,0x039e);
3547       break;
3548     }
3549     break;
3550 
3551   case mbw_lit('y'):
3552     switch(line[2]) {
3553     case mbw_lit('a'):
3554       ENTITY(mbw_lit("yacute"),6,0xfd);
3555       break;
3556     case mbw_lit('e'):
3557       ENTITY(mbw_lit("yen"),3,0xa5);
3558       break;
3559     case mbw_lit('u'):
3560       ENTITY(mbw_lit("yuml"),4,0xff);
3561       break;
3562     }
3563     break;
3564 
3565   case mbw_lit('Y'):
3566     switch(line[2]) {
3567     case mbw_lit('a'):
3568       ENTITY(mbw_lit("Yacute"),6,0xdd);
3569       break;
3570     case mbw_lit('u'):
3571       ENTITY(mbw_lit("Yuml"),4,0x0178);
3572       break;
3573     }
3574     break;
3575 
3576   case mbw_lit('z'):
3577     switch(line[2]) {
3578     case mbw_lit('e'):
3579       ENTITY(mbw_lit("zeta"),4,0x03b6);
3580       break;
3581     case mbw_lit('w'):
3582       ENTITY(mbw_lit("zwnj"),4,0x200c) else ENTITY(mbw_lit("zwj"),3,0x200d);
3583       break;
3584     }
3585     break;
3586 
3587   case mbw_lit('Z'):
3588     switch(line[2]) {
3589     case mbw_lit('e'):
3590       ENTITY(mbw_lit("Zeta"),4,0x0396);
3591       break;
3592     }
3593     break;
3594 
3595   default:
3596     break;
3597   }
3598 
3599   /* some values of c are not allowed, because they interfere with the
3600      html parser */
3601   switch(c) {
3602   case L'\0':
3603   case L'\001': /* TOKENSEP */
3604   case L'\002': /* CLASSEP */
3605   case L'\003': /* DIAMOND */
3606     /* reserved control codes */
3607     c = L' '; break;
3608   case L'<': c = L'('; break;
3609   case L'>': c = L')'; break;
3610   default: break;
3611   }
3612 
3613   /* normally, entities end with ';', which will be skipped
3614      after we exit this function. However, we're lenient: if
3615      we don't point to ';', then we back up by one so that later
3616      we don't skip this character. Note this is safe, because
3617      ENTITY makes r point to at least line +1, and otherwise r is NULL */
3618   if( r && (*r != mbw_lit(';')) ) { r--; }
3619 
3620 #if defined MBW_WIDE
3621 
3622   if( c && r ) {
3623     *q++ = c;
3624     line = r;
3625     retval = 1;
3626   } else {
3627     /* do nothing */
3628   }
3629 #else
3630 
3631   /* now if c is nonzero, then we found the entity */
3632   if( c && r ) {
3633     if( c == 0xa0 ) {
3634       /* shortcut for &nbsp; */
3635       *q++ = mbw_lit(' ');
3636     } else {
3637       s = wcrtomb(scratch,c,NULL);
3638       if( (s > -1) && (q + s <= r) ) {
3639 	for(t = 0; t < s; t++) { *q++ = scratch[t]; }
3640       } else {
3641 	/* locale doesn't recognize this char */
3642 	s = c;
3643 	if( s < 0xFF ) { *q++ = (mbw_t)s; }
3644       }
3645     }
3646     line = r;
3647     retval = 1;
3648   } else {
3649     /* do nothing */
3650   }
3651 #endif
3652 
3653 #else /* HAVE_MBRTOWC is not defined */
3654 
3655   /* do nothing */
3656 
3657 #endif
3658 
3659   /* reminder: if no conversion is possible, we do nothing -
3660      I always forget this and try to update q and line, thereby
3661      introducing bugs */
3662   *lline = line;
3663   *qq = q;
3664   return retval;
3665 }
3666 
3667 /*
3668  * this code generates decode_escaped_uri_character() and
3669  * w_decode_escaped_uri_character().
3670  *
3671  * NOTE: this doesn't cope correctly with the case that the
3672  * URI encoded character is itself encoded as html entities.
3673  * For example, %20 can itself be encoded as &#37;&#50;&#48;
3674  * Since the first char is decoded as '%' (otherwise we wouldn't
3675  * be inside decode_uri_character(), the function effectively
3676  * tries to decode %&#50;&#48; and fails. This is harmless, as
3677  * the ultimately decode line will be %20 instead of ' '.
3678  */
mbw_prefix(decode_uri_character)3679 void mbw_prefix(decode_uri_character)(mbw_t **lline, mbw_t **qq) {
3680   mbw_t *line = *lline;
3681   mbw_t *q = *qq;
3682   mbw_t scratch[3];
3683   mbw_t c = 0;
3684   mbw_t *r;
3685 
3686   if( *line == mbw_lit('%') ) {
3687 #if defined MBW_MB || (defined MBW_WIDE && defined HAVE_WCSTOL)
3688     /* check that the next two chars are hex */
3689     for(r = line + 1; mbw_isspace(*r); r++);
3690     if( mbw_isxdigit(*r) ) {
3691       scratch[0] = *r;
3692       for(r++; mbw_isspace(*r); r++);
3693       if( mbw_isxdigit(*r) ) {
3694 	scratch[1] = *r;
3695 	scratch[2] = mbw_lit('\0');
3696 	c = (mbw_t)mbw_strtol(scratch, NULL, 16);
3697       }
3698     }
3699 #endif
3700     if( c ) {
3701       *q++ = c;
3702       line = r;
3703     } else {
3704       *q++ = *line;
3705     }
3706 
3707   } else if( mbw_isspace(*line) ) {
3708     /* ignore */
3709   } else {
3710     /* not an escaped character */
3711     *q++ = *line;
3712   }
3713 
3714   *lline = line;
3715   *qq = q;
3716 }
3717 
3718 /* this reads one or more characters from line and outputs
3719  * zero or one character at q.
3720  * The characters output depend on the xml.attribute state, but
3721  * the function never outputs more than it reads. Thus if q <= line
3722  * to begin with, this is preserved. This is necessary as the function
3723  * is normally used to modify line in-place.
3724  *
3725  * For URL type attributes, this function assumes the standard URI
3726  * form scheme:netloc/extra, and only prints the netloc part.
3727  *
3728  * A special case is the URI javascript:xxxxx, which is treated differently.
3729  *
3730  * CAUTION: if you increment qq, don't also call decode_uri_character().
3731  *          Just do one or the other.
3732  */
3733 static
mbw_prefix(xml_attribute_filter)3734 void mbw_prefix(xml_attribute_filter)(XML_State *xml, mbw_t **lline, mbw_t **qq) {
3735 
3736   if( (xml->attribute != UNDEF) && (*(*lline) == mbw_lit('&')) ) {
3737     if( mbw_prefix(decode_html_entity)(lline, qq) ) {
3738       (*lline)++;
3739     }
3740   }
3741 
3742   switch(xml->attribute) {
3743   case SRC:
3744     if( *(*lline) == mbw_lit(':') ) {
3745       xml->attribute = SRC_NETLOC_PREFIX;
3746       *(*qq)++ = *(*lline);
3747     } else if( mbw_strncasecmp(*lline, mbw_lit("javascript:"), 11) == 0 ) {
3748       /* scripts are dealt differently */
3749       xml->attribute = JSCRIPT;
3750       if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
3751 	*(*qq)++ = *(*lline);
3752       }
3753     } else {
3754       *(*qq)++ = *(*lline);
3755     }
3756     break;
3757   case SRC_NETLOC_PREFIX:
3758     switch(*(*lline)) {
3759     case mbw_lit('/'):
3760       *(*qq)++ = *(*lline);
3761       break;
3762     case mbw_lit('?'):
3763     case mbw_lit(';'):
3764     case mbw_lit('#'):
3765     case mbw_lit('&'):
3766       xml->attribute = SRC_NETLOC_SUFFIX;
3767       *(*qq)++ = *(*lline);
3768       break;
3769     default:
3770       mbw_prefix(decode_uri_character)(lline,qq);
3771       xml->attribute = SRC_NETLOC;
3772       break;
3773     }
3774     break;
3775   case SRC_NETLOC:
3776     switch(*(*lline)) {
3777     case mbw_lit('/'):
3778       *(*qq)++ = *(*lline);
3779       xml->attribute = SRC_NETLOC_PATH;
3780       break;
3781     case mbw_lit('?'):
3782     case mbw_lit(';'):
3783     case mbw_lit('#'):
3784     case mbw_lit('&'):
3785       xml->attribute = SRC_NETLOC_SUFFIX;
3786       break;
3787     default:
3788       mbw_prefix(decode_uri_character)(lline,qq);
3789       break;
3790     }
3791     break;
3792   case SRC_NETLOC_PATH:
3793     switch(*(*lline)) {
3794     case mbw_lit('.'):
3795       *(*qq)++ = *(*lline);
3796       break;
3797     case mbw_lit('?'):
3798     case mbw_lit(';'):
3799     case mbw_lit('#'):
3800     case mbw_lit('&'):
3801       xml->attribute = SRC_NETLOC_SUFFIX;
3802       *(*qq)++ = *(*lline);
3803       break;
3804     default:
3805       mbw_prefix(decode_uri_character)(lline,qq);
3806       break;
3807     }
3808     break;
3809   case SRC_NETLOC_SUFFIX:
3810 /*     *(*qq)++ = mbw_lit(' '); */
3811     *(*qq)++ = *(*lline);
3812     break;
3813   case ALT:
3814     *(*qq)++ = *(*lline);
3815     break;
3816   case UNDEF:
3817     /* nothing */
3818     break;
3819   case JSCRIPT:
3820     if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
3821       *(*qq)++ = *(*lline);
3822     }
3823     break;
3824   case ASTYLE:
3825     if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
3826       *(*qq)++ = *(*lline);
3827     }
3828     break;
3829   }
3830 }
3831 
/* Removes tags in the string, modifying it in place.
 * The name of this function is a misnomer, since it doesn't
 * parse xml properly.
3835  *
3836  * The filter can be called in several "modes" selected by the
3837  * xml.parser variable.
3838  *
3839  * The simplest parsing is xpDUMB mode, which simply skips XML like
3840  * tags without looking inside them.
3841  *
3842  * For HTML parsing, there is xpHTML mode, and its counterpart xpSMART
3843  * mode. xpHTML looks inside common HTML tags and can print the contents
3844  * of attributes. To explain xpSMART mode, remember that HTML documents
3845  * should normally be written inside <html> and </html> tags. If these
3846  * tags are found, then everything outside them is handled by xpSMART mode.
3847  *
3848  * Thus, in particular, a new document should be started in xpSMART mode.
3849  * For text documents, this ensures that any preambles are not rendered,
3850  * until true HTML is encountered.
3851  *
3852  * However, there is a small problem, namely the <html> tags are
3853  * optional.  For text documents, missing <html> tags are rare, but
3854  * email often contains only fragments with <html> missing.  To cope
3855  * with this, in mail mode, xpSMART scans the current line and
3856  * switches immediately to xpHTML mode if the line is printable.
3857  *
3858  * This turns out to be an extremely important function, because
3859  * spammers don't always label attachments correctly. So it's possible
3860  * to get a binary stream labeled as text/html, and of course lots of
3861  * junk tokens. If xpSMART mode detects binary, then it does NOT
3862  * switch to xpHTML mode immediately, and nothing gets printed.  If
3863  * and when a valid <html> tag is found later, HTML will be enabled as
3864  * necessary. I think this is a robust partial solution to an
3865  * intractable problem.
3866  */
mbw_prefix(xml_character_filter)3867 void mbw_prefix(xml_character_filter)(XML_State *xml, mbw_t *line) {
3868   mbw_t *q;
3869   q = line;
3870 /*   int k; */
3871 
3872   /* don't call this with y < 1 */
3873 #define TAGMATCH(x,y) (!mbw_strncasecmp(line + 1, x + 1, y - 1) && (mbw_isspace(line[y]) || (line[y] == mbw_lit('>')) || (line[y] == mbw_lit('\0'))) && (line += (y - 1)))
3874 
3875 #define ATTRMATCH(x,y) (!mbw_strncasecmp(line + 1, x + 1, y - 1) && (mbw_isspace(line[y]) || (line[y] == mbw_lit('=')) || (line[y] == mbw_lit('\0'))) && (line += (y - 1)))
3876 
3877   /* this is convenient for debugging */
3878 #define PDEBUG(x) printf(#x"{%c%c%c%c%c%c%c%c%c%c}\n", line[0], line[1], line[2], line[3], line[4], line[5], line[6], line[7], line[8], line[9])
3879 #define PDEBUG2(x,y) {printf(#x"2{"); for(k = 0; k < y; k++) printf("%c",line[k]); printf("}\n"); }
3880 
3881   /* this is important - read comments above */
3882   if( (m_options & (1<<M_OPTION_MBOX_FORMAT)) &&
3883       (xml->parser == xpSMART) &&
3884       !mbw_prefix(is_binline)(line) &&
3885       !mbw_prefix(is_emptyspace)(line) ) {
3886     xml->parser = xpHTML;
3887   }
3888 
3889   while( *line ) {
3890 /*     printf("%d %d ->", xml->state, xml->attribute); */
3891 /*     PDEBUG(LINE); */
3892     switch(xml->state) {
3893     case TEXT:
3894       switch(line[0]) {
3895       case mbw_lit('<'):
3896 	/* does it look like <x where x is either alpha or punctuation? */
3897 	if( mbw_isalpha(line[1]) ) {
3898 	  line++;
3899 	  /* tags aren't mined, xtags are */
3900 	  xml->state = TAG;
3901 	  xml->attribute = UNDEF;
3902 	  switch(mbw_tolower(line[0])) {
3903 	  case mbw_lit('a'):
3904 	    if( (line[1] == mbw_lit('\0')) || mbw_isspace(line[1]) ||
3905 		TAGMATCH(mbw_lit("area"),4) ||
3906 		TAGMATCH(mbw_lit("applet"),6) ) {
3907 	      xml->state = XTAG;
3908 	    }
3909 	    break;
3910 	  case mbw_lit('b'):
3911 	    if( TAGMATCH(mbw_lit("base"),4) ||
3912 		TAGMATCH(mbw_lit("bgsound"),7) ) {
3913 	      xml->state = XTAG;
3914 	    } else if( TAGMATCH(mbw_lit("br"),2) ) {
3915 	      *q++ = mbw_lit('\n');
3916 	      xml->state = TAG;
3917 	    } else if( TAGMATCH(mbw_lit("body"),4) ) {
3918 	      xml->hide = VISIBLE;
3919 	      if( xml->parser == xpSMART ) {
3920 		xml->parser = xpHTML;
3921 	      }
3922 	      xml->state = XTAG;
3923 	    }
3924 	    break;
3925 	  case mbw_lit('c'):
3926 	    if( TAGMATCH(mbw_lit("comment"),7) ) {
3927 	      if( xml->parser == xpHTML ) {
3928 		xml->hide = COMMENT;
3929 	      }
3930 	    }
3931 	    break;
3932 	  case mbw_lit('d'):
3933 	    if( TAGMATCH(mbw_lit("div"),3) ) {
3934 	      xml->state = XTAG;
3935 	    }
3936 	    break;
3937 	  case mbw_lit('e'):
3938 	    if( TAGMATCH(mbw_lit("embed"),5) ) {
3939 	      xml->state = XTAG;
3940 	    }
3941 	    break;
3942 	  case mbw_lit('f'):
3943 	    if( TAGMATCH(mbw_lit("frame"),5) ||
3944 		TAGMATCH(mbw_lit("form"),4) ) {
3945 	      xml->state = XTAG;
3946 	    }
3947 	    break;
3948 	  case mbw_lit('h'):
3949 	    if( TAGMATCH(mbw_lit("html"),4) || TAGMATCH(mbw_lit("head"),4) ) {
3950 	      xml->hide = VISIBLE;
3951 	      if( xml->parser == xpSMART ) {
3952 		xml->parser = xpHTML;
3953 	      }
3954 	    } else if( TAGMATCH(mbw_lit("hr"),2) ) {
3955 	      *q++ = mbw_lit('\n');
3956 	      xml->state = TAG;
3957 	    }
3958 	    break;
3959 	  case mbw_lit('i'):
3960 	    if( TAGMATCH(mbw_lit("img"),3) ||
3961 		TAGMATCH(mbw_lit("iframe"),6) ||
3962 		TAGMATCH(mbw_lit("ilayer"),6) ||
3963 		TAGMATCH(mbw_lit("input"),5) ) {
3964 	      xml->state = XTAG;
3965 	    }
3966 	    break;
3967 	  case mbw_lit('l'):
3968 	    if( TAGMATCH(mbw_lit("layer"),5) ||
3969 		TAGMATCH(mbw_lit("link"),4) ) {
3970 	      xml->state = XTAG;
3971 	    }
3972 	    break;
3973 	  case mbw_lit('n'):
3974 	    if( (TAGMATCH(mbw_lit("noframes"),8) && (xml->hide = NOFRAMES)) ||
3975 		(TAGMATCH(mbw_lit("nolayer"),7) && (xml->hide = NOLAYER)) ||
3976 		(TAGMATCH(mbw_lit("noscript"),8) && (xml->hide = NOSCRIPT)) ||
3977 		(TAGMATCH(mbw_lit("noembed"),7) && (xml->hide = NOEMBED)) ) {
3978 	      if( xml->parser == xpHTML ) {
3979 		if( (m_options & (1<<M_OPTION_SHOW_ALT)) ) {
3980 		  xml->hide = VISIBLE;
3981 		}
3982 	      }
3983 	    }
3984 	    break;
3985 	  case mbw_lit('o'):
3986 	    if( TAGMATCH(mbw_lit("object"),6) ) {
3987 	      xml->state = XTAG;
3988 	    }
3989 	    break;
3990 	  case mbw_lit('s'):
3991 	    if( TAGMATCH(mbw_lit("span"),4) ) {
3992 	      xml->state = XTAG;
3993 	    } else if( TAGMATCH(mbw_lit("script"),6) ) {
3994 	      xml->hide = SCRIPT;
3995 	    } else if( TAGMATCH(mbw_lit("style"),5) ) {
3996 	      xml->hide = STYLE;
3997 	    }
3998 	    break;
3999 	  case mbw_lit('t'):
4000 	    if( TAGMATCH(mbw_lit("title"),5) ) {
4001 		xml->hide = TITLE;
4002 	    }
4003 	    break;
4004 	  default:
4005 	    /* ignore, ie it's a TAG */
4006 	    break;
4007 	  }
4008 	} else if( line[1] == mbw_lit('/') ) {
4009 	  line++;
4010 	  /* tags aren't mined, xtags are */
4011 	  xml->state = TAG;
4012 	  xml->attribute = UNDEF;
4013 	  switch(mbw_tolower(line[1])) {
4014 	  case mbw_lit('b'):
4015 	    if( TAGMATCH(mbw_lit("/body"),5) ) {
4016 	      if( xml->parser == xpHTML ) {
4017 		xml->parser = xpSMART;
4018 	      }
4019 	      xml->hide = VISIBLE;
4020 	    }
4021 	    break;
4022 	  case mbw_lit('c'):
4023 	    if( (xml->hide == COMMENT) && TAGMATCH(mbw_lit("/comment"),8) ) {
4024 	      /* nothing */
4025 	    }
4026 	    break;
4027 	  case mbw_lit('h'):
4028 	    if( TAGMATCH(mbw_lit("/html"),5) || TAGMATCH(mbw_lit("/head"),5) ) {
4029 	      if( xml->parser == xpHTML ) {
4030 		xml->parser = xpSMART;
4031 	      }
4032 	      xml->hide = VISIBLE;
4033 	    }
4034 	    break;
4035 	  case mbw_lit('n'):
4036 	    if( ((xml->hide == NOFRAMES) && TAGMATCH(mbw_lit("/noframes"),9)) ||
4037 		((xml->hide == NOSCRIPT) && TAGMATCH(mbw_lit("/noscript"),9)) ||
4038 		((xml->hide == NOLAYER) && TAGMATCH(mbw_lit("/nolayer"),8)) ||
4039 		((xml->hide == NOEMBED) && TAGMATCH(mbw_lit("/noembed"),8)) ) {
4040 	      xml->hide = VISIBLE;
4041 	    }
4042 	    break;
4043 	  case mbw_lit('s'):
4044 	    if( ((xml->hide == SCRIPT) && TAGMATCH(mbw_lit("/script"),7)) ||
4045 		((xml->hide == STYLE) && TAGMATCH(mbw_lit("/style"),6)) ) {
4046 	      xml->hide = VISIBLE;
4047 	    }
4048 	    break;
4049 	  case mbw_lit('t'):
4050 	    if( TAGMATCH(mbw_lit("/title"),6) ) {
4051 	      xml->hide = VISIBLE;
4052 	    }
4053 	    break;
4054 	  default:
4055 	    /* ignore, ie it's a TAG */
4056 	    break;
4057 	  }
4058 	} else { /* second char is not alpha or slash */
4059 	  if( mbw_strncmp(line + 1, mbw_lit("!--"), 3) == 0 ) {
4060 	    if( line[4] == mbw_lit('>') ) {
4061 	      /* buggy MSHTML accepts <!--> as a comment, so ignore this */
4062 	      line += 4;
4063 	    } else { /* real comment */
4064 	      xml->state = CMNT;
4065 	      line += 3;
4066 	    }
4067 	  } else if( line[1] == mbw_lit('<') ) {
4068 	    /* stay in TEXT state */
4069 	    if( xml->parser == xpDUMB ) {
4070 	      line++;
4071 	    } else {
4072 	      switch(xml->hide) {
4073 	      case VISIBLE:
4074 	      case TITLE:
4075 		*q++ = *line;
4076 		break;
4077 	      case SCRIPT:
4078 		if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
4079 		  *q++ = *line;
4080 		}
4081 		break;
4082 	      case STYLE:
4083 		if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4084 		  *q++ = *line;
4085 		}
4086 		break;
4087 	      default:
4088 		break;
4089 	      }
4090 	    }
4091 	  } else {
4092 	    /* bogus tag? */
4093 	    xml->state = TAG;
4094 	    line++;
4095 	  }
4096 	}
4097 	break;
4098       case mbw_lit('&'):
4099 	if( (xml->parser == xpHTML) || (xml->parser == xpDUMB) ) {
4100 	  switch(xml->hide) {
4101 	  case VISIBLE:
4102 	  case TITLE:
4103 	    mbw_prefix(decode_html_entity)(&line, &q);
4104 	    break;
4105 	  case SCRIPT:
4106 	    if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
4107 	      mbw_prefix(decode_html_entity)(&line, &q);
4108 	    }
4109 	    break;
4110 	  case STYLE:
4111 	    if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4112 	      mbw_prefix(decode_html_entity)(&line, &q);
4113 	    }
4114 	    break;
4115 	  default:
4116 	    break;
4117 	  }
4118 	}
4119 	break;
4120       default:
4121 	if( (xml->parser == xpHTML) || (xml->parser == xpDUMB) ) {
4122 	  switch(xml->hide) {
4123 	  case VISIBLE:
4124 	  case TITLE:
4125 	    *q++ = *line;
4126 	    break;
4127 	  case SCRIPT:
4128 	    if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
4129 	      *q++ = *line;
4130 	    }
4131 	    break;
4132 	  case STYLE:
4133 	    if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4134 	      *q++ = *line;
4135 	    }
4136 	    break;
4137 	  default:
4138 	    break;
4139 	  }
4140 	}
4141 	break;
4142       }
4143       break;
4144     case TAG:
4145       if( line[0] == mbw_lit('>') ) {
4146 	xml->state = TEXT;
4147       } else if( line[0] == mbw_lit('=') ) {
4148 	xml->state = TAGPREQ;
4149       }
4150       break;
4151     case TAGPREQ:
4152       if( line[0] == mbw_lit('\'') ) {
4153 	xml->state = TAGQUOTE;
4154       } else if( line[0] == mbw_lit('"') ) {
4155 	xml->state = TAGDQUOTE;
4156       } else if( !mbw_isspace(line[0]) ) {
4157 	xml->state = TAG;
4158 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4159       }
4160       break;
4161     case TAGQUOTE:
4162       if( (line[0] == '\\') && (line[1] == mbw_lit('\'')) ) {
4163 	line++;
4164       } else if( line[0] == mbw_lit('\'') ) {
4165 	xml->state = TAG;
4166       }
4167       break;
4168     case TAGDQUOTE:
4169       if( (line[0] == '\\') && (line[1] == mbw_lit('"')) ) {
4170 	line++;
4171       } else if( line[0] == mbw_lit('"') ) {
4172 	xml->state = TAG;
4173       }
4174       break;
4175     case XTAG:
4176       if( xml->parser == xpSMART ) {
4177 	/* we've recognized an HTML tag */
4178 	xml->parser = xpHTML;
4179       }
4180       if( (xml->attribute != UNDEF) &&
4181 	  mbw_isspace(line[0]) &&
4182 	  !MBW_EMPTYLINE(line) ) {
4183 	xml->attribute = UNDEF;
4184 	*q++ = ATTRIBSEP;
4185       } else
4186       switch(mbw_tolower(line[0])) {
4187       case mbw_lit('>'):
4188 	xml->state = TEXT;
4189 	if( xml->attribute != UNDEF ) {
4190 	  xml->attribute = UNDEF;
4191 	  *q++ = ATTRIBSEP;
4192 	}
4193 	break;
4194       case mbw_lit('='):
4195 	xml->state = XTAGPREQ;
4196 	break;
4197       case mbw_lit('a'):
4198 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4199 	if( m_options & (1<<M_OPTION_SHOW_ALT) ) {
4200 	  if( !mbw_strncasecmp(line, mbw_lit("alt"), 3) ) {
4201 	    if( xml->attribute != UNDEF ) {
4202 	      *q++ = ATTRIBSEP;
4203 	    }
4204 	    xml->attribute = ALT;
4205 	    line += 2;
4206 	    *q++ = ATTRIBSEP;
4207 	  }
4208 	}
4209 	if( m_options & (1<<M_OPTION_SHOW_FORMS) ) {
4210 	  if( !mbw_strncasecmp(line, mbw_lit("action"), 6) ) {
4211 	    if( xml->attribute != UNDEF ) {
4212 	      *q++ = ATTRIBSEP;
4213 	    }
4214 	    xml->attribute = SRC;
4215 	    line += 5;
4216 	    *q++ = ATTRIBSEP;
4217 	  }
4218 	}
4219 	break;
4220       case mbw_lit('c'):
4221 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4222 	if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4223 	  if( !mbw_strncasecmp(line, mbw_lit("class"), 5) ) {
4224 	    if( xml->attribute != UNDEF ) {
4225 	      *q++ = ATTRIBSEP;
4226 	    }
4227 	    xml->attribute = ASTYLE;
4228 	    line += 4;
4229 	    *q++ = ATTRIBSEP;
4230 	  }
4231 	}
4232 	break;
4233       case mbw_lit('d'):
4234 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4235 	if( m_options & (1<<M_OPTION_SHOW_LINKS) ) {
4236 	  if( !mbw_strncasecmp(line, mbw_lit("data"), 4) ) {
4237 	    if( xml->attribute != UNDEF ) {
4238 	      *q++ = ATTRIBSEP;
4239 	    }
4240 	    xml->attribute = SRC;
4241 	    line += 3;
4242 	    *q++ = ATTRIBSEP;
4243 	  }
4244 	}
4245 	break;
4246       case mbw_lit('h'):
4247 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4248 	if( m_options & (1<<M_OPTION_SHOW_LINKS) ) {
4249 	  if( !mbw_strncasecmp(line, mbw_lit("href"), 4) ) {
4250 	    if( xml->attribute != UNDEF ) {
4251 	      *q++ = ATTRIBSEP;
4252 	    }
4253 	    xml->attribute = SRC;
4254 	    line += 3;
4255 	    *q++ = ATTRIBSEP;
4256 	  }
4257 	}
4258 	break;
4259       case mbw_lit('o'):
4260 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4261 	if( mbw_tolower(line[1]) != mbw_lit('n') ) {
4262 	  /* false alarm */
4263 	} else if( ATTRMATCH(mbw_lit("onmousedown"), 11) ||
4264 		   ATTRMATCH(mbw_lit("onmousemove"), 11) ||
4265 		   ATTRMATCH(mbw_lit("onmouseout"), 10) ||
4266 		   ATTRMATCH(mbw_lit("onmouseover"), 11) ||
4267 		   ATTRMATCH(mbw_lit("onmouseup"), 9) ||
4268 
4269 		   ATTRMATCH(mbw_lit("onclick"), 7) ||
4270 		   ATTRMATCH(mbw_lit("ondblclick"), 10) ||
4271 		   ATTRMATCH(mbw_lit("onfocus"), 7) ||
4272 
4273 		   ATTRMATCH(mbw_lit("onkeydown"), 9) ||
4274 		   ATTRMATCH(mbw_lit("onkeypress"), 10) ||
4275 		   ATTRMATCH(mbw_lit("onkeyup"), 7) ||
4276 
4277 		   ATTRMATCH(mbw_lit("ondataavailable"), 15) ||
4278 		   ATTRMATCH(mbw_lit("ondatasetchanged"), 16) ||
4279 		   ATTRMATCH(mbw_lit("ondatasetcomplete"), 17) ||
4280 
4281 		   ATTRMATCH(mbw_lit("onabort"), 7) ||
4282 		   ATTRMATCH(mbw_lit("onload"), 6) ||
4283 		   ATTRMATCH(mbw_lit("onunload"), 8) ||
4284 		   ATTRMATCH(mbw_lit("onmove"), 6) ||
4285 		   ATTRMATCH(mbw_lit("onresize"), 8) ||
4286 		   ATTRMATCH(mbw_lit("onsubmit"), 8) ) {
4287 	  if( xml->attribute != UNDEF ) {
4288 	    *q++ = ATTRIBSEP;
4289 	  }
4290 
4291 	  xml->attribute = JSCRIPT;
4292 	  *q++ = ATTRIBSEP;
4293 	  /* line is already updated by ATTRMATCH */
4294 	}
4295 	break;
4296       case mbw_lit('s'):
4297 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4298 	if( m_options & (1<<M_OPTION_SHOW_LINKS) ) {
4299 	  if( !mbw_strncasecmp(line, mbw_lit("src"), 3) ) {
4300 	    if( xml->attribute != UNDEF ) {
4301 	      *q++ = ATTRIBSEP;
4302 	    }
4303 	    xml->attribute = SRC;
4304 	    line += 2;
4305 	    *q++ = ATTRIBSEP;
4306 	  }
4307 	}
4308 	if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4309 	  if( !mbw_strncasecmp(line, mbw_lit("style"), 5) ) {
4310 	    if( xml->attribute != UNDEF ) {
4311 	      *q++ = ATTRIBSEP;
4312 	    }
4313 	    xml->attribute = ASTYLE;
4314 	    line += 4;
4315 	    *q++ = ATTRIBSEP;
4316 	  }
4317 	}
4318 	break;
4319       case mbw_lit('t'):
4320 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4321 	if( m_options & (1<<M_OPTION_SHOW_ALT) ) {
4322 	  if( !mbw_strncasecmp(line, mbw_lit("title"), 5) ) {
4323 	    if( xml->attribute != UNDEF ) {
4324 	      *q++ = ATTRIBSEP;
4325 	    }
4326 	    xml->attribute = ALT;
4327 	    line += 4;
4328 	    *q++ = ATTRIBSEP;
4329 	  }
4330 	}
4331 	break;
4332       case mbw_lit('u'):
4333 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4334 	if( m_options & (1<<M_OPTION_SHOW_LINKS) ) {
4335 	  if( !mbw_strncasecmp(line, mbw_lit("urn"), 3) ) {
4336 	    if( xml->attribute != UNDEF ) {
4337 	      *q++ = ATTRIBSEP;
4338 	    }
4339 	    xml->attribute = SRC;
4340 	    line += 2;
4341 	    *q++ = ATTRIBSEP;
4342 	  }
4343 	}
4344 	break;
4345       case mbw_lit('v'):
4346 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4347 	if( m_options & (1<<M_OPTION_SHOW_FORMS) ) {
4348 	  if( !mbw_strncasecmp(line, mbw_lit("value"), 5) ) {
4349 	    if( xml->attribute != UNDEF ) {
4350 	      *q++ = ATTRIBSEP;
4351 	    }
4352 	    xml->attribute = SRC;
4353 	    line += 4;
4354 	    *q++ = ATTRIBSEP;
4355 	  }
4356 	}
4357 	break;
4358       default:
4359 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4360 	break;
4361       }
4362       break;
4363     case XTAGPREQ:
4364       if( line[0] == mbw_lit('\'') ) {
4365 	xml->state = XTAGQUOTE;
4366       } else if( line[0] == mbw_lit('"') ) {
4367 	xml->state = XTAGDQUOTE;
4368       } else if( !mbw_isspace(line[0]) ) {
4369 	xml->state = XTAG;
4370 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4371       }
4372       break;
4373     case XTAGQUOTE:
4374       if( line[0] == mbw_lit('\'') ) {
4375 	xml->state = XTAG;
4376 	if( xml->attribute != UNDEF ) {
4377 	  *q++ = ATTRIBSEP;
4378 	}
4379 	xml->attribute = UNDEF;
4380       } else {
4381 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4382 	if( (line[0] == mbw_lit('\\')) && (line[1] == mbw_lit('\'')) ) {
4383 	  line++;
4384 	  mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4385 	}
4386       }
4387       break;
4388     case XTAGDQUOTE:
4389       if( line[0] == mbw_lit('"') ) {
4390 	xml->state = XTAG;
4391 	if( xml->attribute != UNDEF ) {
4392 	  *q++ = ATTRIBSEP;
4393 	}
4394 	xml->attribute = UNDEF;
4395       } else {
4396 	mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4397 	if( (line[0] == mbw_lit('\\')) && (line[1] == mbw_lit('"')) ) {
4398 	  line++;
4399 	  mbw_prefix(xml_attribute_filter)(xml, &line,&q);
4400 	}
4401       }
4402       break;
4403     case CMNT:
4404       if( (line[0] == mbw_lit('-')) && (line[1] == mbw_lit('-')) ) {
4405 	xml->state = TAG;
4406       } else {
4407 	/* ignore comments in some circumstances */
4408 	if( m_options & (1<<M_OPTION_SHOW_HTML_COMMENTS) ) {
4409 	  *q++ = *line;
4410 	} else switch(xml->hide) {
4411 	case SCRIPT:
4412 	  if( m_options & (1<<M_OPTION_SHOW_SCRIPT) ) {
4413 	    *q++ = *line;
4414 	  }
4415 	  break;
4416 	case STYLE:
4417 	  if( m_options & (1<<M_OPTION_SHOW_STYLE) ) {
4418 	    *q++ = *line;
4419 	  }
4420 	  break;
4421 	default:
4422 	  break;
4423 	}
4424       }
4425       break;
4426     case DISABLED:
4427       /* don't modify the line at all */
4428       return;
4429     }
4430     line++;
4431   }
4432   *q = mbw_lit('\0'); /* mark the end of the clean text string */
4433 
4434 }
4435 
4436 
4437 /***********************************************************
4438  * TEXT PARSING FUNCTIONS                                  *
4439  ***********************************************************/
4440 
mbw_prefix(plain_text_filter)4441 bool_t mbw_prefix(plain_text_filter)(MBOX_State *mbox, mbw_t *line) {
4442   mbw_t *q;
4443   bool_t url = 0;
4444   bool_t censor = 0;
4445 
4446 
4447   switch(mbox->plainstate) {
4448   case psPLAIN:
4449     if( (line[0] == mbw_lit('b')) &&
4450 	!mbw_strncmp(line, mbw_lit("begin "),6) &&
4451 	ISOCT(line[6]) && ISOCT(line[7]) && ISOCT(line[8]) ) {
4452       mbox->plainstate = psUUENCODE;
4453       return 1;
4454     }
4455     break;
4456   case psUUENCODE:
4457     switch(mbw_prefix(is_uuline)(line)) {
4458     case -1:
4459       return 1;
4460     case -2:
4461       mbox->plainstate = psPLAIN;
4462       break;
4463     default:
4464       return 0;
4465     }
4466     break;
4467   }
4468 
4469   /* now assume psPLAIN */
4470 
4471   q = line;
4472   while(*line) {
4473     switch(*line) {
4474     case mbw_lit('%'):
4475       if( !censor ) {
4476 	mbw_prefix(decode_uri_character)(&line, &q);
4477       }
4478       break;
4479     case mbw_lit('&'):
4480       if( !censor ) {
4481 	if( !url ) {
4482 	  mbw_prefix(decode_html_entity)(&line, &q);
4483 	} else {
4484 	  censor = 1;
4485 	}
4486       }
4487       break;
4488     case mbw_lit('H'):
4489     case mbw_lit('h'):
4490       if( !mbw_strncasecmp(line, mbw_lit("http://"), 7) ) {
4491 	censor = 0;
4492 	url = 1;
4493       }
4494       if( !censor ) { *q++ = *line; }
4495       break;
4496     case mbw_lit('?'):
4497     case mbw_lit(';'):
4498     case mbw_lit('#'):
4499       if( url ) {
4500 	censor = 1;
4501       } else {
4502 	*q++ = *line;
4503       }
4504       break;
4505     case mbw_lit(' '):
4506     case mbw_lit('\t'):
4507     case mbw_lit('>'):
4508     case mbw_lit('\''):
4509     case mbw_lit('"'):
4510       if( censor && url ) { censor = 0; url = 0; }
4511       *q++ = *line;
4512       break;
4513     default:
4514       if( !censor ) { *q++ = *line; }
4515       break;
4516     }
4517     line++;
4518   }
4519   *q = mbw_lit('\0');
4520 
4521   return 1;
4522 }
4523 
4524 
4525 /* assume string is binary with embedded NULs replaced by FFs,
4526  the strings that are found are separated by spaces */
4527 /* note: this doesn't work like strings(1) yet, but eventually it will ;-) */
mbw_prefix(strings1_filter)4528 bool_t mbw_prefix(strings1_filter)(mbw_t *line) {
4529   size_t c;
4530   mbw_t *q;
4531 
4532 #define MIN_STRING_SIZE 4
4533   for(q = line, c = 0; *line; line++) {
4534 /*     if( mbw_isalnum(*line) ||  */
4535 /* 	mbw_ispunct(*line) ||  */
4536 /* 	(*line == mbw_lit(' ')) ) { */
4537     if( mbw_isprint(*line) && (*line != mbw_lit('\t')) ) {
4538       *q++ = *line;
4539       c++;
4540     } else if( c >= MIN_STRING_SIZE ) {
4541       *q++ = mbw_lit(' ');
4542       c = 0;
4543     } else if( c > 0 ) {
4544       q -= c;
4545       c = 0;
4546     }
4547   }
4548   *q = mbw_lit('\0');
4549 
4550   return 1;
4551 }
4552 
4553 /***********************************************************
4554  * CALLED OUTSIDE THIS SOURCE FILE                         *
4555  ***********************************************************/
4556 
mbw_prefix(init_decoding_caches)4557 void mbw_prefix(init_decoding_caches)(MBOX_State *mbox) {
4558   mbw_prefix(init_dc)(&(mbox->mbw_prefix(b64_dc)),system_pagesize);
4559   mbw_prefix(init_dc)(&(mbox->mbw_prefix(qp_dc)),system_pagesize);
4560 }
4561 
mbw_prefix(free_decoding_caches)4562 void mbw_prefix(free_decoding_caches)(MBOX_State *mbox) {
4563   if( mbox->mbw_prefix(b64_dc).cache ) {
4564     free(mbox->mbw_prefix(b64_dc).cache);
4565     mbox->mbw_prefix(b64_dc).cache = NULL;
4566   }
4567   if( mbox->mbw_prefix(qp_dc).cache ) {
4568     free(mbox->mbw_prefix(qp_dc).cache);
4569     mbox->mbw_prefix(qp_dc).cache = NULL;
4570   }
4571 }
4572 
4573 #endif
4574 
4575 
4576