1 /*
2 * (c) Copyright 1990, Kim Fabricius Storm. All rights reserved.
3 * Copyright (c) 1996-2005 Michael T Pins. All rights reserved.
4 *
5 * Digest article handling
6 *
7 * The code to do the selective parsing of mail and mmdf formats,
8 * mail from lines and determining folder types is based on patches
9 * contributed by Bernd Wechner (bernd@bhpcpd.kembla.oz.au).
10 */
11
12 #include <stdlib.h>
13 #include <string.h>
14 #include <strings.h>
15 #include <ctype.h>
16 #include "config.h"
17 #include "global.h"
18 #include "debug.h"
19 #include "digest.h"
20 #include "news.h"
21 #include "pack_name.h"
22 #include "nn_term.h"
23
24 /* digest.c */
25
/*
 * Header field recognizer; passed to parse_header() by
 * parse_digest_header() and called directly by skip_digest_body().
 */
static char **dg_hdr_field(register char *lp, int all);

/*
 * Strictness of is_mail_from_line():
 *   0          only the leading "From " is checked
 *   1          at least 8 fields must also be present
 *   otherwise  (default 2) the day name, month name and numeric
 *              fields are validated as well
 */
int strict_from_parse = 2;
29
/*
 * Debug trace helpers.
 *
 * With DG_TEST defined, TEST1 and TEST2 print through printf() while the
 * DG_TEST bit is set in Debug; TEST0 messages are compiled out.
 * Without DG_TEST all three expand to nothing.
 *
 * Each macro takes exactly the arguments used at its call sites (TEST0
 * takes the message only), so the file builds in both configurations.
 * The printing variants are wrapped in do/while so that they behave as
 * a single statement, also in front of an "else".
 */
#ifdef DG_TEST
#define TEST0(fmt)
#define TEST1(fmt, x) \
    do { if (Debug & DG_TEST) printf(fmt, x); } while (0)
#define TEST2(fmt, x, y) \
    do { if (Debug & DG_TEST) printf(fmt, x, y); } while (0)

#else

#define TEST0(fmt)
#define TEST1(fmt, x)
#define TEST2(fmt, x, y)
#endif
41
/* the 040 bit; OR-ing it into an ASCII upper-case letter gives lower case */
#define UNIFY 040

/* the part of "digest" after the first letter; is_digest() tests the 'd' */
static char digest_pattern[] = "igest";

/*
 * Set the UNIFY bit in every character of digest_pattern so that
 * is_digest() can compare it against UNIFY-folded subject characters.
 */
void
init_digest_parsing(void)
{
    register char *cursor = digest_pattern;

    while (*cursor != '\0') {
        *cursor = *cursor | UNIFY;
        cursor++;
    }
}
54
55 int
is_digest(register char * subject)56 is_digest(register char *subject)
57 {
58 register char c, *q, *m;
59
60 if (subject == NULL)
61 return 0;
62
63 while ((c = *subject++)) {
64 if ((c | UNIFY) != ('d' | UNIFY))
65 continue;
66
67 q = subject;
68 m = digest_pattern;
69 while ((c = *m++) && (*q++ | UNIFY) == c);
70 if (c == NUL)
71 return 1;
72 }
73 return 0;
74 }
75
76
77 /*
78 * is_mail_from_line - Is this a legal unix mail "From " line?
79 *
80 * Given a line of input will check to see if it matches the standard
81 * unix mail "from " header format. Returns 0 if it does and <0 if not.
82 *
83 * The check may be very lax or very strict depending upon
84 * the value of "strict-mail-from-parse":
85 *
86 * 0 - Lax, checks only for the string "From ".
87 * 1 - Strict, checks that the correct number of fields are present.
88 * 2 - Very strict, also checks that each field contains a legal value.
89 *
90 * Assumptions: Not having the definitive unix mailbox reference I have
91 * assumed that unix mailbox headers follow this format:
92 *
93 * From <person> <date> <garbage>
94 *
95 * Where <person> is the address of the sender, being an ordinary
96 * string with no white space imbedded in it, and <date> is the date of
97 * posting, in ctime(3C) format.
98 *
99 * This would, on the face of it, seem valid. I (Bernd) have yet to find a
100 * unix mailbox header which doesn't follow this format.
101 *
102 * From: Bernd Wechner (bernd@bhpcpd.kembla.oz.au)
103 * Obfuscated by: KFS (as usual)
104 */
105
106 #define MAX_FIELDS 10
107
108 static char legal_day[] = "SunMonTueWedThuFriSat";
109 static char legal_month[] = "JanFebMarAprMayJunJulAugSepOctNovDec";
110 static int legal_numbers[] = {1, 31, 0, 23, 0, 59, 0, 60, 1969, 2199};
111
112 static int
is_mail_from_line(char * line,char * namebuf)113 is_mail_from_line(char *line, char *namebuf)
114 /* line Line of text to be checked */
115 /* namebuf Optional buffer to place packed sender info */
116 {
117 char *fields[MAX_FIELDS];
118 char *sender_tail = NULL;
119 register char *lp, **fp;
120 register int n, i;
121
122 if (strncmp(line, "From ", 5))
123 return -100;
124 if (strict_from_parse == 0)
125 return 0;
126
127 lp = line + 5;
128 /* sender day mon dd hh:mm:ss year */
129 for (n = 0, fp = fields; n < MAX_FIELDS; n++) {
130 while (*lp && *lp != NL && isascii(*lp) && isspace(*lp))
131 lp++;
132 if (*lp == NUL || *lp == NL)
133 break;
134 *fp++ = lp;
135 while (*lp && isascii(*lp) && !isspace(*lp))
136 if (*lp++ == ':' && (n == 4 || n == 5))
137 break;
138 if (n == 0)
139 sender_tail = lp;
140 }
141
142 if (n < 8)
143 return -200 - n;
144
145 fp = fields;
146
147 if (n > 8 && !isdigit(fp[7][0]))
148 fp[7] = fp[8]; /* ... TZ year */
149 if (n > 9 && !isdigit(fp[7][0]))
150 fp[7] = fp[9]; /* ... TZ DST year */
151
152 if (namebuf != NULL) {
153 char x = *sender_tail;
154 *sender_tail = NUL;
155 pack_name(namebuf, *fp, Name_Length);
156 *sender_tail = x;
157 }
158 if (strict_from_parse == 1)
159 return 0;
160
161 fp++;
162 for (i = 0; i < 21; i += 3)
163 if (strncmp(*fp, &legal_day[i], 3) == 0)
164 break;
165 if (i == 21)
166 return -1;
167
168 fp++;
169 for (i = 0; i < 36; i += 3)
170 if (strncmp(*fp, &legal_month[i], 3) == 0)
171 break;
172 if (i == 36)
173 return -2;
174
175 for (i = 0; i < 10; i += 2) {
176 lp = *++fp;
177 if (!isdigit(*lp))
178 return -20 - i;
179 n = atoi(lp);
180 if (n < legal_numbers[i] || legal_numbers[i + 1] < n)
181 return -10 - i;
182 }
183 return 0;
184 }
185
186 /*
187 * expect that f is positioned at header of an article
188 */
189
/* folder kind flags; set by get_folder_type() and read by the parsers below */
static int is_mmdf_folder = 0;
static int is_mail_folder = 0;

/*
 * get_folder_type
 *
 * Decide what kind of folder the stream f holds by reading its first
 * line.  Must be called at zero offset; one line is consumed, so the
 * caller must reposition if necessary.
 * Side-effects: sets is_mail_folder, is_mmdf_folder, and
 * current_folder_type (to the value returned).
 * Return values:
 *   -1: folder is empty (no line could be read),
 *    0: normal digest,
 *    1: UNIX mail format (first line is a legal "From " line),
 *    2: MMDF format (first line is ^A^A^A^A)
 */

int current_folder_type;

int
get_folder_type(FILE * f)
{
    char first_line[1024];
    int type;

    is_mail_folder = 0;
    is_mmdf_folder = 0;

    if (fgets(first_line, sizeof first_line, f) == NULL) {
        type = -1;
    } else if (strncmp(first_line, "\001\001\001\001\n", 5) == 0) {
        is_mmdf_folder = 1;
        type = 2;
    } else if (is_mail_from_line(first_line, (char *) NULL) == 0) {
        is_mail_folder = 1;
        type = 1;
    } else {
        type = 0;
    }

    current_folder_type = type;
    return type;
}
229
230 int
get_digest_article(FILE * f,news_header_buffer hdrbuf)231 get_digest_article(FILE * f, news_header_buffer hdrbuf)
232 {
233 int cont;
234
235 digest.dg_hpos = ftell(f);
236 TEST1("GET DIGEST hp=%ld\n", (long) digest.dg_hpos);
237
238 do {
239 if (!parse_digest_header(f, 0, hdrbuf))
240 return -1;
241 digest.dg_fpos = ftell(f);
242 TEST2("END HEADER hp=%ld fp=%ld\n", (long) digest.dg_hpos, (long) digest.dg_fpos);
243 } while ((cont = skip_digest_body(f)) < 0);
244
245 TEST2("END BODY lp=%ld next=%ld\n", (long) digest.dg_lpos, ftell(f));
246
247 return cont;
248 }
249
250 #define BACKUP_LINES 50 /* remember class + offset for parsed lines */
251
252 #define LN_BLANK 0x01 /* blank line */
253 #define LN_DASHED 0x02 /* dash line */
254 #define LN_HEADER 0x04 /* (possible) header line */
255 #define LN_ASTERISK 0x08 /* asterisk line (near end) */
256 #define LN_END_OF 0x10 /* End of ... line */
257 #define LN_TEXT 0x20 /* unclassified line */
258
259
260 /*
261 * skip until 'Subject: ' (or End of digest) line is found
262 * then backup till start of header
263 */
264
265 /*
266 * Tuning parameters:
267 *
268 * MIN_HEADER_LINES: number of known header lines that must
269 * be found in a block to identify a new
270 * header
271 *
272 * MAX_BLANKS_DASH max no of blanks on a 'dash line'
273 *
274 * MIN_DASHES min no of dashes on a 'dash line'
275 *
276 * MAX_BLANKS_ASTERISKS max no of blanks on an 'asterisk line'
277 *
278 * MIN_ASTERISKS min no of asterisks on an 'asterisk line'
279 *
280 * MAX_BLANKS_END_OF max no of blanks before "End of "
281 */
282
283 #define MIN_HEADER_LINES 2
284 #define MAX_BLANKS_DASH 3
285 #define MIN_DASHES 16
286 #define MAX_BLANKS_ASTERISK 1
287 #define MIN_ASTERISKS 10
288 #define MAX_BLANKS_END_OF 1
289
290 int
skip_digest_body(register FILE * f)291 skip_digest_body(register FILE * f)
292 {
293 long backup_p[BACKUP_LINES];
294 int line_type[BACKUP_LINES];
295 register int backup_index, backup_count;
296 int more_header_lines, end_or_asterisks, blanks;
297 int colon_lines;
298 char line[1024];
299 register char *cp;
300
301 #define decrease_index() \
302 if (--backup_index < 0) backup_index = BACKUP_LINES - 1
303
304 backup_index = -1;
305 backup_count = 0;
306 end_or_asterisks = 0;
307
308 digest.dg_lines = 0;
309
310
311 next_line:
312 more_header_lines = 0;
313 colon_lines = 0;
314
315 next_possible_header_line:
316 digest.dg_lines++;
317
318 if (++backup_index == BACKUP_LINES)
319 backup_index = 0;
320 if (backup_count < BACKUP_LINES)
321 backup_count++;
322
323 backup_p[backup_index] = ftell(f);
324 line_type[backup_index] = LN_TEXT;
325
326 if (fgets(line, 1024, f) == NULL) {
327 TEST2("end_of_file, bc=%d, lines=%d\n", backup_count, digest.dg_lines);
328
329 if (is_mmdf_folder) {
330 digest.dg_lpos = backup_p[backup_index];
331 is_mmdf_folder = 0;
332 return 0;
333 }
334 /* end of file => look for "****" or "End of" line */
335
336 if (end_or_asterisks)
337 while (--backup_count >= 0) {
338 --digest.dg_lines;
339 decrease_index();
340 if (line_type[backup_index] & (LN_ASTERISK | LN_END_OF))
341 break;
342 }
343
344 digest.dg_lpos = backup_p[backup_index];
345
346 if (digest.dg_lines == 0)
347 return 0;
348
349 while (--backup_count >= 0) {
350 --digest.dg_lines;
351 digest.dg_lpos = backup_p[backup_index];
352 decrease_index();
353 if ((line_type[backup_index] &
354 (LN_ASTERISK | LN_END_OF | LN_BLANK | LN_DASHED)) == 0)
355 break;
356 }
357
358 return 0; /* no article follows */
359 }
360 TEST1("\n>>%-.50s ==>>", line);
361
362 if (is_mmdf_folder) {
363 /* in an mmdf folder we simply look for the next ^A^A^A^A line */
364 if (line[0] != '\001' || strcmp(line, "\001\001\001\001\n"))
365 goto next_line;
366
367 digest.dg_lpos = backup_p[backup_index];
368 --digest.dg_lines;
369 return (digest.dg_lines <= 0) ? -1 : 1;
370 }
371 for (cp = line; *cp && isascii(*cp) && isspace(*cp); cp++);
372
373 if (*cp == NUL) {
374 TEST0("BLANK");
375 line_type[backup_index] = LN_BLANK;
376 goto next_line;
377 }
378 if (is_mail_folder) {
379 /* in a mail folder we simply look for the next "From " line */
380 if (line[0] != 'F' || is_mail_from_line(line, (char *) NULL) < 0)
381 goto next_line;
382
383 line_type[backup_index] = LN_HEADER;
384 fseek(f, backup_p[backup_index], 0);
385 goto found_mail_header;
386 }
387 blanks = cp - line;
388
389 if (*cp == '-') {
390 if (blanks > MAX_BLANKS_DASH)
391 goto next_line;
392
393 while (*cp == '-')
394 cp++;
395 if (cp - line - blanks > MIN_DASHES) {
396 while (*cp && (*cp == '-' || (isascii(*cp) && isspace(*cp))))
397 cp++;
398 if (*cp == NUL) {
399 TEST0("DASHED");
400
401 line_type[backup_index] = LN_DASHED;
402 }
403 }
404 goto next_line;
405 }
406 if (*cp == '*') {
407 if (blanks > MAX_BLANKS_ASTERISK)
408 goto next_line;
409
410 while (*cp == '*')
411 cp++;
412 if (cp - line - blanks > MIN_ASTERISKS) {
413 while (*cp && (*cp == '*' || (isascii(*cp) && isspace(*cp))))
414 cp++;
415 if (*cp == NUL) {
416 TEST0("ASTERISK");
417 line_type[backup_index] = LN_ASTERISK;
418 end_or_asterisks++;
419 }
420 }
421 goto next_line;
422 }
423 if (blanks <= MAX_BLANKS_END_OF &&
424 *cp == 'E' && strncmp(cp, "End of ", 7) == 0) {
425 TEST0("END_OF_");
426 line_type[backup_index] = LN_END_OF;
427 end_or_asterisks++;
428 goto next_line;
429 }
430 if (blanks)
431 goto next_possible_header_line;
432 /* must be able to handle continued lines in sub-digest headers...
433 goto next_line;
434 */
435
436 if (!dg_hdr_field(line, 0)) {
437 char *colon;
438 if ((colon = strchr(line, ':'))) {
439 for (cp = line; cp < colon; cp++)
440 if (!isascii(*cp) || isspace(*cp))
441 break;
442 if (cp == colon) {
443 TEST0("COLON");
444 colon_lines++;
445 line_type[backup_index] = LN_HEADER;
446 goto next_possible_header_line;
447 }
448 }
449 if (is_mail_from_line(line, (char *) NULL) == 0) {
450 TEST0("FROM_");
451 colon_lines++;
452 line_type[backup_index] = LN_HEADER;
453 }
454 goto next_possible_header_line;
455 }
456 TEST0("HEADER");
457
458 line_type[backup_index] = LN_HEADER;
459 if (++more_header_lines < MIN_HEADER_LINES)
460 goto next_possible_header_line;
461
462 /* found block with MIN_HEADER_LINES */
463
464 TEST0("\nSearch for start of header\n");
465
466 colon_lines += more_header_lines;
467 for (;;) {
468 fseek(f, backup_p[backup_index], 0);
469 if (line_type[backup_index] == LN_HEADER)
470 if (--colon_lines <= 0)
471 break;
472 --digest.dg_lines;
473 if (--backup_count == 0)
474 break;
475 decrease_index();
476 if ((line_type[backup_index] & (LN_HEADER | LN_TEXT)) == 0)
477 break;
478 }
479
480 if (digest.dg_lines == 0) {
481 TEST0("Skipped empty article\n");
482 return -1;
483 }
484 found_mail_header:
485
486 for (;;) {
487 digest.dg_lpos = backup_p[backup_index];
488 if (--backup_count < 0)
489 break;
490 decrease_index();
491 if ((line_type[backup_index] & (LN_BLANK | LN_DASHED)) == 0)
492 break;
493 --digest.dg_lines;
494 }
495
496 return (digest.dg_lines == 0) ? -1 : 1;
497 }
498
499 int
parse_digest_header(FILE * f,int all,news_header_buffer hdrbuf)500 parse_digest_header(FILE * f, int all, news_header_buffer hdrbuf)
501 {
502 digest.dg_date = digest.dg_from = digest.dg_subj = digest.dg_to = NULL;
503
504 parse_header(f, dg_hdr_field, all, hdrbuf);
505
506 return digest.dg_from || digest.dg_subj;
507 }
508
509
510 static char **
dg_hdr_field(register char * lp,int all)511 dg_hdr_field(register char *lp, int all)
512 {
513 static char *dummy;
514 static char namebuf[33];
515
516 #define check(name, lgt, field) \
517 if (isascii(lp[lgt]) && isspace(lp[lgt]) \
518 && strncasecmp(name, lp, lgt) == 0) {\
519 TEST0("MATCH: " #field " "); \
520 return &digest.field; \
521 }
522
523 TEST1("\nPARSE[%.20s] ==>> ", lp);
524
525 switch (*lp++) {
526
527 case '\001':
528 /* In an mmdf folder ^A^A^A^A is skipped at beginning of header */
529 if (!is_mmdf_folder)
530 break;
531 if (strncmp(lp, "\001\001\001\n", 4))
532 break;
533 digest.dg_hpos += 5;
534 return NULL;
535
536 case 'D':
537 case 'd':
538 check("ate:", 4, dg_date);
539 break;
540
541 case 'F':
542 case 'f':
543 check("rom:", 4, dg_from);
544 if (!is_mail_folder)
545 break;
546 if (*--lp != 'F')
547 break;
548 if (is_mail_from_line(lp, namebuf) < 0)
549 break;
550 /* Store packed sender in dg_from here and return dummy to parser */
551 if (digest.dg_from == NULL)
552 digest.dg_from = namebuf;
553 return &dummy;
554
555 case 'R':
556 case 'r':
557 if (!all)
558 break;
559 check("e:", 2, dg_subj);
560 break;
561
562 case 'S':
563 case 's':
564 check("ubject:", 7, dg_subj);
565 check("ubject", 6, dg_subj);
566 break;
567
568 case 'T':
569 case 't':
570 check("itle:", 5, dg_subj);
571 if (!all)
572 break;
573 check("o:", 2, dg_to);
574 break;
575 }
576
577 #undef check
578 TEST0("NOT MATCHED ");
579
580 return NULL;
581 }
582