1 /*
2  *	(c) Copyright 1990, Kim Fabricius Storm.  All rights reserved.
3  *      Copyright (c) 1996-2005 Michael T Pins.  All rights reserved.
4  *
5  *	Digest article handling
6  *
7  *	The code to do the selective parsing of mail and mmdf formats,
8  *	mail from lines and determining folder types is based on patches
9  *	contributed by Bernd Wechner (bernd@bhpcpd.kembla.oz.au).
10  */
11 
12 #include <stdlib.h>
13 #include <string.h>
14 #include <strings.h>
15 #include <ctype.h>
16 #include "config.h"
17 #include "global.h"
18 #include "debug.h"
19 #include "digest.h"
20 #include "news.h"
21 #include "pack_name.h"
22 #include "nn_term.h"
23 
24 /* digest.c */
25 
/* Forward declaration; the definition is at the end of this file. */
static char   **dg_hdr_field(register char *lp, int all);

/*
 * Strictness of the "From " line check done by is_mail_from_line:
 * 0 = lax, 1 = check the number of fields, 2 = also check field values.
 */
int             strict_from_parse = 2;

/*
 * Trace macros for the digest parser.
 *
 * TESTn takes a printf format plus n arguments.  When DG_TEST is
 * defined, TEST1 and TEST2 print if the DG_TEST bit is set in Debug;
 * TEST0 expands to nothing in both configurations.
 *
 * Every variant takes exactly the number of arguments its call sites
 * pass, and the printing variants are wrapped in do { } while (0) so
 * that they behave as a single statement next to if/else.
 */
#ifdef DG_TEST
#define TEST0(fmt)
#define TEST1(fmt, x)    do { if (Debug & DG_TEST) printf(fmt, x); } while (0)
#define TEST2(fmt, x, y) do { if (Debug & DG_TEST) printf(fmt, x, y); } while (0)

#else

#define TEST0(fmt)
#define TEST1(fmt, x)
#define TEST2(fmt, x, y)
#endif
41 
#define UNIFY 040

static char     digest_pattern[] = "igest";

/*
 * init_digest_parsing - prepare digest_pattern for use by is_digest
 *
 * ORs the UNIFY bit into every character of digest_pattern, so the
 * pattern is stored in the same folded form that is_digest produces
 * for the subject characters it compares against.
 */
void
init_digest_parsing(void)
{
    register int    pos;

    for (pos = 0; digest_pattern[pos] != '\0'; pos++)
	digest_pattern[pos] = digest_pattern[pos] | UNIFY;
}
54 
55 int
is_digest(register char * subject)56 is_digest(register char *subject)
57 {
58     register char   c, *q, *m;
59 
60     if (subject == NULL)
61 	return 0;
62 
63     while ((c = *subject++)) {
64 	if ((c | UNIFY) != ('d' | UNIFY))
65 	    continue;
66 
67 	q = subject;
68 	m = digest_pattern;
69 	while ((c = *m++) && (*q++ | UNIFY) == c);
70 	if (c == NUL)
71 	    return 1;
72     }
73     return 0;
74 }
75 
76 
77 /*
78  * is_mail_from_line - Is this a legal unix mail "From " line?
79  *
80  * Given a line of input will check to see if it matches the standard
81  * unix mail "from " header format. Returns 0 if it does and <0 if not.
82  *
83  * The check may be very lax or very strict depending upon
84  * the value of "strict-mail-from-parse":
85  *
86  * 0 - Lax, checks only for the string "From ".
87  * 1 - Strict, checks that the correct number of fields are present.
88  * 2 - Very strict, also checks that each field contains a legal value.
89  *
90  * Assumptions: Not having the definitive unix mailbox reference I have
91  * assumed that unix mailbox headers follow this format:
92  *
93  * From <person> <date> <garbage>
94  *
95  * Where <person> is the address of the sender, being an ordinary
96  * string with no white space imbedded in it, and <date> is the date of
97  * posting, in ctime(3C) format.
98  *
99  * This would, on the face of it, seem valid. I (Bernd) have yet to find a
100  * unix mailbox header which doesn't follow this format.
101  *
102  * From: Bernd Wechner (bernd@bhpcpd.kembla.oz.au)
103  * Obfuscated by: KFS (as usual)
104  */
105 
106 #define MAX_FIELDS 10
107 
108 static char     legal_day[] = "SunMonTueWedThuFriSat";
109 static char     legal_month[] = "JanFebMarAprMayJunJulAugSepOctNovDec";
110 static int      legal_numbers[] = {1, 31, 0, 23, 0, 59, 0, 60, 1969, 2199};
111 
112 static int
is_mail_from_line(char * line,char * namebuf)113 is_mail_from_line(char *line, char *namebuf)
114  /* line		Line of text to be checked */
115  /* namebuf	Optional buffer to place packed sender info */
116 {
117     char           *fields[MAX_FIELDS];
118     char           *sender_tail = NULL;
119     register char  *lp, **fp;
120     register int    n, i;
121 
122     if (strncmp(line, "From ", 5))
123 	return -100;
124     if (strict_from_parse == 0)
125 	return 0;
126 
127     lp = line + 5;
128     /* sender day mon dd hh:mm:ss year */
129     for (n = 0, fp = fields; n < MAX_FIELDS; n++) {
130 	while (*lp && *lp != NL && isascii(*lp) && isspace(*lp))
131 	    lp++;
132 	if (*lp == NUL || *lp == NL)
133 	    break;
134 	*fp++ = lp;
135 	while (*lp && isascii(*lp) && !isspace(*lp))
136 	    if (*lp++ == ':' && (n == 4 || n == 5))
137 		break;
138 	if (n == 0)
139 	    sender_tail = lp;
140     }
141 
142     if (n < 8)
143 	return -200 - n;
144 
145     fp = fields;
146 
147     if (n > 8 && !isdigit(fp[7][0]))
148 	fp[7] = fp[8];		/* ... TZ year */
149     if (n > 9 && !isdigit(fp[7][0]))
150 	fp[7] = fp[9];		/* ... TZ DST year */
151 
152     if (namebuf != NULL) {
153 	char            x = *sender_tail;
154 	*sender_tail = NUL;
155 	pack_name(namebuf, *fp, Name_Length);
156 	*sender_tail = x;
157     }
158     if (strict_from_parse == 1)
159 	return 0;
160 
161     fp++;
162     for (i = 0; i < 21; i += 3)
163 	if (strncmp(*fp, &legal_day[i], 3) == 0)
164 	    break;
165     if (i == 21)
166 	return -1;
167 
168     fp++;
169     for (i = 0; i < 36; i += 3)
170 	if (strncmp(*fp, &legal_month[i], 3) == 0)
171 	    break;
172     if (i == 36)
173 	return -2;
174 
175     for (i = 0; i < 10; i += 2) {
176 	lp = *++fp;
177 	if (!isdigit(*lp))
178 	    return -20 - i;
179 	n = atoi(lp);
180 	if (n < legal_numbers[i] || legal_numbers[i + 1] < n)
181 	    return -10 - i;
182     }
183     return 0;
184 }
185 
186 /*
187  * expect that f is positioned at header of an article
188  */
189 
static int      is_mmdf_folder = 0;
static int      is_mail_folder = 0;

/*
 * get_folder_type - classify the folder open on stream f
 *
 * Reads one line from f and decides the folder format from it.  The
 * caller is expected to have f positioned at offset zero; the stream
 * is left just past that first line, so the caller must reposition it
 * if necessary.
 *
 * Side effects: resets and then sets is_mail_folder / is_mmdf_folder,
 * and stores the result in current_folder_type.
 *
 * Result:
 *	-1: nothing could be read (folder is empty),
 *	 0: normal digest,
 *	 1: UNIX mail format (first line passes is_mail_from_line),
 *	 2: MMDF format (first line is four ^A characters)
 */

int             current_folder_type;

int
get_folder_type(FILE * f)
{
    char            first[1024];
    int             kind;

    is_mail_folder = 0;
    is_mmdf_folder = 0;

    if (fgets(first, 1024, f) == NULL) {
	kind = -1;
    } else if (strncmp(first, "\001\001\001\001\n", 5) == 0) {
	is_mmdf_folder = 1;
	kind = 2;
    } else if (is_mail_from_line(first, (char *) NULL) == 0) {
	is_mail_folder = 1;
	kind = 1;
    } else {
	kind = 0;
    }

    current_folder_type = kind;
    return kind;
}
229 
230 int
get_digest_article(FILE * f,news_header_buffer hdrbuf)231 get_digest_article(FILE * f, news_header_buffer hdrbuf)
232 {
233     int             cont;
234 
235     digest.dg_hpos = ftell(f);
236     TEST1("GET DIGEST hp=%ld\n", (long) digest.dg_hpos);
237 
238     do {
239 	if (!parse_digest_header(f, 0, hdrbuf))
240 	    return -1;
241 	digest.dg_fpos = ftell(f);
242 	TEST2("END HEADER hp=%ld fp=%ld\n", (long) digest.dg_hpos, (long) digest.dg_fpos);
243     } while ((cont = skip_digest_body(f)) < 0);
244 
245     TEST2("END BODY lp=%ld next=%ld\n", (long) digest.dg_lpos, ftell(f));
246 
247     return cont;
248 }
249 
250 #define BACKUP_LINES	 50	/* remember class + offset for parsed lines */
251 
252 #define	LN_BLANK	0x01	/* blank line */
253 #define	LN_DASHED	0x02	/* dash line */
254 #define	LN_HEADER	0x04	/* (possible) header line */
255 #define	LN_ASTERISK	0x08	/* asterisk line (near end) */
256 #define	LN_END_OF	0x10	/* End of ... line */
257 #define	LN_TEXT		0x20	/* unclassified line */
258 
259 
260 /*
261  * skip until 'Subject: ' (or End of digest) line is found
262  * then backup till start of header
263  */
264 
265 /*
266  * Tuning parameters:
267  *
268  *	MIN_HEADER_LINES:	number of known header lines that must
269  *				be found in a block to identify a new
270  *				header
271  *
272  *	MAX_BLANKS_DASH		max no of blanks on a 'dash line'
273  *
274  *	MIN_DASHES		min no of dashes on a 'dash line'
275  *
276  *	MAX_BLANKS_ASTERISKS	max no of blanks on an 'asterisk line'
277  *
278  *	MIN_ASTERISKS		min no of asterisks on an 'asterisk line'
279  *
280  *	MAX_BLANKS_END_OF	max no of blanks before "End of "
281  */
282 
283 #define	MIN_HEADER_LINES	2
284 #define	MAX_BLANKS_DASH		3
285 #define	MIN_DASHES		16
286 #define	MAX_BLANKS_ASTERISK	1
287 #define	MIN_ASTERISKS		10
288 #define	MAX_BLANKS_END_OF	1
289 
290 int
skip_digest_body(register FILE * f)291 skip_digest_body(register FILE * f)
292 {
293     long            backup_p[BACKUP_LINES];
294     int             line_type[BACKUP_LINES];
295     register int    backup_index, backup_count;
296     int             more_header_lines, end_or_asterisks, blanks;
297     int             colon_lines;
298     char            line[1024];
299     register char  *cp;
300 
301 #define	decrease_index()	\
302     if (--backup_index < 0) backup_index = BACKUP_LINES - 1
303 
304     backup_index = -1;
305     backup_count = 0;
306     end_or_asterisks = 0;
307 
308     digest.dg_lines = 0;
309 
310 
311 next_line:
312     more_header_lines = 0;
313     colon_lines = 0;
314 
315 next_possible_header_line:
316     digest.dg_lines++;
317 
318     if (++backup_index == BACKUP_LINES)
319 	backup_index = 0;
320     if (backup_count < BACKUP_LINES)
321 	backup_count++;
322 
323     backup_p[backup_index] = ftell(f);
324     line_type[backup_index] = LN_TEXT;
325 
326     if (fgets(line, 1024, f) == NULL) {
327 	TEST2("end_of_file, bc=%d, lines=%d\n", backup_count, digest.dg_lines);
328 
329 	if (is_mmdf_folder) {
330 	    digest.dg_lpos = backup_p[backup_index];
331 	    is_mmdf_folder = 0;
332 	    return 0;
333 	}
334 	/* end of file => look for "****" or "End of" line */
335 
336 	if (end_or_asterisks)
337 	    while (--backup_count >= 0) {
338 		--digest.dg_lines;
339 		decrease_index();
340 		if (line_type[backup_index] & (LN_ASTERISK | LN_END_OF))
341 		    break;
342 	    }
343 
344 	digest.dg_lpos = backup_p[backup_index];
345 
346 	if (digest.dg_lines == 0)
347 	    return 0;
348 
349 	while (--backup_count >= 0) {
350 	    --digest.dg_lines;
351 	    digest.dg_lpos = backup_p[backup_index];
352 	    decrease_index();
353 	    if ((line_type[backup_index] &
354 		 (LN_ASTERISK | LN_END_OF | LN_BLANK | LN_DASHED)) == 0)
355 		break;
356 	}
357 
358 	return 0;		/* no article follows */
359     }
360     TEST1("\n>>%-.50s ==>>", line);
361 
362     if (is_mmdf_folder) {
363 	/* in an mmdf folder we simply look for the next ^A^A^A^A line */
364 	if (line[0] != '\001' || strcmp(line, "\001\001\001\001\n"))
365 	    goto next_line;
366 
367 	digest.dg_lpos = backup_p[backup_index];
368 	--digest.dg_lines;
369 	return (digest.dg_lines <= 0) ? -1 : 1;
370     }
371     for (cp = line; *cp && isascii(*cp) && isspace(*cp); cp++);
372 
373     if (*cp == NUL) {
374 	TEST0("BLANK");
375 	line_type[backup_index] = LN_BLANK;
376 	goto next_line;
377     }
378     if (is_mail_folder) {
379 	/* in a mail folder we simply look for the next "From " line */
380 	if (line[0] != 'F' || is_mail_from_line(line, (char *) NULL) < 0)
381 	    goto next_line;
382 
383 	line_type[backup_index] = LN_HEADER;
384 	fseek(f, backup_p[backup_index], 0);
385 	goto found_mail_header;
386     }
387     blanks = cp - line;
388 
389     if (*cp == '-') {
390 	if (blanks > MAX_BLANKS_DASH)
391 	    goto next_line;
392 
393 	while (*cp == '-')
394 	    cp++;
395 	if (cp - line - blanks > MIN_DASHES) {
396 	    while (*cp && (*cp == '-' || (isascii(*cp) && isspace(*cp))))
397 		cp++;
398 	    if (*cp == NUL) {
399 		TEST0("DASHED");
400 
401 		line_type[backup_index] = LN_DASHED;
402 	    }
403 	}
404 	goto next_line;
405     }
406     if (*cp == '*') {
407 	if (blanks > MAX_BLANKS_ASTERISK)
408 	    goto next_line;
409 
410 	while (*cp == '*')
411 	    cp++;
412 	if (cp - line - blanks > MIN_ASTERISKS) {
413 	    while (*cp && (*cp == '*' || (isascii(*cp) && isspace(*cp))))
414 		cp++;
415 	    if (*cp == NUL) {
416 		TEST0("ASTERISK");
417 		line_type[backup_index] = LN_ASTERISK;
418 		end_or_asterisks++;
419 	    }
420 	}
421 	goto next_line;
422     }
423     if (blanks <= MAX_BLANKS_END_OF &&
424 	*cp == 'E' && strncmp(cp, "End of ", 7) == 0) {
425 	TEST0("END_OF_");
426 	line_type[backup_index] = LN_END_OF;
427 	end_or_asterisks++;
428 	goto next_line;
429     }
430     if (blanks)
431 	goto next_possible_header_line;
432 /* must be able to handle continued lines in sub-digest headers...
433 	goto next_line;
434 */
435 
436     if (!dg_hdr_field(line, 0)) {
437 	char           *colon;
438 	if ((colon = strchr(line, ':'))) {
439 	    for (cp = line; cp < colon; cp++)
440 		if (!isascii(*cp) || isspace(*cp))
441 		    break;
442 	    if (cp == colon) {
443 		TEST0("COLON");
444 		colon_lines++;
445 		line_type[backup_index] = LN_HEADER;
446 		goto next_possible_header_line;
447 	    }
448 	}
449 	if (is_mail_from_line(line, (char *) NULL) == 0) {
450 	    TEST0("FROM_");
451 	    colon_lines++;
452 	    line_type[backup_index] = LN_HEADER;
453 	}
454 	goto next_possible_header_line;
455     }
456     TEST0("HEADER");
457 
458     line_type[backup_index] = LN_HEADER;
459     if (++more_header_lines < MIN_HEADER_LINES)
460 	goto next_possible_header_line;
461 
462     /* found block with MIN_HEADER_LINES */
463 
464     TEST0("\nSearch for start of header\n");
465 
466     colon_lines += more_header_lines;
467     for (;;) {
468 	fseek(f, backup_p[backup_index], 0);
469 	if (line_type[backup_index] == LN_HEADER)
470 	    if (--colon_lines <= 0)
471 		break;
472 	--digest.dg_lines;
473 	if (--backup_count == 0)
474 	    break;
475 	decrease_index();
476 	if ((line_type[backup_index] & (LN_HEADER | LN_TEXT)) == 0)
477 	    break;
478     }
479 
480     if (digest.dg_lines == 0) {
481 	TEST0("Skipped empty article\n");
482 	return -1;
483     }
484 found_mail_header:
485 
486     for (;;) {
487 	digest.dg_lpos = backup_p[backup_index];
488 	if (--backup_count < 0)
489 	    break;
490 	decrease_index();
491 	if ((line_type[backup_index] & (LN_BLANK | LN_DASHED)) == 0)
492 	    break;
493 	--digest.dg_lines;
494     }
495 
496     return (digest.dg_lines == 0) ? -1 : 1;
497 }
498 
499 int
parse_digest_header(FILE * f,int all,news_header_buffer hdrbuf)500 parse_digest_header(FILE * f, int all, news_header_buffer hdrbuf)
501 {
502     digest.dg_date = digest.dg_from = digest.dg_subj = digest.dg_to = NULL;
503 
504     parse_header(f, dg_hdr_field, all, hdrbuf);
505 
506     return digest.dg_from || digest.dg_subj;
507 }
508 
509 
510 static char   **
dg_hdr_field(register char * lp,int all)511 dg_hdr_field(register char *lp, int all)
512 {
513     static char    *dummy;
514     static char     namebuf[33];
515 
516 #define check(name, lgt, field) \
517     if (isascii(lp[lgt]) && isspace(lp[lgt]) \
518 	&& strncasecmp(name, lp, lgt) == 0) {\
519 	TEST0("MATCH: " #field " "); \
520 	return &digest.field; \
521     }
522 
523     TEST1("\nPARSE[%.20s] ==>> ", lp);
524 
525     switch (*lp++) {
526 
527 	case '\001':
528 	    /* In an mmdf folder ^A^A^A^A is skipped at beginning of header */
529 	    if (!is_mmdf_folder)
530 		break;
531 	    if (strncmp(lp, "\001\001\001\n", 4))
532 		break;
533 	    digest.dg_hpos += 5;
534 	    return NULL;
535 
536 	case 'D':
537 	case 'd':
538 	    check("ate:", 4, dg_date);
539 	    break;
540 
541 	case 'F':
542 	case 'f':
543 	    check("rom:", 4, dg_from);
544 	    if (!is_mail_folder)
545 		break;
546 	    if (*--lp != 'F')
547 		break;
548 	    if (is_mail_from_line(lp, namebuf) < 0)
549 		break;
550 	    /* Store packed sender in dg_from here and return dummy to parser */
551 	    if (digest.dg_from == NULL)
552 		digest.dg_from = namebuf;
553 	    return &dummy;
554 
555 	case 'R':
556 	case 'r':
557 	    if (!all)
558 		break;
559 	    check("e:", 2, dg_subj);
560 	    break;
561 
562 	case 'S':
563 	case 's':
564 	    check("ubject:", 7, dg_subj);
565 	    check("ubject", 6, dg_subj);
566 	    break;
567 
568 	case 'T':
569 	case 't':
570 	    check("itle:", 5, dg_subj);
571 	    if (!all)
572 		break;
573 	    check("o:", 2, dg_to);
574 	    break;
575     }
576 
577 #undef check
578     TEST0("NOT MATCHED ");
579 
580     return NULL;
581 }
582