1 /*
2  *	(c) Copyright 1990, Kim Fabricius Storm.  All rights reserved.
3  *      Copyright (c) 1996-2005 Michael T Pins.  All rights reserved.
4  *
5  *	Collect and save article information in database.
6  */
7 
8 #include <unistd.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include "config.h"
12 #include "global.h"
13 #include "db.h"
14 #include "digest.h"
15 #include "master.h"
16 #include "news.h"
17 #include "nntp.h"
18 #include "pack_date.h"
19 #include "pack_name.h"
20 #include "pack_subject.h"
21 
22 /* collect.c */
23 
/* Forward declarations of the file-local helpers defined below. */
static void     do_auto_archive(group_header * gh, register FILE * f, article_number num);
static void     build_hdr(int type);
static int      collect_article(register group_header * gh, article_number art_num);
static long     collect_group(register group_header * gh);



#define COUNT_RE_REFERENCES	/* no of >>> depends on Reference: line */

/*
 * When non-zero, collect_article() skips and logs articles for which no
 * Newsgroups: line was found (news.ng_groups == NULL), including empty files.
 */
int             ignore_bad_articles = 1;	/* no Newsgroups: line */
/* When non-zero, articles skipped by collect_article() are also unlinked. */
int             remove_bad_articles = 0;
/*
 * When non-zero, collect_article() compares each article's m_time() against
 * this value and skips articles below it (unless the group has M_INCLUDE_OLD).
 */
time_t          max_article_age = 0;

/* trace enables the 'T' log entry and debug_mode the progress output in collect_group(). */
extern int      trace, debug_mode;

#ifdef NNTP
/* Checked by collect_article() when open_news_article() fails. */
extern int      nntp_failed;
#endif

/* Old/bad articles skipped so far; reset at the start of each collect_group(). */
static long     bad_count;

/* Index ('x') and data ('d') files of the group being collected; opened by collect_group(). */
static FILE    *ix, *data;
46 
47 static void
do_auto_archive(group_header * gh,register FILE * f,article_number num)48 do_auto_archive(group_header * gh, register FILE * f, article_number num)
49 {
50     char            line[200];
51     article_number  last;
52     register FILE  *arc;
53     register int    c;
54     long            start;
55     static char    *arc_header = "Archived-Last: ";
56     /* Header format: Archived-Last: 88888888 group.name */
57     /* Fixed constants length == 15 and offset == 24 are used below */
58 
59     arc = open_file(gh->archive_file, OPEN_READ);
60     last = 0;
61     start = 0;
62     if (arc != NULL) {
63 	while (fgets(line, 200, arc) != NULL) {
64 	    if (strncmp(line, arc_header, 15)) {
65 		log_entry('E', "%s not archive for %s\n",
66 			  gh->archive_file, gh->group_name);
67 		gh->master_flag &= ~M_AUTO_ARCHIVE;
68 		fclose(arc);
69 		return;
70 	    }
71 	    if (strncmp(line + 24, gh->group_name, gh->group_name_length)) {
72 		start = ftell(arc);
73 		continue;
74 	    }
75 	    last = atol(line + 15);
76 	    break;
77 	}
78 	fclose(arc);
79     }
80     if (last >= num)
81 	return;
82 
83     arc = open_file(gh->archive_file, last > 0 ? OPEN_UPDATE : OPEN_CREATE);
84     if (arc == NULL) {
85 	log_entry('E', "Cannot create archive file: %s\n", gh->archive_file);
86 	gh->master_flag &= ~M_AUTO_ARCHIVE;
87 	return;
88     }
89     fseek(arc, start, 0);
90     fprintf(arc, "%s%8ld %s\n", arc_header, (long) num, gh->group_name);
91     fseek(arc, 0, 2);
92 
93     fseek(f, 0, 0);
94     while ((c = getc(f)) != EOF)
95 	putc(c, arc);
96     putc(NL, arc);
97     fclose(arc);
98 }
99 
100 static void
build_hdr(int type)101 build_hdr(int type)
102 {
103     register char  *name, *subj;
104     int             re;
105 
106     db_data.dh_type = type;
107 
108     if (type == DH_SUB_DIGEST) {
109 
110 	name = digest.dg_from;
111 	subj = digest.dg_subj;
112 
113 	db_hdr.dh_lines = digest.dg_lines;
114 
115 	db_hdr.dh_hpos = digest.dg_hpos;
116 	db_hdr.dh_fpos = (int16) (digest.dg_fpos - db_hdr.dh_hpos);
117 	db_hdr.dh_lpos = digest.dg_lpos;
118 
119 	db_hdr.dh_date = pack_date(digest.dg_date ? digest.dg_date : news.ng_date);
120     } else {
121 
122 	if (!news.ng_from)
123 	    news.ng_from = news.ng_reply;
124 
125 	name = news.ng_from;
126 	subj = news.ng_subj;
127 
128 	db_hdr.dh_lines = news.ng_lines;
129 
130 	db_hdr.dh_hpos = 0;
131 	db_hdr.dh_fpos = (int16) (news.ng_fpos);
132 	db_hdr.dh_lpos = news.ng_lpos;
133 
134 	db_hdr.dh_date = pack_date(news.ng_date);
135     }
136 
137     if (name) {
138 	db_hdr.dh_sender_length = pack_name(db_data.dh_sender, name, NAME_LENGTH);
139     } else
140 	db_hdr.dh_sender_length = 0;
141 
142     if (type == DH_DIGEST_HEADER) {
143 	db_hdr.dh_subject_length = 1;
144 	db_data.dh_subject[0] = '@';
145     } else
146 	db_hdr.dh_subject_length = 0;
147 
148     db_hdr.dh_subject_length +=
149 	pack_subject(db_data.dh_subject + db_hdr.dh_subject_length, subj, &re,
150 		     DBUF_SIZE);
151 
152 #ifdef COUNT_RE_REFERENCES
153     if (re)
154 	re = 0x80;
155     if (news.ng_ref) {
156 	for (name = news.ng_ref; *name; name++) {
157 	    if ((re & 0x7f) == 0x7f)
158 		break;
159 	    if (*name == '<')
160 		re++;
161 	}
162     }
163 #endif
164 
165     db_hdr.dh_replies = re;
166 
167     if (db_write_art(data) < 0)
168 	write_error();
169 }
170 
171 
172 static int
collect_article(register group_header * gh,article_number art_num)173 collect_article(register group_header * gh, article_number art_num)
174 {
175     FILE           *art_file;
176     news_header_buffer nhbuf, dgbuf;
177     article_header  art_hdr;
178     int             mode, count;
179     cross_post_number *cp_ptr;
180     long            age;
181 
182     count = 0;
183 
184     db_hdr.dh_number = art_num;
185 
186     /* get article header */
187 
188     art_hdr.a_number = art_num;
189     art_hdr.hpos = 0;
190     art_hdr.lpos = (off_t) 0;
191     art_hdr.flag = 0;
192 
193     mode = FILL_NEWS_HEADER | FILL_OFFSETS | SKIP_HEADER;
194     if ((gh->master_flag & (M_CONTROL | M_NEVER_DIGEST | M_ALWAYS_DIGEST)) == 0)
195 	mode |= DIGEST_CHECK;
196 
197 #ifdef NNTP
198     if ((gh->master_flag & M_ALWAYS_DIGEST) == 0)
199 	mode |= LAZY_BODY;
200 #endif
201 
202     if ((art_file = open_news_article(&art_hdr, mode, nhbuf, (char *) NULL)) == NULL) {
203 
204 #ifdef NNTP
205 	if (nntp_failed) {
206 
207 	    /*
208 	     * connection to nntp_server is broken stop collection of
209 	     * articles immediately
210 	     */
211 	    return -1;
212 	}
213 #endif
214 
215 	/*
216 	 * it is not really necessary to save anything in the data file we
217 	 * simply use the index file to get the *first* available article
218 	 */
219 	return 0;
220     }
221     if (art_file == (FILE *) 1) {	/* empty file */
222 	if (!ignore_bad_articles)
223 	    return 0;
224 	news.ng_groups = NULL;
225 	art_file = NULL;
226     } else if (max_article_age &&	/* == 0 if use_nntp */
227 	       (gh->master_flag & M_INCLUDE_OLD) == 0 &&
228 	       (age = m_time(art_file)) < max_article_age) {
229 
230 	if (remove_bad_articles)
231 	    unlink(group_path_name);
232 
233 	log_entry('O', "%sold article (%ld days): %s/%ld",
234 		  remove_bad_articles ? "removed " : "",
235 		  (cur_time() - age) / (24 * 60 * 60),
236 		  current_group->group_name, (long) art_num);
237 	bad_count++;
238 	fclose(art_file);
239 	return 0;
240     }
241     if (ignore_bad_articles && news.ng_groups == NULL) {
242 	char           *rem = "";
243 
244 	if (!use_nntp && remove_bad_articles) {
245 	    unlink(group_path_name);
246 	    rem = "removed ";
247 	}
248 	log_entry('B', "%sbad article: %s/%ld", rem,
249 		  current_group->group_name, (long) art_num);
250 	if (art_file != NULL)
251 	    fclose(art_file);
252 	bad_count++;
253 	return 0;
254     }
255     /* map cross-postings into a list of group numbers */
256 
257     db_hdr.dh_cross_postings = 0;
258 
259     if (gh->master_flag & M_CONTROL) {
260 	/* we cannot trust the Newsgroups: line in the control group */
261 	/* so we simply ignore it (i.e. use "Newsgroups: control") */
262 	goto dont_digest;
263     }
264     if (news.ng_groups) {
265 	char           *curg, *nextg;
266 	group_header   *gh1;
267 
268 	for (nextg = news.ng_groups, cp_ptr = db_data.dh_cross; *nextg;) {
269 	    curg = nextg;
270 
271 	    if ((nextg = strchr(curg, ',')))
272 		*nextg++ = NUL;
273 	    else
274 		nextg = "";
275 
276 	    if (strcmp(gh->group_name, curg) == 0)
277 		gh1 = gh;
278 	    else if ((gh1 = lookup(curg)) == NULL)
279 		continue;
280 
281 	    *cp_ptr++ = NETW_CROSS_EXT(gh1->group_num);
282 	    if (++db_hdr.dh_cross_postings == DBUF_SIZE)
283 		break;
284 	}
285     }
286     if (db_hdr.dh_cross_postings == 1)
287 	db_hdr.dh_cross_postings = 0;	/* only current group */
288 
289     if (gh->master_flag & M_NEVER_DIGEST)
290 	goto dont_digest;
291 
292     /* split digest */
293 
294     if ((gh->master_flag & M_ALWAYS_DIGEST) || (news.ng_flag & N_DIGEST)) {
295 	int             any = 0, cont = 1;
296 
297 	skip_digest_body(art_file);
298 
299 	while (cont && (cont = get_digest_article(art_file, dgbuf)) >= 0) {
300 
301 	    if (any == 0) {
302 		build_hdr(DH_DIGEST_HEADER);	/* write DIGEST_HEADER */
303 		count++;
304 		db_hdr.dh_cross_postings = 0;	/* no cross post in sub */
305 		any++;
306 	    }
307 	    build_hdr(DH_SUB_DIGEST);	/* write SUB_DIGEST */
308 	    count++;
309 	}
310 
311 	if (any)
312 	    goto finish;
313     }
314     /* not a digest */
315 
316 dont_digest:
317 
318     build_hdr(DH_NORMAL);	/* normal article */
319     count++;
320 
321 finish:
322 
323     if (gh->master_flag & M_AUTO_ARCHIVE) {
324 
325 #ifdef NNTP
326 	FILE           *f;
327 	f = nntp_get_article(art_num, 0);
328 	do_auto_archive(gh, f, art_num);
329 	fclose(f);
330 #else
331 	do_auto_archive(gh, art_file, art_num);
332 #endif				/* NNTP */
333     }
334     fclose(art_file);
335 
336     return count;
337 }
338 
339 
340 /*
341  *	Collect unread articles in current group
342  *
343  *	On entry, init_group has been called to setup the proper environment
344  */
345 
346 static long
collect_group(register group_header * gh)347 collect_group(register group_header * gh)
348 {
349     long            article_count, temp, obad;
350     article_number  start_collect;
351 
352     if (gh->last_db_article == 0) {
353 	gh->first_db_article = gh->first_a_article;
354 	gh->last_db_article = gh->first_db_article - 1;
355     }
356     if (gh->last_db_article >= gh->last_a_article)
357 	return 0;
358 
359     if (gh->index_write_offset) {
360 	ix = open_data_file(gh, 'x', OPEN_UPDATE | MUST_EXIST);
361 	fseek(ix, gh->index_write_offset, 0);
362     } else
363 	ix = open_data_file(gh, 'x', OPEN_CREATE | MUST_EXIST);
364 
365     if (gh->data_write_offset) {
366 	data = open_data_file(gh, 'd', OPEN_UPDATE | MUST_EXIST);
367 	fseek(data, gh->data_write_offset, 0);
368     } else
369 	data = open_data_file(gh, 'd', OPEN_CREATE | MUST_EXIST);
370 
371     article_count = 0;
372     start_collect = gh->last_db_article + 1;
373 
374     if (debug_mode) {
375 	printf("\t\t%s (%ld..%ld)\r",
376 	       gh->group_name, start_collect, gh->last_a_article);
377 	fl;
378     }
379     bad_count = obad = 0;
380 
381     while (gh->last_db_article < gh->last_a_article) {
382 	if (s_hangup)
383 	    break;
384 	gh->last_db_article++;
385 	if (debug_mode) {
386 	    printf("\r%ld", gh->last_db_article);
387 	    if (obad != bad_count)
388 		printf("\t%ld", bad_count);
389 	    obad = bad_count;
390 	    fl;
391 	}
392 	gh->data_write_offset = ftell(data);
393 
394 #ifdef NNTP
395 	gh->index_write_offset = ftell(ix);
396 #endif
397 
398 	temp = collect_article(gh, gh->last_db_article);
399 
400 #ifdef NNTP
401 	if (temp < 0) {
402 	    /* connection failed, current article is not collected */
403 	    gh->last_db_article--;
404 	    article_count = -1;
405 	    goto out;
406 	}
407 #endif
408 
409 #ifndef RENUMBER_DANGER
410 	if (temp == 0 && gh->data_write_offset == 0) {
411 	    gh->first_db_article = gh->last_db_article + 1;
412 	    continue;
413 	}
414 #endif
415 
416 	if (!db_write_offset(ix, &(gh->data_write_offset)))
417 	    write_error();
418 	article_count += temp;
419     }
420 
421     if (start_collect < gh->first_db_article)
422 	start_collect = gh->first_db_article;
423 
424     if (trace && start_collect <= gh->last_db_article)
425 	log_entry('T', "Col %s (%d to %d) %d",
426 		  gh->group_name,
427 		  start_collect, gh->last_db_article,
428 		  article_count);
429 
430     if (debug_mode)
431 	printf("\nCol %s (%ld to %ld) %ld",
432 	       gh->group_name,
433 	       start_collect, gh->last_db_article,
434 	       article_count);
435 
436     gh->data_write_offset = ftell(data);
437     gh->index_write_offset = ftell(ix);
438 
439 #ifdef NNTP
440 out:
441 #endif
442 
443     fclose(data);
444     fclose(ix);
445 
446     if (debug_mode)
447 	putchar(NL);
448 
449     return article_count;
450 }
451 
452 int
do_collect(void)453 do_collect(void)
454 {
455     register group_header *gh;
456     long            col_article_count, temp;
457     int             col_group_count;
458     time_t          start_time;
459 
460     start_time = cur_time();
461     col_article_count = col_group_count = 0;
462     current_group = NULL;	/* for init_group */
463     temp = 0;
464 
465     Loop_Groups_Header(gh) {
466 	if (s_hangup) {
467 	    temp = -1;
468 	    break;
469 	}
470 	if (gh->master_flag & M_IGNORE_GROUP)
471 	    continue;
472 
473 	if (gh->master_flag & M_MUST_CLEAN)
474 	    clean_group(gh);
475 
476 	if (gh->last_db_article == gh->last_a_article) {
477 	    if (gh->master_flag & M_BLOCKED)
478 		goto unblock_group;
479 	    continue;
480 	}
481 	if (!init_group(gh)) {
482 	    if ((gh->master_flag & M_NO_DIRECTORY) == 0) {
483 		log_entry('R', "%s: no directory", gh->group_name);
484 		gh->master_flag |= M_NO_DIRECTORY;
485 	    }
486 	    gh->last_db_article = gh->last_a_article;
487 	    gh->first_db_article = gh->last_a_article;	/* OBS: not first */
488 	    gh->master_flag &= ~(M_EXPIRE | M_BLOCKED);
489 	    db_write_group(gh);
490 	    continue;
491 	}
492 	if (gh->master_flag & M_NO_DIRECTORY) {
493 	    /* The directory has been created now */
494 	    gh->master_flag &= ~M_NO_DIRECTORY;
495 	    clean_group(gh);
496 	}
497 	temp = collect_group(gh);
498 
499 #ifdef NNTP
500 	if (temp < 0) {
501 	    /* connection broken */
502 	    gh->master_flag &= ~M_EXPIRE;	/* remains blocked */
503 	    db_write_group(gh);
504 	    break;
505 	}
506 #endif
507 
508 	if (temp > 0) {
509 	    col_article_count += temp;
510 	    col_group_count++;
511 	}
512 unblock_group:
513 	gh->master_flag &= ~(M_EXPIRE | M_BLOCKED);
514 	db_write_group(gh);
515     }
516 
517     if (col_article_count > 0)
518 	log_entry('C', "Collect: %ld art, %d gr, %ld s",
519 		  col_article_count, col_group_count,
520 		  cur_time() - start_time);
521 
522     return temp > 0;		/* return true IF we got articles */
523 }
524