1 /*
2 * (c) Copyright 1990, Kim Fabricius Storm. All rights reserved.
3 * Copyright (c) 1996-2005 Michael T Pins. All rights reserved.
4 *
5 * Collect and save article information in database.
6 */
7
8 #include <unistd.h>
9 #include <stdlib.h>
10 #include <string.h>
11 #include "config.h"
12 #include "global.h"
13 #include "db.h"
14 #include "digest.h"
15 #include "master.h"
16 #include "news.h"
17 #include "nntp.h"
18 #include "pack_date.h"
19 #include "pack_name.h"
20 #include "pack_subject.h"
21
22 /* collect.c */
23
/* Forward declarations of the file-local helpers defined below. */
static void do_auto_archive(group_header * gh, register FILE * f, article_number num);
static void build_hdr(int type);
static int collect_article(register group_header * gh, article_number art_num);
static long collect_group(register group_header * gh);



/*
 * When defined, build_hdr() adds the number of '<' characters found in
 * the References: line to the reply count stored for the article.
 */
#define COUNT_RE_REFERENCES	/* no of >>> depends on Reference: line */

/*
 * Non-zero: collect_article() logs and skips articles for which no
 * Newsgroups: line was found (news.ng_groups == NULL), and empty files.
 */
int ignore_bad_articles = 1;	/* no Newsgroups: line */

/* Non-zero: collect_article() unlinks the file of a skipped old/bad article. */
int remove_bad_articles = 0;

/*
 * Articles whose m_time() value is below this are logged as old and
 * skipped by collect_article(); 0 disables the check.
 */
time_t max_article_age = 0;

extern int trace, debug_mode;

#ifdef NNTP
/* Tested by collect_article() when an article cannot be opened. */
extern int nntp_failed;
#endif

/* Number of old/bad articles skipped; reset by collect_group(). */
static long bad_count;

/*
 * Index ('x') and data ('d') file streams of the group being collected;
 * opened and closed by collect_group(), `data' is written by build_hdr().
 */
static FILE *ix, *data;
46
47 static void
do_auto_archive(group_header * gh,register FILE * f,article_number num)48 do_auto_archive(group_header * gh, register FILE * f, article_number num)
49 {
50 char line[200];
51 article_number last;
52 register FILE *arc;
53 register int c;
54 long start;
55 static char *arc_header = "Archived-Last: ";
56 /* Header format: Archived-Last: 88888888 group.name */
57 /* Fixed constants length == 15 and offset == 24 are used below */
58
59 arc = open_file(gh->archive_file, OPEN_READ);
60 last = 0;
61 start = 0;
62 if (arc != NULL) {
63 while (fgets(line, 200, arc) != NULL) {
64 if (strncmp(line, arc_header, 15)) {
65 log_entry('E', "%s not archive for %s\n",
66 gh->archive_file, gh->group_name);
67 gh->master_flag &= ~M_AUTO_ARCHIVE;
68 fclose(arc);
69 return;
70 }
71 if (strncmp(line + 24, gh->group_name, gh->group_name_length)) {
72 start = ftell(arc);
73 continue;
74 }
75 last = atol(line + 15);
76 break;
77 }
78 fclose(arc);
79 }
80 if (last >= num)
81 return;
82
83 arc = open_file(gh->archive_file, last > 0 ? OPEN_UPDATE : OPEN_CREATE);
84 if (arc == NULL) {
85 log_entry('E', "Cannot create archive file: %s\n", gh->archive_file);
86 gh->master_flag &= ~M_AUTO_ARCHIVE;
87 return;
88 }
89 fseek(arc, start, 0);
90 fprintf(arc, "%s%8ld %s\n", arc_header, (long) num, gh->group_name);
91 fseek(arc, 0, 2);
92
93 fseek(f, 0, 0);
94 while ((c = getc(f)) != EOF)
95 putc(c, arc);
96 putc(NL, arc);
97 fclose(arc);
98 }
99
100 static void
build_hdr(int type)101 build_hdr(int type)
102 {
103 register char *name, *subj;
104 int re;
105
106 db_data.dh_type = type;
107
108 if (type == DH_SUB_DIGEST) {
109
110 name = digest.dg_from;
111 subj = digest.dg_subj;
112
113 db_hdr.dh_lines = digest.dg_lines;
114
115 db_hdr.dh_hpos = digest.dg_hpos;
116 db_hdr.dh_fpos = (int16) (digest.dg_fpos - db_hdr.dh_hpos);
117 db_hdr.dh_lpos = digest.dg_lpos;
118
119 db_hdr.dh_date = pack_date(digest.dg_date ? digest.dg_date : news.ng_date);
120 } else {
121
122 if (!news.ng_from)
123 news.ng_from = news.ng_reply;
124
125 name = news.ng_from;
126 subj = news.ng_subj;
127
128 db_hdr.dh_lines = news.ng_lines;
129
130 db_hdr.dh_hpos = 0;
131 db_hdr.dh_fpos = (int16) (news.ng_fpos);
132 db_hdr.dh_lpos = news.ng_lpos;
133
134 db_hdr.dh_date = pack_date(news.ng_date);
135 }
136
137 if (name) {
138 db_hdr.dh_sender_length = pack_name(db_data.dh_sender, name, NAME_LENGTH);
139 } else
140 db_hdr.dh_sender_length = 0;
141
142 if (type == DH_DIGEST_HEADER) {
143 db_hdr.dh_subject_length = 1;
144 db_data.dh_subject[0] = '@';
145 } else
146 db_hdr.dh_subject_length = 0;
147
148 db_hdr.dh_subject_length +=
149 pack_subject(db_data.dh_subject + db_hdr.dh_subject_length, subj, &re,
150 DBUF_SIZE);
151
152 #ifdef COUNT_RE_REFERENCES
153 if (re)
154 re = 0x80;
155 if (news.ng_ref) {
156 for (name = news.ng_ref; *name; name++) {
157 if ((re & 0x7f) == 0x7f)
158 break;
159 if (*name == '<')
160 re++;
161 }
162 }
163 #endif
164
165 db_hdr.dh_replies = re;
166
167 if (db_write_art(data) < 0)
168 write_error();
169 }
170
171
172 static int
collect_article(register group_header * gh,article_number art_num)173 collect_article(register group_header * gh, article_number art_num)
174 {
175 FILE *art_file;
176 news_header_buffer nhbuf, dgbuf;
177 article_header art_hdr;
178 int mode, count;
179 cross_post_number *cp_ptr;
180 long age;
181
182 count = 0;
183
184 db_hdr.dh_number = art_num;
185
186 /* get article header */
187
188 art_hdr.a_number = art_num;
189 art_hdr.hpos = 0;
190 art_hdr.lpos = (off_t) 0;
191 art_hdr.flag = 0;
192
193 mode = FILL_NEWS_HEADER | FILL_OFFSETS | SKIP_HEADER;
194 if ((gh->master_flag & (M_CONTROL | M_NEVER_DIGEST | M_ALWAYS_DIGEST)) == 0)
195 mode |= DIGEST_CHECK;
196
197 #ifdef NNTP
198 if ((gh->master_flag & M_ALWAYS_DIGEST) == 0)
199 mode |= LAZY_BODY;
200 #endif
201
202 if ((art_file = open_news_article(&art_hdr, mode, nhbuf, (char *) NULL)) == NULL) {
203
204 #ifdef NNTP
205 if (nntp_failed) {
206
207 /*
208 * connection to nntp_server is broken stop collection of
209 * articles immediately
210 */
211 return -1;
212 }
213 #endif
214
215 /*
216 * it is not really necessary to save anything in the data file we
217 * simply use the index file to get the *first* available article
218 */
219 return 0;
220 }
221 if (art_file == (FILE *) 1) { /* empty file */
222 if (!ignore_bad_articles)
223 return 0;
224 news.ng_groups = NULL;
225 art_file = NULL;
226 } else if (max_article_age && /* == 0 if use_nntp */
227 (gh->master_flag & M_INCLUDE_OLD) == 0 &&
228 (age = m_time(art_file)) < max_article_age) {
229
230 if (remove_bad_articles)
231 unlink(group_path_name);
232
233 log_entry('O', "%sold article (%ld days): %s/%ld",
234 remove_bad_articles ? "removed " : "",
235 (cur_time() - age) / (24 * 60 * 60),
236 current_group->group_name, (long) art_num);
237 bad_count++;
238 fclose(art_file);
239 return 0;
240 }
241 if (ignore_bad_articles && news.ng_groups == NULL) {
242 char *rem = "";
243
244 if (!use_nntp && remove_bad_articles) {
245 unlink(group_path_name);
246 rem = "removed ";
247 }
248 log_entry('B', "%sbad article: %s/%ld", rem,
249 current_group->group_name, (long) art_num);
250 if (art_file != NULL)
251 fclose(art_file);
252 bad_count++;
253 return 0;
254 }
255 /* map cross-postings into a list of group numbers */
256
257 db_hdr.dh_cross_postings = 0;
258
259 if (gh->master_flag & M_CONTROL) {
260 /* we cannot trust the Newsgroups: line in the control group */
261 /* so we simply ignore it (i.e. use "Newsgroups: control") */
262 goto dont_digest;
263 }
264 if (news.ng_groups) {
265 char *curg, *nextg;
266 group_header *gh1;
267
268 for (nextg = news.ng_groups, cp_ptr = db_data.dh_cross; *nextg;) {
269 curg = nextg;
270
271 if ((nextg = strchr(curg, ',')))
272 *nextg++ = NUL;
273 else
274 nextg = "";
275
276 if (strcmp(gh->group_name, curg) == 0)
277 gh1 = gh;
278 else if ((gh1 = lookup(curg)) == NULL)
279 continue;
280
281 *cp_ptr++ = NETW_CROSS_EXT(gh1->group_num);
282 if (++db_hdr.dh_cross_postings == DBUF_SIZE)
283 break;
284 }
285 }
286 if (db_hdr.dh_cross_postings == 1)
287 db_hdr.dh_cross_postings = 0; /* only current group */
288
289 if (gh->master_flag & M_NEVER_DIGEST)
290 goto dont_digest;
291
292 /* split digest */
293
294 if ((gh->master_flag & M_ALWAYS_DIGEST) || (news.ng_flag & N_DIGEST)) {
295 int any = 0, cont = 1;
296
297 skip_digest_body(art_file);
298
299 while (cont && (cont = get_digest_article(art_file, dgbuf)) >= 0) {
300
301 if (any == 0) {
302 build_hdr(DH_DIGEST_HEADER); /* write DIGEST_HEADER */
303 count++;
304 db_hdr.dh_cross_postings = 0; /* no cross post in sub */
305 any++;
306 }
307 build_hdr(DH_SUB_DIGEST); /* write SUB_DIGEST */
308 count++;
309 }
310
311 if (any)
312 goto finish;
313 }
314 /* not a digest */
315
316 dont_digest:
317
318 build_hdr(DH_NORMAL); /* normal article */
319 count++;
320
321 finish:
322
323 if (gh->master_flag & M_AUTO_ARCHIVE) {
324
325 #ifdef NNTP
326 FILE *f;
327 f = nntp_get_article(art_num, 0);
328 do_auto_archive(gh, f, art_num);
329 fclose(f);
330 #else
331 do_auto_archive(gh, art_file, art_num);
332 #endif /* NNTP */
333 }
334 fclose(art_file);
335
336 return count;
337 }
338
339
340 /*
341 * Collect unread articles in current group
342 *
343 * On entry, init_group has been called to setup the proper environment
344 */
345
346 static long
collect_group(register group_header * gh)347 collect_group(register group_header * gh)
348 {
349 long article_count, temp, obad;
350 article_number start_collect;
351
352 if (gh->last_db_article == 0) {
353 gh->first_db_article = gh->first_a_article;
354 gh->last_db_article = gh->first_db_article - 1;
355 }
356 if (gh->last_db_article >= gh->last_a_article)
357 return 0;
358
359 if (gh->index_write_offset) {
360 ix = open_data_file(gh, 'x', OPEN_UPDATE | MUST_EXIST);
361 fseek(ix, gh->index_write_offset, 0);
362 } else
363 ix = open_data_file(gh, 'x', OPEN_CREATE | MUST_EXIST);
364
365 if (gh->data_write_offset) {
366 data = open_data_file(gh, 'd', OPEN_UPDATE | MUST_EXIST);
367 fseek(data, gh->data_write_offset, 0);
368 } else
369 data = open_data_file(gh, 'd', OPEN_CREATE | MUST_EXIST);
370
371 article_count = 0;
372 start_collect = gh->last_db_article + 1;
373
374 if (debug_mode) {
375 printf("\t\t%s (%ld..%ld)\r",
376 gh->group_name, start_collect, gh->last_a_article);
377 fl;
378 }
379 bad_count = obad = 0;
380
381 while (gh->last_db_article < gh->last_a_article) {
382 if (s_hangup)
383 break;
384 gh->last_db_article++;
385 if (debug_mode) {
386 printf("\r%ld", gh->last_db_article);
387 if (obad != bad_count)
388 printf("\t%ld", bad_count);
389 obad = bad_count;
390 fl;
391 }
392 gh->data_write_offset = ftell(data);
393
394 #ifdef NNTP
395 gh->index_write_offset = ftell(ix);
396 #endif
397
398 temp = collect_article(gh, gh->last_db_article);
399
400 #ifdef NNTP
401 if (temp < 0) {
402 /* connection failed, current article is not collected */
403 gh->last_db_article--;
404 article_count = -1;
405 goto out;
406 }
407 #endif
408
409 #ifndef RENUMBER_DANGER
410 if (temp == 0 && gh->data_write_offset == 0) {
411 gh->first_db_article = gh->last_db_article + 1;
412 continue;
413 }
414 #endif
415
416 if (!db_write_offset(ix, &(gh->data_write_offset)))
417 write_error();
418 article_count += temp;
419 }
420
421 if (start_collect < gh->first_db_article)
422 start_collect = gh->first_db_article;
423
424 if (trace && start_collect <= gh->last_db_article)
425 log_entry('T', "Col %s (%d to %d) %d",
426 gh->group_name,
427 start_collect, gh->last_db_article,
428 article_count);
429
430 if (debug_mode)
431 printf("\nCol %s (%ld to %ld) %ld",
432 gh->group_name,
433 start_collect, gh->last_db_article,
434 article_count);
435
436 gh->data_write_offset = ftell(data);
437 gh->index_write_offset = ftell(ix);
438
439 #ifdef NNTP
440 out:
441 #endif
442
443 fclose(data);
444 fclose(ix);
445
446 if (debug_mode)
447 putchar(NL);
448
449 return article_count;
450 }
451
452 int
do_collect(void)453 do_collect(void)
454 {
455 register group_header *gh;
456 long col_article_count, temp;
457 int col_group_count;
458 time_t start_time;
459
460 start_time = cur_time();
461 col_article_count = col_group_count = 0;
462 current_group = NULL; /* for init_group */
463 temp = 0;
464
465 Loop_Groups_Header(gh) {
466 if (s_hangup) {
467 temp = -1;
468 break;
469 }
470 if (gh->master_flag & M_IGNORE_GROUP)
471 continue;
472
473 if (gh->master_flag & M_MUST_CLEAN)
474 clean_group(gh);
475
476 if (gh->last_db_article == gh->last_a_article) {
477 if (gh->master_flag & M_BLOCKED)
478 goto unblock_group;
479 continue;
480 }
481 if (!init_group(gh)) {
482 if ((gh->master_flag & M_NO_DIRECTORY) == 0) {
483 log_entry('R', "%s: no directory", gh->group_name);
484 gh->master_flag |= M_NO_DIRECTORY;
485 }
486 gh->last_db_article = gh->last_a_article;
487 gh->first_db_article = gh->last_a_article; /* OBS: not first */
488 gh->master_flag &= ~(M_EXPIRE | M_BLOCKED);
489 db_write_group(gh);
490 continue;
491 }
492 if (gh->master_flag & M_NO_DIRECTORY) {
493 /* The directory has been created now */
494 gh->master_flag &= ~M_NO_DIRECTORY;
495 clean_group(gh);
496 }
497 temp = collect_group(gh);
498
499 #ifdef NNTP
500 if (temp < 0) {
501 /* connection broken */
502 gh->master_flag &= ~M_EXPIRE; /* remains blocked */
503 db_write_group(gh);
504 break;
505 }
506 #endif
507
508 if (temp > 0) {
509 col_article_count += temp;
510 col_group_count++;
511 }
512 unblock_group:
513 gh->master_flag &= ~(M_EXPIRE | M_BLOCKED);
514 db_write_group(gh);
515 }
516
517 if (col_article_count > 0)
518 log_entry('C', "Collect: %ld art, %d gr, %ld s",
519 col_article_count, col_group_count,
520 cur_time() - start_time);
521
522 return temp > 0; /* return true IF we got articles */
523 }
524