1 /*
2 mairix - message index builder and finder for maildir folders.
3
4 **********************************************************************
5 * Copyright (C) Richard P. Curnow 2002,2003,2004,2005,2006,2007,2009
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of version 2 of the GNU General Public License as
9 * published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 *
20 **********************************************************************
21 */
22
23 /* Handle complete database */
24
25 #include "mairix.h"
26 #include "reader.h"
27 #include <ctype.h>
28 #include <assert.h>
29 #include <sys/time.h>
30 #include <unistd.h>
31 #include "imapinterface.h"
32
33 struct sortable_token {/*{{{*/
34 char *text;
35 int index;
36 };
37 /*}}}*/
compare_sortable_tokens(const void * a,const void * b)38 static int compare_sortable_tokens(const void *a, const void *b)/*{{{*/
39 {
40 const struct sortable_token *aa = (const struct sortable_token *) a;
41 const struct sortable_token *bb = (const struct sortable_token *) b;
42 int foo;
43 foo = strcmp(aa->text, bb->text);
44 if (foo) {
45 return foo;
46 } else {
47 if (aa->index < bb->index) return -1;
48 else if (aa->index > bb->index) return +1;
49 else return 0;
50 }
51 }
52 /*}}}*/
check_toktable_enc_integrity(int n_msgs,struct toktable * table)53 static void check_toktable_enc_integrity(int n_msgs, struct toktable *table)/*{{{*/
54 {
55 /* FIXME : Check reachability of tokens that are displaced from their natural
56 * hash bucket (if deletions have occurred during purge). */
57
58 int idx, incr;
59 int i, k;
60 unsigned char *j, *last_char;
61 int broken_chains = 0;
62 struct sortable_token *sort_list;
63 int any_duplicates;
64
65 for (i=0; i<table->size; i++) {
66 struct token *tok = table->tokens[i];
67 if (tok) {
68 idx = 0;
69 incr = 0;
70 last_char = tok->match0.msginfo + tok->match0.n;
71 for (j = tok->match0.msginfo; j < last_char; ) {
72 incr = read_increment(&j);
73 idx += incr;
74 }
75 if (idx != tok->match0.highest) {
76 fprintf(stderr, "broken encoding chain for token <%s>, highest=%ld\n", tok->text, tok->match0.highest);
77 fflush(stderr);
78 broken_chains = 1;
79 }
80 if (idx >= n_msgs) {
81 fprintf(stderr, "end of chain higher than number of message paths (%d) for token <%s>\n", n_msgs, tok->text);
82 fflush(stderr);
83 broken_chains = 1;
84 }
85 }
86 }
87
88 assert(!broken_chains);
89
90 /* Check there are no duplicated tokens in the table. */
91 sort_list = new_array(struct sortable_token, table->n);
92 k = 0;
93 for (i=0; i<table->size; i++) {
94 struct token *tok = table->tokens[i];
95 if (tok) {
96 sort_list[k].text = new_string(tok->text);
97 sort_list[k].index = i;
98 k++;
99 }
100 }
101 assert(k == table->n);
102
103 qsort(sort_list, table->n, sizeof(struct sortable_token), compare_sortable_tokens);
104 /* Check for uniqueness of neighbouring token texts */
105 any_duplicates = 0;
106 for (i=0; i<(table->n - 1); i++) {
107 if (!strcmp(sort_list[i].text, sort_list[i+1].text)) {
108 fprintf(stderr, "Token table contains duplicated token %s at indices %d and %d\n",
109 sort_list[i].text, sort_list[i].index, sort_list[i+1].index);
110 any_duplicates = 1;
111 }
112 }
113
114 /* release */
115 for (i=0; i<table->n; i++) {
116 free(sort_list[i].text);
117 }
118 free(sort_list);
119
120 if (any_duplicates) {
121 fprintf(stderr, "Token table contained duplicate entries, aborting\n");
122 assert(0);
123 }
124 }
125 /*}}}*/
compare_strings(const void * a,const void * b)126 static int compare_strings(const void *a, const void *b)/*{{{*/
127 {
128 const char **aa = (const char **) a;
129 const char **bb = (const char **) b;
130 return strcmp(*aa, *bb);
131 }
132 /*}}}*/
check_message_path_integrity(struct database * db)133 static void check_message_path_integrity(struct database *db)/*{{{*/
134 {
135 /* TODO : for now only checks integrity of non-mbox paths. */
136 /* Check there are no duplicates */
137 int i;
138 int n, imap_n;
139 int has_duplicate = 0;
140
141 char **paths, **imaps;
142 paths = new_array(char *, db->n_msgs);
143 imaps = new_array(char *, db->n_msgs);
144 for (i=0, n=0, imap_n=0; i<db->n_msgs; i++) {
145 switch (db->type[i]) {
146 case MTY_DEAD:
147 case MTY_MBOX:
148 break;
149 case MTY_FILE:
150 paths[n++] = db->msgs[i].src.mpf.path;
151 break;
152 case MTY_IMAP:
153 imaps[imap_n++] = db->msgs[i].src.mpf.path;
154 break;
155 }
156 }
157
158 qsort(paths, n, sizeof(char *), compare_strings);
159 qsort(imaps, imap_n, sizeof(char *), compare_strings);
160
161 for (i=1; i<n; i++) {
162 if (!strcmp(paths[i-1], paths[i])) {
163 fprintf(stderr, "Path <%s> repeated\n", paths[i]);
164 has_duplicate = 1;
165 }
166 }
167 for (i=1; i<imap_n; i++) {
168 if (!strcmp(imaps[i-1], imaps[i])) {
169 fprintf(stderr, "IMAP message <%s> repeated\n", imaps[i]);
170 has_duplicate = 1;
171 }
172 }
173
174 fflush(stderr);
175 assert(!has_duplicate);
176
177 free(paths);
178 free(imaps);
179 return;
180 }
181 /*}}}*/
check_database_integrity(struct database * db)182 void check_database_integrity(struct database *db)/*{{{*/
183 {
184 if (verbose) fprintf(stderr, "Checking message path integrity\n");
185 check_message_path_integrity(db);
186
187 /* Just check encoding chains for now */
188 if (verbose) fprintf(stderr, "Checking to\n");
189 check_toktable_enc_integrity(db->n_msgs, db->to);
190 if (verbose) fprintf(stderr, "Checking cc\n");
191 check_toktable_enc_integrity(db->n_msgs, db->cc);
192 if (verbose) fprintf(stderr, "Checking from\n");
193 check_toktable_enc_integrity(db->n_msgs, db->from);
194 if (verbose) fprintf(stderr, "Checking subject\n");
195 check_toktable_enc_integrity(db->n_msgs, db->subject);
196 if (verbose) fprintf(stderr, "Checking body\n");
197 check_toktable_enc_integrity(db->n_msgs, db->body);
198 if (verbose) fprintf(stderr, "Checking attachment_name\n");
199 check_toktable_enc_integrity(db->n_msgs, db->attachment_name);
200 }
201 /*}}}*/
new_database(unsigned int hash_key)202 struct database *new_database(unsigned int hash_key)/*{{{*/
203 {
204 struct database *result = new(struct database);
205 struct timeval tv;
206 pid_t pid;
207
208 result->to = new_toktable();
209 result->cc = new_toktable();
210 result->from = new_toktable();
211 result->subject = new_toktable();
212 result->body = new_toktable();
213 result->attachment_name = new_toktable();
214
215 result->msg_ids = new_toktable2();
216
217 if ( hash_key == CREATE_RANDOM_DATABASE_HASH )
218 {
219 gettimeofday(&tv, NULL);
220 pid = getpid();
221 hash_key = tv.tv_sec ^ (pid ^ (tv.tv_usec << 15));
222 }
223 result->hash_key = hash_key;
224
225 result->msgs = NULL;
226 result->type = NULL;
227 result->n_msgs = 0;
228 result->max_msgs = 0;
229
230 result->mboxen = NULL;
231 result->n_mboxen = 0;
232 result->max_mboxen = 0;
233
234 return result;
235 }
236 /*}}}*/
free_database(struct database * db)237 void free_database(struct database *db)/*{{{*/
238 {
239 int i;
240
241 free_toktable(db->to);
242 free_toktable(db->cc);
243 free_toktable(db->from);
244 free_toktable(db->subject);
245 free_toktable(db->body);
246 free_toktable(db->attachment_name);
247 free_toktable2(db->msg_ids);
248
249 if (db->msgs) {
250 for (i=0; i<db->n_msgs; i++) {
251 switch (db->type[i]) {
252 case MTY_DEAD:
253 break;
254 case MTY_MBOX:
255 break;
256 case MTY_FILE:
257 case MTY_IMAP:
258 assert(db->msgs[i].src.mpf.path);
259 free(db->msgs[i].src.mpf.path);
260 break;
261 }
262 }
263 free(db->msgs);
264 free(db->type);
265 }
266
267 free(db);
268 }
269 /*}}}*/
270
get_max(int a,int b)271 static int get_max (int a, int b) {/*{{{*/
272 return (a > b) ? a : b;
273 }
274 /*}}}*/
import_toktable(char * data,unsigned int hash_key,int n_msgs,struct toktable_db * in,struct toktable * out)275 static void import_toktable(char *data, unsigned int hash_key, int n_msgs, struct toktable_db *in, struct toktable *out)/*{{{*/
276 {
277 int n, size, i;
278
279 n = in->n;
280 size = 1;
281 while (size < n) size <<= 1;
282 size <<= 1; /* safe hash table size */
283
284 out->size = size;
285 out->mask = size - 1;
286 out->n = n;
287 out->tokens = new_array(struct token *, size);
288 memset(out->tokens, 0, size * sizeof(struct token *));
289 out->hwm = (n + size) >> 1;
290
291 for (i=0; i<n; i++) {
292 unsigned int hash, index;
293 char *text;
294 unsigned char *enc;
295 int enc_len;
296 struct token *nt;
297 int enc_hi;
298 int idx, incr;
299 unsigned char *j;
300
301 /* Recover enc_len and enc_hi from the data */
302 enc = (unsigned char *) data + in->enc_offsets[i];
303 idx = 0;
304 for (j = enc; *j != 0xff; ) {
305 incr = read_increment(&j);
306 idx += incr;
307 }
308 enc_len = j - enc;
309 enc_hi = idx;
310
311 text = data + in->tok_offsets[i];
312 hash = hashfn((unsigned char *) text, strlen(text), hash_key);
313
314 nt = new(struct token);
315 nt->hashval = hash;
316 nt->text = new_string(text);
317 /* Allow a bit of headroom for adding more entries later */
318 nt->match0.max = get_max(16, enc_len + (enc_len >> 1));
319 nt->match0.n = enc_len;
320 nt->match0.highest = enc_hi;
321 assert(nt->match0.highest < n_msgs);
322 nt->match0.msginfo = new_array(unsigned char, nt->match0.max);
323 memcpy(nt->match0.msginfo, enc, nt->match0.n);
324
325 index = hash & out->mask;
326 while (out->tokens[index]) {
327 /* Audit to look for corrupt database with multiple entries for the same
328 * string. */
329 if (!strcmp(nt->text, out->tokens[index]->text)) {
330 fprintf(stderr, "\n!!! Corrupt token table found in database, token <%s> duplicated, aborting\n",
331 nt->text);
332 fprintf(stderr, " Delete the database file and rebuild from scratch as a workaround\n");
333 /* No point going on - need to find out why the database got corrupted
334 * in the 1st place. Workaround for user - rebuild database from
335 * scratch by deleting it then rerunning. */
336 unlock_and_exit(1);
337 }
338 ++index;
339 index &= out->mask;
340 }
341
342 out->tokens[index] = nt;
343 }
344 }
345 /*}}}*/
import_toktable2(char * data,unsigned int hash_key,int n_msgs,struct toktable2_db * in,struct toktable2 * out)346 static void import_toktable2(char *data, unsigned int hash_key, int n_msgs, struct toktable2_db *in, struct toktable2 *out)/*{{{*/
347 {
348 int n, size, i;
349
350 n = in->n;
351 size = 1;
352 while (size < n) size <<= 1;
353 size <<= 1; /* safe hash table size */
354
355 out->size = size;
356 out->mask = size - 1;
357 out->n = n;
358 out->tokens = new_array(struct token2 *, size);
359 memset(out->tokens, 0, size * sizeof(struct token *));
360 out->hwm = (n + size) >> 1;
361
362 for (i=0; i<n; i++) {
363 unsigned int hash, index;
364 char *text;
365 struct token2 *nt;
366 unsigned char *enc0, *enc1;
367 int enc0_len, enc1_len;
368 int enc0_hi, enc1_hi;
369 int idx, incr;
370 unsigned char *j;
371
372 /*{{{ do enc0*/
373 enc0 = (unsigned char *) data + in->enc0_offsets[i];
374 idx = 0;
375 for (j = enc0; *j != 0xff; ) {
376 incr = read_increment(&j);
377 idx += incr;
378 }
379 enc0_len = j - enc0;
380 enc0_hi = idx;
381 /*}}}*/
382 /*{{{ do enc1*/
383 enc1 = (unsigned char *) data + in->enc1_offsets[i];
384 idx = 0;
385 for (j = enc1; *j != 0xff; ) {
386 incr = read_increment(&j);
387 idx += incr;
388 }
389 enc1_len = j - enc1;
390 enc1_hi = idx;
391 /*}}}*/
392
393 text = data + in->tok_offsets[i];
394 hash = hashfn((unsigned char *) text, strlen(text), hash_key);
395
396 nt = new(struct token2);
397 nt->hashval = hash;
398 nt->text = new_string(text);
399 /* Allow a bit of headroom for adding more entries later */
400 /*{{{ set up match0 chain */
401 nt->match0.max = get_max(16, enc0_len + (enc0_len >> 1));
402 nt->match0.n = enc0_len;
403 nt->match0.highest = enc0_hi;
404 assert(nt->match0.highest < n_msgs);
405 nt->match0.msginfo = new_array(unsigned char, nt->match0.max);
406 memcpy(nt->match0.msginfo, enc0, nt->match0.n);
407 /*}}}*/
408 /*{{{ set up match1 chain */
409 nt->match1.max = get_max(16, enc1_len + (enc1_len >> 1));
410 nt->match1.n = enc1_len;
411 nt->match1.highest = enc1_hi;
412 assert(nt->match1.highest < n_msgs);
413 nt->match1.msginfo = new_array(unsigned char, nt->match1.max);
414 memcpy(nt->match1.msginfo, enc1, nt->match1.n);
415 /*}}}*/
416
417 index = hash & out->mask;
418 while (out->tokens[index]) {
419 ++index;
420 index &= out->mask;
421 }
422
423 out->tokens[index] = nt;
424 }
425 }
426 /*}}}*/
new_database_from_file(char * db_filename,int do_integrity_checks)427 struct database *new_database_from_file(char *db_filename, int do_integrity_checks)/*{{{*/
428 {
429 /* Read existing database from file for doing incremental update */
430 struct database *result;
431 struct read_db *input;
432 int i, n, N;
433
434 result = new_database( CREATE_RANDOM_DATABASE_HASH );
435 input = open_db(db_filename);
436 if (!input) {
437 /* Nothing to initialise */
438 if (verbose) printf("Database file was empty, creating a new database\n");
439 return result;
440 }
441
442 /* Build pathname information */
443 n = result->n_msgs = input->n_msgs;
444 result->max_msgs = input->n_msgs; /* let it be extended as-and-when */
445 result->msgs = new_array(struct msgpath, n);
446 result->type = new_array(enum message_type, n);
447
448 result->hash_key = input->hash_key;
449
450 /* Set up mbox structures */
451 N = result->n_mboxen = result->max_mboxen = input->n_mboxen;
452 result->mboxen = N ? (new_array(struct mbox, N)) : NULL;
453 for (i=0; i<N; i++) {
454 int nn;
455 if (input->mbox_paths_table[i]) {
456 result->mboxen[i].path = new_string(input->data + input->mbox_paths_table[i]);
457 } else {
458 /* mbox is dead. */
459 result->mboxen[i].path = NULL;
460 }
461 result->mboxen[i].file_mtime = input->mbox_mtime_table[i];
462 result->mboxen[i].file_size = input->mbox_size_table[i];
463 nn = result->mboxen[i].n_msgs = input->mbox_entries_table[i];
464 result->mboxen[i].max_msgs = nn;
465 result->mboxen[i].start = new_array(off_t, nn);
466 result->mboxen[i].len = new_array(size_t, nn);
467 result->mboxen[i].check_all = new_array(checksum_t, nn);
468 /* Copy the entire checksum table in one go. */
469 memcpy(result->mboxen[i].check_all,
470 input->data + input->mbox_checksum_table[i],
471 nn * sizeof(checksum_t));
472 result->mboxen[i].n_so_far = 0;
473 }
474
475 for (i=0; i<n; i++) {
476 switch (rd_msg_type(input, i)) {
477 case DB_MSG_DEAD:
478 result->type[i] = MTY_DEAD;
479 break;
480 case DB_MSG_FILE:
481 result->type[i] = MTY_FILE;
482 result->msgs[i].src.mpf.path = new_string(input->data + input->path_offsets[i]);
483 result->msgs[i].src.mpf.mtime = input->mtime_table[i];
484 result->msgs[i].src.mpf.size = input->size_table[i];
485 break;
486 case DB_MSG_IMAP:
487 result->type[i] = MTY_IMAP;
488 result->msgs[i].src.mpf.path = new_string(input->data + input->path_offsets[i]);
489 result->msgs[i].src.mpf.mtime = 0;
490 result->msgs[i].src.mpf.size = 0;
491 break;
492 case DB_MSG_MBOX:
493 {
494 unsigned int mbi, msgi;
495 int n;
496 struct mbox *mb;
497 result->type[i] = MTY_MBOX;
498 decode_mbox_indices(input->path_offsets[i], &mbi, &msgi);
499 result->msgs[i].src.mbox.file_index = mbi;
500 mb = &result->mboxen[mbi];
501 assert(mb->n_so_far == msgi);
502 n = mb->n_so_far;
503 result->msgs[i].src.mbox.msg_index = n;
504 mb->start[n] = input->mtime_table[i];
505 mb->len[n] = input->size_table[i];
506 ++mb->n_so_far;
507 }
508
509 break;
510 }
511 result->msgs[i].seen = (input->msg_type_and_flags[i] & FLAG_SEEN) ? 1:0;
512 result->msgs[i].replied = (input->msg_type_and_flags[i] & FLAG_REPLIED) ? 1:0;
513 result->msgs[i].flagged = (input->msg_type_and_flags[i] & FLAG_FLAGGED) ? 1:0;
514 result->msgs[i].date = input->date_table[i];
515 result->msgs[i].tid = input->tid_table[i];
516 }
517
518 import_toktable(input->data, input->hash_key, result->n_msgs, &input->to, result->to);
519 import_toktable(input->data, input->hash_key, result->n_msgs, &input->cc, result->cc);
520 import_toktable(input->data, input->hash_key, result->n_msgs, &input->from, result->from);
521 import_toktable(input->data, input->hash_key, result->n_msgs, &input->subject, result->subject);
522 import_toktable(input->data, input->hash_key, result->n_msgs, &input->body, result->body);
523 import_toktable(input->data, input->hash_key, result->n_msgs, &input->attachment_name, result->attachment_name);
524 import_toktable2(input->data, input->hash_key, result->n_msgs, &input->msg_ids, result->msg_ids);
525
526 close_db(input);
527
528 if (do_integrity_checks) {
529 check_database_integrity(result);
530 }
531
532 return result;
533 }
534 /*}}}*/
535
add_angled_terms(int file_index,unsigned int hash_key,struct toktable2 * table,int add_to_chain1,char * s)536 static void add_angled_terms(int file_index, unsigned int hash_key, struct toktable2 *table, int add_to_chain1, char *s)/*{{{*/
537 {
538 char *left, *right;
539
540 if (s) {
541 left = strchr(s, '<');
542 while (left) {
543 right = strchr(left, '>');
544 if (right) {
545 *right = '\0';
546 add_token2_in_file(file_index, hash_key, left+1, table, add_to_chain1);
547 *right = '>'; /* restore */
548 } else {
549 break;
550 }
551 left = strchr(right, '<');
552 }
553 }
554 }
555 /*}}}*/
556
557 /* Macro for what characters can make up token strings.
558
559 The following characters have special meanings:
560 0x2b +
561 0x2d -
562 0x2e .
563 0x40 @
564 0x5f _
565
566 since they can occur within email addresses and message IDs when considered
567 as a whole rather than as individual words. Underscore (0x5f) is considered
568 a word-character always too.
569
570 */
571 static unsigned char special_table[256] = {
572 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00-0f */
573 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10-1f */
574 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, /* 20-2f */
575 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30-3f */
576 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40-4f */
577 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, /* 50-5f */
578 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60-6f */
579 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70-7f */
580 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8f */
581 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9f */
582 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0-af */
583 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0-bf */
584 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* c0-cf */
585 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* d0-df */
586 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* e0-ef */
587 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* f0-ff */
588 };
589
590 #if 0
591 #define CHAR_VALID(x,mask) (isalnum((unsigned char) x) || (special_table[(unsigned int)(unsigned char) x] & mask))
592 #endif
char_valid_p(char x,unsigned int mask)593 static inline int char_valid_p(char x, unsigned int mask)/*{{{*/
594 {
595 unsigned char xx = (unsigned char) x;
596 if (isalnum(xx)) return 1;
597 else if (special_table[(unsigned int) xx] & mask) return 1;
598 else return 0;
599 }
600 /*}}}*/
tokenise_string(int file_index,unsigned int hash_key,struct toktable * table,char * data,int match_mask)601 static void tokenise_string(int file_index, unsigned int hash_key, struct toktable *table, char *data, int match_mask)/*{{{*/
602 {
603 char *ss, *es, old_es;
604 ss = data;
605 for (;;) {
606 while (*ss && !char_valid_p(*ss,match_mask)) ss++;
607 if (!*ss) break;
608 es = ss + 1;
609 while (*es && char_valid_p(*es,match_mask)) es++;
610
611 /* deal with token [ss,es) */
612 old_es = *es;
613 *es = '\0';
614 /* FIXME: Ought to do this by passing start and length - clean up later */
615 add_token_in_file(file_index, hash_key, ss, table);
616 *es = old_es;
617
618 if (!*es) break;
619 ss = es;
620 }
621 }
622 /*}}}*/
tokenise_html_string(int file_index,unsigned int hash_key,struct toktable * table,char * data)623 static void tokenise_html_string(int file_index, unsigned int hash_key, struct toktable *table, char *data)/*{{{*/
624 {
625 char *ss, *es, old_es;
626
627 /* FIXME : Probably want to rewrite this as an explicit FSM */
628
629 ss = data;
630 for (;;) {
631 /* Assume < and > are never valid token characters ! */
632 while (*ss && !char_valid_p(*ss, 1)) {
633 if (*ss++ == '<') {
634 /* Skip over HTML tag */
635 while (*ss && (*ss != '>')) ss++;
636 }
637 }
638 if (!*ss) break;
639
640 es = ss + 1;
641 while (*es && char_valid_p(*es, 1)) es++;
642
643 /* deal with token [ss,es) */
644 old_es = *es;
645 *es = '\0';
646 /* FIXME: Ought to do this by passing start and length - clean up later */
647 add_token_in_file(file_index, hash_key, ss, table);
648 *es = old_es;
649
650 if (!*es) break;
651 ss = es;
652 }
653 }
654 /*}}}*/
tokenise_message(int file_index,struct database * db,struct rfc822 * msg)655 void tokenise_message(int file_index, struct database *db, struct rfc822 *msg)/*{{{*/
656 {
657 struct attachment *a;
658
659 /* Match on whole addresses in these headers as well as the individual words */
660 if (msg->hdrs.to) {
661 tokenise_string(file_index, db->hash_key, db->to, msg->hdrs.to, 1);
662 tokenise_string(file_index, db->hash_key, db->to, msg->hdrs.to, 2);
663 }
664 if (msg->hdrs.cc) {
665 tokenise_string(file_index, db->hash_key, db->cc, msg->hdrs.cc, 1);
666 tokenise_string(file_index, db->hash_key, db->cc, msg->hdrs.cc, 2);
667 }
668 if (msg->hdrs.from) {
669 tokenise_string(file_index, db->hash_key, db->from, msg->hdrs.from, 1);
670 tokenise_string(file_index, db->hash_key, db->from, msg->hdrs.from, 2);
671 }
672 if (msg->hdrs.subject) tokenise_string(file_index, db->hash_key, db->subject, msg->hdrs.subject, 1);
673
674 for (a=msg->atts.next; a!=&msg->atts; a=a->next) {
675 switch (a->ct) {
676 case CT_TEXT_PLAIN:
677 tokenise_string(file_index, db->hash_key, db->body, a->data.normal.bytes, 1);
678 break;
679 case CT_TEXT_HTML:
680 tokenise_html_string(file_index, db->hash_key, db->body, a->data.normal.bytes);
681 break;
682 case CT_MESSAGE_RFC822:
683 /* Just recurse for now - maybe we should have separate token tables
684 * for tokens occurring in embedded messages? */
685
686 if (a->data.rfc822) {
687 tokenise_message(file_index, db, a->data.rfc822);
688 }
689 break;
690 default:
691 /* Don't do anything - unknown text format or some nasty binary stuff.
692 * In future, we could have all kinds of 'plug-ins' here, e.g.
693 * something that can parse PDF to get the basic text strings out of
694 * the pages? */
695 break;
696 }
697
698 if (a->filename) {
699 add_token_in_file(file_index, db->hash_key, a->filename, db->attachment_name);
700 }
701
702 }
703
704 /* Deal with threading information */
705 add_angled_terms(file_index, db->hash_key, db->msg_ids, 1, msg->hdrs.message_id);
706 add_angled_terms(file_index, db->hash_key, db->msg_ids, 0, msg->hdrs.in_reply_to);
707 add_angled_terms(file_index, db->hash_key, db->msg_ids, 0, msg->hdrs.references);
708 }
709 /*}}}*/
710
scan_maildir_flags(struct msgpath * m)711 static void scan_maildir_flags(struct msgpath *m)/*{{{*/
712 {
713 const char *p, *start;
714 start = m->src.mpf.path;
715 m->seen = 0;
716 m->replied = 0;
717 m->flagged = 0;
718 for (p=start; *p; p++) {}
719 for (p--; (p >= start) && ((*p) != ':'); p--) {}
720 if (p >= start) {
721 if (!strncmp(p, ":2,", 3)) {
722 p += 3;
723 while (*p) {
724 switch (*p) {
725 case 'F': m->flagged = 1; break;
726 case 'R': m->replied = 1; break;
727 case 'S': m->seen = 1; break;
728 default: break;
729 }
730 p++;
731 }
732 }
733 }
734 }
735 /*}}}*/
scan_new_messages(struct database * db,int start_at,struct imap_ll * imapc)736 static void scan_new_messages(struct database *db, int start_at, struct imap_ll *imapc)/*{{{*/
737 {
738 int i;
739 for (i=start_at; i<db->n_msgs; i++) {
740 struct rfc822 *msg = NULL;
741 int len = strlen(db->msgs[i].src.mpf.path);
742
743 if (len > 10 && !strcmp(db->msgs[i].src.mpf.path + len - 11, "/.gitignore"))
744 continue;
745
746 switch (db->type[i]) {
747 case MTY_DEAD:
748 assert(0);
749 break;
750 case MTY_MBOX:
751 assert(0); /* Should never get here - mbox messages are scanned elsewhere. */
752 break;
753 case MTY_FILE:
754 if (verbose) fprintf(stderr, "Scanning <%s>\n", db->msgs[i].src.mpf.path);
755 msg = make_rfc822(db->msgs[i].src.mpf.path);
756 break;
757 case MTY_IMAP:
758 if (verbose) fprintf(stderr, "Scanning IMAP <%s>\n", db->msgs[i].src.mpf.path);
759 msg = make_rfc822_from_imap(db->msgs[i].src.mpf.path, imapc);
760 break;
761 }
762 if(msg)
763 {
764 db->msgs[i].date = msg->hdrs.date;
765 scan_maildir_flags(&db->msgs[i]);
766 tokenise_message(i, db, msg);
767 free_rfc822(msg);
768 }
769 else
770 fprintf(stderr, "Skipping %s (could not parse message)\n", db->msgs[i].src.mpf.path);
771 }
772 }
773 /*}}}*/
774
set_bit(unsigned long * x,int n)775 static inline void set_bit(unsigned long *x, int n)/*{{{*/
776 {
777 int set;
778 unsigned long mask;
779 set = (n >> 5);
780 mask = (1UL << (n & 31));
781 x[set] |= mask;
782 }
783 /*}}}*/
isset_bit(unsigned long * x,int n)784 static inline int isset_bit(unsigned long *x, int n)/*{{{*/
785 {
786 int set;
787 unsigned long mask;
788 set = (n >> 5);
789 mask = (1UL << (n & 31));
790 return (x[set] & mask) ? 1 : 0;
791 }
792 /*}}}*/
find_base(int * table,int index)793 static int find_base(int *table, int index) {/*{{{*/
794 int a = index;
795
796 /* TODO : make this compress the path lengths down to the base entry */
797 while (table[a] != a) {
798 a = table[a];
799 }
800 return a;
801 }
802 /*}}}*/
find_threading(struct database * db)803 static void find_threading(struct database *db)/*{{{*/
804 {
805
806 /* ix is a table mapping path array index to the lowest path array index that
807 * is known to share at least one message ID in its hdrs somewhere (i.e. they
808 * must be in the same thread) */
809 int *ix;
810
811 int i, m, np, sm;
812 int next_tid;
813
814 np = db->n_msgs;
815 sm = db->msg_ids->size;
816
817 ix = new_array(int, np);
818 for (i=0; i<np; i++) {
819 ix[i] = i; /* default - every message in a thread of its own */
820 }
821
822 for (m=0; m<sm; m++) {
823 struct token2 *tok = db->msg_ids->tokens[m];
824 if (tok) {
825 unsigned char *j = tok->match0.msginfo;
826 unsigned char *last_char = j + tok->match0.n;
827 int cur = 0, incr, first=1;
828 int new_base=-1, old_base;
829 while (j < last_char) {
830 incr = read_increment(&j);
831 cur += incr;
832 if (first) {
833 new_base = find_base(ix, cur);
834 first = 0;
835 } else {
836 old_base = find_base(ix, cur);
837 if (old_base < new_base) {
838 ix[new_base] = old_base;
839 new_base = old_base;
840 } else if (old_base > new_base) {
841 assert(new_base != -1);
842 ix[old_base] = new_base;
843 }
844 }
845 }
846 }
847 }
848
849 /* Now make each entry point directly to its base */
850 for (i=0; i<np; i++) {
851 if (ix[i] != i) {
852 /* Sure to work as we're going up from the bottom */
853 ix[i] = ix[ix[i]];
854 }
855 }
856
857 /* Now allocate contiguous thread group numbers */
858 next_tid = 0;
859 for (i=0; i<np; i++) {
860 if (ix[i] == i) {
861 db->msgs[i].tid = next_tid++;
862 } else {
863 db->msgs[i].tid = db->msgs[ix[i]].tid;
864 }
865 }
866
867 free(ix);
868 return;
869 }
870 /*}}}*/
lookup_msgpath(struct msgpath * sorted_paths,int n_msgs,char * key,enum message_type type)871 static int lookup_msgpath(struct msgpath *sorted_paths, int n_msgs, char *key, enum message_type type)/*{{{*/
872 {
873 /* Implement bisection search */
874 int l, h, m, r;
875 l = 0, h = n_msgs;
876 m = -1;
877 while (h > l) {
878 m = (h + l) >> 1;
879 /* Should only get called on 'file' type messages - TBC */
880 if (sorted_paths[m].type < type) {
881 r = -1;
882 } else if (sorted_paths[m].type > type) {
883 r = 1;
884 } else {
885 r = strcmp(sorted_paths[m].src.mpf.path, key);
886 }
887 if (r == 0) break;
888 if (l == m) return -1;
889 if (r > 0) h = m;
890 else l = m;
891 }
892 return m;
893 }
894 /*}}}*/
maybe_grow_message_arrays(struct database * db)895 void maybe_grow_message_arrays(struct database *db)/*{{{*/
896 {
897 if (db->n_msgs == db->max_msgs) {
898 if (db->max_msgs <= 128) {
899 db->max_msgs = 256;
900 } else {
901 db->max_msgs += (db->max_msgs >> 1);
902 }
903 db->msgs = grow_array(struct msgpath, db->max_msgs, db->msgs);
904 db->type = grow_array(enum message_type, db->max_msgs, db->type);
905 }
906 }
907 /*}}}*/
add_msg_path(struct database * db,char * path,time_t mtime,size_t message_size,enum message_type type)908 static void add_msg_path(struct database *db, char *path, time_t mtime, size_t message_size, enum message_type type)/*{{{*/
909 {
910 maybe_grow_message_arrays(db);
911 db->type[db->n_msgs] = type;
912 db->msgs[db->n_msgs].src.mpf.path = new_string(path);
913 db->msgs[db->n_msgs].src.mpf.mtime = mtime;
914 db->msgs[db->n_msgs].src.mpf.size = message_size;
915 ++db->n_msgs;
916 }
917 /*}}}*/
918
919 enum stat_result {/*{{{*/
920 STAT_RESULT_BAD,
921 STAT_RESULT_FILE,
922 STAT_RESULT_DIR,
923 };
924 /*}}}*/
925
do_stat(struct msgpath * mp)926 static int do_stat(struct msgpath *mp)/*{{{*/
927 {
928 struct stat sb;
929 int status;
930 if (mp->type == MTY_IMAP) {
931 mp->src.mpf.mtime = 0;
932 mp->src.mpf.size = 0;
933 return STAT_RESULT_FILE;
934 }
935 status = stat(mp->src.mpf.path, &sb);
936 if (status < 0) {
937 return STAT_RESULT_BAD;
938 }
939 if (S_ISREG(sb.st_mode)) {
940 mp->src.mpf.mtime = sb.st_mtime;
941 mp->src.mpf.size = sb.st_size;
942 return STAT_RESULT_FILE;
943 } else if (S_ISDIR(sb.st_mode)) {
944 return STAT_RESULT_DIR;
945 }
946 return STAT_RESULT_BAD;
947 }
948 /*}}}*/
update_database(struct database * db,struct msgpath * sorted_paths,int n_msgs,int do_fast_index,struct imap_ll * imapc)949 int update_database(struct database *db, struct msgpath *sorted_paths, int n_msgs, int do_fast_index, struct imap_ll *imapc)/*{{{*/
950 {
951 /* The incoming list must be sorted into order, to make binary searching
952 * possible. We search for each existing path in the incoming sorted array.
953 * If the date differs, or the file no longer exist, the existing database
954 * entry for that file is nulled. (These are only recovered if the database
955 * is actively compressed.) If the date differed, a new entry for the file
956 * is put at the end of the list. Similarly, any new file goes at the end.
957 * These new entries are all rescanned to find tokens and add them to the
958 * database. */
959
960 char *file_in_db, *file_in_new_list;
961 int matched_index;
962 int i, new_entries_start_at;
963 int any_new, n_newly_pruned, n_already_dead;
964
965 file_in_db = new_array(char, n_msgs);
966 file_in_new_list = new_array(char, db->n_msgs);
967 bzero(file_in_db, n_msgs);
968 bzero(file_in_new_list, db->n_msgs);
969
970 n_already_dead = 0;
971 n_newly_pruned = 0;
972
973 for (i=0; i<db->n_msgs; i++) {
974 switch (db->type[i]) {
975 case MTY_FILE:
976 matched_index = lookup_msgpath(sorted_paths, n_msgs, db->msgs[i].src.mpf.path, MTY_FILE);
977 if (matched_index >= 0) {
978 if (do_fast_index) {
979 /* Assume the presence of a matching path is good enough without
980 * even bothering to stat the file that's there now. */
981 file_in_db[matched_index] = 1;
982 file_in_new_list[i] = 1;
983 } else {
984 switch (do_stat(sorted_paths + matched_index)) {
985 case STAT_RESULT_FILE:
986 if (sorted_paths[matched_index].src.mpf.mtime == db->msgs[i].src.mpf.mtime) {
987 /* Treat stale files as though the path has changed. */
988 file_in_db[matched_index] = 1;
989 file_in_new_list[i] = 1;
990 }
991 break;
992 default:
993 /* This path will get treated as dead, and be re-stated below.
994 * When that stat fails, the path won't get added to the db. */
995 break;
996 }
997 }
998 }
999 break;
1000 case MTY_IMAP:
1001 matched_index = lookup_msgpath(sorted_paths, n_msgs, db->msgs[i].src.mpf.path, MTY_IMAP);
1002 if (matched_index >= 0) {
1003 file_in_db[matched_index] = 1;
1004 file_in_new_list[i] = 1;
1005 }
1006 break;
1007 case MTY_MBOX:
1008 /* Nothing to do on this pass. */
1009 break;
1010 case MTY_DEAD:
1011 break;
1012 }
1013 }
1014
1015 /* Add new entries to database */
1016 new_entries_start_at = db->n_msgs;
1017
1018 for (i=0; i<db->n_msgs; i++) {
1019 /* Weed dead entries */
1020 switch (db->type[i]) {
1021 case MTY_FILE:
1022 case MTY_IMAP:
1023 if (!file_in_new_list[i]) {
1024 free(db->msgs[i].src.mpf.path);
1025 db->msgs[i].src.mpf.path = NULL;
1026 db->type[i] = MTY_DEAD;
1027 ++n_newly_pruned;
1028 }
1029 break;
1030 case MTY_MBOX:
1031 {
1032 int msg_index, file_index, number_valid;
1033 int mbox_valid;
1034 msg_index = db->msgs[i].src.mbox.msg_index;
1035 file_index = db->msgs[i].src.mbox.file_index;
1036 assert (file_index < db->n_mboxen);
1037 mbox_valid = (db->mboxen[file_index].path) ? 1 : 0;
1038 number_valid = db->mboxen[file_index].n_old_msgs_valid;
1039 if (!mbox_valid || (msg_index >= number_valid)) {
1040 db->type[i] = MTY_DEAD;
1041 ++n_newly_pruned;
1042 }
1043 }
1044 break;
1045 case MTY_DEAD:
1046 /* already dead */
1047 ++n_already_dead;
1048 break;
1049 }
1050 }
1051
1052 if (verbose) {
1053 fprintf(stderr, "%d newly dead messages, %d messages now dead in total\n", n_newly_pruned, n_newly_pruned+n_already_dead);
1054 }
1055
1056 any_new = 0;
1057 for (i=0; i<n_msgs; i++) {
1058 if (!file_in_db[i]) {
1059 any_new = 1;
1060 /* The 'sorted_paths' array is only used for file-per-message folders. */
1061 switch(do_stat(sorted_paths + i)) {
1062 case STAT_RESULT_FILE:
1063 /* We only add files that could be successfully stat()'d as regular
1064 * files. */
1065 add_msg_path(db, sorted_paths[i].src.mpf.path, sorted_paths[i].src.mpf.mtime, sorted_paths[i].src.mpf.size, sorted_paths[i].type);
1066 break;
1067 case STAT_RESULT_DIR:
1068 if (sorted_paths[i].source_ft == FT_MH) {
1069 /* With MH, something that looks like a message but is a
1070 directory is actually a subfolder with a numeric name.
1071 Don't warn about it */
1072 break;
1073 }
1074 /* In all other cases, treat any non-regular-file the same
1075 and warn about it. */
1076 /* fallthrough */
1077 default:
1078 fprintf(stderr, "Cannot add '%s' to database; stat() failed\n", sorted_paths[i].src.mpf.path);
1079 break;
1080 }
1081 }
1082 }
1083
1084 if (any_new) {
1085 scan_new_messages(db, new_entries_start_at, imapc);
1086 }
1087
1088 /* Add newly found mbox messages. */
1089 any_new |= add_mbox_messages(db);
1090
1091 if (any_new) {
1092 find_threading(db);
1093 } else {
1094 if (verbose) fprintf(stderr, "No new messages found\n");
1095 }
1096
1097 free(file_in_db);
1098 free(file_in_new_list);
1099
1100 return any_new || (n_newly_pruned > 0);
1101 }
1102 /*}}}*/
recode_encoding(struct matches * m,int * new_idx)1103 static void recode_encoding(struct matches *m, int *new_idx)/*{{{*/
1104 {
1105 unsigned char *new_enc, *old_enc;
1106 unsigned char *j, *last_char;
1107 int incr, idx, n_idx;
1108
1109 old_enc = m->msginfo;
1110 j = old_enc;
1111 last_char = old_enc + m->n;
1112
1113 new_enc = new_array(unsigned char, m->max); /* Probably not bigger than this. */
1114 m->n = 0;
1115 m->highest = 0;
1116 m->msginfo = new_enc;
1117 idx = 0;
1118
1119 while (j < last_char) {
1120 incr = read_increment(&j);
1121 idx += incr;
1122 n_idx = new_idx[idx];
1123 if (n_idx >= 0) {
1124 check_and_enlarge_encoding(m);
1125 insert_index_on_encoding(m, n_idx);
1126 }
1127 }
1128 free(old_enc);
1129 }
1130 /*}}}*/
recode_toktable(struct toktable * tbl,int * new_idx)1131 static void recode_toktable(struct toktable *tbl, int *new_idx)/*{{{*/
1132 {
1133 /* Re-encode the vectors according to the new path indices */
1134 int i;
1135 int any_dead = 0;
1136 int any_moved, pass;
1137
1138 for (i=0; i<tbl->size; i++) {
1139 struct token *tok = tbl->tokens[i];
1140 if (tok) {
1141 recode_encoding(&tok->match0, new_idx);
1142 if (tok->match0.n == 0) {
1143 /* Delete this token. Gotcha - there may be tokens further on in the
1144 * array that didn't get their natural hash bucket due to collisions.
1145 * Need to shuffle such tokens up to guarantee that the buckets between
1146 * the natural one and the one where they are now are all occupied, to
1147 * prevent their lookups failing. */
1148
1149 #if 0
1150 fprintf(stderr, "Token <%s> (bucket %d) no longer has files containing it, deleting\n", tok->text, i);
1151 #endif
1152 free_token(tok);
1153 tbl->tokens[i] = NULL;
1154 --tbl->n; /* Maintain number in use counter */
1155 any_dead = 1;
1156 }
1157
1158 }
1159 }
1160
1161
1162 if (any_dead) {
1163 /* Now close gaps. This has to be done in a second pass, otherwise we get a
1164 * problem with moving entries that need deleting back before the current
1165 scan point. */
1166
1167 pass = 1;
1168 for (;;) {
1169 int i;
1170
1171 if (verbose) {
1172 fprintf(stderr, "Pass %d\n", pass);
1173 }
1174
1175 any_moved = 0;
1176
1177 for (i=0; i<tbl->size; i++) {
1178 if (tbl->tokens[i]) {
1179 int nat_bucket_i;
1180 nat_bucket_i = tbl->tokens[i]->hashval & tbl->mask;
1181 if (nat_bucket_i != i) {
1182 /* Find earliest bucket that we could move i to */
1183 int j = nat_bucket_i;
1184 while (j != i) {
1185 if (!tbl->tokens[j]) {
1186 /* put it here */
1187 #if 0
1188 fprintf(stderr, "Moved <%s> from bucket %d to %d (natural bucket %d)\n", tbl->tokens[i]->text, i, j, nat_bucket_i);
1189 #endif
1190 tbl->tokens[j] = tbl->tokens[i];
1191 tbl->tokens[i] = NULL;
1192 any_moved = 1;
1193 break;
1194 } else {
1195 j++;
1196 j &= tbl->mask;
1197 }
1198 }
1199 if (tbl->tokens[i]) {
1200 #if 0
1201 fprintf(stderr, "NOT moved <%s> from bucket %d (natural bucket %d)\n", tbl->tokens[i]->text, i, nat_bucket_i);
1202 #endif
1203 }
1204 }
1205 }
1206 }
1207
1208 if (!any_moved) break;
1209 pass++;
1210 }
1211 }
1212 }
1213 /*}}}*/
recode_toktable2(struct toktable2 * tbl,int * new_idx)1214 static void recode_toktable2(struct toktable2 *tbl, int *new_idx)/*{{{*/
1215 {
1216 /* Re-encode the vectors according to the new path indices */
1217 int i;
1218 int any_dead = 0;
1219 int any_moved, pass;
1220
1221 for (i=0; i<tbl->size; i++) {
1222 struct token2 *tok = tbl->tokens[i];
1223 if (tok) {
1224 recode_encoding(&tok->match0, new_idx);
1225 recode_encoding(&tok->match1, new_idx);
1226 if ((tok->match0.n == 0) && (tok->match1.n == 0)) {
1227 /* Delete this token. Gotcha - there may be tokens further on in the
1228 * array that didn't get their natural hash bucket due to collisions.
1229 * Need to shuffle such tokens up to guarantee that the buckets between
1230 * the natural one and the one where they are now are all occupied, to
1231 * prevent their lookups failing. */
1232
1233 #if 0
1234 fprintf(stderr, "Token <%s> (bucket %d) no longer has files containing it, deleting\n", tok->text, i);
1235 #endif
1236 free_token2(tok);
1237 tbl->tokens[i] = NULL;
1238 --tbl->n; /* Maintain number in use counter */
1239 any_dead = 1;
1240 }
1241 }
1242 }
1243
1244 if (any_dead) {
1245 /* Now close gaps. This has to be done in a second pass, otherwise we get a
1246 * problem with moving entries that need deleting back before the current
1247 scan point. */
1248
1249 pass = 1;
1250 for (;;) {
1251 int i;
1252
1253 if (verbose) {
1254 fprintf(stderr, "Pass %d\n", pass);
1255 }
1256
1257 any_moved = 0;
1258
1259 for (i=0; i<tbl->size; i++) {
1260 if (tbl->tokens[i]) {
1261 int nat_bucket_i;
1262 nat_bucket_i = tbl->tokens[i]->hashval & tbl->mask;
1263 if (nat_bucket_i != i) {
1264 /* Find earliest bucket that we could move i to */
1265 int j = nat_bucket_i;
1266 while (j != i) {
1267 if (!tbl->tokens[j]) {
1268 /* put it here */
1269 #if 0
1270 fprintf(stderr, "Moved <%s> from bucket %d to %d (natural bucket %d)\n", tbl->tokens[i]->text, i, j, nat_bucket_i);
1271 #endif
1272 tbl->tokens[j] = tbl->tokens[i];
1273 tbl->tokens[i] = NULL;
1274 any_moved = 1;
1275 break;
1276 } else {
1277 j++;
1278 j &= tbl->mask;
1279 }
1280 }
1281 if (tbl->tokens[i]) {
1282 #if 0
1283 fprintf(stderr, "NOT moved <%s> from bucket %d (natural bucket %d)\n", tbl->tokens[i]->text, i, nat_bucket_i);
1284 #endif
1285 }
1286 }
1287 }
1288 }
1289
1290 if (!any_moved) break;
1291 pass++;
1292 }
1293 }
1294 }
1295 /*}}}*/
cull_dead_messages(struct database * db,int do_integrity_checks)1296 int cull_dead_messages(struct database *db, int do_integrity_checks)/*{{{*/
1297 {
1298 /* Return true if any culled */
1299
1300 int *new_idx, i, j, n_old;
1301 int any_culled = 0;
1302
1303 /* Check db is OK before we start on this. (Check afterwards is done in the
1304 * writer.c code.) */
1305 if (do_integrity_checks) {
1306 check_database_integrity(db);
1307 }
1308
1309 if (verbose) {
1310 fprintf(stderr, "Culling dead messages\n");
1311 }
1312
1313 n_old = db->n_msgs;
1314
1315 new_idx = new_array(int, n_old);
1316 for (i=0, j=0; i<n_old; i++) {
1317 switch (db->type[i]) {
1318 case MTY_FILE:
1319 case MTY_MBOX:
1320 case MTY_IMAP:
1321 new_idx[i] = j++;
1322 break;
1323 case MTY_DEAD:
1324 new_idx[i] = -1;
1325 any_culled = 1;
1326 break;
1327 }
1328 }
1329
1330 recode_toktable(db->to, new_idx);
1331 recode_toktable(db->cc, new_idx);
1332 recode_toktable(db->from, new_idx);
1333 recode_toktable(db->subject, new_idx);
1334 recode_toktable(db->body, new_idx);
1335 recode_toktable(db->attachment_name, new_idx);
1336 recode_toktable2(db->msg_ids, new_idx);
1337
1338 /* And crunch down the filename table */
1339 for (i=0, j=0; i<n_old; i++) {
1340 switch (db->type[i]) {
1341 case MTY_DEAD:
1342 break;
1343 case MTY_FILE:
1344 case MTY_MBOX:
1345 case MTY_IMAP:
1346 if (i > j) {
1347 db->msgs[j] = db->msgs[i];
1348 db->type[j] = db->type[i];
1349 }
1350 j++;
1351 break;
1352 }
1353 }
1354 db->n_msgs = j;
1355
1356 free(new_idx);
1357
1358 /* .. and cull dead mboxen */
1359 cull_dead_mboxen(db);
1360
1361 return any_culled;
1362 }
1363 /*}}}*/
1364