1 /*
2   mairix - message index builder and finder for maildir folders.
3 
4  **********************************************************************
5  * Copyright (C) Richard P. Curnow  2002,2003,2004,2005,2006,2007,2009
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of version 2 of the GNU General Public License as
9  * published by the Free Software Foundation.
10  *
11  * This program is distributed in the hope that it will be useful, but
12  * WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14  * General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License along
17  * with this program; if not, write to the Free Software Foundation, Inc.,
18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19  *
20  **********************************************************************
21  */
22 
23 /* Handle complete database */
24 
25 #include "mairix.h"
26 #include "reader.h"
27 #include <ctype.h>
28 #include <assert.h>
29 #include <sys/time.h>
30 #include <unistd.h>
31 #include "imapinterface.h"
32 
33 struct sortable_token {/*{{{*/
34   char *text;
35   int index;
36 };
37 /*}}}*/
compare_sortable_tokens(const void * a,const void * b)38 static int compare_sortable_tokens(const void *a, const void *b)/*{{{*/
39 {
40   const struct sortable_token *aa = (const struct sortable_token *) a;
41   const struct sortable_token *bb = (const struct sortable_token *) b;
42   int foo;
43   foo = strcmp(aa->text, bb->text);
44   if (foo) {
45     return foo;
46   } else {
47     if (aa->index < bb->index) return -1;
48     else if (aa->index > bb->index) return +1;
49     else return 0;
50   }
51 }
52 /*}}}*/
check_toktable_enc_integrity(int n_msgs,struct toktable * table)53 static void check_toktable_enc_integrity(int n_msgs, struct toktable *table)/*{{{*/
54 {
55   /* FIXME : Check reachability of tokens that are displaced from their natural
56    * hash bucket (if deletions have occurred during purge). */
57 
58   int idx, incr;
59   int i, k;
60   unsigned char *j, *last_char;
61   int broken_chains = 0;
62   struct sortable_token *sort_list;
63   int any_duplicates;
64 
65   for (i=0; i<table->size; i++) {
66     struct token *tok = table->tokens[i];
67     if (tok) {
68       idx = 0;
69       incr = 0;
70       last_char = tok->match0.msginfo + tok->match0.n;
71       for (j = tok->match0.msginfo; j < last_char; ) {
72         incr = read_increment(&j);
73         idx += incr;
74       }
75       if (idx != tok->match0.highest) {
76         fprintf(stderr, "broken encoding chain for token <%s>, highest=%ld\n", tok->text, tok->match0.highest);
77         fflush(stderr);
78         broken_chains = 1;
79       }
80       if (idx >= n_msgs) {
81         fprintf(stderr, "end of chain higher than number of message paths (%d) for token <%s>\n", n_msgs, tok->text);
82         fflush(stderr);
83         broken_chains = 1;
84       }
85     }
86   }
87 
88   assert(!broken_chains);
89 
90   /* Check there are no duplicated tokens in the table. */
91   sort_list = new_array(struct sortable_token, table->n);
92   k = 0;
93   for (i=0; i<table->size; i++) {
94     struct token *tok = table->tokens[i];
95     if (tok) {
96       sort_list[k].text = new_string(tok->text);
97       sort_list[k].index = i;
98       k++;
99     }
100   }
101   assert(k == table->n);
102 
103   qsort(sort_list, table->n, sizeof(struct sortable_token), compare_sortable_tokens);
104   /* Check for uniqueness of neighbouring token texts */
105   any_duplicates = 0;
106   for (i=0; i<(table->n - 1); i++) {
107     if (!strcmp(sort_list[i].text, sort_list[i+1].text)) {
108       fprintf(stderr, "Token table contains duplicated token %s at indices %d and %d\n",
109                sort_list[i].text, sort_list[i].index, sort_list[i+1].index);
110       any_duplicates = 1;
111     }
112   }
113 
114   /* release */
115   for (i=0; i<table->n; i++) {
116     free(sort_list[i].text);
117   }
118   free(sort_list);
119 
120   if (any_duplicates) {
121     fprintf(stderr, "Token table contained duplicate entries, aborting\n");
122     assert(0);
123   }
124 }
125 /*}}}*/
compare_strings(const void * a,const void * b)126 static int compare_strings(const void *a, const void *b)/*{{{*/
127 {
128   const char **aa = (const char **) a;
129   const char **bb = (const char **) b;
130   return strcmp(*aa, *bb);
131 }
132 /*}}}*/
check_message_path_integrity(struct database * db)133 static void check_message_path_integrity(struct database *db)/*{{{*/
134 {
135   /* TODO : for now only checks integrity of non-mbox paths. */
136   /* Check there are no duplicates */
137   int i;
138   int n, imap_n;
139   int has_duplicate = 0;
140 
141   char **paths, **imaps;
142   paths = new_array(char *, db->n_msgs);
143   imaps = new_array(char *, db->n_msgs);
144   for (i=0, n=0, imap_n=0; i<db->n_msgs; i++) {
145     switch (db->type[i]) {
146       case MTY_DEAD:
147       case MTY_MBOX:
148         break;
149       case MTY_FILE:
150         paths[n++] = db->msgs[i].src.mpf.path;
151         break;
152       case MTY_IMAP:
153         imaps[imap_n++] = db->msgs[i].src.mpf.path;
154         break;
155     }
156   }
157 
158   qsort(paths, n, sizeof(char *), compare_strings);
159   qsort(imaps, imap_n, sizeof(char *), compare_strings);
160 
161   for (i=1; i<n; i++) {
162     if (!strcmp(paths[i-1], paths[i])) {
163       fprintf(stderr, "Path <%s> repeated\n", paths[i]);
164       has_duplicate = 1;
165     }
166   }
167   for (i=1; i<imap_n; i++) {
168     if (!strcmp(imaps[i-1], imaps[i])) {
169       fprintf(stderr, "IMAP message <%s> repeated\n", imaps[i]);
170       has_duplicate = 1;
171     }
172   }
173 
174   fflush(stderr);
175   assert(!has_duplicate);
176 
177   free(paths);
178   free(imaps);
179   return;
180 }
181 /*}}}*/
check_database_integrity(struct database * db)182 void check_database_integrity(struct database *db)/*{{{*/
183 {
184   if (verbose) fprintf(stderr, "Checking message path integrity\n");
185   check_message_path_integrity(db);
186 
187   /* Just check encoding chains for now */
188   if (verbose) fprintf(stderr, "Checking to\n");
189   check_toktable_enc_integrity(db->n_msgs, db->to);
190   if (verbose) fprintf(stderr, "Checking cc\n");
191   check_toktable_enc_integrity(db->n_msgs, db->cc);
192   if (verbose) fprintf(stderr, "Checking from\n");
193   check_toktable_enc_integrity(db->n_msgs, db->from);
194   if (verbose) fprintf(stderr, "Checking subject\n");
195   check_toktable_enc_integrity(db->n_msgs, db->subject);
196   if (verbose) fprintf(stderr, "Checking body\n");
197   check_toktable_enc_integrity(db->n_msgs, db->body);
198   if (verbose) fprintf(stderr, "Checking attachment_name\n");
199   check_toktable_enc_integrity(db->n_msgs, db->attachment_name);
200 }
201 /*}}}*/
new_database(unsigned int hash_key)202 struct database *new_database(unsigned int hash_key)/*{{{*/
203 {
204   struct database *result = new(struct database);
205   struct timeval tv;
206   pid_t  pid;
207 
208   result->to = new_toktable();
209   result->cc = new_toktable();
210   result->from = new_toktable();
211   result->subject = new_toktable();
212   result->body = new_toktable();
213   result->attachment_name = new_toktable();
214 
215   result->msg_ids = new_toktable2();
216 
217   if ( hash_key == CREATE_RANDOM_DATABASE_HASH )
218     {
219       gettimeofday(&tv, NULL);
220       pid = getpid();
221       hash_key = tv.tv_sec ^ (pid ^ (tv.tv_usec << 15));
222     }
223   result->hash_key = hash_key;
224 
225   result->msgs = NULL;
226   result->type = NULL;
227   result->n_msgs = 0;
228   result->max_msgs = 0;
229 
230   result->mboxen = NULL;
231   result->n_mboxen = 0;
232   result->max_mboxen = 0;
233 
234   return result;
235 }
236 /*}}}*/
free_database(struct database * db)237 void free_database(struct database *db)/*{{{*/
238 {
239   int i;
240 
241   free_toktable(db->to);
242   free_toktable(db->cc);
243   free_toktable(db->from);
244   free_toktable(db->subject);
245   free_toktable(db->body);
246   free_toktable(db->attachment_name);
247   free_toktable2(db->msg_ids);
248 
249   if (db->msgs) {
250     for (i=0; i<db->n_msgs; i++) {
251       switch (db->type[i]) {
252         case MTY_DEAD:
253           break;
254         case MTY_MBOX:
255           break;
256         case MTY_FILE:
257         case MTY_IMAP:
258           assert(db->msgs[i].src.mpf.path);
259           free(db->msgs[i].src.mpf.path);
260           break;
261       }
262     }
263     free(db->msgs);
264     free(db->type);
265   }
266 
267   free(db);
268 }
269 /*}}}*/
270 
get_max(int a,int b)271 static int get_max (int a, int b) {/*{{{*/
272   return (a > b) ? a : b;
273 }
274 /*}}}*/
import_toktable(char * data,unsigned int hash_key,int n_msgs,struct toktable_db * in,struct toktable * out)275 static void import_toktable(char *data, unsigned int hash_key, int n_msgs, struct toktable_db *in, struct toktable *out)/*{{{*/
276 {
277   int n, size, i;
278 
279   n = in->n;
280   size = 1;
281   while (size < n) size <<= 1;
282   size <<= 1; /* safe hash table size */
283 
284   out->size = size;
285   out->mask = size - 1;
286   out->n = n;
287   out->tokens = new_array(struct token *, size);
288   memset(out->tokens, 0, size * sizeof(struct token *));
289   out->hwm = (n + size) >> 1;
290 
291   for (i=0; i<n; i++) {
292     unsigned int hash, index;
293     char *text;
294     unsigned char *enc;
295     int enc_len;
296     struct token *nt;
297     int enc_hi;
298     int idx, incr;
299     unsigned char *j;
300 
301     /* Recover enc_len and enc_hi from the data */
302     enc = (unsigned char *) data + in->enc_offsets[i];
303     idx = 0;
304     for (j = enc; *j != 0xff; ) {
305       incr = read_increment(&j);
306       idx += incr;
307     }
308     enc_len = j - enc;
309     enc_hi = idx;
310 
311     text = data + in->tok_offsets[i];
312     hash = hashfn((unsigned char *) text, strlen(text), hash_key);
313 
314     nt = new(struct token);
315     nt->hashval = hash;
316     nt->text = new_string(text);
317     /* Allow a bit of headroom for adding more entries later */
318     nt->match0.max = get_max(16, enc_len + (enc_len >> 1));
319     nt->match0.n = enc_len;
320     nt->match0.highest = enc_hi;
321     assert(nt->match0.highest < n_msgs);
322     nt->match0.msginfo = new_array(unsigned char, nt->match0.max);
323     memcpy(nt->match0.msginfo, enc, nt->match0.n);
324 
325     index = hash & out->mask;
326     while (out->tokens[index]) {
327       /* Audit to look for corrupt database with multiple entries for the same
328        * string. */
329       if (!strcmp(nt->text, out->tokens[index]->text)) {
330         fprintf(stderr, "\n!!! Corrupt token table found in database, token <%s> duplicated, aborting\n",
331             nt->text);
332         fprintf(stderr, "  Delete the database file and rebuild from scratch as a workaround\n");
333         /* No point going on - need to find out why the database got corrupted
334          * in the 1st place.  Workaround for user - rebuild database from
335          * scratch by deleting it then rerunning. */
336         unlock_and_exit(1);
337       }
338       ++index;
339       index &= out->mask;
340     }
341 
342     out->tokens[index] = nt;
343   }
344 }
345 /*}}}*/
import_toktable2(char * data,unsigned int hash_key,int n_msgs,struct toktable2_db * in,struct toktable2 * out)346 static void import_toktable2(char *data, unsigned int hash_key, int n_msgs, struct toktable2_db *in, struct toktable2 *out)/*{{{*/
347 {
348   int n, size, i;
349 
350   n = in->n;
351   size = 1;
352   while (size < n) size <<= 1;
353   size <<= 1; /* safe hash table size */
354 
355   out->size = size;
356   out->mask = size - 1;
357   out->n = n;
358   out->tokens = new_array(struct token2 *, size);
359   memset(out->tokens, 0, size * sizeof(struct token *));
360   out->hwm = (n + size) >> 1;
361 
362   for (i=0; i<n; i++) {
363     unsigned int hash, index;
364     char *text;
365     struct token2 *nt;
366     unsigned char *enc0, *enc1;
367     int enc0_len, enc1_len;
368     int enc0_hi, enc1_hi;
369     int idx, incr;
370     unsigned char *j;
371 
372 /*{{{ do enc0*/
373     enc0 = (unsigned char *) data + in->enc0_offsets[i];
374     idx = 0;
375     for (j = enc0; *j != 0xff; ) {
376       incr = read_increment(&j);
377       idx += incr;
378     }
379     enc0_len = j - enc0;
380     enc0_hi = idx;
381 /*}}}*/
382 /*{{{ do enc1*/
383     enc1 = (unsigned char *) data + in->enc1_offsets[i];
384     idx = 0;
385     for (j = enc1; *j != 0xff; ) {
386       incr = read_increment(&j);
387       idx += incr;
388     }
389     enc1_len = j - enc1;
390     enc1_hi = idx;
391 /*}}}*/
392 
393     text = data + in->tok_offsets[i];
394     hash = hashfn((unsigned char *) text, strlen(text), hash_key);
395 
396     nt = new(struct token2);
397     nt->hashval = hash;
398     nt->text = new_string(text);
399     /* Allow a bit of headroom for adding more entries later */
400     /*{{{ set up match0 chain */
401     nt->match0.max = get_max(16, enc0_len + (enc0_len >> 1));
402     nt->match0.n = enc0_len;
403     nt->match0.highest = enc0_hi;
404     assert(nt->match0.highest < n_msgs);
405     nt->match0.msginfo = new_array(unsigned char, nt->match0.max);
406     memcpy(nt->match0.msginfo, enc0, nt->match0.n);
407     /*}}}*/
408     /*{{{ set up match1 chain */
409     nt->match1.max = get_max(16, enc1_len + (enc1_len >> 1));
410     nt->match1.n = enc1_len;
411     nt->match1.highest = enc1_hi;
412     assert(nt->match1.highest < n_msgs);
413     nt->match1.msginfo = new_array(unsigned char, nt->match1.max);
414     memcpy(nt->match1.msginfo, enc1, nt->match1.n);
415     /*}}}*/
416 
417     index = hash & out->mask;
418     while (out->tokens[index]) {
419       ++index;
420       index &= out->mask;
421     }
422 
423     out->tokens[index] = nt;
424   }
425 }
426 /*}}}*/
new_database_from_file(char * db_filename,int do_integrity_checks)427 struct database *new_database_from_file(char *db_filename, int do_integrity_checks)/*{{{*/
428 {
429   /* Read existing database from file for doing incremental update */
430   struct database *result;
431   struct read_db *input;
432   int i, n, N;
433 
434   result = new_database( CREATE_RANDOM_DATABASE_HASH );
435   input = open_db(db_filename);
436   if (!input) {
437     /* Nothing to initialise */
438     if (verbose) printf("Database file was empty, creating a new database\n");
439     return result;
440   }
441 
442   /* Build pathname information */
443   n = result->n_msgs = input->n_msgs;
444   result->max_msgs = input->n_msgs; /* let it be extended as-and-when */
445   result->msgs = new_array(struct msgpath, n);
446   result->type = new_array(enum message_type, n);
447 
448   result->hash_key = input->hash_key;
449 
450   /* Set up mbox structures */
451   N = result->n_mboxen = result->max_mboxen = input->n_mboxen;
452   result->mboxen = N ? (new_array(struct mbox, N)) : NULL;
453   for (i=0; i<N; i++) {
454     int nn;
455     if (input->mbox_paths_table[i]) {
456       result->mboxen[i].path = new_string(input->data + input->mbox_paths_table[i]);
457     } else {
458       /* mbox is dead. */
459       result->mboxen[i].path = NULL;
460     }
461     result->mboxen[i].file_mtime = input->mbox_mtime_table[i];
462     result->mboxen[i].file_size  = input->mbox_size_table[i];
463     nn = result->mboxen[i].n_msgs = input->mbox_entries_table[i];
464     result->mboxen[i].max_msgs = nn;
465     result->mboxen[i].start = new_array(off_t, nn);
466     result->mboxen[i].len   = new_array(size_t, nn);
467     result->mboxen[i].check_all = new_array(checksum_t, nn);
468     /* Copy the entire checksum table in one go. */
469     memcpy(result->mboxen[i].check_all,
470            input->data + input->mbox_checksum_table[i],
471            nn * sizeof(checksum_t));
472     result->mboxen[i].n_so_far = 0;
473   }
474 
475   for (i=0; i<n; i++) {
476     switch (rd_msg_type(input, i)) {
477       case DB_MSG_DEAD:
478         result->type[i] = MTY_DEAD;
479         break;
480       case DB_MSG_FILE:
481         result->type[i] = MTY_FILE;
482         result->msgs[i].src.mpf.path = new_string(input->data + input->path_offsets[i]);
483         result->msgs[i].src.mpf.mtime = input->mtime_table[i];
484         result->msgs[i].src.mpf.size  = input->size_table[i];
485         break;
486       case DB_MSG_IMAP:
487         result->type[i] = MTY_IMAP;
488         result->msgs[i].src.mpf.path = new_string(input->data + input->path_offsets[i]);
489         result->msgs[i].src.mpf.mtime = 0;
490         result->msgs[i].src.mpf.size  = 0;
491         break;
492       case DB_MSG_MBOX:
493         {
494           unsigned int mbi, msgi;
495           int n;
496           struct mbox *mb;
497           result->type[i] = MTY_MBOX;
498           decode_mbox_indices(input->path_offsets[i], &mbi, &msgi);
499           result->msgs[i].src.mbox.file_index = mbi;
500           mb = &result->mboxen[mbi];
501           assert(mb->n_so_far == msgi);
502           n = mb->n_so_far;
503           result->msgs[i].src.mbox.msg_index = n;
504           mb->start[n] = input->mtime_table[i];
505           mb->len[n] = input->size_table[i];
506           ++mb->n_so_far;
507         }
508 
509         break;
510     }
511     result->msgs[i].seen    = (input->msg_type_and_flags[i] & FLAG_SEEN)    ? 1:0;
512     result->msgs[i].replied = (input->msg_type_and_flags[i] & FLAG_REPLIED) ? 1:0;
513     result->msgs[i].flagged = (input->msg_type_and_flags[i] & FLAG_FLAGGED) ? 1:0;
514     result->msgs[i].date  = input->date_table[i];
515     result->msgs[i].tid   = input->tid_table[i];
516   }
517 
518   import_toktable(input->data, input->hash_key, result->n_msgs, &input->to, result->to);
519   import_toktable(input->data, input->hash_key, result->n_msgs, &input->cc, result->cc);
520   import_toktable(input->data, input->hash_key, result->n_msgs, &input->from, result->from);
521   import_toktable(input->data, input->hash_key, result->n_msgs, &input->subject, result->subject);
522   import_toktable(input->data, input->hash_key, result->n_msgs, &input->body, result->body);
523   import_toktable(input->data, input->hash_key, result->n_msgs, &input->attachment_name, result->attachment_name);
524   import_toktable2(input->data, input->hash_key, result->n_msgs, &input->msg_ids, result->msg_ids);
525 
526   close_db(input);
527 
528   if (do_integrity_checks) {
529     check_database_integrity(result);
530   }
531 
532   return result;
533 }
534 /*}}}*/
535 
add_angled_terms(int file_index,unsigned int hash_key,struct toktable2 * table,int add_to_chain1,char * s)536 static void add_angled_terms(int file_index, unsigned int hash_key, struct toktable2 *table, int add_to_chain1, char *s)/*{{{*/
537 {
538   char *left, *right;
539 
540   if (s) {
541     left = strchr(s, '<');
542     while (left) {
543       right = strchr(left, '>');
544       if (right) {
545         *right = '\0';
546         add_token2_in_file(file_index, hash_key, left+1, table, add_to_chain1);
547         *right = '>'; /* restore */
548       } else {
549         break;
550       }
551       left = strchr(right, '<');
552     }
553   }
554 }
555 /*}}}*/
556 
557 /* Macro for what characters can make up token strings.
558 
559    The following characters have special meanings:
560    0x2b +
561    0x2d -
562    0x2e .
563    0x40 @
564    0x5f _
565 
566    since they can occur within email addresses and message IDs when considered
567    as a whole rather than as individual words.  Underscore (0x5f) is considered
568    a word-character always too.
569 
570    */
571 static unsigned char special_table[256] = {
572   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 00-0f */
573   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 10-1f */
574   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 0, /* 20-2f */
575   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 30-3f */
576   2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 40-4f */
577   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, /* 50-5f */
578   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 60-6f */
579   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 70-7f */
580   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8f */
581   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9f */
582   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* a0-af */
583   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* b0-bf */
584   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* c0-cf */
585   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* d0-df */
586   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* e0-ef */
587   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* f0-ff */
588 };
589 
590 #if 0
591 #define CHAR_VALID(x,mask) (isalnum((unsigned char) x) || (special_table[(unsigned int)(unsigned char) x] & mask))
592 #endif
char_valid_p(char x,unsigned int mask)593 static inline int char_valid_p(char x, unsigned int mask)/*{{{*/
594 {
595   unsigned char xx = (unsigned char) x;
596   if (isalnum(xx)) return 1;
597   else if (special_table[(unsigned int) xx] & mask) return 1;
598   else return 0;
599 }
600 /*}}}*/
tokenise_string(int file_index,unsigned int hash_key,struct toktable * table,char * data,int match_mask)601 static void tokenise_string(int file_index, unsigned int hash_key, struct toktable *table, char *data, int match_mask)/*{{{*/
602 {
603   char *ss, *es, old_es;
604   ss = data;
605   for (;;) {
606     while (*ss && !char_valid_p(*ss,match_mask)) ss++;
607     if (!*ss) break;
608     es = ss + 1;
609     while (*es && char_valid_p(*es,match_mask)) es++;
610 
611     /* deal with token [ss,es) */
612     old_es = *es;
613     *es = '\0';
614     /* FIXME: Ought to do this by passing start and length - clean up later */
615     add_token_in_file(file_index, hash_key, ss, table);
616     *es = old_es;
617 
618     if (!*es) break;
619     ss = es;
620   }
621 }
622 /*}}}*/
tokenise_html_string(int file_index,unsigned int hash_key,struct toktable * table,char * data)623 static void tokenise_html_string(int file_index, unsigned int hash_key, struct toktable *table, char *data)/*{{{*/
624 {
625   char *ss, *es, old_es;
626 
627   /* FIXME : Probably want to rewrite this as an explicit FSM */
628 
629   ss = data;
630   for (;;) {
631     /* Assume < and > are never valid token characters ! */
632     while (*ss && !char_valid_p(*ss, 1)) {
633       if (*ss++ == '<') {
634         /* Skip over HTML tag */
635         while (*ss && (*ss != '>')) ss++;
636       }
637     }
638     if (!*ss) break;
639 
640     es = ss + 1;
641     while (*es && char_valid_p(*es, 1)) es++;
642 
643     /* deal with token [ss,es) */
644     old_es = *es;
645     *es = '\0';
646     /* FIXME: Ought to do this by passing start and length - clean up later */
647     add_token_in_file(file_index, hash_key, ss, table);
648     *es = old_es;
649 
650     if (!*es) break;
651     ss = es;
652   }
653 }
654 /*}}}*/
tokenise_message(int file_index,struct database * db,struct rfc822 * msg)655 void tokenise_message(int file_index, struct database *db, struct rfc822 *msg)/*{{{*/
656 {
657   struct attachment *a;
658 
659   /* Match on whole addresses in these headers as well as the individual words */
660   if (msg->hdrs.to) {
661     tokenise_string(file_index, db->hash_key, db->to, msg->hdrs.to, 1);
662     tokenise_string(file_index, db->hash_key, db->to, msg->hdrs.to, 2);
663   }
664   if (msg->hdrs.cc) {
665     tokenise_string(file_index, db->hash_key, db->cc, msg->hdrs.cc, 1);
666     tokenise_string(file_index, db->hash_key, db->cc, msg->hdrs.cc, 2);
667   }
668   if (msg->hdrs.from) {
669     tokenise_string(file_index, db->hash_key, db->from, msg->hdrs.from, 1);
670     tokenise_string(file_index, db->hash_key, db->from, msg->hdrs.from, 2);
671   }
672   if (msg->hdrs.subject) tokenise_string(file_index, db->hash_key, db->subject, msg->hdrs.subject, 1);
673 
674   for (a=msg->atts.next; a!=&msg->atts; a=a->next) {
675     switch (a->ct) {
676       case CT_TEXT_PLAIN:
677         tokenise_string(file_index, db->hash_key, db->body, a->data.normal.bytes, 1);
678         break;
679       case CT_TEXT_HTML:
680         tokenise_html_string(file_index, db->hash_key, db->body, a->data.normal.bytes);
681         break;
682       case CT_MESSAGE_RFC822:
683         /* Just recurse for now - maybe we should have separate token tables
684          * for tokens occurring in embedded messages? */
685 
686         if (a->data.rfc822) {
687           tokenise_message(file_index, db, a->data.rfc822);
688         }
689         break;
690       default:
691         /* Don't do anything - unknown text format or some nasty binary stuff.
692          * In future, we could have all kinds of 'plug-ins' here, e.g.
693          * something that can parse PDF to get the basic text strings out of
694          * the pages? */
695         break;
696     }
697 
698     if (a->filename) {
699       add_token_in_file(file_index, db->hash_key, a->filename, db->attachment_name);
700     }
701 
702   }
703 
704   /* Deal with threading information */
705   add_angled_terms(file_index, db->hash_key, db->msg_ids, 1, msg->hdrs.message_id);
706   add_angled_terms(file_index, db->hash_key, db->msg_ids, 0, msg->hdrs.in_reply_to);
707   add_angled_terms(file_index, db->hash_key, db->msg_ids, 0, msg->hdrs.references);
708 }
709 /*}}}*/
710 
scan_maildir_flags(struct msgpath * m)711 static void scan_maildir_flags(struct msgpath *m)/*{{{*/
712 {
713   const char *p, *start;
714   start = m->src.mpf.path;
715   m->seen = 0;
716   m->replied = 0;
717   m->flagged = 0;
718   for (p=start; *p; p++) {}
719   for (p--; (p >= start) && ((*p) != ':'); p--) {}
720   if (p >= start) {
721     if (!strncmp(p, ":2,", 3)) {
722       p += 3;
723       while (*p) {
724         switch (*p) {
725           case 'F': m->flagged = 1; break;
726           case 'R': m->replied = 1; break;
727           case 'S': m->seen = 1; break;
728           default: break;
729         }
730         p++;
731       }
732     }
733   }
734 }
735 /*}}}*/
scan_new_messages(struct database * db,int start_at,struct imap_ll * imapc)736 static void scan_new_messages(struct database *db, int start_at, struct imap_ll *imapc)/*{{{*/
737 {
738   int i;
739   for (i=start_at; i<db->n_msgs; i++) {
740     struct rfc822 *msg = NULL;
741     int len = strlen(db->msgs[i].src.mpf.path);
742 
743     if (len > 10 && !strcmp(db->msgs[i].src.mpf.path + len - 11, "/.gitignore"))
744       continue;
745 
746     switch (db->type[i]) {
747       case MTY_DEAD:
748         assert(0);
749         break;
750       case MTY_MBOX:
751         assert(0); /* Should never get here - mbox messages are scanned elsewhere. */
752         break;
753       case MTY_FILE:
754         if (verbose) fprintf(stderr, "Scanning <%s>\n", db->msgs[i].src.mpf.path);
755         msg = make_rfc822(db->msgs[i].src.mpf.path);
756         break;
757       case MTY_IMAP:
758         if (verbose) fprintf(stderr, "Scanning IMAP <%s>\n", db->msgs[i].src.mpf.path);
759         msg = make_rfc822_from_imap(db->msgs[i].src.mpf.path, imapc);
760         break;
761     }
762     if(msg)
763     {
764       db->msgs[i].date = msg->hdrs.date;
765       scan_maildir_flags(&db->msgs[i]);
766       tokenise_message(i, db, msg);
767       free_rfc822(msg);
768     }
769     else
770       fprintf(stderr, "Skipping %s (could not parse message)\n", db->msgs[i].src.mpf.path);
771   }
772 }
773 /*}}}*/
774 
set_bit(unsigned long * x,int n)775 static inline void set_bit(unsigned long *x, int n)/*{{{*/
776 {
777   int set;
778   unsigned long mask;
779   set = (n >> 5);
780   mask = (1UL << (n & 31));
781   x[set] |= mask;
782 }
783 /*}}}*/
isset_bit(unsigned long * x,int n)784 static inline int isset_bit(unsigned long *x, int n)/*{{{*/
785 {
786   int set;
787   unsigned long mask;
788   set = (n >> 5);
789   mask = (1UL << (n & 31));
790   return (x[set] & mask) ? 1 : 0;
791 }
792 /*}}}*/
find_base(int * table,int index)793 static int find_base(int *table, int index) {/*{{{*/
794   int a = index;
795 
796   /* TODO : make this compress the path lengths down to the base entry */
797   while (table[a] != a) {
798     a = table[a];
799   }
800   return a;
801 }
802 /*}}}*/
find_threading(struct database * db)803 static void find_threading(struct database *db)/*{{{*/
804 {
805 
806   /* ix is a table mapping path array index to the lowest path array index that
807    * is known to share at least one message ID in its hdrs somewhere (i.e. they
808    * must be in the same thread) */
809   int *ix;
810 
811   int i, m, np, sm;
812   int next_tid;
813 
814   np = db->n_msgs;
815   sm = db->msg_ids->size;
816 
817   ix = new_array(int, np);
818   for (i=0; i<np; i++) {
819     ix[i] = i; /* default - every message in a thread of its own */
820   }
821 
822   for (m=0; m<sm; m++) {
823     struct token2 *tok = db->msg_ids->tokens[m];
824     if (tok) {
825       unsigned char *j = tok->match0.msginfo;
826       unsigned char *last_char = j + tok->match0.n;
827       int cur = 0, incr, first=1;
828       int new_base=-1, old_base;
829       while (j < last_char) {
830         incr = read_increment(&j);
831         cur += incr;
832         if (first) {
833           new_base = find_base(ix, cur);
834           first = 0;
835         } else {
836           old_base = find_base(ix, cur);
837           if (old_base < new_base) {
838             ix[new_base] = old_base;
839             new_base = old_base;
840           } else if (old_base > new_base) {
841             assert(new_base != -1);
842             ix[old_base] = new_base;
843           }
844         }
845       }
846     }
847   }
848 
849   /* Now make each entry point directly to its base */
850   for (i=0; i<np; i++) {
851     if (ix[i] != i) {
852       /* Sure to work as we're going up from the bottom */
853       ix[i] = ix[ix[i]];
854     }
855   }
856 
857   /* Now allocate contiguous thread group numbers */
858   next_tid = 0;
859   for (i=0; i<np; i++) {
860     if (ix[i] == i) {
861       db->msgs[i].tid = next_tid++;
862     } else {
863       db->msgs[i].tid = db->msgs[ix[i]].tid;
864     }
865   }
866 
867   free(ix);
868   return;
869 }
870 /*}}}*/
lookup_msgpath(struct msgpath * sorted_paths,int n_msgs,char * key,enum message_type type)871 static int lookup_msgpath(struct msgpath *sorted_paths, int n_msgs, char *key, enum message_type type)/*{{{*/
872 {
873   /* Implement bisection search */
874  int l, h, m, r;
875  l = 0, h = n_msgs;
876  m = -1;
877  while (h > l) {
878    m = (h + l) >> 1;
879    /* Should only get called on 'file' type messages - TBC */
880    if (sorted_paths[m].type < type) {
881      r = -1;
882    } else if (sorted_paths[m].type > type) {
883      r = 1;
884    } else {
885      r = strcmp(sorted_paths[m].src.mpf.path, key);
886    }
887    if (r == 0) break;
888    if (l == m) return -1;
889    if (r > 0) h = m;
890    else       l = m;
891  }
892  return m;
893 }
894 /*}}}*/
maybe_grow_message_arrays(struct database * db)895 void maybe_grow_message_arrays(struct database *db)/*{{{*/
896 {
897   if (db->n_msgs == db->max_msgs) {
898     if (db->max_msgs <= 128) {
899       db->max_msgs = 256;
900     } else {
901       db->max_msgs += (db->max_msgs >> 1);
902     }
903     db->msgs  = grow_array(struct msgpath,    db->max_msgs, db->msgs);
904     db->type  = grow_array(enum message_type, db->max_msgs, db->type);
905   }
906 }
907 /*}}}*/
add_msg_path(struct database * db,char * path,time_t mtime,size_t message_size,enum message_type type)908 static void add_msg_path(struct database *db, char *path, time_t mtime, size_t message_size, enum message_type type)/*{{{*/
909 {
910   maybe_grow_message_arrays(db);
911   db->type[db->n_msgs] = type;
912   db->msgs[db->n_msgs].src.mpf.path = new_string(path);
913   db->msgs[db->n_msgs].src.mpf.mtime = mtime;
914   db->msgs[db->n_msgs].src.mpf.size = message_size;
915   ++db->n_msgs;
916 }
917 /*}}}*/
918 
919 enum stat_result {/*{{{*/
920   STAT_RESULT_BAD,
921   STAT_RESULT_FILE,
922   STAT_RESULT_DIR,
923 };
924 /*}}}*/
925 
do_stat(struct msgpath * mp)926 static int do_stat(struct msgpath *mp)/*{{{*/
927 {
928   struct stat sb;
929   int status;
930   if (mp->type == MTY_IMAP) {
931     mp->src.mpf.mtime = 0;
932     mp->src.mpf.size = 0;
933     return STAT_RESULT_FILE;
934   }
935   status = stat(mp->src.mpf.path, &sb);
936   if (status < 0) {
937     return STAT_RESULT_BAD;
938   }
939   if (S_ISREG(sb.st_mode)) {
940     mp->src.mpf.mtime = sb.st_mtime;
941     mp->src.mpf.size = sb.st_size;
942     return STAT_RESULT_FILE;
943   } else if (S_ISDIR(sb.st_mode)) {
944     return STAT_RESULT_DIR;
945   }
946   return STAT_RESULT_BAD;
947 }
948 /*}}}*/
update_database(struct database * db,struct msgpath * sorted_paths,int n_msgs,int do_fast_index,struct imap_ll * imapc)949 int update_database(struct database *db, struct msgpath *sorted_paths, int n_msgs, int do_fast_index, struct imap_ll *imapc)/*{{{*/
950 {
951   /* The incoming list must be sorted into order, to make binary searching
952    * possible.  We search for each existing path in the incoming sorted array.
953    * If the date differs, or the file no longer exist, the existing database
954    * entry for that file is nulled.  (These are only recovered if the database
955    * is actively compressed.)  If the date differed, a new entry for the file
956    * is put at the end of the list.  Similarly, any new file goes at the end.
957    * These new entries are all rescanned to find tokens and add them to the
958    * database. */
959 
960   char *file_in_db, *file_in_new_list;
961   int matched_index;
962   int i, new_entries_start_at;
963   int any_new, n_newly_pruned, n_already_dead;
964 
965   file_in_db = new_array(char, n_msgs);
966   file_in_new_list = new_array(char, db->n_msgs);
967   bzero(file_in_db, n_msgs);
968   bzero(file_in_new_list, db->n_msgs);
969 
970   n_already_dead = 0;
971   n_newly_pruned = 0;
972 
973   for (i=0; i<db->n_msgs; i++) {
974     switch (db->type[i]) {
975       case MTY_FILE:
976         matched_index = lookup_msgpath(sorted_paths, n_msgs, db->msgs[i].src.mpf.path, MTY_FILE);
977         if (matched_index >= 0) {
978           if (do_fast_index) {
979             /* Assume the presence of a matching path is good enough without
980              * even bothering to stat the file that's there now. */
981             file_in_db[matched_index] = 1;
982             file_in_new_list[i] = 1;
983           } else {
984             switch (do_stat(sorted_paths + matched_index)) {
985               case STAT_RESULT_FILE:
986                 if (sorted_paths[matched_index].src.mpf.mtime == db->msgs[i].src.mpf.mtime) {
987                   /* Treat stale files as though the path has changed. */
988                   file_in_db[matched_index] = 1;
989                   file_in_new_list[i] = 1;
990                 }
991                 break;
992               default:
993                 /* This path will get treated as dead, and be re-stated below.
994                  * When that stat fails, the path won't get added to the db. */
995                 break;
996             }
997           }
998         }
999         break;
1000       case MTY_IMAP:
1001         matched_index = lookup_msgpath(sorted_paths, n_msgs, db->msgs[i].src.mpf.path, MTY_IMAP);
1002         if (matched_index >= 0) {
1003           file_in_db[matched_index] = 1;
1004           file_in_new_list[i] = 1;
1005         }
1006         break;
1007       case MTY_MBOX:
1008         /* Nothing to do on this pass. */
1009         break;
1010       case MTY_DEAD:
1011         break;
1012     }
1013   }
1014 
1015   /* Add new entries to database */
1016   new_entries_start_at = db->n_msgs;
1017 
1018   for (i=0; i<db->n_msgs; i++) {
1019     /* Weed dead entries */
1020     switch (db->type[i]) {
1021       case MTY_FILE:
1022       case MTY_IMAP:
1023         if (!file_in_new_list[i]) {
1024           free(db->msgs[i].src.mpf.path);
1025           db->msgs[i].src.mpf.path = NULL;
1026           db->type[i] = MTY_DEAD;
1027           ++n_newly_pruned;
1028         }
1029         break;
1030       case MTY_MBOX:
1031         {
1032           int msg_index, file_index, number_valid;
1033           int mbox_valid;
1034           msg_index = db->msgs[i].src.mbox.msg_index;
1035           file_index = db->msgs[i].src.mbox.file_index;
1036           assert (file_index < db->n_mboxen);
1037           mbox_valid = (db->mboxen[file_index].path) ? 1 : 0;
1038           number_valid = db->mboxen[file_index].n_old_msgs_valid;
1039           if (!mbox_valid || (msg_index >= number_valid)) {
1040             db->type[i] = MTY_DEAD;
1041             ++n_newly_pruned;
1042           }
1043         }
1044         break;
1045       case MTY_DEAD:
1046         /* already dead */
1047         ++n_already_dead;
1048         break;
1049     }
1050   }
1051 
1052   if (verbose) {
1053     fprintf(stderr, "%d newly dead messages, %d messages now dead in total\n", n_newly_pruned, n_newly_pruned+n_already_dead);
1054   }
1055 
1056   any_new = 0;
1057   for (i=0; i<n_msgs; i++) {
1058     if (!file_in_db[i]) {
1059       any_new = 1;
1060       /* The 'sorted_paths' array is only used for file-per-message folders. */
1061       switch(do_stat(sorted_paths + i)) {
1062         case STAT_RESULT_FILE:
1063           /* We only add files that could be successfully stat()'d as regular
1064            * files. */
1065           add_msg_path(db, sorted_paths[i].src.mpf.path, sorted_paths[i].src.mpf.mtime, sorted_paths[i].src.mpf.size, sorted_paths[i].type);
1066           break;
1067         case STAT_RESULT_DIR:
1068           if (sorted_paths[i].source_ft == FT_MH) {
1069             /* With MH, something that looks like a message but is a
1070                directory is actually a subfolder with a numeric name.
1071                Don't warn about it */
1072             break;
1073           }
1074           /* In all other cases, treat any non-regular-file the same
1075              and warn about it. */
1076           /* fallthrough */
1077         default:
1078           fprintf(stderr, "Cannot add '%s' to database; stat() failed\n", sorted_paths[i].src.mpf.path);
1079           break;
1080       }
1081     }
1082   }
1083 
1084   if (any_new) {
1085     scan_new_messages(db, new_entries_start_at, imapc);
1086   }
1087 
1088   /* Add newly found mbox messages. */
1089   any_new |= add_mbox_messages(db);
1090 
1091   if (any_new) {
1092     find_threading(db);
1093   } else {
1094     if (verbose) fprintf(stderr, "No new messages found\n");
1095   }
1096 
1097   free(file_in_db);
1098   free(file_in_new_list);
1099 
1100   return any_new || (n_newly_pruned > 0);
1101 }
1102 /*}}}*/
recode_encoding(struct matches * m,int * new_idx)1103 static void recode_encoding(struct matches *m, int *new_idx)/*{{{*/
1104 {
1105   unsigned char *new_enc, *old_enc;
1106   unsigned char *j, *last_char;
1107   int incr, idx, n_idx;
1108 
1109   old_enc = m->msginfo;
1110   j = old_enc;
1111   last_char = old_enc + m->n;
1112 
1113   new_enc = new_array(unsigned char, m->max); /* Probably not bigger than this. */
1114   m->n = 0;
1115   m->highest = 0;
1116   m->msginfo = new_enc;
1117   idx = 0;
1118 
1119   while (j < last_char) {
1120     incr = read_increment(&j);
1121     idx += incr;
1122     n_idx = new_idx[idx];
1123     if (n_idx >= 0) {
1124       check_and_enlarge_encoding(m);
1125       insert_index_on_encoding(m, n_idx);
1126     }
1127   }
1128   free(old_enc);
1129 }
1130 /*}}}*/
recode_toktable(struct toktable * tbl,int * new_idx)1131 static void recode_toktable(struct toktable *tbl, int *new_idx)/*{{{*/
1132 {
1133   /* Re-encode the vectors according to the new path indices */
1134   int i;
1135   int any_dead = 0;
1136   int any_moved, pass;
1137 
1138   for (i=0; i<tbl->size; i++) {
1139     struct token *tok = tbl->tokens[i];
1140     if (tok) {
1141       recode_encoding(&tok->match0, new_idx);
1142       if (tok->match0.n == 0) {
1143         /* Delete this token.  Gotcha - there may be tokens further on in the
1144          * array that didn't get their natural hash bucket due to collisions.
1145          * Need to shuffle such tokens up to guarantee that the buckets between
1146          * the natural one and the one where they are now are all occupied, to
1147          * prevent their lookups failing. */
1148 
1149 #if 0
1150         fprintf(stderr, "Token <%s> (bucket %d) no longer has files containing it, deleting\n", tok->text, i);
1151 #endif
1152         free_token(tok);
1153         tbl->tokens[i] = NULL;
1154         --tbl->n; /* Maintain number in use counter */
1155         any_dead = 1;
1156       }
1157 
1158     }
1159   }
1160 
1161 
1162   if (any_dead) {
1163     /* Now close gaps.  This has to be done in a second pass, otherwise we get a
1164      * problem with moving entries that need deleting back before the current
1165        scan point. */
1166 
1167     pass = 1;
1168     for (;;) {
1169       int i;
1170 
1171       if (verbose) {
1172         fprintf(stderr, "Pass %d\n", pass);
1173       }
1174 
1175       any_moved = 0;
1176 
1177       for (i=0; i<tbl->size; i++) {
1178         if (tbl->tokens[i]) {
1179           int nat_bucket_i;
1180           nat_bucket_i = tbl->tokens[i]->hashval & tbl->mask;
1181           if (nat_bucket_i != i) {
1182             /* Find earliest bucket that we could move i to */
1183             int j = nat_bucket_i;
1184             while (j != i) {
1185               if (!tbl->tokens[j]) {
1186                 /* put it here */
1187 #if 0
1188                 fprintf(stderr, "Moved <%s> from bucket %d to %d (natural bucket %d)\n", tbl->tokens[i]->text, i, j, nat_bucket_i);
1189 #endif
1190                 tbl->tokens[j] = tbl->tokens[i];
1191                 tbl->tokens[i] = NULL;
1192                 any_moved = 1;
1193                 break;
1194               } else {
1195                 j++;
1196                 j &= tbl->mask;
1197               }
1198             }
1199             if (tbl->tokens[i]) {
1200 #if 0
1201               fprintf(stderr, "NOT moved <%s> from bucket %d (natural bucket %d)\n", tbl->tokens[i]->text, i, nat_bucket_i);
1202 #endif
1203             }
1204           }
1205         }
1206       }
1207 
1208       if (!any_moved) break;
1209       pass++;
1210     }
1211   }
1212 }
1213 /*}}}*/
recode_toktable2(struct toktable2 * tbl,int * new_idx)1214 static void recode_toktable2(struct toktable2 *tbl, int *new_idx)/*{{{*/
1215 {
1216   /* Re-encode the vectors according to the new path indices */
1217   int i;
1218   int any_dead = 0;
1219   int any_moved, pass;
1220 
1221   for (i=0; i<tbl->size; i++) {
1222     struct token2 *tok = tbl->tokens[i];
1223     if (tok) {
1224       recode_encoding(&tok->match0, new_idx);
1225       recode_encoding(&tok->match1, new_idx);
1226       if ((tok->match0.n == 0) && (tok->match1.n == 0)) {
1227         /* Delete this token.  Gotcha - there may be tokens further on in the
1228          * array that didn't get their natural hash bucket due to collisions.
1229          * Need to shuffle such tokens up to guarantee that the buckets between
1230          * the natural one and the one where they are now are all occupied, to
1231          * prevent their lookups failing. */
1232 
1233 #if 0
1234         fprintf(stderr, "Token <%s> (bucket %d) no longer has files containing it, deleting\n", tok->text, i);
1235 #endif
1236         free_token2(tok);
1237         tbl->tokens[i] = NULL;
1238         --tbl->n; /* Maintain number in use counter */
1239         any_dead = 1;
1240       }
1241     }
1242   }
1243 
1244   if (any_dead) {
1245     /* Now close gaps.  This has to be done in a second pass, otherwise we get a
1246      * problem with moving entries that need deleting back before the current
1247        scan point. */
1248 
1249     pass = 1;
1250     for (;;) {
1251       int i;
1252 
1253       if (verbose) {
1254         fprintf(stderr, "Pass %d\n", pass);
1255       }
1256 
1257       any_moved = 0;
1258 
1259       for (i=0; i<tbl->size; i++) {
1260         if (tbl->tokens[i]) {
1261           int nat_bucket_i;
1262           nat_bucket_i = tbl->tokens[i]->hashval & tbl->mask;
1263           if (nat_bucket_i != i) {
1264             /* Find earliest bucket that we could move i to */
1265             int j = nat_bucket_i;
1266             while (j != i) {
1267               if (!tbl->tokens[j]) {
1268                 /* put it here */
1269 #if 0
1270                 fprintf(stderr, "Moved <%s> from bucket %d to %d (natural bucket %d)\n", tbl->tokens[i]->text, i, j, nat_bucket_i);
1271 #endif
1272                 tbl->tokens[j] = tbl->tokens[i];
1273                 tbl->tokens[i] = NULL;
1274                 any_moved = 1;
1275                 break;
1276               } else {
1277                 j++;
1278                 j &= tbl->mask;
1279               }
1280             }
1281             if (tbl->tokens[i]) {
1282 #if 0
1283               fprintf(stderr, "NOT moved <%s> from bucket %d (natural bucket %d)\n", tbl->tokens[i]->text, i, nat_bucket_i);
1284 #endif
1285             }
1286           }
1287         }
1288       }
1289 
1290       if (!any_moved) break;
1291       pass++;
1292     }
1293   }
1294 }
1295 /*}}}*/
cull_dead_messages(struct database * db,int do_integrity_checks)1296 int cull_dead_messages(struct database *db, int do_integrity_checks)/*{{{*/
1297 {
1298   /* Return true if any culled */
1299 
1300   int *new_idx, i, j, n_old;
1301   int any_culled = 0;
1302 
1303   /* Check db is OK before we start on this. (Check afterwards is done in the
1304    * writer.c code.) */
1305   if (do_integrity_checks) {
1306     check_database_integrity(db);
1307   }
1308 
1309   if (verbose) {
1310     fprintf(stderr, "Culling dead messages\n");
1311   }
1312 
1313   n_old = db->n_msgs;
1314 
1315   new_idx = new_array(int, n_old);
1316   for (i=0, j=0; i<n_old; i++) {
1317     switch (db->type[i]) {
1318       case MTY_FILE:
1319       case MTY_MBOX:
1320       case MTY_IMAP:
1321         new_idx[i] = j++;
1322         break;
1323       case MTY_DEAD:
1324         new_idx[i] = -1;
1325         any_culled = 1;
1326         break;
1327     }
1328   }
1329 
1330   recode_toktable(db->to, new_idx);
1331   recode_toktable(db->cc, new_idx);
1332   recode_toktable(db->from, new_idx);
1333   recode_toktable(db->subject, new_idx);
1334   recode_toktable(db->body, new_idx);
1335   recode_toktable(db->attachment_name, new_idx);
1336   recode_toktable2(db->msg_ids, new_idx);
1337 
1338   /* And crunch down the filename table */
1339   for (i=0, j=0; i<n_old; i++) {
1340     switch (db->type[i]) {
1341       case MTY_DEAD:
1342         break;
1343       case MTY_FILE:
1344       case MTY_MBOX:
1345       case MTY_IMAP:
1346         if (i > j) {
1347           db->msgs[j] = db->msgs[i];
1348           db->type[j]  = db->type[i];
1349         }
1350         j++;
1351         break;
1352     }
1353   }
1354   db->n_msgs = j;
1355 
1356   free(new_idx);
1357 
1358   /* .. and cull dead mboxen */
1359   cull_dead_mboxen(db);
1360 
1361   return any_culled;
1362 }
1363 /*}}}*/
1364