1 /* $Id: Index.cpp 1649 2009-10-19 14:35:01Z terpstra $
2 *
3 * index.cpp - Insert all the keywords from the given email
4 *
5 * Copyright (C) 2002 - Wesley W. Terpstra
6 *
7 * License: GPL
8 *
9 * Authors: 'Wesley W. Terpstra' <wesley@terpstra.ca>
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; version 2.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 */
24
25 #define _FILE_OFFSET_BITS 64
26
27 #include <mimelib/headers.h>
28 #include <mimelib/datetime.h>
29 #include <mimelib/addrlist.h>
30 #include <mimelib/address.h>
31 #include <mimelib/group.h>
32 #include <mimelib/mboxlist.h>
33 #include <mimelib/mailbox.h>
34 #include <mimelib/text.h>
35 #include <mimelib/param.h>
36 #include <mimelib/enum.h>
37 #include <mimelib/body.h>
38 #include <mimelib/bodypart.h>
39 #include <mimelib/utility.h>
40
41 #include <CharsetEscape.h>
42 #include <Keys.h>
43 #include <md5.h>
44 #include <cstdlib>
45
46 #include "Index.h"
47 #include "Summary.h"
48
49 #include <string>
50 #include <vector>
51 #include <iostream>
52
53 #include <unistd.h>
54 #include <iconv.h>
55 #include <cerrno>
56
57 using namespace std;
58
59 #define MAX_MESSAGE_ID 80
60
/* Truncate str to at most len bytes without splitting a UTF-8 sequence.
 *
 * A string that already fits (length <= len) is left untouched. Otherwise
 * the cut point is moved back to the nearest character boundary at or
 * before len, so a complete multi-byte character that ends exactly at the
 * limit is kept. Lead bytes left dangling by malformed input are dropped.
 *
 * str - string to shorten in place
 * len - maximum number of bytes to keep
 */
void utf8Truncate(std::string& str, std::string::size_type len)
{
	if (str.length() <= len) return;

	// str[len] is the first byte that would be dropped. If it is a
	// continuation byte (10xxxxxx) the cut would land inside a character,
	// so back up until the cut sits in front of that character's lead byte.
	while (len && (((unsigned char)str[len]) & 0xC0) == 0x80)
		--len;

	// Malformed input only: a lead byte that is not followed by a
	// continuation byte would be left dangling at the end; rewind over it.
	while (len && ((unsigned char)str[len-1]) >= 0xC0)
		--len;

	// len is now at the end of a complete multi-byte element or ascii
	str.resize(len);
}
77
// ASCII-only upper-casing: maps 'a'..'z' onto 'A'..'Z' and hands every
// other value back unchanged. No locale is consulted.
static inline char my_toupper(char x)
{
	const bool lower = ('a' <= x && x <= 'z');
	return lower ? x - 'a' + 'A' : x;
}
84
strings_equal_case_ignored(const string & a,const string & b)85 static bool strings_equal_case_ignored(const string& a, const string& b)
86 {
87 if (a.length() != b.length()) return false;
88
89 for (string::size_type i = 0; i < a.length(); ++i)
90 if (my_toupper(a[i]) != my_toupper(b[i]))
91 return false;
92 return true;
93 }
94
95 // first = address, second = name
pickAddress(DwAddress * a,const char * charset)96 pair<string, string> pickAddress(DwAddress* a, const char* charset)
97 {
98 for (; a != 0; a = a->Next())
99 {
100 if (a->IsGroup())
101 {
102 DwGroup* g = dynamic_cast<DwGroup*>(a);
103 if (g)
104 {
105 pair<string, string> out =
106 pickAddress(
107 g->MailboxList().FirstMailbox(),
108 charset);
109 if (out.first != "") return out;
110 }
111 }
112 else
113 {
114 DwMailbox* m = dynamic_cast<DwMailbox*>(a);
115 if (m)
116 {
117 string name = m->FullName().c_str();
118 name = decode_header(name, charset);
119 DwString addr = m->LocalPart() + "@" + m->Domain();
120
121 // fucked address? (one cannot safely cut this)
122 if (addr.length() > 128 ||
123 m->LocalPart() == "" || m->Domain() == "")
124 {
125 addr = "";
126 }
127
128 for (size_t i = 0; i < addr.length(); ++i)
129 {
130 if (addr[i] <= 0x20 || addr[i] >= 0x7f)
131 { // fucked up address
132 addr = "";
133 break;
134 }
135 }
136
137 // prune any optional quotes
138 if (name.length() >= 2 && name[0] == '"')
139 name = name.substr(1, name.length()-2);
140
141 if (addr != "")
142 return pair<string, string>(addr.c_str(), name);
143 }
144 }
145 }
146
147 return pair<string, string>("", "");
148 }
149
index_author()150 int Index::index_author()
151 {
152 // one always has headers, but not always this function:
153 // if (message.hasHeaders())
154
155 charset = "ISO-8859-1"; // a good default as any
156
157 if (message.Headers().HasContentType())
158 {
159 DwParameter* p = message.Headers().ContentType().FirstParameter();
160 while (p)
161 {
162 if (p->Attribute() == "charset")
163 charset = p->Value().c_str();
164 p = p->Next();
165 }
166 }
167
168 // pickAddress only gives an author_name if it gave an author_email
169
170 if (message.Headers().HasReplyTo())
171 {
172 pair<string, string> addr = pickAddress(
173 message.Headers().ReplyTo().FirstAddress(),
174 charset.c_str());
175
176 author_email = addr.first;
177 author_name = addr.second;
178
179 // Some evil mailing lists set reply-to the list.
180 if (strings_equal_case_ignored(author_email, list.address))
181 {
182 author_email = "";
183 author_name = "";
184 }
185 }
186
187 // Given a reply-to that is not the list, we allow the from to
188 // provide a fullname under the assumption it is the same person.
189
190 if (message.Headers().HasFrom())
191 {
192 pair<string, string> addr = pickAddress(
193 message.Headers().From().FirstMailbox(),
194 charset.c_str());
195
196 if (!author_email.length()) author_email = addr.first;
197 if (!author_name .length()) author_name = addr.second;
198 }
199
200 // ditto
201
202 if (message.Headers().HasSender())
203 {
204 pair<string, string> addr = pickAddress(
205 &message.Headers().Sender(),
206 charset.c_str());
207
208 if (!author_email.length()) author_email = addr.first;
209 if (!author_name .length()) author_name = addr.second;
210 }
211
212 author_name = whitespace_sanitize(author_name);
213 utf8Truncate(author_name, 100);
214 // - nothing longer than 128 could get here (from above)
215 // - one can never safely truncate an email address
216 // utf8Truncate(author_email, 100);
217
218 return 0;
219 }
220
// Whitespace test limited to space, newline, carriage return and tab.
// Doesn't vary with charset.
inline bool lu_isspace(char x)
{
	switch (x)
	{
	case ' ':
	case '\n':
	case '\r':
	case '\t':
		return true;
	default:
		return false;
	}
}
226
build_message_hash(const char * str,unsigned char * hash)227 void build_message_hash(const char* str, unsigned char* hash)
228 {
229 MD5Context ctx;
230
231 MD5Init(&ctx);
232 MD5Update(&ctx, (const unsigned char*)str, strlen(str));
233
234 unsigned char buf[16];
235 MD5Final(buf, &ctx);
236
237 hash[0] = buf[0] ^ buf[4] ^ buf[ 8] ^ buf[12];
238 hash[1] = buf[1] ^ buf[5] ^ buf[ 9] ^ buf[13];
239 hash[2] = buf[2] ^ buf[6] ^ buf[10] ^ buf[14];
240 hash[3] = buf[3] ^ buf[7] ^ buf[11] ^ buf[15];
241 }
242
feed_writer(const char * keyword,void * arg)243 int feed_writer(const char* keyword, void* arg)
244 {
245 Index* i = (Index*)arg;
246
247 string x(LU_KEYWORD);
248 x += keyword;
249 x += '\0';
250 x += i->id.raw();
251
252 return i->writer->insert(x);
253 }
254
index_id(bool userdate,time_t server,bool & exist)255 int Index::index_id(bool userdate, time_t server, bool& exist)
256 {
257 time_t stamp = server;
258 string messageId;
259 unsigned char hash[4];
260
261 // if (message.hasHeaders())
262
263 if (message.Headers().HasDate())
264 {
265 time_t user = message.Headers().Date().AsUnixTime();
266
267 /* User time must be earlier; there is delivery delay!
268 * However, more than 7 day delivery time is unlikely.
269 */
270 if ((user <= server && server < user+7*60*60*24) ||
271 userdate || // trusting the userdate?
272 server <= 0) // server is on crack?
273 stamp = user;
274 }
275
276 if (stamp <= 0)
277 { // this is crazy; I don't care if they agree: it's wrong
278 stamp = 1; // liers all have timestamp 1970-01-01 00:00:01
279 }
280
281 if (message.Headers().HasMessageId())
282 {
283 vector<string> ids = extract_message_ids(
284 message.Headers().MessageId().AsString().c_str());
285
286 if (!ids.empty())
287 messageId = ids.front();
288 }
289
290 if (messageId.length())
291 {
292 // Constant message-id across import, and threadable
293 build_message_hash(messageId.c_str(), hash);
294 }
295 else if (author_email.length())
296 {
297 // This means no proper threading.
298 // At least the message-id is constant across import.
299 build_message_hash(author_email.c_str(), hash);
300 }
301 else
302 {
303 // Can't make any guarantees; just import it.
304 hash[0] = random() % 256;
305 hash[1] = random() % 256;
306 hash[2] = random() % 256;
307 hash[3] = random() % 256;
308 }
309
310 id = MessageId(stamp, hash);
311 if (blacklist.find(id) != blacklist.end())
312 {
313 // Messages marked as blacklisted use the 'exist' flag to
314 // avoid being imported into the database.
315 exist = true;
316 return 0;
317 }
318
319 if (messageId.length())
320 {
321 // Raw message-id for threading
322 if (writer->insert(
323 LU_KEYWORD +
324 string(LU_KEYWORD_MESSAGE_ID) +
325 messageId +
326 '\0' +
327 id.raw()) != 0)
328 {
329 cerr << "Failed to insert message id keyword!" << endl;
330 return -1;
331 }
332
333 // digested message-id for user searches
334 if (my_keyword_digest_string(
335 messageId.c_str(), messageId.length(),
336 LU_KEYWORD_MESSAGE_ID, &feed_writer, this, 0) != 0)
337 {
338 cerr << "Failed to index message-id" << endl;
339 return -1;
340 }
341 }
342
343 if (writer->insert(
344 LU_KEYWORD +
345 string(LU_KEYWORD_EVERYTHING) +
346 '\0' +
347 id.raw()) != 0)
348 {
349 cerr << "Failed to the any keyword!" << endl;
350 return -1;
351 }
352
353 return 0;
354 }
355
index_summary(bool check,bool & exist)356 int Index::index_summary(bool check, bool& exist)
357 {
358 string prefix = LU_SUMMARY + id.raw();
359
360 if (message.Headers().HasSubject())
361 {
362 subject = message.Headers().Subject().AsString().c_str();
363 subject = decode_header(subject, charset.c_str());
364 }
365
366 if (subject == "")
367 subject = "[...]";
368
369 string mbox = prefix + LU_MESSAGE_MBOX + list.mbox + '\0';
370
371 if (check)
372 {
373 // Check for existance
374 auto_ptr<ESort::Walker> w(writer->seek(mbox, "", ESort::Forward));
375
376 if (w->advance() == -1)
377 { // was it just eof?
378 if (errno != 0) return -1;
379 }
380 else
381 { // if it suceeded. then ... it is already in there
382 exist = true;
383 return 0;
384 }
385 }
386
387 unsigned char buf[12];
388 off_t o = off;
389 long l = len;
390 int i;
391
392 for (i = 7; i >= 0; --i)
393 {
394 buf[i] = (o & 0xFF);
395 o >>= 8;
396 }
397 for (i = 11; i >= 8; --i)
398 {
399 buf[i] = (l & 0xFF);
400 l >>= 8;
401 }
402
403 // Don't let crazy stuff in there.
404 utf8Truncate(subject, 200);
405
406 if (writer->insert(prefix + LU_MESSAGE_AUTHOR_EMAIL + author_email) != 0 ||
407 writer->insert(prefix + LU_MESSAGE_AUTHOR_NAME + author_name) != 0 ||
408 writer->insert(prefix + LU_MESSAGE_SUBJECT + subject) != 0 ||
409 writer->insert(mbox + string((char*)buf, 12)) != 0)
410 {
411 cerr << "Failed to insert summary keys" << endl;
412 return -1;
413 }
414
415 return 0;
416 }
417
index_threading()418 int Index::index_threading()
419 {
420 string shash = subject_hash(subject.c_str());
421 string suffix;
422
423 unsigned char hash[4];
424
425 if (writer->insert(
426 LU_KEYWORD
427 LU_KEYWORD_THREAD +
428 shash +
429 '\0' +
430 id.raw()) != 0)
431 {
432 cerr << "Failed to insert threading keyword" << endl;
433 return -1;
434 }
435
436 // if (message.hasHeaders())
437
438 if (message.Headers().HasInReplyTo())
439 {
440 vector<string> ids = extract_message_ids(
441 message.Headers().InReplyTo().AsString().c_str());
442
443 // first in-reply-to is most relevant
444 for (vector<string>::iterator i = ids.begin(); i != ids.end(); ++i)
445 {
446 build_message_hash(i->c_str(), hash);
447
448 // keep it reasonable; too many reply-tos is bad
449 if (suffix.length() < 200)
450 suffix.append((const char*)hash, 4);
451 }
452 }
453
454 if (message.Headers().HasReferences())
455 {
456 vector<string> ids = extract_message_ids(
457 message.Headers().References().AsString().c_str());
458
459 // last references is most recently added (most likely irt)
460 for (vector<string>::reverse_iterator i = ids.rbegin();
461 i != ids.rend(); ++i)
462 {
463 build_message_hash(i->c_str(), hash);
464 // keep it reasonable; too many reply-tos is bad
465 if (suffix.length() < 200)
466 suffix.append((const char*)hash, 4);
467 }
468 }
469
470 if (writer->insert(
471 LU_THREADING
472 + shash
473 + id.raw()
474 + suffix) != 0)
475 {
476 cerr << "Failed to insert threading keys" << endl;
477 return -1;
478 }
479
480 if (writer->insert(
481 LU_NEW_TOPICS
482 + list.mbox + '\0'
483 + id.raw().substr(0, 4)
484 + shash) != 0)
485 {
486 cerr << "Failed to insert new topics keys" << endl;
487 return -1;
488 }
489
490 return 0;
491 }
492
index_control(time_t import)493 int Index::index_control(time_t import)
494 {
495 bool ok = true;
496 if (writer->insert(
497 LU_KEYWORD
498 LU_KEYWORD_LIST +
499 list.mbox +
500 '\0' +
501 id.raw()) != 0) ok = false;
502
503 /* emulated group and language searches are impossibly slow.
504 * these keywords are a must for large archives.
505 */
506 if (writer->insert(
507 LU_KEYWORD
508 LU_KEYWORD_GROUP +
509 list.group +
510 '\0' +
511 id.raw()) != 0) ok = false;
512
513 set<string>::const_iterator i, e;
514 for (i = list.languages.begin(), e = list.languages.end(); i != e; ++i)
515 if (writer->insert(
516 LU_KEYWORD
517 LU_KEYWORD_LANGUAGE +
518 *i +
519 '\0' +
520 id.raw()) != 0) ok = false;
521
522 MessageId importStamp(import);
523 if (writer->insert(
524 LU_CACHE +
525 importStamp.raw().substr(0, 4) +
526 id.raw()) != 0) ok = false;
527
528 if (author_email.length())
529 {
530 if (my_keyword_digest_string(
531 author_email.c_str(), author_email.length(),
532 LU_KEYWORD_AUTHOR, &feed_writer, this, 1) != 0)
533 ok = false;
534 }
535
536 if (author_name.length())
537 {
538 if (my_keyword_digest_string(
539 author_name.c_str(), author_name.length(),
540 LU_KEYWORD_AUTHOR, &feed_writer, this, 1) != 0)
541 ok = false;
542 }
543
544 if (subject.length())
545 {
546 if (my_keyword_digest_string(
547 subject.c_str(), subject.length(),
548 LU_KEYWORD_SUBJECT, &feed_writer, this, 1) != 0)
549 ok = false;
550 }
551
552 if (message.Headers().HasInReplyTo())
553 {
554 vector<string> ids = extract_message_ids(
555 message.Headers().InReplyTo().AsString().c_str());
556 for (vector<string>::iterator i = ids.begin(); i != ids.end(); ++i)
557 if (writer->insert(
558 LU_KEYWORD
559 LU_KEYWORD_REPLY_TO +
560 *i + '\0' + id.raw()) != 0)
561 ok = false;
562 }
563
564 #if 0 // this is questionable...
565 if (message.Headers().HasReferences())
566 {
567 vector<string> ids = extract_message_ids(
568 message.Headers().References().AsString().c_str());
569 for (vector<string>::iterator i = ids.begin(); i != ids.end(); ++i)
570 if (writer->insert(
571 LU_KEYWORD
572 LU_KEYWORD_REPLY_TO +
573 *i + '\0' + id.raw()) != 0)
574 ok = false;
575 }
576 #endif
577
578 if (!ok)
579 {
580 cerr << "Failed to insert control keys" << endl;
581 return -1;
582 }
583
584 return 0;
585 }
586
index_entity(DwEntity & e,const string & charset)587 int Index::index_entity(DwEntity& e, const string& charset)
588 {
589 DwString text;
590 if (e.Headers().HasContentTransferEncoding())
591 {
592 switch (e.Headers().ContentTransferEncoding().AsEnum())
593 {
594 case DwMime::kCteQuotedPrintable:
595 DwDecodeQuotedPrintable(e.Body().AsString(), text);
596 break;
597
598 case DwMime::kCteBase64:
599 DwDecodeBase64(e.Body().AsString(), text);
600 break;
601
602 case DwMime::kCteNull:
603 case DwMime::kCteUnknown:
604 case DwMime::kCte7bit:
605 case DwMime::kCte8bit:
606 case DwMime::kCteBinary:
607 text = e.Body().AsString();
608 break;
609 }
610 }
611 else
612 {
613 text = e.Body().AsString();
614 }
615
616 CharsetEscape decode(charset.c_str());
617 string utf8 = decode.write(text.c_str(), text.length());
618
619 if (my_keyword_digest_string(
620 utf8.c_str(), utf8.length(),
621 LU_KEYWORD_WORD, &feed_writer, this, 1) != 0)
622 {
623 cerr << "Failed to index un-typed segment" << endl;
624 return -1;
625 }
626
627 return 0;
628 }
629
index_keywords(DwEntity & e,const string & parentCharset)630 int Index::index_keywords(DwEntity& e, const string& parentCharset)
631 {
632 string charset = parentCharset;
633
634 if (e.Headers().HasContentType())
635 {
636 DwMediaType& mt = e.Headers().ContentType();
637
638 for (DwParameter* p = mt.FirstParameter(); p; p = p->Next())
639 {
640 DwString attr = p->Attribute();
641 attr.ConvertToLowerCase(); // case insens
642 if (attr == "charset") charset = p->Value().c_str();
643 }
644 }
645
646 // if (e.hasHeaders() &&
647 if (e.Headers().HasContentType())
648 {
649 DwMediaType& t = e.Headers().ContentType();
650 switch (t.Type())
651 {
652 case DwMime::kTypeMessage:
653 if (e.Body().Message())
654 index_keywords(*e.Body().Message(), charset);
655 break;
656
657 case DwMime::kTypeMultipart:
658 // index all alternatives in multipart
659 for (DwBodyPart* p = e.Body().FirstBodyPart(); p != 0; p = p->Next())
660 index_keywords(*p, charset);
661 break;
662
663 case DwMime::kTypeText:
664 if (t.Subtype() == DwMime::kSubtypePlain)
665 {
666 if (index_entity(e, charset) != 0) return -1;
667 }
668 break;
669 }
670 }
671 else
672 {
673 if (index_entity(e, charset) != 0) return -1;
674 }
675
676 return 0;
677 }
678
index(bool userdate,time_t envelope,time_t import,bool check,bool & exist)679 int Index::index(bool userdate, time_t envelope, time_t import, bool check, bool& exist)
680 {
681 exist = false;
682
683 // cout << message.Headers().Subject().AsString().c_str() << endl;
684
685 if (index_author() < 0) return -1;
686
687 /* If the message is blacklisted, we mark it as 'existing' */
688 if (index_id(userdate, envelope, exist) < 0) return -1;
689 if (exist) return 0;
690
691 /* If the message is already imported, mark it as 'existing' */
692 if (index_summary(check, exist) < 0) return -1;
693 if (exist) return 0;
694
695 if (index_threading( ) < 0) return -1;
696 if (index_control (import) < 0) return -1;
697 if (index_keywords (message, "ISO-8859-1") < 0) return -1;
698
699 return 0;
700 }
701