1 /*  $Id: Index.cpp 1649 2009-10-19 14:35:01Z terpstra $
2  *
3  *  index.cpp - Insert all the keywords from the given email
4  *
5  *  Copyright (C) 2002 - Wesley W. Terpstra
6  *
7  *  License: GPL
8  *
9  *  Authors: 'Wesley W. Terpstra' <wesley@terpstra.ca>
10  *
11  *    This program is free software; you can redistribute it and/or modify
12  *    it under the terms of the GNU General Public License as published by
13  *    the Free Software Foundation; version 2.
14  *
15  *    This program is distributed in the hope that it will be useful,
16  *    but WITHOUT ANY WARRANTY; without even the implied warranty of
17  *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  *    GNU General Public License for more details.
19  *
20  *    You should have received a copy of the GNU General Public License
21  *    along with this program; if not, write to the Free Software
22  *    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23  */
24 
25 #define _FILE_OFFSET_BITS 64
26 
27 #include <mimelib/headers.h>
28 #include <mimelib/datetime.h>
29 #include <mimelib/addrlist.h>
30 #include <mimelib/address.h>
31 #include <mimelib/group.h>
32 #include <mimelib/mboxlist.h>
33 #include <mimelib/mailbox.h>
34 #include <mimelib/text.h>
35 #include <mimelib/param.h>
36 #include <mimelib/enum.h>
37 #include <mimelib/body.h>
38 #include <mimelib/bodypart.h>
39 #include <mimelib/utility.h>
40 
41 #include <CharsetEscape.h>
42 #include <Keys.h>
43 #include <md5.h>
44 #include <cstdlib>
45 
46 #include "Index.h"
47 #include "Summary.h"
48 
49 #include <string>
50 #include <vector>
51 #include <iostream>
52 
53 #include <unistd.h>
54 #include <iconv.h>
55 #include <cerrno>
56 
57 using namespace std;
58 
59 #define MAX_MESSAGE_ID	80
60 
/* Shorten str to at most len bytes without leaving a partial UTF-8
 * sequence dangling at the end.
 *
 *   str - string to shorten in place; its bytes are treated as UTF-8
 *   len - maximum number of bytes to keep
 *
 * A string that already fits in len bytes is left untouched.  Otherwise
 * the cut point is moved backwards over any continuation bytes
 * (0x80-0xBF) and then over any lead bytes (>= 0xC0) found just before
 * it, so the result may be shorter than len.
 */
void utf8Truncate(string& str, string::size_type len)
{
	// Already short enough.  The comparison must be '<=': with '<' a
	// string of exactly len bytes would fall through and lose its final
	// multi-byte character even though nothing needs to be cut.
	if (str.length() <= len) return;

	// look for utf-8 continuation bytes dangling before the cut and crop them
	while (len && ((unsigned char)str[len-1]) >= 0x80 &&
	              ((unsigned char)str[len-1]) <= 0xBF)
		--len;
	// now rewind off potential utf-8 start bytes
	while (len && ((unsigned char)str[len-1]) >= 0xC0)
		--len;

	// the cut no longer ends in a continuation run or a dangling start byte

	str.resize(len);
}
77 
/* ASCII-only upper-casing: 'a'..'z' map to 'A'..'Z'; every other value
 * is returned unchanged.
 */
static inline char my_toupper(char x)
{
	const bool lower = ('a' <= x) && (x <= 'z');
	return lower ? static_cast<char>(x - 'a' + 'A') : x;
}
84 
strings_equal_case_ignored(const string & a,const string & b)85 static bool strings_equal_case_ignored(const string& a, const string& b)
86 {
87 	if (a.length() != b.length()) return false;
88 
89 	for (string::size_type i = 0; i < a.length(); ++i)
90 		if (my_toupper(a[i]) != my_toupper(b[i]))
91 			return false;
92 	return true;
93 }
94 
95 // first = address, second = name
pickAddress(DwAddress * a,const char * charset)96 pair<string, string> pickAddress(DwAddress* a, const char* charset)
97 {
98 	for (; a != 0; a = a->Next())
99 	{
100 		if (a->IsGroup())
101 		{
102 			DwGroup* g = dynamic_cast<DwGroup*>(a);
103 			if (g)
104 			{
105 				pair<string, string> out =
106 					pickAddress(
107 						g->MailboxList().FirstMailbox(),
108 						charset);
109 				if (out.first != "") return out;
110 			}
111 		}
112 		else
113 		{
114 			DwMailbox* m = dynamic_cast<DwMailbox*>(a);
115 			if (m)
116 			{
117 				string name = m->FullName().c_str();
118 				name = decode_header(name, charset);
119 				DwString addr = m->LocalPart() + "@" + m->Domain();
120 
121 				// fucked address? (one cannot safely cut this)
122 				if (addr.length() > 128 ||
123 				    m->LocalPart() == "" || m->Domain() == "")
124 				{
125 					addr = "";
126 				}
127 
128 				for (size_t i = 0; i < addr.length(); ++i)
129 				{
130 					if (addr[i] <= 0x20 || addr[i] >= 0x7f)
131 					{	// fucked up address
132 						addr = "";
133 						break;
134 					}
135 				}
136 
137 				// prune any optional quotes
138 				if (name.length() >= 2 && name[0] == '"')
139 					name = name.substr(1, name.length()-2);
140 
141 				if (addr != "")
142 					return pair<string, string>(addr.c_str(), name);
143 			}
144 		}
145 	}
146 
147 	return pair<string, string>("", "");
148 }
149 
index_author()150 int Index::index_author()
151 {
152 	// one always has headers, but not always this function:
153 	// if (message.hasHeaders())
154 
155 	charset = "ISO-8859-1"; // a good default as any
156 
157 	if (message.Headers().HasContentType())
158 	{
159 		DwParameter* p = message.Headers().ContentType().FirstParameter();
160 		while (p)
161 		{
162 			if (p->Attribute() == "charset")
163 				charset = p->Value().c_str();
164 			p = p->Next();
165 		}
166 	}
167 
168 	// pickAddress only gives an author_name if it gave an author_email
169 
170 	if (message.Headers().HasReplyTo())
171 	{
172 		pair<string, string> addr = pickAddress(
173 			message.Headers().ReplyTo().FirstAddress(),
174 			charset.c_str());
175 
176 		author_email = addr.first;
177 		author_name  = addr.second;
178 
179 		// Some evil mailing lists set reply-to the list.
180 		if (strings_equal_case_ignored(author_email, list.address))
181 		{
182 			author_email = "";
183 			author_name = "";
184 		}
185 	}
186 
187 	// Given a reply-to that is not the list, we allow the from to
188 	// provide a fullname under the assumption it is the same person.
189 
190 	if (message.Headers().HasFrom())
191 	{
192 		pair<string, string> addr = pickAddress(
193 			message.Headers().From().FirstMailbox(),
194 			charset.c_str());
195 
196 		if (!author_email.length()) author_email = addr.first;
197 		if (!author_name .length()) author_name  = addr.second;
198 	}
199 
200 	// ditto
201 
202 	if (message.Headers().HasSender())
203 	{
204 		pair<string, string> addr = pickAddress(
205 			&message.Headers().Sender(),
206 			charset.c_str());
207 
208 		if (!author_email.length()) author_email = addr.first;
209 		if (!author_name .length()) author_name  = addr.second;
210 	}
211 
212 	author_name = whitespace_sanitize(author_name);
213 	utf8Truncate(author_name, 100);
214 	//  - nothing longer than 128 could get here (from above)
215 	//  - one can never safely truncate an email address
216 	// utf8Truncate(author_email, 100);
217 
218 	return 0;
219 }
220 
221 // Doesn't vary with charset
/* True for space, tab, newline and carriage return only. */
inline bool lu_isspace(char x)
{
	switch (x)
	{
	case ' ':
	case '\t':
	case '\n':
	case '\r':
		return true;
	default:
		return false;
	}
}
226 
build_message_hash(const char * str,unsigned char * hash)227 void build_message_hash(const char* str, unsigned char* hash)
228 {
229 	MD5Context ctx;
230 
231 	MD5Init(&ctx);
232 	MD5Update(&ctx, (const unsigned char*)str, strlen(str));
233 
234 	unsigned char buf[16];
235 	MD5Final(buf, &ctx);
236 
237 	hash[0] = buf[0] ^ buf[4] ^ buf[ 8] ^ buf[12];
238 	hash[1] = buf[1] ^ buf[5] ^ buf[ 9] ^ buf[13];
239 	hash[2] = buf[2] ^ buf[6] ^ buf[10] ^ buf[14];
240 	hash[3] = buf[3] ^ buf[7] ^ buf[11] ^ buf[15];
241 }
242 
feed_writer(const char * keyword,void * arg)243 int feed_writer(const char* keyword, void* arg)
244 {
245 	Index* i = (Index*)arg;
246 
247 	string x(LU_KEYWORD);
248 	x += keyword;
249 	x += '\0';
250 	x += i->id.raw();
251 
252 	return i->writer->insert(x);
253 }
254 
index_id(bool userdate,time_t server,bool & exist)255 int Index::index_id(bool userdate, time_t server, bool& exist)
256 {
257 	time_t stamp = server;
258 	string messageId;
259 	unsigned char hash[4];
260 
261 	// if (message.hasHeaders())
262 
263 	if (message.Headers().HasDate())
264 	{
265 		time_t user = message.Headers().Date().AsUnixTime();
266 
267 		/* User time must be earlier; there is delivery delay!
268 		 * However, more than 7 day delivery time is unlikely.
269 		 */
270 		if ((user <= server && server < user+7*60*60*24) ||
271 		    userdate ||  // trusting the userdate?
272 		    server <= 0) // server is on crack?
273 			stamp = user;
274 	}
275 
276 	if (stamp <= 0)
277 	{	// this is crazy; I don't care if they agree: it's wrong
278 		stamp = 1; // liers all have timestamp 1970-01-01 00:00:01
279 	}
280 
281 	if (message.Headers().HasMessageId())
282 	{
283 		vector<string> ids = extract_message_ids(
284 			message.Headers().MessageId().AsString().c_str());
285 
286 		if (!ids.empty())
287 			messageId = ids.front();
288 	}
289 
290 	if (messageId.length())
291 	{
292 		// Constant message-id across import, and threadable
293 		build_message_hash(messageId.c_str(), hash);
294 	}
295 	else if (author_email.length())
296 	{
297 		// This means no proper threading.
298 		// At least the message-id is constant across import.
299 		build_message_hash(author_email.c_str(), hash);
300 	}
301 	else
302 	{
303 		// Can't make any guarantees; just import it.
304 		hash[0] = random() % 256;
305 		hash[1] = random() % 256;
306 		hash[2] = random() % 256;
307 		hash[3] = random() % 256;
308 	}
309 
310 	id = MessageId(stamp, hash);
311 	if (blacklist.find(id) != blacklist.end())
312 	{
313 		// Messages marked as blacklisted use the 'exist' flag to
314 		// avoid being imported into the database.
315 		exist = true;
316 		return 0;
317 	}
318 
319 	if (messageId.length())
320 	{
321 		// Raw message-id for threading
322 		if (writer->insert(
323 			LU_KEYWORD +
324 			string(LU_KEYWORD_MESSAGE_ID) +
325 			messageId +
326 			'\0' +
327 			id.raw()) != 0)
328 		{
329 			cerr << "Failed to insert message id keyword!" << endl;
330 			return -1;
331 		}
332 
333 		// digested message-id for user searches
334 		if (my_keyword_digest_string(
335 			messageId.c_str(), messageId.length(),
336 			LU_KEYWORD_MESSAGE_ID, &feed_writer, this, 0) != 0)
337 		{
338 			cerr << "Failed to index message-id" << endl;
339 			return -1;
340 		}
341 	}
342 
343 	if (writer->insert(
344 		LU_KEYWORD +
345 		string(LU_KEYWORD_EVERYTHING) +
346 		'\0' +
347 		id.raw()) != 0)
348 	{
349 		cerr << "Failed to the any keyword!" << endl;
350 		return -1;
351 	}
352 
353 	return 0;
354 }
355 
index_summary(bool check,bool & exist)356 int Index::index_summary(bool check, bool& exist)
357 {
358 	string prefix = LU_SUMMARY + id.raw();
359 
360 	if (message.Headers().HasSubject())
361 	{
362 		subject = message.Headers().Subject().AsString().c_str();
363 		subject = decode_header(subject, charset.c_str());
364 	}
365 
366 	if (subject == "")
367 		subject = "[...]";
368 
369 	string mbox = prefix + LU_MESSAGE_MBOX + list.mbox + '\0';
370 
371 	if (check)
372 	{
373 		// Check for existance
374 		auto_ptr<ESort::Walker> w(writer->seek(mbox, "", ESort::Forward));
375 
376 		if (w->advance() == -1)
377 		{	// was it just eof?
378 			if (errno != 0) return -1;
379 		}
380 		else
381 		{	// if it suceeded. then ... it is already in there
382 			exist = true;
383 			return 0;
384 		}
385 	}
386 
387 	unsigned char buf[12];
388 	off_t o = off;
389 	long l = len;
390 	int i;
391 
392 	for (i = 7; i >= 0; --i)
393 	{
394 		buf[i] = (o & 0xFF);
395 		o >>= 8;
396 	}
397 	for (i = 11; i >= 8; --i)
398 	{
399 		buf[i] = (l & 0xFF);
400 		l >>= 8;
401 	}
402 
403 	// Don't let crazy stuff in there.
404 	utf8Truncate(subject, 200);
405 
406 	if (writer->insert(prefix + LU_MESSAGE_AUTHOR_EMAIL + author_email) != 0 ||
407 	    writer->insert(prefix + LU_MESSAGE_AUTHOR_NAME  + author_name)  != 0 ||
408 	    writer->insert(prefix + LU_MESSAGE_SUBJECT      + subject)      != 0 ||
409 	    writer->insert(mbox + string((char*)buf, 12)) != 0)
410 	{
411 		cerr << "Failed to insert summary keys" << endl;
412 		return -1;
413 	}
414 
415 	return 0;
416 }
417 
index_threading()418 int Index::index_threading()
419 {
420 	string shash = subject_hash(subject.c_str());
421 	string suffix;
422 
423 	unsigned char hash[4];
424 
425 	if (writer->insert(
426 		LU_KEYWORD
427 		LU_KEYWORD_THREAD +
428 		shash +
429 		'\0' +
430 		id.raw()) != 0)
431 	{
432 		cerr << "Failed to insert threading keyword" << endl;
433 		return -1;
434 	}
435 
436 	// if (message.hasHeaders())
437 
438 	if (message.Headers().HasInReplyTo())
439 	{
440 		vector<string> ids = extract_message_ids(
441 			message.Headers().InReplyTo().AsString().c_str());
442 
443 		// first in-reply-to is most relevant
444 		for (vector<string>::iterator i = ids.begin(); i != ids.end(); ++i)
445 		{
446 			build_message_hash(i->c_str(), hash);
447 
448 			// keep it reasonable; too many reply-tos is bad
449 			if (suffix.length() < 200)
450 				suffix.append((const char*)hash, 4);
451 		}
452 	}
453 
454 	if (message.Headers().HasReferences())
455 	{
456 		vector<string> ids = extract_message_ids(
457 			message.Headers().References().AsString().c_str());
458 
459 		// last references is most recently added (most likely irt)
460 		for (vector<string>::reverse_iterator i = ids.rbegin();
461 		     i != ids.rend(); ++i)
462 		{
463 			build_message_hash(i->c_str(), hash);
464 			// keep it reasonable; too many reply-tos is bad
465 			if (suffix.length() < 200)
466 				suffix.append((const char*)hash, 4);
467 		}
468 	}
469 
470 	if (writer->insert(
471 		LU_THREADING
472 		+ shash
473 		+ id.raw()
474 		+ suffix) != 0)
475 	{
476 		cerr << "Failed to insert threading keys" << endl;
477 		return -1;
478 	}
479 
480 	if (writer->insert(
481 		LU_NEW_TOPICS
482 		+ list.mbox + '\0'
483 		+ id.raw().substr(0, 4)
484 		+ shash) != 0)
485 	{
486 		cerr << "Failed to insert new topics keys" << endl;
487 		return -1;
488 	}
489 
490 	return 0;
491 }
492 
index_control(time_t import)493 int Index::index_control(time_t import)
494 {
495 	bool ok = true;
496 	if (writer->insert(
497 		LU_KEYWORD
498 		LU_KEYWORD_LIST +
499 		list.mbox +
500 		'\0' +
501 		id.raw()) != 0) ok = false;
502 
503 	/* emulated group and language searches are impossibly slow.
504 	 * these keywords are a must for large archives.
505 	 */
506 	if (writer->insert(
507 		LU_KEYWORD
508 		LU_KEYWORD_GROUP +
509 		list.group +
510 		'\0' +
511 		id.raw()) != 0) ok = false;
512 
513 	set<string>::const_iterator i, e;
514 	for (i = list.languages.begin(), e = list.languages.end(); i != e; ++i)
515 		if (writer->insert(
516 			LU_KEYWORD
517 			LU_KEYWORD_LANGUAGE +
518 			*i +
519 			'\0' +
520 			id.raw()) != 0) ok = false;
521 
522 	MessageId importStamp(import);
523 	if (writer->insert(
524 		LU_CACHE +
525 		importStamp.raw().substr(0, 4) +
526 		id.raw()) != 0) ok = false;
527 
528 	if (author_email.length())
529 	{
530 		if (my_keyword_digest_string(
531 			author_email.c_str(), author_email.length(),
532 			LU_KEYWORD_AUTHOR, &feed_writer, this, 1) != 0)
533 			ok = false;
534 	}
535 
536 	if (author_name.length())
537 	{
538 		if (my_keyword_digest_string(
539 			author_name.c_str(), author_name.length(),
540 			LU_KEYWORD_AUTHOR, &feed_writer, this, 1) != 0)
541 			ok = false;
542 	}
543 
544 	if (subject.length())
545 	{
546 		if (my_keyword_digest_string(
547 			subject.c_str(), subject.length(),
548 			LU_KEYWORD_SUBJECT, &feed_writer, this, 1) != 0)
549 			ok = false;
550 	}
551 
552 	if (message.Headers().HasInReplyTo())
553 	{
554 		vector<string> ids = extract_message_ids(
555 			message.Headers().InReplyTo().AsString().c_str());
556 		for (vector<string>::iterator i = ids.begin(); i != ids.end(); ++i)
557 			if (writer->insert(
558 				LU_KEYWORD
559 				LU_KEYWORD_REPLY_TO +
560 				*i + '\0' + id.raw()) != 0)
561 				ok = false;
562 	}
563 
564 #if 0	// this is questionable...
565 	if (message.Headers().HasReferences())
566 	{
567 		vector<string> ids = extract_message_ids(
568 			message.Headers().References().AsString().c_str());
569 		for (vector<string>::iterator i = ids.begin(); i != ids.end(); ++i)
570 			if (writer->insert(
571 				LU_KEYWORD
572 				LU_KEYWORD_REPLY_TO +
573 				*i + '\0' + id.raw()) != 0)
574 				ok = false;
575 	}
576 #endif
577 
578 	if (!ok)
579 	{
580 		cerr << "Failed to insert control keys" << endl;
581 		return -1;
582 	}
583 
584 	return 0;
585 }
586 
index_entity(DwEntity & e,const string & charset)587 int Index::index_entity(DwEntity& e, const string& charset)
588 {
589 	DwString text;
590 	if (e.Headers().HasContentTransferEncoding())
591 	{
592 		switch (e.Headers().ContentTransferEncoding().AsEnum())
593 		{
594 		case DwMime::kCteQuotedPrintable:
595 			DwDecodeQuotedPrintable(e.Body().AsString(), text);
596 			break;
597 
598 		case DwMime::kCteBase64:
599 			DwDecodeBase64(e.Body().AsString(), text);
600 			break;
601 
602 		case DwMime::kCteNull:
603 		case DwMime::kCteUnknown:
604 		case DwMime::kCte7bit:
605 		case DwMime::kCte8bit:
606 		case DwMime::kCteBinary:
607 			text = e.Body().AsString();
608 			break;
609 		}
610 	}
611 	else
612 	{
613 		text = e.Body().AsString();
614 	}
615 
616 	CharsetEscape decode(charset.c_str());
617 	string utf8 = decode.write(text.c_str(), text.length());
618 
619 	if (my_keyword_digest_string(
620 		utf8.c_str(), utf8.length(),
621 		LU_KEYWORD_WORD, &feed_writer, this, 1) != 0)
622 	{
623 		cerr << "Failed to index un-typed segment" << endl;
624 		return -1;
625 	}
626 
627 	return 0;
628 }
629 
index_keywords(DwEntity & e,const string & parentCharset)630 int Index::index_keywords(DwEntity& e, const string& parentCharset)
631 {
632 	string charset = parentCharset;
633 
634 	if (e.Headers().HasContentType())
635 	{
636 		DwMediaType& mt = e.Headers().ContentType();
637 
638 		for (DwParameter* p = mt.FirstParameter(); p; p = p->Next())
639 		{
640 			DwString attr = p->Attribute();
641 			attr.ConvertToLowerCase(); // case insens
642 			if (attr == "charset") charset = p->Value().c_str();
643 		}
644 	}
645 
646 	// if (e.hasHeaders() &&
647 	if (e.Headers().HasContentType())
648 	{
649 		DwMediaType& t = e.Headers().ContentType();
650 		switch (t.Type())
651 		{
652 		case DwMime::kTypeMessage:
653 			if (e.Body().Message())
654 				index_keywords(*e.Body().Message(), charset);
655 			break;
656 
657 		case DwMime::kTypeMultipart:
658 			// index all alternatives in multipart
659 			for (DwBodyPart* p = e.Body().FirstBodyPart(); p != 0; p = p->Next())
660 				index_keywords(*p, charset);
661 			break;
662 
663 		case DwMime::kTypeText:
664 			if (t.Subtype() == DwMime::kSubtypePlain)
665 			{
666 				if (index_entity(e, charset) != 0) return -1;
667 			}
668 			break;
669 		}
670 	}
671 	else
672 	{
673 		if (index_entity(e, charset) != 0) return -1;
674 	}
675 
676 	return 0;
677 }
678 
index(bool userdate,time_t envelope,time_t import,bool check,bool & exist)679 int Index::index(bool userdate, time_t envelope, time_t import, bool check, bool& exist)
680 {
681 	exist = false;
682 
683 //	cout << message.Headers().Subject().AsString().c_str() << endl;
684 
685 	if (index_author() < 0) return -1;
686 
687 	/* If the message is blacklisted, we mark it as 'existing' */
688 	if (index_id(userdate, envelope, exist) < 0) return -1;
689 	if (exist) return 0;
690 
691 	/* If the message is already imported, mark it as 'existing' */
692 	if (index_summary(check, exist) < 0) return -1;
693 	if (exist) return 0;
694 
695 	if (index_threading(      )                < 0) return -1;
696 	if (index_control  (import)                < 0) return -1;
697 	if (index_keywords (message, "ISO-8859-1") < 0) return -1;
698 
699 	return 0;
700 }
701