1 /* -*- Mode: c++; -*- */
2 /*  --------------------------------------------------------------------
3  *  Filename:
4  *    mime-parsefull.cc
5  *
6  *  Description:
7  *    Implementation of main mime parser components
8  *
9  *  Authors:
10  *    Andreas Aardal Hanssen <andreas-binc curly bincimap spot org>
11  *
12  *  Bugs:
13  *
14  *  ChangeLog:
15  *
16  *  --------------------------------------------------------------------
17  *  Copyright 2002-2005 Andreas Aardal Hanssen
18  *
19  *  This program is free software; you can redistribute it and/or modify
20  *  it under the terms of the GNU General Public License as published by
21  *  the Free Software Foundation; either version 2 of the License, or
22  *  (at your option) any later version.
23  *
24  *  This program is distributed in the hope that it will be useful,
25  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
26  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
27  *  GNU General Public License for more details.
28  *
29  *  You should have received a copy of the GNU General Public License
30  *  along with this program; if not, write to the Free Software
31  *  Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
32  *  --------------------------------------------------------------------
33  */
34 #ifdef HAVE_CONFIG_H
35 #include <config.h>
36 #endif
37 
38 #include "mime.h"
39 #include "mime-utils.h"
40 #include "convert.h"
41 #include "io.h"
42 #include <string>
43 #include <vector>
44 #include <map>
45 #include <exception>
46 #include <iostream>
47 
48 #include <string.h>
49 #include <ctype.h>
50 #include <stdio.h>
51 #include <errno.h>
52 
53 using namespace ::std;
54 
// State of the CRLF-normalizing input layer.
//
// fillInputBuffer() reads raw bytes from crlffile and appends them,
// with line endings normalized to CRLF, to the ring buffer crlfdata.
int crlffile = 0;            // file descriptor that fillInputBuffer() reads from
char crlfdata[4096];         // ring buffer; always indexed modulo 4096 (& 0xfff)
unsigned int crlftail = 0;   // running write position into crlfdata
unsigned int crlfhead = 0;   // NOTE(review): presumably the read position used
                             // by crlfGetChar(); not referenced in this file
unsigned int crlfoffset = 0; // read by the parser as the current offset in the
                             // normalized stream; maintained outside this file
char lastchar = '\0';        // last raw byte processed by fillInputBuffer()

//------------------------------------------------------------------------
// Reads one chunk of raw data from crlffile and appends it to crlfdata,
// converting line endings on the fly:
//
//   - a lone LF becomes CRLF,
//   - a CR is held back (remembered in lastchar) until the next byte
//     shows whether it is part of a CRLF pair; a CR followed by
//     anything but LF is emitted as CRLF,
//   - every other byte is copied unchanged.
//
// Returns true if at least one byte was read, false on end of input or
// on a read error. A read that is interrupted by a signal (EINTR) is
// retried instead of being reported as end of input.
bool fillInputBuffer(void)
{
  char raw[1024];

  ssize_t nbytes;
  for (;;) {
    nbytes = read(crlffile, raw, sizeof(raw) - 1);
    if (nbytes > 0)
      break;

    // An interrupted read has transferred nothing and is not an end of
    // file condition; try again.
    if (nbytes < 0 && errno == EINTR)
      continue;

    // FIXME: If nbytes < 0 this is a real read error and should be
    // logged.
    return false;
  }

  for (ssize_t i = 0; i < nbytes; ++i) {
    const char c = raw[i];
    switch (c) {
    case '\r':
      // Two CRs in a row: the first one was a bare CR, flush it as
      // CRLF. The current CR stays pending.
      if (lastchar == '\r') {
        crlfdata[crlftail++ & 0xfff] = '\r';
        crlfdata[crlftail++ & 0xfff] = '\n';
      }
      break;
    case '\n':
      // LF, with or without a preceding CR, yields exactly one CRLF.
      crlfdata[crlftail++ & 0xfff] = '\r';
      crlfdata[crlftail++ & 0xfff] = '\n';
      break;
    default:
      // A pending bare CR is flushed as CRLF before the ordinary byte.
      if (lastchar == '\r') {
        crlfdata[crlftail++ & 0xfff] = '\r';
        crlfdata[crlftail++ & 0xfff] = '\n';
      }

      crlfdata[crlftail++ & 0xfff] = c;
      break;
    }

    lastchar = c;
  }

  return true;
}
105 
106 
107 //------------------------------------------------------------------------
parseFull(int fd) const108 void Binc::MimeDocument::parseFull(int fd) const
109 {
110   if (allIsParsed)
111     return;
112 
113   allIsParsed = true;
114 
115   crlffile = fd;
116   crlfReset();
117 
118   headerstartoffsetcrlf = 0;
119   headerlength = 0;
120   bodystartoffsetcrlf = 0;
121   bodylength = 0;
122   size = 0;
123   messagerfc822 = false;
124   multipart = false;
125 
126   int bsize = 0;
127   MimePart::parseFull("", bsize);
128 
129   // eat any trailing junk to get the correct size
130   char c;
131   while (crlfGetChar(c));
132 
133   size = crlfoffset;
134 }
135 
136 //------------------------------------------------------------------------
parseFull(const string & toboundary,int & boundarysize) const137 int Binc::MimePart::parseFull(const string &toboundary, int &boundarysize) const
138 {
139   string name;
140   string content;
141   char cqueue[4];
142   memset(cqueue, 0, sizeof(cqueue));
143 
144   bool quit = false;
145   char c;
146   bool eof = false;
147 
148   headerstartoffsetcrlf = crlfoffset;
149 
150   while (!quit && !eof) {
151     // read name
152     while (1) {
153       if (!crlfGetChar(c)) {
154 	eof = true;
155 	break;
156       }
157 
158       if (c == '\n') ++nlines;
159       if (c == ':') break;
160       if (c == '\n') {
161 	// If we encounter a \n before we got to the first ':', then
162 	// if the line is not empty, rewind back to the start of the
163 	// line and assume we're at the start of the body. If not,
164 	// just skip the line and assume we're at the start of the
165 	// body.
166 	string ntmp = name;
167 	trim(ntmp);
168 	if (ntmp != "")
169 	  for (int i = name.length() - 1; i >= 0; --i)
170 	    crlfUnGetChar();
171 
172 	quit = true;
173 	name = "";
174 	break;
175       }
176 
177       name += c;
178 
179       if (name.length() == 2 && name.substr(0, 2) == "\r\n") {
180 	name = "";
181 	quit = true;
182 	break;
183       }
184     }
185 
186     if (name.length() == 1 && name[0] == '\r') {
187       name = "";
188       break;
189     }
190 
191     if (quit || eof) break;
192 
193     while (!quit) {
194       if (!crlfGetChar(c)) {
195 	quit = true;
196 	break;
197       }
198 
199       if (c == '\n') ++nlines;
200 
201       for (int i = 0; i < 3; ++i)
202 	cqueue[i] = cqueue[i + 1];
203       cqueue[3] = c;
204 
205       if (strncmp(cqueue, "\r\n\r\n", 4) == 0) {
206 	quit = true;
207 	break;
208       }
209 
210       if (cqueue[2] == '\n') {
211 
212 	// guess the mime rfc says what can not appear on the beginning
213 	// of a line.
214 	if (!isspace(cqueue[3])) {
215 	  if (content.length() > 2)
216 	    content.resize(content.length() - 2);
217 
218 	  trim(content);
219 	  h.add(name, content);
220 
221 	  name = c;
222 	  content = "";
223 	  break;
224 	}
225       }
226 
227       content += c;
228     }
229   }
230 
231   if (name != "") {
232     if (content.length() > 2)
233       content.resize(content.length() - 2);
234     h.add(name, content);
235   }
236 
237   // Headerlength includes the seperating CRLF. Body starts after the
238   // CRLF.
239   headerlength = crlfoffset - headerstartoffsetcrlf;
240   bodystartoffsetcrlf = crlfoffset;
241   bodylength = 0;
242 
243   // If we encounter the end of file, we return 1 as if we found our
244   // parent's terminal boundary. This will cause a safe exit, and
245   // whatever we parsed until now will be available.
246   if (eof)
247     return 1;
248 
249   // Do simple parsing of headers to determine the
250   // type of message (multipart,messagerfc822 etc)
251   HeaderItem ctype;
252   if (h.getFirstHeader("content-type", ctype)) {
253     vector<string> types;
254     split(ctype.getValue(), ";", types);
255 
256     if (types.size() > 0) {
257       // first element should describe content type
258       string tmp = types[0];
259       trim(tmp);
260       vector<string> v;
261       split(tmp, "/", v);
262       string key, value;
263 
264       key = (v.size() > 0) ? v[0] : "text";
265       value = (v.size() > 1) ? v[1] : "plain";
266       lowercase(key);
267 
268       if (key == "multipart") {
269 	multipart = true;
270 	lowercase(value);
271 	subtype = value;
272       } else if (key == "message") {
273 	lowercase(value);
274 	if (value == "rfc822")
275 	  messagerfc822 = true;
276       }
277     }
278 
279     for (vector<string>::const_iterator i = types.begin();
280 	 i != types.end(); ++i) {
281       string element = *i;
282       trim(element);
283 
284       if (element.find("=") != string::npos) {
285 	string::size_type pos = element.find('=');
286 	string key = element.substr(0, pos);
287 	string value = element.substr(pos + 1);
288 
289 	lowercase(key);
290 	trim(key);
291 
292 	if (key == "boundary") {
293 	  trim(value, " \"");
294 	  boundary = value;
295 	}
296       }
297     }
298   }
299 
300   bool foundendofpart = false;
301   if (messagerfc822) {
302     // message rfc822 means a completely enclosed mime document. we
303     // call the parser recursively, and pass on the boundary string
304     // that we got. when parse() finds this boundary, it returns 0. if
305     // it finds the end boundary (boundary + "--"), it returns != 0.
306     MimePart m;
307 
308     // parsefull returns the number of bytes that need to be removed
309     // from the body because of the terminating boundary string.
310     int bsize;
311     if (m.parseFull(toboundary, bsize))
312       foundendofpart = true;
313 
314     // make sure bodylength doesn't overflow
315     bodylength = crlfoffset;
316     if (bodylength >= bodystartoffsetcrlf) {
317       bodylength -= bodystartoffsetcrlf;
318       if (bodylength >= (unsigned int) bsize) {
319 	bodylength -= (unsigned int) bsize;
320       } else {
321 	bodylength = 0;
322       }
323     } else {
324       bodylength = 0;
325     }
326 
327     nbodylines += m.getNofLines();
328 
329     members.push_back(m);
330 
331   } else if (multipart) {
332     // multipart parsing starts with skipping to the first
333     // boundary. then we call parse() for all parts. the last parse()
334     // command will return a code indicating that it found the last
335     // boundary of this multipart. Note that the first boundary does
336     // not have to start with CRLF.
337     string delimiter = "--" + boundary;
338 
339     char *delimiterqueue = 0;
340     int endpos = delimiter.length();
341     delimiterqueue = new char[endpos];
342     int delimiterpos = 0;
343     bool eof = false;
344 
345     // first, skip to the first delimiter string. Anything between the
346     // header and the first delimiter string is simply ignored (it's
347     // usually a text message intended for non-mime clients)
348     do {
349       if (!crlfGetChar(c)) {
350 	eof = true;
351 	break;
352       }
353 
354       if (c == '\n')
355 	++nlines;
356 
357       delimiterqueue[delimiterpos++ % endpos] = c;
358 
359       // Fixme: Must also check for all parents' delimiters.
360     } while (!compareStringToQueue(delimiter, delimiterqueue, delimiterpos, endpos));
361 
362     delete delimiterqueue;
363 
364     if (!eof)
365       boundarysize = delimiter.size();
366 
367     // Read two more characters. This may be CRLF, it may be "--" and
368     // it may be any other two characters.
369     char a;
370     if (!crlfGetChar(a))
371       eof = true;
372 
373     if (a == '\n')
374       ++nlines;
375 
376     char b;
377     if (!crlfGetChar(b))
378       eof = true;
379 
380     if (b == '\n')
381       ++nlines;
382 
383     // If we find two dashes after the boundary, then this is the end
384     // of boundary marker.
385     if (!eof) {
386       if (a == '-' && b == '-') {
387 	foundendofpart = true;
388 	boundarysize += 2;
389 
390 	if (!crlfGetChar(a))
391 	  eof = true;
392 
393 	if (a == '\n')
394 	  ++nlines;
395 
396 	if (!crlfGetChar(b))
397 	  eof = true;
398 
399 	if (b == '\n')
400 	  ++nlines;
401       }
402 
403       if (a == '\r' && b == '\n') {
404 	// This exception is to handle a special case where the
405 	// delimiter of one part is not followed by CRLF, but
406 	// immediately followed by a CRLF prefixed delimiter.
407 	if (!crlfGetChar(a) || !crlfGetChar(b))
408 	  eof = true;
409 	else if (a == '-' && b == '-') {
410 	  crlfUnGetChar();
411 	  crlfUnGetChar();
412 	  crlfUnGetChar();
413 	  crlfUnGetChar();
414 	} else {
415 	  crlfUnGetChar();
416 	  crlfUnGetChar();
417 	}
418 
419 	boundarysize += 2;
420       } else {
421 	crlfUnGetChar();
422 	crlfUnGetChar();
423       }
424     }
425 
426     // read all mime parts.
427     if (!foundendofpart && !eof) {
428       bool quit = false;
429       do {
430 	MimePart m;
431 
432 	// If parseFull returns != 0, then it encountered the multipart's
433 	// final boundary.
434 	int bsize = 0;
435 	if (m.parseFull(boundary, bsize)) {
436 	  quit = true;
437 	  boundarysize = bsize;
438 	}
439 
440 	members.push_back(m);
441 	nlines += m.getNofLines();
442 
443       } while (!quit);
444     }
445 
446     if (!foundendofpart && !eof) {
447       // multipart parsing starts with skipping to the first
448       // boundary. then we call parse() for all parts. the last parse()
449       // command will return a code indicating that it found the last
450       // boundary of this multipart. Note that the first boundary does
451       // not have to start with CRLF.
452       string delimiter = "\r\n--" + toboundary;
453 
454       char *delimiterqueue = 0;
455       int endpos = delimiter.length();
456       delimiterqueue = new char[endpos];
457       int delimiterpos = 0;
458       bool eof = false;
459 
460       // first, skip to the first delimiter string. Anything between the
461       // header and the first delimiter string is simply ignored (it's
462       // usually a text message intended for non-mime clients)
463       do {
464 	if (!crlfGetChar(c)) {
465 	  eof = true;
466 	  break;
467 	}
468 
469 	if (c == '\n')
470 	  ++nlines;
471 
472 	delimiterqueue[delimiterpos++ % endpos] = c;
473 
474 	// Fixme: Must also check for all parents' delimiters.
475       } while (!compareStringToQueue(delimiter, delimiterqueue, delimiterpos, endpos));
476 
477       delete delimiterqueue;
478 
479       if (!eof)
480 	boundarysize = delimiter.size();
481 
482       // Read two more characters. This may be CRLF, it may be "--" and
483       // it may be any other two characters.
484       char a;
485       if (!crlfGetChar(a))
486 	eof = true;
487 
488       if (a == '\n')
489 	++nlines;
490 
491       char b;
492       if (!crlfGetChar(b))
493 	eof = true;
494 
495       if (b == '\n')
496 	++nlines;
497 
498       // If we find two dashes after the boundary, then this is the end
499       // of boundary marker.
500       if (!eof) {
501 	if (a == '-' && b == '-') {
502 	  foundendofpart = true;
503 	  boundarysize += 2;
504 
505 	  if (!crlfGetChar(a))
506 	    eof = true;
507 
508 	  if (a == '\n')
509 	    ++nlines;
510 
511 	  if (!crlfGetChar(b))
512 	    eof = true;
513 
514 	  if (b == '\n')
515 	    ++nlines;
516 	}
517 
518 	if (a == '\r' && b == '\n') {
519 	  // This exception is to handle a special case where the
520 	  // delimiter of one part is not followed by CRLF, but
521 	  // immediately followed by a CRLF prefixed delimiter.
522 	  if (!crlfGetChar(a) || !crlfGetChar(b))
523 	    eof = true;
524 	  else if (a == '-' && b == '-') {
525 	    crlfUnGetChar();
526 	    crlfUnGetChar();
527 	    crlfUnGetChar();
528 	    crlfUnGetChar();
529 	  } else {
530 	    crlfUnGetChar();
531 	    crlfUnGetChar();
532 	  }
533 
534 	  boundarysize += 2;
535 	} else {
536 	  crlfUnGetChar();
537 	  crlfUnGetChar();
538 	}
539       }
540     }
541 
542     // make sure bodylength doesn't overflow
543     bodylength = crlfoffset;
544     if (bodylength >= bodystartoffsetcrlf) {
545       bodylength -= bodystartoffsetcrlf;
546       if (bodylength >= (unsigned int) boundarysize) {
547 	bodylength -= (unsigned int) boundarysize;
548       } else {
549 	bodylength = 0;
550       }
551     } else {
552       bodylength = 0;
553     }
554 
555   } else {
556     // If toboundary is empty, then we read until the end of the
557     // file. Otherwise we will read until we encounter toboundary.
558     string _toboundary;
559     if (toboundary != "") {
560       _toboundary = "\r\n--";
561       _toboundary += toboundary;
562     }
563 
564     char *boundaryqueue = 0;
565     int endpos = _toboundary.length();
566     if (toboundary != "")
567       boundaryqueue = new char[endpos];
568     int boundarypos = 0;
569 
570     boundarysize = 0;
571 
572     string line;
573     int nchars = 0;
574     while (crlfGetChar(c)) {
575       if (c == '\n') { ++nbodylines; ++nlines; }
576       nchars++;
577 
578       if (toboundary == "")
579 	continue;
580 
581       // find boundary
582       boundaryqueue[boundarypos++ % endpos] = c;
583 
584       if (compareStringToQueue(_toboundary, boundaryqueue, boundarypos, endpos)) {
585 	boundarysize = _toboundary.length();
586 	break;
587       }
588     }
589 
590     delete boundaryqueue;
591 
592     if (toboundary != "") {
593       char a;
594       if (!crlfGetChar(a))
595 	eof = true;
596 
597       if (a == '\n')
598 	++nlines;
599       char b;
600       if (!crlfGetChar(b))
601 	eof = true;
602 
603       if (b == '\n')
604 	++nlines;
605 
606       if (a == '-' && b == '-') {
607 	boundarysize += 2;
608 	foundendofpart = true;
609 	if (!crlfGetChar(a))
610 	  eof = true;
611 
612 	if (a == '\n')
613 	  ++nlines;
614 
615 	if (!crlfGetChar(b))
616 	  eof = true;
617 
618 	if (b == '\n')
619 	  ++nlines;
620       }
621 
622       if (a == '\r' && b == '\n') {
623 	// This exception is to handle a special case where the
624 	// delimiter of one part is not followed by CRLF, but
625 	// immediately followed by a CRLF prefixed delimiter.
626 	if (!crlfGetChar(a) || !crlfGetChar(b))
627 	  eof = true;
628 	else if (a == '-' && b == '-') {
629 	  crlfUnGetChar();
630 	  crlfUnGetChar();
631 	  crlfUnGetChar();
632 	  crlfUnGetChar();
633 	} else {
634 	  crlfUnGetChar();
635 	  crlfUnGetChar();
636 	}
637 
638 	boundarysize += 2;
639       } else {
640 	crlfUnGetChar();
641 	crlfUnGetChar();
642       }
643     }
644 
645     // make sure bodylength doesn't overflow
646     bodylength = crlfoffset;
647     if (bodylength >= bodystartoffsetcrlf) {
648       bodylength -= bodystartoffsetcrlf;
649       if (bodylength >= (unsigned int) boundarysize) {
650 	bodylength -= (unsigned int) boundarysize;
651       } else {
652 	bodylength = 0;
653       }
654     } else {
655       bodylength = 0;
656     }
657   }
658 
659   return (eof || foundendofpart) ? 1 : 0;
660 }
661