1  /* -*- mode:c++;c-basic-offset:2 -*- */
2 /*  --------------------------------------------------------------------
3  *  Filename:
4  *    mime-parsefull.cc
5  *
6  *  Description:
7  *    Implementation of main mime parser components
8  *  --------------------------------------------------------------------
9  *  Copyright 2002-2005 Andreas Aardal Hanssen
10  *
11  *  This program is free software; you can redistribute it and/or modify
12  *  it under the terms of the GNU General Public License as published by
13  *  the Free Software Foundation; either version 2 of the License, or
14  *  (at your option) any later version.
15  *
16  *  This program is distributed in the hope that it will be useful,
17  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
18  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  *  GNU General Public License for more details.
20  *
21  *  You should have received a copy of the GNU General Public License
22  *  along with this program; if not, write to the Free Software
23  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24  *  --------------------------------------------------------------------
25  */
26 #include "autoconfig.h"
27 
28 #include <string.h>
29 #include <ctype.h>
30 #include <stdio.h>
31 #include <errno.h>
32 
33 #include <string>
34 #include <vector>
35 #include <map>
36 #include <exception>
37 #include <iostream>
38 
39 #include "mime.h"
40 #include "mime-utils.h"
41 #include "mime-inputsource.h"
42 #include "convert.h"
43 
44 using namespace std;
45 
46 
47 // #define MPF
48 #ifdef MPF
49 #define MPFDEB(X) fprintf X
50 #else
51 #define MPFDEB(X)
52 #endif
53 
54 //------------------------------------------------------------------------
parseFull(int fd)55 void Binc::MimeDocument::parseFull(int fd)
56 {
57   if (allIsParsed)
58     return;
59 
60   allIsParsed = true;
61 
62   delete doc_mimeSource;
63   doc_mimeSource = new MimeInputSource(fd);
64 
65   headerstartoffsetcrlf = 0;
66   headerlength = 0;
67   bodystartoffsetcrlf = 0;
68   bodylength = 0;
69   size = 0;
70   messagerfc822 = false;
71   multipart = false;
72 
73   int bsize = 0;
74   string bound;
75   doParseFull(doc_mimeSource, bound, bsize);
76 
77   // eat any trailing junk to get the correct size
78   char c;
79   while (doc_mimeSource->getChar(&c));
80 
81   size = doc_mimeSource->getOffset();
82 }
83 
parseFull(istream & s)84 void Binc::MimeDocument::parseFull(istream& s)
85 {
86   if (allIsParsed)
87     return;
88 
89   allIsParsed = true;
90 
91   delete doc_mimeSource;
92   doc_mimeSource = new MimeInputSourceStream(s);
93 
94   headerstartoffsetcrlf = 0;
95   headerlength = 0;
96   bodystartoffsetcrlf = 0;
97   bodylength = 0;
98   size = 0;
99   messagerfc822 = false;
100   multipart = false;
101 
102   int bsize = 0;
103   string bound;
104   doParseFull(doc_mimeSource, bound, bsize);
105 
106   // eat any trailing junk to get the correct size
107   char c;
108   while (doc_mimeSource->getChar(&c));
109 
110   size = doc_mimeSource->getOffset();
111 }
112 
113 //------------------------------------------------------------------------
parseOneHeaderLine(Binc::Header * header,unsigned int * nlines)114 bool Binc::MimePart::parseOneHeaderLine(Binc::Header *header,
115                     unsigned int *nlines)
116 {
117   using namespace ::Binc;
118   char c;
119   bool eof = false;
120   char cqueue[4];
121   string name;
122   string content;
123 
124   while (mimeSource->getChar(&c)) {
125     // If we encounter a \r before we got to the first ':', then
126     // rewind back to the start of the line and assume we're at the
127     // start of the body.
128     if (c == '\r') {
129       for (int i = 0; i < (int) name.length() + 1; ++i)
130     mimeSource->ungetChar();
131       return false;
132     }
133 
134     // A colon marks the end of the header name
135     if (c == ':') break;
136 
137     // Otherwise add to the header name
138     name += c;
139   }
140 
141   cqueue[0] = '\0';
142   cqueue[1] = '\0';
143   cqueue[2] = '\0';
144   cqueue[3] = '\0';
145 
146   // Read until the end of the header.
147   bool endOfHeaders = false;
148   while (!endOfHeaders) {
149     if (!mimeSource->getChar(&c)) {
150       eof = true;
151       break;
152     }
153 
154     if (c == '\n') ++*nlines;
155 
156     for (int i = 0; i < 3; ++i)
157       cqueue[i] = cqueue[i + 1];
158     cqueue[3] = c;
159 
160     if (strncmp(cqueue, "\r\n\r\n", 4) == 0) {
161       endOfHeaders = true;
162       break;
163     }
164 
165     // If the last character was a newline, and the first now is not
166     // whitespace, then rewind one character and store the current
167     // key,value pair.
168     if (cqueue[2] == '\n' && c != ' ' && c != '\t') {
169       if (content.length() > 2)
170     content.resize(content.length() - 2);
171 
172       trim(content);
173       header->add(name, content);
174 
175       if (c != '\r') {
176     mimeSource->ungetChar();
177     if (c == '\n') --*nlines;
178     return true;
179       }
180 
181       mimeSource->getChar(&c);
182       return false;
183     }
184 
185     content += c;
186   }
187 
188   if (name != "") {
189     if (content.length() > 2)
190       content.resize(content.length() - 2);
191     header->add(name, content);
192   }
193 
194   return !(eof || endOfHeaders);
195 }
196 
197 //------------------------------------------------------------------------
parseHeader(Binc::Header * header,unsigned int * nlines)198 void Binc::MimePart::parseHeader(Binc::Header *header, unsigned int *nlines)
199 {
200   while (parseOneHeaderLine(header, nlines))
201   { }
202 }
203 
204 //------------------------------------------------------------------------
analyzeHeader(Binc::Header * header,bool * multipart,bool * messagerfc822,string * subtype,string * boundary)205 void Binc::MimePart::analyzeHeader(Binc::Header *header, bool *multipart,
206                    bool *messagerfc822, string *subtype,
207                    string *boundary)
208 {
209   using namespace ::Binc;
210 
211   // Do simple parsing of headers to determine the
212   // type of message (multipart,messagerfc822 etc)
213   HeaderItem ctype;
214   if (header->getFirstHeader("content-type", ctype)) {
215     vector<string> types;
216     split(ctype.getValue(), ";", types);
217 
218     if (types.size() > 0) {
219       // first element should describe content type
220       string tmp = types[0];
221       trim(tmp);
222       vector<string> v;
223       split(tmp, "/", v);
224       string key, value;
225 
226       key = (v.size() > 0) ? v[0] : "text";
227       value = (v.size() > 1) ? v[1] : "plain";
228       lowercase(key);
229 
230       if (key == "multipart") {
231     *multipart = true;
232     lowercase(value);
233     *subtype = value;
234       } else if (key == "message") {
235     lowercase(value);
236     if (value == "rfc822")
237       *messagerfc822 = true;
238       }
239     }
240 
241     for (vector<string>::const_iterator i = types.begin();
242      i != types.end(); ++i) {
243       string element = *i;
244       trim(element);
245 
246       if (element.find("=") != string::npos) {
247     string::size_type pos = element.find('=');
248     string key = element.substr(0, pos);
249     string value = element.substr(pos + 1);
250 
251     lowercase(key);
252     trim(key);
253 
254     if (key == "boundary") {
255       trim(value, " \"");
256       *boundary = value;
257     }
258       }
259     }
260   }
261 }
262 
parseMessageRFC822(vector<Binc::MimePart> * members,bool * foundendofpart,unsigned int * bodylength,unsigned int * nbodylines,const string & toboundary)263 void Binc::MimePart::parseMessageRFC822(vector<Binc::MimePart> *members,
264                     bool *foundendofpart,
265                     unsigned int *bodylength,
266                     unsigned int *nbodylines,
267                     const string &toboundary)
268 {
269   using namespace ::Binc;
270 
271   // message rfc822 means a completely enclosed mime document. we
272   // call the parser recursively, and pass on the boundary string
273   // that we got. when parse() finds this boundary, it returns 0. if
274   // it finds the end boundary (boundary + "--"), it returns != 0.
275   MimePart m;
276 
277   unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
278 
279   // parsefull returns the number of bytes that need to be removed
280   // from the body because of the terminating boundary string.
281   int bsize = 0;
282   if (m.doParseFull(mimeSource, toboundary, bsize))
283     *foundendofpart = true;
284 
285   // make sure bodylength doesn't overflow
286   *bodylength = mimeSource->getOffset();
287   if (*bodylength >= bodystartoffsetcrlf) {
288     *bodylength -= bodystartoffsetcrlf;
289     if (*bodylength >= (unsigned int) bsize) {
290       *bodylength -= (unsigned int) bsize;
291     } else {
292       *bodylength = 0;
293     }
294   } else {
295     *bodylength = 0;
296   }
297 
298   *nbodylines += m.getNofLines();
299 
300   members->push_back(m);
301 }
302 
skipUntilBoundary(const string & delimiter,unsigned int * nlines,bool * eof)303 bool Binc::MimePart::skipUntilBoundary(const string &delimiter,
304                        unsigned int *nlines, bool *eof)
305 {
306   string::size_type endpos = delimiter.length();
307   char *delimiterqueue = 0;
308   string::size_type delimiterpos = 0;
309   const char *delimiterStr = delimiter.c_str();
310   if (delimiter != "") {
311     delimiterqueue = new char[endpos];
312     memset(delimiterqueue, 0, endpos);
313   }
314 
315   // first, skip to the first delimiter string. Anything between the
316   // header and the first delimiter string is simply ignored (it's
317   // usually a text message intended for non-mime clients)
318   char c;
319 
320   bool foundBoundary = false;
321   for (;;) {
322     if (!mimeSource->getChar(&c)) {
323       *eof = true;
324       break;
325     }
326 
327     if (c == '\n')
328       ++*nlines;
329 
330     // if there is no delimiter, we just read until the end of the
331     // file.
332     if (!delimiterqueue)
333       continue;
334 
335     delimiterqueue[delimiterpos++] = c;
336     if (delimiterpos ==  endpos)
337       delimiterpos = 0;
338 
339     if (compareStringToQueue(delimiterStr, delimiterqueue,
340                  delimiterpos, int(endpos))) {
341       foundBoundary = true;
342       break;
343     }
344   }
345 
346   delete [] delimiterqueue;
347   delimiterqueue = 0;
348 
349   return foundBoundary;
350 }
351 
352 // JFD: Things we do after finding a boundary (something like CRLF--somestring)
353 // Need to see if this is a final one (with an additional -- at the end),
354 // and need to check if it is immediately followed by another boundary
355 // (in this case, we give up our final CRLF in its favour)
postBoundaryProcessing(bool * eof,unsigned int * nlines,int * boundarysize,bool * foundendofpart)356 inline void Binc::MimePart::postBoundaryProcessing(bool *eof,
357                            unsigned int *nlines,
358                            int *boundarysize,
359                            bool *foundendofpart)
360 {
361     // Read two more characters. This may be CRLF, it may be "--" and
362     // it may be any other two characters.
363     char a = '\0';
364     if (!mimeSource->getChar(&a))
365       *eof = true;
366     if (a == '\n')
367       ++*nlines;
368 
369     char b = '\0';
370     if (!mimeSource->getChar(&b))
371       *eof = true;
372     if (b == '\n')
373       ++*nlines;
374 
375     // If eof, we're done here
376     if (*eof)
377       return;
378 
379     // If we find two dashes after the boundary, then this is the end
380     // of boundary marker, and we need to get 2 more chars
381     if (a == '-' && b == '-') {
382       *foundendofpart = true;
383       *boundarysize += 2;
384 
385       if (!mimeSource->getChar(&a))
386     *eof = true;
387       if (a == '\n')
388     ++*nlines;
389 
390       if (!mimeSource->getChar(&b))
391     *eof = true;
392       if (b == '\n')
393     ++*nlines;
394     }
395 
396     // If the boundary is followed by CRLF, we need to handle the
397     // special case where another boundary line follows
398     // immediately. In this case we consider the CRLF to be part of
399     // the NEXT boundary.
400     if (a == '\r' && b == '\n') {
401       // Get 2 more
402       if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b)) {
403     *eof = true;
404       } else if (a == '-' && b == '-') {
405     MPFDEB((stderr, "BINC: consecutive delimiters, giving up CRLF\n"));
406     mimeSource->ungetChar();
407     mimeSource->ungetChar();
408     mimeSource->ungetChar();
409     mimeSource->ungetChar();
410       } else {
411     // We unget the 2 chars, and keep our crlf (increasing our own size)
412     MPFDEB((stderr, "BINC: keeping my CRLF\n"));
413     mimeSource->ungetChar();
414     mimeSource->ungetChar();
415     *boundarysize += 2;
416       }
417 
418     } else {
419       // Boundary string not followed by CRLF, don't read more and let
420       // others skip the rest. Note that this is allowed but quite uncommon
421       mimeSource->ungetChar();
422       mimeSource->ungetChar();
423     }
424 }
425 
parseMultipart(const string & boundary,const string & toboundary,bool * eof,unsigned int * nlines,int * boundarysize,bool * foundendofpart,unsigned int * bodylength,vector<Binc::MimePart> * members)426 void Binc::MimePart::parseMultipart(const string &boundary,
427                     const string &toboundary,
428                     bool *eof,
429                     unsigned int *nlines,
430                     int *boundarysize,
431                     bool *foundendofpart,
432                     unsigned int *bodylength,
433                     vector<Binc::MimePart> *members)
434 {
435   MPFDEB((stderr, "BINC: ParseMultipart: boundary [%s], toboundary[%s]\n",
436       boundary.c_str(),
437       toboundary.c_str()));
438   using namespace ::Binc;
439   unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
440 
441   // multipart parsing starts with skipping to the first
442   // boundary. then we call parse() for all parts. the last parse()
443   // command will return a code indicating that it found the last
444   // boundary of this multipart. Note that the first boundary does
445   // not have to start with CRLF.
446   string delimiter = "--" + boundary;
447 
448   skipUntilBoundary(delimiter, nlines, eof);
449 
450   if (!eof)
451     *boundarysize = int(delimiter.size());
452 
453   postBoundaryProcessing(eof, nlines, boundarysize, foundendofpart);
454 
455   // read all mime parts.
456   if (!*foundendofpart && !*eof) {
457     bool quit = false;
458     do {
459       MimePart m;
460 
461       // If parseFull returns != 0, then it encountered the multipart's
462       // final boundary.
463       int bsize = 0;
464       if (m.doParseFull(mimeSource, boundary, bsize)) {
465     quit = true;
466     *boundarysize = bsize;
467       }
468 
469       members->push_back(m);
470 
471     } while (!quit);
472   }
473 
474   if (!*foundendofpart && !*eof) {
475     // multipart parsing starts with skipping to the first
476     // boundary. then we call parse() for all parts. the last parse()
477     // command will return a code indicating that it found the last
478     // boundary of this multipart. Note that the first boundary does
479     // not have to start with CRLF.
480     string delimiter = "\r\n--" + toboundary;
481     skipUntilBoundary(delimiter, nlines, eof);
482 
483     if (!*eof)
484       *boundarysize = int(delimiter.size());
485 
486     postBoundaryProcessing(eof, nlines, boundarysize, foundendofpart);
487   }
488 
489   // make sure bodylength doesn't overflow
490   *bodylength = mimeSource->getOffset();
491   if (*bodylength >= bodystartoffsetcrlf) {
492     *bodylength -= bodystartoffsetcrlf;
493     if (*bodylength >= (unsigned int) *boundarysize) {
494       *bodylength -= (unsigned int) *boundarysize;
495     } else {
496       *bodylength = 0;
497     }
498   } else {
499     *bodylength = 0;
500   }
501   MPFDEB((stderr, "BINC: ParseMultipart return\n"));
502 }
503 
parseSinglePart(const string & toboundary,int * boundarysize,unsigned int * nbodylines,unsigned int * nlines,bool * eof,bool * foundendofpart,unsigned int * bodylength)504 void Binc::MimePart::parseSinglePart(const string &toboundary,
505                 int *boundarysize,
506                 unsigned int *nbodylines,
507                 unsigned int *nlines,
508                 bool *eof, bool *foundendofpart,
509                 unsigned int *bodylength)
510 {
511   MPFDEB((stderr, "BINC: parseSinglePart, boundary [%s]\n",
512       toboundary.c_str()));
513   using namespace ::Binc;
514   unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
515 
516   // If toboundary is empty, then we read until the end of the
517   // file. Otherwise we will read until we encounter toboundary.
518   string _toboundary;
519   if (toboundary != "") {
520     _toboundary = "\r\n--";
521     _toboundary += toboundary;
522   }
523 
524   //  if (skipUntilBoundary(_toboundary, nlines, eof))
525   //    *boundarysize = _toboundary.length();
526 
527   char *boundaryqueue = 0;
528   size_t endpos = _toboundary.length();
529   if (toboundary != "") {
530     boundaryqueue = new char[endpos];
531     memset(boundaryqueue, 0, endpos);
532   }
533 
534   *boundarysize = 0;
535 
536   const char *_toboundaryStr = _toboundary.c_str();
537   string line;
538   bool toboundaryIsEmpty = (toboundary == "");
539   char c;
540   string::size_type boundarypos = 0;
541   while (mimeSource->getChar(&c)) {
542     if (c == '\n') { ++*nbodylines; ++*nlines; }
543 
544     if (toboundaryIsEmpty)
545       continue;
546 
547     // find boundary
548     boundaryqueue[boundarypos++] = c;
549     if (boundarypos == endpos)
550       boundarypos = 0;
551 
552     if (compareStringToQueue(_toboundaryStr, boundaryqueue,
553                  boundarypos, int(endpos))) {
554       *boundarysize = static_cast<int>(_toboundary.length());
555       break;
556     }
557   }
558 
559   delete [] boundaryqueue;
560 
561   if (toboundary != "") {
562     postBoundaryProcessing(eof, nlines, boundarysize, foundendofpart);
563   } else {
564     // Recoll: in the case of a multipart body with a null
565     // boundary (probably illegal but wtf), eof was not set and
566     // multipart went into a loop until bad alloc.
567     *eof = true;
568   }
569 
570   // make sure bodylength doesn't overflow
571   *bodylength = mimeSource->getOffset();
572   if (*bodylength >= bodystartoffsetcrlf) {
573     *bodylength -= bodystartoffsetcrlf;
574     if (*bodylength >= (unsigned int) *boundarysize) {
575       *bodylength -= (unsigned int) *boundarysize;
576     } else {
577       *bodylength = 0;
578     }
579   } else {
580     *bodylength = 0;
581   }
582   MPFDEB((stderr, "BINC: parseSimple ret: bodylength %d, boundarysize %d\n",
583       *bodylength, *boundarysize));
584 }
585 
586 //------------------------------------------------------------------------
doParseFull(MimeInputSource * ms,const string & toboundary,int & boundarysize)587 int Binc::MimePart::doParseFull(MimeInputSource *ms, const string &toboundary,
588                 int &boundarysize)
589 {
590   MPFDEB((stderr, "BINC: doParsefull, toboundary[%s]\n", toboundary.c_str()));
591   mimeSource = ms;
592   headerstartoffsetcrlf = mimeSource->getOffset();
593 
594   // Parse the header of this mime part.
595   parseHeader(&h, &nlines);
596 
597   // Headerlength includes the seperating CRLF. Body starts after the
598   // CRLF.
599   headerlength = mimeSource->getOffset() - headerstartoffsetcrlf;
600   bodystartoffsetcrlf = mimeSource->getOffset();
601   MPFDEB((stderr, "BINC: doParsefull, bodystartoffsetcrlf %d\n", bodystartoffsetcrlf));
602   bodylength = 0;
603 
604   // Determine the type of mime part by looking at fields in the
605   // header.
606   analyzeHeader(&h, &multipart, &messagerfc822, &subtype, &boundary);
607 
608   bool eof = false;
609   bool foundendofpart = false;
610 
611   if (messagerfc822) {
612     parseMessageRFC822(&members, &foundendofpart, &bodylength,
613                &nbodylines, toboundary);
614 
615   } else if (multipart) {
616     parseMultipart(boundary, toboundary, &eof, &nlines, &boundarysize,
617            &foundendofpart, &bodylength,
618            &members);
619   } else {
620     parseSinglePart(toboundary, &boundarysize, &nbodylines, &nlines,
621             &eof, &foundendofpart, &bodylength);
622   }
623 
624   MPFDEB((stderr, "BINC: doParsefull ret, toboundary[%s]\n", toboundary.c_str()));
625   return (eof || foundendofpart) ? 1 : 0;
626 }
627