1 /* -*- mode:c++;c-basic-offset:2 -*- */
2 /* --------------------------------------------------------------------
3 * Filename:
4 * mime-parsefull.cc
5 *
6 * Description:
7 * Implementation of main mime parser components
8 * --------------------------------------------------------------------
9 * Copyright 2002-2005 Andreas Aardal Hanssen
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24 * --------------------------------------------------------------------
25 */
26 #include "autoconfig.h"
27
28 #include <string.h>
29 #include <ctype.h>
30 #include <stdio.h>
31 #include <errno.h>
32
33 #include <string>
34 #include <vector>
35 #include <map>
36 #include <exception>
37 #include <iostream>
38
39 #include "mime.h"
40 #include "mime-utils.h"
41 #include "mime-inputsource.h"
42 #include "convert.h"
43
44 using namespace std;
45
46
47 // #define MPF
48 #ifdef MPF
49 #define MPFDEB(X) fprintf X
50 #else
51 #define MPFDEB(X)
52 #endif
53
54 //------------------------------------------------------------------------
parseFull(int fd)55 void Binc::MimeDocument::parseFull(int fd)
56 {
57 if (allIsParsed)
58 return;
59
60 allIsParsed = true;
61
62 delete doc_mimeSource;
63 doc_mimeSource = new MimeInputSource(fd);
64
65 headerstartoffsetcrlf = 0;
66 headerlength = 0;
67 bodystartoffsetcrlf = 0;
68 bodylength = 0;
69 size = 0;
70 messagerfc822 = false;
71 multipart = false;
72
73 int bsize = 0;
74 string bound;
75 doParseFull(doc_mimeSource, bound, bsize);
76
77 // eat any trailing junk to get the correct size
78 char c;
79 while (doc_mimeSource->getChar(&c));
80
81 size = doc_mimeSource->getOffset();
82 }
83
parseFull(istream & s)84 void Binc::MimeDocument::parseFull(istream& s)
85 {
86 if (allIsParsed)
87 return;
88
89 allIsParsed = true;
90
91 delete doc_mimeSource;
92 doc_mimeSource = new MimeInputSourceStream(s);
93
94 headerstartoffsetcrlf = 0;
95 headerlength = 0;
96 bodystartoffsetcrlf = 0;
97 bodylength = 0;
98 size = 0;
99 messagerfc822 = false;
100 multipart = false;
101
102 int bsize = 0;
103 string bound;
104 doParseFull(doc_mimeSource, bound, bsize);
105
106 // eat any trailing junk to get the correct size
107 char c;
108 while (doc_mimeSource->getChar(&c));
109
110 size = doc_mimeSource->getOffset();
111 }
112
113 //------------------------------------------------------------------------
// Read one header field from mimeSource and store it in 'header'.
//
// The field name is everything up to the first ':'. The value is
// everything after it up to a line break that is not followed by a
// space or tab (i.e. continuation lines are kept as part of the value).
//
// header: receives the field through header->add(name, content).
// nlines: incremented for each '\n' read while scanning the value;
//         decremented again if a '\n' is pushed back.
//
// Returns true only when a field was stored and the character that
// follows it is not '\r' (that character is pushed back so the next
// call starts on it). Returns false when a '\r' is met before any ':',
// when the field is followed by a line starting with '\r', or when
// the input ends.
bool Binc::MimePart::parseOneHeaderLine(Binc::Header *header,
                                        unsigned int *nlines)
{
  using namespace ::Binc;
  char c;
  bool eof = false;
  // Window over the last four characters read while scanning the value.
  char cqueue[4];
  string name;
  string content;

  while (mimeSource->getChar(&c)) {
    // If we encounter a \r before we got to the first ':', then
    // rewind back to the start of the line and assume we're at the
    // start of the body.
    if (c == '\r') {
      // Push back the '\r' plus every name character read so far.
      for (int i = 0; i < (int) name.length() + 1; ++i)
        mimeSource->ungetChar();
      return false;
    }

    // A colon marks the end of the header name
    if (c == ':') break;

    // Otherwise add to the header name
    name += c;
  }

  cqueue[0] = '\0';
  cqueue[1] = '\0';
  cqueue[2] = '\0';
  cqueue[3] = '\0';

  // Read until the end of the header.
  bool endOfHeaders = false;
  while (!endOfHeaders) {
    if (!mimeSource->getChar(&c)) {
      eof = true;
      break;
    }

    if (c == '\n') ++*nlines;

    // Shift the window left by one and append the new character.
    for (int i = 0; i < 3; ++i)
      cqueue[i] = cqueue[i + 1];
    cqueue[3] = c;

    // An empty line (CRLF CRLF) terminates the header section.
    if (strncmp(cqueue, "\r\n\r\n", 4) == 0) {
      endOfHeaders = true;
      break;
    }

    // If the last character was a newline, and the first now is not
    // whitespace, then rewind one character and store the current
    // key,value pair.
    if (cqueue[2] == '\n' && c != ' ' && c != '\t') {
      // Drop the last two characters of the value (the line break that
      // ended the field) before trimming.
      if (content.length() > 2)
        content.resize(content.length() - 2);

      trim(content);
      header->add(name, content);

      if (c != '\r') {
        // c is the first character of the next line: give it back, and
        // undo the line count if it happened to be a '\n'.
        mimeSource->ungetChar();
        if (c == '\n') --*nlines;
        return true;
      }

      // c is '\r': read one more character (not checked, and not
      // counted in *nlines) and report that no more fields follow.
      mimeSource->getChar(&c);
      return false;
    }

    content += c;
  }

  // The loop above only exits here on end of input or on CRLF CRLF.
  // Store what was collected; note that the value is not trimmed on
  // this path.
  if (name != "") {
    if (content.length() > 2)
      content.resize(content.length() - 2);
    header->add(name, content);
  }

  // Either eof or endOfHeaders is set at this point, so this is false.
  return !(eof || endOfHeaders);
}
196
197 //------------------------------------------------------------------------
parseHeader(Binc::Header * header,unsigned int * nlines)198 void Binc::MimePart::parseHeader(Binc::Header *header, unsigned int *nlines)
199 {
200 while (parseOneHeaderLine(header, nlines))
201 { }
202 }
203
204 //------------------------------------------------------------------------
analyzeHeader(Binc::Header * header,bool * multipart,bool * messagerfc822,string * subtype,string * boundary)205 void Binc::MimePart::analyzeHeader(Binc::Header *header, bool *multipart,
206 bool *messagerfc822, string *subtype,
207 string *boundary)
208 {
209 using namespace ::Binc;
210
211 // Do simple parsing of headers to determine the
212 // type of message (multipart,messagerfc822 etc)
213 HeaderItem ctype;
214 if (header->getFirstHeader("content-type", ctype)) {
215 vector<string> types;
216 split(ctype.getValue(), ";", types);
217
218 if (types.size() > 0) {
219 // first element should describe content type
220 string tmp = types[0];
221 trim(tmp);
222 vector<string> v;
223 split(tmp, "/", v);
224 string key, value;
225
226 key = (v.size() > 0) ? v[0] : "text";
227 value = (v.size() > 1) ? v[1] : "plain";
228 lowercase(key);
229
230 if (key == "multipart") {
231 *multipart = true;
232 lowercase(value);
233 *subtype = value;
234 } else if (key == "message") {
235 lowercase(value);
236 if (value == "rfc822")
237 *messagerfc822 = true;
238 }
239 }
240
241 for (vector<string>::const_iterator i = types.begin();
242 i != types.end(); ++i) {
243 string element = *i;
244 trim(element);
245
246 if (element.find("=") != string::npos) {
247 string::size_type pos = element.find('=');
248 string key = element.substr(0, pos);
249 string value = element.substr(pos + 1);
250
251 lowercase(key);
252 trim(key);
253
254 if (key == "boundary") {
255 trim(value, " \"");
256 *boundary = value;
257 }
258 }
259 }
260 }
261 }
262
parseMessageRFC822(vector<Binc::MimePart> * members,bool * foundendofpart,unsigned int * bodylength,unsigned int * nbodylines,const string & toboundary)263 void Binc::MimePart::parseMessageRFC822(vector<Binc::MimePart> *members,
264 bool *foundendofpart,
265 unsigned int *bodylength,
266 unsigned int *nbodylines,
267 const string &toboundary)
268 {
269 using namespace ::Binc;
270
271 // message rfc822 means a completely enclosed mime document. we
272 // call the parser recursively, and pass on the boundary string
273 // that we got. when parse() finds this boundary, it returns 0. if
274 // it finds the end boundary (boundary + "--"), it returns != 0.
275 MimePart m;
276
277 unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
278
279 // parsefull returns the number of bytes that need to be removed
280 // from the body because of the terminating boundary string.
281 int bsize = 0;
282 if (m.doParseFull(mimeSource, toboundary, bsize))
283 *foundendofpart = true;
284
285 // make sure bodylength doesn't overflow
286 *bodylength = mimeSource->getOffset();
287 if (*bodylength >= bodystartoffsetcrlf) {
288 *bodylength -= bodystartoffsetcrlf;
289 if (*bodylength >= (unsigned int) bsize) {
290 *bodylength -= (unsigned int) bsize;
291 } else {
292 *bodylength = 0;
293 }
294 } else {
295 *bodylength = 0;
296 }
297
298 *nbodylines += m.getNofLines();
299
300 members->push_back(m);
301 }
302
skipUntilBoundary(const string & delimiter,unsigned int * nlines,bool * eof)303 bool Binc::MimePart::skipUntilBoundary(const string &delimiter,
304 unsigned int *nlines, bool *eof)
305 {
306 string::size_type endpos = delimiter.length();
307 char *delimiterqueue = 0;
308 string::size_type delimiterpos = 0;
309 const char *delimiterStr = delimiter.c_str();
310 if (delimiter != "") {
311 delimiterqueue = new char[endpos];
312 memset(delimiterqueue, 0, endpos);
313 }
314
315 // first, skip to the first delimiter string. Anything between the
316 // header and the first delimiter string is simply ignored (it's
317 // usually a text message intended for non-mime clients)
318 char c;
319
320 bool foundBoundary = false;
321 for (;;) {
322 if (!mimeSource->getChar(&c)) {
323 *eof = true;
324 break;
325 }
326
327 if (c == '\n')
328 ++*nlines;
329
330 // if there is no delimiter, we just read until the end of the
331 // file.
332 if (!delimiterqueue)
333 continue;
334
335 delimiterqueue[delimiterpos++] = c;
336 if (delimiterpos == endpos)
337 delimiterpos = 0;
338
339 if (compareStringToQueue(delimiterStr, delimiterqueue,
340 delimiterpos, int(endpos))) {
341 foundBoundary = true;
342 break;
343 }
344 }
345
346 delete [] delimiterqueue;
347 delimiterqueue = 0;
348
349 return foundBoundary;
350 }
351
// JFD: Things we do after finding a boundary (something like CRLF--somestring)
// Need to see if this is a final one (with an additional -- at the end),
// and need to check if it is immediately followed by another boundary
// (in this case, we give up our final CRLF in its favour)
//
// eof:            set to true whenever a read hits the end of input.
// nlines:         incremented for each '\n' among the characters read
//                 right after the boundary (and after a closing "--").
// boundarysize:   increased by 2 for a closing "--" and by 2 more when
//                 the following CRLF is kept as part of this boundary.
// foundendofpart: set to true when the boundary is followed by "--".
inline void Binc::MimePart::postBoundaryProcessing(bool *eof,
                                                   unsigned int *nlines,
                                                   int *boundarysize,
                                                   bool *foundendofpart)
{
  // Read two more characters. This may be CRLF, it may be "--" and
  // it may be any other two characters.
  char a = '\0';
  if (!mimeSource->getChar(&a))
    *eof = true;
  if (a == '\n')
    ++*nlines;

  char b = '\0';
  if (!mimeSource->getChar(&b))
    *eof = true;
  if (b == '\n')
    ++*nlines;

  // If eof, we're done here
  if (*eof)
    return;

  // If we find two dashes after the boundary, then this is the end
  // of boundary marker, and we need to get 2 more chars
  // NOTE(review): if one of the two reads below hits end of input,
  // *eof is set but execution still continues to the CRLF test with
  // whatever a and b hold at that point - confirm this is intended.
  if (a == '-' && b == '-') {
    *foundendofpart = true;
    *boundarysize += 2;

    if (!mimeSource->getChar(&a))
      *eof = true;
    if (a == '\n')
      ++*nlines;

    if (!mimeSource->getChar(&b))
      *eof = true;
    if (b == '\n')
      ++*nlines;
  }

  // If the boundary is followed by CRLF, we need to handle the
  // special case where another boundary line follows
  // immediately. In this case we consider the CRLF to be part of
  // the NEXT boundary.
  if (a == '\r' && b == '\n') {
    // Get 2 more
    if (!mimeSource->getChar(&a) || !mimeSource->getChar(&b)) {
      *eof = true;
    } else if (a == '-' && b == '-') {
      // Push back the "--" just read and the CRLF before it, so the
      // next boundary search starts on that CRLF.
      // NOTE(review): the '\n' of that CRLF was already counted in
      // *nlines above and is not un-counted here - confirm.
      MPFDEB((stderr, "BINC: consecutive delimiters, giving up CRLF\n"));
      mimeSource->ungetChar();
      mimeSource->ungetChar();
      mimeSource->ungetChar();
      mimeSource->ungetChar();
    } else {
      // We unget the 2 chars, and keep our crlf (increasing our own size)
      MPFDEB((stderr, "BINC: keeping my CRLF\n"));
      mimeSource->ungetChar();
      mimeSource->ungetChar();
      *boundarysize += 2;
    }

  } else {
    // Boundary string not followed by CRLF, don't read more and let
    // others skip the rest. Note that this is allowed but quite uncommon
    mimeSource->ungetChar();
    mimeSource->ungetChar();
  }
}
425
parseMultipart(const string & boundary,const string & toboundary,bool * eof,unsigned int * nlines,int * boundarysize,bool * foundendofpart,unsigned int * bodylength,vector<Binc::MimePart> * members)426 void Binc::MimePart::parseMultipart(const string &boundary,
427 const string &toboundary,
428 bool *eof,
429 unsigned int *nlines,
430 int *boundarysize,
431 bool *foundendofpart,
432 unsigned int *bodylength,
433 vector<Binc::MimePart> *members)
434 {
435 MPFDEB((stderr, "BINC: ParseMultipart: boundary [%s], toboundary[%s]\n",
436 boundary.c_str(),
437 toboundary.c_str()));
438 using namespace ::Binc;
439 unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
440
441 // multipart parsing starts with skipping to the first
442 // boundary. then we call parse() for all parts. the last parse()
443 // command will return a code indicating that it found the last
444 // boundary of this multipart. Note that the first boundary does
445 // not have to start with CRLF.
446 string delimiter = "--" + boundary;
447
448 skipUntilBoundary(delimiter, nlines, eof);
449
450 if (!eof)
451 *boundarysize = int(delimiter.size());
452
453 postBoundaryProcessing(eof, nlines, boundarysize, foundendofpart);
454
455 // read all mime parts.
456 if (!*foundendofpart && !*eof) {
457 bool quit = false;
458 do {
459 MimePart m;
460
461 // If parseFull returns != 0, then it encountered the multipart's
462 // final boundary.
463 int bsize = 0;
464 if (m.doParseFull(mimeSource, boundary, bsize)) {
465 quit = true;
466 *boundarysize = bsize;
467 }
468
469 members->push_back(m);
470
471 } while (!quit);
472 }
473
474 if (!*foundendofpart && !*eof) {
475 // multipart parsing starts with skipping to the first
476 // boundary. then we call parse() for all parts. the last parse()
477 // command will return a code indicating that it found the last
478 // boundary of this multipart. Note that the first boundary does
479 // not have to start with CRLF.
480 string delimiter = "\r\n--" + toboundary;
481 skipUntilBoundary(delimiter, nlines, eof);
482
483 if (!*eof)
484 *boundarysize = int(delimiter.size());
485
486 postBoundaryProcessing(eof, nlines, boundarysize, foundendofpart);
487 }
488
489 // make sure bodylength doesn't overflow
490 *bodylength = mimeSource->getOffset();
491 if (*bodylength >= bodystartoffsetcrlf) {
492 *bodylength -= bodystartoffsetcrlf;
493 if (*bodylength >= (unsigned int) *boundarysize) {
494 *bodylength -= (unsigned int) *boundarysize;
495 } else {
496 *bodylength = 0;
497 }
498 } else {
499 *bodylength = 0;
500 }
501 MPFDEB((stderr, "BINC: ParseMultipart return\n"));
502 }
503
parseSinglePart(const string & toboundary,int * boundarysize,unsigned int * nbodylines,unsigned int * nlines,bool * eof,bool * foundendofpart,unsigned int * bodylength)504 void Binc::MimePart::parseSinglePart(const string &toboundary,
505 int *boundarysize,
506 unsigned int *nbodylines,
507 unsigned int *nlines,
508 bool *eof, bool *foundendofpart,
509 unsigned int *bodylength)
510 {
511 MPFDEB((stderr, "BINC: parseSinglePart, boundary [%s]\n",
512 toboundary.c_str()));
513 using namespace ::Binc;
514 unsigned int bodystartoffsetcrlf = mimeSource->getOffset();
515
516 // If toboundary is empty, then we read until the end of the
517 // file. Otherwise we will read until we encounter toboundary.
518 string _toboundary;
519 if (toboundary != "") {
520 _toboundary = "\r\n--";
521 _toboundary += toboundary;
522 }
523
524 // if (skipUntilBoundary(_toboundary, nlines, eof))
525 // *boundarysize = _toboundary.length();
526
527 char *boundaryqueue = 0;
528 size_t endpos = _toboundary.length();
529 if (toboundary != "") {
530 boundaryqueue = new char[endpos];
531 memset(boundaryqueue, 0, endpos);
532 }
533
534 *boundarysize = 0;
535
536 const char *_toboundaryStr = _toboundary.c_str();
537 string line;
538 bool toboundaryIsEmpty = (toboundary == "");
539 char c;
540 string::size_type boundarypos = 0;
541 while (mimeSource->getChar(&c)) {
542 if (c == '\n') { ++*nbodylines; ++*nlines; }
543
544 if (toboundaryIsEmpty)
545 continue;
546
547 // find boundary
548 boundaryqueue[boundarypos++] = c;
549 if (boundarypos == endpos)
550 boundarypos = 0;
551
552 if (compareStringToQueue(_toboundaryStr, boundaryqueue,
553 boundarypos, int(endpos))) {
554 *boundarysize = static_cast<int>(_toboundary.length());
555 break;
556 }
557 }
558
559 delete [] boundaryqueue;
560
561 if (toboundary != "") {
562 postBoundaryProcessing(eof, nlines, boundarysize, foundendofpart);
563 } else {
564 // Recoll: in the case of a multipart body with a null
565 // boundary (probably illegal but wtf), eof was not set and
566 // multipart went into a loop until bad alloc.
567 *eof = true;
568 }
569
570 // make sure bodylength doesn't overflow
571 *bodylength = mimeSource->getOffset();
572 if (*bodylength >= bodystartoffsetcrlf) {
573 *bodylength -= bodystartoffsetcrlf;
574 if (*bodylength >= (unsigned int) *boundarysize) {
575 *bodylength -= (unsigned int) *boundarysize;
576 } else {
577 *bodylength = 0;
578 }
579 } else {
580 *bodylength = 0;
581 }
582 MPFDEB((stderr, "BINC: parseSimple ret: bodylength %d, boundarysize %d\n",
583 *bodylength, *boundarysize));
584 }
585
586 //------------------------------------------------------------------------
doParseFull(MimeInputSource * ms,const string & toboundary,int & boundarysize)587 int Binc::MimePart::doParseFull(MimeInputSource *ms, const string &toboundary,
588 int &boundarysize)
589 {
590 MPFDEB((stderr, "BINC: doParsefull, toboundary[%s]\n", toboundary.c_str()));
591 mimeSource = ms;
592 headerstartoffsetcrlf = mimeSource->getOffset();
593
594 // Parse the header of this mime part.
595 parseHeader(&h, &nlines);
596
597 // Headerlength includes the seperating CRLF. Body starts after the
598 // CRLF.
599 headerlength = mimeSource->getOffset() - headerstartoffsetcrlf;
600 bodystartoffsetcrlf = mimeSource->getOffset();
601 MPFDEB((stderr, "BINC: doParsefull, bodystartoffsetcrlf %d\n", bodystartoffsetcrlf));
602 bodylength = 0;
603
604 // Determine the type of mime part by looking at fields in the
605 // header.
606 analyzeHeader(&h, &multipart, &messagerfc822, &subtype, &boundary);
607
608 bool eof = false;
609 bool foundendofpart = false;
610
611 if (messagerfc822) {
612 parseMessageRFC822(&members, &foundendofpart, &bodylength,
613 &nbodylines, toboundary);
614
615 } else if (multipart) {
616 parseMultipart(boundary, toboundary, &eof, &nlines, &boundarysize,
617 &foundendofpart, &bodylength,
618 &members);
619 } else {
620 parseSinglePart(toboundary, &boundarysize, &nbodylines, &nlines,
621 &eof, &foundendofpart, &bodylength);
622 }
623
624 MPFDEB((stderr, "BINC: doParsefull ret, toboundary[%s]\n", toboundary.c_str()));
625 return (eof || foundendofpart) ? 1 : 0;
626 }
627