1 /* -*- Mode: c++; -*- */
2 /* --------------------------------------------------------------------
3 * Filename:
4 * mime-parsefull.cc
5 *
6 * Description:
7 * Implementation of main mime parser components
8 *
9 * Authors:
10 * Andreas Aardal Hanssen <andreas-binc curly bincimap spot org>
11 *
12 * Bugs:
13 *
14 * ChangeLog:
15 *
16 * --------------------------------------------------------------------
17 * Copyright 2002-2005 Andreas Aardal Hanssen
18 *
19 * This program is free software; you can redistribute it and/or modify
20 * it under the terms of the GNU General Public License as published by
21 * the Free Software Foundation; either version 2 of the License, or
22 * (at your option) any later version.
23 *
24 * This program is distributed in the hope that it will be useful,
25 * but WITHOUT ANY WARRANTY; without even the implied warranty of
26 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 * GNU General Public License for more details.
28 *
29 * You should have received a copy of the GNU General Public License
30 * along with this program; if not, write to the Free Software
31 * Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
32 * --------------------------------------------------------------------
33 */
34 #ifdef HAVE_CONFIG_H
35 #include <config.h>
36 #endif
37
38 #include "mime.h"
39 #include "mime-utils.h"
40 #include "convert.h"
41 #include "io.h"
42 #include <string>
43 #include <vector>
44 #include <map>
45 #include <exception>
46 #include <iostream>
47
48 #include <string.h>
49 #include <ctype.h>
50 #include <stdio.h>
51 #include <errno.h>
52
53 using namespace ::std;
54
// Shared state of the CRLF-normalizing input reader used by the parser.

// File descriptor that fillInputBuffer() reads raw bytes from.
int crlffile = 0;
// Buffer that fillInputBuffer() fills with CRLF-normalized input. It
// is written as a ring: indices are masked with 0xfff (4096 entries).
char crlfdata[4096];
// Write position in crlfdata. It is only ever incremented; the mask is
// applied at the point of use.
unsigned int crlftail = 0;
// NOTE(review): presumably the read position in crlfdata, maintained
// by crlfGetChar()/crlfUnGetChar() - it is not referenced by the code
// visible in this file; confirm against the reader's definition.
unsigned int crlfhead = 0;
// Position in the normalized stream. The parser reads it to record
// where headers and bodies start and end; it is updated by code that
// is not visible in this file.
unsigned int crlfoffset = 0;
// Last raw byte handled by fillInputBuffer(). Kept across calls so a
// CR at the end of one chunk can be resolved by the next chunk.
char lastchar = '\0';
61
62 //------------------------------------------------------------------------
fillInputBuffer(void)63 bool fillInputBuffer(void)
64 {
65 char raw[1024];
66
67 ssize_t nbytes;
68 for (;;) {
69 nbytes = read(crlffile, raw, sizeof(raw) - 1);
70 if (nbytes <= 0) {
71 // FIXME: If ferror(crlffile) we should log this.
72 return false;
73 }
74 else break;
75 }
76
77 for (ssize_t i = 0; i < nbytes; ++i) {
78 const char c = raw[i];
79 switch (c) {
80 case '\r':
81 if (lastchar == '\r') {
82 crlfdata[crlftail++ & 0xfff] = '\r';
83 crlfdata[crlftail++ & 0xfff] = '\n';
84 }
85 break;
86 case '\n':
87 crlfdata[crlftail++ & 0xfff] = '\r';
88 crlfdata[crlftail++ & 0xfff] = '\n';
89 break;
90 default:
91 if (lastchar == '\r') {
92 crlfdata[crlftail++ & 0xfff] = '\r';
93 crlfdata[crlftail++ & 0xfff] = '\n';
94 }
95
96 crlfdata[crlftail++ & 0xfff] = c;
97 break;
98 }
99
100 lastchar = c;
101 }
102
103 return true;
104 }
105
106
107 //------------------------------------------------------------------------
parseFull(int fd) const108 void Binc::MimeDocument::parseFull(int fd) const
109 {
110 if (allIsParsed)
111 return;
112
113 allIsParsed = true;
114
115 crlffile = fd;
116 crlfReset();
117
118 headerstartoffsetcrlf = 0;
119 headerlength = 0;
120 bodystartoffsetcrlf = 0;
121 bodylength = 0;
122 size = 0;
123 messagerfc822 = false;
124 multipart = false;
125
126 int bsize = 0;
127 MimePart::parseFull("", bsize);
128
129 // eat any trailing junk to get the correct size
130 char c;
131 while (crlfGetChar(c));
132
133 size = crlfoffset;
134 }
135
136 //------------------------------------------------------------------------
parseFull(const string & toboundary,int & boundarysize) const137 int Binc::MimePart::parseFull(const string &toboundary, int &boundarysize) const
138 {
139 string name;
140 string content;
141 char cqueue[4];
142 memset(cqueue, 0, sizeof(cqueue));
143
144 bool quit = false;
145 char c;
146 bool eof = false;
147
148 headerstartoffsetcrlf = crlfoffset;
149
150 while (!quit && !eof) {
151 // read name
152 while (1) {
153 if (!crlfGetChar(c)) {
154 eof = true;
155 break;
156 }
157
158 if (c == '\n') ++nlines;
159 if (c == ':') break;
160 if (c == '\n') {
161 // If we encounter a \n before we got to the first ':', then
162 // if the line is not empty, rewind back to the start of the
163 // line and assume we're at the start of the body. If not,
164 // just skip the line and assume we're at the start of the
165 // body.
166 string ntmp = name;
167 trim(ntmp);
168 if (ntmp != "")
169 for (int i = name.length() - 1; i >= 0; --i)
170 crlfUnGetChar();
171
172 quit = true;
173 name = "";
174 break;
175 }
176
177 name += c;
178
179 if (name.length() == 2 && name.substr(0, 2) == "\r\n") {
180 name = "";
181 quit = true;
182 break;
183 }
184 }
185
186 if (name.length() == 1 && name[0] == '\r') {
187 name = "";
188 break;
189 }
190
191 if (quit || eof) break;
192
193 while (!quit) {
194 if (!crlfGetChar(c)) {
195 quit = true;
196 break;
197 }
198
199 if (c == '\n') ++nlines;
200
201 for (int i = 0; i < 3; ++i)
202 cqueue[i] = cqueue[i + 1];
203 cqueue[3] = c;
204
205 if (strncmp(cqueue, "\r\n\r\n", 4) == 0) {
206 quit = true;
207 break;
208 }
209
210 if (cqueue[2] == '\n') {
211
212 // guess the mime rfc says what can not appear on the beginning
213 // of a line.
214 if (!isspace(cqueue[3])) {
215 if (content.length() > 2)
216 content.resize(content.length() - 2);
217
218 trim(content);
219 h.add(name, content);
220
221 name = c;
222 content = "";
223 break;
224 }
225 }
226
227 content += c;
228 }
229 }
230
231 if (name != "") {
232 if (content.length() > 2)
233 content.resize(content.length() - 2);
234 h.add(name, content);
235 }
236
237 // Headerlength includes the seperating CRLF. Body starts after the
238 // CRLF.
239 headerlength = crlfoffset - headerstartoffsetcrlf;
240 bodystartoffsetcrlf = crlfoffset;
241 bodylength = 0;
242
243 // If we encounter the end of file, we return 1 as if we found our
244 // parent's terminal boundary. This will cause a safe exit, and
245 // whatever we parsed until now will be available.
246 if (eof)
247 return 1;
248
249 // Do simple parsing of headers to determine the
250 // type of message (multipart,messagerfc822 etc)
251 HeaderItem ctype;
252 if (h.getFirstHeader("content-type", ctype)) {
253 vector<string> types;
254 split(ctype.getValue(), ";", types);
255
256 if (types.size() > 0) {
257 // first element should describe content type
258 string tmp = types[0];
259 trim(tmp);
260 vector<string> v;
261 split(tmp, "/", v);
262 string key, value;
263
264 key = (v.size() > 0) ? v[0] : "text";
265 value = (v.size() > 1) ? v[1] : "plain";
266 lowercase(key);
267
268 if (key == "multipart") {
269 multipart = true;
270 lowercase(value);
271 subtype = value;
272 } else if (key == "message") {
273 lowercase(value);
274 if (value == "rfc822")
275 messagerfc822 = true;
276 }
277 }
278
279 for (vector<string>::const_iterator i = types.begin();
280 i != types.end(); ++i) {
281 string element = *i;
282 trim(element);
283
284 if (element.find("=") != string::npos) {
285 string::size_type pos = element.find('=');
286 string key = element.substr(0, pos);
287 string value = element.substr(pos + 1);
288
289 lowercase(key);
290 trim(key);
291
292 if (key == "boundary") {
293 trim(value, " \"");
294 boundary = value;
295 }
296 }
297 }
298 }
299
300 bool foundendofpart = false;
301 if (messagerfc822) {
302 // message rfc822 means a completely enclosed mime document. we
303 // call the parser recursively, and pass on the boundary string
304 // that we got. when parse() finds this boundary, it returns 0. if
305 // it finds the end boundary (boundary + "--"), it returns != 0.
306 MimePart m;
307
308 // parsefull returns the number of bytes that need to be removed
309 // from the body because of the terminating boundary string.
310 int bsize;
311 if (m.parseFull(toboundary, bsize))
312 foundendofpart = true;
313
314 // make sure bodylength doesn't overflow
315 bodylength = crlfoffset;
316 if (bodylength >= bodystartoffsetcrlf) {
317 bodylength -= bodystartoffsetcrlf;
318 if (bodylength >= (unsigned int) bsize) {
319 bodylength -= (unsigned int) bsize;
320 } else {
321 bodylength = 0;
322 }
323 } else {
324 bodylength = 0;
325 }
326
327 nbodylines += m.getNofLines();
328
329 members.push_back(m);
330
331 } else if (multipart) {
332 // multipart parsing starts with skipping to the first
333 // boundary. then we call parse() for all parts. the last parse()
334 // command will return a code indicating that it found the last
335 // boundary of this multipart. Note that the first boundary does
336 // not have to start with CRLF.
337 string delimiter = "--" + boundary;
338
339 char *delimiterqueue = 0;
340 int endpos = delimiter.length();
341 delimiterqueue = new char[endpos];
342 int delimiterpos = 0;
343 bool eof = false;
344
345 // first, skip to the first delimiter string. Anything between the
346 // header and the first delimiter string is simply ignored (it's
347 // usually a text message intended for non-mime clients)
348 do {
349 if (!crlfGetChar(c)) {
350 eof = true;
351 break;
352 }
353
354 if (c == '\n')
355 ++nlines;
356
357 delimiterqueue[delimiterpos++ % endpos] = c;
358
359 // Fixme: Must also check for all parents' delimiters.
360 } while (!compareStringToQueue(delimiter, delimiterqueue, delimiterpos, endpos));
361
362 delete delimiterqueue;
363
364 if (!eof)
365 boundarysize = delimiter.size();
366
367 // Read two more characters. This may be CRLF, it may be "--" and
368 // it may be any other two characters.
369 char a;
370 if (!crlfGetChar(a))
371 eof = true;
372
373 if (a == '\n')
374 ++nlines;
375
376 char b;
377 if (!crlfGetChar(b))
378 eof = true;
379
380 if (b == '\n')
381 ++nlines;
382
383 // If we find two dashes after the boundary, then this is the end
384 // of boundary marker.
385 if (!eof) {
386 if (a == '-' && b == '-') {
387 foundendofpart = true;
388 boundarysize += 2;
389
390 if (!crlfGetChar(a))
391 eof = true;
392
393 if (a == '\n')
394 ++nlines;
395
396 if (!crlfGetChar(b))
397 eof = true;
398
399 if (b == '\n')
400 ++nlines;
401 }
402
403 if (a == '\r' && b == '\n') {
404 // This exception is to handle a special case where the
405 // delimiter of one part is not followed by CRLF, but
406 // immediately followed by a CRLF prefixed delimiter.
407 if (!crlfGetChar(a) || !crlfGetChar(b))
408 eof = true;
409 else if (a == '-' && b == '-') {
410 crlfUnGetChar();
411 crlfUnGetChar();
412 crlfUnGetChar();
413 crlfUnGetChar();
414 } else {
415 crlfUnGetChar();
416 crlfUnGetChar();
417 }
418
419 boundarysize += 2;
420 } else {
421 crlfUnGetChar();
422 crlfUnGetChar();
423 }
424 }
425
426 // read all mime parts.
427 if (!foundendofpart && !eof) {
428 bool quit = false;
429 do {
430 MimePart m;
431
432 // If parseFull returns != 0, then it encountered the multipart's
433 // final boundary.
434 int bsize = 0;
435 if (m.parseFull(boundary, bsize)) {
436 quit = true;
437 boundarysize = bsize;
438 }
439
440 members.push_back(m);
441 nlines += m.getNofLines();
442
443 } while (!quit);
444 }
445
446 if (!foundendofpart && !eof) {
447 // multipart parsing starts with skipping to the first
448 // boundary. then we call parse() for all parts. the last parse()
449 // command will return a code indicating that it found the last
450 // boundary of this multipart. Note that the first boundary does
451 // not have to start with CRLF.
452 string delimiter = "\r\n--" + toboundary;
453
454 char *delimiterqueue = 0;
455 int endpos = delimiter.length();
456 delimiterqueue = new char[endpos];
457 int delimiterpos = 0;
458 bool eof = false;
459
460 // first, skip to the first delimiter string. Anything between the
461 // header and the first delimiter string is simply ignored (it's
462 // usually a text message intended for non-mime clients)
463 do {
464 if (!crlfGetChar(c)) {
465 eof = true;
466 break;
467 }
468
469 if (c == '\n')
470 ++nlines;
471
472 delimiterqueue[delimiterpos++ % endpos] = c;
473
474 // Fixme: Must also check for all parents' delimiters.
475 } while (!compareStringToQueue(delimiter, delimiterqueue, delimiterpos, endpos));
476
477 delete delimiterqueue;
478
479 if (!eof)
480 boundarysize = delimiter.size();
481
482 // Read two more characters. This may be CRLF, it may be "--" and
483 // it may be any other two characters.
484 char a;
485 if (!crlfGetChar(a))
486 eof = true;
487
488 if (a == '\n')
489 ++nlines;
490
491 char b;
492 if (!crlfGetChar(b))
493 eof = true;
494
495 if (b == '\n')
496 ++nlines;
497
498 // If we find two dashes after the boundary, then this is the end
499 // of boundary marker.
500 if (!eof) {
501 if (a == '-' && b == '-') {
502 foundendofpart = true;
503 boundarysize += 2;
504
505 if (!crlfGetChar(a))
506 eof = true;
507
508 if (a == '\n')
509 ++nlines;
510
511 if (!crlfGetChar(b))
512 eof = true;
513
514 if (b == '\n')
515 ++nlines;
516 }
517
518 if (a == '\r' && b == '\n') {
519 // This exception is to handle a special case where the
520 // delimiter of one part is not followed by CRLF, but
521 // immediately followed by a CRLF prefixed delimiter.
522 if (!crlfGetChar(a) || !crlfGetChar(b))
523 eof = true;
524 else if (a == '-' && b == '-') {
525 crlfUnGetChar();
526 crlfUnGetChar();
527 crlfUnGetChar();
528 crlfUnGetChar();
529 } else {
530 crlfUnGetChar();
531 crlfUnGetChar();
532 }
533
534 boundarysize += 2;
535 } else {
536 crlfUnGetChar();
537 crlfUnGetChar();
538 }
539 }
540 }
541
542 // make sure bodylength doesn't overflow
543 bodylength = crlfoffset;
544 if (bodylength >= bodystartoffsetcrlf) {
545 bodylength -= bodystartoffsetcrlf;
546 if (bodylength >= (unsigned int) boundarysize) {
547 bodylength -= (unsigned int) boundarysize;
548 } else {
549 bodylength = 0;
550 }
551 } else {
552 bodylength = 0;
553 }
554
555 } else {
556 // If toboundary is empty, then we read until the end of the
557 // file. Otherwise we will read until we encounter toboundary.
558 string _toboundary;
559 if (toboundary != "") {
560 _toboundary = "\r\n--";
561 _toboundary += toboundary;
562 }
563
564 char *boundaryqueue = 0;
565 int endpos = _toboundary.length();
566 if (toboundary != "")
567 boundaryqueue = new char[endpos];
568 int boundarypos = 0;
569
570 boundarysize = 0;
571
572 string line;
573 int nchars = 0;
574 while (crlfGetChar(c)) {
575 if (c == '\n') { ++nbodylines; ++nlines; }
576 nchars++;
577
578 if (toboundary == "")
579 continue;
580
581 // find boundary
582 boundaryqueue[boundarypos++ % endpos] = c;
583
584 if (compareStringToQueue(_toboundary, boundaryqueue, boundarypos, endpos)) {
585 boundarysize = _toboundary.length();
586 break;
587 }
588 }
589
590 delete boundaryqueue;
591
592 if (toboundary != "") {
593 char a;
594 if (!crlfGetChar(a))
595 eof = true;
596
597 if (a == '\n')
598 ++nlines;
599 char b;
600 if (!crlfGetChar(b))
601 eof = true;
602
603 if (b == '\n')
604 ++nlines;
605
606 if (a == '-' && b == '-') {
607 boundarysize += 2;
608 foundendofpart = true;
609 if (!crlfGetChar(a))
610 eof = true;
611
612 if (a == '\n')
613 ++nlines;
614
615 if (!crlfGetChar(b))
616 eof = true;
617
618 if (b == '\n')
619 ++nlines;
620 }
621
622 if (a == '\r' && b == '\n') {
623 // This exception is to handle a special case where the
624 // delimiter of one part is not followed by CRLF, but
625 // immediately followed by a CRLF prefixed delimiter.
626 if (!crlfGetChar(a) || !crlfGetChar(b))
627 eof = true;
628 else if (a == '-' && b == '-') {
629 crlfUnGetChar();
630 crlfUnGetChar();
631 crlfUnGetChar();
632 crlfUnGetChar();
633 } else {
634 crlfUnGetChar();
635 crlfUnGetChar();
636 }
637
638 boundarysize += 2;
639 } else {
640 crlfUnGetChar();
641 crlfUnGetChar();
642 }
643 }
644
645 // make sure bodylength doesn't overflow
646 bodylength = crlfoffset;
647 if (bodylength >= bodystartoffsetcrlf) {
648 bodylength -= bodystartoffsetcrlf;
649 if (bodylength >= (unsigned int) boundarysize) {
650 bodylength -= (unsigned int) boundarysize;
651 } else {
652 bodylength = 0;
653 }
654 } else {
655 bodylength = 0;
656 }
657 }
658
659 return (eof || foundendofpart) ? 1 : 0;
660 }
661