1 /* Copyright (C) 2004 J.F.Dockes
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the
14 * Free Software Foundation, Inc.,
15 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16 */
17
18 #ifndef TEST_MIMEPARSE
19 #include "autoconfig.h"
20
21 #include <string>
22 #include <vector>
23
24 #include <ctype.h>
25 #include <stdio.h>
26 #include <ctype.h>
27 #include <time.h>
28 #include <cstdlib>
29 #include <cstring>
30
31 #include "mimeparse.h"
32 #include "base64.h"
33 #include "transcode.h"
34 #include "smallut.h"
35
36 using namespace std;
37
38 //#define DEBUG_MIMEPARSE
39 #ifdef DEBUG_MIMEPARSE
40 #define DPRINT(X) fprintf X
41 #else
42 #define DPRINT(X)
43 #endif
44
// Parsing a header value. Only Content-Type and Content-Disposition
// actually carry parameters, but the other header values are compatible
// with the Content-Type syntax (their parameters are simply not used),
// so all of them can be parsed as:
//
//    headertype: value [; paramname=paramvalue] ...
//
// The value and the parameter values can be quoted strings, and there
// can be comments too. Note that RFC2047 encoding is explicitly
// forbidden for parameter values (RFC2231 must be used), but I have seen
// it used anyway (e.g. by Thunderbird 1.0).
//
// Ref: RFC2045/6/7 (MIME) RFC2183/2231 (content-disposition and encodings)
57
58
59
60 /** Decode a MIME parameter value encoded according to rfc2231
61 *
62 * Example input withs input charset == "":
63 * [iso-8859-1'french'RE%A0%3A_Smoke_Tests%20bla]
64 * Or (if charset is set) : RE%A0%3A_Smoke_Tests%20bla
65 *
66 * @param in input string, ascii with rfc2231 markup
67 * @param out output string
68 * @param charset if empty: decode string like 'charset'lang'more%20stuff,
69 * else just do the %XX part
70 * @return out output string encoded in utf-8
71 */
rfc2231_decode(const string & in,string & out,string & charset)72 bool rfc2231_decode(const string &in, string &out, string &charset)
73 {
74 string::size_type pos1, pos2=0;
75
76 if (charset.empty()) {
77 if ((pos1 = in.find("'")) == string::npos)
78 return false;
79 charset = in.substr(0, pos1);
80 // fprintf(stderr, "Charset: [%s]\n", charset.c_str());
81 pos1++;
82
83 if ((pos2 = in.find("'", pos1)) == string::npos)
84 return false;
85 // We have no use for lang for now
86 // string lang = in.substr(pos1, pos2-pos1);
87 // fprintf(stderr, "Lang: [%s]\n", lang.c_str());
88 pos2++;
89 }
90
91 string raw;
92 qp_decode(in.substr(pos2), raw, '%');
93 // fprintf(stderr, "raw [%s]\n", raw.c_str());
94 if (!transcode(raw, out, charset, "UTF-8"))
95 return false;
96 return true;
97 }
98
99
100 /////////////////////////////////////////
101 /// Decoding of MIME fields values and parameters
102
// Lexical element filled in by find_next_token(): what kind of element was
// found, its text, the opening quote character when the text came from a
// quoted string (0 otherwise), and the error messages accumulated while
// scanning.
class Lexical {
public:
    enum kind {none, token, separator};
    kind what;
    std::string value;
    std::string error;
    char quote;

    Lexical()
        : what(none), quote(0)
    {
    }

    // Bring the object back to its freshly constructed state
    void reset()
    {
        what = none;
        quote = 0;
        value.erase();
        error.erase();
    }
};
114
115 // Skip mime comment. This must be called with in[start] == '('
116 static string::size_type
skip_comment(const string & in,string::size_type start,Lexical & lex)117 skip_comment(const string &in, string::size_type start, Lexical &lex)
118 {
119 int commentlevel = 0;
120 for (; start < in.size(); start++) {
121 if (in[start] == '\\') {
122 // Skip escaped char.
123 if (start+1 < in.size()) {
124 start++;
125 continue;
126 } else {
127 lex.error.append("\\ at end of string ");
128 return in.size();
129 }
130 }
131 if (in[start] == '(')
132 commentlevel++;
133 if (in[start] == ')') {
134 if (--commentlevel == 0)
135 break;
136 }
137 }
138 if (start == in.size() && commentlevel != 0) {
139 lex.error.append("Unclosed comment ");
140 return in.size();
141 }
142 return start;
143 }
144
145 // Skip initial whitespace and (possibly nested) comments.
146 static string::size_type
skip_whitespace_and_comment(const string & in,string::size_type start,Lexical & lex)147 skip_whitespace_and_comment(const string &in, string::size_type start,
148 Lexical &lex)
149 {
150 while (1) {
151 if ((start = in.find_first_not_of(" \t\r\n", start)) == string::npos)
152 return in.size();
153 if (in[start] == '(') {
154 if ((start = skip_comment(in, start, lex)) == string::npos)
155 return string::npos;
156 } else {
157 break;
158 }
159 }
160 return start;
161 }
162
163 /// Find next token in mime header value string.
164 /// @return the next starting position in string, string::npos for error
165 /// @param in the input string
166 /// @param start the starting position
167 /// @param lex the returned token and its description
168 /// @param delims separators we should look for
169 static string::size_type
find_next_token(const string & in,string::size_type start,Lexical & lex,string delims=";=")170 find_next_token(const string &in, string::size_type start,
171 Lexical &lex, string delims = ";=")
172 {
173 char oquot, cquot;
174
175 start = skip_whitespace_and_comment(in, start, lex);
176 if (start == string::npos || start == in.size())
177 return in.size();
178
179 // Begins with separator ? return it.
180 string::size_type delimi = delims.find_first_of(in[start]);
181 if (delimi != string::npos) {
182 lex.what = Lexical::separator;
183 lex.value = delims[delimi];
184 return start+1;
185 }
186
187 // Check for start of quoted string
188 oquot = in[start];
189 switch (oquot) {
190 case '<': cquot = '>';break;
191 case '"': cquot = '"';break;
192 default: cquot = 0; break;
193 }
194
195 if (cquot != 0) {
196 // Quoted string parsing
197 string::size_type end;
198 start++; // Skip quote character
199 for (end = start;end < in.size() && in[end] != cquot; end++) {
200 if (in[end] == '\\') {
201 // Skip escaped char.
202 if (end+1 < in.size()) {
203 end++;
204 } else {
205 // backslash at end of string: error
206 lex.error.append("\\ at end of string ");
207 return string::npos;
208 }
209 }
210 }
211 if (end == in.size()) {
212 // Found end of string before closing quote character: error
213 lex.error.append("Unclosed quoted string ");
214 return string::npos;
215 }
216 lex.what = Lexical::token;
217 lex.value = in.substr(start, end-start);
218 lex.quote = oquot;
219 return ++end;
220 } else {
221 string::size_type end = in.find_first_of(delims + "\r\n \t(", start);
222 lex.what = Lexical::token;
223 lex.quote = 0;
224 if (end == string::npos) {
225 end = in.size();
226 lex.value = in.substr(start);
227 } else {
228 lex.value = in.substr(start, end-start);
229 }
230 return end;
231 }
232 }
233
// Helper types for assembling rfc2231 parameter value continuations.

// One piece of a parameter value
struct Chunk {
    // Set when the parameter name carried a trailing '*'
    bool decode;
    std::string value;

    Chunk()
        : decode(false)
    {
    }
};

// All the pieces collected for one parameter name, stored by index
struct Chunks {
    std::vector<Chunk> chunks;
};
245
// Append a lowercased copy of 'in' to 'out'. Note that 'out' is not
// cleared first: the converted characters are added to whatever it
// already holds.
void stringtolower(std::string &out, const std::string& in)
{
    out.reserve(out.size() + in.size());
    for (std::string::size_type i = 0; i < in.size(); i++) {
        // tolower() is only defined for values representable as unsigned
        // char (or EOF). A plain char can be negative for 8 bit input, so
        // go through unsigned char before calling it.
        const unsigned char c = static_cast<unsigned char>(in[i]);
        out.append(1, char(tolower(c)));
    }
}
251
252 // Parse MIME field value. Should look like:
253 // somevalue ; param1=val1;param2=val2
parseMimeHeaderValue(const string & value,MimeHeaderValue & parsed)254 bool parseMimeHeaderValue(const string& value, MimeHeaderValue& parsed)
255 {
256 parsed.value.erase();
257 parsed.params.clear();
258
259 Lexical lex;
260 string::size_type start = 0;
261
262 // Get the field value
263 start = find_next_token(value, start, lex);
264 if (start == string::npos || lex.what != Lexical::token)
265 return false;
266 parsed.value = lex.value;
267
268 map<string, string> rawparams;
269 // Look for parameters
270 for (;;) {
271 string paramname, paramvalue;
272 lex.reset();
273 start = find_next_token(value, start, lex);
274 if (start == value.size())
275 break;
276 if (start == string::npos) {
277 //fprintf(stderr, "Find_next_token error(1)\n");
278 return false;
279 }
280 if (lex.what == Lexical::separator && lex.value[0] == ';')
281 continue;
282 if (lex.what != Lexical::token)
283 return false;
284 stringtolower(paramname, lex.value);
285
286 start = find_next_token(value, start, lex);
287 if (start == string::npos || lex.what != Lexical::separator ||
288 lex.value[0] != '=') {
289 //fprintf(stderr, "Find_next_token error (2)\n");
290 return false;
291 }
292
293 start = find_next_token(value, start, lex);
294 if (start == string::npos || lex.what != Lexical::token) {
295 //fprintf(stderr, "Parameter has no value!");
296 return false;
297 }
298 paramvalue = lex.value;
299 rawparams[paramname] = paramvalue;
300 //fprintf(stderr, "RAW: name [%s], value [%s]\n", paramname.c_str(),
301 // paramvalue.c_str());
302 }
303 // fprintf(stderr, "Number of raw params %d\n", rawparams.size());
304
305 // RFC2231 handling:
306 // - if a parameter name ends in * it must be decoded
307 // - If a parameter name looks line name*ii[*] it is a
308 // partial value, and must be concatenated with other such.
309
310 map<string, Chunks> chunks;
311 for (map<string, string>::const_iterator it = rawparams.begin();
312 it != rawparams.end(); it++) {
313 string nm = it->first;
314 // fprintf(stderr, "NM: [%s]\n", nm.c_str());
315 if (nm.empty()) // ??
316 continue;
317
318 Chunk chunk;
319 if (nm[nm.length()-1] == '*') {
320 nm.erase(nm.length() - 1);
321 chunk.decode = true;
322 } else
323 chunk.decode = false;
324 // fprintf(stderr, "NM1: [%s]\n", nm.c_str());
325
326 chunk.value = it->second;
327
328 // Look for another asterisk in nm. If none, assign index 0
329 string::size_type aster;
330 int idx = 0;
331 if ((aster = nm.rfind("*")) != string::npos) {
332 string num = nm.substr(aster+1);
333 //fprintf(stderr, "NUM: [%s]\n", num.c_str());
334 nm.erase(aster);
335 idx = atoi(num.c_str());
336 }
337 Chunks empty;
338 if (chunks.find(nm) == chunks.end())
339 chunks[nm] = empty;
340 chunks[nm].chunks.resize(idx+1);
341 chunks[nm].chunks[idx] = chunk;
342 //fprintf(stderr, "CHNKS: nm [%s], idx %d, decode %d, value [%s]\n",
343 // nm.c_str(), idx, int(chunk.decode), chunk.value.c_str());
344 }
345
346 // For each parameter name, concatenate its chunks and possibly
347 // decode Note that we pass the whole concatenated string to
348 // decoding if the first chunk indicates that decoding is needed,
349 // which is not right because there might be uncoded chunks
350 // according to the rfc.
351 for (map<string, Chunks>::const_iterator it = chunks.begin();
352 it != chunks.end(); it++) {
353 if (it->second.chunks.empty())
354 continue;
355 string nm = it->first;
356 // Create the name entry
357 if (parsed.params.find(nm) == parsed.params.end())
358 parsed.params[nm].clear();
359 // Concatenate all chunks and decode the whole if the first one needs
360 // to. Yes, this is not quite right.
361 string value;
362 for (vector<Chunk>::const_iterator vi = it->second.chunks.begin();
363 vi != it->second.chunks.end(); vi++) {
364 value += vi->value;
365 }
366 if (it->second.chunks[0].decode) {
367 string charset;
368 rfc2231_decode(value, parsed.params[nm], charset);
369 } else {
370 // rfc2047 MUST NOT but IS used by some agents
371 rfc2047_decode(value, parsed.params[nm]);
372 }
373 //fprintf(stderr, "FINAL: nm [%s], value [%s]\n",
374 //nm.c_str(), parsed.params[nm].c_str());
375 }
376
377 return true;
378 }
379
// Value of an hexadecimal digit (either case), or -1 if the character is
// not one.
static int qp_hexval(char c)
{
    if (c >= '0' && c <= '9')
        return c - '0';
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    return -1;
}

// Decode a string encoded with quoted-printable encoding. The same code
// is used for the rfc2231 % encoding (with esc set to '%'), even if the
// end of line processing is not useful in this case.
//
// The decoded data is appended to 'out'. An escape character followed by
// an end of line (CRLF, or a lone CR or LF) is a soft line break and
// produces nothing. Decoding silently stops if less than 2 characters
// follow an escape character.
//
// @return false if an escape character is followed by something which is
//   neither an end of line nor two hexadecimal digits (the data decoded
//   up to this point stays in 'out'), true otherwise.
bool qp_decode(const std::string& in, std::string &out, char esc)
{
    out.reserve(in.length());

    const std::string::size_type len = in.length();
    std::string::size_type pos = 0;
    while (pos < len) {
        const char ch = in[pos];
        if (ch != esc) {
            // Ordinary character: copy
            out += ch;
            pos++;
            continue;
        }

        pos++; // Skip '=' or '%'
        if (pos >= len - 1) {
            // Need at least 2 more chars
            break;
        }
        if (in[pos] == '\r' && in[pos + 1] == '\n') {
            // Soft nl, skip
            pos += 2;
            continue;
        }
        if (in[pos] == '\n' || in[pos] == '\r') {
            // Soft nl with a lone end of line character
            pos++;
            continue;
        }

        // Two hex digits. The length check above guarantees that the
        // second one exists.
        const int hi = qp_hexval(in[pos]);
        if (hi < 0)
            return false;
        const int lo = qp_hexval(in[pos + 1]);
        if (lo < 0)
            return false;
        out += char(hi * 16 + lo);
        pos += 2;
    }
    return true;
}
426
427 // Decode an word encoded as quoted printable or base 64
rfc2047_decodeParsed(const std::string & charset,const std::string & encoding,const std::string & value,std::string & utf8)428 static bool rfc2047_decodeParsed(const std::string& charset,
429 const std::string& encoding,
430 const std::string& value,
431 std::string &utf8)
432 {
433 DPRINT((stderr, "DecodeParsed: charset [%s] enc [%s] val [%s]\n",
434 charset.c_str(), encoding.c_str(), value.c_str()));
435 utf8.clear();
436
437 string decoded;
438 if (!stringlowercmp("b", encoding)) {
439 if (!base64_decode(value, decoded))
440 return false;
441 DPRINT((stderr, "FromB64: [%s]\n", decoded.c_str()));
442 } else if (!stringlowercmp("q", encoding)) {
443 if (!qp_decode(value, decoded))
444 return false;
445 // Need to translate _ to ' ' here
446 string temp;
447 for (string::size_type pos = 0; pos < decoded.length(); pos++)
448 if (decoded[pos] == '_')
449 temp += ' ';
450 else
451 temp += decoded[pos];
452 decoded = temp;
453 DPRINT((stderr, "FromQP: [%s]\n", decoded.c_str()));
454 } else {
455 DPRINT((stderr, "Bad encoding [%s]\n", encoding.c_str()));
456 return false;
457 }
458
459 if (!transcode(decoded, utf8, charset, "UTF-8")) {
460 DPRINT((stderr, "Transcode failed\n"));
461 return false;
462 }
463 return true;
464 }
465
466 // Parse a mail header value encoded according to RFC2047.
467 // This is not supposed to be used for MIME parameter values, but it
468 // happens.
469 // Bugs:
470 // - We should turn off decoding while inside quoted strings
471 //
472 typedef enum {rfc2047ready, rfc2047open_eq,
473 rfc2047charset, rfc2047encoding,
474 rfc2047value, rfc2047close_q} Rfc2047States;
475
rfc2047_decode(const std::string & in,std::string & out)476 bool rfc2047_decode(const std::string& in, std::string &out)
477 {
478 DPRINT((stderr, "rfc2047_decode: [%s]\n", in.c_str()));
479
480 Rfc2047States state = rfc2047ready;
481 string encoding, charset, value, utf8;
482
483 out.clear();
484
485 for (string::size_type ii = 0; ii < in.length(); ii++) {
486 char ch = in[ii];
487 switch (state) {
488 case rfc2047ready:
489 {
490 DPRINT((stderr, "STATE: ready, ch %c\n", ch));
491 switch (ch) {
492 // Whitespace: stay ready
493 case ' ': case '\t': value += ch;break;
494 // '=' -> forward to next state
495 case '=': state = rfc2047open_eq; break;
496 DPRINT((stderr, "STATE: open_eq\n"));
497 // Other: go back to sleep
498 default: value += ch; state = rfc2047ready;
499 }
500 }
501 break;
502 case rfc2047open_eq:
503 {
504 DPRINT((stderr, "STATE: open_eq, ch %c\n", ch));
505 switch (ch) {
506 case '?':
507 {
508 // Transcode current (unencoded part) value:
509 // we sometimes find 8-bit chars in
510 // there. Interpret as Iso8859.
511 if (value.length() > 0) {
512 transcode(value, utf8, "ISO-8859-1", "UTF-8");
513 out += utf8;
514 value.clear();
515 }
516 state = rfc2047charset;
517 }
518 break;
519 default: state = rfc2047ready; value += '='; value += ch;break;
520 }
521 }
522 break;
523 case rfc2047charset:
524 {
525 DPRINT((stderr, "STATE: charset, ch %c\n", ch));
526 switch (ch) {
527 case '?': state = rfc2047encoding; break;
528 default: charset += ch; break;
529 }
530 }
531 break;
532 case rfc2047encoding:
533 {
534 DPRINT((stderr, "STATE: encoding, ch %c\n", ch));
535 switch (ch) {
536 case '?': state = rfc2047value; break;
537 default: encoding += ch; break;
538 }
539 }
540 break;
541 case rfc2047value:
542 {
543 DPRINT((stderr, "STATE: value, ch %c\n", ch));
544 switch (ch) {
545 case '?': state = rfc2047close_q; break;
546 default: value += ch;break;
547 }
548 }
549 break;
550 case rfc2047close_q:
551 {
552 DPRINT((stderr, "STATE: close_q, ch %c\n", ch));
553 switch (ch) {
554 case '=':
555 {
556 DPRINT((stderr, "End of encoded area. Charset %s, Encoding %s\n", charset.c_str(), encoding.c_str()));
557 string utf8;
558 state = rfc2047ready;
559 if (!rfc2047_decodeParsed(charset, encoding, value,
560 utf8)) {
561 return false;
562 }
563 out += utf8;
564 charset.clear();
565 encoding.clear();
566 value.clear();
567 }
568 break;
569 default: state = rfc2047value; value += '?';value += ch;break;
570 }
571 }
572 break;
573 default: // ??
574 DPRINT((stderr, "STATE: default ?? ch %c\n", ch));
575 return false;
576 }
577 }
578
579 if (value.length() > 0) {
580 transcode(value, utf8, "CP1252", "UTF-8");
581 out += utf8;
582 value.clear();
583 }
584 if (state != rfc2047ready)
585 return false;
586 return true;
587 }
588
589 #define DEBUGDATE 0
590 #if DEBUGDATE
591 #define DATEDEB(X) fprintf X
592 #else
593 #define DATEDEB(X)
594 #endif
595
596 // Convert rfc822 date to unix time. A date string normally looks like:
597 // Mon, 3 Jul 2006 09:51:58 +0200
598 // But there are many close common variations
599 // And also hopeless things like: Fri Nov 3 13:13:33 2006
rfc2822DateToUxTime(const string & dt)600 time_t rfc2822DateToUxTime(const string& dt)
601 {
602 // Strip everything up to first comma if any, we don't need weekday,
603 // then break into tokens
604 vector<string> toks;
605 string::size_type idx;
606 if ((idx = dt.find_first_of(",")) != string::npos) {
607 if (idx == dt.length() - 1) {
608 DATEDEB((stderr, "Bad rfc822 date format (short1): [%s]\n",
609 dt.c_str()));
610 return (time_t)-1;
611 }
612 string date = dt.substr(idx+1);
613 stringToTokens(date, toks, " \t:");
614 } else {
615 // No comma. Enter strangeland
616 stringToTokens(dt, toks, " \t:");
617 // Test for date like: Sun Nov 19 06:18:41 2006
618 // 0 1 2 3 4 5 6
619 // and change to: 19 Nov 2006 06:18:41
620 if (toks.size() == 7) {
621 if (toks[0].length() == 3 &&
622 toks[0].find_first_of("0123456789") == string::npos) {
623 swap(toks[0], toks[2]);
624 swap(toks[6], toks[2]);
625 toks.pop_back();
626 }
627 }
628 }
629
630 #if DEBUGDATE
631 for (list<string>::iterator it = toks.begin(); it != toks.end(); it++) {
632 DATEDEB((stderr, "[%s] ", it->c_str()));
633 }
634 DATEDEB((stderr, "\n"));
635 #endif
636
637 if (toks.size() < 6) {
638 DATEDEB((stderr, "Bad rfc822 date format (toks cnt): [%s]\n",
639 dt.c_str()));
640 return (time_t)-1;
641 }
642
643 if (toks.size() == 6) {
644 // Probably no timezone, sometimes happens
645 toks.push_back("+0000");
646 }
647
648 struct tm tm;
649 memset(&tm, 0, sizeof(tm));
650
651 // Load struct tm with appropriate tokens, possibly converting
652 // when needed
653
654 vector<string>::iterator it = toks.begin();
655
656 // Day of month: no conversion needed
657 tm.tm_mday = atoi(it->c_str());
658 it++;
659
660 // Month. Only Jan-Dec are legal. January, February do happen
661 // though. Convert to 0-11
662 if (*it == "Jan" || *it == "January") tm.tm_mon = 0; else if
663 (*it == "Feb" || *it == "February") tm.tm_mon = 1; else if
664 (*it == "Mar" || *it == "March") tm.tm_mon = 2; else if
665 (*it == "Apr" || *it == "April") tm.tm_mon = 3; else if
666 (*it == "May") tm.tm_mon = 4; else if
667 (*it == "Jun" || *it == "June") tm.tm_mon = 5; else if
668 (*it == "Jul" || *it == "July") tm.tm_mon = 6; else if
669 (*it == "Aug" || *it == "August") tm.tm_mon = 7; else if
670 (*it == "Sep" || *it == "September") tm.tm_mon = 8; else if
671 (*it == "Oct" || *it == "October") tm.tm_mon = 9; else if
672 (*it == "Nov" || *it == "November") tm.tm_mon = 10; else if
673 (*it == "Dec" || *it == "December") tm.tm_mon = 11; else {
674 DATEDEB((stderr, "Bad rfc822 date format (month): [%s]\n",
675 dt.c_str()));
676 return (time_t)-1;
677 }
678 it++;
679
680 // Year. Struct tm counts from 1900. 2 char years are quite rare
681 // but do happen. I've seen 00 happen so count small values from 2000
682 tm.tm_year = atoi(it->c_str());
683 if (it->length() == 2) {
684 if (tm.tm_year < 10)
685 tm.tm_year += 2000;
686 else
687 tm.tm_year += 1900;
688 }
689 if (tm.tm_year > 1900)
690 tm.tm_year -= 1900;
691 it++;
692
693 // Hour minute second need no adjustments
694 tm.tm_hour = atoi(it->c_str()); it++;
695 tm.tm_min = atoi(it->c_str()); it++;
696 tm.tm_sec = atoi(it->c_str()); it++;
697
698
699 // Timezone is supposed to be either +-XYZT or a zone name
700 int zonesecs = 0;
701 if (it->length() < 1) {
702 DATEDEB((stderr, "Bad rfc822 date format (zlen): [%s]\n", dt.c_str()));
703 return (time_t)-1;
704 }
705 if (it->at(0) == '-' || it->at(0) == '+') {
706 // Note that +xy:zt (instead of +xyzt) sometimes happen, we
707 // may want to process it one day
708 if (it->length() < 5) {
709 DATEDEB((stderr, "Bad rfc822 date format (zlen1): [%s]\n",
710 dt.c_str()));
711 goto nozone;
712 }
713 zonesecs = 3600*((it->at(1)-'0') * 10 + it->at(2)-'0')+
714 (it->at(3)-'0')*10 + it->at(4)-'0';
715 zonesecs = it->at(0) == '+' ? -1 * zonesecs : zonesecs;
716 } else {
717 int hours;
718 if (*it == "A") hours= 1; else if (*it == "B") hours= 2;
719 else if (*it == "C") hours= 3; else if (*it == "D") hours= 4;
720 else if (*it == "E") hours= 5; else if (*it == "F") hours= 6;
721 else if (*it == "G") hours= 7; else if (*it == "H") hours= 8;
722 else if (*it == "I") hours= 9; else if (*it == "K") hours= 10;
723 else if (*it == "L") hours= 11; else if (*it == "M") hours= 12;
724 else if (*it == "N") hours= -1; else if (*it == "O") hours= -2;
725 else if (*it == "P") hours= -3; else if (*it == "Q") hours= -4;
726 else if (*it == "R") hours= -5; else if (*it == "S") hours= -6;
727 else if (*it == "T") hours= -7; else if (*it == "U") hours= -8;
728 else if (*it == "V") hours= -9; else if (*it == "W") hours= -10;
729 else if (*it == "X") hours= -11; else if (*it == "Y") hours= -12;
730 else if (*it == "Z") hours= 0; else if (*it == "UT") hours= 0;
731 else if (*it == "GMT") hours= 0; else if (*it == "EST") hours= 5;
732 else if (*it == "EDT") hours= 4; else if (*it == "CST") hours= 6;
733 else if (*it == "CDT") hours= 5; else if (*it == "MST") hours= 7;
734 else if (*it == "MDT") hours= 6; else if (*it == "PST") hours= 8;
735 else if (*it == "PDT") hours= 7;
736 // Non standard names
737 // Standard Time (or Irish Summer Time?) is actually +5.5
738 else if (*it == "CET") hours= -1; else if (*it == "JST") hours= -9;
739 else if (*it == "IST") hours= -5; else if (*it == "WET") hours= 0;
740 else if (*it == "MET") hours= -1;
741 else {
742 DATEDEB((stderr, "Bad rfc822 date format (zname): [%s]\n",
743 dt.c_str()));
744 // Forget tz
745 goto nozone;
746 }
747 zonesecs = 3600 * hours;
748 }
749 DATEDEB((stderr, "Tz: [%s] -> %d\n", it->c_str(), zonesecs));
750 nozone:
751
752 // Compute the UTC Unix time value
753 #ifndef sun
754 time_t tim = timegm(&tm);
755 #else
756 // No timegm on Sun. Use mktime, then correct for local timezone
757 time_t tim = mktime(&tm);
758 // altzone and timezone hold the difference in seconds between UTC
759 // and local. They are negative for places east of greenwich
760 //
761 // mktime takes our buffer to be local time, so it adds timezone
762 // to the conversion result (if timezone is < 0 it's currently
763 // earlier in greenwhich).
764 //
765 // We have to substract it back (hey! hopefully! maybe we have to
766 // add it). Who can really know?
767 tim -= timezone;
768 #endif
769
770 // And add in the correction from the email's Tz
771 tim += zonesecs;
772
773 DATEDEB((stderr, "Date: %s uxtime %ld \n", ctime(&tim), tim));
774 return tim;
775 }
776
777 #else
778
779 #include <stdio.h>
780 #include <stdlib.h>
781 #include <string.h>
782 #include <time.h>
783
784 #include <string>
785 #include "mimeparse.h"
786 #include "readfile.h"
787
788
789 using namespace std;
790 extern bool rfc2231_decode(const string& in, string& out, string& charset);
791 extern time_t rfc2822DateToUxTime(const string& date);
// Program name, set from argv[0] in main() and used in the usage message
static const char *thisprog;

// Help text: one line per test selection option
static char usage [] =
    "-p: header value and parameter test\n"
    "-q: qp decoding\n"
    "-b: base64\n"
    "-7: rfc2047\n"
    "-1: rfc2231\n"
    "-t: date time\n"
    " \n\n"
    ;

// Print the usage message to stderr and exit with status 1.
static void
Usage(void)
{
    fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
    exit(1);
}
809
810 static int op_flags;
811 #define OPT_MOINS 0x1
812 #define OPT_p 0x2
813 #define OPT_q 0x4
814 #define OPT_b 0x8
815 #define OPT_7 0x10
816 #define OPT_1 0x20
817 #define OPT_t 0x40
818 int
main(int argc,const char ** argv)819 main(int argc, const char **argv)
820 {
821 int count = 10;
822
823 thisprog = argv[0];
824 argc--; argv++;
825
826 while (argc > 0 && **argv == '-') {
827 (*argv)++;
828 if (!(**argv))
829 /* Cas du "adb - core" */
830 Usage();
831 while (**argv)
832 switch (*(*argv)++) {
833 case 'p': op_flags |= OPT_p; break;
834 case 'q': op_flags |= OPT_q; break;
835 case 'b': op_flags |= OPT_b; break;
836 case '1': op_flags |= OPT_1; break;
837 case '7': op_flags |= OPT_7; break;
838 case 't': op_flags |= OPT_t; break;
839 default: Usage(); break;
840 }
841 b1: argc--; argv++;
842 }
843
844 if (argc != 0)
845 Usage();
846
847 if (op_flags & OPT_p) {
848 // Mime header value and parameters extraction
849 const char *tr[] = {
850 "text/html;charset = UTF-8 ; otherparam=garb; \n"
851 "QUOTEDPARAM=\"quoted value\"",
852
853 "text/plain; charset=ASCII\r\n name=\"809D3016_5691DPS_5.2.LIC\"",
854
855 "application/x-stuff;"
856 "title*0*=us-ascii'en'This%20is%20even%20more%20;"
857 "title*1*=%2A%2A%2Afun%2A%2A%2A%20;"
858 "title*2=\"isn't it!\"",
859
860 // The following are all invalid, trying to crash the parser...
861 "",
862 // This does not parse because of whitespace in the value.
863 " complete garbage;",
864 // This parses, but only the first word gets into the value
865 " some value",
866 " word ;", ";", "=", "; = ", "a;=\"toto tutu\"=", ";;;;a=b",
867 };
868
869 for (unsigned int i = 0; i < sizeof(tr) / sizeof(char *); i++) {
870 MimeHeaderValue parsed;
871 if (!parseMimeHeaderValue(tr[i], parsed)) {
872 fprintf(stderr, "PARSE ERROR for [%s]\n", tr[i]);
873 continue;
874 }
875 printf("Field value: [%s]\n", parsed.value.c_str());
876 map<string, string>::iterator it;
877 for (it = parsed.params.begin();it != parsed.params.end();it++) {
878 if (it == parsed.params.begin())
879 printf("Parameters:\n");
880 printf(" [%s] = [%s]\n", it->first.c_str(), it->second.c_str());
881 }
882 }
883
884 } else if (op_flags & OPT_q) {
885 // Quoted printable stuff
886 const char *qp =
887 "=41=68 =e0 boire=\r\n continue 1ere\ndeuxieme\n\r3eme "
888 "agrave is: '=E0' probable skipped decode error: =\n"
889 "Actual decode error =xx this wont show";
890
891 string out;
892 if (!qp_decode(string(qp), out)) {
893 fprintf(stderr, "qp_decode returned error\n");
894 }
895 printf("Decoded: '%s'\n", out.c_str());
896 } else if (op_flags & OPT_b) {
897 // Base64
898 //'C'est � boire qu'il nous faut �viter l'exc�s.'
899 //'Deuxi�me ligne'
900 //'Troisi�me ligne'
901 //'Et la fin (pas de nl). '
902 const char *b64 =
903 "Qydlc3Qg4CBib2lyZSBxdSdpbCBub3VzIGZhdXQg6XZpdGVyIGwnZXhj6HMuCkRldXhp6G1l\r\n"
904 "IGxpZ25lClRyb2lzaehtZSBsaWduZQpFdCBsYSBmaW4gKHBhcyBkZSBubCkuIA==\r\n";
905
906 string out;
907 if (!base64_decode(string(b64), out)) {
908 fprintf(stderr, "base64_decode returned error\n");
909 exit(1);
910 }
911 printf("Decoded: [%s]\n", out.c_str());
912 #if 0
913 string coded, decoded;
914 const char *fname = "/tmp/recoll_decodefail";
915 if (!file_to_string(fname, coded)) {
916 fprintf(stderr, "Cant read %s\n", fname);
917 exit(1);
918 }
919
920 if (!base64_decode(coded, decoded)) {
921 fprintf(stderr, "base64_decode returned error\n");
922 exit(1);
923 }
924 printf("Decoded: [%s]\n", decoded.c_str());
925 #endif
926
927 } else if (op_flags & (OPT_7|OPT_1)) {
928 // rfc2047
929 char line [1024];
930 string out;
931 bool res;
932 while (fgets(line, 1023, stdin)) {
933 int l = strlen(line);
934 if (l == 0)
935 continue;
936 line[l-1] = 0;
937 fprintf(stderr, "Line: [%s]\n", line);
938 string charset;
939 if (op_flags & OPT_7) {
940 res = rfc2047_decode(line, out);
941 } else {
942 res = rfc2231_decode(line, out, charset);
943 }
944 if (res)
945 fprintf(stderr, "Out: [%s] cs %s\n", out.c_str(), charset.c_str());
946 else
947 fprintf(stderr, "Decoding failed\n");
948 }
949 } else if (op_flags & OPT_t) {
950 time_t t;
951
952 const char *dates[] = {
953 " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)",
954 " Mon, 3 Jul 2006 09:51:58 +0200",
955 " Wed, 13 Sep 2006 08:19:48 GMT-07:00",
956 " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)",
957 " Sat, 23 Dec 89 19:27:12 EST",
958 " 13 Jan 90 08:23:29 GMT"};
959
960 for (unsigned int i = 0; i <sizeof(dates) / sizeof(char *); i++) {
961 t = rfc2822DateToUxTime(dates[i]);
962 struct tm *tm = localtime(&t);
963 char datebuf[100];
964 strftime(datebuf, 99, " %Y-%m-%d %H:%M:%S %z", tm);
965 printf("[%s] -> [%s]\n", dates[i], datebuf);
966 }
967 printf("Enter date:\n");
968 char line [1024];
969 while (fgets(line, 1023, stdin)) {
970 int l = strlen(line);
971 if (l == 0) continue;
972 line[l-1] = 0;
973 t = rfc2822DateToUxTime(line);
974 struct tm *tm = localtime(&t);
975 char datebuf[100];
976 strftime(datebuf, 99, " %Y-%m-%d %H:%M:%S %z", tm);
977 printf("[%s] -> [%s]\n", line, datebuf);
978 }
979
980
981 }
982 exit(0);
983 }
984
985 #endif // TEST_MIMEPARSE
986