1 /* Copyright (C) 2004 J.F.Dockes
2  *   This program is free software; you can redistribute it and/or modify
3  *   it under the terms of the GNU General Public License as published by
4  *   the Free Software Foundation; either version 2 of the License, or
5  *   (at your option) any later version.
6  *
7  *   This program is distributed in the hope that it will be useful,
8  *   but WITHOUT ANY WARRANTY; without even the implied warranty of
9  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10  *   GNU General Public License for more details.
11  *
12  *   You should have received a copy of the GNU General Public License
13  *   along with this program; if not, write to the
14  *   Free Software Foundation, Inc.,
15  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
16  */
17 
18 #ifndef TEST_MIMEPARSE
19 #include "autoconfig.h"
20 
21 #include <string>
22 #include <vector>
23 
24 #include <ctype.h>
25 #include <stdio.h>
26 #include <ctype.h>
27 #include <time.h>
28 #include <cstdlib>
29 #include <cstring>
30 
31 #include "mimeparse.h"
32 #include "base64.h"
33 #include "transcode.h"
34 #include "smallut.h"
35 
36 using namespace std;
37 
// Debug tracing switch. Uncomment the DEBUG_MIMEPARSE definition below
// to make DPRINT expand to an fprintf call. DPRINT takes one
// parenthesized fprintf argument list, e.g.
//     DPRINT((stderr, "value [%s]\n", s.c_str()));
// When DEBUG_MIMEPARSE is not defined, DPRINT expands to nothing.
//#define DEBUG_MIMEPARSE
#ifdef DEBUG_MIMEPARSE
#define DPRINT(X) fprintf X
#else
#define DPRINT(X)
#endif
44 
// Parsing a header value. Only content-type and content-disposition
// have parameters, but the other headers are compatible with the
// content-type syntax, they just do not use parameters. So we can
// parse them all like:
48 //
49 //    headertype: value [; paramname=paramvalue] ...
50 //
51 // Value and paramvalues can be quoted strings, and there can be
52 // comments too. Note that RFC2047 is explicitly forbidden for
53 // parameter values (RFC2231 must be used), but I have seen it used
54 // anyway (ie: thunderbird 1.0)
55 //
56 // Ref: RFC2045/6/7 (MIME) RFC2183/2231 (content-disposition and encodings)
57 
58 
59 
60 /** Decode a MIME parameter value encoded according to rfc2231
61  *
62  * Example input withs input charset == "":
63  *     [iso-8859-1'french'RE%A0%3A_Smoke_Tests%20bla]
64  * Or (if charset is set) : RE%A0%3A_Smoke_Tests%20bla
65  *
66  * @param in input string, ascii with rfc2231 markup
67  * @param out output string
68  * @param charset if empty: decode string like 'charset'lang'more%20stuff,
69  *      else just do the %XX part
70  * @return out output string encoded in utf-8
71  */
rfc2231_decode(const string & in,string & out,string & charset)72 bool rfc2231_decode(const string &in, string &out, string &charset)
73 {
74     string::size_type pos1, pos2=0;
75 
76     if (charset.empty()) {
77         if ((pos1 = in.find("'")) == string::npos)
78             return false;
79         charset = in.substr(0, pos1);
80         // fprintf(stderr, "Charset: [%s]\n", charset.c_str());
81         pos1++;
82 
83         if ((pos2 = in.find("'", pos1)) == string::npos)
84             return false;
85         // We have no use for lang for now
86         // string lang = in.substr(pos1, pos2-pos1);
87         // fprintf(stderr, "Lang: [%s]\n", lang.c_str());
88         pos2++;
89     }
90 
91     string raw;
92     qp_decode(in.substr(pos2), raw, '%');
93     // fprintf(stderr, "raw [%s]\n", raw.c_str());
94     if (!transcode(raw, out, charset, "UTF-8"))
95         return false;
96     return true;
97 }
98 
99 
100 /////////////////////////////////////////
101 /// Decoding of MIME fields values and parameters
102 
// Description of the lexical element found by find_next_token(): its
// kind, its text, the error messages appended while scanning, and the
// opening quote character when the token was a quoted string (else 0).
struct Lexical {
    enum kind {none, token, separator};

    kind   what;
    string value;
    string error;
    char   quote;

    Lexical()
        : what(none), quote(0)
    {
    }

    // Bring the object back to its freshly constructed state
    void reset()
    {
        what = none;
        quote = 0;
        value.erase();
        error.erase();
    }
};
114 
115 // Skip mime comment. This must be called with in[start] == '('
116 static string::size_type
skip_comment(const string & in,string::size_type start,Lexical & lex)117 skip_comment(const string &in, string::size_type start, Lexical &lex)
118 {
119     int commentlevel = 0;
120     for (; start < in.size(); start++) {
121         if (in[start] == '\\') {
122             // Skip escaped char.
123             if (start+1 < in.size()) {
124                 start++;
125                 continue;
126             } else {
127                 lex.error.append("\\ at end of string ");
128                 return in.size();
129             }
130         }
131         if (in[start] == '(')
132             commentlevel++;
133         if (in[start] == ')') {
134             if (--commentlevel == 0)
135                 break;
136         }
137     }
138     if (start == in.size() && commentlevel != 0) {
139         lex.error.append("Unclosed comment ");
140         return in.size();
141     }
142     return start;
143 }
144 
145 // Skip initial whitespace and (possibly nested) comments.
146 static string::size_type
skip_whitespace_and_comment(const string & in,string::size_type start,Lexical & lex)147 skip_whitespace_and_comment(const string &in, string::size_type start,
148                             Lexical &lex)
149 {
150     while (1) {
151         if ((start = in.find_first_not_of(" \t\r\n", start)) == string::npos)
152             return in.size();
153         if (in[start] == '(') {
154             if ((start = skip_comment(in, start, lex)) == string::npos)
155                 return string::npos;
156         } else {
157             break;
158         }
159     }
160     return start;
161 }
162 
163 /// Find next token in mime header value string.
164 /// @return the next starting position in string, string::npos for error
165 /// @param in the input string
166 /// @param start the starting position
167 /// @param lex  the returned token and its description
168 /// @param delims separators we should look for
169 static string::size_type
find_next_token(const string & in,string::size_type start,Lexical & lex,string delims=";=")170 find_next_token(const string &in, string::size_type start,
171                 Lexical &lex, string delims = ";=")
172 {
173     char oquot, cquot;
174 
175     start = skip_whitespace_and_comment(in, start, lex);
176     if (start == string::npos || start == in.size())
177         return in.size();
178 
179     // Begins with separator ? return it.
180     string::size_type delimi = delims.find_first_of(in[start]);
181     if (delimi != string::npos) {
182         lex.what = Lexical::separator;
183         lex.value = delims[delimi];
184         return start+1;
185     }
186 
187     // Check for start of quoted string
188     oquot = in[start];
189     switch (oquot) {
190     case '<': cquot = '>';break;
191     case '"': cquot = '"';break;
192     default: cquot = 0; break;
193     }
194 
195     if (cquot != 0) {
196         // Quoted string parsing
197         string::size_type end;
198         start++; // Skip quote character
199         for (end = start;end < in.size() && in[end] != cquot; end++) {
200             if (in[end] == '\\') {
201                 // Skip escaped char.
202                 if (end+1 < in.size()) {
203                     end++;
204                 } else {
205                     // backslash at end of string: error
206                     lex.error.append("\\ at end of string ");
207                     return string::npos;
208                 }
209             }
210         }
211         if (end == in.size()) {
212             // Found end of string before closing quote character: error
213             lex.error.append("Unclosed quoted string ");
214             return string::npos;
215         }
216         lex.what = Lexical::token;
217         lex.value = in.substr(start, end-start);
218         lex.quote = oquot;
219         return ++end;
220     } else {
221         string::size_type end = in.find_first_of(delims + "\r\n \t(", start);
222         lex.what = Lexical::token;
223         lex.quote = 0;
224         if (end == string::npos) {
225             end = in.size();
226             lex.value = in.substr(start);
227         } else {
228             lex.value = in.substr(start, end-start);
229         }
230         return end;
231     }
232 }
233 
// Classes for handling rfc2231 value continuations

// One piece of a parameter value. 'decode' tells if the parameter name
// had a trailing '*', 'value' is the raw text of the piece.
struct Chunk {
    bool   decode;
    string value;

    Chunk()
        : decode(false)
    {
    }
};
241 class Chunks {
242 public:
243     vector<Chunk> chunks;
244 };
245 
// Append a lowercased copy of 'in' to 'out'. Note that 'out' is not
// cleared first: the converted characters are added after its current
// content.
void stringtolower(string &out, const string& in)
{
    out.reserve(out.size() + in.size());
    for (string::size_type i = 0; i < in.size(); i++) {
        // tolower() needs a value representable as unsigned char:
        // passing a plain char is undefined for 8 bit characters when
        // char is signed.
        out.append(1, char(tolower((unsigned char)in[i])));
    }
}
251 
252 // Parse MIME field value. Should look like:
253 //  somevalue ; param1=val1;param2=val2
parseMimeHeaderValue(const string & value,MimeHeaderValue & parsed)254 bool parseMimeHeaderValue(const string& value, MimeHeaderValue& parsed)
255 {
256     parsed.value.erase();
257     parsed.params.clear();
258 
259     Lexical lex;
260     string::size_type start = 0;
261 
262     // Get the field value
263     start = find_next_token(value, start, lex);
264     if (start == string::npos || lex.what != Lexical::token)
265         return false;
266     parsed.value = lex.value;
267 
268     map<string, string> rawparams;
269     // Look for parameters
270     for (;;) {
271         string paramname, paramvalue;
272         lex.reset();
273         start = find_next_token(value, start, lex);
274         if (start == value.size())
275             break;
276         if (start == string::npos) {
277             //fprintf(stderr, "Find_next_token error(1)\n");
278             return false;
279         }
280         if (lex.what == Lexical::separator && lex.value[0] == ';')
281             continue;
282         if (lex.what != Lexical::token)
283             return false;
284         stringtolower(paramname, lex.value);
285 
286         start = find_next_token(value, start, lex);
287         if (start == string::npos || lex.what != Lexical::separator ||
288             lex.value[0] != '=') {
289             //fprintf(stderr, "Find_next_token error (2)\n");
290             return false;
291         }
292 
293         start = find_next_token(value, start, lex);
294         if (start == string::npos || lex.what != Lexical::token) {
295             //fprintf(stderr, "Parameter has no value!");
296             return false;
297         }
298         paramvalue = lex.value;
299         rawparams[paramname] = paramvalue;
300         //fprintf(stderr, "RAW: name [%s], value [%s]\n", paramname.c_str(),
301         //              paramvalue.c_str());
302     }
303     //    fprintf(stderr, "Number of raw params %d\n", rawparams.size());
304 
305     // RFC2231 handling:
306     // - if a parameter name ends in * it must be decoded
307     // - If a parameter name looks line name*ii[*] it is a
308     //   partial value, and must be concatenated with other such.
309 
310     map<string, Chunks> chunks;
311     for (map<string, string>::const_iterator it = rawparams.begin();
312          it != rawparams.end(); it++) {
313         string nm = it->first;
314         //      fprintf(stderr, "NM: [%s]\n", nm.c_str());
315         if (nm.empty()) // ??
316             continue;
317 
318         Chunk chunk;
319         if (nm[nm.length()-1] == '*') {
320             nm.erase(nm.length() - 1);
321             chunk.decode = true;
322         } else
323             chunk.decode = false;
324         //      fprintf(stderr, "NM1: [%s]\n", nm.c_str());
325 
326         chunk.value = it->second;
327 
328         // Look for another asterisk in nm. If none, assign index 0
329         string::size_type aster;
330         int idx = 0;
331         if ((aster = nm.rfind("*")) != string::npos) {
332             string num = nm.substr(aster+1);
333             //fprintf(stderr, "NUM: [%s]\n", num.c_str());
334             nm.erase(aster);
335             idx = atoi(num.c_str());
336         }
337         Chunks empty;
338         if (chunks.find(nm) == chunks.end())
339             chunks[nm] = empty;
340         chunks[nm].chunks.resize(idx+1);
341         chunks[nm].chunks[idx] = chunk;
342         //fprintf(stderr, "CHNKS: nm [%s], idx %d, decode %d, value [%s]\n",
343         // nm.c_str(), idx, int(chunk.decode), chunk.value.c_str());
344     }
345 
346     // For each parameter name, concatenate its chunks and possibly
347     // decode Note that we pass the whole concatenated string to
348     // decoding if the first chunk indicates that decoding is needed,
349     // which is not right because there might be uncoded chunks
350     // according to the rfc.
351     for (map<string, Chunks>::const_iterator it = chunks.begin();
352          it != chunks.end(); it++) {
353         if (it->second.chunks.empty())
354             continue;
355         string nm = it->first;
356         // Create the name entry
357         if (parsed.params.find(nm) == parsed.params.end())
358             parsed.params[nm].clear();
359         // Concatenate all chunks and decode the whole if the first one needs
360         // to. Yes, this is not quite right.
361         string value;
362         for (vector<Chunk>::const_iterator vi = it->second.chunks.begin();
363              vi != it->second.chunks.end(); vi++) {
364             value += vi->value;
365         }
366         if (it->second.chunks[0].decode) {
367             string charset;
368             rfc2231_decode(value, parsed.params[nm], charset);
369         } else {
370             // rfc2047 MUST NOT but IS used by some agents
371             rfc2047_decode(value, parsed.params[nm]);
372         }
373         //fprintf(stderr, "FINAL: nm [%s], value [%s]\n",
374         //nm.c_str(), parsed.params[nm].c_str());
375     }
376 
377     return true;
378 }
379 
// Value of an hexadecimal digit (either case), or -1 if the character
// is not one.
static int qp_hexval(char c)
{
    if (c >= 'A' && c <= 'F')
        return c - 'A' + 10;
    if (c >= 'a' && c <= 'f')
        return c - 'a' + 10;
    if (c >= '0' && c <= '9')
        return c - '0';
    return -1;
}

// Decode a string encoded with quoted-printable encoding.
// we reuse the code for rfc2231 % encoding, even if the eol
// processing is not useful in this case
//
// The decoded data is appended to 'out'. Each escape character
// followed by 2 hexadecimal digits yields one byte. An escape followed
// by CR LF, or by a lone CR or LF, is dropped along with the line
// ending. If less than 2 characters follow an escape, decoding stops
// there and the call still succeeds.
// @param esc the escape character ('=' or '%')
// @return false if an escape is followed by something which is
//   neither an hexadecimal digit nor an end of line, else true.
bool qp_decode(const string& in, string &out, char esc)
{
    out.reserve(in.length());

    const string::size_type len = in.length();
    string::size_type pos = 0;
    while (pos < len) {
        const char cur = in[pos];
        if (cur != esc) {
            // Ordinary character: copy
            out += cur;
            pos++;
            continue;
        }

        pos++; // Skip '=' or '%'
        if (pos >= len - 1) {
            // Need at least 2 more chars
            break;
        }

        if (in[pos] == '\r' && in[pos + 1] == '\n') {
            // Soft nl, skip
            pos += 2;
            continue;
        }
        if (in[pos] == '\n' || in[pos] == '\r') {
            // Lone end of line character: skip it too
            pos++;
            continue;
        }

        // Two hex digits. The test above guarantees that both exist.
        const int hi = qp_hexval(in[pos]);
        if (hi < 0)
            return false;
        const int lo = qp_hexval(in[pos + 1]);
        if (lo < 0)
            return false;
        out += char(hi * 16 + lo);
        pos += 2;
    }
    return true;
}
426 
427 // Decode an word encoded as quoted printable or base 64
rfc2047_decodeParsed(const std::string & charset,const std::string & encoding,const std::string & value,std::string & utf8)428 static bool rfc2047_decodeParsed(const std::string& charset,
429                                  const std::string& encoding,
430                                  const std::string& value,
431                                  std::string &utf8)
432 {
433     DPRINT((stderr, "DecodeParsed: charset [%s] enc [%s] val [%s]\n",
434             charset.c_str(), encoding.c_str(), value.c_str()));
435     utf8.clear();
436 
437     string decoded;
438     if (!stringlowercmp("b", encoding)) {
439         if (!base64_decode(value, decoded))
440             return false;
441         DPRINT((stderr, "FromB64: [%s]\n", decoded.c_str()));
442     } else if (!stringlowercmp("q", encoding)) {
443         if (!qp_decode(value, decoded))
444             return false;
445         // Need to translate _ to ' ' here
446         string temp;
447         for (string::size_type pos = 0; pos < decoded.length(); pos++)
448             if (decoded[pos] == '_')
449                 temp += ' ';
450             else
451                 temp += decoded[pos];
452         decoded = temp;
453         DPRINT((stderr, "FromQP: [%s]\n", decoded.c_str()));
454     } else {
455         DPRINT((stderr, "Bad encoding [%s]\n", encoding.c_str()));
456         return false;
457     }
458 
459     if (!transcode(decoded, utf8, charset, "UTF-8")) {
460         DPRINT((stderr, "Transcode failed\n"));
461         return false;
462     }
463     return true;
464 }
465 
466 // Parse a mail header value encoded according to RFC2047.
467 // This is not supposed to be used for MIME parameter values, but it
468 // happens.
469 // Bugs:
470 //    - We should turn off decoding while inside quoted strings
471 //
// States of the rfc2047_decode() scanner, listed in the order in which
// an encoded word (=?charset?encoding?value?=) goes through them.
enum Rfc2047States {
    rfc2047ready,     // In plain text, watching for '='
    rfc2047open_eq,   // Seen '=', an encoded word starts if '?' follows
    rfc2047charset,   // Accumulating the charset, up to '?'
    rfc2047encoding,  // Accumulating the encoding, up to '?'
    rfc2047value,     // Accumulating the encoded text, up to '?'
    rfc2047close_q    // Seen '?' in the text, the word ends if '=' follows
};
475 
rfc2047_decode(const std::string & in,std::string & out)476 bool rfc2047_decode(const std::string& in, std::string &out)
477 {
478     DPRINT((stderr, "rfc2047_decode: [%s]\n", in.c_str()));
479 
480     Rfc2047States state = rfc2047ready;
481     string encoding, charset, value, utf8;
482 
483     out.clear();
484 
485     for (string::size_type ii = 0; ii < in.length(); ii++) {
486         char ch = in[ii];
487         switch (state) {
488         case rfc2047ready:
489         {
490             DPRINT((stderr, "STATE: ready, ch %c\n", ch));
491             switch (ch) {
492                 // Whitespace: stay ready
493             case ' ': case '\t': value += ch;break;
494                 // '=' -> forward to next state
495             case '=': state = rfc2047open_eq; break;
496                 DPRINT((stderr, "STATE: open_eq\n"));
497                 // Other: go back to sleep
498             default: value += ch; state = rfc2047ready;
499             }
500         }
501         break;
502         case rfc2047open_eq:
503         {
504             DPRINT((stderr, "STATE: open_eq, ch %c\n", ch));
505             switch (ch) {
506             case '?':
507             {
508                 // Transcode current (unencoded part) value:
509                 // we sometimes find 8-bit chars in
510                 // there. Interpret as Iso8859.
511                 if (value.length() > 0) {
512                     transcode(value, utf8, "ISO-8859-1", "UTF-8");
513                     out += utf8;
514                     value.clear();
515                 }
516                 state = rfc2047charset;
517             }
518             break;
519             default: state = rfc2047ready; value += '='; value += ch;break;
520             }
521         }
522         break;
523         case rfc2047charset:
524         {
525             DPRINT((stderr, "STATE: charset, ch %c\n", ch));
526             switch (ch) {
527             case '?': state = rfc2047encoding; break;
528             default: charset += ch; break;
529             }
530         }
531         break;
532         case rfc2047encoding:
533         {
534             DPRINT((stderr, "STATE: encoding, ch %c\n", ch));
535             switch (ch) {
536             case '?': state = rfc2047value; break;
537             default: encoding += ch; break;
538             }
539         }
540         break;
541         case rfc2047value:
542         {
543             DPRINT((stderr, "STATE: value, ch %c\n", ch));
544             switch (ch) {
545             case '?': state = rfc2047close_q; break;
546             default: value += ch;break;
547             }
548         }
549         break;
550         case rfc2047close_q:
551         {
552             DPRINT((stderr, "STATE: close_q, ch %c\n", ch));
553             switch (ch) {
554             case '=':
555             {
556                 DPRINT((stderr, "End of encoded area. Charset %s, Encoding %s\n", charset.c_str(), encoding.c_str()));
557                 string utf8;
558                 state = rfc2047ready;
559                 if (!rfc2047_decodeParsed(charset, encoding, value,
560                                           utf8)) {
561                     return false;
562                 }
563                 out += utf8;
564                 charset.clear();
565                 encoding.clear();
566                 value.clear();
567             }
568             break;
569             default: state = rfc2047value; value += '?';value += ch;break;
570             }
571         }
572         break;
573         default: // ??
574             DPRINT((stderr, "STATE: default ?? ch %c\n", ch));
575             return false;
576         }
577     }
578 
579     if (value.length() > 0) {
580         transcode(value, utf8, "CP1252", "UTF-8");
581         out += utf8;
582         value.clear();
583     }
584     if (state != rfc2047ready)
585         return false;
586     return true;
587 }
588 
// Debug tracing for the date conversion code. Set DEBUGDATE to 1 to
// make DATEDEB expand to an fprintf call. DATEDEB takes one
// parenthesized fprintf argument list, e.g.
//     DATEDEB((stderr, "Bad date: [%s]\n", dt.c_str()));
// With DEBUGDATE set to 0, DATEDEB expands to nothing.
#define DEBUGDATE 0
#if DEBUGDATE
#define DATEDEB(X) fprintf X
#else
#define DATEDEB(X)
#endif
595 
596 // Convert rfc822 date to unix time. A date string normally looks like:
597 //  Mon, 3 Jul 2006 09:51:58 +0200
598 // But there are many close common variations
599 // And also hopeless things like: Fri Nov  3 13:13:33 2006
rfc2822DateToUxTime(const string & dt)600 time_t rfc2822DateToUxTime(const string& dt)
601 {
602     // Strip everything up to first comma if any, we don't need weekday,
603     // then break into tokens
604     vector<string> toks;
605     string::size_type idx;
606     if ((idx = dt.find_first_of(",")) != string::npos) {
607         if (idx == dt.length() - 1) {
608             DATEDEB((stderr, "Bad rfc822 date format (short1): [%s]\n",
609                      dt.c_str()));
610             return (time_t)-1;
611         }
612         string date = dt.substr(idx+1);
613         stringToTokens(date, toks, " \t:");
614     } else {
615         // No comma. Enter strangeland
616         stringToTokens(dt, toks, " \t:");
617         // Test for date like: Sun Nov 19 06:18:41 2006
618         //                      0   1  2   3 4  5  6
619         // and change to:      19 Nov 2006 06:18:41
620         if (toks.size() == 7) {
621             if (toks[0].length() == 3 &&
622                 toks[0].find_first_of("0123456789") == string::npos) {
623                 swap(toks[0], toks[2]);
624                 swap(toks[6], toks[2]);
625                 toks.pop_back();
626             }
627         }
628     }
629 
630 #if DEBUGDATE
631     for (list<string>::iterator it = toks.begin(); it != toks.end(); it++) {
632         DATEDEB((stderr, "[%s] ", it->c_str()));
633     }
634     DATEDEB((stderr, "\n"));
635 #endif
636 
637     if (toks.size() < 6) {
638         DATEDEB((stderr, "Bad rfc822 date format (toks cnt): [%s]\n",
639                  dt.c_str()));
640         return (time_t)-1;
641     }
642 
643     if (toks.size() == 6) {
644         // Probably no timezone, sometimes happens
645         toks.push_back("+0000");
646     }
647 
648     struct tm tm;
649     memset(&tm, 0, sizeof(tm));
650 
651     // Load struct tm with appropriate tokens, possibly converting
652     // when needed
653 
654     vector<string>::iterator it = toks.begin();
655 
656     // Day of month: no conversion needed
657     tm.tm_mday = atoi(it->c_str());
658     it++;
659 
660     // Month. Only Jan-Dec are legal. January, February do happen
661     // though. Convert to 0-11
662     if (*it == "Jan" || *it == "January") tm.tm_mon = 0; else if
663         (*it == "Feb" || *it == "February") tm.tm_mon = 1; else if
664         (*it == "Mar" || *it == "March") tm.tm_mon = 2; else if
665         (*it == "Apr" || *it == "April") tm.tm_mon = 3; else if
666         (*it == "May") tm.tm_mon = 4; else if
667         (*it == "Jun" || *it == "June") tm.tm_mon = 5; else if
668         (*it == "Jul" || *it == "July") tm.tm_mon = 6; else if
669         (*it == "Aug" || *it == "August") tm.tm_mon = 7; else if
670         (*it == "Sep" || *it == "September") tm.tm_mon = 8; else if
671         (*it == "Oct" || *it == "October") tm.tm_mon = 9; else if
672         (*it == "Nov" || *it == "November") tm.tm_mon = 10; else if
673         (*it == "Dec" || *it == "December") tm.tm_mon = 11; else {
674         DATEDEB((stderr, "Bad rfc822 date format (month): [%s]\n",
675                  dt.c_str()));
676         return (time_t)-1;
677     }
678     it++;
679 
680     // Year. Struct tm counts from 1900. 2 char years are quite rare
681     // but do happen. I've seen 00 happen so count small values from 2000
682     tm.tm_year = atoi(it->c_str());
683     if (it->length() == 2) {
684         if (tm.tm_year < 10)
685             tm.tm_year += 2000;
686         else
687             tm.tm_year += 1900;
688     }
689     if (tm.tm_year > 1900)
690         tm.tm_year -= 1900;
691     it++;
692 
693     // Hour minute second need no adjustments
694     tm.tm_hour = atoi(it->c_str()); it++;
695     tm.tm_min  = atoi(it->c_str()); it++;
696     tm.tm_sec  = atoi(it->c_str()); it++;
697 
698 
699     // Timezone is supposed to be either +-XYZT or a zone name
700     int zonesecs = 0;
701     if (it->length() < 1) {
702         DATEDEB((stderr, "Bad rfc822 date format (zlen): [%s]\n", dt.c_str()));
703         return (time_t)-1;
704     }
705     if (it->at(0) == '-' || it->at(0) == '+') {
706         // Note that +xy:zt (instead of +xyzt) sometimes happen, we
707         // may want to process it one day
708         if (it->length() < 5) {
709             DATEDEB((stderr, "Bad rfc822 date format (zlen1): [%s]\n",
710                      dt.c_str()));
711             goto nozone;
712         }
713         zonesecs = 3600*((it->at(1)-'0') * 10 + it->at(2)-'0')+
714             (it->at(3)-'0')*10 + it->at(4)-'0';
715         zonesecs = it->at(0) == '+' ? -1 * zonesecs : zonesecs;
716     } else {
717         int hours;
718         if (*it == "A") hours= 1; else if (*it == "B") hours= 2;
719         else if (*it == "C") hours= 3; else if (*it == "D") hours= 4;
720         else if (*it == "E") hours= 5; else if (*it == "F") hours= 6;
721         else if (*it == "G") hours= 7; else if (*it == "H") hours= 8;
722         else if (*it == "I") hours= 9; else if (*it == "K") hours= 10;
723         else if (*it == "L") hours= 11; else if (*it == "M") hours= 12;
724         else if (*it == "N") hours= -1; else if (*it == "O") hours= -2;
725         else if (*it == "P") hours= -3; else if (*it == "Q") hours= -4;
726         else if (*it == "R") hours= -5; else if (*it == "S") hours= -6;
727         else if (*it == "T") hours= -7; else if (*it == "U") hours= -8;
728         else if (*it == "V") hours= -9; else if (*it == "W") hours= -10;
729         else if (*it == "X") hours= -11; else if (*it == "Y") hours= -12;
730         else if (*it == "Z") hours=  0; else if  (*it == "UT") hours= 0;
731         else if (*it == "GMT") hours= 0; else if (*it == "EST") hours= 5;
732         else if (*it == "EDT") hours= 4; else if (*it == "CST") hours= 6;
733         else if (*it == "CDT") hours= 5; else if (*it == "MST") hours= 7;
734         else if (*it == "MDT") hours= 6; else if (*it == "PST") hours= 8;
735         else if (*it == "PDT") hours= 7;
736         // Non standard names
737         // Standard Time (or Irish Summer Time?) is actually +5.5
738         else if (*it == "CET") hours= -1; else if (*it == "JST") hours= -9;
739         else if (*it == "IST") hours= -5; else if (*it == "WET") hours= 0;
740         else if (*it == "MET") hours= -1;
741         else {
742             DATEDEB((stderr, "Bad rfc822 date format (zname): [%s]\n",
743                      dt.c_str()));
744             // Forget tz
745             goto nozone;
746         }
747         zonesecs = 3600 * hours;
748     }
749     DATEDEB((stderr, "Tz: [%s] -> %d\n", it->c_str(), zonesecs));
750 nozone:
751 
752     // Compute the UTC Unix time value
753 #ifndef sun
754     time_t tim = timegm(&tm);
755 #else
756     // No timegm on Sun. Use mktime, then correct for local timezone
757     time_t tim = mktime(&tm);
758     // altzone and timezone hold the difference in seconds between UTC
759     // and local. They are negative for places east of greenwich
760     //
761     // mktime takes our buffer to be local time, so it adds timezone
762     // to the conversion result (if timezone is < 0 it's currently
763     // earlier in greenwhich).
764     //
765     // We have to substract it back (hey! hopefully! maybe we have to
766     // add it). Who can really know?
767     tim -= timezone;
768 #endif
769 
770     // And add in the correction from the email's Tz
771     tim += zonesecs;
772 
773     DATEDEB((stderr, "Date: %s  uxtime %ld \n", ctime(&tim), tim));
774     return tim;
775 }
776 
777 #else
778 
779 #include <stdio.h>
780 #include <stdlib.h>
781 #include <string.h>
782 #include <time.h>
783 
784 #include <string>
785 #include "mimeparse.h"
786 #include "readfile.h"
787 
788 
789 using namespace std;
790 extern bool rfc2231_decode(const string& in, string& out, string& charset);
791 extern time_t rfc2822DateToUxTime(const string& date);
792 static const char *thisprog;
793 
// Help text for the test driver: one line per command line option.
// It is printed by Usage().
static char usage [] =
    "-p: header value and parameter test\n"
    "-q: qp decoding\n"
    "-b: base64\n"
    "-7: rfc2047\n"
    "-1: rfc2231\n"
    "-t: date time\n"
    "  \n\n"
    ;
803 static void
Usage(void)804 Usage(void)
805 {
806     fprintf(stderr, "%s: usage:\n%s", thisprog, usage);
807     exit(1);
808 }
809 
// Options selected on the command line: an OR of the OPT_* bits below,
// one bit per option letter.
static int op_flags;
#define OPT_MOINS (1 << 0)
#define OPT_p     (1 << 1)
#define OPT_q     (1 << 2)
#define OPT_b     (1 << 3)
#define OPT_7     (1 << 4)
#define OPT_1     (1 << 5)
#define OPT_t     (1 << 6)
818 int
main(int argc,const char ** argv)819 main(int argc, const char **argv)
820 {
821     int count = 10;
822 
823     thisprog = argv[0];
824     argc--; argv++;
825 
826     while (argc > 0 && **argv == '-') {
827         (*argv)++;
828         if (!(**argv))
829             /* Cas du "adb - core" */
830             Usage();
831         while (**argv)
832             switch (*(*argv)++) {
833             case 'p':   op_flags |= OPT_p; break;
834             case 'q':   op_flags |= OPT_q; break;
835             case 'b':   op_flags |= OPT_b; break;
836             case '1':   op_flags |= OPT_1; break;
837             case '7':   op_flags |= OPT_7; break;
838             case 't':   op_flags |= OPT_t; break;
839             default: Usage();   break;
840             }
841     b1: argc--; argv++;
842     }
843 
844     if (argc != 0)
845         Usage();
846 
847     if (op_flags & OPT_p) {
848         // Mime header value and parameters extraction
849         const char *tr[] = {
850             "text/html;charset = UTF-8 ; otherparam=garb; \n"
851             "QUOTEDPARAM=\"quoted value\"",
852 
853             "text/plain; charset=ASCII\r\n name=\"809D3016_5691DPS_5.2.LIC\"",
854 
855             "application/x-stuff;"
856             "title*0*=us-ascii'en'This%20is%20even%20more%20;"
857             "title*1*=%2A%2A%2Afun%2A%2A%2A%20;"
858             "title*2=\"isn't it!\"",
859 
860             // The following are all invalid, trying to crash the parser...
861             "",
862             // This does not parse because of whitespace in the value.
863             " complete garbage;",
864             // This parses, but only the first word gets into the value
865             " some value",
866             " word ;",  ";",  "=",  "; = ",  "a;=\"toto tutu\"=", ";;;;a=b",
867         };
868 
869         for (unsigned int i = 0; i < sizeof(tr) / sizeof(char *); i++) {
870             MimeHeaderValue parsed;
871             if (!parseMimeHeaderValue(tr[i], parsed)) {
872                 fprintf(stderr, "PARSE ERROR for [%s]\n", tr[i]);
873                 continue;
874             }
875             printf("Field value: [%s]\n", parsed.value.c_str());
876             map<string, string>::iterator it;
877             for (it = parsed.params.begin();it != parsed.params.end();it++) {
878                 if (it == parsed.params.begin())
879                     printf("Parameters:\n");
880                 printf("  [%s] = [%s]\n", it->first.c_str(), it->second.c_str());
881             }
882         }
883 
884     } else if (op_flags & OPT_q) {
885         // Quoted printable stuff
886         const char *qp =
887             "=41=68 =e0 boire=\r\n continue 1ere\ndeuxieme\n\r3eme "
888             "agrave is: '=E0' probable skipped decode error: =\n"
889             "Actual decode error =xx this wont show";
890 
891         string out;
892         if (!qp_decode(string(qp), out)) {
893             fprintf(stderr, "qp_decode returned error\n");
894         }
895         printf("Decoded: '%s'\n", out.c_str());
896     } else if (op_flags & OPT_b) {
897         // Base64
898         //'C'est � boire qu'il nous faut �viter l'exc�s.'
899         //'Deuxi�me ligne'
900         //'Troisi�me ligne'
901         //'Et la fin (pas de nl). '
902         const char *b64 =
903             "Qydlc3Qg4CBib2lyZSBxdSdpbCBub3VzIGZhdXQg6XZpdGVyIGwnZXhj6HMuCkRldXhp6G1l\r\n"
904             "IGxpZ25lClRyb2lzaehtZSBsaWduZQpFdCBsYSBmaW4gKHBhcyBkZSBubCkuIA==\r\n";
905 
906         string out;
907         if (!base64_decode(string(b64), out)) {
908             fprintf(stderr, "base64_decode returned error\n");
909             exit(1);
910         }
911         printf("Decoded: [%s]\n", out.c_str());
912 #if 0
913         string coded, decoded;
914         const char *fname = "/tmp/recoll_decodefail";
915         if (!file_to_string(fname, coded)) {
916             fprintf(stderr, "Cant read %s\n", fname);
917             exit(1);
918         }
919 
920         if (!base64_decode(coded, decoded)) {
921             fprintf(stderr, "base64_decode returned error\n");
922             exit(1);
923         }
924         printf("Decoded: [%s]\n", decoded.c_str());
925 #endif
926 
927     } else if (op_flags & (OPT_7|OPT_1)) {
928         // rfc2047
929         char line [1024];
930         string out;
931         bool res;
932         while (fgets(line, 1023, stdin)) {
933             int l = strlen(line);
934             if (l == 0)
935                 continue;
936             line[l-1] = 0;
937             fprintf(stderr, "Line: [%s]\n", line);
938             string charset;
939             if (op_flags & OPT_7) {
940                 res = rfc2047_decode(line, out);
941             } else {
942                 res = rfc2231_decode(line, out, charset);
943             }
944             if (res)
945                 fprintf(stderr, "Out:  [%s] cs %s\n", out.c_str(), charset.c_str());
946             else
947                 fprintf(stderr, "Decoding failed\n");
948         }
949     } else if (op_flags & OPT_t) {
950         time_t t;
951 
952         const char *dates[] = {
953             " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)",
954             " Mon, 3 Jul 2006 09:51:58 +0200",
955             " Wed, 13 Sep 2006 08:19:48 GMT-07:00",
956             " Wed, 13 Sep 2006 11:40:26 -0700 (PDT)",
957             " Sat, 23 Dec 89 19:27:12 EST",
958             "   13 Jan 90 08:23:29 GMT"};
959 
960         for (unsigned int i = 0; i <sizeof(dates) / sizeof(char *); i++) {
961             t = rfc2822DateToUxTime(dates[i]);
962             struct tm *tm = localtime(&t);
963             char datebuf[100];
964             strftime(datebuf, 99, "&nbsp;%Y-%m-%d&nbsp;%H:%M:%S %z", tm);
965             printf("[%s] -> [%s]\n", dates[i], datebuf);
966         }
967         printf("Enter date:\n");
968         char line [1024];
969         while (fgets(line, 1023, stdin)) {
970             int l = strlen(line);
971             if (l == 0) continue;
972             line[l-1] = 0;
973             t = rfc2822DateToUxTime(line);
974             struct tm *tm = localtime(&t);
975             char datebuf[100];
976             strftime(datebuf, 99, "&nbsp;%Y-%m-%d&nbsp;%H:%M:%S %z", tm);
977             printf("[%s] -> [%s]\n", line, datebuf);
978         }
979 
980 
981     }
982     exit(0);
983 }
984 
985 #endif // TEST_MIMEPARSE
986