1 /*
2  * File: misc.c
3  *
4  * Copyright (C) 2000-2007 Jorge Arellano Cid <jcid@dillo.org>,
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 3 of the License, or
9  * (at your option) any later version.
10  */
11 
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <assert.h>
17 
18 #include "utf8.hh"
19 #include "msg.h"
20 #include "misc.h"
21 
22 /*
23  * Escape characters as %XX sequences.
24  * Return value: New string.
25  */
a_Misc_escape_chars(const char * str,const char * esc_set)26 char *a_Misc_escape_chars(const char *str, const char *esc_set)
27 {
28    static const char *const hex = "0123456789ABCDEF";
29    char *p = NULL;
30    Dstr *dstr;
31    int i;
32 
33    dstr = dStr_sized_new(64);
34    for (i = 0; str[i]; ++i) {
35       if (str[i] <= 0x1F || str[i] == 0x7F || strchr(esc_set, str[i])) {
36          dStr_append_c(dstr, '%');
37          dStr_append_c(dstr, hex[(str[i] >> 4) & 15]);
38          dStr_append_c(dstr, hex[str[i] & 15]);
39       } else {
40          dStr_append_c(dstr, str[i]);
41       }
42    }
43    p = dstr->str;
44    dStr_free(dstr, FALSE);
45 
46    return p;
47 }
48 
49 #define TAB_SIZE 8
50 /*
51  * Takes a string and converts any tabs to spaces.
52  */
53 int
a_Misc_expand_tabs(char ** start,char * end,char * buf,int buflen)54 a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen)
55 {
56    int j, pos = 0, written = 0, old_pos, char_len;
57    uint_t code;
58    static const int combining_char_space = 32;
59 
60    while (*start < end && written < buflen - TAB_SIZE - combining_char_space) {
61       code = a_Utf8_decode(*start, end, &char_len);
62 
63       if (code == '\t') {
64          /* Fill with whitespaces until the next tab. */
65          old_pos = pos;
66          pos += TAB_SIZE - (pos % TAB_SIZE);
67          for (j = old_pos; j < pos; j++)
68             buf[written++] = ' ';
69       } else {
70          assert(char_len <= 4);
71          for (j = 0; j < char_len; j++)
72             buf[written++] = (*start)[j];
73          pos++;
74       }
75 
76       *start += char_len;
77    }
78 
79    /* If following chars are combining chars (e.g. accents) add them to the
80     * buffer. We have reserved combining_char_space bytes for this.
81     * If there should be more combining chars, we split nevertheless.
82     */
83    while (*start < end && written < buflen - 4) {
84       code = a_Utf8_decode(*start, end, &char_len);
85 
86       if (! a_Utf8_combining_char(code))
87          break;
88 
89       assert(char_len <= 4);
90       for (j = 0; j < char_len; j++)
91          buf[written++] = (*start)[j];
92 
93       *start += char_len;
94    }
95 
96    return written;
97 }
98 
/* TODO: could use dStr ADT! */
/* A MIME type name together with its length in bytes (without the NUL). */
typedef struct {
   const char *str;
   int len;
} ContentType_t;

/*
 * MIME types known to this file. The table is terminated by an entry
 * whose 'str' is NULL. The position of each entry matters: the
 * DetectedContentType enumerators are used as indexes into this array.
 */
static const ContentType_t MimeTypes[] = {
   /* 0 */ { .str = "application/octet-stream", .len = 24 },
   /* 1 */ { .str = "application/xhtml+xml",    .len = 21 },
   /* 2 */ { .str = "text/html",                .len = 9  },
   /* 3 */ { .str = "text/plain",               .len = 10 },
   /* 4 */ { .str = "image/gif",                .len = 9  },
   /* 5 */ { .str = "image/png",                .len = 9  },
   /* 6 */ { .str = "image/jpeg",               .len = 10 },
   /* end marker */
   { .str = NULL, .len = 0 }
};
115 
/*
 * Result of content sniffing. Each value is the index of the matching
 * entry in the MimeTypes table, so the two must be kept in the same order.
 */
typedef enum {
   DT_OCTET_STREAM = 0,   /* "application/octet-stream" */
   DT_PLACEHOLDER  = 1,   /* occupies the "application/xhtml+xml" slot */
   DT_TEXT_HTML    = 2,   /* "text/html" */
   DT_TEXT_PLAIN   = 3,   /* "text/plain" */
   DT_IMAGE_GIF    = 4,   /* "image/gif" */
   DT_IMAGE_PNG    = 5,   /* "image/png" */
   DT_IMAGE_JPG    = 6    /* "image/jpeg" */
} DetectedContentType;
125 
126 /*
127  * Detects 'Content-Type' from a data stream sample.
128  *
129  * It uses the magic(5) logic from file(1). Currently, it
130  * only checks the few mime types that Dillo supports.
131  *
132  * 'Data' is a pointer to the first bytes of the raw data.
133  *
134  * Return value: (0 on success, 1 on doubt, 2 on lack of data).
135  */
a_Misc_get_content_type_from_data(void * Data,size_t Size,const char ** PT)136 int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
137 {
138    size_t i, non_ascci, non_ascci_text, bin_chars;
139    char *p = Data;
140    int st = 1;      /* default to "doubt' */
141    DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */
142 
143    /* HTML try */
144    for (i = 0; i < Size && dIsspace(p[i]); ++i);
145    if ((Size - i >= 5  && !dStrnAsciiCasecmp(p+i, "<html", 5)) ||
146        (Size - i >= 5  && !dStrnAsciiCasecmp(p+i, "<head", 5)) ||
147        (Size - i >= 6  && !dStrnAsciiCasecmp(p+i, "<title", 6)) ||
148        (Size - i >= 14 && !dStrnAsciiCasecmp(p+i, "<!doctype html", 14)) ||
149        /* this line is workaround for FTP through the Squid proxy */
150        (Size - i >= 17 && !dStrnAsciiCasecmp(p+i, "<!-- HTML listing", 17))) {
151 
152       Type = DT_TEXT_HTML;
153       st = 0;
154    /* Images */
155    } else if (Size >= 4 && !strncmp(p, "GIF8", 4)) {
156       Type = DT_IMAGE_GIF;
157       st = 0;
158    } else if (Size >= 4 && !strncmp(p, "\x89PNG", 4)) {
159       Type = DT_IMAGE_PNG;
160       st = 0;
161    } else if (Size >= 2 && !strncmp(p, "\xff\xd8", 2)) {
162       /* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking
163        * at the character representation should be machine independent. */
164       Type = DT_IMAGE_JPG;
165       st = 0;
166 
167    /* Text */
168    } else {
169       /* Heuristic for "text/plain"
170        * {ASCII, LATIN1, UTF8, KOI8-R, CP-1251}
171        * All in the above set regard [00-31] as control characters.
172        * LATIN1: [7F-9F] unused
173        * CP-1251 {7F,98} unused (two characters).
174        *
175        * We'll use [0-31] as indicators of non-text content.
176        * Better heuristics are welcomed! :-) */
177 
178       non_ascci = non_ascci_text = bin_chars = 0;
179       Size = MIN (Size, 256);
180       for (i = 0; i < Size; i++) {
181          int ch = (uchar_t) p[i];
182          if (ch < 32 && !dIsspace(ch))
183             ++bin_chars;
184          if (ch > 126)
185             ++non_ascci;
186          if (ch > 190)
187             ++non_ascci_text;
188       }
189       if (bin_chars == 0 && (non_ascci - non_ascci_text) <= Size/10) {
190          /* Let's say text: if "rare" chars are <= 10% */
191          Type = DT_TEXT_PLAIN;
192       } else if (Size > 0) {
193          /* a special check for UTF-8 */
194          Size = a_Utf8_end_of_char(p, Size - 1) + 1;
195          if (a_Utf8_test(p, Size) > 0)
196             Type = DT_TEXT_PLAIN;
197       }
198       if (Size >= 256)
199          st = 0;
200    }
201 
202    *PT = MimeTypes[Type].str;
203    return st;
204 }
205 
206 /*
207  * Parse Content-Type string, e.g., "text/html; charset=utf-8".
208  * Content-Type is defined in RFC 2045 section 5.1.
209  */
a_Misc_parse_content_type(const char * type,char ** major,char ** minor,char ** charset)210 void a_Misc_parse_content_type(const char *type, char **major, char **minor,
211                                char **charset)
212 {
213    static const char tspecials_space[] = "()<>@,;:\\\"/[]?= ";
214    const char *str, *s;
215 
216    if (major)
217       *major = NULL;
218    if (minor)
219       *minor = NULL;
220    if (charset)
221       *charset = NULL;
222    if (!(str = type))
223       return;
224 
225    for (s = str; *s && isascii((uchar_t)*s) && !iscntrl((uchar_t)*s) &&
226         !strchr(tspecials_space, *s); s++) ;
227    if (major)
228       *major = dStrndup(str, s - str);
229 
230    if (*s == '/') {
231       for (str = ++s; *s && isascii((uchar_t)*s) && !iscntrl((uchar_t)*s) &&
232            !strchr(tspecials_space, *s); s++) ;
233       if (minor)
234          *minor = dStrndup(str, s - str);
235    }
236    if (charset && *s &&
237        (dStrnAsciiCasecmp(type, "text/", 5) == 0 ||
238         dStrnAsciiCasecmp(type, "application/xhtml+xml", 21) == 0)) {
239       /* "charset" parameter defined for text media type in RFC 2046,
240        * application/xhtml+xml in RFC 3236.
241        *
242        * Note that RFC 3023 lists some main xml media types and provides
243        * the convention of using the "+xml" minor type suffix for other
244        * xml types, so it would be reasonable to check for that suffix if
245        * we have need to care about various xml types someday.
246        */
247       const char terminators[] = " ;\t";
248       const char key[] = "charset";
249 
250       if ((s = dStriAsciiStr(str, key)) &&
251           (s == str || strchr(terminators, s[-1]))) {
252          s += sizeof(key) - 1;
253          for ( ; *s == ' ' || *s == '\t'; ++s);
254          if (*s == '=') {
255             size_t len;
256             for (++s; *s == ' ' || *s == '\t'; ++s);
257             if ((len = strcspn(s, terminators))) {
258                if (*s == '"' && s[len-1] == '"' && len > 1) {
259                  /* quoted string */
260                  s++;
261                  len -= 2;
262                }
263                *charset = dStrndup(s, len);
264             }
265          }
266       }
267    }
268 }
269 
/*
 * Tell whether two Content-Type strings describe the same type.
 *
 * Two NULL/empty strings are equivalent; a NULL/empty string never
 * equals a non-empty one. Otherwise both are parsed and compared
 * case-insensitively on major type, minor type and charset. A missing
 * charset on one side is accepted when the other side says "UTF-8".
 *
 * Return 0 if they are equivalent, and 1 otherwise.
 */
int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
{
   char *major1, *major2, *minor1, *minor2, *charset1, *charset2;
   const int empty1 = (!ct1 || !*ct1);
   const int empty2 = (!ct2 || !*ct2);
   int same_type, differ;

   if (empty1 && empty2)
      return 0;
   if (empty1 || empty2)
      return 1;

   a_Misc_parse_content_type(ct1, &major1, &minor1, &charset1);
   a_Misc_parse_content_type(ct2, &major2, &minor2, &charset2);

   same_type = major1 && major2 && !dStrAsciiCasecmp(major1, major2) &&
               minor1 && minor2 && !dStrAsciiCasecmp(minor1, minor2);

   if (!same_type) {
      differ = 1;
   } else if (charset1 && charset2) {
      /* both given: they have to agree */
      differ = (dStrAsciiCasecmp(charset1, charset2) != 0);
   } else if (charset1 || charset2) {
      /* only one given: it has to be UTF-8 */
      differ = (dStrAsciiCasecmp(charset1 ? charset1 : charset2,
                                 "UTF-8") != 0);
   } else {
      /* none given */
      differ = 0;
   }

   dFree(major1);
   dFree(minor1);
   dFree(charset1);
   dFree(major2);
   dFree(minor2);
   dFree(charset2);

   return differ;
}
303 
304 /*
305  * Check the server-supplied 'Content-Type' against our detected type.
306  * (some servers seem to default to "text/plain").
307  *
308  * Return value:
309  *  0,  if they match
310  *  -1, if a mismatch is detected
311  *
312  * There are many MIME types Dillo doesn't know, they're handled
313  * as "application/octet-stream" (as the SPEC says).
314  *
315  * A mismatch happens when receiving a binary stream as
316  * "text/plain" or "text/html", or an image that's not an image of its kind.
317  *
318  * Note: this is a basic security procedure.
319  *
320  */
a_Misc_content_type_check(const char * EntryType,const char * DetectedType)321 int a_Misc_content_type_check(const char *EntryType, const char *DetectedType)
322 {
323    int i;
324    int st = -1;
325 
326    _MSG("Type check:  [Srv: %s  Det: %s]\n", EntryType, DetectedType);
327 
328    if (!EntryType)
329       return 0; /* there's no mismatch without server type */
330 
331    for (i = 1; MimeTypes[i].str; ++i)
332       if (dStrnAsciiCasecmp(EntryType, MimeTypes[i].str, MimeTypes[i].len) ==0)
333          break;
334 
335    if (!MimeTypes[i].str) {
336       /* type not found, no mismatch */
337       st = 0;
338    } else if (dStrnAsciiCasecmp(EntryType, "image/", 6) == 0 &&
339              !dStrnAsciiCasecmp(DetectedType, MimeTypes[i].str,
340                                 MimeTypes[i].len)){
341       /* An image, and there's an exact match */
342       st = 0;
343    } else if (dStrnAsciiCasecmp(EntryType, "text/", 5) ||
344               dStrnAsciiCasecmp(DetectedType, "application/", 12)) {
345       /* Not an application sent as text */
346       st = 0;
347    } else if (dStrnAsciiCasecmp(EntryType, "application/xhtml+xml", 21) &&
348               dStrnAsciiCasecmp(DetectedType, "text/html", 9)) {
349       /* XML version of HTML */
350       st = 0;
351    }
352    _MSG("Type check: %s\n", st == 0 ? "MATCH" : "MISMATCH");
353 
354    return st;
355 }
356 
/*
 * Parse a geometry string of the form "<w>x<h>" optionally followed by
 * two more signed numbers for the position, e.g. "640x480+10+20".
 * The separator is the first 'x' in 'str', or the first 'X' if there is
 * no lowercase one.
 *
 * '*w' and '*h' are stored only when both numbers were read; '*x' and
 * '*y' are stored only when, in addition, both position numbers were
 * read. Outputs that are not stored keep the caller's values.
 *
 * Return value: 1 if width and height were parsed, 0 otherwise.
 */
int a_Misc_parse_geometry(char *str, int *x, int *y, int *w, int *h)
{
   char *sep, *end_a, *end_b, *rest;
   int first, second;
   int found = 0;

   sep = strchr(str, 'x');
   if (!sep)
      sep = strchr(str, 'X');

   if (sep) {
      first = strtol(str, &end_a, 10);
      ++sep;
      second = strtol(sep, &end_b, 10);

      /* both conversions must have consumed at least one character */
      if (end_a != str && end_b != sep) {
         *w = first;
         *h = second;
         found = 1;

         /* the position, if any, follows right after the height */
         rest = end_b;
         first = strtol(rest, &end_a, 10);
         second = strtol(end_a, &end_b, 10);
         if (end_a != rest && end_b != end_a) {
            *x = first;
            *y = second;
         }
      }
   }
   _MSG("geom: w,h,x,y = (%d,%d,%d,%d)\n", *w, *h, *x, *y);
   return found;
}
386 
/*
 * Parse dillorc's search_url string ("[<label> ]<url>").
 *
 * The last space in 'source', if any, separates the label from the URL.
 * Without a space the label is made from the text between the "//" and
 * the next '/' of the URL. A URL is accepted when it contains a '/',
 * at least one more character, and a further '/' after that.
 *
 * '*label' always ends up pointing to a static buffer of 32 bytes (the
 * label is truncated to 31 characters), so its content changes with the
 * next call. '*urlstr' is set, to a position inside 'source', only on
 * success.
 *
 * Return value: -1 on error, 0 on success (and label and urlstr pointers)
 */
int a_Misc_parse_search_url(char *source, char **label, char **urlstr)
{
   static char buf[32];
   char *sep, *slash, *slash2 = NULL;
   size_t n;
   int ok;

   sep = strrchr(source, ' ');
   if (sep) {
      /* label and url pair: keep the label, go on with the url */
      n = MIN(sep - source, 31);
      memcpy(buf, source, n);
      buf[n] = 0;
      source = sep + 1;
   }

   slash = strchr(source, '/');
   ok = slash && slash[1] && (slash2 = strchr(slash + 2, '/'));

   if (ok) {
      if (!sep) {
         /* url only, make a custom label */
         n = MIN(slash2 - slash - 2, 31);
         memcpy(buf, slash + 2, n);
         buf[n] = 0;
      }
      *urlstr = source;
   }

   *label = buf;
   if (!ok)
      MSG("Invalid search_url: \"%s\"\n", source);
   return ok ? 0 : -1;
}
420 
/*
 * Encodes a NUL-terminated string using base64 encoding, with '='
 * padding so that the output length is a multiple of four.
 *
 * The input is read as unsigned bytes. Indexing the alphabet with plain
 * 'char' values would give negative indexes for bytes >= 0x80 on
 * platforms where 'char' is signed.
 *
 * Return value: new string obtained from dMalloc() (an empty string when
 * the input is empty), or NULL if 'in' is NULL.
 */
char *a_Misc_encode_base64(const char *in)
{
   static const char *const base64_hex = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                         "abcdefghijklmnopqrstuvwxyz"
                                         "0123456789+/";
   const unsigned char *src = (const unsigned char *)in;
   char *out = NULL;
   size_t len, i = 0;

   if (in == NULL) return NULL;
   len = strlen(in);

   /* four output characters per (possibly partial) group of three input
    * bytes, plus the terminating NUL */
   out = (char *)dMalloc((len + 2) / 3 * 4 + 1);

   /* complete groups of three bytes */
   for (; len >= 3; len -= 3, src += 3) {
      out[i++] = base64_hex[src[0] >> 2];
      out[i++] = base64_hex[((src[0] << 4) & 0x30) | (src[1] >> 4)];
      out[i++] = base64_hex[((src[1] << 2) & 0x3c) | (src[2] >> 6)];
      out[i++] = base64_hex[src[2] & 0x3f];
   }

   /* one or two bytes left: encode them and pad with '=' */
   if (len > 0) {
      unsigned int fragment = (src[0] << 4) & 0x30;

      out[i++] = base64_hex[src[0] >> 2];
      if (len > 1)
         fragment |= src[1] >> 4;
      out[i++] = base64_hex[fragment];
      out[i++] = (len < 2) ? '=' : base64_hex[(src[1] << 2) & 0x3c];
      out[i++] = '=';
   }
   out[i] = '\0';
   return out;
}
458 
459 /*
460  * Load a local file into a dStr.
461  * Return value: dStr on success, NULL on error.
462  * TODO: a filesize threshold may be implemented.
463  */
a_Misc_file2dstr(const char * filename)464 Dstr *a_Misc_file2dstr(const char *filename)
465 {
466    FILE *F_in;
467    int n;
468    char buf[4096];
469    Dstr *dstr = NULL;
470 
471    if ((F_in = fopen(filename, "r"))) {
472       dstr = dStr_sized_new(4096);
473       while ((n = fread (buf, 1, 4096, F_in)) > 0) {
474          dStr_append_l(dstr, buf, n);
475       }
476       fclose(F_in);
477    }
478    return dstr;
479 }
480