1 /*
2 * File: misc.c
3 *
4 * Copyright (C) 2000-2007 Jorge Arellano Cid <jcid@dillo.org>,
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 3 of the License, or
9 * (at your option) any later version.
10 */
11
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <ctype.h>
16 #include <assert.h>
17
18 #include "utf8.hh"
19 #include "msg.h"
20 #include "misc.h"
21
22 /*
23 * Escape characters as %XX sequences.
24 * Return value: New string.
25 */
a_Misc_escape_chars(const char * str,const char * esc_set)26 char *a_Misc_escape_chars(const char *str, const char *esc_set)
27 {
28 static const char *const hex = "0123456789ABCDEF";
29 char *p = NULL;
30 Dstr *dstr;
31 int i;
32
33 dstr = dStr_sized_new(64);
34 for (i = 0; str[i]; ++i) {
35 if (str[i] <= 0x1F || str[i] == 0x7F || strchr(esc_set, str[i])) {
36 dStr_append_c(dstr, '%');
37 dStr_append_c(dstr, hex[(str[i] >> 4) & 15]);
38 dStr_append_c(dstr, hex[str[i] & 15]);
39 } else {
40 dStr_append_c(dstr, str[i]);
41 }
42 }
43 p = dstr->str;
44 dStr_free(dstr, FALSE);
45
46 return p;
47 }
48
49 #define TAB_SIZE 8
50 /*
51 * Takes a string and converts any tabs to spaces.
52 */
53 int
a_Misc_expand_tabs(char ** start,char * end,char * buf,int buflen)54 a_Misc_expand_tabs(char **start, char *end, char *buf, int buflen)
55 {
56 int j, pos = 0, written = 0, old_pos, char_len;
57 uint_t code;
58 static const int combining_char_space = 32;
59
60 while (*start < end && written < buflen - TAB_SIZE - combining_char_space) {
61 code = a_Utf8_decode(*start, end, &char_len);
62
63 if (code == '\t') {
64 /* Fill with whitespaces until the next tab. */
65 old_pos = pos;
66 pos += TAB_SIZE - (pos % TAB_SIZE);
67 for (j = old_pos; j < pos; j++)
68 buf[written++] = ' ';
69 } else {
70 assert(char_len <= 4);
71 for (j = 0; j < char_len; j++)
72 buf[written++] = (*start)[j];
73 pos++;
74 }
75
76 *start += char_len;
77 }
78
79 /* If following chars are combining chars (e.g. accents) add them to the
80 * buffer. We have reserved combining_char_space bytes for this.
81 * If there should be more combining chars, we split nevertheless.
82 */
83 while (*start < end && written < buflen - 4) {
84 code = a_Utf8_decode(*start, end, &char_len);
85
86 if (! a_Utf8_combining_char(code))
87 break;
88
89 assert(char_len <= 4);
90 for (j = 0; j < char_len; j++)
91 buf[written++] = (*start)[j];
92
93 *start += char_len;
94 }
95
96 return written;
97 }
98
/* TODO: could use dStr ADT! */
/* A MIME type name together with its length in bytes (NUL not counted). */
typedef struct {
   const char *str;
   int len;
} ContentType_t;

/*
 * Build a table entry from a string literal. The length is computed from
 * the literal itself, so the name and its length cannot get out of sync.
 * Only valid for string literals (sizeof of a pointer would be wrong).
 */
#define MIME_TYPE_ENTRY(lit) { (lit), (int) (sizeof(lit) - 1) }

/*
 * Known MIME types, terminated by a { NULL, 0 } sentinel.
 * The order must stay in step with DetectedContentType, whose values are
 * used as indices into this table.
 */
static const ContentType_t MimeTypes[] = {
   MIME_TYPE_ENTRY("application/octet-stream"),
   MIME_TYPE_ENTRY("application/xhtml+xml"),
   MIME_TYPE_ENTRY("text/html"),
   MIME_TYPE_ENTRY("text/plain"),
   MIME_TYPE_ENTRY("image/gif"),
   MIME_TYPE_ENTRY("image/png"),
   MIME_TYPE_ENTRY("image/jpeg"),
   { NULL, 0 }
};

/*
 * Index of each detectable type inside MimeTypes.
 * DT_PLACEHOLDER occupies the slot of "application/xhtml+xml", keeping the
 * following values aligned with the table.
 */
typedef enum {
   DT_OCTET_STREAM = 0,
   DT_PLACEHOLDER,
   DT_TEXT_HTML,
   DT_TEXT_PLAIN,
   DT_IMAGE_GIF,
   DT_IMAGE_PNG,
   DT_IMAGE_JPG,
} DetectedContentType;
125
126 /*
127 * Detects 'Content-Type' from a data stream sample.
128 *
129 * It uses the magic(5) logic from file(1). Currently, it
130 * only checks the few mime types that Dillo supports.
131 *
132 * 'Data' is a pointer to the first bytes of the raw data.
133 *
134 * Return value: (0 on success, 1 on doubt, 2 on lack of data).
135 */
a_Misc_get_content_type_from_data(void * Data,size_t Size,const char ** PT)136 int a_Misc_get_content_type_from_data(void *Data, size_t Size, const char **PT)
137 {
138 size_t i, non_ascci, non_ascci_text, bin_chars;
139 char *p = Data;
140 int st = 1; /* default to "doubt' */
141 DetectedContentType Type = DT_OCTET_STREAM; /* default to binary */
142
143 /* HTML try */
144 for (i = 0; i < Size && dIsspace(p[i]); ++i);
145 if ((Size - i >= 5 && !dStrnAsciiCasecmp(p+i, "<html", 5)) ||
146 (Size - i >= 5 && !dStrnAsciiCasecmp(p+i, "<head", 5)) ||
147 (Size - i >= 6 && !dStrnAsciiCasecmp(p+i, "<title", 6)) ||
148 (Size - i >= 14 && !dStrnAsciiCasecmp(p+i, "<!doctype html", 14)) ||
149 /* this line is workaround for FTP through the Squid proxy */
150 (Size - i >= 17 && !dStrnAsciiCasecmp(p+i, "<!-- HTML listing", 17))) {
151
152 Type = DT_TEXT_HTML;
153 st = 0;
154 /* Images */
155 } else if (Size >= 4 && !strncmp(p, "GIF8", 4)) {
156 Type = DT_IMAGE_GIF;
157 st = 0;
158 } else if (Size >= 4 && !strncmp(p, "\x89PNG", 4)) {
159 Type = DT_IMAGE_PNG;
160 st = 0;
161 } else if (Size >= 2 && !strncmp(p, "\xff\xd8", 2)) {
162 /* JPEG has the first 2 bytes set to 0xffd8 in BigEndian - looking
163 * at the character representation should be machine independent. */
164 Type = DT_IMAGE_JPG;
165 st = 0;
166
167 /* Text */
168 } else {
169 /* Heuristic for "text/plain"
170 * {ASCII, LATIN1, UTF8, KOI8-R, CP-1251}
171 * All in the above set regard [00-31] as control characters.
172 * LATIN1: [7F-9F] unused
173 * CP-1251 {7F,98} unused (two characters).
174 *
175 * We'll use [0-31] as indicators of non-text content.
176 * Better heuristics are welcomed! :-) */
177
178 non_ascci = non_ascci_text = bin_chars = 0;
179 Size = MIN (Size, 256);
180 for (i = 0; i < Size; i++) {
181 int ch = (uchar_t) p[i];
182 if (ch < 32 && !dIsspace(ch))
183 ++bin_chars;
184 if (ch > 126)
185 ++non_ascci;
186 if (ch > 190)
187 ++non_ascci_text;
188 }
189 if (bin_chars == 0 && (non_ascci - non_ascci_text) <= Size/10) {
190 /* Let's say text: if "rare" chars are <= 10% */
191 Type = DT_TEXT_PLAIN;
192 } else if (Size > 0) {
193 /* a special check for UTF-8 */
194 Size = a_Utf8_end_of_char(p, Size - 1) + 1;
195 if (a_Utf8_test(p, Size) > 0)
196 Type = DT_TEXT_PLAIN;
197 }
198 if (Size >= 256)
199 st = 0;
200 }
201
202 *PT = MimeTypes[Type].str;
203 return st;
204 }
205
206 /*
207 * Parse Content-Type string, e.g., "text/html; charset=utf-8".
208 * Content-Type is defined in RFC 2045 section 5.1.
209 */
a_Misc_parse_content_type(const char * type,char ** major,char ** minor,char ** charset)210 void a_Misc_parse_content_type(const char *type, char **major, char **minor,
211 char **charset)
212 {
213 static const char tspecials_space[] = "()<>@,;:\\\"/[]?= ";
214 const char *str, *s;
215
216 if (major)
217 *major = NULL;
218 if (minor)
219 *minor = NULL;
220 if (charset)
221 *charset = NULL;
222 if (!(str = type))
223 return;
224
225 for (s = str; *s && isascii((uchar_t)*s) && !iscntrl((uchar_t)*s) &&
226 !strchr(tspecials_space, *s); s++) ;
227 if (major)
228 *major = dStrndup(str, s - str);
229
230 if (*s == '/') {
231 for (str = ++s; *s && isascii((uchar_t)*s) && !iscntrl((uchar_t)*s) &&
232 !strchr(tspecials_space, *s); s++) ;
233 if (minor)
234 *minor = dStrndup(str, s - str);
235 }
236 if (charset && *s &&
237 (dStrnAsciiCasecmp(type, "text/", 5) == 0 ||
238 dStrnAsciiCasecmp(type, "application/xhtml+xml", 21) == 0)) {
239 /* "charset" parameter defined for text media type in RFC 2046,
240 * application/xhtml+xml in RFC 3236.
241 *
242 * Note that RFC 3023 lists some main xml media types and provides
243 * the convention of using the "+xml" minor type suffix for other
244 * xml types, so it would be reasonable to check for that suffix if
245 * we have need to care about various xml types someday.
246 */
247 const char terminators[] = " ;\t";
248 const char key[] = "charset";
249
250 if ((s = dStriAsciiStr(str, key)) &&
251 (s == str || strchr(terminators, s[-1]))) {
252 s += sizeof(key) - 1;
253 for ( ; *s == ' ' || *s == '\t'; ++s);
254 if (*s == '=') {
255 size_t len;
256 for (++s; *s == ' ' || *s == '\t'; ++s);
257 if ((len = strcspn(s, terminators))) {
258 if (*s == '"' && s[len-1] == '"' && len > 1) {
259 /* quoted string */
260 s++;
261 len -= 2;
262 }
263 *charset = dStrndup(s, len);
264 }
265 }
266 }
267 }
268 }
269
/*
 * Compare two Content-Type strings.
 *
 * Two NULL/empty strings are equivalent; a NULL/empty string never equals a
 * non-empty one. Otherwise both are parsed and they are equivalent when
 * major and minor types match (ASCII case-insensitively) and the charsets
 * agree. A missing charset is accepted against an explicit "UTF-8".
 *
 * Return 0 if they are equivalent, and 1 otherwise.
 */
int a_Misc_content_type_cmp(const char *ct1, const char *ct2)
{
   char *maj_a, *maj_b, *min_a, *min_b, *cs_a, *cs_b;
   const int empty_a = (!ct1 || !*ct1);
   const int empty_b = (!ct2 || !*ct2);
   int differ = 1;

   if (empty_a || empty_b)
      return (empty_a && empty_b) ? 0 : 1;

   a_Misc_parse_content_type(ct1, &maj_a, &min_a, &cs_a);
   a_Misc_parse_content_type(ct2, &maj_b, &min_b, &cs_b);

   if (maj_a && maj_b && !dStrAsciiCasecmp(maj_a, maj_b) &&
       min_a && min_b && !dStrAsciiCasecmp(min_a, min_b)) {
      if (cs_a && cs_b) {
         /* both given: they must be the same */
         differ = (dStrAsciiCasecmp(cs_a, cs_b) != 0);
      } else if (cs_a || cs_b) {
         /* only one given: it has to be UTF-8 */
         differ = (dStrAsciiCasecmp(cs_a ? cs_a : cs_b, "UTF-8") != 0);
      } else {
         /* none given */
         differ = 0;
      }
   }

   dFree(maj_a);
   dFree(maj_b);
   dFree(min_a);
   dFree(min_b);
   dFree(cs_a);
   dFree(cs_b);

   return differ;
}
303
304 /*
305 * Check the server-supplied 'Content-Type' against our detected type.
306 * (some servers seem to default to "text/plain").
307 *
308 * Return value:
309 * 0, if they match
310 * -1, if a mismatch is detected
311 *
312 * There are many MIME types Dillo doesn't know, they're handled
313 * as "application/octet-stream" (as the SPEC says).
314 *
315 * A mismatch happens when receiving a binary stream as
316 * "text/plain" or "text/html", or an image that's not an image of its kind.
317 *
318 * Note: this is a basic security procedure.
319 *
320 */
a_Misc_content_type_check(const char * EntryType,const char * DetectedType)321 int a_Misc_content_type_check(const char *EntryType, const char *DetectedType)
322 {
323 int i;
324 int st = -1;
325
326 _MSG("Type check: [Srv: %s Det: %s]\n", EntryType, DetectedType);
327
328 if (!EntryType)
329 return 0; /* there's no mismatch without server type */
330
331 for (i = 1; MimeTypes[i].str; ++i)
332 if (dStrnAsciiCasecmp(EntryType, MimeTypes[i].str, MimeTypes[i].len) ==0)
333 break;
334
335 if (!MimeTypes[i].str) {
336 /* type not found, no mismatch */
337 st = 0;
338 } else if (dStrnAsciiCasecmp(EntryType, "image/", 6) == 0 &&
339 !dStrnAsciiCasecmp(DetectedType, MimeTypes[i].str,
340 MimeTypes[i].len)){
341 /* An image, and there's an exact match */
342 st = 0;
343 } else if (dStrnAsciiCasecmp(EntryType, "text/", 5) ||
344 dStrnAsciiCasecmp(DetectedType, "application/", 12)) {
345 /* Not an application sent as text */
346 st = 0;
347 } else if (dStrnAsciiCasecmp(EntryType, "application/xhtml+xml", 21) &&
348 dStrnAsciiCasecmp(DetectedType, "text/html", 9)) {
349 /* XML version of HTML */
350 st = 0;
351 }
352 _MSG("Type check: %s\n", st == 0 ? "MATCH" : "MISMATCH");
353
354 return st;
355 }
356
/*
 * Parse a geometry string of the form "<w>x<h>[<x><y>]", where the
 * separator may be 'x' or 'X' and the optional offsets are two further
 * numbers as understood by strtol() (e.g. "+10-20").
 *
 * '*w' and '*h' are stored only when both numbers are present; '*x' and
 * '*y' only when, in addition, both offsets are present. Outputs that are
 * not stored keep their previous value.
 *
 * Return value: 1 if width and height were parsed, 0 otherwise.
 */
int a_Misc_parse_geometry(char *str, int *x, int *y, int *w, int *h)
{
   char *sep, *end_a, *end_b;
   int val_a, val_b;
   int found = 0;

   /* A lowercase separator is preferred over an uppercase one. */
   sep = strchr(str, 'x');
   if (!sep)
      sep = strchr(str, 'X');

   if (sep) {
      val_a = strtol(str, &end_a, 10);
      ++sep;
      val_b = strtol(sep, &end_b, 10);

      if (end_a != str && end_b != sep) {
         *w = val_a;
         *h = val_b;
         found = 1;

         /* the offsets, if any, follow the height */
         sep = end_b;
         val_a = strtol(sep, &end_a, 10);
         val_b = strtol(end_a, &end_b, 10);
         if (end_a != sep && end_b != end_a) {
            *x = val_a;
            *y = val_b;
         }
      }
   }
   _MSG("geom: w,h,x,y = (%d,%d,%d,%d)\n", *w, *h, *x, *y);
   return found;
}
386
/*
 * Parse dillorc's search_url string ("[<label> ]<url>")
 *
 * With a label, everything before the last space is the label and the rest
 * is the URL. Without one, the label is made from the text between the
 * second character after the first '/' and the next '/' (the host part of
 * "scheme://host/..."). Labels are truncated to 31 characters.
 *
 * '*label' always ends up pointing at an internal static buffer, so it is
 * overwritten by the next call. '*urlstr' points into 'source' and is only
 * set on success.
 *
 * Return value: -1 on error, 0 on success (and label and urlstr pointers)
 */
int a_Misc_parse_search_url(char *source, char **label, char **urlstr)
{
   static char buf[32];
   char *space, *slash, *next_slash;
   size_t n;
   int ret = -1;

   space = strrchr(source, ' ');
   if (space) {
      /* label and url pair: the bytes before the space hold no NUL, so a
       * plain bounded copy is enough */
      n = (size_t) (space - source);
      if (n > sizeof(buf) - 1)
         n = sizeof(buf) - 1;
      memcpy(buf, source, n);
      buf[n] = 0;
      source = space + 1;
   }

   /* The URL needs a '/', a character after it, and a later '/'. */
   slash = strchr(source, '/');
   if (slash && slash[1] && (next_slash = strchr(slash + 2, '/'))) {
      if (!space) {
         /* url only, make a custom label */
         n = (size_t) (next_slash - slash - 2);
         if (n > sizeof(buf) - 1)
            n = sizeof(buf) - 1;
         memcpy(buf, slash + 2, n);
         buf[n] = 0;
      }
      *urlstr = source;
      ret = 0;
   }

   *label = buf;
   if (ret == -1)
      MSG("Invalid search_url: \"%s\"\n", source);
   return ret;
}
420
/*
 * Encodes string using base64 encoding.
 *
 * 'in' is a NUL-terminated string; its bytes up to the terminator are
 * encoded with the standard alphabet and '=' padding.
 *
 * The input is read as unsigned bytes. Shifting a plain 'char' would give
 * negative alphabet indices for bytes >= 0x80 wherever 'char' is signed.
 *
 * Return value: new NUL-terminated string (empty for an empty input),
 * or NULL if 'in' is NULL.
 */
char *a_Misc_encode_base64(const char *in)
{
   static const char *const base64_hex = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                         "abcdefghijklmnopqrstuvwxyz"
                                         "0123456789+/";
   const unsigned char *src = (const unsigned char *) in;
   char *out = NULL;
   size_t len, i = 0;

   if (in == NULL) return NULL;
   len = strlen(in);

   /* 4 output characters per (possibly partial) group of 3, plus NUL */
   out = (char *)dMalloc((len + 2) / 3 * 4 + 1);
   if (out == NULL) return NULL;

   /* complete 3-byte groups */
   for (; len >= 3; len -= 3) {
      out[i++] = base64_hex[src[0] >> 2];
      out[i++] = base64_hex[((src[0] << 4) & 0x30) | (src[1] >> 4)];
      out[i++] = base64_hex[((src[1] << 2) & 0x3c) | (src[2] >> 6)];
      out[i++] = base64_hex[src[2] & 0x3f];
      src += 3;
   }

   /* trailing group of 1 or 2 bytes, padded with '=' */
   if (len > 0) {
      unsigned int fragment;

      out[i++] = base64_hex[src[0] >> 2];
      fragment = (src[0] << 4) & 0x30;
      if (len > 1) fragment |= src[1] >> 4;
      out[i++] = base64_hex[fragment];
      out[i++] = (len < 2) ? '=' : base64_hex[(src[1] << 2) & 0x3c];
      out[i++] = '=';
   }
   out[i] = '\0';
   return out;
}
458
459 /*
460 * Load a local file into a dStr.
461 * Return value: dStr on success, NULL on error.
462 * TODO: a filesize threshold may be implemented.
463 */
a_Misc_file2dstr(const char * filename)464 Dstr *a_Misc_file2dstr(const char *filename)
465 {
466 FILE *F_in;
467 int n;
468 char buf[4096];
469 Dstr *dstr = NULL;
470
471 if ((F_in = fopen(filename, "r"))) {
472 dstr = dStr_sized_new(4096);
473 while ((n = fread (buf, 1, 4096, F_in)) > 0) {
474 dStr_append_l(dstr, buf, n);
475 }
476 fclose(F_in);
477 }
478 return dstr;
479 }
480