1 /*
2 * This program is free software; you can redistribute it and/or modify it
3 * under the terms of the GNU Lesser General Public License as published by
4 * the Free Software Foundation.
5 *
6 * This program is distributed in the hope that it will be useful, but
7 * WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
8 * or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
9 * for more details.
10 *
11 * You should have received a copy of the GNU Lesser General Public License
12 * along with this program; if not, see <http://www.gnu.org/licenses/>.
13 *
14 *
15 * Authors:
16 * Dan Winship <danw@ximian.com>
17 *
18 * Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
19 *
20 */
21
22 #include "evolution-config.h"
23
24 #include <ctype.h>
25 #include <stdio.h>
26 #include <string.h>
27
28 #include "e-html-utils.h"
29
30 static gchar *
check_size(gchar ** buffer,gint * buffer_size,gchar * out,gint len)31 check_size (gchar **buffer,
32 gint *buffer_size,
33 gchar *out,
34 gint len)
35 {
36 if (out + len + 1> *buffer + *buffer_size) {
37 gint index = out - *buffer;
38
39 *buffer_size = MAX (index + len + 1, *buffer_size * 2);
40 *buffer = g_realloc (*buffer, *buffer_size);
41 out = *buffer + index;
42 }
43 return out;
44 }
45
46 /* auto-urlification hints: the goal is not to be strictly RFC-compliant,
47 * but rather to accurately distinguish urls/addresses from non-urls/
48 * addresses in real-world email.
49 *
50 * 1 = non-email-address chars: ()<>@,;:\"[]`'{}|
51 * 2 = trailing url garbage: ,.!?;:>)]}`'-_
52 * 4 = allowed dns chars
53 * 8 = non-url chars: "|
54 */
55 static gint special_chars[] = {
56 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, /* nul - 0x0f */
57 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, /* 0x10 - 0x1f */
58 9, 2, 9, 0, 0, 0, 0, 3, 1, 3, 0, 0, 3, 6, 6, 0, /* sp - / */
59 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 1, 0, 3, 2, /* 0 - ? */
60 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* @ - O */
61 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 3, 0, 2, /* P - _ */
62 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, /* ` - o */
63 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 9, 3, 0, 3 /* p - del */
64 };
65
66 #define is_addr_char(c) (c < 128 && !(special_chars[c] & 1))
67 #define is_url_char(c) (c < 128 && !(special_chars[c] & 8))
68 #define is_trailing_garbage(c) (c > 127 || (special_chars[c] & 2))
69 #define is_domain_name_char(c) (c < 128 && (special_chars[c] & 4))
70
71 /* (http|https|ftp|nntp)://[^ "|/]+\.([^ "|]*[^ ,.!?;:>)\]}`'"|_-])+ */
72 /* www\.[A-Za-z0-9.-]+(/([^ "|]*[^ ,.!?;:>)\]}`'"|_-])+) */
73
74 static gchar *
url_extract(const guchar ** text,gboolean full_url,gboolean use_whole_text)75 url_extract (const guchar **text,
76 gboolean full_url,
77 gboolean use_whole_text)
78 {
79 const guchar *end = *text, *p;
80 gchar *out;
81
82 if (use_whole_text) {
83 end = (*text) + strlen ((const gchar *) (*text));
84 } else {
85 while (*end && is_url_char (*end))
86 end++;
87 }
88
89 /* Back up if we probably went too far. */
90 while (end > *text && is_trailing_garbage (*(end - 1)))
91 end--;
92
93 if (full_url) {
94 /* Make sure this really looks like a URL. */
95 p = memchr (*text, ':', end - *text);
96 if (!p || end - p < 4)
97 return NULL;
98 } else {
99 /* Make sure this really looks like a hostname. */
100 p = memchr (*text, '.', end - *text);
101 if (!p || p >= end - 2)
102 return NULL;
103 p = memchr (p + 2, '.', end - (p + 2));
104 if (!p || p >= end - 2)
105 return NULL;
106 }
107
108 out = g_strndup ((gchar *) * text, end - *text);
109 *text = end;
110 return out;
111 }
112
113 static gchar *
email_address_extract(const guchar ** cur,gchar ** out,const guchar * linestart)114 email_address_extract (const guchar **cur,
115 gchar **out,
116 const guchar *linestart)
117 {
118 const guchar *start, *end, *dot;
119 gchar *addr;
120
121 /* *cur points to the '@'. Look backward for a valid local-part */
122 for (start = *cur; start - 1 >= linestart && is_addr_char (*(start - 1)); start--)
123 ;
124 if (start == *cur)
125 return NULL;
126 if (start > linestart + 2 &&
127 start[-1] == ':' && start[0] == '/' && start[1] == '/')
128 return NULL;
129
130 /* Now look forward for a valid domain part */
131 for (end = *cur + 1, dot = NULL; is_domain_name_char (*end); end++) {
132 if (*end == '.' && !dot)
133 dot = end;
134 }
135 if (!dot)
136 return NULL;
137
138 /* Remove trailing garbage */
139 while (is_trailing_garbage (*(end - 1)))
140 end--;
141 if (dot > end)
142 return NULL;
143
144 addr = g_strndup ((gchar *) start, end - start);
145 *out -= *cur - start;
146 *cur = end;
147
148 return addr;
149 }
150
151 static gboolean
is_citation(const guchar * c,gboolean saw_citation)152 is_citation (const guchar *c,
153 gboolean saw_citation)
154 {
155 const guchar *p;
156
157 if (*c != '>')
158 return FALSE;
159
160 /* A line that starts with a ">" is a citation, unless it's
161 * just mbox From-mangling...
162 */
163 if (strncmp ((const gchar *) c, ">From ", 6) != 0)
164 return TRUE;
165
166 /* If the previous line was a citation, then say this
167 * one is too.
168 */
169 if (saw_citation)
170 return TRUE;
171
172 /* Same if the next line is */
173 p = (const guchar *) strchr ((const gchar *) c, '\n');
174 if (p && *++p == '>')
175 return TRUE;
176
177 /* Otherwise, it was just an isolated ">From" line. */
178 return FALSE;
179 }
180
181 /**
182 * e_text_to_html_full:
183 * @input: a NUL-terminated input buffer
184 * @flags: some combination of the E_TEXT_TO_HTML_* flags defined
185 * in e-html-utils.h
186 * @color: color for citation highlighting
187 *
188 * This takes a buffer of text as input and produces a buffer of
189 * "equivalent" HTML, subject to certain transformation rules.
190 *
191 * The set of possible flags is:
192 *
193 * - E_TEXT_TO_HTML_PRE: wrap the output HTML in <PRE> and
194 * </PRE> Should only be used if @input is the entire
195 * buffer to be converted. If e_text_to_html is being called with
196 * small pieces of data, you should wrap the entire result in
197 * <PRE> yourself.
198 *
199 * - E_TEXT_TO_HTML_CONVERT_NL: convert "\n" to "<BR>n" on
200 * output. (Should not be used with E_TEXT_TO_HTML_PRE, since
201 * that would result in double-newlines.)
202 *
203 * - E_TEXT_TO_HTML_CONVERT_SPACES: convert a block of N spaces
204 * into N-1 non-breaking spaces and one normal space. A space
205 * at the start of the buffer is always converted to a
206 * non-breaking space, regardless of the following character,
207 * which probably means you don't want to use this flag on
208 * pieces of data that aren't delimited by at least line breaks.
209 *
210 * If E_TEXT_TO_HTML_CONVERT_NL and E_TEXT_TO_HTML_CONVERT_SPACES
211 * are both defined, then TABs will also be converted to spaces.
212 *
213 * - E_TEXT_TO_HTML_CONVERT_ALL_SPACES: similar to E_TEXT_TO_HTML_CONVERT_SPACES,
214 * but converts all spaces to non-breaking spaces.
215 *
216 * - E_TEXT_TO_HTML_CONVERT_URLS: wrap <a href="..."> </a>
217 * around strings that look like URLs.
218 *
219 * - E_TEXT_TO_HTML_CONVERT_ADDRESSES: wrap <a href="mailto:...">
220 * </a> around strings that look like mail addresses.
221 *
222 * - E_TEXT_TO_HTML_MARK_CITATION: wrap <font color="...">
223 * </font> around citations (lines beginning with "> ", etc).
224 *
225 * - E_TEXT_TO_HTML_ESCAPE_8BIT: flatten everything to US-ASCII
226 *
227 * - E_TEXT_TO_HTML_CITE: quote the text with "> " at the start of each
228 * line.
229 *
230 * - E_TEXT_TO_HTML_HIDE_URL_SCHEME: hides scheme part of the URL in
231 * the display part of the generated text (thus, instead of "http://www.example.com",
232 * user will only see "www.example.com")
233 *
234 * - E_TEXT_TO_HTML_URL_IS_WHOLE_TEXT: set when the whole @input text
235 * represents a URL; any spaces are removed in the href part.
236 *
237 * Returns: a newly-allocated string containing HTML
238 **/
239 gchar *
e_text_to_html_full(const gchar * input,guint flags,guint32 color)240 e_text_to_html_full (const gchar *input,
241 guint flags,
242 guint32 color)
243 {
244 const guchar *cur, *next, *linestart;
245 gchar *buffer = NULL;
246 gchar *out = NULL;
247 gint buffer_size = 0, col;
248 gboolean colored = FALSE, saw_citation = FALSE;
249
250 /* Allocate a translation buffer. */
251 buffer_size = strlen (input) * 2 + 5;
252 buffer = g_malloc (buffer_size);
253
254 out = buffer;
255 if (flags & E_TEXT_TO_HTML_PRE)
256 out += sprintf (out, "<PRE>");
257
258 col = 0;
259
260 for (cur = linestart = (const guchar *) input; cur && *cur; cur = next) {
261 gunichar u;
262
263 if (flags & E_TEXT_TO_HTML_MARK_CITATION && col == 0) {
264 saw_citation = is_citation (cur, saw_citation);
265 if (saw_citation) {
266 if (!colored) {
267 gchar font[25];
268
269 g_snprintf (font, 25, "<FONT COLOR=\"#%06x\">", color);
270
271 out = check_size (&buffer, &buffer_size, out, 25);
272 out += sprintf (out, "%s", font);
273 colored = TRUE;
274 }
275 } else if (colored) {
276 const gchar *no_font = "</FONT>";
277
278 out = check_size (&buffer, &buffer_size, out, 9);
279 out += sprintf (out, "%s", no_font);
280 colored = FALSE;
281 }
282
283 /* Display mbox-mangled ">From" as "From" */
284 if (*cur == '>' && !saw_citation)
285 cur++;
286 } else if (flags & E_TEXT_TO_HTML_CITE && col == 0) {
287 out = check_size (&buffer, &buffer_size, out, 5);
288 out += sprintf (out, "> ");
289 }
290
291 u = g_utf8_get_char ((gchar *) cur);
292 if (g_unichar_isalpha (u) &&
293 (flags & E_TEXT_TO_HTML_CONVERT_URLS)) {
294 gchar *tmpurl = NULL, *refurl = NULL, *dispurl = NULL;
295
296 if (!g_ascii_strncasecmp ((gchar *) cur, "http://", 7) ||
297 !g_ascii_strncasecmp ((gchar *) cur, "https://", 8) ||
298 !g_ascii_strncasecmp ((gchar *) cur, "ftp://", 6) ||
299 !g_ascii_strncasecmp ((gchar *) cur, "nntp://", 7) ||
300 !g_ascii_strncasecmp ((gchar *) cur, "mailto:", 7) ||
301 !g_ascii_strncasecmp ((gchar *) cur, "news:", 5) ||
302 !g_ascii_strncasecmp ((gchar *) cur, "file:", 5) ||
303 !g_ascii_strncasecmp ((gchar *) cur, "callto:", 7) ||
304 !g_ascii_strncasecmp ((gchar *) cur, "h323:", 5) ||
305 !g_ascii_strncasecmp ((gchar *) cur, "sip:", 4) ||
306 !g_ascii_strncasecmp ((gchar *) cur, "tel:", 4) ||
307 !g_ascii_strncasecmp ((gchar *) cur, "webcal:", 7)) {
308 tmpurl = url_extract (&cur, TRUE, (flags & E_TEXT_TO_HTML_URL_IS_WHOLE_TEXT) != 0);
309 if (tmpurl) {
310 refurl = e_text_to_html (tmpurl, 0);
311 if ((flags & E_TEXT_TO_HTML_HIDE_URL_SCHEME) != 0) {
312 const gchar *str;
313
314 str = strchr (refurl, ':');
315 if (str) {
316 str++;
317 if (g_ascii_strncasecmp (str, "//", 2) == 0) {
318 str += 2;
319 }
320
321 dispurl = g_strdup (str);
322 } else {
323 dispurl = g_strdup (refurl);
324 }
325 } else {
326 dispurl = g_strdup (refurl);
327 }
328 }
329 } else if (!g_ascii_strncasecmp ((gchar *) cur, "www.", 4) &&
330 is_url_char (*(cur + 4))) {
331 tmpurl = url_extract (&cur, FALSE, (flags & E_TEXT_TO_HTML_URL_IS_WHOLE_TEXT) != 0);
332 if (tmpurl) {
333 dispurl = e_text_to_html (tmpurl, 0);
334 refurl = g_strdup_printf (
335 "http://%s", dispurl);
336 }
337 }
338
339 if (tmpurl) {
340 if ((flags & E_TEXT_TO_HTML_URL_IS_WHOLE_TEXT) != 0) {
341 /* also remove any spaces in refurl */
342 gchar *replaced, **split_url;
343
344 split_url = g_strsplit (refurl, " ", 0);
345 replaced = g_strjoinv ("", split_url);
346 g_strfreev (split_url);
347
348 g_free (refurl);
349 refurl = replaced;
350 }
351
352 out = check_size (
353 &buffer, &buffer_size, out,
354 strlen (refurl) +
355 strlen (dispurl) + 15);
356 out += sprintf (out,
357 "<a href=\"%s\">%s</a>",
358 refurl, dispurl);
359 col += strlen (tmpurl);
360 g_free (tmpurl);
361 g_free (refurl);
362 g_free (dispurl);
363 }
364
365 if (!*cur)
366 break;
367 u = g_utf8_get_char ((gchar *) cur);
368 }
369
370 if (u == '@' && (flags & E_TEXT_TO_HTML_CONVERT_ADDRESSES)) {
371 gchar *addr, *dispaddr, *outaddr;
372
373 addr = email_address_extract (&cur, &out, linestart);
374 if (addr) {
375 dispaddr = e_text_to_html (addr, 0);
376 outaddr = g_strdup_printf (
377 "<a href=\"mailto:%s\">%s</a>",
378 addr, dispaddr);
379 out = check_size (&buffer, &buffer_size, out, strlen (outaddr));
380 out += sprintf (out, "%s", outaddr);
381 col += strlen (addr);
382 g_free (addr);
383 g_free (dispaddr);
384 g_free (outaddr);
385
386 if (!*cur)
387 break;
388 u = g_utf8_get_char ((gchar *) cur);
389 }
390 }
391
392 if (!g_unichar_validate (u)) {
393 /* Sigh. Someone sent undeclared 8-bit data.
394 * Assume it's iso-8859-1.
395 */
396 u = *cur;
397 next = cur + 1;
398 } else
399 next = (const guchar *) g_utf8_next_char (cur);
400
401 out = check_size (&buffer, &buffer_size, out, 10);
402
403 switch (u) {
404 case '<':
405 strcpy (out, "<");
406 out += 4;
407 col++;
408 break;
409
410 case '>':
411 strcpy (out, ">");
412 out += 4;
413 col++;
414 break;
415
416 case '&':
417 strcpy (out, "&");
418 out += 5;
419 col++;
420 break;
421
422 case '"':
423 strcpy (out, """);
424 out += 6;
425 col++;
426 break;
427
428 case '\n':
429 if (flags & E_TEXT_TO_HTML_CONVERT_NL) {
430 strcpy (out, "<br>");
431 out += 4;
432 }
433 *out++ = *cur;
434 linestart = cur;
435 col = 0;
436 break;
437
438 case '\t':
439 if (flags & (E_TEXT_TO_HTML_CONVERT_SPACES |
440 E_TEXT_TO_HTML_CONVERT_NL)) {
441 do {
442 out = check_size (
443 &buffer, &buffer_size, out, 7);
444 strcpy (out, " ");
445 out += 6;
446 col++;
447 } while (col % 8);
448 break;
449 }
450 /* falls through */
451
452 case ' ':
453 if ((flags & (E_TEXT_TO_HTML_CONVERT_SPACES | E_TEXT_TO_HTML_CONVERT_ALL_SPACES)) != 0) {
454 if ((flags & E_TEXT_TO_HTML_CONVERT_ALL_SPACES) != 0 ||
455 cur == (const guchar *) input ||
456 *(cur + 1) == ' ' || *(cur + 1) == '\t' ||
457 *(cur - 1) == '\n') {
458 strcpy (out, " ");
459 out += 6;
460 col++;
461 break;
462 }
463 }
464 /* falls through */
465
466 default:
467 if ((u >= 0x20 && u < 0x80) ||
468 (u == '\r' || u == '\t')) {
469 /* Default case, just copy. */
470 *out++ = u;
471 } else {
472 if (flags & E_TEXT_TO_HTML_ESCAPE_8BIT)
473 *out++ = '?';
474 else
475 out += g_snprintf (out, 9, "&#%d;", u);
476 }
477 col++;
478 break;
479 }
480 }
481
482 out = check_size (&buffer, &buffer_size, out, 7);
483 if (flags & E_TEXT_TO_HTML_PRE)
484 strcpy (out, "</PRE>");
485 else
486 *out = '\0';
487
488 return buffer;
489 }
490
491 gchar *
e_text_to_html(const gchar * input,guint flags)492 e_text_to_html (const gchar *input,
493 guint flags)
494 {
495 return e_text_to_html_full (input, flags, 0);
496 }
497
498 #ifdef E_HTML_UTILS_TEST
499
500 struct {
501 gchar *text, *url;
502 } url_tests[] = {
503 { "bob@foo.com", "mailto:bob@foo.com" },
504 { "Ends with bob@foo.com", "mailto:bob@foo.com" },
505 { "bob@foo.com at start", "mailto:bob@foo.com" },
506 { "bob@foo.com.", "mailto:bob@foo.com" },
507 { "\"bob@foo.com\"", "mailto:bob@foo.com" },
508 { "<bob@foo.com>", "mailto:bob@foo.com" },
509 { "(bob@foo.com)", "mailto:bob@foo.com" },
510 { "bob@foo.com, 555-9999", "mailto:bob@foo.com" },
511 { "|bob@foo.com|555-9999|", "mailto:bob@foo.com" },
512 { "bob@ no match bob@", NULL },
513 { "@foo.com no match @foo.com", NULL },
514 { "\"bob\"@foo.com", NULL },
515 { "M@ke money fast!", NULL },
516 { "ASCII art @_@ @>->-", NULL },
517
518 { "http://www.foo.com", "http://www.foo.com" },
519 { "Ends with http://www.foo.com", "http://www.foo.com" },
520 { "http://www.foo.com at start", "http://www.foo.com" },
521 { "http://www.foo.com.", "http://www.foo.com" },
522 { "http://www.foo.com/.", "http://www.foo.com/" },
523 { "<http://www.foo.com>", "http://www.foo.com" },
524 { "(http://www.foo.com)", "http://www.foo.com" },
525 { "http://www.foo.com, 555-9999", "http://www.foo.com" },
526 { "|http://www.foo.com|555-9999|", "http://www.foo.com" },
527 { "foo http://www.foo.com/ bar", "http://www.foo.com/" },
528 { "foo http://www.foo.com/index.html bar",
529 "http://www.foo.com/index.html" },
530 { "foo http://www.foo.com/q?99 bar", "http://www.foo.com/q?99" },
531 { "foo http://www.foo.com/;foo=bar&baz=quux bar",
532 "http://www.foo.com/;foo=bar&baz=quux" },
533 { "foo http://www.foo.com/index.html#anchor bar",
534 "http://www.foo.com/index.html#anchor" },
535 { "http://www.foo.com/index.html; foo",
536 "http://www.foo.com/index.html" },
537 { "http://www.foo.com/index.html: foo",
538 "http://www.foo.com/index.html" },
539 { "http://www.foo.com/index.html-- foo",
540 "http://www.foo.com/index.html" },
541 { "http://www.foo.com/index.html?",
542 "http://www.foo.com/index.html" },
543 { "http://www.foo.com/index.html!",
544 "http://www.foo.com/index.html" },
545 { "\"http://www.foo.com/index.html\"",
546 "http://www.foo.com/index.html" },
547 { "'http://www.foo.com/index.html'",
548 "http://www.foo.com/index.html" },
549 { "http://bob@www.foo.com/bar/baz/",
550 "http://bob@www.foo.com/bar/baz/" },
551 { "http no match http", NULL },
552 { "http: no match http:", NULL },
553 { "http:// no match http://", NULL },
554 { "unrecognized://bob@foo.com/path", NULL },
555
556 { "src/www.c", NULL },
557 { "Ewwwwww.Gross.", NULL },
558
559 };
560 gint num_url_tests = G_N_ELEMENTS (url_tests);
561
562 gint
main(gint argc,gchar ** argv)563 main (gint argc,
564 gchar **argv)
565 {
566 gint i, errors = 0;
567 gchar *html, *url, *p;
568
569 for (i = 0; i < num_url_tests; i++) {
570 html = e_text_to_html (
571 url_tests[i].text,
572 E_TEXT_TO_HTML_CONVERT_URLS |
573 E_TEXT_TO_HTML_CONVERT_ADDRESSES);
574
575 url = strstr (html, "href=\"");
576 if (url) {
577 url += 6;
578 p = strchr (url, '"');
579 if (p)
580 *p = '\0';
581
582 while ((p = strstr (url, "&")))
583 memmove (p + 1, p + 5, strlen (p + 5) + 1);
584 }
585
586 if ((url && (!url_tests[i].url || strcmp (url, url_tests[i].url) != 0)) ||
587 (!url && url_tests[i].url)) {
588 printf (
589 "FAILED on \"%s\" -> %s\n (got %s)\n\n",
590 url_tests[i].text,
591 url_tests[i].url ? url_tests[i].url : "(nothing)",
592 url ? url : "(nothing)");
593 errors++;
594 }
595
596 g_free (html);
597 }
598
599 printf ("\n%d errors\n", errors);
600 return errors;
601 }
602 #endif
603