1 /* Copyright (c) 2011-2018 Dovecot authors, see the included COPYING file */
2
3 #include "lib.h"
4 #include "buffer.h"
5 #include "unichar.h"
6 #include "message-parser.h"
7 #include "mail-html2text.h"
8
/* HTML whitespace as recognized by this parser: space, tab, CR and LF.
   Zero-width space (U+200B, &#x200B;) apparently also belongs here, but
   that gets a bit tricky to handle.. is it actually used anywhere?

   The check lives in an inline function behind the macro so that the
   argument is evaluated exactly once, even when it has side effects. */
static inline bool html_is_whitespace(int c)
{
	return c == ' ' || c == '\t' || c == '\r' || c == '\n';
}
#define HTML_WHITESPACE(c) html_is_whitespace(c)
13
/* States of the HTML parser. mail_html2text_init() never assigns the
   state explicitly, so the initial state is presumably whatever i_new()
   leaves there (NOTE(review): expected to be zero - confirm); the text
   state is therefore pinned to 0. */
enum html_state {
	/* ordinary text outside of any tag */
	HTML_STATE_TEXT = 0,
	/* inside a tag, but not inside a quoted string */
	HTML_STATE_TAG,
	/* inside a tag's "double quoted string" */
	HTML_STATE_TAG_DQUOTED,
	/* a backslash was seen inside a "double quoted string";
	   the next character is skipped */
	HTML_STATE_TAG_DQUOTED_ESCAPE,
	/* inside a tag's 'single quoted string' */
	HTML_STATE_TAG_SQUOTED,
	/* a backslash was seen inside a 'single quoted string';
	   the next character is skipped */
	HTML_STATE_TAG_SQUOTED_ESCAPE,
	/* inside a comment */
	HTML_STATE_COMMENT,
	/* the comment is about to end: "--" has been seen and only the
	   closing ">" is missing */
	HTML_STATE_COMMENT_END,
	/* inside a (java)script element */
	HTML_STATE_SCRIPT,
	/* inside a CSS style element */
	HTML_STATE_STYLE,
	/* inside <![CDATA[...]]> */
	HTML_STATE_CDATA,
};
38
39 struct mail_html2text {
40 enum mail_html2text_flags flags;
41 enum html_state state;
42 buffer_t *input;
43 unsigned int quote_level;
44 bool add_newline;
45 };
46
47 static struct {
48 const char *name;
49 unichar_t chr;
50 } html_entities[] = {
51 #include "html-entities.h"
52 };
53
54 struct mail_html2text *
mail_html2text_init(enum mail_html2text_flags flags)55 mail_html2text_init(enum mail_html2text_flags flags)
56 {
57 struct mail_html2text *ht;
58
59 ht = i_new(struct mail_html2text, 1);
60 ht->flags = flags;
61 ht->input = buffer_create_dynamic(default_pool, 512);
62 return ht;
63 }
64
65 static size_t
parse_tag_name(struct mail_html2text * ht,const unsigned char * data,size_t size)66 parse_tag_name(struct mail_html2text *ht,
67 const unsigned char *data, size_t size)
68 {
69 size_t i;
70
71 if (size >= 3 && memcmp(data, "!--", 3) == 0) {
72 ht->state = HTML_STATE_COMMENT;
73 return 3 + 1;
74 }
75 if (size >= 7 && i_memcasecmp(data, "script", 6) == 0 &&
76 (HTML_WHITESPACE(data[6]) || data[6] == '>')) {
77 ht->state = HTML_STATE_SCRIPT;
78 return 7 + 1;
79 }
80 if (size >= 6 && i_memcasecmp(data, "style", 5) == 0 &&
81 (HTML_WHITESPACE(data[5]) || data[5] == '>')) {
82 ht->state = HTML_STATE_STYLE;
83 return 6 + 1;
84 }
85 if (size >= 8 && i_memcasecmp(data, "![CDATA[", 8) == 0) {
86 ht->state = HTML_STATE_CDATA;
87 return 8 + 1;
88 }
89
90 if (size >= 11 && i_memcasecmp(data, "blockquote", 10) == 0 &&
91 (HTML_WHITESPACE(data[10]) || data[10] == '>')) {
92 ht->quote_level++;
93 ht->state = HTML_STATE_TAG;
94 return 1;
95 } else if (ht->quote_level > 0 &&
96 size >= 12 && i_memcasecmp(data, "/blockquote>", 12) == 0) {
97 ht->quote_level--;
98 if ((ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) == 0)
99 ht->add_newline = TRUE;
100 ht->state = HTML_STATE_TAG;
101 return 1;
102 }
103 if (size < 12) {
104 /* can we see the whole tag name? */
105 for (i = 0; i < size; i++) {
106 if (HTML_WHITESPACE(data[i]) || data[i] == '>')
107 break;
108 }
109 if (i == size) {
110 /* need more data */
111 return 0;
112 }
113 }
114 ht->state = HTML_STATE_TAG;
115 return 1;
116 }
117
/* Finds the code point for an entity name, i.e. the text between '&' and
   ';'. The name is first looked up (case-sensitively) from html_entities.
   If it isn't there, it's tried as a numeric reference: "#nnn" (decimal),
   or "#xhhh" / "#Xhhh" (hexadecimal).

   Returns TRUE and stores the code point in *chr_r on success. Returns
   FALSE without touching *chr_r if the name is unknown, or if the number
   can't be parsed or is rejected by uni_is_valid_ucs4(). */
static bool html_entity_get_unichar(const char *name, unichar_t *chr_r)
{
	unichar_t chr;
	bool parsed;

	for (size_t i = 0; i < N_ELEMENTS(html_entities); i++) {
		if (strcmp(html_entities[i].name, name) == 0) {
			*chr_r = html_entities[i].chr;
			return TRUE;
		}
	}

	/* maybe it's just an encoded code point:
	   it can be &#nnn; or &#xnnn; */
	if (name[0] != '#')
		return FALSE;

	/* HTML accepts both 'x' and 'X' as the hex marker */
	parsed = (name[1] == 'x' || name[1] == 'X') &&
		str_to_uint32_hex(name + 2, &chr) == 0;
	if (!parsed)
		parsed = str_to_uint32(name + 1, &chr) == 0;
	if (!parsed || !uni_is_valid_ucs4(chr))
		return FALSE;

	*chr_r = chr;
	return TRUE;
}
143
/* Parses the entity that follows a '&' (data points just past it) and, if
   the entity is recognized, appends its UTF-8 encoding to output.
   Unrecognized but well-formed entities produce no output.

   Returns 0 if the terminating ';' hasn't been seen yet and more data is
   needed. Returns 1 if the entity is broken (whitespace before ';', or a
   name that doesn't fit the local buffer), so that the caller skips only
   the '&'. Otherwise returns the length of the whole entity, including
   the '&' and the ';'. */
static size_t parse_entity(const unsigned char *data, size_t size,
			   buffer_t *output)
{
	char name[10];
	unichar_t chr;
	size_t len;

	for (len = 0; ; len++) {
		if (len == size) {
			/* ran out of data before finding ';' */
			return 0;
		}
		if (len >= sizeof(name) || HTML_WHITESPACE(data[len])) {
			/* broken entity */
			return 1;
		}
		if (data[len] == ';')
			break;
	}

	i_assert(len < sizeof(name));
	memcpy(name, data, len);
	name[len] = '\0';

	if (html_entity_get_unichar(name, &chr))
		uni_ucs4_to_utf8_c(chr, output);
	/* '&' + name + ';' */
	return len + 2;
}
169
/* Appends a single space to output, unless output is still empty or
   already ends with a space or a newline. */
static void mail_html2text_add_space(buffer_t *output)
{
	const unsigned char *bytes = output->data;
	unsigned char last;

	if (output->used == 0)
		return;

	last = bytes[output->used - 1];
	if (last == ' ' || last == '\n')
		return;

	buffer_append_c(output, ' ');
}
178
179 static size_t
parse_data(struct mail_html2text * ht,const unsigned char * data,size_t size,buffer_t * output)180 parse_data(struct mail_html2text *ht,
181 const unsigned char *data, size_t size, buffer_t *output)
182 {
183 size_t i, ret;
184
185 for (i = 0; i < size; i++) {
186 unsigned char c = data[i];
187
188 switch (ht->state) {
189 case HTML_STATE_TEXT:
190 if (c == '<') {
191 ret = parse_tag_name(ht, data+i+1, size-i-1);
192 if (ret == 0)
193 return i;
194 i += ret - 1;
195 } else if (ht->quote_level > 0 &&
196 (ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) != 0) {
197 break;
198 } else if (c == '&') {
199 ret = parse_entity(data+i+1, size-i-1, output);
200 if (ret == 0)
201 return i;
202 i += ret - 1;
203 } else {
204 buffer_append_c(output, c);
205 }
206 break;
207 case HTML_STATE_TAG:
208 if (c == '"')
209 ht->state = HTML_STATE_TAG_DQUOTED;
210 else if (c == '\'')
211 ht->state = HTML_STATE_TAG_SQUOTED;
212 else if (c == '>') {
213 ht->state = HTML_STATE_TEXT;
214 if (ht->quote_level > 0 &&
215 (ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) == 0) {
216 buffer_append(output, "\n>", 2);
217 } else if (ht->add_newline) {
218 buffer_append_c(output, '\n');
219 }
220 ht->add_newline = FALSE;
221 mail_html2text_add_space(output);
222 }
223 break;
224 case HTML_STATE_TAG_DQUOTED:
225 if (c == '"')
226 ht->state = HTML_STATE_TAG;
227 else if (c == '\\')
228 ht->state = HTML_STATE_TAG_DQUOTED_ESCAPE;
229 break;
230 case HTML_STATE_TAG_DQUOTED_ESCAPE:
231 ht->state = HTML_STATE_TAG_DQUOTED;
232 break;
233 case HTML_STATE_TAG_SQUOTED:
234 if (c == '\'')
235 ht->state = HTML_STATE_TAG;
236 else if (c == '\\')
237 ht->state = HTML_STATE_TAG_SQUOTED_ESCAPE;
238 break;
239 case HTML_STATE_TAG_SQUOTED_ESCAPE:
240 ht->state = HTML_STATE_TAG_SQUOTED;
241 break;
242 case HTML_STATE_COMMENT:
243 if (c == '-') {
244 if (i+1 == size)
245 return i;
246 if (data[i+1] == '-') {
247 ht->state = HTML_STATE_COMMENT_END;
248 i++;
249 }
250 }
251 break;
252 case HTML_STATE_COMMENT_END:
253 if (c == '>')
254 ht->state = HTML_STATE_TEXT;
255 else if (!HTML_WHITESPACE(c))
256 ht->state = HTML_STATE_COMMENT;
257 break;
258 case HTML_STATE_SCRIPT:
259 if (c == '<') {
260 unsigned int max_len = I_MIN(size-i, 9);
261
262 if (i_memcasecmp(data+i, "</script>", max_len) == 0) {
263 if (max_len < 9)
264 return i;
265 mail_html2text_add_space(output);
266 ht->state = HTML_STATE_TEXT;
267 i += 8;
268 }
269 }
270 break;
271 case HTML_STATE_STYLE:
272 if (c == '<') {
273 unsigned int max_len = I_MIN(size-i, 8);
274
275 if (i_memcasecmp(data+i, "</style>", max_len) == 0) {
276 if (max_len < 8)
277 return i;
278 mail_html2text_add_space(output);
279 ht->state = HTML_STATE_TEXT;
280 i += 7;
281 }
282 }
283 break;
284 case HTML_STATE_CDATA:
285 if (c == ']') {
286 unsigned int max_len = I_MIN(size-i, 3);
287
288 if (i_memcasecmp(data+i, "]]>", max_len) == 0) {
289 if (max_len < 3)
290 return i;
291 ht->state = HTML_STATE_TEXT;
292 i += 2;
293 break;
294 }
295 }
296 if (ht->quote_level == 0 ||
297 (ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) == 0)
298 buffer_append_c(output, c);
299 break;
300 }
301 }
302 return i;
303 }
304
/* Feeds the next piece of HTML input to the converter and appends the
   text extracted from it to output. size must be larger than 0.

   Input whose meaning can't be decided yet is kept in ht->input and is
   retried together with the data given in the next call. */
void mail_html2text_more(struct mail_html2text *ht,
			 const unsigned char *data, size_t size,
			 buffer_t *output)
{
	size_t consumed;

	i_assert(size > 0);

	while (ht->input->used > 0) {
		/* The previous input ended in the middle of something.
		   Retry it with some of the new data appended. */
		const size_t buffered = ht->input->used;
		const size_t chunk = I_MIN(size, 128);

		buffer_append(ht->input, data, chunk);
		consumed = parse_data(ht, ht->input->data,
				      ht->input->used, output);
		if (consumed == 0) {
			/* still undecided - the chunk stays in the buffer
			   and more data is added on the next round */
			data += chunk;
			size -= chunk;
			if (size == 0)
				return;
			continue;
		}
		if (consumed < buffered) {
			/* invalid input: the parser stopped again before
			   reaching the new data. Drop the chunk that was just
			   added and the bytes that got parsed, then retry. */
			buffer_set_used_size(ht->input, buffered);
			buffer_delete(ht->input, 0, consumed);
			continue;
		}
		/* Everything that was buffered got parsed. Continue straight
		   from the caller's data, past the part that was already
		   consumed through the buffer. */
		data += consumed - buffered;
		size -= consumed - buffered;
		buffer_set_used_size(ht->input, 0);
	}

	consumed = parse_data(ht, data, size, output);
	/* keep the unparsed tail for the next call */
	buffer_append(ht->input, data + consumed, size - consumed);
}
343
mail_html2text_deinit(struct mail_html2text ** _ht)344 void mail_html2text_deinit(struct mail_html2text **_ht)
345 {
346 struct mail_html2text *ht = *_ht;
347
348 if (ht == NULL)
349 return;
350
351 *_ht = NULL;
352 buffer_free(&ht->input);
353 i_free(ht);
354 }
355