1 /* Copyright (c) 2011-2018 Dovecot authors, see the included COPYING file */
2 
3 #include "lib.h"
4 #include "buffer.h"
5 #include "unichar.h"
6 #include "message-parser.h"
7 #include "mail-html2text.h"
8 
/* Bytes this parser treats as HTML whitespace: space, tab, CR and LF.
   Zero-width space (U+200B, &#x200B;) apparently belongs here as well, but
   it is trickier to handle and it's unclear whether it is actually used
   anywhere.

   The argument is evaluated more than once, so it must not have side
   effects. */
#define HTML_WHITESPACE(c) \
	((c) == ' ' || \
	 (c) == '\t' || \
	 (c) == '\r' || \
	 (c) == '\n')
13 
/* Parser states. The current state decides how the next input byte is
   interpreted. */
enum html_state {
	/* plain text between tags; this is what gets copied to the output */
	HTML_STATE_TEXT = 0,
	/* inside <a tag>, but not inside a quoted attribute value */
	HTML_STATE_TAG,
	/* inside a tag, within a "double quoted" string */
	HTML_STATE_TAG_DQUOTED,
	/* inside a "double quoted" string, right after a backslash */
	HTML_STATE_TAG_DQUOTED_ESCAPE,
	/* inside a tag, within a 'single quoted' string */
	HTML_STATE_TAG_SQUOTED,
	/* inside a 'single quoted' string, right after a backslash */
	HTML_STATE_TAG_SQUOTED_ESCAPE,
	/* inside a comment */
	HTML_STATE_COMMENT,
	/* "--" has been seen inside a comment; only the closing ">" is
	   still missing */
	HTML_STATE_COMMENT_END,
	/* inside a <script> element */
	HTML_STATE_SCRIPT,
	/* inside a <style> element */
	HTML_STATE_STYLE,
	/* inside <![CDATA[ ... ]]> */
	HTML_STATE_CDATA
};
38 
39 struct mail_html2text {
40 	enum mail_html2text_flags flags;
41 	enum html_state state;
42 	buffer_t *input;
43 	unsigned int quote_level;
44 	bool add_newline;
45 };
46 
47 static struct {
48 	const char *name;
49 	unichar_t chr;
50 } html_entities[] = {
51 #include "html-entities.h"
52 };
53 
54 struct mail_html2text *
mail_html2text_init(enum mail_html2text_flags flags)55 mail_html2text_init(enum mail_html2text_flags flags)
56 {
57 	struct mail_html2text *ht;
58 
59 	ht = i_new(struct mail_html2text, 1);
60 	ht->flags = flags;
61 	ht->input = buffer_create_dynamic(default_pool, 512);
62 	return ht;
63 }
64 
65 static size_t
parse_tag_name(struct mail_html2text * ht,const unsigned char * data,size_t size)66 parse_tag_name(struct mail_html2text *ht,
67 	       const unsigned char *data, size_t size)
68 {
69 	size_t i;
70 
71 	if (size >= 3 && memcmp(data, "!--", 3) == 0) {
72 		ht->state = HTML_STATE_COMMENT;
73 		return 3 + 1;
74 	}
75 	if (size >= 7 && i_memcasecmp(data, "script", 6) == 0 &&
76 	    (HTML_WHITESPACE(data[6]) || data[6] == '>')) {
77 		ht->state = HTML_STATE_SCRIPT;
78 		return 7 + 1;
79 	}
80 	if (size >= 6 && i_memcasecmp(data, "style", 5) == 0 &&
81 	    (HTML_WHITESPACE(data[5]) || data[5] == '>')) {
82 		ht->state = HTML_STATE_STYLE;
83 		return 6 + 1;
84 	}
85 	if (size >= 8 && i_memcasecmp(data, "![CDATA[", 8) == 0) {
86 		ht->state = HTML_STATE_CDATA;
87 		return 8 + 1;
88 	}
89 
90 	if (size >= 11 && i_memcasecmp(data, "blockquote", 10) == 0 &&
91 	    (HTML_WHITESPACE(data[10]) || data[10] == '>')) {
92 		ht->quote_level++;
93 		ht->state = HTML_STATE_TAG;
94 		return 1;
95 	} else if (ht->quote_level > 0 &&
96 		   size >= 12 && i_memcasecmp(data, "/blockquote>", 12) == 0) {
97 		ht->quote_level--;
98 		if ((ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) == 0)
99 			ht->add_newline = TRUE;
100 		ht->state = HTML_STATE_TAG;
101 		return 1;
102 	}
103 	if (size < 12) {
104 		/* can we see the whole tag name? */
105 		for (i = 0; i < size; i++) {
106 			if (HTML_WHITESPACE(data[i]) || data[i] == '>')
107 				break;
108 		}
109 		if (i == size) {
110 			/* need more data */
111 			return 0;
112 		}
113 	}
114 	ht->state = HTML_STATE_TAG;
115 	return 1;
116 }
117 
/* Translate an HTML entity name (the text between '&' and ';') into a
   Unicode code point.

   The name is first looked up (case-sensitively) in the html_entities
   table. If it isn't found there, it is tried as a numeric character
   reference: "#nnn" (decimal) or "#xhhh" / "#Xhhh" (hexadecimal). A numeric
   reference is accepted only if uni_is_valid_ucs4() approves the value.

   Returns TRUE and stores the code point in *chr_r on success. Returns
   FALSE and leaves *chr_r untouched if the entity isn't recognized. */
static bool html_entity_get_unichar(const char *name, unichar_t *chr_r)
{
	unichar_t chr;
	bool hex_prefix;

	for (size_t i = 0; i < N_ELEMENTS(html_entities); i++) {
		if (strcmp(html_entities[i].name, name) == 0) {
			*chr_r = html_entities[i].chr;
			return TRUE;
		}
	}

	/* maybe it's just encoded binary byte
	   it can be &#nnn; or &#xnnn;
	*/
	if (name[0] != '#')
		return FALSE;

	/* HTML allows both 'x' and 'X' to introduce a hexadecimal reference.
	   name[1] is safe to read: name[0] is '#', so name[1] is at worst
	   the terminating NUL. */
	hex_prefix = name[1] == 'x' || name[1] == 'X';
	if ((hex_prefix && str_to_uint32_hex(name+2, &chr) == 0) ||
	    str_to_uint32(name+1, &chr) == 0) {
		if (uni_is_valid_ucs4(chr)) {
			*chr_r = chr;
			return TRUE;
		}
	}
	return FALSE;
}
143 
/* Parse an entity reference. data points just past the '&'.

   If a terminating ';' is found, the entity is decoded with
   html_entity_get_unichar() and, if recognized, appended to output as
   UTF-8. An unrecognized entity is consumed without producing output.

   Returns the number of input bytes consumed, counting the '&' itself:
   - 0 if no ';' has been seen yet and more input is needed
   - 1 if the entity is broken (it contains whitespace, or its name doesn't
     fit into the local buffer); only the '&' is consumed
   - otherwise the length of the whole "&name;" sequence */
static size_t parse_entity(const unsigned char *data, size_t size,
			   buffer_t *output)
{
	char name[10];
	unichar_t chr;
	size_t len = 0;

	/* find the terminating ';' */
	while (len < size) {
		unsigned char c = data[len];

		if (len >= sizeof(name) || HTML_WHITESPACE(c)) {
			/* broken entity */
			return 1;
		}
		if (c == ';')
			break;
		len++;
	}
	if (len == size) {
		/* ran out of input before seeing ';' */
		return 0;
	}

	/* the loop guarantees there's room left for the NUL */
	i_assert(len < sizeof(name));
	memcpy(name, data, len);
	name[len] = '\0';

	if (html_entity_get_unichar(name, &chr))
		uni_ucs4_to_utf8_c(chr, output);
	/* '&' + name + ';' */
	return 1 + len + 1;
}
169 
/* Append a single space to output, unless output is empty or already ends
   with a space or a newline. */
static void mail_html2text_add_space(buffer_t *output)
{
	const unsigned char *bytes = output->data;
	unsigned char last;

	if (output->used == 0)
		return;

	last = bytes[output->used - 1];
	if (last != ' ' && last != '\n')
		buffer_append_c(output, ' ');
}
178 
179 static size_t
parse_data(struct mail_html2text * ht,const unsigned char * data,size_t size,buffer_t * output)180 parse_data(struct mail_html2text *ht,
181 	   const unsigned char *data, size_t size, buffer_t *output)
182 {
183 	size_t i, ret;
184 
185 	for (i = 0; i < size; i++) {
186 		unsigned char c = data[i];
187 
188 		switch (ht->state) {
189 		case HTML_STATE_TEXT:
190 			if (c == '<') {
191 				ret = parse_tag_name(ht, data+i+1, size-i-1);
192 				if (ret == 0)
193 					return i;
194 				i += ret - 1;
195 			} else if (ht->quote_level > 0 &&
196 				   (ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) != 0) {
197 					break;
198 			} else if (c == '&') {
199 				ret = parse_entity(data+i+1, size-i-1, output);
200 				if (ret == 0)
201 					return i;
202 				i += ret - 1;
203 			} else {
204 				buffer_append_c(output, c);
205 			}
206 			break;
207 		case HTML_STATE_TAG:
208 			if (c == '"')
209 				ht->state = HTML_STATE_TAG_DQUOTED;
210 			else if (c == '\'')
211 				ht->state = HTML_STATE_TAG_SQUOTED;
212 			else if (c == '>') {
213 				ht->state = HTML_STATE_TEXT;
214 				if (ht->quote_level > 0 &&
215 				    (ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) == 0) {
216 					buffer_append(output, "\n>", 2);
217 				} else if (ht->add_newline) {
218 					buffer_append_c(output, '\n');
219 				}
220 				ht->add_newline = FALSE;
221 				mail_html2text_add_space(output);
222 			}
223 			break;
224 		case HTML_STATE_TAG_DQUOTED:
225 			if (c == '"')
226 				ht->state = HTML_STATE_TAG;
227 			else if (c == '\\')
228 				ht->state = HTML_STATE_TAG_DQUOTED_ESCAPE;
229 			break;
230 		case HTML_STATE_TAG_DQUOTED_ESCAPE:
231 			ht->state = HTML_STATE_TAG_DQUOTED;
232 			break;
233 		case HTML_STATE_TAG_SQUOTED:
234 			if (c == '\'')
235 				ht->state = HTML_STATE_TAG;
236 			else if (c == '\\')
237 				ht->state = HTML_STATE_TAG_SQUOTED_ESCAPE;
238 			break;
239 		case HTML_STATE_TAG_SQUOTED_ESCAPE:
240 			ht->state = HTML_STATE_TAG_SQUOTED;
241 			break;
242 		case HTML_STATE_COMMENT:
243 			if (c == '-') {
244 				if (i+1 == size)
245 					return i;
246 				if (data[i+1] == '-') {
247 					ht->state = HTML_STATE_COMMENT_END;
248 					i++;
249 				}
250 			}
251 			break;
252 		case HTML_STATE_COMMENT_END:
253 			if (c == '>')
254 				ht->state = HTML_STATE_TEXT;
255 			else if (!HTML_WHITESPACE(c))
256 				ht->state = HTML_STATE_COMMENT;
257 			break;
258 		case HTML_STATE_SCRIPT:
259 			if (c == '<') {
260 				unsigned int max_len = I_MIN(size-i, 9);
261 
262 				if (i_memcasecmp(data+i, "</script>", max_len) == 0) {
263 					if (max_len < 9)
264 						return i;
265 					mail_html2text_add_space(output);
266 					ht->state = HTML_STATE_TEXT;
267 					i += 8;
268 				}
269 			}
270 			break;
271 		case HTML_STATE_STYLE:
272 			if (c == '<') {
273 				unsigned int max_len = I_MIN(size-i, 8);
274 
275 				if (i_memcasecmp(data+i, "</style>", max_len) == 0) {
276 					if (max_len < 8)
277 						return i;
278 					mail_html2text_add_space(output);
279 					ht->state = HTML_STATE_TEXT;
280 					i += 7;
281 				}
282 			}
283 			break;
284 		case HTML_STATE_CDATA:
285 			if (c == ']') {
286 				unsigned int max_len = I_MIN(size-i, 3);
287 
288 				if (i_memcasecmp(data+i, "]]>", max_len) == 0) {
289 					if (max_len < 3)
290 						return i;
291 					ht->state = HTML_STATE_TEXT;
292 					i += 2;
293 					break;
294 				}
295 			}
296 			if (ht->quote_level == 0 ||
297 			    (ht->flags & MAIL_HTML2TEXT_FLAG_SKIP_QUOTED) == 0)
298 				buffer_append_c(output, c);
299 			break;
300 		}
301 	}
302 	return i;
303 }
304 
/* Feed the next piece of HTML input to the converter. The plain text
   extracted from it is appended to output. size must be larger than 0.

   Input whose meaning can't be decided yet (e.g. a tag or an entity cut
   off at the end of data) is saved in ht->input and parsed together with
   the data given in a later call. */
void mail_html2text_more(struct mail_html2text *ht,
			 const unsigned char *data, size_t size,
			 buffer_t *output)
{
	size_t consumed;

	i_assert(size > 0);

	/* First deal with input left over from the previous call. It is
	   parsed together with a small chunk of the new data. */
	while (ht->input->used > 0) {
		size_t pending = ht->input->used;
		size_t chunk = I_MIN(size, 128);

		buffer_append(ht->input, data, chunk);
		consumed = parse_data(ht, ht->input->data,
				      ht->input->used, output);
		if (consumed >= pending) {
			/* we parsed forward: all of the old input, plus
			   possibly some of the new data, was consumed.
			   Continue below directly from the new data. */
			size_t advance = consumed - pending;

			data += advance;
			size -= advance;
			buffer_set_used_size(ht->input, 0);
		} else if (consumed > 0) {
			/* invalid input - only a part of the old input was
			   consumed. Take the new chunk back out, drop what
			   was parsed and retry. */
			buffer_set_used_size(ht->input, pending);
			buffer_delete(ht->input, 0, consumed);
		} else {
			/* nothing could be parsed - the chunk stays in the
			   buffer and more data needs to be added */
			data += chunk;
			size -= chunk;
			if (size == 0)
				return;
		}
	}

	/* Parse the (remaining) new data in place and save whatever couldn't
	   be parsed yet. */
	consumed = parse_data(ht, data, size, output);
	buffer_append(ht->input, data + consumed, size - consumed);
}
343 
mail_html2text_deinit(struct mail_html2text ** _ht)344 void mail_html2text_deinit(struct mail_html2text **_ht)
345 {
346 	struct mail_html2text *ht = *_ht;
347 
348 	if (ht == NULL)
349 		return;
350 
351 	*_ht = NULL;
352 	buffer_free(&ht->input);
353 	i_free(ht);
354 }
355