1 /* Copyright (c) 2009-2018 Dovecot authors, see the included COPYING file */
2 
3 #include "lib.h"
4 #include "str.h"
5 #include "unichar.h"
6 #include "base64.h"
7 #include "message-header-encode.h"
8 
/* Number of octets taken by the encoded-word wrapper, i.e. the leading
   "=?utf-8?q?" plus the trailing "?=". Computed at compile time from the
   string literal (minus its NUL terminator). */
#define MIME_WRAPPER_LEN (sizeof("=?utf-8?q?" "?=") - 1)
/* Upper bound used for the length of an output line that carries an
   encoded-word. */
#define MIME_MAX_LINE_LEN 76

/* TRUE for the octets treated as word separators in this file: space, TAB
   and LF. Note that the argument is evaluated more than once. */
#define IS_LWSP(c) \
	((c) == ' ' || (c) == '\t' || (c) == '\n')
14 
15 static bool
input_idx_need_encoding(const unsigned char * input,size_t i,size_t len)16 input_idx_need_encoding(const unsigned char *input, size_t i, size_t len)
17 {
18 	switch (input[i]) {
19 	case '\r':
20 		if (i+1 == len || input[i+1] != '\n')
21 			return TRUE;
22 		i++;
23 		/* fall through - verify the LF as well */
24 	case '\n':
25 		if (i+1 == len) {
26 			/* trailing LF - we need to drop it */
27 			return TRUE;
28 		}
29 		i_assert(i+1 < len);
30 		if (input[i+1] != '\t' && input[i+1] != ' ') {
31 			/* LF not followed by whitespace - we need to
32 			   add the whitespace */
33 			return TRUE;
34 		}
35 		break;
36 	case '\t':
37 		/* TAB doesn't need to be encoded */
38 		break;
39 	case '=':
40 		/* <LWSP>=? - we need to check backwards a bit to see if
41 		   there is LWSP (note that we don't want to return TRUE for
42 		   the LWSP itself yet, so we need to do this backwards
43 		   check) */
44 		if ((i == 0 || IS_LWSP(input[i-1])) && i+2 <= len &&
45 		    memcmp(input + i, "=?", 2) == 0)
46 			return TRUE;
47 		break;
48 	default:
49 		/* 8bit chars */
50 		if ((input[i] & 0x80) != 0)
51 			return TRUE;
52 		/* control chars */
53 		if (input[i] < 32)
54 			return TRUE;
55 		break;
56 	}
57 	return FALSE;
58 }
59 
/* Append input to output as one or more "=?utf-8?q?...?=" encoded-words.

   input          - octets to encode
   len            - number of octets in input; nothing is written when 0
   output         - string the encoded-words are appended to
   first_line_len - number of octets already present on the output line
                    the first encoded-word is appended to

   Whenever the next character no longer fits on the current line, the
   encoded-word is closed and a new one is started on a continuation line
   ("\n\t"). Multi-octet UTF-8 characters are never split between two
   encoded-words. A run of octets that isn't valid UTF-8 is replaced by a
   single Unicode replacement character. */
void message_header_encode_q(const unsigned char *input, size_t len,
			     string_t *output, size_t first_line_len)
{
	static const unsigned char *rep_char =
		(const unsigned char *)UNICODE_REPLACEMENT_CHAR_UTF8;
	static const unsigned int rep_char_len =
		UNICODE_REPLACEMENT_CHAR_UTF8_LEN;
	/* payload octets that fit on a line next to the wrapper */
	const size_t full_line_len = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN;
	size_t room;
	bool prev_invalid = FALSE;

	if (len == 0)
		return;

	if (first_line_len >= full_line_len - 3) {
		/* practically nothing would fit on the current line, so
		   begin on a continuation line. the TAB takes one octet. */
		str_append(output, "\n\t");
		room = full_line_len - 1;
	} else {
		room = full_line_len - first_line_len;
	}

	str_append(output, "=?utf-8?q?");
	do {
		const unsigned char c = input[0];
		unichar_t ch;
		size_t consumed = 1, width, k;
		bool invalid = FALSE;

		/* Work out how many input octets the next character uses
		   (consumed) and how many output octets it needs (width). */
		if (c == ' ') {
			/* SP is written as a single '_' */
			width = 1;
		} else if (c == '=' || c == '?' || c == '_') {
			/* characters with a special meaning in Q encoding
			   are always escaped */
			width = 3;
		} else {
			int nch = uni_utf8_get_char_n(input, len, &ch);

			if (nch <= 0) {
				/* Not valid UTF-8: skip one octet. Only the
				   first octet of such a run produces a
				   replacement character. */
				invalid = TRUE;
				width = prev_invalid ? 0 : rep_char_len * 3;
			} else if (nch > 1) {
				/* multi-octet character: every octet gets
				   its own escape sequence */
				consumed = nch;
				width = consumed * 3;
			} else if (ch < 0x20 || ch > 0x7e) {
				/* single-octet control character */
				i_assert(ch < 0x80);
				width = 3;
			} else {
				/* printable ASCII is copied as is */
				width = 1;
			}
		}
		prev_invalid = invalid;

		/* Close this encoded-word and continue on a new line when
		   the character doesn't fit anymore. */
		if (room < width) {
			str_append(output, "?=\n\t=?utf-8?q?");
			room = full_line_len - 1;
		}

		/* Write out the character */
		if (c == ' ') {
			str_append_c(output, '_');
		} else if (invalid) {
			/* replacement character, unless one was already
			   written for this run of invalid octets */
			if (width > 0) {
				for (k = 0; k < rep_char_len; k++) {
					str_printfa(output, "=%02X",
						    rep_char[k]);
				}
			}
		} else if (width > 1) {
			/* escaped special/control character or multi-octet
			   UTF-8 character */
			for (k = 0; k < consumed; k++)
				str_printfa(output, "=%02X", input[k]);
		} else {
			str_append_c(output, c);
		}

		/* Advance past the consumed input */
		i_assert(len >= consumed);
		room -= width;
		input += consumed;
		len -= consumed;
	} while (len > 0);
	str_append(output, "?=");
}
175 
/* Append input to output as one or more "=?utf-8?b?...?=" encoded-words.

   input          - octets to encode
   len            - number of octets in input; nothing is written when 0
   output         - string the encoded-words are appended to
   first_line_len - number of octets already present on the output line
                    the first encoded-word is appended to

   The input is cut into fragments that fit on the remainder of the current
   line; each fragment ends on a UTF-8 character boundary and is base64
   encoded into its own encoded-word, with "\n\t" between encoded-words.
   A run of octets that isn't valid UTF-8 is replaced by a single Unicode
   replacement character. */
void message_header_encode_b(const unsigned char *input, size_t len,
			     string_t *output, size_t first_line_len)
{
	static const unsigned char *rep_char =
		(const unsigned char *)UNICODE_REPLACEMENT_CHAR_UTF8;
	static const unsigned int rep_char_len =
		UNICODE_REPLACEMENT_CHAR_UTF8_LEN;
	/* payload octets that fit on a line next to the wrapper */
	const size_t full_line_len = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN;
	struct base64_encoder b64enc;
	size_t room;

	if (len == 0)
		return;

	if (first_line_len >= full_line_len - 3) {
		/* practically nothing would fit on the current line, so
		   begin on a continuation line. the TAB takes one octet. */
		str_append(output, "\n\t");
		room = full_line_len - 1;
	} else {
		room = full_line_len - first_line_len;
	}

	str_append(output, "=?utf-8?b?");
	base64_encode_init(&b64enc, &base64_scheme, 0, 0);
	do {
		unichar_t ch;
		size_t capacity, limit, mark, take, written;
		int nch = 1;
		bool bad_octet;

		/* How many input octets can still be encoded onto (what is
		   left of) the current line? */
		capacity = base64_encode_get_full_space(&b64enc, room);
		limit = I_MIN(capacity, len);

		/* Walk the input one UTF-8 character at a time. Stop at the
		   first invalid sequence or at the last character that
		   still fits completely within the limit. */
		take = 0;
		while (take < limit) {
			nch = uni_utf8_get_char_n(input + take,
						  len - take, &ch);
			if (nch <= 0 || take + nch > limit)
				break;
			take += nch;
		}

		/* Encode the valid fragment that was found, if any. */
		if (take > 0) {
			mark = output->used;
			if (!base64_encode_more(&b64enc, input, take,
						&take, output))
				i_unreached();
			written = output->used - mark;

			i_assert(len >= take);
			i_assert(room >= written);
			input += take;
			len -= take;
			room -= written;
		}

		/* The scan stopped at an invalid sequence when nch <= 0. In
		   that case a replacement character is due and we need to
		   know whether it still fits on this line. */
		bad_octet = (nch <= 0);
		capacity = !bad_octet ? 0 :
			base64_encode_get_full_space(&b64enc, room);

		/* Move on to a new line if more valid input is pending, or
		   if the replacement character doesn't fit on this one. */
		if (bad_octet ? capacity < rep_char_len : len > 0) {
			mark = output->used;
			if (!base64_encode_finish(&b64enc, output))
				i_unreached();
			written = output->used - mark;
			i_assert(room >= written);

			str_append(output, "?=\n\t=?utf-8?b?");
			room = full_line_len - 1;
			base64_encode_reset(&b64enc);
		}

		/* Emit the replacement character when one is due. */
		take = 0;
		written = 0;
		if (bad_octet) {
			mark = output->used;
			if (!base64_encode_more(&b64enc, rep_char, rep_char_len,
						NULL, output))
				i_unreached();
			written = output->used - mark;

			/* The offending octet is dropped, and so are any
			   directly following octets at which no valid
			   character starts either. */
			take = 1;
			while (take < len &&
			       uni_utf8_get_char_n(input + take,
						   len - take, &ch) <= 0)
				take++;
		}

		/* Advance past what was handled in this round */
		i_assert(room >= written);
		input += take;
		len -= take;
		room -= written;
	} while (len > 0);
	if (!base64_encode_finish(&b64enc, output))
		i_unreached();
	str_append(output, "?=");
}
298 
/* Encode a NUL-terminated header value into output. This only measures the
   string and hands it over to message_header_encode_data(). */
void message_header_encode(const char *input, string_t *output)
{
	const unsigned char *data = (const void *)input;
	size_t data_len = strlen(input);

	message_header_encode_data(data, data_len, output);
}
303 
/* Append the header value in input to output, MIME-encoding only the part
   that needs it.

   input  - header value octets (not necessarily NUL-terminated)
   len    - number of octets in input
   output - string the result is appended to

   Everything before the first word that needs encoding is copied verbatim.
   On each line, the span from the first to the last word needing encoding
   is written as Q or B encoded-words, whichever is estimated to be more
   compact (Q is preferred when it isn't much larger). Text after that span
   is again copied verbatim. Input is handled one line at a time: a trailing
   [CR]LF is dropped, and an LF that isn't followed by SP/TAB gets a TAB
   added after it so that it becomes valid folding whitespace. */
void message_header_encode_data(const unsigned char *input, size_t len,
				string_t *output)
{
	size_t i, j, first_line_len, cur_line_len, last_idx;
	size_t enc_chars, enc_len, base64_len, q_len;
	const unsigned char *next_line_input;
	size_t next_line_len = 0;
	bool use_q, cr;

	/* find the first word that needs encoding */
	for (i = 0; i < len; i++) {
		if (input_idx_need_encoding(input, i, len))
			break;
	}
	if (i == len) {
		/* no encoding necessary */
		str_append_data(output, input, len);
		return;
	}
	/* go back to the beginning of the word so it is fully encoded.
	   a line break is handled where it stands. */
	if (input[i] != '\r' && input[i] != '\n') {
		while (i > 0 && !IS_LWSP(input[i-1]))
			i--;
	}

	/* write the prefix */
	str_append_data(output, input, i);

	/* The encoders need to know how many octets are already on the line
	   that the first encoded-word is appended to. The prefix may span
	   several lines (an LF followed by SP/TAB needs no encoding), so only
	   the part after its last LF counts: j is the offset where that last
	   line starts, and i - j is its length. Without any LF j stays 0 and
	   the whole prefix is counted. */
	j = i;
	while (j > 0 && input[j-1] != '\n')
		j--;
	first_line_len = i - j;

	input += i;
	len -= i;

	/* we'll encode data only up to the next LF, the rest is handled
	   recursively. */
	next_line_input = memchr(input, '\n', len);
	if (next_line_input != NULL) {
		cur_line_len = next_line_input - input;
		if (cur_line_len > 0 && input[cur_line_len-1] == '\r') {
			/* CRLF - keep the CR out of the current line too */
			cur_line_len--;
			next_line_input = input + cur_line_len;
		}
		next_line_len = len - cur_line_len;
		len = cur_line_len;
	}

	/* find the last word that needs encoding */
	last_idx = 0; enc_chars = 0;
	for (i = 0; i < len; i++) {
		if (input_idx_need_encoding(input, i, len)) {
			last_idx = i + 1;
			enc_chars++;
		}
	}
	/* extend to the end of that word */
	while (last_idx < len && !IS_LWSP(input[last_idx]))
		last_idx++;

	/* figure out if we should use Q or B encoding. Prefer Q if it's not
	   too much larger. */
	enc_len = last_idx;
	base64_len = MAX_BASE64_ENCODED_SIZE(enc_len);
	q_len = enc_len + enc_chars*3;
	use_q = q_len*2/3 <= base64_len;

	/* and do it */
	if (enc_len == 0)
		;
	else if (use_q)
		message_header_encode_q(input, enc_len, output, first_line_len);
	else
		message_header_encode_b(input, enc_len, output, first_line_len);
	/* the rest of the line needs no encoding */
	str_append_data(output, input + last_idx, len - last_idx);

	if (next_line_input != NULL) {
		/* we're at [CR]LF */
		i = 0;
		if (next_line_input[0] == '\r') {
			cr = TRUE;
			i++;
		} else {
			cr = FALSE;
		}
		i_assert(next_line_input[i] == '\n');
		if (++i == next_line_len)
			return; /* drop trailing [CR]LF */

		if (cr)
			str_append_c(output, '\r');
		str_append_c(output, '\n');

		if (next_line_input[i] == ' ' || next_line_input[i] == '\t') {
			str_append_c(output, next_line_input[i]);
			i++;
		} else {
			/* make it valid folding whitespace by adding a TAB */
			str_append_c(output, '\t');
		}
		/* NOTE: the folding whitespace octet written above isn't part
		   of the input given to the recursive call, so it isn't
		   included in the line length computed there. */
		message_header_encode_data(next_line_input+i, next_line_len-i,
					   output);
	}
}
407