1 /* Copyright (c) 2009-2018 Dovecot authors, see the included COPYING file */
2
3 #include "lib.h"
4 #include "str.h"
5 #include "unichar.h"
6 #include "base64.h"
7 #include "message-header-encode.h"
8
/* Number of octets the encoded-word framing ("=?utf-8?q?" in front and "?="
   behind) adds around the encoded text. */
#define MIME_WRAPPER_LEN (sizeof("=?utf-8?q?" "?=") - 1)
/* Line length limit used when deciding where encoded output is wrapped. */
#define MIME_MAX_LINE_LEN 76

/* Non-zero if c is a space, TAB or LF. The argument is evaluated more than
   once, so it must not have side effects. */
#define IS_LWSP(c) \
	((c) == ' ' || (c) == '\t' || (c) == '\n')
14
15 static bool
input_idx_need_encoding(const unsigned char * input,size_t i,size_t len)16 input_idx_need_encoding(const unsigned char *input, size_t i, size_t len)
17 {
18 switch (input[i]) {
19 case '\r':
20 if (i+1 == len || input[i+1] != '\n')
21 return TRUE;
22 i++;
23 /* fall through - verify the LF as well */
24 case '\n':
25 if (i+1 == len) {
26 /* trailing LF - we need to drop it */
27 return TRUE;
28 }
29 i_assert(i+1 < len);
30 if (input[i+1] != '\t' && input[i+1] != ' ') {
31 /* LF not followed by whitespace - we need to
32 add the whitespace */
33 return TRUE;
34 }
35 break;
36 case '\t':
37 /* TAB doesn't need to be encoded */
38 break;
39 case '=':
40 /* <LWSP>=? - we need to check backwards a bit to see if
41 there is LWSP (note that we don't want to return TRUE for
42 the LWSP itself yet, so we need to do this backwards
43 check) */
44 if ((i == 0 || IS_LWSP(input[i-1])) && i+2 <= len &&
45 memcmp(input + i, "=?", 2) == 0)
46 return TRUE;
47 break;
48 default:
49 /* 8bit chars */
50 if ((input[i] & 0x80) != 0)
51 return TRUE;
52 /* control chars */
53 if (input[i] < 32)
54 return TRUE;
55 break;
56 }
57 return FALSE;
58 }
59
/* Append input[0..len-1] to output as one or more "=?utf-8?q?...?="
   encoded-words. Nothing is written when len is 0.

   first_line_len is the amount of space already used on the current line.
   If it leaves too little room, the output begins with "\n\t"; otherwise it
   is subtracted from the space available for the first encoded-word.
   Whenever the next character doesn't fit, the encoded-word is closed and a
   new one is started after "\n\t".

   Each run of octets that isn't valid UTF-8 is replaced by a single Unicode
   replacement character. */
void message_header_encode_q(const unsigned char *input, size_t len,
			     string_t *output, size_t first_line_len)
{
	static const unsigned char *rep_char =
		(const unsigned char *)UNICODE_REPLACEMENT_CHAR_UTF8;
	static const unsigned int rep_char_len =
		UNICODE_REPLACEMENT_CHAR_UTF8_LEN;
	size_t room;
	bool in_bad_burst = FALSE;

	if (len == 0)
		return;

	room = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN;
	if (first_line_len < MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - 3) {
		room -= first_line_len;
	} else {
		/* not enough room left on this line - begin on a new one */
		str_append(output, "\n\t");
		room--;
	}

	str_append(output, "=?utf-8?q?");
	do {
		const unsigned char first = input[0];
		unichar_t ch;
		int nch = 1;
		size_t consumed = 1, emitted, k;

		/* Figure out how many input octets this step consumes and how
		   many output octets it produces. */
		if (first == ' ') {
			/* Space becomes a single '_'. */
			emitted = 1;
		} else if (first == '=' || first == '?' || first == '_') {
			/* Characters with a special meaning are escaped. */
			emitted = 3;
		} else {
			nch = uni_utf8_get_char_n(input, len, &ch);
			if (nch <= 0) {
				/* Invalid UTF-8: only the first octet of a
				   burst produces a replacement character,
				   the following ones produce nothing. */
				emitted = in_bad_burst ? 0 : rep_char_len * 3;
			} else if (nch > 1) {
				/* Multi-octet character: one escape sequence
				   per octet. */
				consumed = nch;
				emitted = nch * 3;
			} else if (ch < 0x20 || ch > 0x7e) {
				/* Control characters are escaped. */
				i_assert(ch < 0x80);
				emitted = 3;
			} else {
				/* Remaining ASCII is copied directly. */
				emitted = 1;
			}
		}
		in_bad_burst = (nch <= 0);

		/* Continue on a new line when the current one has
		   insufficient space for this character. */
		if (room < emitted) {
			str_append(output, "?=\n\t=?utf-8?q?");
			room = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - 1;
		}

		/* Write the encoded character. */
		if (first == ' ') {
			str_append_c(output, '_');
		} else if (in_bad_burst) {
			/* Replacement character; emitted is 0 for all but
			   the first octet of the burst. */
			for (k = 0; emitted > 0 && k < rep_char_len; k++)
				str_printfa(output, "=%02X", rep_char[k]);
		} else if (emitted > 1) {
			/* Special character, control character or a valid
			   multi-octet UTF-8 character. */
			for (k = 0; k < consumed; k++)
				str_printfa(output, "=%02X", input[k]);
		} else {
			str_append_c(output, first);
		}

		/* Advance. */
		i_assert(len >= consumed);
		room -= emitted;
		input += consumed;
		len -= consumed;
	} while (len > 0);
	str_append(output, "?=");
}
175
/* Append input[0..len-1] to output as one or more "=?utf-8?b?...?="
   (base64) encoded-words. Nothing is written when len is 0.

   first_line_len is the amount of space already used on the current line.
   If it leaves too little room, the output begins with "\n\t"; otherwise it
   is subtracted from the space available for the first encoded-word.
   Fragments are split only between UTF-8 characters, and each run of octets
   that isn't valid UTF-8 is replaced by a single Unicode replacement
   character. */
void message_header_encode_b(const unsigned char *input, size_t len,
			     string_t *output, size_t first_line_len)
{
	static const unsigned char *rep_char =
		(const unsigned char *)UNICODE_REPLACEMENT_CHAR_UTF8;
	static const unsigned int rep_char_len =
		UNICODE_REPLACEMENT_CHAR_UTF8_LEN;
	struct base64_encoder b64enc;
	size_t room;

	if (len == 0)
		return;

	room = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN;
	if (first_line_len < MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - 3) {
		room -= first_line_len;
	} else {
		/* not enough room left on this line - begin on a new one */
		str_append(output, "\n\t");
		room--;
	}

	str_append(output, "=?utf-8?b?");
	base64_encode_init(&b64enc, &base64_scheme, 0, 0);
	do {
		unichar_t ch;
		size_t fit, limit, mark, consumed, emitted;
		int nch = 1;
		bool bad_input;

		/* How many input octets can still be encoded on (the rest
		   of) this line? */
		fit = base64_encode_get_full_space(&b64enc, room);
		limit = I_MIN(fit, len);

		/* Walk over whole UTF-8 characters to find where this
		   fragment has to end: at the limit, or at the first invalid
		   sequence. */
		consumed = 0;
		while (consumed < limit) {
			nch = uni_utf8_get_char_n(&input[consumed],
						  len - consumed, &ch);
			if (nch <= 0 || (consumed + nch) > limit)
				break;
			consumed += nch;
		}
		/* nch is reused by the skipping loop further below, so
		   remember now whether the scan stopped at invalid input. */
		bad_input = (nch <= 0);

		/* Encode the valid fragment found above. */
		if (consumed > 0) {
			mark = output->used;
			if (!base64_encode_more(&b64enc, input, consumed,
						&consumed, output))
				i_unreached();
			emitted = output->used - mark;

			/* Advance. */
			i_assert(len >= consumed);
			i_assert(room >= emitted);
			input += consumed;
			len -= consumed;
			room -= emitted;
		}

		/* If a replacement character has to be written, see how much
		   space the current line still has for it. */
		fit = 0;
		if (bad_input)
			fit = base64_encode_get_full_space(&b64enc, room);

		/* Start a new line once insufficient space is available:
		   either valid input remains, or the replacement character
		   doesn't fit. */
		if (bad_input ? fit < rep_char_len : len > 0) {
			mark = output->used;
			if (!base64_encode_finish(&b64enc, output))
				i_unreached();
			emitted = output->used - mark;
			i_assert(room >= emitted);

			str_append(output, "?=\n\t=?utf-8?b?");
			room = MIME_MAX_LINE_LEN - MIME_WRAPPER_LEN - 1;
			base64_encode_reset(&b64enc);
		}

		/* Write the replacement character if needed. */
		consumed = 0;
		emitted = 0;
		if (bad_input) {
			mark = output->used;
			if (!base64_encode_more(&b64enc, rep_char, rep_char_len,
						NULL, output))
				i_unreached();
			emitted = output->used - mark;

			/* Skip the offending octet and any invalid input
			   following it. */
			consumed = 1;
			while (consumed < len) {
				nch = uni_utf8_get_char_n(&input[consumed],
							  len - consumed, &ch);
				if (nch > 0)
					break;
				consumed++;
			}
		}

		/* Advance. */
		i_assert(room >= emitted);
		input += consumed;
		len -= consumed;
		room -= emitted;
	} while (len > 0);
	if (!base64_encode_finish(&b64enc, output))
		i_unreached();
	str_append(output, "?=");
}
298
/* Encode the NUL-terminated string input and append the result to output.
   Convenience wrapper around message_header_encode_data(). */
void message_header_encode(const char *input, string_t *output)
{
	const size_t input_len = strlen(input);

	message_header_encode_data((const unsigned char *)input, input_len,
				   output);
}
303
/* Append input[0..len-1] to output, encoding the parts that can't be written
   as-is.

   Data is handled one line at a time. Everything before the first word that
   needs encoding is copied verbatim. From that word up to the last word on
   the line that needs encoding, the data is written as "Q" or "B"
   encoded-words, and the rest of the line is copied verbatim. The following
   lines are handled by calling this function recursively.

   A trailing [CR]LF is dropped, and a line break that isn't followed by SP or
   TAB gets a TAB added after it. */
void message_header_encode_data(const unsigned char *input, size_t len,
				string_t *output)
{
	size_t i, j, first_line_len, cur_line_len, last_idx;
	size_t enc_chars, enc_len, base64_len, q_len;
	const unsigned char *next_line_input;
	size_t next_line_len = 0;
	bool use_q, cr;

	/* find the first word that needs encoding */
	for (i = 0; i < len; i++) {
		if (input_idx_need_encoding(input, i, len))
			break;
	}
	if (i == len) {
		/* no encoding necessary */
		str_append_data(output, input, len);
		return;
	}
	/* go back to the beginning of the word so it is fully encoded */
	if (input[i] != '\r' && input[i] != '\n') {
		while (i > 0 && !IS_LWSP(input[i-1]))
			i--;
	}

	/* write the prefix */
	str_append_data(output, input, i);

	/* The encoders need to know how much of the current output line is
	   already in use. Only the part of the prefix after its last LF is
	   on that line, so measure from there (j stays 0 if the prefix has
	   no LF, giving the whole prefix length). */
	j = i;
	while (j > 0 && input[j-1] != '\n')
		j--;
	first_line_len = i - j;

	input += i;
	len -= i;

	/* we'll encode data only up to the next LF, the rest is handled
	   recursively. */
	next_line_input = memchr(input, '\n', len);
	if (next_line_input != NULL) {
		cur_line_len = next_line_input - input;
		if (cur_line_len > 0 && input[cur_line_len-1] == '\r') {
			/* the CR of CRLF belongs to the line break */
			cur_line_len--;
			next_line_input = input + cur_line_len;
		}
		next_line_len = len - cur_line_len;
		len = cur_line_len;
	}

	/* find the last word that needs encoding */
	last_idx = 0;
	enc_chars = 0;
	for (i = 0; i < len; i++) {
		if (input_idx_need_encoding(input, i, len)) {
			last_idx = i + 1;
			enc_chars++;
		}
	}
	while (last_idx < len && !IS_LWSP(input[last_idx]))
		last_idx++;

	/* figure out if we should use Q or B encoding. Prefer Q if it's not
	   too much larger. */
	enc_len = last_idx;
	base64_len = MAX_BASE64_ENCODED_SIZE(enc_len);
	q_len = enc_len + enc_chars*3;
	use_q = q_len*2/3 <= base64_len;

	/* and do it. enc_len is 0 when only the line break itself needed
	   fixing. */
	if (enc_len > 0) {
		if (use_q) {
			message_header_encode_q(input, enc_len, output,
						first_line_len);
		} else {
			message_header_encode_b(input, enc_len, output,
						first_line_len);
		}
	}
	str_append_data(output, input + last_idx, len - last_idx);

	if (next_line_input != NULL) {
		/* we're at [CR]LF */
		i = 0;
		if (next_line_input[0] == '\r') {
			cr = TRUE;
			i++;
		} else {
			cr = FALSE;
		}
		i_assert(next_line_input[i] == '\n');
		if (++i == next_line_len)
			return; /* drop trailing [CR]LF */

		if (cr)
			str_append_c(output, '\r');
		str_append_c(output, '\n');

		if (next_line_input[i] == ' ' || next_line_input[i] == '\t') {
			str_append_c(output, next_line_input[i]);
			i++;
		} else {
			/* make it valid folding whitespace by adding a TAB */
			str_append_c(output, '\t');
		}
		message_header_encode_data(next_line_input+i, next_line_len-i,
					   output);
	}
}
407