1 /* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */
2 
3 #include "lib.h"
4 #include "buffer.h"
5 #include "str.h"
6 #include "istream.h"
7 #include "mail-html2text.h"
8 #include "message-parser.h"
9 #include "message-decoder.h"
10 #include "message-snippet.h"
11 
12 #include <ctype.h>
13 
/* Position of the snippet scanner within the current input line. The zero
   value must stay SNIPPET_STATE_NEWLINE, because a zero-initialized context
   is expected to start at the beginning of a line. */
enum snippet_state {
	/* At the start of a line: the next byte decides whether the line
	   is quoted ('>') or normal text. */
	SNIPPET_STATE_NEWLINE = 0,
	/* Inside a line of regular (non-quoted) text. */
	SNIPPET_STATE_NORMAL = 1,
	/* Inside a line that began with '>'. Its content is collected into
	   the quoted snippet until the end of the line. */
	SNIPPET_STATE_QUOTED = 2
};
22 
/* One snippet being accumulated, together with its remaining size budget. */
struct snippet_data {
	/* Text collected so far. */
	string_t *snippet;
	/* Remaining budget. snippet_add_content() decrements it once for
	   every character it appends and once for every separating space
	   it inserts; at 0 no further content is added. */
	unsigned int chars_left;
};
27 
/* Working state shared by the snippet generation helpers. */
struct snippet_context {
	/* Snippet built from non-quoted text; this is the preferred result. */
	struct snippet_data snippet;
	/* Snippet built from '>'-quoted lines; used only as a fallback when
	   the non-quoted snippet ends up empty. */
	struct snippet_data quoted_snippet;
	/* Where the scanner currently is within the input line. */
	enum snippet_state state;
	/* Set when whitespace was seen after some content; causes a single
	   space to be inserted before the next appended character. */
	bool add_whitespace;
	/* HTML-to-text converter; non-NULL while the current part is HTML. */
	struct mail_html2text *html2text;
	/* Scratch buffer that receives the converter's plain-text output. */
	buffer_t *plain_output;
};
36 
snippet_add_content(struct snippet_context * ctx,struct snippet_data * target,const unsigned char * data,size_t size,size_t * count_r)37 static void snippet_add_content(struct snippet_context *ctx,
38 				struct snippet_data *target,
39 				const unsigned char *data, size_t size,
40 				size_t *count_r)
41 {
42 	i_assert(target != NULL);
43 	if (size == 0)
44 		return;
45 	if (size >= 3 &&
46 	     ((data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) ||
47 	      (data[0] == 0xBF && data[1] == 0xBB && data[2] == 0xEF))) {
48 		*count_r = 3;
49 		return;
50 	}
51 	if (data[0] == '\0') {
52 		/* skip NULs without increasing snippet size */
53 		return;
54 	}
55 	if (i_isspace(*data)) {
56 		/* skip any leading whitespace */
57 		if (str_len(target->snippet) > 0)
58 			ctx->add_whitespace = TRUE;
59 		if (data[0] == '\n')
60 			ctx->state = SNIPPET_STATE_NEWLINE;
61 		return;
62 	}
63 	if (target->chars_left == 0)
64 		return;
65 	target->chars_left--;
66 	if (ctx->add_whitespace) {
67 		if (target->chars_left == 0) {
68 			/* don't add a trailing whitespace */
69 			return;
70 		}
71 		str_append_c(target->snippet, ' ');
72 		ctx->add_whitespace = FALSE;
73 		target->chars_left--;
74 	}
75 	*count_r = uni_utf8_char_bytes(data[0]);
76 	i_assert(*count_r <= size);
77 	str_append_data(target->snippet, data, *count_r);
78 }
79 
snippet_generate(struct snippet_context * ctx,const unsigned char * data,size_t size)80 static bool snippet_generate(struct snippet_context *ctx,
81 			     const unsigned char *data, size_t size)
82 {
83 	size_t i, count;
84 	struct snippet_data *target;
85 
86 	if (ctx->html2text != NULL) {
87 		buffer_set_used_size(ctx->plain_output, 0);
88 		mail_html2text_more(ctx->html2text, data, size,
89 				    ctx->plain_output);
90 		data = ctx->plain_output->data;
91 		size = ctx->plain_output->used;
92 	}
93 
94 	if (ctx->state == SNIPPET_STATE_QUOTED)
95 		target = &ctx->quoted_snippet;
96 	else
97 		target = &ctx->snippet;
98 
99 	/* message-decoder should feed us only valid and complete
100 	   UTF-8 input */
101 
102 	for (i = 0; i < size; i += count) {
103 		count = 1;
104 		switch (ctx->state) {
105 		case SNIPPET_STATE_NEWLINE:
106 			if (data[i] == '>') {
107 				ctx->state = SNIPPET_STATE_QUOTED;
108 				i++;
109 				target = &ctx->quoted_snippet;
110 			} else {
111 				ctx->state = SNIPPET_STATE_NORMAL;
112 				target = &ctx->snippet;
113 			}
114 			/* fallthrough */
115 		case SNIPPET_STATE_NORMAL:
116 		case SNIPPET_STATE_QUOTED:
117 			snippet_add_content(ctx, target, CONST_PTR_OFFSET(data, i),
118 					    size-i, &count);
119 			/* break here if we have enough non-quoted data,
120 			   quoted data does not need to break here as it's
121 			   only used if the actual snippet is left empty. */
122 			if (ctx->snippet.chars_left == 0)
123 				return FALSE;
124 			break;
125 		}
126 	}
127 	return TRUE;
128 }
129 
/* Append src to dst, leaving out any whitespace at the beginning of src. */
static void snippet_copy(const char *src, string_t *dst)
{
	const char *start = src;

	for (; *start != '\0'; start++) {
		if (!i_isspace(*start))
			break;
	}
	str_append(dst, start);
}
135 
/* Generate a short plain-text preview ("snippet") of the message read from
   input and append it to snippet.

   The message is parsed and decoded block by block. Only the first usable
   text part contributes: parts whose Content-Type isn't text are skipped,
   and HTML parts are converted to plain text first. Non-quoted text is
   preferred; if the message yields only '>'-quoted text, that text is used
   instead and the result is prefixed with '>'. Whitespace runs are collapsed
   to single spaces and leading whitespace is removed.

   max_snippet_chars is the budget of the generated snippet, counted in
   characters (one per UTF-8 character or inserted space). With a budget of 0
   nothing is appended.

   Returns 0 on success, -1 if the input stream has its stream_errno set. */
int message_snippet_generate(struct istream *input,
			     unsigned int max_snippet_chars,
			     string_t *snippet)
{
	const struct message_parser_settings parser_set = { .flags = 0 };
	struct message_parser_ctx *parser;
	struct message_part *parts;
	struct message_part *skip_part = NULL;
	struct message_decoder_context *decoder;
	struct message_block raw_block, block;
	struct snippet_context ctx;
	pool_t pool;
	int ret;

	i_zero(&ctx);
	pool = pool_alloconly_create("message snippet", 2048);
	ctx.snippet.snippet = str_new(pool, max_snippet_chars);
	ctx.snippet.chars_left = max_snippet_chars;
	ctx.quoted_snippet.snippet = str_new(pool, max_snippet_chars);
	/* Reserve one character for the '>' that is prepended to a quoted
	   snippet. The budget is unsigned, so guard against a zero maximum:
	   subtracting 1 would wrap around and make the quoted budget
	   practically unlimited. */
	ctx.quoted_snippet.chars_left =
		max_snippet_chars > 0 ? max_snippet_chars - 1 : 0;
	parser = message_parser_init(pool_datastack_create(), input, &parser_set);
	decoder = message_decoder_init(NULL, 0);
	while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
		if (raw_block.part == skip_part)
			continue;
		if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
			continue;
		if (block.size == 0) {
			const char *ct;

			/* a header line - nothing to do until the headers
			   have ended */
			if (block.hdr != NULL)
				continue;

			/* We already have a snippet, don't look for more in
			   subsequent parts. */
			if (ctx.snippet.snippet->used != 0 ||
			    ctx.quoted_snippet.snippet->used != 0)
				break;

			skip_part = NULL;

			/* end of headers - verify that we can use this
			   Content-Type. we get here only once, because we
			   always handle only one non-multipart MIME part. */
			ct = message_decoder_current_content_type(decoder);
			if (ct == NULL)
				/* text/plain */ ;
			else if (mail_html2text_content_type_match(ct)) {
				/* HTML: (re)create the converter and make
				   sure its output buffer exists */
				mail_html2text_deinit(&ctx.html2text);
				ctx.html2text = mail_html2text_init(0);
				if (ctx.plain_output == NULL) {
					ctx.plain_output =
						buffer_create_dynamic(pool, 1024);
				}
			} else if (strncasecmp(ct, "text/", 5) != 0)
				skip_part = raw_block.part;
		} else if (!snippet_generate(&ctx, block.data, block.size))
			break;
	}
	/* The loop ends either via break (ret > 0) or with a negative ret;
	   the parser is not expected to return 0 here. */
	i_assert(ret != 0);
	message_decoder_deinit(&decoder);
	message_parser_deinit(&parser, &parts);
	mail_html2text_deinit(&ctx.html2text);
	if (ctx.snippet.snippet->used != 0)
		snippet_copy(str_c(ctx.snippet.snippet), snippet);
	else if (ctx.quoted_snippet.snippet->used != 0) {
		/* only quoted text was found - mark the result as quoted */
		str_append_c(snippet, '>');
		snippet_copy(str_c(ctx.quoted_snippet.snippet), snippet);
	}
	pool_unref(&pool);
	return input->stream_errno == 0 ? 0 : -1;
}
208