1 /* Copyright (c) 2015-2018 Dovecot authors, see the included COPYING file */
2
3 #include "lib.h"
4 #include "buffer.h"
5 #include "str.h"
6 #include "istream.h"
7 #include "mail-html2text.h"
8 #include "message-parser.h"
9 #include "message-decoder.h"
10 #include "message-snippet.h"
11
12 #include <ctype.h>
13
/* Line-oriented parsing state used while scanning message body text. */
enum snippet_state {
	/* At the start of a line: the next byte decides whether the line is
	   quoted (begins with '>') or normal text. */
	SNIPPET_STATE_NEWLINE = 0,
	/* Inside a non-quoted line; content goes to the normal snippet. */
	SNIPPET_STATE_NORMAL = 1,
	/* Inside a quoted line; content goes to the quoted snippet until the
	   end of the line is reached. */
	SNIPPET_STATE_QUOTED = 2
};
22
/* One snippet being accumulated together with its remaining size budget. */
struct snippet_data {
	/* text collected so far */
	string_t *snippet;
	/* how many more characters may still be added; decremented once for
	   every appended character and once for every inserted space */
	unsigned int chars_left;
};
27
/* State shared by the snippet generation helpers for a single message. */
struct snippet_context {
	/* snippet built from non-quoted lines; this is the preferred result */
	struct snippet_data snippet;
	/* snippet built from lines beginning with '>'; only used as the
	   result when the non-quoted snippet ended up empty */
	struct snippet_data quoted_snippet;
	/* current line-parsing state */
	enum snippet_state state;
	/* TRUE when whitespace was seen after some content was already added:
	   a single space is inserted before the next added character */
	bool add_whitespace;
	/* non-NULL while the input is being converted from HTML to text */
	struct mail_html2text *html2text;
	/* scratch buffer receiving the HTML-to-text converter's output */
	buffer_t *plain_output;
};
36
snippet_add_content(struct snippet_context * ctx,struct snippet_data * target,const unsigned char * data,size_t size,size_t * count_r)37 static void snippet_add_content(struct snippet_context *ctx,
38 struct snippet_data *target,
39 const unsigned char *data, size_t size,
40 size_t *count_r)
41 {
42 i_assert(target != NULL);
43 if (size == 0)
44 return;
45 if (size >= 3 &&
46 ((data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) ||
47 (data[0] == 0xBF && data[1] == 0xBB && data[2] == 0xEF))) {
48 *count_r = 3;
49 return;
50 }
51 if (data[0] == '\0') {
52 /* skip NULs without increasing snippet size */
53 return;
54 }
55 if (i_isspace(*data)) {
56 /* skip any leading whitespace */
57 if (str_len(target->snippet) > 0)
58 ctx->add_whitespace = TRUE;
59 if (data[0] == '\n')
60 ctx->state = SNIPPET_STATE_NEWLINE;
61 return;
62 }
63 if (target->chars_left == 0)
64 return;
65 target->chars_left--;
66 if (ctx->add_whitespace) {
67 if (target->chars_left == 0) {
68 /* don't add a trailing whitespace */
69 return;
70 }
71 str_append_c(target->snippet, ' ');
72 ctx->add_whitespace = FALSE;
73 target->chars_left--;
74 }
75 *count_r = uni_utf8_char_bytes(data[0]);
76 i_assert(*count_r <= size);
77 str_append_data(target->snippet, data, *count_r);
78 }
79
snippet_generate(struct snippet_context * ctx,const unsigned char * data,size_t size)80 static bool snippet_generate(struct snippet_context *ctx,
81 const unsigned char *data, size_t size)
82 {
83 size_t i, count;
84 struct snippet_data *target;
85
86 if (ctx->html2text != NULL) {
87 buffer_set_used_size(ctx->plain_output, 0);
88 mail_html2text_more(ctx->html2text, data, size,
89 ctx->plain_output);
90 data = ctx->plain_output->data;
91 size = ctx->plain_output->used;
92 }
93
94 if (ctx->state == SNIPPET_STATE_QUOTED)
95 target = &ctx->quoted_snippet;
96 else
97 target = &ctx->snippet;
98
99 /* message-decoder should feed us only valid and complete
100 UTF-8 input */
101
102 for (i = 0; i < size; i += count) {
103 count = 1;
104 switch (ctx->state) {
105 case SNIPPET_STATE_NEWLINE:
106 if (data[i] == '>') {
107 ctx->state = SNIPPET_STATE_QUOTED;
108 i++;
109 target = &ctx->quoted_snippet;
110 } else {
111 ctx->state = SNIPPET_STATE_NORMAL;
112 target = &ctx->snippet;
113 }
114 /* fallthrough */
115 case SNIPPET_STATE_NORMAL:
116 case SNIPPET_STATE_QUOTED:
117 snippet_add_content(ctx, target, CONST_PTR_OFFSET(data, i),
118 size-i, &count);
119 /* break here if we have enough non-quoted data,
120 quoted data does not need to break here as it's
121 only used if the actual snippet is left empty. */
122 if (ctx->snippet.chars_left == 0)
123 return FALSE;
124 break;
125 }
126 }
127 return TRUE;
128 }
129
/* Append src to dst, leaving out any whitespace at the beginning of src. */
static void snippet_copy(const char *src, string_t *dst)
{
	const char *start;

	for (start = src; *start != '\0'; start++) {
		if (!i_isspace(*start))
			break;
	}
	str_append(dst, start);
}
135
/* Generate a short text preview ("snippet") of the message read from input
   and append it to snippet.

   input             - stream containing the message to parse
   max_snippet_chars - maximum number of characters to collect
   snippet           - string the result is appended to

   The snippet is taken from the first MIME part that yields any text. Parts
   whose Content-Type is neither HTML (as decided by
   mail_html2text_content_type_match()) nor text/ are skipped; a part without
   a Content-Type is handled as plain text. Non-quoted text is preferred.
   If only quoted text (lines beginning with '>') was found, the result is
   '>' followed by that quoted text.

   Returns 0 on success, -1 if the input stream has an error set. */
int message_snippet_generate(struct istream *input,
			     unsigned int max_snippet_chars,
			     string_t *snippet)
{
	const struct message_parser_settings parser_set = { .flags = 0 };
	struct message_parser_ctx *parser;
	struct message_part *parts;
	struct message_part *skip_part = NULL;
	struct message_decoder_context *decoder;
	struct message_block raw_block, block;
	struct snippet_context ctx;
	pool_t pool;
	int ret;

	i_zero(&ctx);
	pool = pool_alloconly_create("message snippet", 2048);
	ctx.snippet.snippet = str_new(pool, max_snippet_chars);
	ctx.snippet.chars_left = max_snippet_chars;
	ctx.quoted_snippet.snippet = str_new(pool, max_snippet_chars);
	/* -1 to leave room for the '>' prefix. Don't let a zero limit wrap
	   around to UINT_MAX, which would allow quoted text to be added
	   even though no characters were requested. */
	ctx.quoted_snippet.chars_left =
		max_snippet_chars > 0 ? max_snippet_chars - 1 : 0;
	parser = message_parser_init(pool_datastack_create(), input, &parser_set);
	decoder = message_decoder_init(NULL, 0);
	while ((ret = message_parser_parse_next_block(parser, &raw_block)) > 0) {
		if (raw_block.part == skip_part)
			continue;
		if (!message_decoder_decode_next_block(decoder, &raw_block, &block))
			continue;
		if (block.size == 0) {
			const char *ct;

			if (block.hdr != NULL)
				continue;

			/* We already have a snippet, don't look for more in
			   subsequent parts. */
			if (ctx.snippet.snippet->used != 0 ||
			    ctx.quoted_snippet.snippet->used != 0)
				break;

			skip_part = NULL;

			/* End of headers for a new part. This can be reached
			   more than once when earlier parts produced no
			   text, so drop all per-part state first: a new part
			   always begins at the start of a line, and an HTML
			   converter left over from an earlier (empty) HTML
			   part must not be applied to a non-HTML part.
			   NOTE: the deinit call is already relied upon to
			   accept NULL; it is assumed to also reset the
			   pointer to NULL, as *_deinit(&ptr) functions in
			   this code base conventionally do. */
			ctx.state = SNIPPET_STATE_NEWLINE;
			mail_html2text_deinit(&ctx.html2text);

			/* verify that we can use this Content-Type */
			ct = message_decoder_current_content_type(decoder);
			if (ct == NULL) {
				/* text/plain */
			} else if (mail_html2text_content_type_match(ct)) {
				ctx.html2text = mail_html2text_init(0);
				if (ctx.plain_output == NULL) {
					ctx.plain_output =
						buffer_create_dynamic(pool, 1024);
				}
			} else if (strncasecmp(ct, "text/", 5) != 0) {
				/* not textual content: ignore this part */
				skip_part = raw_block.part;
			}
		} else if (!snippet_generate(&ctx, block.data, block.size)) {
			/* the snippet is full */
			break;
		}
	}
	/* the parser must not ask for more input: either we stopped early
	   (ret > 0) or parsing finished or failed (ret < 0) */
	i_assert(ret != 0);
	message_decoder_deinit(&decoder);
	message_parser_deinit(&parser, &parts);
	mail_html2text_deinit(&ctx.html2text);
	if (ctx.snippet.snippet->used != 0)
		snippet_copy(str_c(ctx.snippet.snippet), snippet);
	else if (ctx.quoted_snippet.snippet->used != 0) {
		/* only quoted text was found: mark it as such */
		str_append_c(snippet, '>');
		snippet_copy(str_c(ctx.quoted_snippet.snippet), snippet);
	}
	pool_unref(&pool);
	return input->stream_errno == 0 ? 0 : -1;
}
208