1 /*
2 * Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
3 * Copyright (C) 2007-2013 Sourcefire, Inc.
4 *
5 * Authors: Trog
6 *
7 * Summary: Normalise HTML text. Decode MS Script Encoder protection.
8 * The ScrEnc decoder was initially based upon an analysis by Andreas Marx.
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License version 2 as
12 * published by the Free Software Foundation.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
22 * MA 02110-1301, USA.
23 */
24
25 #if HAVE_CONFIG_H
26 #include "clamav-config.h"
27 #endif
28
29 #include <stdio.h>
30 #ifdef HAVE_UNISTD_H
31 #include <unistd.h>
32 #endif
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <fcntl.h>
36 #ifdef HAVE_STRINGS_H
37 #include <strings.h>
38 #endif
39 #include <string.h>
40 #include <errno.h>
41 #include <stdio.h>
42 #include <ctype.h>
43
44 #include "clamav.h"
45 #include "fmap.h"
46 #include "others.h"
47 #include "htmlnorm.h"
48
49 #include "entconv.h"
50 #include "jsparse/js-norm.h"
51
52 #define HTML_STR_LENGTH 1024
53 #define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH
54
/*
 * States of the HTML normalisation state machine run by cli_html_normalise().
 *
 * The parser keeps a current state plus a "next_state" that whitespace
 * skipping states return to. HTML_BAD_STATE is also used as the
 * "no follow-up state" marker for next_state.
 */
typedef enum {
    HTML_BAD_STATE, /* reaching this as the current state is logged as an engine error and aborts */
    HTML_NORM,      /* plain text outside of any tag */
    HTML_8BIT,      /* accumulating a run of bytes >= 0x80 into a multi-byte value */
    HTML_COMMENT,   /* entered when a tag name starts with '!' */
    HTML_CHAR_REF,  /* entered after an '&' */
    HTML_ENTITY_REF_DECODE,
    HTML_SKIP_WS,       /* skip whitespace, then switch to next_state */
    HTML_TRIM_WS,       /* like HTML_SKIP_WS, but emits one space to the no-comment output when not in a script */
    HTML_TAG,           /* reading a tag name */
    HTML_TAG_ARG,       /* reading an attribute name */
    HTML_TAG_ARG_VAL,   /* reading an attribute value */
    HTML_TAG_ARG_EQUAL, /* after an attribute name, checking whether an '=' follows */
    HTML_PROCESS_TAG,
    HTML_CHAR_REF_DECODE,
    HTML_LOOKFOR_SCRENC,
    HTML_JSDECODE,
    HTML_JSDECODE_LENGTH,
    HTML_JSDECODE_DECRYPT,
    HTML_SPECIAL_CHAR, /* only logged as "can't occur here" by the main loop */
    HTML_RFC2397_TYPE, /* entered when an attribute value starts with "data:" */
    HTML_RFC2397_INIT,
    HTML_RFC2397_DATA,
    HTML_RFC2397_FINISH,
    HTML_RFC2397_ESC,
    HTML_ESCAPE_CHAR
} html_state;
82
/*
 * Quoting style of the attribute value currently being parsed.
 * Set from the first character of the value (' or "); NOT_QUOTED otherwise.
 */
typedef enum {
    SINGLE_QUOTED,
    DOUBLE_QUOTED,
    NOT_QUOTED
} quoted_state;
88
/* Capacity, in bytes, of the write buffer inside file_buff_t. */
#define HTML_FILE_BUFF_LEN 8192

/*
 * Small write-behind buffer in front of a file descriptor.
 * Bytes are collected in @buffer and written to @fd by html_output_flush().
 */
typedef struct file_buff_tag {
    int fd;                                   /* descriptor the buffered bytes are written to */
    unsigned char buffer[HTML_FILE_BUFF_LEN]; /* pending, not yet written bytes */
    uint64_t length;                          /* number of valid bytes currently in buffer */
} file_buff_t;
96
/*
 * Accumulator for the displayed text of an <a> tag.
 * The extra byte in @contents leaves room for the terminating NUL that
 * html_tag_contents_done() appends.
 */
struct tag_contents {
    size_t pos; /* index of the next free byte in contents */
    unsigned char contents[MAX_TAG_CONTENTS_LENGTH + 1];
};
101
/*
 * Lookup table from a byte value to its 6-bit base64 value:
 * 'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62, '/' -> 63.
 * Every byte outside the base64 alphabet maps to -1.
 */
// clang-format off
static const int64_t base64_chars[256] = {
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
    52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
    -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 11,12,13,14,
    15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
    -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
    41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
    -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
};
// clang-format on
122
/*
 * Script Encoder table schedule: entry N selects which of the three
 * decrypt_tables (0, 1 or 2) decodes the N-th character. screnc_decode()
 * indexes this with screnc_state.table_pos, which it advances modulo 64.
 * The entries are written with a leading zero (octal literals); their values
 * are simply 0, 1 and 2.
 */
int table_order[] = {
    00, 02, 01, 00, 02, 01, 02, 01, 01, 02, 01, 02, 00, 01, 02, 01,
    00, 01, 02, 01, 00, 00, 02, 01, 01, 02, 00, 01, 02, 01, 01, 02,
    00, 00, 01, 02, 01, 02, 01, 00, 01, 00, 00, 02, 01, 00, 01, 02,
    00, 01, 02, 01, 00, 00, 02, 01, 01, 00, 00, 02, 01, 00, 01, 02};
128
/*
 * Script Encoder substitution tables. decrypt_tables[t][c] is the decoded
 * value of the encoded 7-bit character c under table t; the table to use for
 * each position comes from table_order. The value 0xFF (at index 0x40, '@',
 * in every table) is not a character: screnc_decode() treats it as the start
 * of a two-byte escape sequence.
 */
int decrypt_tables[3][128] = {
    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x57, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
     0x2E, 0x47, 0x7A, 0x56, 0x42, 0x6A, 0x2F, 0x26, 0x49, 0x41, 0x34, 0x32, 0x5B, 0x76, 0x72, 0x43,
     0x38, 0x39, 0x70, 0x45, 0x68, 0x71, 0x4F, 0x09, 0x62, 0x44, 0x23, 0x75, 0x3C, 0x7E, 0x3E, 0x5E,
     0xFF, 0x77, 0x4A, 0x61, 0x5D, 0x22, 0x4B, 0x6F, 0x4E, 0x3B, 0x4C, 0x50, 0x67, 0x2A, 0x7D, 0x74,
     0x54, 0x2B, 0x2D, 0x2C, 0x30, 0x6E, 0x6B, 0x66, 0x35, 0x25, 0x21, 0x64, 0x4D, 0x52, 0x63, 0x3F,
     0x7B, 0x78, 0x29, 0x28, 0x73, 0x59, 0x33, 0x7F, 0x6D, 0x55, 0x53, 0x7C, 0x3A, 0x5F, 0x65, 0x46,
     0x58, 0x31, 0x69, 0x6C, 0x5A, 0x48, 0x27, 0x5C, 0x3D, 0x24, 0x79, 0x37, 0x60, 0x51, 0x20, 0x36},

    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x7B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
     0x32, 0x30, 0x21, 0x29, 0x5B, 0x38, 0x33, 0x3D, 0x58, 0x3A, 0x35, 0x65, 0x39, 0x5C, 0x56, 0x73,
     0x66, 0x4E, 0x45, 0x6B, 0x62, 0x59, 0x78, 0x5E, 0x7D, 0x4A, 0x6D, 0x71, 0x3C, 0x60, 0x3E, 0x53,
     0xFF, 0x42, 0x27, 0x48, 0x72, 0x75, 0x31, 0x37, 0x4D, 0x52, 0x22, 0x54, 0x6A, 0x47, 0x64, 0x2D,
     0x20, 0x7F, 0x2E, 0x4C, 0x5D, 0x7E, 0x6C, 0x6F, 0x79, 0x74, 0x43, 0x26, 0x76, 0x25, 0x24, 0x2B,
     0x28, 0x23, 0x41, 0x34, 0x09, 0x2A, 0x44, 0x3F, 0x77, 0x3B, 0x55, 0x69, 0x61, 0x63, 0x50, 0x67,
     0x51, 0x49, 0x4F, 0x46, 0x68, 0x7C, 0x36, 0x70, 0x6E, 0x7A, 0x2F, 0x5F, 0x4B, 0x5A, 0x2C, 0x57},

    {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x6E, 0x0A, 0x0B, 0x0C, 0x06, 0x0E, 0x0F,
     0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
     0x2D, 0x75, 0x52, 0x60, 0x71, 0x5E, 0x49, 0x5C, 0x62, 0x7D, 0x29, 0x36, 0x20, 0x7C, 0x7A, 0x7F,
     0x6B, 0x63, 0x33, 0x2B, 0x68, 0x51, 0x66, 0x76, 0x31, 0x64, 0x54, 0x43, 0x3C, 0x3A, 0x3E, 0x7E,
     0xFF, 0x45, 0x2C, 0x2A, 0x74, 0x27, 0x37, 0x44, 0x79, 0x59, 0x2F, 0x6F, 0x26, 0x72, 0x6A, 0x39,
     0x7B, 0x3F, 0x38, 0x77, 0x67, 0x53, 0x47, 0x34, 0x78, 0x5D, 0x30, 0x23, 0x5A, 0x5B, 0x6C, 0x48,
     0x55, 0x70, 0x69, 0x2E, 0x4C, 0x21, 0x24, 0x4E, 0x50, 0x09, 0x56, 0x73, 0x35, 0x61, 0x4B, 0x58,
     0x3B, 0x57, 0x22, 0x6D, 0x4D, 0x25, 0x28, 0x46, 0x4A, 0x32, 0x41, 0x3D, 0x5F, 0x4F, 0x42, 0x65}};
156
/*
 * Find a whitespace boundary at which a chunk can be cut.
 *
 * Scans backwards from chunk[len - 1] for a whitespace character.
 *
 * @param chunk  buffer holding at least @len readable bytes
 * @param len    number of bytes in @chunk to consider
 * @return the length of the prefix that ends with the last whitespace
 *         character found at index >= 1, or the original @len when the scan
 *         reaches index 0 without finding one (the chunk is then kept whole).
 *         For @len == 0 the result is 0 and @chunk is not read.
 */
static inline unsigned int rewind_tospace(const unsigned char *chunk, unsigned int len)
{
    const unsigned int count = len;

    /* Test the bound first: with len == 0, "len - 1" wraps to UINT_MAX and
     * must never be used as an index. */
    while ((len > 1) && !isspace(chunk[len - 1])) {
        len--;
    }
    if (len == 1) {
        /* no usable whitespace boundary: keep the chunk whole */
        return count;
    }
    return len;
}
168
/*
 * Read the next chunk of input, dropping NUL bytes.
 * (This used to be called cli_readline, but it no longer stops at end-of-line.)
 *
 * When @m_area is non-NULL the data comes from the memory area (its fmap if
 * one is set, its plain buffer otherwise) and @stream is ignored; otherwise
 * the data is read from @stream.
 *
 * @param stream   input stream, used only when @m_area is NULL
 * @param m_area   memory area to read from; its offset is advanced past the
 *                 bytes consumed
 * @param max_len  size of the chunk buffer to allocate; at most max_len - 1
 *                 bytes are stored, followed by a terminating NUL
 * @return a buffer allocated with cli_malloc() holding the NUL-terminated
 *         chunk, or NULL on allocation failure, when no input is left, or
 *         when neither a memory area nor a stream is available. The buffer is
 *         handed to the caller (presumably the caller frees it -- the call
 *         site's cleanup is not part of this function).
 *
 * If the chunk was cut in the middle of a word, it is shortened back to the
 * last whitespace character and the input position is moved back accordingly,
 * so the cut-off tail is read again by the next call.
 */
static unsigned char *cli_readchunk(FILE *stream, m_area_t *m_area, unsigned int max_len)
{
    unsigned char *chunk, *start, *ptr, *end;
    unsigned int chunk_len, count;

    chunk = (unsigned char *)cli_malloc(max_len);
    if (!chunk) {
        cli_errmsg("readchunk: Unable to allocate memory for chunk\n");
        return NULL;
    }

    /* Try and use the memory buffer first */
    if (m_area) {
        /* maximum we can copy into the buffer,
         * we could have less than max_len bytes available */
        chunk_len = MIN(m_area->length - m_area->offset, max_len - 1);
        if (!chunk_len) {
            /* nothing left in the memory area */
            free(chunk);
            return NULL;
        }
        if (m_area->map)
            ptr = (unsigned char *)fmap_need_off_once(m_area->map, m_area->offset, chunk_len);
        else
            ptr = m_area->buffer + m_area->offset;
        start = ptr;
        /* end of the whole area, expressed relative to the current pointer.
         * NOTE(review): this arithmetic runs before the NULL check on
         * start/ptr below. */
        end = ptr - m_area->offset + m_area->length;

        if ((start >= end) || !start) {
            free(chunk);
            return NULL;
        }

        /* look for NUL chars */
        ptr = memchr(start, 0, chunk_len);
        if (!ptr) {
            /* no NUL chars found, copy all */
            memcpy(chunk, start, chunk_len);
            chunk[chunk_len] = '\0';
            m_area->offset += chunk_len;
            /* point ptr to end of chunk,
             * so we can check and rewind to a space below */
            ptr = start + chunk_len;
        } else {
            /* copy portion that doesn't contain NUL chars */
            chunk_len = ptr - start;
            if (chunk_len < max_len) {
                memcpy(chunk, start, chunk_len);
            } else {
                chunk_len = 0;
                ptr       = start;
            }
            /* make the rest of the area (from the first NUL to its end)
             * available through the fmap */
            if (m_area->map)
                ptr = (unsigned char *)fmap_need_ptr_once(m_area->map, ptr, end - ptr);
            if (!ptr) {
                cli_warnmsg("fmap inconsistency\n");
                /* ptr == end makes the copy loop below a no-op */
                ptr = end;
            }
            /* we have unknown number of NUL chars,
             * copy char-by-char and skip them */
            while ((ptr < end) && (chunk_len < max_len - 1)) {
                const unsigned char c = *ptr++;
                /* we can't use chunk_len to determine how many bytes we read, since
                 * we skipped chars */
                if (c) {
                    chunk[chunk_len++] = c;
                }
            }
            /* advance by the number of source bytes consumed, NULs included */
            m_area->offset += ptr - start;
            chunk[chunk_len] = '\0';
        }
        if (ptr && ptr < end && !isspace(*ptr)) {
            /* we hit max_len, rewind to a space */
            count = rewind_tospace(chunk, chunk_len);
            if (count < chunk_len) {
                chunk[count] = '\0';
                /* NOTE(review): chunk_len - count counts output bytes; if NULs
                 * were skipped inside the rewound tail, the offset moves back
                 * by fewer source bytes than were dropped -- confirm whether
                 * that matters. */
                m_area->offset -= chunk_len - count;
            }
        }
    } else {
        if (!stream) {
            cli_dbgmsg("No HTML stream\n");
            free(chunk);
            return NULL;
        }
        chunk_len = fread(chunk, 1, max_len - 1, stream);
        if (!chunk_len || chunk_len > max_len - 1) {
            /* EOF, or prevent overflow */
            free(chunk);
            return NULL;
        }

        /* Look for NUL chars */
        ptr = memchr(chunk, 0, chunk_len);
        if (ptr) {
            /* NUL char found */
            /* save buffer limits */
            start = ptr;
            end   = chunk + chunk_len;

            /* start of NUL chars, we will copy non-NUL characters
             * to this position */
            chunk_len = ptr - chunk;

            /* find first non-NUL char */
            while ((ptr < end) && !(*ptr)) {
                ptr++;
            }
            /* skip over NUL chars, and move back the rest (compacts the
             * buffer in place) */
            while ((ptr < end) && (chunk_len < max_len - 1)) {
                const unsigned char c = *ptr++;
                if (c) {
                    chunk[chunk_len++] = c;
                }
            }
        }
        chunk[chunk_len] = '\0';
        /* a completely full buffer means the read was probably cut mid-word */
        if (chunk_len == max_len - 1) {
            /* rewind to a space (which includes newline) */
            count = rewind_tospace(chunk, chunk_len);
            if (count < chunk_len) {
                chunk[count] = '\0';
                /* seek-back to space; the fseek() result is not checked */
                fseek(stream, -(long)(chunk_len - count), SEEK_CUR);
            }
        }
    }

    return chunk;
}
300
/*
 * Write any bytes pending in @fbuff to its file descriptor and mark the
 * buffer empty. A NULL or already empty buffer is left untouched.
 * The result of cli_writen() is not examined.
 */
static void html_output_flush(file_buff_t *fbuff)
{
    if (!fbuff || fbuff->length == 0) {
        return;
    }

    cli_writen(fbuff->fd, fbuff->buffer, fbuff->length);
    fbuff->length = 0;
}
308
/*
 * Append the single byte @c to @fbuff1, flushing the buffer first when it is
 * exactly full. Does nothing when @fbuff1 is NULL.
 */
static inline void html_output_c(file_buff_t *fbuff1, unsigned char c)
{
    if (!fbuff1) {
        return;
    }

    if (fbuff1->length == HTML_FILE_BUFF_LEN) {
        /* no room left: write out what we have and start over */
        html_output_flush(fbuff1);
    }
    fbuff1->buffer[fbuff1->length] = c;
    fbuff1->length++;
}
318
/*
 * Append @len bytes starting at @str to @fbuff.
 *
 * The buffer is flushed first if the new data would fill it. Data that is at
 * least as large as the whole buffer bypasses it and is written straight to
 * the file descriptor. Does nothing when @fbuff is NULL.
 */
static void html_output_str(file_buff_t *fbuff, const unsigned char *str, size_t len)
{
    if (!fbuff) {
        return;
    }

    if ((fbuff->length + len) >= HTML_FILE_BUFF_LEN) {
        html_output_flush(fbuff);
    }

    if (len >= HTML_FILE_BUFF_LEN) {
        /* too big to ever fit: drain the buffer, then write directly */
        html_output_flush(fbuff);
        cli_writen(fbuff->fd, str, len);
        return;
    }

    memcpy(&fbuff->buffer[fbuff->length], str, len);
    fbuff->length += len;
}
334
/*
 * Look up the value stored for the attribute named @tag.
 *
 * @return the value of the first entry whose name equals @tag (this may be
 *         NULL for an attribute stored without a value), or NULL when no
 *         entry matches. The returned pointer refers to storage inside @tags.
 */
static char *html_tag_arg_value(tag_arguments_t *tags, const char *tag)
{
    int idx = 0;

    while (idx < tags->count) {
        if (!strcmp((const char *)tags->tag[idx], tag)) {
            return (char *)tags->value[idx];
        }
        idx++;
    }
    return NULL;
}
346
/*
 * Replace the value of the first attribute named @tag with a copy of @value.
 * The previous value is freed. Nothing happens when no attribute matches.
 */
static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char *value)
{
    int idx;

    for (idx = 0; idx < tags->count; idx++) {
        if (strcmp((const char *)tags->tag[idx], tag) != 0) {
            continue;
        }
        free(tags->value[idx]);
        tags->value[idx] = (unsigned char *)cli_strdup(value);
        break;
    }
}
/*
 * Append an attribute (name plus optional value) to @tags.
 *
 * @param tags   list to extend; its tag/value arrays (and the contents array
 *               when tags->scanContents is set) are grown by one entry
 * @param tag    attribute name; a copy is stored
 * @param value  attribute value or NULL. A copy is stored. When the value
 *               starts with a double quote, that quote and the last character
 *               of the value (expected to be the closing quote) are dropped.
 *
 * On allocation failure of the arrays or of the name copy, the whole list is
 * released and reset to empty (count == 0); callers must not assume that the
 * entry was added. A failed copy of the value is stored as "no value" (NULL).
 */
void html_tag_arg_add(tag_arguments_t *tags,
                      const char *tag, char *value)
{
    int len, i, idx;
    tags->count++;
    idx = tags->count - 1; /* slot of the entry being added */
    tags->tag = (unsigned char **)cli_realloc2(tags->tag,
                                               tags->count * sizeof(char *));
    if (!tags->tag) {
        goto abort;
    }
    tags->value = (unsigned char **)cli_realloc2(tags->value,
                                                 tags->count * sizeof(char *));
    if (!tags->value) {
        goto abort;
    }
    if (tags->scanContents) {
        tags->contents = (unsigned char **)cli_realloc2(tags->contents,
                                                        tags->count * sizeof(*tags->contents));
        if (!tags->contents) {
            goto abort;
        }
        tags->contents[idx] = NULL;
    }
    tags->tag[idx] = (unsigned char *)cli_strdup(tag);
    if (!tags->tag[idx]) {
        /* A NULL name must never become visible: lookups and output run
         * strcmp()/strlen() on every stored name. Treat it like the other
         * allocation failures. */
        goto abort;
    }
    if (value) {
        if (*value == '"') {
            tags->value[idx] = (unsigned char *)cli_strdup(value + 1);
            len              = strlen((const char *)value + 1);
            /* strip the trailing character, but only if the copy succeeded */
            if (len > 0 && tags->value[idx]) {
                tags->value[idx][len - 1] = '\0';
            }
        } else {
            tags->value[idx] = (unsigned char *)cli_strdup(value);
        }
    } else {
        tags->value[idx] = NULL;
    }
    return;

abort:
    /* Bad error - can't do 100% recovery.
     * Only the entries that existed before this call are released; the new
     * slot holds no owned pointers at any point where we jump here. */
    tags->count--;
    for (i = 0; i < tags->count; i++) {
        if (tags->tag) {
            free(tags->tag[i]);
        }
        if (tags->value) {
            free(tags->value[i]);
        }
        if (tags->contents) {
            free(tags->contents[i]);
        }
    }
    free(tags->tag);
    free(tags->value);
    free(tags->contents);
    tags->contents = NULL;
    tags->tag = tags->value = NULL;
    tags->count             = 0;
    return;
}
427
/*
 * Write a normalised opening tag to @fbuff:
 *   <tag name="value" name2 ...>
 * Attribute names are written as stored; attribute values are lower-cased
 * and always wrapped in double quotes. Attributes without a value are
 * written as the bare name.
 */
static void html_output_tag(file_buff_t *fbuff, char *tag, tag_arguments_t *tags)
{
    int arg;

    html_output_c(fbuff, '<');
    html_output_str(fbuff, (const unsigned char *)tag, strlen(tag));

    for (arg = 0; arg < tags->count; arg++) {
        const unsigned char *name = tags->tag[arg];
        const unsigned char *val  = tags->value[arg];

        html_output_c(fbuff, ' ');
        html_output_str(fbuff, name, strlen((const char *)name));
        if (!val) {
            continue;
        }

        html_output_str(fbuff, (const unsigned char *)"=\"", 2);
        while (*val) {
            html_output_c(fbuff, tolower(*val));
            val++;
        }
        html_output_c(fbuff, '"');
    }

    html_output_c(fbuff, '>');
}
448
/*
 * Release everything owned by @tags (names, values, optional contents and
 * the arrays holding them) and reset it to an empty list.
 */
void html_tag_arg_free(tag_arguments_t *tags)
{
    int idx;

    for (idx = 0; idx < tags->count; idx++) {
        /* free(NULL) is a no-op, so absent values/contents need no check */
        free(tags->tag[idx]);
        free(tags->value[idx]);
        if (tags->contents) {
            free(tags->contents[idx]);
        }
    }

    free(tags->tag);
    free(tags->value);
    free(tags->contents);

    tags->tag      = NULL;
    tags->value    = NULL;
    tags->contents = NULL;
    tags->count    = 0;
}
474
475 /**
476 * the displayed text for an <a href> tag
477 */
html_tag_contents_append(struct tag_contents * cont,const unsigned char * begin,const unsigned char * end)478 static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char *begin, const unsigned char *end)
479 {
480 size_t i;
481 uint32_t mbchar = 0;
482 if (!begin || !end)
483 return;
484 for (i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end); i++) {
485 uint8_t c = *begin++;
486 if (mbchar && (c < 0x80 || mbchar >= 0x10000)) {
487 if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992 ||
488 (mbchar == 0xA1 && (c == 0x43 || c == 0x44 || c == 0x4F))) {
489 cont->contents[i++] = '.';
490 if (mbchar == 0xA1) {
491 --i;
492 mbchar = 0;
493 continue;
494 }
495 } else {
496 uint8_t c0 = mbchar >> 16;
497 uint8_t c1 = (mbchar >> 8) & 0xff;
498 uint8_t c2 = (mbchar & 0xff);
499 if (c0 && i + 1 < MAX_TAG_CONTENTS_LENGTH)
500 cont->contents[i++] = c0;
501 if ((c0 || c1) && i + 1 < MAX_TAG_CONTENTS_LENGTH)
502 cont->contents[i++] = c1;
503 if (i + 1 < MAX_TAG_CONTENTS_LENGTH)
504 cont->contents[i++] = c2;
505 }
506 mbchar = 0;
507 }
508 if (c >= 0x80) {
509 mbchar = (mbchar << 8) | c;
510 --i;
511 } else
512 cont->contents[i] = c;
513 }
514 cont->pos = i;
515 }
516
/*
 * Finish the text collected for an <a> tag: NUL-terminate it, store a heap
 * copy in tags->contents[idx - 1] and reset @cont for the next tag.
 *
 * @param tags  list whose contents array receives the copy
 * @param idx   1-based index of the tag the text belongs to
 * @param cont  accumulator filled by html_tag_contents_append()
 *
 * The collected text is discarded (and @cont still reset) when @idx does not
 * refer to an existing entry, when @tags has no contents array, or when the
 * copy cannot be allocated. Any pointer previously stored in the slot is
 * overwritten without being freed.
 */
static inline void html_tag_contents_done(tag_arguments_t *tags, int idx, struct tag_contents *cont)
{
    unsigned char *p;

    /* The list may have been reset (count == 0, contents == NULL) by a failed
     * html_tag_arg_add() while the caller still holds an old index. */
    if (!tags->contents || idx < 1 || idx > tags->count) {
        cont->pos = 0;
        return;
    }

    /* never write the terminator outside contents[] */
    if (cont->pos > MAX_TAG_CONTENTS_LENGTH)
        cont->pos = MAX_TAG_CONTENTS_LENGTH;
    cont->contents[cont->pos++] = '\0';

    p = cli_malloc(cont->pos);
    if (!p) {
        cli_errmsg("html_tag_contents_done: Unable to allocate memory for p\n");
        /* drop the text so it cannot leak into the next tag's contents */
        cont->pos = 0;
        return;
    }
    memcpy(p, cont->contents, cont->pos);
    tags->contents[idx - 1] = p;
    cont->pos               = 0;
}
530
/* Decoder state for a Script Encoder block, carried across screnc_decode() calls. */
struct screnc_state {
    uint32_t length;   /* countdown of encoded characters still to decode */
    uint32_t sum;      /* running sum of the decoded byte values, compared against the stored checksum */
    uint8_t table_pos; /* current position in table_order; advanced modulo 64 */
};
536
/*
 * Decode Script Encoder data in place, so that it can be normalised later.
 *
 * @param ptr  NUL-terminated buffer; decoded bytes overwrite the encoded ones
 *             from the start of the buffer
 * @param s    decoder state; length, sum and table_pos are updated so that
 *             decoding can continue with the next buffer
 *
 * Decoding runs until s->length reaches zero or the buffer ends.
 *  - If the length reaches zero and at least 12 characters follow, they are
 *    read as an 8-character checksum field plus a 4-character terminator
 *    ("^#~@") and skipped. Whatever follows (or everything after the decoded
 *    data, if fewer than 12 characters remain) is moved up behind the
 *    decoded data, including its terminating NUL.
 *  - If the buffer ends first, the decoded data is NUL-terminated.
 * Checksum or terminator mismatches are only reported via cli_dbgmsg().
 * Does nothing when @ptr or @s is NULL.
 */
static void screnc_decode(unsigned char *ptr, struct screnc_state *s)
{
    uint8_t value;
    unsigned char *dst = ptr; /* write position; never ahead of the read position */

    if (!ptr || !s)
        return;
    while (s->length > 0 && *ptr) {
        /* line breaks are dropped: not decoded, not counted, not summed */
        if ((*ptr == '\n') || (*ptr == '\r')) {
            ptr++;
            continue;
        }
        if (*ptr < 0x80) {
            value = decrypt_tables[table_order[s->table_pos]][*ptr];
            if (value == 0xFF) { /* special character */
                /* two-byte escape: the following byte selects the character.
                 * NOTE(review): length is decremented here and again at the
                 * bottom of the loop; if it was 1 on entry it wraps around
                 * instead of stopping at 0 -- confirm this is intended. */
                ptr++;
                s->length--;
                switch (*ptr) {
                    case '\0':
                        /* Fixup for end of line */
                        /* the buffer ended inside the escape; step back so the
                         * loop stops on the NUL (value stays 0xFF) */
                        ptr--;
                        break;
                    case 0x21:
                        value = 0x3c; /* '<' */
                        break;
                    case 0x23:
                        value = 0x0d; /* carriage return */
                        break;
                    case 0x24:
                        value = 0x40; /* '@' */
                        break;
                    case 0x26:
                        value = 0x0a; /* line feed */
                        break;
                    case 0x2a:
                        value = 0x3e; /* '>' */
                        break;
                }
            }
            s->sum += value;
            *dst++       = value;
            s->table_pos = (s->table_pos + 1) % 64;
        } else {
            /* a byte >= 0x80 is copied unchanged together with the byte that
             * follows it; the pair counts as one character for length and
             * does not touch sum or table_pos */
            *dst++ = *ptr++;
            *dst++ = *ptr;
            if (!*ptr) {
                /* buffer ended after the first byte: don't keep the NUL as data */
                dst--;
                break;
            }
        }
        ptr++;
        s->length--;
    }
    if (!s->length) {
        size_t remaining;
        if (strlen((const char *)ptr) >= 12) {
            /* Rebuild the expected checksum from the first six characters of
             * the base64 checksum field.
             * NOTE(review): base64_chars yields -1 for bytes outside the
             * alphabet and only some of the terms below guard against that. */
            uint64_t expected;
            expected = base64_chars[ptr[0]] < 0 ? 0 : base64_chars[ptr[0]] << 2;
            expected += base64_chars[ptr[1]] >> 4;
            expected += (base64_chars[ptr[1]] & 0x0f) << 12;
            expected += ((base64_chars[ptr[2]] >> 2) < 0 ? 0 : (base64_chars[ptr[2]] >> 2)) << 8;
            expected += (base64_chars[ptr[2]] & 0x03) << 22;
            expected += base64_chars[ptr[3]] < 0 ? 0 : base64_chars[ptr[3]] << 16;
            expected += (base64_chars[ptr[4]] < 0 ? 0 : base64_chars[ptr[4]] << 2) << 24;
            expected += ((base64_chars[ptr[5]] >> 4) < 0 ? 0 : (base64_chars[ptr[5]] >> 4)) << 24;
            ptr += 8; /* skip the checksum field */
            if (s->sum != expected) {
                cli_dbgmsg("screnc_decode: checksum mismatch: %u != %" PRIu64 "\n", s->sum, expected);
            } else {
                if (strncmp((const char *)ptr, "^#~@", 4) != 0) {
                    cli_dbgmsg("screnc_decode: terminator not found\n");
                } else {
                    cli_dbgmsg("screnc_decode: OK\n");
                }
            }
            ptr += 4; /* skip the terminator, whether it matched or not */
        }
        /* copy remaining */
        /* regions may overlap (dst <= ptr), hence memmove; +1 moves the NUL too */
        remaining = strlen((const char *)ptr) + 1;
        memmove(dst, ptr, remaining);
    } else {
        *dst = '\0';
    }
}
622
/*
 * Feed a piece of script text to the JavaScript normaliser.
 *
 * The piece runs from @js_begin (or the start of @line when NULL) to @js_end
 * (or the current parse position @ptr when NULL). It is only passed on when
 * it is non-empty and both ends lie within the first 8192 bytes of @line.
 *
 * When @in_script is zero the script has ended: parsing is finalised, the
 * result is written out using @dirname and the parser state is destroyed.
 */
static void js_process(struct parser_state *js_state, const unsigned char *js_begin, const unsigned char *js_end,
                       const unsigned char *line, const unsigned char *ptr, int in_script, const char *dirname)
{
    const unsigned char *first = js_begin ? js_begin : line;
    const unsigned char *last  = js_end ? js_end : ptr;

    if ((last > first) &&
        CLI_ISCONTAINED(line, 8192, first, 1) &&
        CLI_ISCONTAINED(line, 8192, last, 1)) {
        cli_js_process_buffer(js_state, (const char *)first, last - first);
    }

    if (in_script) {
        /* still inside the script: more pieces will follow */
        return;
    }

    /* we found a /script, normalize script now */
    cli_js_parse_done(js_state);
    cli_js_output(js_state, dirname);
    cli_js_destroy(js_state);
}
642
cli_html_normalise(int fd,m_area_t * m_area,const char * dirname,tag_arguments_t * hrefs,const struct cli_dconf * dconf)643 static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
644 {
645 int fd_tmp, tag_length = 0, tag_arg_length = 0, binary;
646 int64_t retval = FALSE, escape = FALSE, value = 0, hex = FALSE, tag_val_length = 0;
647 int look_for_screnc = FALSE, in_screnc = FALSE, in_script = FALSE, text_space_written = FALSE;
648 FILE *stream_in = NULL;
649 html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE;
650 char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1];
651 char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value;
652 unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;
653 tag_arguments_t tag_args;
654 quoted_state quoted = NOT_QUOTED;
655 unsigned long length = 0;
656 struct screnc_state screnc_state;
657 file_buff_t *file_buff_o2, *file_buff_text;
658 file_buff_t *file_tmp_o1 = NULL;
659 int in_ahref = 0; /* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/
660 unsigned char *href_contents_begin = NULL; /*beginning of the next portion of <a> contents*/
661 unsigned char *ptrend = NULL; /*end of <a> contents*/
662 unsigned char *in_form_action = NULL; /* the action URL of the current <form> tag, if any*/
663
664 struct entity_conv conv;
665 unsigned char entity_val[HTML_STR_LENGTH + 1];
666 size_t entity_val_length = 0;
667 const int dconf_entconv = dconf ? dconf->phishing & PHISHING_CONF_ENTCONV : 1;
668 const int dconf_js = dirname && (dconf ? dconf->doc & DOC_CONF_JSNORM : 1); /* TODO */
669 /* dconf for phishing engine sets scanContents, so no need for a flag here */
670 struct parser_state *js_state = NULL;
671 const unsigned char *js_begin = NULL, *js_end = NULL;
672 struct tag_contents contents;
673 uint32_t mbchar = 0;
674 uint32_t mbchar2 = 0;
675
676 /*
677 * Initialize stack buffers.
678 */
679 memset(filename, 0, sizeof(filename));
680 memset(tag, 0, sizeof(tag));
681 memset(tag_arg, 0, sizeof(tag_arg));
682 memset(tag_val, 0, sizeof(tag_val));
683 memset(entity_val, 0, sizeof(entity_val));
684
685 tag_args.scanContents = 0; /* do we need to store the contents of <a></a>?*/
686 contents.pos = 0;
687 if (!m_area) {
688 if (fd < 0) {
689 cli_dbgmsg("Invalid HTML fd\n");
690 return FALSE;
691 }
692 lseek(fd, 0, SEEK_SET);
693 fd_tmp = dup(fd);
694 if (fd_tmp < 0) {
695 return FALSE;
696 }
697 stream_in = fdopen(fd_tmp, "r");
698 if (!stream_in) {
699 close(fd_tmp);
700 return FALSE;
701 }
702 }
703
704 tag_args.count = 0;
705 tag_args.tag = NULL;
706 tag_args.value = NULL;
707 tag_args.contents = NULL;
708 if (dirname) {
709 file_buff_o2 = (file_buff_t *)cli_malloc(sizeof(file_buff_t));
710 if (!file_buff_o2) {
711 cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_o2\n");
712 file_buff_o2 = file_buff_text = NULL;
713 goto abort;
714 }
715
716 /* this will still contains scripts that are inside comments */
717 snprintf(filename, 1024, "%s" PATHSEP "nocomment.html", dirname);
718 file_buff_o2->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
719 if (file_buff_o2->fd == -1) {
720 cli_dbgmsg("open failed: %s\n", filename);
721 free(file_buff_o2);
722 file_buff_o2 = file_buff_text = NULL;
723 goto abort;
724 }
725
726 file_buff_text = (file_buff_t *)cli_malloc(sizeof(file_buff_t));
727 if (!file_buff_text) {
728 close(file_buff_o2->fd);
729 free(file_buff_o2);
730 file_buff_o2 = file_buff_text = NULL;
731 cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_text\n");
732 goto abort;
733 }
734
735 snprintf(filename, 1024, "%s" PATHSEP "notags.html", dirname);
736 file_buff_text->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
737 if (file_buff_text->fd == -1) {
738 cli_dbgmsg("open failed: %s\n", filename);
739 close(file_buff_o2->fd);
740 free(file_buff_o2);
741 free(file_buff_text);
742 file_buff_o2 = file_buff_text = NULL;
743 goto abort;
744 }
745 file_buff_o2->length = 0;
746 file_buff_text->length = 0;
747 } else {
748 file_buff_o2 = NULL;
749 file_buff_text = NULL;
750 }
751
752 binary = FALSE;
753
754 ptr = line = cli_readchunk(stream_in, m_area, 8192);
755
756 while (line) {
757 if (href_contents_begin)
758 href_contents_begin = ptr; /*start of a new line, last line already appended to contents see below*/
759 while (*ptr && isspace(*ptr)) {
760 ptr++;
761 }
762 while (*ptr) {
763 if (!binary && *ptr == '\n') {
764 /* Convert it to a space and re-process */
765 *ptr = ' ';
766 continue;
767 }
768 if (!binary && *ptr == '\r') {
769 ptr++;
770 continue;
771 }
772 switch (state) {
773 case HTML_SPECIAL_CHAR:
774 cli_dbgmsg("Impossible, special_char can't occur here\n");
775 break;
776 case HTML_BAD_STATE:
777 /* An engine error has occurred */
778 cli_dbgmsg("HTML Engine Error\n");
779 goto abort;
780 case HTML_SKIP_WS:
781 if (isspace(*ptr)) {
782 ptr++;
783 } else {
784 state = next_state;
785 next_state = HTML_BAD_STATE;
786 }
787 break;
788 case HTML_TRIM_WS:
789 if (isspace(*ptr)) {
790 ptr++;
791 } else {
792 if (!in_script)
793 html_output_c(file_buff_o2, ' ');
794 state = next_state;
795 next_state = HTML_BAD_STATE;
796 }
797 break;
798 case HTML_8BIT:
799 if (*ptr < 0x80 || mbchar >= 0x10000) {
800 if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992 ||
801 (mbchar == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) {
802 /* bb #4097 */
803 html_output_c(file_buff_o2, '.');
804 html_output_c(file_buff_text, '.');
805 if (mbchar == 0xA1) {
806 ptr++;
807 mbchar = 0;
808 continue;
809 }
810 } else {
811 uint8_t c0 = mbchar >> 16;
812 uint8_t c1 = (mbchar >> 8) & 0xff;
813 uint8_t c2 = (mbchar & 0xff);
814 if (c0) {
815 html_output_c(file_buff_o2, c0);
816 html_output_c(file_buff_text, c0);
817 }
818 if (c0 || c1) {
819 html_output_c(file_buff_o2, c1);
820 html_output_c(file_buff_text, c1);
821 }
822 html_output_c(file_buff_o2, c2);
823 html_output_c(file_buff_text, c1);
824 }
825 mbchar = 0;
826 state = next_state;
827 next_state = HTML_NORM;
828 } else {
829 mbchar = (mbchar << 8) | *ptr;
830 ptr++;
831 }
832 break;
833 case HTML_NORM:
834 if (*ptr == '<') {
835 ptrend = ptr; /* for use by scanContents */
836 html_output_c(file_buff_o2, '<');
837 if (!in_script && !text_space_written) {
838 html_output_c(file_buff_text, ' ');
839 text_space_written = TRUE;
840 }
841 if (hrefs && hrefs->scanContents && in_ahref && href_contents_begin) {
842 /*append this text portion to the contents of <a>*/
843 html_tag_contents_append(&contents, href_contents_begin, ptr);
844 href_contents_begin = NULL; /*We just encountered another tag inside <a>, so skip it*/
845 }
846 ptr++;
847 state = HTML_SKIP_WS;
848 tag_length = 0;
849 next_state = HTML_TAG;
850 } else if (isspace(*ptr)) {
851 if (!text_space_written && !in_script) {
852 html_output_c(file_buff_text, ' ');
853 text_space_written = TRUE;
854 }
855 state = HTML_TRIM_WS;
856 next_state = HTML_NORM;
857 } else if (*ptr == '&') {
858 if (!text_space_written && !in_script) {
859 html_output_c(file_buff_text, ' ');
860 text_space_written = TRUE;
861 }
862 state = HTML_CHAR_REF;
863 next_state = HTML_NORM;
864 ptr++;
865 } else if (*ptr >= 0x80) {
866 state = HTML_8BIT;
867 next_state = HTML_NORM;
868 mbchar = *ptr;
869 ptr++;
870 } else {
871 unsigned char c = tolower(*ptr);
872 /* normalize ' to " for scripts */
873 if (in_script && c == '\'') c = '"';
874 html_output_c(file_buff_o2, c);
875 if (!in_script) {
876 if (*ptr < 0x20) {
877 if (!text_space_written) {
878 html_output_c(file_buff_text, ' ');
879 text_space_written = TRUE;
880 }
881 } else {
882 html_output_c(file_buff_text, c);
883 text_space_written = FALSE;
884 }
885 }
886 ptr++;
887 }
888 break;
889 case HTML_TAG:
890 if ((tag_length == 0) && (*ptr == '!')) {
891 /* Comment */
892 if (in_script) {
893 /* we still write scripts to nocomment.html */
894 html_output_c(file_buff_o2, '!');
895 } else {
896 /* Need to rewind in the no-comment output stream */
897 if (file_buff_o2 && (file_buff_o2->length > 0)) {
898 file_buff_o2->length--;
899 }
900 }
901 state = HTML_COMMENT;
902 next_state = HTML_BAD_STATE;
903 ptr++;
904 } else if (*ptr == '>') {
905 html_output_c(file_buff_o2, '>');
906 ptr++;
907 tag[tag_length] = '\0';
908 state = HTML_SKIP_WS;
909 next_state = HTML_PROCESS_TAG;
910 } else if (!isspace(*ptr)) {
911 html_output_c(file_buff_o2, tolower(*ptr));
912 /* if we're inside a script we only care for </script>.*/
913 if (in_script && tag_length == 0 && *ptr != '/') {
914 state = HTML_NORM;
915 }
916 if (tag_length < HTML_STR_LENGTH) {
917 tag[tag_length++] = tolower(*ptr);
918 }
919 ptr++;
920 } else {
921 tag[tag_length] = '\0';
922 state = HTML_SKIP_WS;
923 tag_arg_length = 0;
924 /* if we'd go to HTML_TAG_ARG whitespace would be inconsistently normalized for in_script*/
925 next_state = !in_script ? HTML_TAG_ARG : HTML_PROCESS_TAG;
926 }
927 break;
928 case HTML_TAG_ARG:
929 if (*ptr == '=') {
930 html_output_c(file_buff_o2, '=');
931 tag_arg[tag_arg_length] = '\0';
932 ptr++;
933 state = HTML_SKIP_WS;
934 escape = FALSE;
935 quoted = NOT_QUOTED;
936 tag_val_length = 0;
937 next_state = HTML_TAG_ARG_VAL;
938 } else if (isspace(*ptr)) {
939 ptr++;
940 tag_arg[tag_arg_length] = '\0';
941 state = HTML_SKIP_WS;
942 next_state = HTML_TAG_ARG_EQUAL;
943 } else if (*ptr == '>') {
944 html_output_c(file_buff_o2, '>');
945 if (tag_arg_length > 0) {
946 tag_arg[tag_arg_length] = '\0';
947 html_tag_arg_add(&tag_args, tag_arg, NULL);
948 }
949 ptr++;
950 state = HTML_PROCESS_TAG;
951 next_state = HTML_BAD_STATE;
952 } else {
953 if (tag_arg_length == 0) {
954 /* Start of new tag - add space */
955 html_output_c(file_buff_o2, ' ');
956 }
957 html_output_c(file_buff_o2, tolower(*ptr));
958 if (tag_arg_length < HTML_STR_LENGTH) {
959 tag_arg[tag_arg_length++] = tolower(*ptr);
960 }
961 ptr++;
962 }
963 break;
964 case HTML_TAG_ARG_EQUAL:
965 if (*ptr == '=') {
966 html_output_c(file_buff_o2, '=');
967 ptr++;
968 state = HTML_SKIP_WS;
969 escape = FALSE;
970 quoted = NOT_QUOTED;
971 tag_val_length = 0;
972 next_state = HTML_TAG_ARG_VAL;
973 } else {
974 if (tag_arg_length > 0) {
975 tag_arg[tag_arg_length] = '\0';
976 html_tag_arg_add(&tag_args, tag_arg, NULL);
977 }
978 tag_arg_length = 0;
979 state = HTML_TAG_ARG;
980 next_state = HTML_BAD_STATE;
981 }
982 break;
983 case HTML_TAG_ARG_VAL:
984 if ((tag_val_length == 5) && (strncmp(tag_val, "data:", 5) == 0)) {
985 /* RFC2397 inline data */
986
987 /* Rewind one byte so we don't recursive */
988 if (file_buff_o2 && (file_buff_o2->length > 0)) {
989 file_buff_o2->length--;
990 }
991
992 if (quoted != NOT_QUOTED) {
993 html_output_c(file_buff_o2, '"');
994 }
995 tag_val_length = 0;
996 state = HTML_RFC2397_TYPE;
997 next_state = HTML_TAG_ARG;
998 } else if ((tag_val_length == 6) && (strncmp(tag_val, "\"data:", 6) == 0)) {
999 /* RFC2397 inline data */
1000
1001 /* Rewind one byte so we don't recursive */
1002 if (file_buff_o2 && (file_buff_o2->length > 0)) {
1003 file_buff_o2->length--;
1004 }
1005
1006 if (quoted != NOT_QUOTED) {
1007 html_output_c(file_buff_o2, '"');
1008 }
1009
1010 tag_val_length = 0;
1011 state = HTML_RFC2397_TYPE;
1012 next_state = HTML_TAG_ARG;
1013 } else if (*ptr == '&') {
1014 state = HTML_CHAR_REF;
1015 next_state = HTML_TAG_ARG_VAL;
1016 ptr++;
1017 } else if (*ptr == '\'') {
1018 if (tag_val_length == 0) {
1019 quoted = SINGLE_QUOTED;
1020 html_output_c(file_buff_o2, '"');
1021 if (tag_val_length < HTML_STR_LENGTH) {
1022 tag_val[tag_val_length++] = '"';
1023 }
1024 ptr++;
1025 } else {
1026 if (!escape && (quoted == SINGLE_QUOTED)) {
1027 html_output_c(file_buff_o2, '"');
1028 if (tag_val_length < HTML_STR_LENGTH) {
1029 tag_val[tag_val_length++] = '"';
1030 }
1031 tag_val[tag_val_length] = '\0';
1032 html_tag_arg_add(&tag_args, tag_arg, tag_val);
1033 ptr++;
1034 state = HTML_SKIP_WS;
1035 tag_arg_length = 0;
1036 next_state = HTML_TAG_ARG;
1037 } else {
1038 html_output_c(file_buff_o2, '"');
1039 if (tag_val_length < HTML_STR_LENGTH) {
1040 tag_val[tag_val_length++] = '"';
1041 }
1042 ptr++;
1043 }
1044 }
1045 } else if (*ptr == '"') {
1046 if (tag_val_length == 0) {
1047 quoted = DOUBLE_QUOTED;
1048 html_output_c(file_buff_o2, '"');
1049 if (tag_val_length < HTML_STR_LENGTH) {
1050 tag_val[tag_val_length++] = '"';
1051 }
1052 ptr++;
1053 } else {
1054 if (!escape && (quoted == DOUBLE_QUOTED)) {
1055 html_output_c(file_buff_o2, '"');
1056 if (tag_val_length < HTML_STR_LENGTH) {
1057 tag_val[tag_val_length++] = '"';
1058 }
1059 tag_val[tag_val_length] = '\0';
1060 html_tag_arg_add(&tag_args, tag_arg, tag_val);
1061 ptr++;
1062 state = HTML_SKIP_WS;
1063 tag_arg_length = 0;
1064 next_state = HTML_TAG_ARG;
1065 } else {
1066 html_output_c(file_buff_o2, '"');
1067 if (tag_val_length < HTML_STR_LENGTH) {
1068 tag_val[tag_val_length++] = '"';
1069 }
1070 ptr++;
1071 }
1072 }
1073 } else if (isspace(*ptr) || (*ptr == '>')) {
1074 if (quoted == NOT_QUOTED) {
1075 tag_val[tag_val_length] = '\0';
1076 html_tag_arg_add(&tag_args, tag_arg, tag_val);
1077 state = HTML_SKIP_WS;
1078 tag_arg_length = 0;
1079 next_state = HTML_TAG_ARG;
1080 } else {
1081 html_output_c(file_buff_o2, *ptr);
1082 if (tag_val_length < HTML_STR_LENGTH) {
1083 if (isspace(*ptr)) {
1084 tag_val[tag_val_length++] = ' ';
1085 } else {
1086 tag_val[tag_val_length++] = '>';
1087 }
1088 }
1089 state = HTML_SKIP_WS;
1090 escape = FALSE;
1091 quoted = NOT_QUOTED;
1092 next_state = HTML_TAG_ARG_VAL;
1093 ptr++;
1094 }
1095 } else {
1096 if (mbchar2 && (*ptr < 0x80 || mbchar2 >= 0x10000)) {
1097 if (mbchar2 == 0xE38082 || mbchar2 == 0xEFBC8E || mbchar2 == 0xEFB992 ||
1098 (mbchar2 == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) {
1099 html_output_c(file_buff_o2, '.');
1100 if (tag_val_length < HTML_STR_LENGTH)
1101 tag_val[tag_val_length++] = '.';
1102 if (mbchar2 == 0xA1) {
1103 ptr++;
1104 mbchar2 = 0;
1105 continue;
1106 }
1107 } else {
1108 uint8_t c0 = mbchar2 >> 16;
1109 uint8_t c1 = (mbchar2 >> 8) & 0xff;
1110 uint8_t c2 = (mbchar2 & 0xff);
1111 if (c0)
1112 html_output_c(file_buff_o2, c0);
1113 if (c0 || c1)
1114 html_output_c(file_buff_o2, c1);
1115 html_output_c(file_buff_o2, c2);
1116 if (c0 && tag_val_length < HTML_STR_LENGTH)
1117 tag_val[tag_val_length++] = c0;
1118 if ((c0 || c1) && tag_val_length < HTML_STR_LENGTH)
1119 tag_val[tag_val_length++] = c1;
1120 if (tag_val_length < HTML_STR_LENGTH)
1121 tag_val[tag_val_length++] = c2;
1122 }
1123 mbchar2 = 0;
1124 }
1125 if (*ptr >= 0x80)
1126 mbchar2 = (mbchar2 << 8) | *ptr;
1127 else {
1128 html_output_c(file_buff_o2, tolower(*ptr));
1129 if (tag_val_length < HTML_STR_LENGTH) {
1130 tag_val[tag_val_length++] = *ptr;
1131 }
1132 }
1133 ptr++;
1134 }
1135
1136 if (*ptr == '\\') {
1137 escape = TRUE;
1138 } else {
1139 escape = FALSE;
1140 }
1141 break;
1142 case HTML_COMMENT:
1143 if (in_script && !isspace(*ptr)) {
1144 unsigned char c = tolower(*ptr);
1145 /* dump script to nocomment.html, since we no longer have
1146 * comment.html/script.html */
1147 if (c == '\'') c = '"';
1148 html_output_c(file_buff_o2, c);
1149 }
1150 if (*ptr == '>') {
1151 state = HTML_SKIP_WS;
1152 next_state = HTML_NORM;
1153 }
1154 ptr++;
1155 break;
1156 case HTML_PROCESS_TAG:
1157
1158 /* Default to no action for this tag */
1159 state = HTML_SKIP_WS;
1160 next_state = HTML_NORM;
1161 if (tag[0] == '/') {
1162 /* End tag */
1163 state = HTML_SKIP_WS;
1164 next_state = HTML_NORM;
1165 if (strcmp(tag, "/script") == 0) {
1166 in_script = FALSE;
1167 if (js_state) {
1168 js_end = ptr;
1169 js_process(js_state, js_begin, js_end, line, ptr, in_script, dirname);
1170 js_state = NULL;
1171 js_begin = js_end = NULL;
1172 }
1173 /*don't output newlines in nocomment.html
1174 * html_output_c(file_buff_o2, '\n');*/
1175 }
1176 if (hrefs && hrefs->scanContents && in_ahref) {
1177 if (strcmp(tag, "/a") == 0) {
1178 html_tag_contents_done(hrefs, in_ahref, &contents);
1179 in_ahref = 0; /* we are no longer inside an <a href>
1180 nesting <a> tags not supported, and shouldn't be supported*/
1181 }
1182 href_contents_begin = ptr;
1183 }
1184 if (strcmp(tag, "/form") == 0) {
1185 if (in_form_action)
1186 free(in_form_action);
1187 in_form_action = NULL;
1188 }
1189 } else if (strcmp(tag, "script") == 0) {
1190 arg_value = html_tag_arg_value(&tag_args, "language");
1191 /* TODO: maybe we can output all tags only via html_output_tag */
1192 if (arg_value && (strcasecmp((const char *)arg_value, "jscript.encode") == 0)) {
1193 html_tag_arg_set(&tag_args, "language", "javascript");
1194 state = HTML_SKIP_WS;
1195 next_state = HTML_JSDECODE;
1196 /* we already output the old tag, output the new tag now */
1197 html_output_tag(file_buff_o2, tag, &tag_args);
1198 } else if (arg_value && (strcasecmp((const char *)arg_value, "vbscript.encode") == 0)) {
1199 html_tag_arg_set(&tag_args, "language", "vbscript");
1200 state = HTML_SKIP_WS;
1201 next_state = HTML_JSDECODE;
1202 /* we already output the old tag, output the new tag now */
1203 html_output_tag(file_buff_o2, tag, &tag_args);
1204 }
1205 in_script = TRUE;
1206 if (dconf_js && !js_state) {
1207 js_state = cli_js_init();
1208 if (!js_state) {
1209 cli_dbgmsg("htmlnorm: Failed to initialize js parser\n");
1210 }
1211 js_begin = ptr;
1212 js_end = NULL;
1213 }
1214 } else if (strcmp(tag, "%@") == 0) {
1215 arg_value = html_tag_arg_value(&tag_args, "language");
1216 if (arg_value && (strcasecmp((const char *)arg_value, "jscript.encode") == 0 ||
1217 strcasecmp((const char *)arg_value, "vbscript.encode") == 0)) {
1218
1219 saved_next_state = next_state;
1220 next_state = state;
1221 look_for_screnc = FALSE;
1222 state = HTML_LOOKFOR_SCRENC;
1223 }
1224 } else if (hrefs) {
1225 if (in_ahref && !href_contents_begin)
1226 href_contents_begin = ptr;
1227 if (strcmp(tag, "a") == 0) {
1228 arg_value = html_tag_arg_value(&tag_args, "href");
1229 if (arg_value && strlen((const char *)arg_value) > 0) {
1230 if (hrefs->scanContents) {
1231 char *arg_value_title = html_tag_arg_value(&tag_args, "title");
1232 /*beginning of an <a> tag*/
1233 if (in_ahref)
1234 /*we encountered nested <a> tags, pretend previous closed*/
1235 if (href_contents_begin) {
1236 html_tag_contents_append(&contents, href_contents_begin, ptrend);
1237 /*add pending contents between tags*/
1238 html_tag_contents_done(hrefs, in_ahref, &contents);
1239 in_ahref = 0;
1240 }
1241 if (arg_value_title) {
1242 /* title is a 'displayed link'*/
1243 html_tag_arg_add(hrefs, "href_title", arg_value_title);
1244 html_tag_contents_append(&contents, (const unsigned char *)arg_value,
1245 (const unsigned char *)arg_value + strlen(arg_value));
1246 html_tag_contents_done(hrefs, hrefs->count, &contents);
1247 }
1248 if (in_form_action) {
1249 /* form action is the real URL, and href is the 'displayed' */
1250 html_tag_arg_add(hrefs, "form", arg_value);
1251 contents.pos = 0;
1252 html_tag_contents_append(&contents, in_form_action,
1253 in_form_action + strlen((const char *)in_form_action));
1254 html_tag_contents_done(hrefs, hrefs->count, &contents);
1255 }
1256 }
1257 html_tag_arg_add(hrefs, "href", arg_value);
1258 if (hrefs->scanContents) {
1259 in_ahref = hrefs->count; /* index of this tag (counted from 1) */
1260 href_contents_begin = ptr; /* contents begin after <a ..> ends */
1261 contents.pos = 0;
1262 }
1263 }
1264 } else if (strcmp(tag, "form") == 0 && hrefs->scanContents) {
1265 const char *arg_action_value = html_tag_arg_value(&tag_args, "action");
1266 if (arg_action_value) {
1267 if (in_form_action)
1268 free(in_form_action);
1269 in_form_action = (unsigned char *)cli_strdup(arg_action_value);
1270 }
1271 } else if (strcmp(tag, "img") == 0) {
1272 arg_value = html_tag_arg_value(&tag_args, "src");
1273 if (arg_value && strlen(arg_value) > 0) {
1274 html_tag_arg_add(hrefs, "src", arg_value);
1275 if (hrefs->scanContents && in_ahref)
1276 /* "contents" of an img tag, is the URL of its parent <a> tag */
1277 hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_strdup((const char *)hrefs->value[in_ahref - 1]);
1278 if (in_form_action) {
1279 /* form action is the real URL, and href is the 'displayed' */
1280 html_tag_arg_add(hrefs, "form", arg_value);
1281 contents.pos = 0;
1282 html_tag_contents_append(&contents, in_form_action,
1283 in_form_action + strlen((const char *)in_form_action));
1284 html_tag_contents_done(hrefs, hrefs->count, &contents);
1285 }
1286 }
1287 arg_value = html_tag_arg_value(&tag_args, "dynsrc");
1288 if (arg_value && strlen(arg_value) > 0) {
1289 html_tag_arg_add(hrefs, "dynsrc", arg_value);
1290 if (hrefs->scanContents && in_ahref)
1291 /* see above */
1292 hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_strdup((const char *)hrefs->value[in_ahref - 1]);
1293 if (in_form_action) {
1294 /* form action is the real URL, and href is the 'displayed' */
1295 html_tag_arg_add(hrefs, "form", arg_value);
1296 contents.pos = 0;
1297 html_tag_contents_append(&contents, in_form_action,
1298 in_form_action + strlen((const char *)in_form_action));
1299 html_tag_contents_done(hrefs, hrefs->count, &contents);
1300 }
1301 }
1302 } else if (strcmp(tag, "iframe") == 0) {
1303 arg_value = html_tag_arg_value(&tag_args, "src");
1304 if (arg_value && strlen(arg_value) > 0) {
1305 html_tag_arg_add(hrefs, "iframe", arg_value);
1306 if (hrefs->scanContents && in_ahref)
1307 /* see above */
1308 hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_strdup((const char *)hrefs->value[in_ahref - 1]);
1309 if (in_form_action) {
1310 /* form action is the real URL, and href is the 'displayed' */
1311 html_tag_arg_add(hrefs, "form", arg_value);
1312 contents.pos = 0;
1313 html_tag_contents_append(&contents, in_form_action,
1314 in_form_action + strlen((const char *)in_form_action));
1315 html_tag_contents_done(hrefs, hrefs->count, &contents);
1316 }
1317 }
1318 } else if (strcmp(tag, "area") == 0) {
1319 arg_value = html_tag_arg_value(&tag_args, "href");
1320 if (arg_value && strlen(arg_value) > 0) {
1321 html_tag_arg_add(hrefs, "area", arg_value);
1322 if (hrefs->scanContents && in_ahref)
1323 /* see above */
1324 hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_strdup((const char *)hrefs->value[in_ahref - 1]);
1325 if (in_form_action) {
1326 /* form action is the real URL, and href is the 'displayed' */
1327 html_tag_arg_add(hrefs, "form", arg_value);
1328 contents.pos = 0;
1329 html_tag_contents_append(&contents, in_form_action,
1330 in_form_action + strlen((const char *)in_form_action));
1331 html_tag_contents_done(hrefs, hrefs->count, &contents);
1332 }
1333 }
1334 }
1335 /* TODO:imagemaps can have urls too */
1336 } else if (strcmp(tag, "a") == 0) {
1337 /* a/img tags for buff_text can be processed only if we're not processing hrefs */
1338 arg_value = html_tag_arg_value(&tag_args, "href");
1339 if (arg_value && arg_value[0]) {
1340 html_output_str(file_buff_text, (const unsigned char *)arg_value, strlen((const char *)arg_value));
1341 html_output_c(file_buff_text, ' ');
1342 text_space_written = TRUE;
1343 }
1344 } else if (strcmp(tag, "img") == 0) {
1345 arg_value = html_tag_arg_value(&tag_args, "src");
1346 if (arg_value && arg_value[0]) {
1347 html_output_str(file_buff_text, (const unsigned char *)arg_value, strlen((const char *)arg_value));
1348 html_output_c(file_buff_text, ' ');
1349 text_space_written = TRUE;
1350 }
1351 }
1352 html_tag_arg_free(&tag_args);
1353 break;
1354 case HTML_CHAR_REF:
1355 if (*ptr == '#') {
1356 value = 0;
1357 hex = FALSE;
1358 state = HTML_CHAR_REF_DECODE;
1359 ptr++;
1360 } else {
1361 if (dconf_entconv)
1362 state = HTML_ENTITY_REF_DECODE;
1363 else {
1364 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1365 tag_val[tag_val_length++] = '&';
1366 }
1367 html_output_c(file_buff_o2, '&');
1368
1369 state = next_state;
1370 next_state = HTML_BAD_STATE;
1371 }
1372 }
1373 break;
1374 case HTML_ENTITY_REF_DECODE:
1375 if (*ptr == ';') {
1376 size_t i;
1377 const char *normalized;
1378 entity_val[entity_val_length] = '\0';
1379 normalized = entity_norm(&conv, entity_val);
1380 if (normalized) {
1381 for (i = 0; i < strlen(normalized); i++) {
1382 const unsigned char c = normalized[i] & 0xff;
1383 html_output_c(file_buff_o2, c);
1384 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1385 tag_val[tag_val_length++] = c;
1386 }
1387 }
1388 } else {
1389 html_output_c(file_buff_o2, '&');
1390 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1391 tag_val[tag_val_length++] = '&';
1392 }
1393 for (i = 0; i < entity_val_length; i++) {
1394 const char c = tolower(entity_val[i]);
1395 html_output_c(file_buff_o2, c);
1396 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1397 tag_val[tag_val_length++] = c;
1398 }
1399 }
1400 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1401 tag_val[tag_val_length++] = ';';
1402 }
1403 html_output_c(file_buff_o2, ';');
1404 }
1405 entity_val_length = 0;
1406 state = next_state;
1407 next_state = HTML_BAD_STATE;
1408 ptr++;
1409 } else if ((isalnum(*ptr) || *ptr == '_' || *ptr == ':' || (*ptr == '-')) && entity_val_length < HTML_STR_LENGTH) {
1410 entity_val[entity_val_length++] = *ptr++;
1411 } else {
1412 /* entity too long, or not valid, dump it */
1413 size_t i;
1414 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1415 tag_val[tag_val_length++] = '&';
1416 }
1417 html_output_c(file_buff_o2, '&');
1418 for (i = 0; i < entity_val_length; i++) {
1419 const char c = tolower(entity_val[i]);
1420 html_output_c(file_buff_o2, c);
1421 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1422 tag_val[tag_val_length++] = c;
1423 }
1424 }
1425
1426 state = next_state;
1427 next_state = HTML_BAD_STATE;
1428 entity_val_length = 0;
1429 }
1430 break;
1431 case HTML_CHAR_REF_DECODE:
1432 if ((value == 0) && ((*ptr == 'x') || (*ptr == 'X'))) {
1433 hex = TRUE;
1434 ptr++;
1435 } else if (*ptr == ';') {
1436 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1437 tag_val[tag_val_length++] = value; /* store encoded values too */
1438 }
1439 if (dconf_entconv) {
1440
1441 if (value < 0x80)
1442 html_output_c(file_buff_o2, tolower(value));
1443 else {
1444 unsigned char buff[10];
1445 unsigned char *out = u16_normalize_tobuffer(value, buff, 10);
1446 if (out && out > buff) {
1447 html_output_str(file_buff_o2, buff, out - buff - 1);
1448 }
1449 }
1450 } else
1451 html_output_c(file_buff_o2, tolower(value & 0xff));
1452 state = next_state;
1453 next_state = HTML_BAD_STATE;
1454 ptr++;
1455 } else if (isdigit(*ptr) || (hex && isxdigit(*ptr))) {
1456 int64_t increment = 0;
1457
1458 if (hex && value < INT64_MAX / 16) {
1459 value *= 16;
1460 } else if (value < INT64_MAX / 10) {
1461 value *= 10;
1462 } else {
1463 html_output_c(file_buff_o2, value);
1464 state = next_state;
1465 next_state = HTML_BAD_STATE;
1466 ptr++;
1467 break;
1468 }
1469 if (isdigit(*ptr)) {
1470 increment = *ptr - '0';
1471 } else {
1472 increment = tolower(*ptr) - 'a' + 10;
1473 }
1474 if (value > INT64_MAX - increment) {
1475 /* Addition would result in integer overflow. */
1476 html_output_c(file_buff_o2, value);
1477 state = next_state;
1478 next_state = HTML_BAD_STATE;
1479 ptr++;
1480 break;
1481 }
1482 value += increment;
1483 ptr++;
1484 } else {
1485 html_output_c(file_buff_o2, value);
1486 state = next_state;
1487 next_state = HTML_BAD_STATE;
1488 }
1489 break;
1490 case HTML_LOOKFOR_SCRENC:
1491 look_for_screnc = TRUE;
1492 ptr_screnc = (unsigned char *)strstr((char *)ptr, "#@~^");
1493 if (ptr_screnc) {
1494 ptr_screnc[0] = '/';
1495 ptr_screnc[1] = '/';
1496 ptr_screnc += 4;
1497 }
1498 state = next_state;
1499 next_state = saved_next_state;
1500 break;
1501 case HTML_JSDECODE:
1502 /* Check for start marker */
1503 if (strncmp((const char *)ptr, "#@~^", 4) == 0) {
1504 ptr[0] = '/';
1505 ptr[1] = '/';
1506 ptr += 4;
1507 state = HTML_JSDECODE_LENGTH;
1508 next_state = HTML_BAD_STATE;
1509 } else {
1510 html_output_c(file_buff_o2, tolower(*ptr));
1511 ptr++;
1512 }
1513 break;
1514 case HTML_JSDECODE_LENGTH:
1515 if (strlen((const char *)ptr) < 8) {
1516 state = HTML_NORM;
1517 next_state = HTML_BAD_STATE;
1518 break;
1519 }
1520 memset(&screnc_state, 0, sizeof(screnc_state));
1521 screnc_state.length = base64_chars[ptr[0]] < 0 ? 0 : base64_chars[ptr[0]] << 2;
1522 screnc_state.length += base64_chars[ptr[1]] >> 4;
1523 screnc_state.length += (base64_chars[ptr[1]] & 0x0f) << 12;
1524 screnc_state.length += ((base64_chars[ptr[2]] >> 2) < 0 ? 0 : (base64_chars[ptr[2]] >> 2)) << 8;
1525 screnc_state.length += (base64_chars[ptr[2]] & 0x03) << 22;
1526 screnc_state.length += base64_chars[ptr[3]] < 0 ? 0 : base64_chars[ptr[3]] << 16;
1527 screnc_state.length += (base64_chars[ptr[4]] < 0 ? 0 : base64_chars[ptr[4]] << 2) << 24;
1528 screnc_state.length += ((base64_chars[ptr[5]] >> 4) < 0 ? 0 : (base64_chars[ptr[5]] >> 4)) << 24;
1529 state = HTML_JSDECODE_DECRYPT;
1530 in_screnc = TRUE;
1531 next_state = HTML_BAD_STATE;
1532 /* for JS normalizer */
1533 ptr[7] = '\n';
1534 ptr += 8;
1535 break;
1536 case HTML_JSDECODE_DECRYPT:
1537 screnc_decode(ptr, &screnc_state);
1538 if (!screnc_state.length) {
1539 state = HTML_NORM;
1540 next_state = HTML_BAD_STATE;
1541 in_screnc = FALSE;
1542 break;
1543 } else {
1544 state = HTML_NORM;
1545 next_state = HTML_BAD_STATE;
1546 }
1547 break;
1548 case HTML_RFC2397_TYPE:
1549 if (*ptr == '\'') {
1550 if (!escape && (quoted == SINGLE_QUOTED)) {
1551 /* Early end of data detected. Error */
1552 ptr++;
1553 state = HTML_SKIP_WS;
1554 tag_arg_length = 0;
1555 next_state = HTML_TAG_ARG;
1556 } else {
1557 if (tag_val_length < HTML_STR_LENGTH) {
1558 tag_val[tag_val_length++] = '"';
1559 }
1560 ptr++;
1561 }
1562 } else if (*ptr == '"') {
1563 if (!escape && (quoted == DOUBLE_QUOTED)) {
1564 /* Early end of data detected. Error */
1565 ptr++;
1566 state = HTML_SKIP_WS;
1567 tag_arg_length = 0;
1568 next_state = HTML_TAG_ARG;
1569 } else {
1570 if (tag_val_length < HTML_STR_LENGTH) {
1571 tag_val[tag_val_length++] = '"';
1572 }
1573 ptr++;
1574 }
1575 } else if (isspace(*ptr) || (*ptr == '>')) {
1576 if (quoted == NOT_QUOTED) {
1577 /* Early end of data detected. Error */
1578 state = HTML_SKIP_WS;
1579 tag_arg_length = 0;
1580 next_state = HTML_TAG_ARG;
1581 } else {
1582 if (tag_val_length < HTML_STR_LENGTH) {
1583 if (isspace(*ptr)) {
1584 tag_val[tag_val_length++] = ' ';
1585 } else {
1586 tag_val[tag_val_length++] = '>';
1587 }
1588 }
1589 state = HTML_SKIP_WS;
1590 escape = FALSE;
1591 quoted = NOT_QUOTED;
1592 next_state = HTML_RFC2397_TYPE;
1593 ptr++;
1594 }
1595 } else if (*ptr == ',') {
1596 /* Beginning of data */
1597 tag_val[tag_val_length] = '\0';
1598 state = HTML_RFC2397_INIT;
1599 escape = FALSE;
1600 next_state = HTML_BAD_STATE;
1601 ptr++;
1602
1603 } else {
1604 if (tag_val_length < HTML_STR_LENGTH) {
1605 tag_val[tag_val_length++] = tolower(*ptr);
1606 }
1607 ptr++;
1608 }
1609 if (*ptr == '\\') {
1610 escape = TRUE;
1611 } else {
1612 escape = FALSE;
1613 }
1614 break;
1615 case HTML_RFC2397_INIT:
1616 if (dirname) {
1617 STATBUF statbuf;
1618
1619 if (NULL != file_tmp_o1) {
1620 if (file_tmp_o1->fd != -1) {
1621 html_output_flush(file_tmp_o1);
1622 close(file_tmp_o1->fd);
1623 file_tmp_o1->fd = -1;
1624 }
1625 free(file_tmp_o1);
1626 }
1627
1628 file_tmp_o1 = (file_buff_t *)cli_malloc(sizeof(file_buff_t));
1629 if (!file_tmp_o1) {
1630 cli_errmsg("cli_html_normalise: Unable to allocate memory for file_tmp_o1\n");
1631 goto abort;
1632 }
1633 file_tmp_o1->fd = -1;
1634
1635 /* Create rfc2397 directory if it doesn't already exist */
1636 snprintf(filename, 1024, "%s" PATHSEP "rfc2397", dirname);
1637 if (LSTAT(filename, &statbuf) == -1) {
1638 if (mkdir(filename, 0700) && errno != EEXIST) {
1639 cli_errmsg("Failed to create directory: %s\n", dirname);
1640 goto abort;
1641 }
1642 }
1643
1644 tmp_file = cli_gentemp(filename);
1645 if (!tmp_file) {
1646 goto abort;
1647 }
1648 cli_dbgmsg("RFC2397 data file: %s\n", tmp_file);
1649 file_tmp_o1->fd = open(tmp_file, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
1650 free(tmp_file);
1651 if (file_tmp_o1->fd < 0) {
1652 cli_dbgmsg("open failed: %s\n", filename);
1653 goto abort;
1654 }
1655 file_tmp_o1->length = 0;
1656
1657 html_output_str(file_tmp_o1, (const unsigned char *)"From html-normalise\n", 20);
1658 html_output_str(file_tmp_o1, (const unsigned char *)"Content-type: ", 14);
1659 if ((tag_val_length == 0) && (*tag_val == ';')) {
1660 html_output_str(file_tmp_o1, (const unsigned char *)"text/plain\n", 11);
1661 }
1662 html_output_str(file_tmp_o1, (const unsigned char *)tag_val, tag_val_length);
1663 html_output_c(file_tmp_o1, '\n');
1664 if (strstr(tag_val, ";base64") != NULL) {
1665 html_output_str(file_tmp_o1, (const unsigned char *)"Content-transfer-encoding: base64\n", 34);
1666 }
1667 html_output_c(file_tmp_o1, '\n');
1668 } else {
1669 file_tmp_o1 = NULL;
1670 }
1671 state = HTML_RFC2397_DATA;
1672 binary = TRUE;
1673 break;
1674 case HTML_RFC2397_DATA:
1675 if (*ptr == '&') {
1676 state = HTML_CHAR_REF;
1677 next_state = HTML_RFC2397_DATA;
1678 ptr++;
1679 } else if (*ptr == '%') {
1680 length = 0;
1681 value = 0;
1682 state = HTML_ESCAPE_CHAR;
1683 next_state = HTML_RFC2397_ESC;
1684 ptr++;
1685 } else if (*ptr == '\'') {
1686 if (!escape && (quoted == SINGLE_QUOTED)) {
1687 state = HTML_RFC2397_FINISH;
1688 ptr++;
1689 } else {
1690 html_output_c(file_tmp_o1, *ptr);
1691 ptr++;
1692 }
1693 } else if (*ptr == '\"') {
1694 if (!escape && (quoted == DOUBLE_QUOTED)) {
1695 state = HTML_RFC2397_FINISH;
1696 ptr++;
1697 } else {
1698 html_output_c(file_tmp_o1, *ptr);
1699 ptr++;
1700 }
1701 } else if (isspace(*ptr) || (*ptr == '>')) {
1702 if (quoted == NOT_QUOTED) {
1703 state = HTML_RFC2397_FINISH;
1704 ptr++;
1705 } else {
1706 html_output_c(file_tmp_o1, *ptr);
1707 ptr++;
1708 }
1709 } else {
1710 html_output_c(file_tmp_o1, *ptr);
1711 ptr++;
1712 }
1713 if (*ptr == '\\') {
1714 escape = TRUE;
1715 } else {
1716 escape = FALSE;
1717 }
1718 break;
1719 case HTML_RFC2397_FINISH:
1720 if (file_tmp_o1) {
1721 if (file_tmp_o1->fd != -1) {
1722 html_output_flush(file_tmp_o1);
1723 close(file_tmp_o1->fd);
1724 file_tmp_o1->fd = -1;
1725 }
1726 free(file_tmp_o1);
1727 file_tmp_o1 = NULL;
1728 }
1729 state = HTML_SKIP_WS;
1730 escape = FALSE;
1731 quoted = NOT_QUOTED;
1732 next_state = HTML_TAG_ARG;
1733 binary = FALSE;
1734 break;
1735 case HTML_RFC2397_ESC:
1736 if (length == 2) {
1737 html_output_c(file_tmp_o1, value);
1738 } else if (length == 1) {
1739 html_output_c(file_tmp_o1, '%');
1740 html_output_c(file_tmp_o1, value + '0');
1741 } else {
1742 html_output_c(file_tmp_o1, '%');
1743 }
1744 state = HTML_RFC2397_DATA;
1745 break;
1746 case HTML_ESCAPE_CHAR:
1747 if (value < INT64_MAX / 16) {
1748 value *= 16;
1749 } else {
1750 state = next_state;
1751 next_state = HTML_BAD_STATE;
1752 ptr++;
1753 break;
1754 }
1755 length++;
1756 if (isxdigit(*ptr)) {
1757 if (isdigit(*ptr)) {
1758 value += (*ptr - '0');
1759 } else {
1760 value += (tolower(*ptr) - 'a' + 10);
1761 }
1762 } else {
1763 state = next_state;
1764 }
1765 if (length == 2) {
1766 state = next_state;
1767 }
1768 ptr++;
1769 break;
1770 }
1771 }
1772 if (hrefs && hrefs->scanContents && in_ahref && href_contents_begin)
1773 /* end of line, append contents now, resume on next line */
1774 html_tag_contents_append(&contents, href_contents_begin, ptr);
1775 ptrend = NULL;
1776
1777 if (js_state) {
1778 js_process(js_state, js_begin, js_end, line, ptr, in_script, dirname);
1779 js_begin = js_end = NULL;
1780 if (!in_script) {
1781 js_state = NULL;
1782 }
1783 }
1784 if (look_for_screnc && ptr_screnc) {
1785 /* start found, and stuff before it already processed */
1786 ptr = ptr_screnc;
1787 ptr_screnc = NULL;
1788 state = HTML_JSDECODE_LENGTH;
1789 next_state = HTML_BAD_STATE;
1790 continue;
1791 }
1792 free(line);
1793 ptr = line = cli_readchunk(stream_in, m_area, 8192);
1794 if (in_screnc) {
1795 state = HTML_JSDECODE_DECRYPT;
1796 next_state = HTML_BAD_STATE;
1797 } else if (look_for_screnc && !ptr_screnc &&
1798 state != HTML_LOOKFOR_SCRENC) {
1799 saved_next_state = next_state;
1800 next_state = state;
1801 state = HTML_LOOKFOR_SCRENC;
1802 }
1803 if (next_state == state) {
1804 /* safeguard against infloop */
1805 cli_dbgmsg("htmlnorm.c: next_state == state, changing next_state\n");
1806 next_state = HTML_BAD_STATE;
1807 }
1808 }
1809
1810 if (dconf_entconv) {
1811 /* handle "unfinished" entities */
1812 size_t i;
1813 const char *normalized;
1814 entity_val[entity_val_length] = '\0';
1815 normalized = entity_norm(&conv, entity_val);
1816 if (normalized) {
1817 for (i = 0; i < strlen(normalized); i++)
1818 html_output_c(file_buff_o2, normalized[i] & 0xff);
1819 } else {
1820 if (entity_val_length) {
1821 html_output_c(file_buff_o2, '&');
1822 for (i = 0; i < entity_val_length; i++)
1823 html_output_c(file_buff_o2, tolower(entity_val[i]));
1824 }
1825 }
1826 }
1827 retval = TRUE;
1828 abort:
1829 if (line) /* only needed for abort case */
1830 free(line);
1831 if (in_form_action)
1832 free(in_form_action);
1833 if (in_ahref) /* tag not closed, force closing */
1834 html_tag_contents_done(hrefs, in_ahref, &contents);
1835
1836 if (js_state) {
1837 /* output script so far */
1838 cli_js_parse_done(js_state);
1839 cli_js_output(js_state, dirname);
1840 cli_js_destroy(js_state);
1841 js_state = NULL;
1842 }
1843 html_tag_arg_free(&tag_args);
1844 if (!m_area) {
1845 fclose(stream_in);
1846 }
1847 if (file_buff_o2) {
1848 html_output_flush(file_buff_o2);
1849 if (file_buff_o2->fd != -1)
1850 close(file_buff_o2->fd);
1851 free(file_buff_o2);
1852 }
1853 if (file_buff_text) {
1854 html_output_flush(file_buff_text);
1855 if (file_buff_text->fd != -1)
1856 close(file_buff_text->fd);
1857 free(file_buff_text);
1858 file_buff_text = NULL;
1859 }
1860 if (file_tmp_o1) {
1861 if (file_tmp_o1->fd != -1) {
1862 html_output_flush(file_tmp_o1);
1863 close(file_tmp_o1->fd);
1864 }
1865 free(file_tmp_o1);
1866 }
1867 return retval;
1868 }
1869
/**
 * Normalise an HTML document held in a caller-supplied memory buffer.
 *
 * The buffer is described by an m_area_t with no fmap attached and a read
 * offset of zero, then handed to cli_html_normalise() with a file
 * descriptor of -1.
 *
 * @param in_buff  Start of the input buffer.
 * @param in_size  Stored as the length of the memory area.
 * @param dirname  Forwarded unchanged to cli_html_normalise().
 * @param hrefs    Forwarded unchanged to cli_html_normalise().
 * @param dconf    Forwarded unchanged to cli_html_normalise().
 * @return The value returned by cli_html_normalise().
 */
int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    m_area_t mem_area;
    int result;

    /* Plain-memory source: no backing map, reading starts at the front. */
    mem_area.map    = NULL;
    mem_area.offset = 0;
    mem_area.length = in_size;
    mem_area.buffer = in_buff;

    result = cli_html_normalise(-1, &mem_area, dirname, hrefs, dconf);
    return result;
}
1881
/**
 * Normalise an HTML document that is backed by an fmap.
 *
 * The map is described by an m_area_t whose length is taken from map->len
 * and whose read offset is zero, then handed to cli_html_normalise() with a
 * file descriptor of -1.
 *
 * @param map      Source map; it is dereferenced, so it must not be NULL.
 * @param dirname  Forwarded unchanged to cli_html_normalise().
 * @param hrefs    Forwarded unchanged to cli_html_normalise().
 * @param dconf    Forwarded unchanged to cli_html_normalise().
 * @return The value returned by cli_html_normalise().
 */
int html_normalise_map(fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    m_area_t m_area;

    /* Zero the whole descriptor first so that no member (notably `buffer`,
     * which is not assigned for a map-backed area) is passed on with an
     * indeterminate value. */
    memset(&m_area, 0, sizeof(m_area));
    m_area.length = map->len;
    m_area.offset = 0;
    m_area.map    = map;

    return cli_html_normalise(-1, &m_area, dirname, hrefs, dconf);
}
1893
/**
 * Decode a Script Encoder ("#@~^") payload found in a mapped file and write
 * the result to "<dirname>/screnc.html", wrapped in <script> ... </script>.
 *
 * The input is consumed in chunks of up to 8192 bytes via cli_readchunk().
 * Chunks are skipped until one contains the "#@~^" start marker. The eight
 * characters that follow the marker are consumed (reading further chunks if
 * a chunk ends first); the first six of them are run through the
 * base64_chars table to obtain the payload length. Each remaining chunk is
 * passed to screnc_decode() and then written out up to its terminating NUL.
 * NOTE(review): screnc_decode() is defined elsewhere; it is presumed to
 * decode in place and to count screnc_state.length down - confirm.
 *
 * @param map      Source map; it is dereferenced, so it must not be NULL.
 * @param dirname  Directory in which screnc.html is created.
 * @return TRUE once the length header has been read and the output written,
 *         even when the input ended before the declared length was consumed
 *         (a debug message reports the shortfall). FALSE if the output file
 *         cannot be opened, no start marker is found, or the input ends
 *         inside the header. The output file may already exist (empty) in
 *         the latter two cases.
 */
int html_screnc_decode(fmap_t *map, const char *dirname)
{
    m_area_t area;
    struct screnc_state decoder;
    unsigned char out_path[1024];
    unsigned char header[6];
    unsigned char *chunk = NULL;
    unsigned char *cursor;
    int out_fd;
    int consumed;
    int status = FALSE;

    memset(&area, 0, sizeof(area));
    area.map    = map;
    area.offset = 0;
    area.length = map->len;

    snprintf((char *)out_path, sizeof(out_path), "%s" PATHSEP "screnc.html", dirname);
    out_fd = open((const char *)out_path, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
    if (out_fd < 0) {
        cli_dbgmsg("open failed: %s\n", out_path);
        return FALSE;
    }

    /* Walk the input chunk by chunk until one holds the start marker. */
    for (;;) {
        chunk = cli_readchunk(NULL, &area, 8192);
        if (chunk == NULL) {
            /* Input exhausted without ever seeing the marker. */
            goto done;
        }
        cursor = (unsigned char *)strstr((char *)chunk, "#@~^");
        if (cursor != NULL) {
            break;
        }
        free(chunk);
        chunk = NULL;
    }

    /* Step over the 4-byte marker, then consume the 8-character header.
     * Only the first six characters take part in the length computation. */
    cursor += 4;
    for (consumed = 0; consumed < 8; consumed++, cursor++) {
        if (*cursor == '\0') {
            /* Current chunk ran out mid-header: continue in the next one. */
            free(chunk);
            chunk  = cli_readchunk(NULL, &area, 8192);
            cursor = chunk;
            if (chunk == NULL) {
                goto done;
            }
        }
        if (consumed < 6) {
            header[consumed] = *cursor;
        }
    }

    /* Assemble the payload length from the six header characters. */
    memset(&decoder, 0, sizeof(decoder));
    decoder.length = base64_chars[header[0]] < 0 ? 0 : base64_chars[header[0]] << 2;
    decoder.length += base64_chars[header[1]] >> 4;
    decoder.length += (base64_chars[header[1]] & 0x0f) << 12;
    decoder.length += ((base64_chars[header[2]] >> 2) < 0 ? 0 : (base64_chars[header[2]] >> 2)) << 8;
    decoder.length += (base64_chars[header[2]] & 0x03) << 22;
    decoder.length += base64_chars[header[3]] < 0 ? 0 : base64_chars[header[3]] << 16;
    decoder.length += (base64_chars[header[4]] < 0 ? 0 : base64_chars[header[4]] << 2) << 24;
    decoder.length += ((base64_chars[header[5]] >> 4) < 0 ? 0 : (base64_chars[header[5]] >> 4)) << 24;

    cli_writen(out_fd, "<script>", strlen("<script>"));

    /* Decode and emit chunk after chunk while payload bytes are still
     * outstanding and input is still available. */
    while (decoder.length && chunk) {
        screnc_decode(cursor, &decoder);
        cli_writen(out_fd, cursor, strlen((const char *)cursor));
        free(chunk);
        /* Fetch more input only if the payload is not finished yet. */
        chunk  = decoder.length ? cli_readchunk(NULL, &area, 8192) : NULL;
        cursor = chunk;
    }

    cli_writen(out_fd, "</script>", strlen("</script>"));
    if (decoder.length) {
        cli_dbgmsg("html_screnc_decode: missing %u bytes\n", decoder.length);
    }
    status = TRUE;

done:
    close(out_fd);
    free(chunk);
    return status;
}
1976