1 /*
2  *  Copyright (C) 2013-2022 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
3  *  Copyright (C) 2007-2013 Sourcefire, Inc.
4  *
5  *  Authors: Trog
6  *
7  *  Summary: Normalise HTML text. Decode MS Script Encoder protection.
8  *           The ScrEnc decoder was initially based upon an analysis by Andreas Marx.
9  *
10  *  This program is free software; you can redistribute it and/or modify
11  *  it under the terms of the GNU General Public License version 2 as
12  *  published by the Free Software Foundation.
13  *
14  *  This program is distributed in the hope that it will be useful,
15  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  *  GNU General Public License for more details.
18  *
19  *  You should have received a copy of the GNU General Public License
20  *  along with this program; if not, write to the Free Software
21  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
22  *  MA 02110-1301, USA.
23  */
24 
25 #if HAVE_CONFIG_H
26 #include "clamav-config.h"
27 #endif
28 
29 #include <stdio.h>
30 #ifdef HAVE_UNISTD_H
31 #include <unistd.h>
32 #endif
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <fcntl.h>
36 #ifdef HAVE_STRINGS_H
37 #include <strings.h>
38 #endif
39 #include <string.h>
40 #include <errno.h>
41 #include <stdio.h>
42 #include <ctype.h>
43 
44 #include "clamav.h"
45 #include "fmap.h"
46 #include "others.h"
47 #include "htmlnorm.h"
48 
49 #include "entconv.h"
50 #include "jsparse/js-norm.h"
51 
/* Capacity (excluding the terminator) of the tag, attribute and entity scratch strings. */
52 #define HTML_STR_LENGTH 1024
/* Capacity (excluding the terminator) of the text collected in struct tag_contents. */
53 #define MAX_TAG_CONTENTS_LENGTH HTML_STR_LENGTH
54 
/*
 * States of the HTML normaliser's state machine (dispatched by the
 * `switch (state)` in cli_html_normalise). Only the states whose handling is
 * visible near the top of that switch are annotated here.
 */
55 typedef enum {
56     HTML_BAD_STATE, /* engine error: the normaliser logs a message and aborts */
57     HTML_NORM,
58     HTML_8BIT, /* collecting bytes >= 0x80 into one multi-byte value before emitting it */
59     HTML_COMMENT,
60     HTML_CHAR_REF,
61     HTML_ENTITY_REF_DECODE,
62     HTML_SKIP_WS, /* skip whitespace, then continue in next_state */
63     HTML_TRIM_WS, /* skip whitespace, emit a single ' ' (outside scripts), then continue in next_state */
64     HTML_TAG,
65     HTML_TAG_ARG,
66     HTML_TAG_ARG_VAL,
67     HTML_TAG_ARG_EQUAL,
68     HTML_PROCESS_TAG,
69     HTML_CHAR_REF_DECODE,
70     HTML_LOOKFOR_SCRENC,
71     HTML_JSDECODE,
72     HTML_JSDECODE_LENGTH,
73     HTML_JSDECODE_DECRYPT,
74     HTML_SPECIAL_CHAR, /* not expected as a current state; the normaliser only logs when it sees it */
75     HTML_RFC2397_TYPE,
76     HTML_RFC2397_INIT,
77     HTML_RFC2397_DATA,
78     HTML_RFC2397_FINISH,
79     HTML_RFC2397_ESC,
80     HTML_ESCAPE_CHAR
81 } html_state;
82 
/*
 * Quoting mode tracked by the normaliser in its `quoted` local (initialised to
 * NOT_QUOTED). NOTE(review): presumably the quote style of the attribute value
 * currently being parsed -- confirm against the tag-argument states.
 */
83 typedef enum {
84     SINGLE_QUOTED,
85     DOUBLE_QUOTED,
86     NOT_QUOTED
87 } quoted_state;
88 
/* Size in bytes of the in-memory write buffer held by each file_buff_t. */
89 #define HTML_FILE_BUFF_LEN 8192
90 
/*
 * Buffered output file used by the html_output_*() helpers: bytes are staged
 * in `buffer` and written to `fd` by html_output_flush().
 */
91 typedef struct file_buff_tag {
92     int fd; /* destination file descriptor */
93     unsigned char buffer[HTML_FILE_BUFF_LEN]; /* staged, not yet written bytes */
94     uint64_t length; /* number of valid bytes currently in buffer */
95 } file_buff_t;
96 
/*
 * Accumulator for the text collected by html_tag_contents_append() and
 * copied out by html_tag_contents_done().
 */
97 struct tag_contents {
98     size_t pos; /* index of the next free byte in contents */
99     unsigned char contents[MAX_TAG_CONTENTS_LENGTH + 1]; /* collected text plus room for a terminating '\0' */
100 };
101 
102 // clang-format off
/*
 * Base64 decoding table indexed by byte value: 'A'-'Z' -> 0-25, 'a'-'z' -> 26-51,
 * '0'-'9' -> 52-61, '+' -> 62, '/' -> 63, and -1 for every other byte.
 * screnc_decode() uses it to decode the checksum that follows the encoded data.
 */
103 static const int64_t base64_chars[256] = {
104     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
105     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
106     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,62, -1,-1,-1,63,
107     52,53,54,55, 56,57,58,59, 60,61,-1,-1, -1,-1,-1,-1,
108     -1, 0, 1, 2,  3, 4, 5, 6,  7, 8, 9,10, 11,12,13,14,
109     15,16,17,18, 19,20,21,22, 23,24,25,-1, -1,-1,-1,-1,
110     -1,26,27,28, 29,30,31,32, 33,34,35,36, 37,38,39,40,
111     41,42,43,44, 45,46,47,48, 49,50,51,-1, -1,-1,-1,-1,
112     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
113     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
114     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
115     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
116     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
117     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
118     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
119     -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1, -1,-1,-1,-1,
120 };
121 // clang-format on
122 
/*
 * Script Encoder table schedule: for the character at position p (tracked
 * modulo 64 in screnc_state.table_pos), table_order[p] selects which of the
 * three decrypt_tables rows to use. The entries are octal literals with
 * values 0, 1 and 2.
 */
123 int table_order[] = {
124     00, 02, 01, 00, 02, 01, 02, 01, 01, 02, 01, 02, 00, 01, 02, 01,
125     00, 01, 02, 01, 00, 00, 02, 01, 01, 02, 00, 01, 02, 01, 01, 02,
126     00, 00, 01, 02, 01, 02, 01, 00, 01, 00, 00, 02, 01, 00, 01, 02,
127     00, 01, 02, 01, 00, 00, 02, 01, 01, 00, 00, 02, 01, 00, 01, 02};
128 
/*
 * Script Encoder substitution tables: decrypt_tables[t][c] is the decoded
 * value of encoded byte c (c < 0x80) under table t. The value 0xFF (stored at
 * index 0x40, '@', in every table) tells screnc_decode() that the byte starts
 * a two-character escape sequence.
 * NOTE(review): the third table maps index 0x0D to 0x06 while the other two
 * map it to 0x0D -- confirm whether that is intentional.
 */
129 int decrypt_tables[3][128] = {
130     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x57, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
131      0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
132      0x2E, 0x47, 0x7A, 0x56, 0x42, 0x6A, 0x2F, 0x26, 0x49, 0x41, 0x34, 0x32, 0x5B, 0x76, 0x72, 0x43,
133      0x38, 0x39, 0x70, 0x45, 0x68, 0x71, 0x4F, 0x09, 0x62, 0x44, 0x23, 0x75, 0x3C, 0x7E, 0x3E, 0x5E,
134      0xFF, 0x77, 0x4A, 0x61, 0x5D, 0x22, 0x4B, 0x6F, 0x4E, 0x3B, 0x4C, 0x50, 0x67, 0x2A, 0x7D, 0x74,
135      0x54, 0x2B, 0x2D, 0x2C, 0x30, 0x6E, 0x6B, 0x66, 0x35, 0x25, 0x21, 0x64, 0x4D, 0x52, 0x63, 0x3F,
136      0x7B, 0x78, 0x29, 0x28, 0x73, 0x59, 0x33, 0x7F, 0x6D, 0x55, 0x53, 0x7C, 0x3A, 0x5F, 0x65, 0x46,
137      0x58, 0x31, 0x69, 0x6C, 0x5A, 0x48, 0x27, 0x5C, 0x3D, 0x24, 0x79, 0x37, 0x60, 0x51, 0x20, 0x36},
138 
139     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x7B, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
140      0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
141      0x32, 0x30, 0x21, 0x29, 0x5B, 0x38, 0x33, 0x3D, 0x58, 0x3A, 0x35, 0x65, 0x39, 0x5C, 0x56, 0x73,
142      0x66, 0x4E, 0x45, 0x6B, 0x62, 0x59, 0x78, 0x5E, 0x7D, 0x4A, 0x6D, 0x71, 0x3C, 0x60, 0x3E, 0x53,
143      0xFF, 0x42, 0x27, 0x48, 0x72, 0x75, 0x31, 0x37, 0x4D, 0x52, 0x22, 0x54, 0x6A, 0x47, 0x64, 0x2D,
144      0x20, 0x7F, 0x2E, 0x4C, 0x5D, 0x7E, 0x6C, 0x6F, 0x79, 0x74, 0x43, 0x26, 0x76, 0x25, 0x24, 0x2B,
145      0x28, 0x23, 0x41, 0x34, 0x09, 0x2A, 0x44, 0x3F, 0x77, 0x3B, 0x55, 0x69, 0x61, 0x63, 0x50, 0x67,
146      0x51, 0x49, 0x4F, 0x46, 0x68, 0x7C, 0x36, 0x70, 0x6E, 0x7A, 0x2F, 0x5F, 0x4B, 0x5A, 0x2C, 0x57},
147 
148     {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x6E, 0x0A, 0x0B, 0x0C, 0x06, 0x0E, 0x0F,
149      0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
150      0x2D, 0x75, 0x52, 0x60, 0x71, 0x5E, 0x49, 0x5C, 0x62, 0x7D, 0x29, 0x36, 0x20, 0x7C, 0x7A, 0x7F,
151      0x6B, 0x63, 0x33, 0x2B, 0x68, 0x51, 0x66, 0x76, 0x31, 0x64, 0x54, 0x43, 0x3C, 0x3A, 0x3E, 0x7E,
152      0xFF, 0x45, 0x2C, 0x2A, 0x74, 0x27, 0x37, 0x44, 0x79, 0x59, 0x2F, 0x6F, 0x26, 0x72, 0x6A, 0x39,
153      0x7B, 0x3F, 0x38, 0x77, 0x67, 0x53, 0x47, 0x34, 0x78, 0x5D, 0x30, 0x23, 0x5A, 0x5B, 0x6C, 0x48,
154      0x55, 0x70, 0x69, 0x2E, 0x4C, 0x21, 0x24, 0x4E, 0x50, 0x09, 0x56, 0x73, 0x35, 0x61, 0x4B, 0x58,
155      0x3B, 0x57, 0x22, 0x6D, 0x4D, 0x25, 0x28, 0x46, 0x4A, 0x32, 0x41, 0x3D, 0x5F, 0x4F, 0x42, 0x65}};
156 
/**
 * Find where a chunk should be cut so that it ends right after whitespace.
 *
 * Scans backwards from the end of @chunk for the last whitespace character.
 *
 * @param chunk  Buffer to scan; only chunk[0 .. len - 1] is read.
 * @param len    Number of valid bytes in @chunk.
 * @return The index just past the last whitespace character found at index 1
 *         or later, or @len unchanged when there is none (whitespace at
 *         index 0 alone does not count), or when @chunk is NULL or @len is 0.
 */
static inline unsigned int rewind_tospace(const unsigned char *chunk, unsigned int len)
{
    unsigned int pos = len;

    /* Nothing to scan; also keeps chunk[pos - 1] from indexing before the buffer. */
    if (!chunk || len == 0) {
        return len;
    }
    /* Test the bound before reading so the access is always in range. */
    while (pos > 1 && !isspace(chunk[pos - 1])) {
        pos--;
    }
    /* pos == 1 means no usable whitespace was found: keep the whole chunk. */
    return (pos == 1) ? len : pos;
}
168 
169 /* read at most @max_len of data from @m_area or @stream, skipping NULL chars.
170  * This used to be called cli_readline, but we don't stop at end-of-line anymore */
cli_readchunk(FILE * stream,m_area_t * m_area,unsigned int max_len)171 static unsigned char *cli_readchunk(FILE *stream, m_area_t *m_area, unsigned int max_len)
172 {
173     unsigned char *chunk, *start, *ptr, *end;
174     unsigned int chunk_len, count;
175 
176     chunk = (unsigned char *)cli_malloc(max_len);
177     if (!chunk) {
178         cli_errmsg("readchunk: Unable to allocate memory for chunk\n");
179         return NULL;
180     }
181 
182     /* Try and use the memory buffer first */
183     if (m_area) {
184         /* maximum we can copy into the buffer,
185 		 * we could have less than max_len bytes available */
186         chunk_len = MIN(m_area->length - m_area->offset, max_len - 1);
187         if (!chunk_len) {
188             free(chunk);
189             return NULL;
190         }
191         if (m_area->map)
192             ptr = (unsigned char *)fmap_need_off_once(m_area->map, m_area->offset, chunk_len);
193         else
194             ptr = m_area->buffer + m_area->offset;
195         start = ptr;
196         end   = ptr - m_area->offset + m_area->length;
197 
198         if ((start >= end) || !start) {
199             free(chunk);
200             return NULL;
201         }
202 
203         /* look for NULL chars */
204         ptr = memchr(start, 0, chunk_len);
205         if (!ptr) {
206             /* no NULL chars found, copy all */
207             memcpy(chunk, start, chunk_len);
208             chunk[chunk_len] = '\0';
209             m_area->offset += chunk_len;
210             /* point ptr to end of chunk,
211 			 * so we can check and rewind to a space below */
212             ptr = start + chunk_len;
213         } else {
214             /* copy portion that doesn't contain NULL chars */
215             chunk_len = ptr - start;
216             if (chunk_len < max_len) {
217                 memcpy(chunk, start, chunk_len);
218             } else {
219                 chunk_len = 0;
220                 ptr       = start;
221             }
222             if (m_area->map)
223                 ptr = (unsigned char *)fmap_need_ptr_once(m_area->map, ptr, end - ptr);
224             if (!ptr) {
225                 cli_warnmsg("fmap inconsistency\n");
226                 ptr = end;
227             }
228             /* we have unknown number of NULL chars,
229 			 * copy char-by-char and skip them */
230             while ((ptr < end) && (chunk_len < max_len - 1)) {
231                 const unsigned char c = *ptr++;
232                 /* we can't use chunk_len to determine how many bytes we read, since
233 				 * we skipped chars */
234                 if (c) {
235                     chunk[chunk_len++] = c;
236                 }
237             }
238             m_area->offset += ptr - start;
239             chunk[chunk_len] = '\0';
240         }
241         if (ptr && ptr < end && !isspace(*ptr)) {
242             /* we hit max_len, rewind to a space */
243             count = rewind_tospace(chunk, chunk_len);
244             if (count < chunk_len) {
245                 chunk[count] = '\0';
246                 m_area->offset -= chunk_len - count;
247             }
248         }
249     } else {
250         if (!stream) {
251             cli_dbgmsg("No HTML stream\n");
252             free(chunk);
253             return NULL;
254         }
255         chunk_len = fread(chunk, 1, max_len - 1, stream);
256         if (!chunk_len || chunk_len > max_len - 1) {
257             /* EOF, or prevent overflow */
258             free(chunk);
259             return NULL;
260         }
261 
262         /* Look for NULL chars */
263         ptr = memchr(chunk, 0, chunk_len);
264         if (ptr) {
265             /* NULL char found */
266             /* save buffer limits */
267             start = ptr;
268             end   = chunk + chunk_len;
269 
270             /* start of NULL chars, we will copy non-NULL characters
271 			 * to this position */
272             chunk_len = ptr - chunk;
273 
274             /* find first non-NULL char */
275             while ((ptr < end) && !(*ptr)) {
276                 ptr++;
277             }
278             /* skip over NULL chars, and move back the rest */
279             while ((ptr < end) && (chunk_len < max_len - 1)) {
280                 const unsigned char c = *ptr++;
281                 if (c) {
282                     chunk[chunk_len++] = c;
283                 }
284             }
285         }
286         chunk[chunk_len] = '\0';
287         if (chunk_len == max_len - 1) {
288             /* rewind to a space (which includes newline) */
289             count = rewind_tospace(chunk, chunk_len);
290             if (count < chunk_len) {
291                 chunk[count] = '\0';
292                 /* seek-back to space */
293                 fseek(stream, -(long)(chunk_len - count), SEEK_CUR);
294             }
295         }
296     }
297 
298     return chunk;
299 }
300 
/**
 * Write any buffered bytes of @fbuff to its file descriptor and empty the buffer.
 * Does nothing when @fbuff is NULL or holds no data. The result of the write
 * is not inspected.
 */
static void html_output_flush(file_buff_t *fbuff)
{
    if (!fbuff || fbuff->length == 0) {
        return;
    }

    cli_writen(fbuff->fd, fbuff->buffer, fbuff->length);
    fbuff->length = 0;
}
308 
/**
 * Append a single byte to a buffered output file, flushing first when the
 * buffer is full. A NULL @fbuff1 is ignored.
 */
static inline void html_output_c(file_buff_t *fbuff1, unsigned char c)
{
    if (!fbuff1) {
        return;
    }

    /* Make room before storing when the buffer is already full. */
    if (fbuff1->length == HTML_FILE_BUFF_LEN) {
        html_output_flush(fbuff1);
    }

    fbuff1->buffer[fbuff1->length] = c;
    fbuff1->length++;
}
318 
/**
 * Append @len bytes from @str to a buffered output file.
 *
 * The buffer is flushed first when the new data would fill it. Data that is at
 * least as large as the whole buffer bypasses it and is written directly to
 * the file descriptor. A NULL @fbuff is ignored.
 */
static void html_output_str(file_buff_t *fbuff, const unsigned char *str, size_t len)
{
    if (!fbuff) {
        return;
    }

    if ((fbuff->length + len) >= HTML_FILE_BUFF_LEN) {
        html_output_flush(fbuff);
    }

    if (len < HTML_FILE_BUFF_LEN) {
        /* Fits in the (possibly just emptied) buffer. */
        memcpy(fbuff->buffer + fbuff->length, str, len);
        fbuff->length += len;
        return;
    }

    /* Too large to buffer: write it straight through. */
    html_output_flush(fbuff);
    cli_writen(fbuff->fd, str, len);
}
334 
/**
 * Look up the value stored for the first argument named @tag.
 *
 * @return The stored value (which may itself be NULL), or NULL when no
 *         argument with that exact name exists. The string stays owned by @tags.
 */
static char *html_tag_arg_value(tag_arguments_t *tags, const char *tag)
{
    int idx = 0;

    while (idx < tags->count) {
        if (0 == strcmp((const char *)tags->tag[idx], tag)) {
            return (char *)tags->value[idx];
        }
        idx++;
    }
    return NULL;
}
346 
/**
 * Replace the value of the first argument named @tag with a copy of @value.
 * The previous value is freed. Nothing happens when no such argument exists.
 */
static void html_tag_arg_set(tag_arguments_t *tags, const char *tag, const char *value)
{
    int idx = 0;

    while (idx < tags->count) {
        if (0 != strcmp((const char *)tags->tag[idx], tag)) {
            idx++;
            continue;
        }
        /* Found it: swap in a fresh copy of the new value. */
        free(tags->value[idx]);
        tags->value[idx] = (unsigned char *)cli_strdup(value);
        return;
    }
}
/**
 * Append a name/value pair to a tag argument list.
 *
 * The arrays in @tags are grown by one entry and copies of @tag and @value
 * are stored. When @value starts with '"', that leading quote is skipped and
 * the last character of the remainder is dropped (presumably the closing
 * quote). A NULL @value is stored as NULL. When tags->scanContents is set, the
 * matching contents slot is created and initialised to NULL.
 *
 * On allocation failure of the arrays or of the name copy, the whole list is
 * released and reset to empty. A failed copy of the value is stored as NULL.
 *
 * @param tags   Argument list to extend.
 * @param tag    Argument name; copied.
 * @param value  Argument value, or NULL; copied.
 */
void html_tag_arg_add(tag_arguments_t *tags,
                      const char *tag, char *value)
{
    size_t len;
    int i, last;

    tags->count++;
    last = tags->count - 1;

    tags->tag = (unsigned char **)cli_realloc2(tags->tag,
                                               tags->count * sizeof(char *));
    if (!tags->tag) {
        goto abort;
    }
    tags->value = (unsigned char **)cli_realloc2(tags->value,
                                                 tags->count * sizeof(char *));
    if (!tags->value) {
        goto abort;
    }
    if (tags->scanContents) {
        tags->contents = (unsigned char **)cli_realloc2(tags->contents,
                                                        tags->count * sizeof(*tags->contents));
        if (!tags->contents) {
            goto abort;
        }
        tags->contents[last] = NULL;
    }

    tags->tag[last] = (unsigned char *)cli_strdup(tag);
    if (!tags->tag[last]) {
        /* A NULL name would later be passed to strcmp()/strlen() by the other
         * helpers; treat it like the other allocation failures. */
        goto abort;
    }

    if (value) {
        if (*value == '"') {
            tags->value[last] = (unsigned char *)cli_strdup(value + 1);
            len               = strlen((const char *)value + 1);
            /* Only strip the trailing character when the copy succeeded. */
            if (len > 0 && tags->value[last]) {
                tags->value[last][len - 1] = '\0';
            }
        } else {
            tags->value[last] = (unsigned char *)cli_strdup(value);
        }
    } else {
        tags->value[last] = NULL;
    }
    return;

abort:
    /* Bad error - can't do 100% recovery */
    tags->count--;
    for (i = 0; i < tags->count; i++) {
        if (tags->tag) {
            free(tags->tag[i]);
        }
        if (tags->value) {
            free(tags->value[i]);
        }
        if (tags->contents) {
            if (tags->contents[i])
                free(tags->contents[i]);
        }
    }
    if (tags->tag) {
        free(tags->tag);
    }
    if (tags->value) {
        free(tags->value);
    }
    if (tags->contents)
        free(tags->contents);
    tags->contents = NULL;
    tags->tag = tags->value = NULL;
    tags->count             = 0;
    return;
}
427 
/**
 * Write a tag and its arguments to @fbuff in the form
 * `<tag name="value" name2 ...>`.
 *
 * Argument names are written as stored; values are lower-cased and wrapped in
 * double quotes. Arguments whose value is NULL are written as a bare name.
 */
static void html_output_tag(file_buff_t *fbuff, char *tag, tag_arguments_t *tags)
{
    int arg;

    html_output_c(fbuff, '<');
    html_output_str(fbuff, (const unsigned char *)tag, strlen(tag));

    for (arg = 0; arg < tags->count; arg++) {
        const unsigned char *name = tags->tag[arg];
        const unsigned char *val  = tags->value[arg];

        html_output_c(fbuff, ' ');
        html_output_str(fbuff, name, strlen((const char *)name));
        if (!val) {
            continue;
        }

        html_output_str(fbuff, (const unsigned char *)"=\"", 2);
        /* Emit the value one byte at a time, lower-cased. */
        for (; *val; val++) {
            html_output_c(fbuff, tolower(*val));
        }
        html_output_c(fbuff, '"');
    }

    html_output_c(fbuff, '>');
}
448 
/**
 * Release everything owned by a tag argument list and reset it to empty.
 *
 * Frees every stored name, value and (when present) contents string, then the
 * arrays themselves, and leaves all pointers NULL and count at 0.
 */
void html_tag_arg_free(tag_arguments_t *tags)
{
    int idx = 0;

    while (idx < tags->count) {
        free(tags->tag[idx]);
        free(tags->value[idx]); /* free(NULL) is a no-op */
        if (tags->contents) {
            free(tags->contents[idx]);
        }
        idx++;
    }

    free(tags->tag);
    free(tags->value);
    free(tags->contents);

    tags->tag      = NULL;
    tags->value    = NULL;
    tags->contents = NULL;
    tags->count    = 0;
}
474 
475 /**
476  * the displayed text for an <a href> tag
477  */
html_tag_contents_append(struct tag_contents * cont,const unsigned char * begin,const unsigned char * end)478 static inline void html_tag_contents_append(struct tag_contents *cont, const unsigned char *begin, const unsigned char *end)
479 {
480     size_t i;
481     uint32_t mbchar = 0;
482     if (!begin || !end)
483         return;
484     for (i = cont->pos; i < MAX_TAG_CONTENTS_LENGTH && (begin < end); i++) {
485         uint8_t c = *begin++;
486         if (mbchar && (c < 0x80 || mbchar >= 0x10000)) {
487             if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992 ||
488                 (mbchar == 0xA1 && (c == 0x43 || c == 0x44 || c == 0x4F))) {
489                 cont->contents[i++] = '.';
490                 if (mbchar == 0xA1) {
491                     --i;
492                     mbchar = 0;
493                     continue;
494                 }
495             } else {
496                 uint8_t c0 = mbchar >> 16;
497                 uint8_t c1 = (mbchar >> 8) & 0xff;
498                 uint8_t c2 = (mbchar & 0xff);
499                 if (c0 && i + 1 < MAX_TAG_CONTENTS_LENGTH)
500                     cont->contents[i++] = c0;
501                 if ((c0 || c1) && i + 1 < MAX_TAG_CONTENTS_LENGTH)
502                     cont->contents[i++] = c1;
503                 if (i + 1 < MAX_TAG_CONTENTS_LENGTH)
504                     cont->contents[i++] = c2;
505             }
506             mbchar = 0;
507         }
508         if (c >= 0x80) {
509             mbchar = (mbchar << 8) | c;
510             --i;
511         } else
512             cont->contents[i] = c;
513     }
514     cont->pos = i;
515 }
516 
/**
 * Finish collecting the contents of a tag: terminate the accumulated text,
 * store a heap copy in tags->contents[idx - 1] and reset the accumulator.
 *
 * @param tags  Argument list whose contents array receives the copy.
 * @param idx   1-based index of the argument the contents belong to.
 * @param cont  Accumulator holding the collected text; emptied on return.
 *
 * When @idx is out of range, the contents array is missing, or the allocation
 * fails, nothing is stored and the accumulator is still reset.
 */
static inline void html_tag_contents_done(tag_arguments_t *tags, int idx, struct tag_contents *cont)
{
    unsigned char *p;

    /* Refuse to index outside the contents array. */
    if (!tags->contents || idx < 1 || idx > tags->count) {
        cont->pos = 0;
        return;
    }

    /* Keep the terminator inside contents[] even if pos ran past the capacity. */
    if (cont->pos > MAX_TAG_CONTENTS_LENGTH) {
        cont->pos = MAX_TAG_CONTENTS_LENGTH;
    }
    cont->contents[cont->pos++] = '\0';

    p = cli_malloc(cont->pos);
    if (!p) {
        cli_errmsg("html_tag_contents_done: Unable to allocate memory for p\n");
        /* Do not carry this text (or a past-the-end pos) over to the next tag. */
        cont->pos = 0;
        return;
    }
    memcpy(p, cont->contents, cont->pos);
    tags->contents[idx - 1] = p;
    cont->pos               = 0;
}
530 
/* Script Encoder decoding state; read and updated in place by screnc_decode(). */
531 struct screnc_state {
532     uint32_t length; /* encoded bytes still to decode; counted down by screnc_decode() */
533     uint32_t sum; /* running sum of the decoded byte values, compared with the trailing checksum */
534     uint8_t table_pos; /* current index into table_order; advanced modulo 64 */
535 };
536 
537 /* inplace decoding, so that we can normalize it later */
screnc_decode(unsigned char * ptr,struct screnc_state * s)538 static void screnc_decode(unsigned char *ptr, struct screnc_state *s)
539 {
540     uint8_t value;
541     unsigned char *dst = ptr;
542 
543     if (!ptr || !s)
544         return;
545     while (s->length > 0 && *ptr) {
546         if ((*ptr == '\n') || (*ptr == '\r')) {
547             ptr++;
548             continue;
549         }
550         if (*ptr < 0x80) {
551             value = decrypt_tables[table_order[s->table_pos]][*ptr];
552             if (value == 0xFF) { /* special character */
553                 ptr++;
554                 s->length--;
555                 switch (*ptr) {
556                     case '\0':
557                         /* Fixup for end of line */
558                         ptr--;
559                         break;
560                     case 0x21:
561                         value = 0x3c;
562                         break;
563                     case 0x23:
564                         value = 0x0d;
565                         break;
566                     case 0x24:
567                         value = 0x40;
568                         break;
569                     case 0x26:
570                         value = 0x0a;
571                         break;
572                     case 0x2a:
573                         value = 0x3e;
574                         break;
575                 }
576             }
577             s->sum += value;
578             *dst++       = value;
579             s->table_pos = (s->table_pos + 1) % 64;
580         } else {
581             *dst++ = *ptr++;
582             *dst++ = *ptr;
583             if (!*ptr) {
584                 dst--;
585                 break;
586             }
587         }
588         ptr++;
589         s->length--;
590     }
591     if (!s->length) {
592         size_t remaining;
593         if (strlen((const char *)ptr) >= 12) {
594             uint64_t expected;
595             expected = base64_chars[ptr[0]] < 0 ? 0 : base64_chars[ptr[0]] << 2;
596             expected += base64_chars[ptr[1]] >> 4;
597             expected += (base64_chars[ptr[1]] & 0x0f) << 12;
598             expected += ((base64_chars[ptr[2]] >> 2) < 0 ? 0 : (base64_chars[ptr[2]] >> 2)) << 8;
599             expected += (base64_chars[ptr[2]] & 0x03) << 22;
600             expected += base64_chars[ptr[3]] < 0 ? 0 : base64_chars[ptr[3]] << 16;
601             expected += (base64_chars[ptr[4]] < 0 ? 0 : base64_chars[ptr[4]] << 2) << 24;
602             expected += ((base64_chars[ptr[5]] >> 4) < 0 ? 0 : (base64_chars[ptr[5]] >> 4)) << 24;
603             ptr += 8;
604             if (s->sum != expected) {
605                 cli_dbgmsg("screnc_decode: checksum mismatch: %u != %" PRIu64 "\n", s->sum, expected);
606             } else {
607                 if (strncmp((const char *)ptr, "^#~@", 4) != 0) {
608                     cli_dbgmsg("screnc_decode: terminator not found\n");
609                 } else {
610                     cli_dbgmsg("screnc_decode: OK\n");
611                 }
612             }
613             ptr += 4;
614         }
615         /* copy remaining */
616         remaining = strlen((const char *)ptr) + 1;
617         memmove(dst, ptr, remaining);
618     } else {
619         *dst = '\0';
620     }
621 }
622 
/**
 * Feed a span of script text to the JavaScript normaliser and, once the
 * script has ended, finish and emit the normalised output.
 *
 * @param js_state   JavaScript parser state.
 * @param js_begin   Start of the script text, or NULL to start at @line.
 * @param js_end     End of the script text, or NULL to end at @ptr.
 * @param line       Start of the current input chunk; the span is only
 *                   processed when both ends lie within 8192 bytes of it.
 * @param ptr        Current parse position within the chunk.
 * @param in_script  Non-zero while still inside the script; zero once its end
 *                   was seen, which triggers parse-done, output and destroy.
 * @param dirname    Directory handed to cli_js_output().
 */
static void js_process(struct parser_state *js_state, const unsigned char *js_begin, const unsigned char *js_end,
                       const unsigned char *line, const unsigned char *ptr, int in_script, const char *dirname)
{
    const unsigned char *from = js_begin ? js_begin : line;
    const unsigned char *to   = js_end ? js_end : ptr;

    if (to > from &&
        CLI_ISCONTAINED(line, 8192, from, 1) &&
        CLI_ISCONTAINED(line, 8192, to, 1)) {
        cli_js_process_buffer(js_state, (const char *)from, to - from);
    }

    if (in_script) {
        return;
    }

    /*  we found a /script, normalize script now */
    cli_js_parse_done(js_state);
    cli_js_output(js_state, dirname);
    cli_js_destroy(js_state);
}
642 
cli_html_normalise(int fd,m_area_t * m_area,const char * dirname,tag_arguments_t * hrefs,const struct cli_dconf * dconf)643 static int cli_html_normalise(int fd, m_area_t *m_area, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
644 {
645     int fd_tmp, tag_length = 0, tag_arg_length = 0, binary;
646     int64_t retval = FALSE, escape = FALSE, value = 0, hex = FALSE, tag_val_length = 0;
647     int look_for_screnc = FALSE, in_screnc = FALSE, in_script = FALSE, text_space_written = FALSE;
648     FILE *stream_in  = NULL;
649     html_state state = HTML_NORM, next_state = HTML_BAD_STATE, saved_next_state = HTML_BAD_STATE;
650     char filename[1024], tag[HTML_STR_LENGTH + 1], tag_arg[HTML_STR_LENGTH + 1];
651     char tag_val[HTML_STR_LENGTH + 1], *tmp_file, *arg_value;
652     unsigned char *line = NULL, *ptr, *ptr_screnc = NULL;
653     tag_arguments_t tag_args;
654     quoted_state quoted  = NOT_QUOTED;
655     unsigned long length = 0;
656     struct screnc_state screnc_state;
657     file_buff_t *file_buff_o2, *file_buff_text;
658     file_buff_t *file_tmp_o1           = NULL;
659     int in_ahref                       = 0;    /* index of <a> tag, whose contents we are parsing. Indexing starts from 1, 0 means outside of <a>*/
660     unsigned char *href_contents_begin = NULL; /*beginning of the next portion of <a> contents*/
661     unsigned char *ptrend              = NULL; /*end of <a> contents*/
662     unsigned char *in_form_action      = NULL; /* the action URL of the current <form> tag, if any*/
663 
664     struct entity_conv conv;
665     unsigned char entity_val[HTML_STR_LENGTH + 1];
666     size_t entity_val_length = 0;
667     const int dconf_entconv  = dconf ? dconf->phishing & PHISHING_CONF_ENTCONV : 1;
668     const int dconf_js       = dirname && (dconf ? dconf->doc & DOC_CONF_JSNORM : 1); /* TODO */
669     /* dconf for phishing engine sets scanContents, so no need for a flag here */
670     struct parser_state *js_state = NULL;
671     const unsigned char *js_begin = NULL, *js_end = NULL;
672     struct tag_contents contents;
673     uint32_t mbchar  = 0;
674     uint32_t mbchar2 = 0;
675 
676     /*
677      * Initialize stack buffers.
678      */
679     memset(filename, 0, sizeof(filename));
680     memset(tag, 0, sizeof(tag));
681     memset(tag_arg, 0, sizeof(tag_arg));
682     memset(tag_val, 0, sizeof(tag_val));
683     memset(entity_val, 0, sizeof(entity_val));
684 
685     tag_args.scanContents = 0; /* do we need to store the contents of <a></a>?*/
686     contents.pos          = 0;
687     if (!m_area) {
688         if (fd < 0) {
689             cli_dbgmsg("Invalid HTML fd\n");
690             return FALSE;
691         }
692         lseek(fd, 0, SEEK_SET);
693         fd_tmp = dup(fd);
694         if (fd_tmp < 0) {
695             return FALSE;
696         }
697         stream_in = fdopen(fd_tmp, "r");
698         if (!stream_in) {
699             close(fd_tmp);
700             return FALSE;
701         }
702     }
703 
704     tag_args.count    = 0;
705     tag_args.tag      = NULL;
706     tag_args.value    = NULL;
707     tag_args.contents = NULL;
708     if (dirname) {
709         file_buff_o2 = (file_buff_t *)cli_malloc(sizeof(file_buff_t));
710         if (!file_buff_o2) {
711             cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_o2\n");
712             file_buff_o2 = file_buff_text = NULL;
713             goto abort;
714         }
715 
716         /* this will still contains scripts that are inside comments */
717         snprintf(filename, 1024, "%s" PATHSEP "nocomment.html", dirname);
718         file_buff_o2->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
719         if (file_buff_o2->fd == -1) {
720             cli_dbgmsg("open failed: %s\n", filename);
721             free(file_buff_o2);
722             file_buff_o2 = file_buff_text = NULL;
723             goto abort;
724         }
725 
726         file_buff_text = (file_buff_t *)cli_malloc(sizeof(file_buff_t));
727         if (!file_buff_text) {
728             close(file_buff_o2->fd);
729             free(file_buff_o2);
730             file_buff_o2 = file_buff_text = NULL;
731             cli_errmsg("cli_html_normalise: Unable to allocate memory for file_buff_text\n");
732             goto abort;
733         }
734 
735         snprintf(filename, 1024, "%s" PATHSEP "notags.html", dirname);
736         file_buff_text->fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
737         if (file_buff_text->fd == -1) {
738             cli_dbgmsg("open failed: %s\n", filename);
739             close(file_buff_o2->fd);
740             free(file_buff_o2);
741             free(file_buff_text);
742             file_buff_o2 = file_buff_text = NULL;
743             goto abort;
744         }
745         file_buff_o2->length   = 0;
746         file_buff_text->length = 0;
747     } else {
748         file_buff_o2   = NULL;
749         file_buff_text = NULL;
750     }
751 
752     binary = FALSE;
753 
754     ptr = line = cli_readchunk(stream_in, m_area, 8192);
755 
756     while (line) {
757         if (href_contents_begin)
758             href_contents_begin = ptr; /*start of a new line, last line already appended to contents see below*/
759         while (*ptr && isspace(*ptr)) {
760             ptr++;
761         }
762         while (*ptr) {
763             if (!binary && *ptr == '\n') {
764                 /* Convert it to a space and re-process */
765                 *ptr = ' ';
766                 continue;
767             }
768             if (!binary && *ptr == '\r') {
769                 ptr++;
770                 continue;
771             }
772             switch (state) {
773                 case HTML_SPECIAL_CHAR:
774                     cli_dbgmsg("Impossible, special_char can't occur here\n");
775                     break;
776                 case HTML_BAD_STATE:
777                     /* An engine error has occurred */
778                     cli_dbgmsg("HTML Engine Error\n");
779                     goto abort;
780                 case HTML_SKIP_WS:
781                     if (isspace(*ptr)) {
782                         ptr++;
783                     } else {
784                         state      = next_state;
785                         next_state = HTML_BAD_STATE;
786                     }
787                     break;
788                 case HTML_TRIM_WS:
789                     if (isspace(*ptr)) {
790                         ptr++;
791                     } else {
792                         if (!in_script)
793                             html_output_c(file_buff_o2, ' ');
794                         state      = next_state;
795                         next_state = HTML_BAD_STATE;
796                     }
797                     break;
798                 case HTML_8BIT:
799                     if (*ptr < 0x80 || mbchar >= 0x10000) {
800                         if (mbchar == 0xE38082 || mbchar == 0xEFBC8E || mbchar == 0xEFB992 ||
801                             (mbchar == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) {
802                             /* bb #4097 */
803                             html_output_c(file_buff_o2, '.');
804                             html_output_c(file_buff_text, '.');
805                             if (mbchar == 0xA1) {
806                                 ptr++;
807                                 mbchar = 0;
808                                 continue;
809                             }
810                         } else {
811                             uint8_t c0 = mbchar >> 16;
812                             uint8_t c1 = (mbchar >> 8) & 0xff;
813                             uint8_t c2 = (mbchar & 0xff);
814                             if (c0) {
815                                 html_output_c(file_buff_o2, c0);
816                                 html_output_c(file_buff_text, c0);
817                             }
818                             if (c0 || c1) {
819                                 html_output_c(file_buff_o2, c1);
820                                 html_output_c(file_buff_text, c1);
821                             }
822                             html_output_c(file_buff_o2, c2);
823                             html_output_c(file_buff_text, c1);
824                         }
825                         mbchar     = 0;
826                         state      = next_state;
827                         next_state = HTML_NORM;
828                     } else {
829                         mbchar = (mbchar << 8) | *ptr;
830                         ptr++;
831                     }
832                     break;
833                 case HTML_NORM:
834                     if (*ptr == '<') {
835                         ptrend = ptr; /* for use by scanContents */
836                         html_output_c(file_buff_o2, '<');
837                         if (!in_script && !text_space_written) {
838                             html_output_c(file_buff_text, ' ');
839                             text_space_written = TRUE;
840                         }
841                         if (hrefs && hrefs->scanContents && in_ahref && href_contents_begin) {
842                             /*append this text portion to the contents of <a>*/
843                             html_tag_contents_append(&contents, href_contents_begin, ptr);
844                             href_contents_begin = NULL; /*We just encountered another tag inside <a>, so skip it*/
845                         }
846                         ptr++;
847                         state      = HTML_SKIP_WS;
848                         tag_length = 0;
849                         next_state = HTML_TAG;
850                     } else if (isspace(*ptr)) {
851                         if (!text_space_written && !in_script) {
852                             html_output_c(file_buff_text, ' ');
853                             text_space_written = TRUE;
854                         }
855                         state      = HTML_TRIM_WS;
856                         next_state = HTML_NORM;
857                     } else if (*ptr == '&') {
858                         if (!text_space_written && !in_script) {
859                             html_output_c(file_buff_text, ' ');
860                             text_space_written = TRUE;
861                         }
862                         state      = HTML_CHAR_REF;
863                         next_state = HTML_NORM;
864                         ptr++;
865                     } else if (*ptr >= 0x80) {
866                         state      = HTML_8BIT;
867                         next_state = HTML_NORM;
868                         mbchar     = *ptr;
869                         ptr++;
870                     } else {
871                         unsigned char c = tolower(*ptr);
872                         /* normalize ' to " for scripts */
873                         if (in_script && c == '\'') c = '"';
874                         html_output_c(file_buff_o2, c);
875                         if (!in_script) {
876                             if (*ptr < 0x20) {
877                                 if (!text_space_written) {
878                                     html_output_c(file_buff_text, ' ');
879                                     text_space_written = TRUE;
880                                 }
881                             } else {
882                                 html_output_c(file_buff_text, c);
883                                 text_space_written = FALSE;
884                             }
885                         }
886                         ptr++;
887                     }
888                     break;
889                 case HTML_TAG:
890                     if ((tag_length == 0) && (*ptr == '!')) {
891                         /* Comment */
892                         if (in_script) {
893                             /* we still write scripts to nocomment.html */
894                             html_output_c(file_buff_o2, '!');
895                         } else {
896                             /* Need to rewind in the no-comment output stream */
897                             if (file_buff_o2 && (file_buff_o2->length > 0)) {
898                                 file_buff_o2->length--;
899                             }
900                         }
901                         state      = HTML_COMMENT;
902                         next_state = HTML_BAD_STATE;
903                         ptr++;
904                     } else if (*ptr == '>') {
905                         html_output_c(file_buff_o2, '>');
906                         ptr++;
907                         tag[tag_length] = '\0';
908                         state           = HTML_SKIP_WS;
909                         next_state      = HTML_PROCESS_TAG;
910                     } else if (!isspace(*ptr)) {
911                         html_output_c(file_buff_o2, tolower(*ptr));
912                         /* if we're inside a script we only care for </script>.*/
913                         if (in_script && tag_length == 0 && *ptr != '/') {
914                             state = HTML_NORM;
915                         }
916                         if (tag_length < HTML_STR_LENGTH) {
917                             tag[tag_length++] = tolower(*ptr);
918                         }
919                         ptr++;
920                     } else {
921                         tag[tag_length] = '\0';
922                         state           = HTML_SKIP_WS;
923                         tag_arg_length  = 0;
924                         /* if we'd go to HTML_TAG_ARG whitespace would be inconsistently normalized for in_script*/
925                         next_state = !in_script ? HTML_TAG_ARG : HTML_PROCESS_TAG;
926                     }
927                     break;
928                 case HTML_TAG_ARG:
929                     if (*ptr == '=') {
930                         html_output_c(file_buff_o2, '=');
931                         tag_arg[tag_arg_length] = '\0';
932                         ptr++;
933                         state          = HTML_SKIP_WS;
934                         escape         = FALSE;
935                         quoted         = NOT_QUOTED;
936                         tag_val_length = 0;
937                         next_state     = HTML_TAG_ARG_VAL;
938                     } else if (isspace(*ptr)) {
939                         ptr++;
940                         tag_arg[tag_arg_length] = '\0';
941                         state                   = HTML_SKIP_WS;
942                         next_state              = HTML_TAG_ARG_EQUAL;
943                     } else if (*ptr == '>') {
944                         html_output_c(file_buff_o2, '>');
945                         if (tag_arg_length > 0) {
946                             tag_arg[tag_arg_length] = '\0';
947                             html_tag_arg_add(&tag_args, tag_arg, NULL);
948                         }
949                         ptr++;
950                         state      = HTML_PROCESS_TAG;
951                         next_state = HTML_BAD_STATE;
952                     } else {
953                         if (tag_arg_length == 0) {
954                             /* Start of new tag - add space */
955                             html_output_c(file_buff_o2, ' ');
956                         }
957                         html_output_c(file_buff_o2, tolower(*ptr));
958                         if (tag_arg_length < HTML_STR_LENGTH) {
959                             tag_arg[tag_arg_length++] = tolower(*ptr);
960                         }
961                         ptr++;
962                     }
963                     break;
964                 case HTML_TAG_ARG_EQUAL:
965                     if (*ptr == '=') {
966                         html_output_c(file_buff_o2, '=');
967                         ptr++;
968                         state          = HTML_SKIP_WS;
969                         escape         = FALSE;
970                         quoted         = NOT_QUOTED;
971                         tag_val_length = 0;
972                         next_state     = HTML_TAG_ARG_VAL;
973                     } else {
974                         if (tag_arg_length > 0) {
975                             tag_arg[tag_arg_length] = '\0';
976                             html_tag_arg_add(&tag_args, tag_arg, NULL);
977                         }
978                         tag_arg_length = 0;
979                         state          = HTML_TAG_ARG;
980                         next_state     = HTML_BAD_STATE;
981                     }
982                     break;
983                 case HTML_TAG_ARG_VAL:
984                     if ((tag_val_length == 5) && (strncmp(tag_val, "data:", 5) == 0)) {
985                         /* RFC2397 inline data */
986 
987                         /* Rewind one byte so we don't recursive */
988                         if (file_buff_o2 && (file_buff_o2->length > 0)) {
989                             file_buff_o2->length--;
990                         }
991 
992                         if (quoted != NOT_QUOTED) {
993                             html_output_c(file_buff_o2, '"');
994                         }
995                         tag_val_length = 0;
996                         state          = HTML_RFC2397_TYPE;
997                         next_state     = HTML_TAG_ARG;
998                     } else if ((tag_val_length == 6) && (strncmp(tag_val, "\"data:", 6) == 0)) {
999                         /* RFC2397 inline data */
1000 
1001                         /* Rewind one byte so we don't recursive */
1002                         if (file_buff_o2 && (file_buff_o2->length > 0)) {
1003                             file_buff_o2->length--;
1004                         }
1005 
1006                         if (quoted != NOT_QUOTED) {
1007                             html_output_c(file_buff_o2, '"');
1008                         }
1009 
1010                         tag_val_length = 0;
1011                         state          = HTML_RFC2397_TYPE;
1012                         next_state     = HTML_TAG_ARG;
1013                     } else if (*ptr == '&') {
1014                         state      = HTML_CHAR_REF;
1015                         next_state = HTML_TAG_ARG_VAL;
1016                         ptr++;
1017                     } else if (*ptr == '\'') {
1018                         if (tag_val_length == 0) {
1019                             quoted = SINGLE_QUOTED;
1020                             html_output_c(file_buff_o2, '"');
1021                             if (tag_val_length < HTML_STR_LENGTH) {
1022                                 tag_val[tag_val_length++] = '"';
1023                             }
1024                             ptr++;
1025                         } else {
1026                             if (!escape && (quoted == SINGLE_QUOTED)) {
1027                                 html_output_c(file_buff_o2, '"');
1028                                 if (tag_val_length < HTML_STR_LENGTH) {
1029                                     tag_val[tag_val_length++] = '"';
1030                                 }
1031                                 tag_val[tag_val_length] = '\0';
1032                                 html_tag_arg_add(&tag_args, tag_arg, tag_val);
1033                                 ptr++;
1034                                 state          = HTML_SKIP_WS;
1035                                 tag_arg_length = 0;
1036                                 next_state     = HTML_TAG_ARG;
1037                             } else {
1038                                 html_output_c(file_buff_o2, '"');
1039                                 if (tag_val_length < HTML_STR_LENGTH) {
1040                                     tag_val[tag_val_length++] = '"';
1041                                 }
1042                                 ptr++;
1043                             }
1044                         }
1045                     } else if (*ptr == '"') {
1046                         if (tag_val_length == 0) {
1047                             quoted = DOUBLE_QUOTED;
1048                             html_output_c(file_buff_o2, '"');
1049                             if (tag_val_length < HTML_STR_LENGTH) {
1050                                 tag_val[tag_val_length++] = '"';
1051                             }
1052                             ptr++;
1053                         } else {
1054                             if (!escape && (quoted == DOUBLE_QUOTED)) {
1055                                 html_output_c(file_buff_o2, '"');
1056                                 if (tag_val_length < HTML_STR_LENGTH) {
1057                                     tag_val[tag_val_length++] = '"';
1058                                 }
1059                                 tag_val[tag_val_length] = '\0';
1060                                 html_tag_arg_add(&tag_args, tag_arg, tag_val);
1061                                 ptr++;
1062                                 state          = HTML_SKIP_WS;
1063                                 tag_arg_length = 0;
1064                                 next_state     = HTML_TAG_ARG;
1065                             } else {
1066                                 html_output_c(file_buff_o2, '"');
1067                                 if (tag_val_length < HTML_STR_LENGTH) {
1068                                     tag_val[tag_val_length++] = '"';
1069                                 }
1070                                 ptr++;
1071                             }
1072                         }
1073                     } else if (isspace(*ptr) || (*ptr == '>')) {
1074                         if (quoted == NOT_QUOTED) {
1075                             tag_val[tag_val_length] = '\0';
1076                             html_tag_arg_add(&tag_args, tag_arg, tag_val);
1077                             state          = HTML_SKIP_WS;
1078                             tag_arg_length = 0;
1079                             next_state     = HTML_TAG_ARG;
1080                         } else {
1081                             html_output_c(file_buff_o2, *ptr);
1082                             if (tag_val_length < HTML_STR_LENGTH) {
1083                                 if (isspace(*ptr)) {
1084                                     tag_val[tag_val_length++] = ' ';
1085                                 } else {
1086                                     tag_val[tag_val_length++] = '>';
1087                                 }
1088                             }
1089                             state      = HTML_SKIP_WS;
1090                             escape     = FALSE;
1091                             quoted     = NOT_QUOTED;
1092                             next_state = HTML_TAG_ARG_VAL;
1093                             ptr++;
1094                         }
1095                     } else {
1096                         if (mbchar2 && (*ptr < 0x80 || mbchar2 >= 0x10000)) {
1097                             if (mbchar2 == 0xE38082 || mbchar2 == 0xEFBC8E || mbchar2 == 0xEFB992 ||
1098                                 (mbchar2 == 0xA1 && (*ptr == 0x43 || *ptr == 0x44 || *ptr == 0x4F))) {
1099                                 html_output_c(file_buff_o2, '.');
1100                                 if (tag_val_length < HTML_STR_LENGTH)
1101                                     tag_val[tag_val_length++] = '.';
1102                                 if (mbchar2 == 0xA1) {
1103                                     ptr++;
1104                                     mbchar2 = 0;
1105                                     continue;
1106                                 }
1107                             } else {
1108                                 uint8_t c0 = mbchar2 >> 16;
1109                                 uint8_t c1 = (mbchar2 >> 8) & 0xff;
1110                                 uint8_t c2 = (mbchar2 & 0xff);
1111                                 if (c0)
1112                                     html_output_c(file_buff_o2, c0);
1113                                 if (c0 || c1)
1114                                     html_output_c(file_buff_o2, c1);
1115                                 html_output_c(file_buff_o2, c2);
1116                                 if (c0 && tag_val_length < HTML_STR_LENGTH)
1117                                     tag_val[tag_val_length++] = c0;
1118                                 if ((c0 || c1) && tag_val_length < HTML_STR_LENGTH)
1119                                     tag_val[tag_val_length++] = c1;
1120                                 if (tag_val_length < HTML_STR_LENGTH)
1121                                     tag_val[tag_val_length++] = c2;
1122                             }
1123                             mbchar2 = 0;
1124                         }
1125                         if (*ptr >= 0x80)
1126                             mbchar2 = (mbchar2 << 8) | *ptr;
1127                         else {
1128                             html_output_c(file_buff_o2, tolower(*ptr));
1129                             if (tag_val_length < HTML_STR_LENGTH) {
1130                                 tag_val[tag_val_length++] = *ptr;
1131                             }
1132                         }
1133                         ptr++;
1134                     }
1135 
1136                     if (*ptr == '\\') {
1137                         escape = TRUE;
1138                     } else {
1139                         escape = FALSE;
1140                     }
1141                     break;
1142                 case HTML_COMMENT:
1143                     if (in_script && !isspace(*ptr)) {
1144                         unsigned char c = tolower(*ptr);
1145                         /* dump script to nocomment.html, since we no longer have
1146 					 * comment.html/script.html */
1147                         if (c == '\'') c = '"';
1148                         html_output_c(file_buff_o2, c);
1149                     }
1150                     if (*ptr == '>') {
1151                         state      = HTML_SKIP_WS;
1152                         next_state = HTML_NORM;
1153                     }
1154                     ptr++;
1155                     break;
1156                 case HTML_PROCESS_TAG:
1157 
1158                     /* Default to no action for this tag */
1159                     state      = HTML_SKIP_WS;
1160                     next_state = HTML_NORM;
1161                     if (tag[0] == '/') {
1162                         /* End tag */
1163                         state      = HTML_SKIP_WS;
1164                         next_state = HTML_NORM;
1165                         if (strcmp(tag, "/script") == 0) {
1166                             in_script = FALSE;
1167                             if (js_state) {
1168                                 js_end = ptr;
1169                                 js_process(js_state, js_begin, js_end, line, ptr, in_script, dirname);
1170                                 js_state = NULL;
1171                                 js_begin = js_end = NULL;
1172                             }
1173                             /*don't output newlines in nocomment.html
1174 						 * html_output_c(file_buff_o2, '\n');*/
1175                         }
1176                         if (hrefs && hrefs->scanContents && in_ahref) {
1177                             if (strcmp(tag, "/a") == 0) {
1178                                 html_tag_contents_done(hrefs, in_ahref, &contents);
1179                                 in_ahref = 0; /* we are no longer inside an <a href>
1180 							nesting <a> tags not supported, and shouldn't be supported*/
1181                             }
1182                             href_contents_begin = ptr;
1183                         }
1184                         if (strcmp(tag, "/form") == 0) {
1185                             if (in_form_action)
1186                                 free(in_form_action);
1187                             in_form_action = NULL;
1188                         }
1189                     } else if (strcmp(tag, "script") == 0) {
1190                         arg_value = html_tag_arg_value(&tag_args, "language");
1191                         /* TODO: maybe we can output all tags only via html_output_tag */
1192                         if (arg_value && (strcasecmp((const char *)arg_value, "jscript.encode") == 0)) {
1193                             html_tag_arg_set(&tag_args, "language", "javascript");
1194                             state      = HTML_SKIP_WS;
1195                             next_state = HTML_JSDECODE;
1196                             /* we already output the old tag, output the new tag now */
1197                             html_output_tag(file_buff_o2, tag, &tag_args);
1198                         } else if (arg_value && (strcasecmp((const char *)arg_value, "vbscript.encode") == 0)) {
1199                             html_tag_arg_set(&tag_args, "language", "vbscript");
1200                             state      = HTML_SKIP_WS;
1201                             next_state = HTML_JSDECODE;
1202                             /* we already output the old tag, output the new tag now */
1203                             html_output_tag(file_buff_o2, tag, &tag_args);
1204                         }
1205                         in_script = TRUE;
1206                         if (dconf_js && !js_state) {
1207                             js_state = cli_js_init();
1208                             if (!js_state) {
1209                                 cli_dbgmsg("htmlnorm: Failed to initialize js parser\n");
1210                             }
1211                             js_begin = ptr;
1212                             js_end   = NULL;
1213                         }
1214                     } else if (strcmp(tag, "%@") == 0) {
1215                         arg_value = html_tag_arg_value(&tag_args, "language");
1216                         if (arg_value && (strcasecmp((const char *)arg_value, "jscript.encode") == 0 ||
1217                                           strcasecmp((const char *)arg_value, "vbscript.encode") == 0)) {
1218 
1219                             saved_next_state = next_state;
1220                             next_state       = state;
1221                             look_for_screnc  = FALSE;
1222                             state            = HTML_LOOKFOR_SCRENC;
1223                         }
1224                     } else if (hrefs) {
1225                         if (in_ahref && !href_contents_begin)
1226                             href_contents_begin = ptr;
1227                         if (strcmp(tag, "a") == 0) {
1228                             arg_value = html_tag_arg_value(&tag_args, "href");
1229                             if (arg_value && strlen((const char *)arg_value) > 0) {
1230                                 if (hrefs->scanContents) {
1231                                     char *arg_value_title = html_tag_arg_value(&tag_args, "title");
1232                                     /*beginning of an <a> tag*/
1233                                     if (in_ahref)
1234                                         /*we encountered nested <a> tags, pretend previous closed*/
1235                                         if (href_contents_begin) {
1236                                             html_tag_contents_append(&contents, href_contents_begin, ptrend);
1237                                             /*add pending contents between tags*/
1238                                             html_tag_contents_done(hrefs, in_ahref, &contents);
1239                                             in_ahref = 0;
1240                                         }
1241                                     if (arg_value_title) {
1242                                         /* title is a 'displayed link'*/
1243                                         html_tag_arg_add(hrefs, "href_title", arg_value_title);
1244                                         html_tag_contents_append(&contents, (const unsigned char *)arg_value,
1245                                                                  (const unsigned char *)arg_value + strlen(arg_value));
1246                                         html_tag_contents_done(hrefs, hrefs->count, &contents);
1247                                     }
1248                                     if (in_form_action) {
1249                                         /* form action is the real URL, and href is the 'displayed' */
1250                                         html_tag_arg_add(hrefs, "form", arg_value);
1251                                         contents.pos = 0;
1252                                         html_tag_contents_append(&contents, in_form_action,
1253                                                                  in_form_action + strlen((const char *)in_form_action));
1254                                         html_tag_contents_done(hrefs, hrefs->count, &contents);
1255                                     }
1256                                 }
1257                                 html_tag_arg_add(hrefs, "href", arg_value);
1258                                 if (hrefs->scanContents) {
1259                                     in_ahref            = hrefs->count; /* index of this tag (counted from 1) */
1260                                     href_contents_begin = ptr;          /* contents begin after <a ..> ends */
1261                                     contents.pos        = 0;
1262                                 }
1263                             }
1264                         } else if (strcmp(tag, "form") == 0 && hrefs->scanContents) {
1265                             const char *arg_action_value = html_tag_arg_value(&tag_args, "action");
1266                             if (arg_action_value) {
1267                                 if (in_form_action)
1268                                     free(in_form_action);
1269                                 in_form_action = (unsigned char *)cli_strdup(arg_action_value);
1270                             }
1271                         } else if (strcmp(tag, "img") == 0) {
1272                             arg_value = html_tag_arg_value(&tag_args, "src");
1273                             if (arg_value && strlen(arg_value) > 0) {
1274                                 html_tag_arg_add(hrefs, "src", arg_value);
1275                                 if (hrefs->scanContents && in_ahref)
1276                                     /* "contents" of an img tag, is the URL of its parent <a> tag */
1277                                     hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_strdup((const char *)hrefs->value[in_ahref - 1]);
1278                                 if (in_form_action) {
1279                                     /* form action is the real URL, and href is the 'displayed' */
1280                                     html_tag_arg_add(hrefs, "form", arg_value);
1281                                     contents.pos = 0;
1282                                     html_tag_contents_append(&contents, in_form_action,
1283                                                              in_form_action + strlen((const char *)in_form_action));
1284                                     html_tag_contents_done(hrefs, hrefs->count, &contents);
1285                                 }
1286                             }
1287                             arg_value = html_tag_arg_value(&tag_args, "dynsrc");
1288                             if (arg_value && strlen(arg_value) > 0) {
1289                                 html_tag_arg_add(hrefs, "dynsrc", arg_value);
1290                                 if (hrefs->scanContents && in_ahref)
1291                                     /* see above */
1292                                     hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_strdup((const char *)hrefs->value[in_ahref - 1]);
1293                                 if (in_form_action) {
1294                                     /* form action is the real URL, and href is the 'displayed' */
1295                                     html_tag_arg_add(hrefs, "form", arg_value);
1296                                     contents.pos = 0;
1297                                     html_tag_contents_append(&contents, in_form_action,
1298                                                              in_form_action + strlen((const char *)in_form_action));
1299                                     html_tag_contents_done(hrefs, hrefs->count, &contents);
1300                                 }
1301                             }
1302                         } else if (strcmp(tag, "iframe") == 0) {
1303                             arg_value = html_tag_arg_value(&tag_args, "src");
1304                             if (arg_value && strlen(arg_value) > 0) {
1305                                 html_tag_arg_add(hrefs, "iframe", arg_value);
1306                                 if (hrefs->scanContents && in_ahref)
1307                                     /* see above */
1308                                     hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_strdup((const char *)hrefs->value[in_ahref - 1]);
1309                                 if (in_form_action) {
1310                                     /* form action is the real URL, and href is the 'displayed' */
1311                                     html_tag_arg_add(hrefs, "form", arg_value);
1312                                     contents.pos = 0;
1313                                     html_tag_contents_append(&contents, in_form_action,
1314                                                              in_form_action + strlen((const char *)in_form_action));
1315                                     html_tag_contents_done(hrefs, hrefs->count, &contents);
1316                                 }
1317                             }
1318                         } else if (strcmp(tag, "area") == 0) {
1319                             arg_value = html_tag_arg_value(&tag_args, "href");
1320                             if (arg_value && strlen(arg_value) > 0) {
1321                                 html_tag_arg_add(hrefs, "area", arg_value);
1322                                 if (hrefs->scanContents && in_ahref)
1323                                     /* see above */
1324                                     hrefs->contents[hrefs->count - 1] = (unsigned char *)cli_strdup((const char *)hrefs->value[in_ahref - 1]);
1325                                 if (in_form_action) {
1326                                     /* form action is the real URL, and href is the 'displayed' */
1327                                     html_tag_arg_add(hrefs, "form", arg_value);
1328                                     contents.pos = 0;
1329                                     html_tag_contents_append(&contents, in_form_action,
1330                                                              in_form_action + strlen((const char *)in_form_action));
1331                                     html_tag_contents_done(hrefs, hrefs->count, &contents);
1332                                 }
1333                             }
1334                         }
1335                         /* TODO:imagemaps can have urls too */
1336                     } else if (strcmp(tag, "a") == 0) {
1337                         /* a/img tags for buff_text can be processed only if we're not processing hrefs */
1338                         arg_value = html_tag_arg_value(&tag_args, "href");
1339                         if (arg_value && arg_value[0]) {
1340                             html_output_str(file_buff_text, (const unsigned char *)arg_value, strlen((const char *)arg_value));
1341                             html_output_c(file_buff_text, ' ');
1342                             text_space_written = TRUE;
1343                         }
1344                     } else if (strcmp(tag, "img") == 0) {
1345                         arg_value = html_tag_arg_value(&tag_args, "src");
1346                         if (arg_value && arg_value[0]) {
1347                             html_output_str(file_buff_text, (const unsigned char *)arg_value, strlen((const char *)arg_value));
1348                             html_output_c(file_buff_text, ' ');
1349                             text_space_written = TRUE;
1350                         }
1351                     }
1352                     html_tag_arg_free(&tag_args);
1353                     break;
1354                 case HTML_CHAR_REF:
1355                     if (*ptr == '#') {
1356                         value = 0;
1357                         hex   = FALSE;
1358                         state = HTML_CHAR_REF_DECODE;
1359                         ptr++;
1360                     } else {
1361                         if (dconf_entconv)
1362                             state = HTML_ENTITY_REF_DECODE;
1363                         else {
1364                             if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1365                                 tag_val[tag_val_length++] = '&';
1366                             }
1367                             html_output_c(file_buff_o2, '&');
1368 
1369                             state      = next_state;
1370                             next_state = HTML_BAD_STATE;
1371                         }
1372                     }
1373                     break;
1374                 case HTML_ENTITY_REF_DECODE:
1375                     if (*ptr == ';') {
1376                         size_t i;
1377                         const char *normalized;
1378                         entity_val[entity_val_length] = '\0';
1379                         normalized                    = entity_norm(&conv, entity_val);
1380                         if (normalized) {
1381                             for (i = 0; i < strlen(normalized); i++) {
1382                                 const unsigned char c = normalized[i] & 0xff;
1383                                 html_output_c(file_buff_o2, c);
1384                                 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1385                                     tag_val[tag_val_length++] = c;
1386                                 }
1387                             }
1388                         } else {
1389                             html_output_c(file_buff_o2, '&');
1390                             if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1391                                 tag_val[tag_val_length++] = '&';
1392                             }
1393                             for (i = 0; i < entity_val_length; i++) {
1394                                 const char c = tolower(entity_val[i]);
1395                                 html_output_c(file_buff_o2, c);
1396                                 if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1397                                     tag_val[tag_val_length++] = c;
1398                                 }
1399                             }
1400                             if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1401                                 tag_val[tag_val_length++] = ';';
1402                             }
1403                             html_output_c(file_buff_o2, ';');
1404                         }
1405                         entity_val_length = 0;
1406                         state             = next_state;
1407                         next_state        = HTML_BAD_STATE;
1408                         ptr++;
1409                     } else if ((isalnum(*ptr) || *ptr == '_' || *ptr == ':' || (*ptr == '-')) && entity_val_length < HTML_STR_LENGTH) {
1410                         entity_val[entity_val_length++] = *ptr++;
1411                     } else {
1412                         /* entity too long, or not valid, dump it */
1413                         size_t i;
1414                         if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1415                             tag_val[tag_val_length++] = '&';
1416                         }
1417                         html_output_c(file_buff_o2, '&');
1418                         for (i = 0; i < entity_val_length; i++) {
1419                             const char c = tolower(entity_val[i]);
1420                             html_output_c(file_buff_o2, c);
1421                             if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1422                                 tag_val[tag_val_length++] = c;
1423                             }
1424                         }
1425 
1426                         state             = next_state;
1427                         next_state        = HTML_BAD_STATE;
1428                         entity_val_length = 0;
1429                     }
1430                     break;
1431                 case HTML_CHAR_REF_DECODE:
1432                     if ((value == 0) && ((*ptr == 'x') || (*ptr == 'X'))) {
1433                         hex = TRUE;
1434                         ptr++;
1435                     } else if (*ptr == ';') {
1436                         if (next_state == HTML_TAG_ARG_VAL && tag_val_length < HTML_STR_LENGTH) {
1437                             tag_val[tag_val_length++] = value; /* store encoded values too */
1438                         }
1439                         if (dconf_entconv) {
1440 
1441                             if (value < 0x80)
1442                                 html_output_c(file_buff_o2, tolower(value));
1443                             else {
1444                                 unsigned char buff[10];
1445                                 unsigned char *out = u16_normalize_tobuffer(value, buff, 10);
1446                                 if (out && out > buff) {
1447                                     html_output_str(file_buff_o2, buff, out - buff - 1);
1448                                 }
1449                             }
1450                         } else
1451                             html_output_c(file_buff_o2, tolower(value & 0xff));
1452                         state      = next_state;
1453                         next_state = HTML_BAD_STATE;
1454                         ptr++;
1455                     } else if (isdigit(*ptr) || (hex && isxdigit(*ptr))) {
1456                         int64_t increment = 0;
1457 
1458                         if (hex && value < INT64_MAX / 16) {
1459                             value *= 16;
1460                         } else if (value < INT64_MAX / 10) {
1461                             value *= 10;
1462                         } else {
1463                             html_output_c(file_buff_o2, value);
1464                             state      = next_state;
1465                             next_state = HTML_BAD_STATE;
1466                             ptr++;
1467                             break;
1468                         }
1469                         if (isdigit(*ptr)) {
1470                             increment = *ptr - '0';
1471                         } else {
1472                             increment = tolower(*ptr) - 'a' + 10;
1473                         }
1474                         if (value > INT64_MAX - increment) {
1475                             /* Addition would result in integer overflow. */
1476                             html_output_c(file_buff_o2, value);
1477                             state      = next_state;
1478                             next_state = HTML_BAD_STATE;
1479                             ptr++;
1480                             break;
1481                         }
1482                         value += increment;
1483                         ptr++;
1484                     } else {
1485                         html_output_c(file_buff_o2, value);
1486                         state      = next_state;
1487                         next_state = HTML_BAD_STATE;
1488                     }
1489                     break;
1490                 case HTML_LOOKFOR_SCRENC:
1491                     look_for_screnc = TRUE;
1492                     ptr_screnc      = (unsigned char *)strstr((char *)ptr, "#@~^");
1493                     if (ptr_screnc) {
1494                         ptr_screnc[0] = '/';
1495                         ptr_screnc[1] = '/';
1496                         ptr_screnc += 4;
1497                     }
1498                     state      = next_state;
1499                     next_state = saved_next_state;
1500                     break;
1501                 case HTML_JSDECODE:
1502                     /* Check for start marker */
1503                     if (strncmp((const char *)ptr, "#@~^", 4) == 0) {
1504                         ptr[0] = '/';
1505                         ptr[1] = '/';
1506                         ptr += 4;
1507                         state      = HTML_JSDECODE_LENGTH;
1508                         next_state = HTML_BAD_STATE;
1509                     } else {
1510                         html_output_c(file_buff_o2, tolower(*ptr));
1511                         ptr++;
1512                     }
1513                     break;
1514                 case HTML_JSDECODE_LENGTH:
1515                     if (strlen((const char *)ptr) < 8) {
1516                         state      = HTML_NORM;
1517                         next_state = HTML_BAD_STATE;
1518                         break;
1519                     }
1520                     memset(&screnc_state, 0, sizeof(screnc_state));
1521                     screnc_state.length = base64_chars[ptr[0]] < 0 ? 0 : base64_chars[ptr[0]] << 2;
1522                     screnc_state.length += base64_chars[ptr[1]] >> 4;
1523                     screnc_state.length += (base64_chars[ptr[1]] & 0x0f) << 12;
1524                     screnc_state.length += ((base64_chars[ptr[2]] >> 2) < 0 ? 0 : (base64_chars[ptr[2]] >> 2)) << 8;
1525                     screnc_state.length += (base64_chars[ptr[2]] & 0x03) << 22;
1526                     screnc_state.length += base64_chars[ptr[3]] < 0 ? 0 : base64_chars[ptr[3]] << 16;
1527                     screnc_state.length += (base64_chars[ptr[4]] < 0 ? 0 : base64_chars[ptr[4]] << 2) << 24;
1528                     screnc_state.length += ((base64_chars[ptr[5]] >> 4) < 0 ? 0 : (base64_chars[ptr[5]] >> 4)) << 24;
1529                     state      = HTML_JSDECODE_DECRYPT;
1530                     in_screnc  = TRUE;
1531                     next_state = HTML_BAD_STATE;
1532                     /* for JS normalizer */
1533                     ptr[7] = '\n';
1534                     ptr += 8;
1535                     break;
1536                 case HTML_JSDECODE_DECRYPT:
1537                     screnc_decode(ptr, &screnc_state);
1538                     if (!screnc_state.length) {
1539                         state      = HTML_NORM;
1540                         next_state = HTML_BAD_STATE;
1541                         in_screnc  = FALSE;
1542                         break;
1543                     } else {
1544                         state      = HTML_NORM;
1545                         next_state = HTML_BAD_STATE;
1546                     }
1547                     break;
1548                 case HTML_RFC2397_TYPE:
1549                     if (*ptr == '\'') {
1550                         if (!escape && (quoted == SINGLE_QUOTED)) {
1551                             /* Early end of data detected. Error */
1552                             ptr++;
1553                             state          = HTML_SKIP_WS;
1554                             tag_arg_length = 0;
1555                             next_state     = HTML_TAG_ARG;
1556                         } else {
1557                             if (tag_val_length < HTML_STR_LENGTH) {
1558                                 tag_val[tag_val_length++] = '"';
1559                             }
1560                             ptr++;
1561                         }
1562                     } else if (*ptr == '"') {
1563                         if (!escape && (quoted == DOUBLE_QUOTED)) {
1564                             /* Early end of data detected. Error */
1565                             ptr++;
1566                             state          = HTML_SKIP_WS;
1567                             tag_arg_length = 0;
1568                             next_state     = HTML_TAG_ARG;
1569                         } else {
1570                             if (tag_val_length < HTML_STR_LENGTH) {
1571                                 tag_val[tag_val_length++] = '"';
1572                             }
1573                             ptr++;
1574                         }
1575                     } else if (isspace(*ptr) || (*ptr == '>')) {
1576                         if (quoted == NOT_QUOTED) {
1577                             /* Early end of data detected. Error */
1578                             state          = HTML_SKIP_WS;
1579                             tag_arg_length = 0;
1580                             next_state     = HTML_TAG_ARG;
1581                         } else {
1582                             if (tag_val_length < HTML_STR_LENGTH) {
1583                                 if (isspace(*ptr)) {
1584                                     tag_val[tag_val_length++] = ' ';
1585                                 } else {
1586                                     tag_val[tag_val_length++] = '>';
1587                                 }
1588                             }
1589                             state      = HTML_SKIP_WS;
1590                             escape     = FALSE;
1591                             quoted     = NOT_QUOTED;
1592                             next_state = HTML_RFC2397_TYPE;
1593                             ptr++;
1594                         }
1595                     } else if (*ptr == ',') {
1596                         /* Beginning of data */
1597                         tag_val[tag_val_length] = '\0';
1598                         state                   = HTML_RFC2397_INIT;
1599                         escape                  = FALSE;
1600                         next_state              = HTML_BAD_STATE;
1601                         ptr++;
1602 
1603                     } else {
1604                         if (tag_val_length < HTML_STR_LENGTH) {
1605                             tag_val[tag_val_length++] = tolower(*ptr);
1606                         }
1607                         ptr++;
1608                     }
1609                     if (*ptr == '\\') {
1610                         escape = TRUE;
1611                     } else {
1612                         escape = FALSE;
1613                     }
1614                     break;
1615                 case HTML_RFC2397_INIT:
1616                     if (dirname) {
1617                         STATBUF statbuf;
1618 
1619                         if (NULL != file_tmp_o1) {
1620                             if (file_tmp_o1->fd != -1) {
1621                                 html_output_flush(file_tmp_o1);
1622                                 close(file_tmp_o1->fd);
1623                                 file_tmp_o1->fd = -1;
1624                             }
1625                             free(file_tmp_o1);
1626                         }
1627 
1628                         file_tmp_o1 = (file_buff_t *)cli_malloc(sizeof(file_buff_t));
1629                         if (!file_tmp_o1) {
1630                             cli_errmsg("cli_html_normalise: Unable to allocate memory for file_tmp_o1\n");
1631                             goto abort;
1632                         }
1633                         file_tmp_o1->fd = -1;
1634 
1635                         /* Create rfc2397 directory if it doesn't already exist */
1636                         snprintf(filename, 1024, "%s" PATHSEP "rfc2397", dirname);
1637                         if (LSTAT(filename, &statbuf) == -1) {
1638                             if (mkdir(filename, 0700) && errno != EEXIST) {
1639                                 cli_errmsg("Failed to create directory: %s\n", dirname);
1640                                 goto abort;
1641                             }
1642                         }
1643 
1644                         tmp_file = cli_gentemp(filename);
1645                         if (!tmp_file) {
1646                             goto abort;
1647                         }
1648                         cli_dbgmsg("RFC2397 data file: %s\n", tmp_file);
1649                         file_tmp_o1->fd = open(tmp_file, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);
1650                         free(tmp_file);
1651                         if (file_tmp_o1->fd < 0) {
1652                             cli_dbgmsg("open failed: %s\n", filename);
1653                             goto abort;
1654                         }
1655                         file_tmp_o1->length = 0;
1656 
1657                         html_output_str(file_tmp_o1, (const unsigned char *)"From html-normalise\n", 20);
1658                         html_output_str(file_tmp_o1, (const unsigned char *)"Content-type: ", 14);
1659                         if ((tag_val_length == 0) && (*tag_val == ';')) {
1660                             html_output_str(file_tmp_o1, (const unsigned char *)"text/plain\n", 11);
1661                         }
1662                         html_output_str(file_tmp_o1, (const unsigned char *)tag_val, tag_val_length);
1663                         html_output_c(file_tmp_o1, '\n');
1664                         if (strstr(tag_val, ";base64") != NULL) {
1665                             html_output_str(file_tmp_o1, (const unsigned char *)"Content-transfer-encoding: base64\n", 34);
1666                         }
1667                         html_output_c(file_tmp_o1, '\n');
1668                     } else {
1669                         file_tmp_o1 = NULL;
1670                     }
1671                     state  = HTML_RFC2397_DATA;
1672                     binary = TRUE;
1673                     break;
1674                 case HTML_RFC2397_DATA:
1675                     if (*ptr == '&') {
1676                         state      = HTML_CHAR_REF;
1677                         next_state = HTML_RFC2397_DATA;
1678                         ptr++;
1679                     } else if (*ptr == '%') {
1680                         length     = 0;
1681                         value      = 0;
1682                         state      = HTML_ESCAPE_CHAR;
1683                         next_state = HTML_RFC2397_ESC;
1684                         ptr++;
1685                     } else if (*ptr == '\'') {
1686                         if (!escape && (quoted == SINGLE_QUOTED)) {
1687                             state = HTML_RFC2397_FINISH;
1688                             ptr++;
1689                         } else {
1690                             html_output_c(file_tmp_o1, *ptr);
1691                             ptr++;
1692                         }
1693                     } else if (*ptr == '\"') {
1694                         if (!escape && (quoted == DOUBLE_QUOTED)) {
1695                             state = HTML_RFC2397_FINISH;
1696                             ptr++;
1697                         } else {
1698                             html_output_c(file_tmp_o1, *ptr);
1699                             ptr++;
1700                         }
1701                     } else if (isspace(*ptr) || (*ptr == '>')) {
1702                         if (quoted == NOT_QUOTED) {
1703                             state = HTML_RFC2397_FINISH;
1704                             ptr++;
1705                         } else {
1706                             html_output_c(file_tmp_o1, *ptr);
1707                             ptr++;
1708                         }
1709                     } else {
1710                         html_output_c(file_tmp_o1, *ptr);
1711                         ptr++;
1712                     }
1713                     if (*ptr == '\\') {
1714                         escape = TRUE;
1715                     } else {
1716                         escape = FALSE;
1717                     }
1718                     break;
1719                 case HTML_RFC2397_FINISH:
1720                     if (file_tmp_o1) {
1721                         if (file_tmp_o1->fd != -1) {
1722                             html_output_flush(file_tmp_o1);
1723                             close(file_tmp_o1->fd);
1724                             file_tmp_o1->fd = -1;
1725                         }
1726                         free(file_tmp_o1);
1727                         file_tmp_o1 = NULL;
1728                     }
1729                     state      = HTML_SKIP_WS;
1730                     escape     = FALSE;
1731                     quoted     = NOT_QUOTED;
1732                     next_state = HTML_TAG_ARG;
1733                     binary     = FALSE;
1734                     break;
1735                 case HTML_RFC2397_ESC:
1736                     if (length == 2) {
1737                         html_output_c(file_tmp_o1, value);
1738                     } else if (length == 1) {
1739                         html_output_c(file_tmp_o1, '%');
1740                         html_output_c(file_tmp_o1, value + '0');
1741                     } else {
1742                         html_output_c(file_tmp_o1, '%');
1743                     }
1744                     state = HTML_RFC2397_DATA;
1745                     break;
1746                 case HTML_ESCAPE_CHAR:
1747                     if (value < INT64_MAX / 16) {
1748                         value *= 16;
1749                     } else {
1750                         state      = next_state;
1751                         next_state = HTML_BAD_STATE;
1752                         ptr++;
1753                         break;
1754                     }
1755                     length++;
1756                     if (isxdigit(*ptr)) {
1757                         if (isdigit(*ptr)) {
1758                             value += (*ptr - '0');
1759                         } else {
1760                             value += (tolower(*ptr) - 'a' + 10);
1761                         }
1762                     } else {
1763                         state = next_state;
1764                     }
1765                     if (length == 2) {
1766                         state = next_state;
1767                     }
1768                     ptr++;
1769                     break;
1770             }
1771         }
1772         if (hrefs && hrefs->scanContents && in_ahref && href_contents_begin)
1773             /* end of line, append contents now, resume on next line */
1774             html_tag_contents_append(&contents, href_contents_begin, ptr);
1775         ptrend = NULL;
1776 
1777         if (js_state) {
1778             js_process(js_state, js_begin, js_end, line, ptr, in_script, dirname);
1779             js_begin = js_end = NULL;
1780             if (!in_script) {
1781                 js_state = NULL;
1782             }
1783         }
1784         if (look_for_screnc && ptr_screnc) {
1785             /* start found, and stuff before it already processed */
1786             ptr        = ptr_screnc;
1787             ptr_screnc = NULL;
1788             state      = HTML_JSDECODE_LENGTH;
1789             next_state = HTML_BAD_STATE;
1790             continue;
1791         }
1792         free(line);
1793         ptr = line = cli_readchunk(stream_in, m_area, 8192);
1794         if (in_screnc) {
1795             state      = HTML_JSDECODE_DECRYPT;
1796             next_state = HTML_BAD_STATE;
1797         } else if (look_for_screnc && !ptr_screnc &&
1798                    state != HTML_LOOKFOR_SCRENC) {
1799             saved_next_state = next_state;
1800             next_state       = state;
1801             state            = HTML_LOOKFOR_SCRENC;
1802         }
1803         if (next_state == state) {
1804             /* safeguard against infloop */
1805             cli_dbgmsg("htmlnorm.c: next_state == state, changing next_state\n");
1806             next_state = HTML_BAD_STATE;
1807         }
1808     }
1809 
1810     if (dconf_entconv) {
1811         /* handle "unfinished" entities */
1812         size_t i;
1813         const char *normalized;
1814         entity_val[entity_val_length] = '\0';
1815         normalized                    = entity_norm(&conv, entity_val);
1816         if (normalized) {
1817             for (i = 0; i < strlen(normalized); i++)
1818                 html_output_c(file_buff_o2, normalized[i] & 0xff);
1819         } else {
1820             if (entity_val_length) {
1821                 html_output_c(file_buff_o2, '&');
1822                 for (i = 0; i < entity_val_length; i++)
1823                     html_output_c(file_buff_o2, tolower(entity_val[i]));
1824             }
1825         }
1826     }
1827     retval = TRUE;
1828 abort:
1829     if (line) /* only needed for abort case */
1830         free(line);
1831     if (in_form_action)
1832         free(in_form_action);
1833     if (in_ahref) /* tag not closed, force closing */
1834         html_tag_contents_done(hrefs, in_ahref, &contents);
1835 
1836     if (js_state) {
1837         /*  output script so far */
1838         cli_js_parse_done(js_state);
1839         cli_js_output(js_state, dirname);
1840         cli_js_destroy(js_state);
1841         js_state = NULL;
1842     }
1843     html_tag_arg_free(&tag_args);
1844     if (!m_area) {
1845         fclose(stream_in);
1846     }
1847     if (file_buff_o2) {
1848         html_output_flush(file_buff_o2);
1849         if (file_buff_o2->fd != -1)
1850             close(file_buff_o2->fd);
1851         free(file_buff_o2);
1852     }
1853     if (file_buff_text) {
1854         html_output_flush(file_buff_text);
1855         if (file_buff_text->fd != -1)
1856             close(file_buff_text->fd);
1857         free(file_buff_text);
1858         file_buff_text = NULL;
1859     }
1860     if (file_tmp_o1) {
1861         if (file_tmp_o1->fd != -1) {
1862             html_output_flush(file_tmp_o1);
1863             close(file_tmp_o1->fd);
1864         }
1865         free(file_tmp_o1);
1866     }
1867     return retval;
1868 }
1869 
/*
 * Normalise HTML that lives in a caller-supplied memory buffer.
 *
 * The buffer (in_buff, in_size bytes) is described by an m_area_t that has
 * no backing fmap, and that area is handed to cli_html_normalise() with -1
 * as its first argument. dirname, hrefs and dconf are forwarded unchanged.
 *
 * Returns the value produced by cli_html_normalise().
 */
int html_normalise_mem(unsigned char *in_buff, off_t in_size, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    m_area_t area;

    /* Plain memory source: no map, reading starts at the first byte. */
    area.map    = NULL;
    area.offset = 0;
    area.length = in_size;
    area.buffer = in_buff;

    return cli_html_normalise(-1, &area, dirname, hrefs, dconf);
}
1881 
/*
 * Normalise HTML that is backed by an fmap.
 *
 * Builds an m_area_t covering the whole map (offset 0, length map->len) and
 * passes it to cli_html_normalise() with -1 as its first argument. dirname,
 * hrefs and dconf are forwarded unchanged.
 *
 * Returns the value produced by cli_html_normalise().
 */
int html_normalise_map(fmap_t *map, const char *dirname, tag_arguments_t *hrefs, const struct cli_dconf *dconf)
{
    m_area_t area;

    /* Map-backed source: the buffer member is left unset, as before. */
    area.map    = map;
    area.offset = 0;
    area.length = map->len;

    return cli_html_normalise(-1, &area, dirname, hrefs, dconf);
}
1893 
/*
 * Find a Script Encoder block in `map`, decode it, and write the result to
 * "<dirname>/screnc.html" wrapped in "<script>" ... "</script>".
 *
 * The map is read in chunks of up to 8192 bytes. Each chunk is searched on
 * its own for the "#@~^" start marker; chunks without it are discarded.
 * The eight characters after the marker form the header: the first six are
 * fed through base64_chars[] to compute the encoded payload length, the
 * last two are skipped. The remaining data is passed through
 * screnc_decode() and written out chunk by chunk until the announced length
 * is consumed or the input runs out (the latter is only logged).
 *
 * Returns TRUE once the header has been read and the output written.
 * Returns FALSE if the output file cannot be opened, no start marker is
 * found, or the input ends inside the header; in the last two cases the
 * output file has already been created and is left in place.
 */
int html_screnc_decode(fmap_t *map, const char *dirname)
{
    int count, retval = FALSE;
    unsigned char *line = NULL, tmpstr[6];
    unsigned char *ptr = NULL, filename[1024];
    int ofd;
    struct screnc_state screnc_state;
    m_area_t m_area;

    memset(&m_area, 0, sizeof(m_area));
    m_area.length = map->len;
    m_area.offset = 0;
    m_area.map    = map;

    snprintf((char *)filename, sizeof(filename), "%s" PATHSEP "screnc.html", dirname);
    ofd = open((const char *)filename, O_WRONLY | O_CREAT | O_TRUNC, S_IWUSR | S_IRUSR);

    if (ofd < 0) {
        cli_dbgmsg("open failed: %s\n", filename);
        return FALSE;
    }

    /* Skip chunks until one contains the start marker. */
    while ((line = cli_readchunk(NULL, &m_area, 8192)) != NULL) {
        ptr = (unsigned char *)strstr((char *)line, "#@~^");
        if (ptr) {
            break;
        }
        free(line);
        line = NULL;
    }
    if (!line) {
        goto abort;
    }

    /*
     * Collect the header that follows the marker. It may straddle a chunk
     * boundary, so a fresh chunk is pulled in whenever the current one is
     * exhausted. Eight characters are consumed; only the first six are kept.
     */
    ptr += 4;
    count = 0;
    do {
        if (!*ptr) {
            free(line);
            ptr = line = cli_readchunk(NULL, &m_area, 8192);
            if (!line) {
                goto abort;
            }
        }
        if (count < 6)
            tmpstr[count] = *ptr;
        count++;
        ptr++;
    } while (count < 8);

    /* Calculate the length of the encoded string */
    memset(&screnc_state, 0, sizeof(screnc_state));
    screnc_state.length = base64_chars[tmpstr[0]] < 0 ? 0 : base64_chars[tmpstr[0]] << 2;
    screnc_state.length += base64_chars[tmpstr[1]] >> 4;
    screnc_state.length += (base64_chars[tmpstr[1]] & 0x0f) << 12;
    screnc_state.length += ((base64_chars[tmpstr[2]] >> 2) < 0 ? 0 : (base64_chars[tmpstr[2]] >> 2)) << 8;
    screnc_state.length += (base64_chars[tmpstr[2]] & 0x03) << 22;
    screnc_state.length += base64_chars[tmpstr[3]] < 0 ? 0 : base64_chars[tmpstr[3]] << 16;
    /*
     * This term can reach 252 << 24, which does not fit in a signed int, so
     * the shift is done on an unsigned 32-bit value to avoid undefined
     * behaviour. The low 32 bits of the result are unchanged.
     */
    screnc_state.length += (uint32_t)(base64_chars[tmpstr[4]] < 0 ? 0 : base64_chars[tmpstr[4]] << 2) << 24;
    screnc_state.length += ((base64_chars[tmpstr[5]] >> 4) < 0 ? 0 : (base64_chars[tmpstr[5]] >> 4)) << 24;

    cli_writen(ofd, "<script>", strlen("<script>"));
    /*
     * Decode in place and emit one chunk at a time. `line` is released after
     * every pass and only refilled while payload bytes are still expected,
     * so the loop stops on either a zero remaining length or end of input.
     */
    while (screnc_state.length && line) {
        screnc_decode(ptr, &screnc_state);
        cli_writen(ofd, ptr, strlen((const char *)ptr));
        free(line);
        line = NULL;
        if (screnc_state.length) {
            ptr = line = cli_readchunk(NULL, &m_area, 8192);
        }
    }
    cli_writen(ofd, "</script>", strlen("</script>"));
    if (screnc_state.length)
        cli_dbgmsg("html_screnc_decode: missing %u bytes\n", screnc_state.length);
    retval = TRUE;

abort:
    close(ofd);
    if (line) {
        free(line);
    }
    return retval;
}
1976