1 /*
2  Copyright (C) 2015-2017 Alexander Borisov
3 
4  This library is free software; you can redistribute it and/or
5  modify it under the terms of the GNU Lesser General Public
6  License as published by the Free Software Foundation; either
7  version 2.1 of the License, or (at your option) any later version.
8 
9  This library is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  Lesser General Public License for more details.
13 
14  You should have received a copy of the GNU Lesser General Public
15  License along with this library; if not, write to the Free Software
16  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 
18  Author: lex.borisov@gmail.com (Alexander Borisov)
19 */
20 
21 #include "myencoding/encoding.h"
22 #include "myencoding/detect_resource.h"
23 #include "mycore/utils/resources.h"
24 
myencoding_detect_by_trigram(unsigned const char * u_text,size_t length,const myencoding_trigram_t * list,size_t list_length,size_t max_sum_for_break)25 myencoding_trigram_result_t myencoding_detect_by_trigram(unsigned const char *u_text, size_t length,
26                                                                    const myencoding_trigram_t *list, size_t list_length,
27                                                                    size_t max_sum_for_break)
28 {
29     myencoding_trigram_result_t res = {0, 0};
30 
31     for (size_t i = 0; i < (length - 3); i++) {
32         if(u_text[i] > 127)
33         {
34             for (size_t j = 0; j < list_length; j++)
35             {
36                 if(memcmp(list[j].trigram, &u_text[i], 3) == 0) {
37                     res.value += list[j].value;
38                     res.count++;
39 
40                     if(res.value >= max_sum_for_break)
41                         i = length;
42 
43                     break;
44                 }
45             }
46         }
47     }
48 
49     return res;
50 }
51 
/*
 * Decide whether a trigram score is conclusive.
 *
 * Returns true when either the accumulated value reached `min_value` or the
 * number of matched trigrams reached `min_count`; false otherwise.
 */
bool myencoding_detect_russian_has_end(myencoding_trigram_result_t *res, size_t min_count, size_t min_value)
{
    return (res->value >= min_value || res->count >= min_count);
}
59 
/*
 * Decide whether the good/bad sequence counters are acceptable.
 *
 * Returns true when no bad sequence was counted at all, false when only bad
 * sequences were counted, and otherwise true only if
 * (count_bad * 100) / count_good is strictly below `max_bad_percent`.
 */
bool myencoding_detect_unicode_has_end(myencoding_unicode_result_t *res, size_t max_bad_percent)
{
    /* nothing malformed: accepted whether or not anything good was seen */
    if(res->count_bad == 0)
        return true;

    /* malformed sequences only */
    if(res->count_good == 0)
        return false;

    /* integer ratio of bad to good sequences, in percent */
    size_t bad_ratio = (res->count_bad * 100) / res->count_good;

    return (bad_ratio < max_bad_percent);
}
77 
myencoding_detect_utf_8(unsigned const char * u_text,size_t length)78 myencoding_unicode_result_t myencoding_detect_utf_8(unsigned const char *u_text, size_t length)
79 {
80     size_t i = 0;
81     myencoding_unicode_result_t res = {0, 0, 0};
82 
83     while(i < length)
84     {
85         if((u_text[i] & 0x80) == 0x00) {
86             i++;
87             res.count_ascii++;
88         }
89         else if((u_text[i] & 0xE0) == 0xC0) {
90             i += 2;
91 
92             if(i >= length)
93                 break;
94 
95             if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0))
96                 res.count_good++;
97             else
98                 res.count_bad++;
99         }
100         else if((u_text[i] & 0xF0) == 0xE0) {
101             i += 3;
102 
103             if(i >= length)
104                 break;
105 
106             if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0) &&
107                ((u_text[i - 2] & 0x80) && (u_text[i - 2] & 0x40) == 0)) {
108                 res.count_good++;
109             }
110             else
111                 res.count_bad++;
112         }
113         else if((u_text[i] & 0xF8) == 0xF0) {
114             i += 4;
115 
116             if(i >= length)
117                 break;
118 
119             if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0) &&
120                ((u_text[i - 2] & 0x80) && (u_text[i - 2] & 0x40) == 0) &&
121                ((u_text[i - 3] & 0x80) && (u_text[i - 3] & 0x40) == 0)) {
122                 res.count_good++;
123             }
124             else
125                 res.count_bad++;
126         }
127         else {
128             i++;
129             res.count_bad++;
130         }
131     }
132 
133     return res;
134 }
135 
myencoding_detect_utf_16(unsigned const char * u_text,size_t length)136 myencoding_unicode_result_t myencoding_detect_utf_16(unsigned const char *u_text, size_t length)
137 {
138     size_t i = 0;
139     myencoding_unicode_result_t res = {0, 0, 0};
140 
141     while(i < length)
142     {
143         if(u_text[i] == 0x00) {
144             if((i % 2) == 0) {
145                 i++;
146 
147                 if(u_text[i] > 0x1F && u_text[i] < 0x7F)
148                     res.count_bad++;
149             }
150             else {
151                 if(u_text[(i - 1)] > 0x1F && u_text[(i - 1)] < 0x7F)
152                     res.count_good++;
153 
154                 i++;
155             }
156         }
157         else
158             i++;
159     }
160 
161     return res;
162 }
163 
/*
 * Recognise a byte order mark at the start of `text`.
 *
 *   EF BB BF -> MyENCODING_UTF_8
 *   FE FF    -> MyENCODING_UTF_16BE
 *   FF FE    -> MyENCODING_UTF_16LE
 *
 * UTF-32 marks are not recognised; input starting with FF FE 00 00 is
 * therefore reported as UTF-16LE.
 *
 * Returns true and stores the encoding when a mark is present; otherwise
 * returns false and leaves *encoding untouched.
 */
bool myencoding_detect_bom(const char *text, size_t length, myencoding_t *encoding)
{
    const unsigned char *bytes = (const unsigned char *)text;

    /* the shortest mark is two bytes long */
    if(length < 2)
        return false;

    if(length >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
        *encoding = MyENCODING_UTF_8;
        return true;
    }

    switch (bytes[0]) {
        case 0xFE:
            if(bytes[1] == 0xFF) {
                *encoding = MyENCODING_UTF_16BE;
                return true;
            }
            break;

        case 0xFF:
            if(bytes[1] == 0xFE) {
                *encoding = MyENCODING_UTF_16LE;
                return true;
            }
            break;

        default:
            break;
    }

    return false;
}
213 
/*
 * Detect a byte order mark and report the text that follows it.
 *
 * When myencoding_detect_bom() finds a mark, *new_text is set to the first
 * byte after it (3 bytes for UTF-8, 2 bytes otherwise), *new_size to the
 * remaining length, and true is returned. Without a mark the function returns
 * false and does not write to new_text / new_size.
 */
bool myencoding_detect_and_cut_bom(const char *text, size_t length, myencoding_t *encoding, const char **new_text, size_t *new_size)
{
    if(myencoding_detect_bom(text, length, encoding) == false)
        return false;

    size_t bom_size = (*encoding == MyENCODING_UTF_8) ? 3 : 2;

    *new_text = &text[bom_size];
    *new_size = length - bom_size;

    return true;
}
232 
/*
 * Guess whether `text` is UTF-16LE, UTF-16BE or UTF-8.
 *
 * The UTF-16 zero-byte hints are checked first: at least three "good" hints
 * and no "bad" one selects UTF-16LE, the opposite selects UTF-16BE. Otherwise
 * the UTF-8 counters are evaluated with a bad-sequence limit of 10 percent.
 *
 * Returns true with *encoding set on success; on failure returns false and
 * *encoding holds MyENCODING_DEFAULT.
 */
bool myencoding_detect_unicode(const char *text, size_t length, myencoding_t *encoding)
{
    const unsigned char *bytes = (const unsigned char *)text;
    myencoding_unicode_result_t stat;

    *encoding = MyENCODING_DEFAULT;

    stat = myencoding_detect_utf_16(bytes, length);

    if(stat.count_good >= 3 && stat.count_bad == 0) {
        *encoding = MyENCODING_UTF_16LE;
        return true;
    }

    if(stat.count_good == 0 && stat.count_bad >= 3) {
        *encoding = MyENCODING_UTF_16BE;
        return true;
    }

    stat = myencoding_detect_utf_8(bytes, length);

    if(myencoding_detect_unicode_has_end(&stat, 10) == false)
        return false;

    *encoding = MyENCODING_UTF_8;
    return true;
}
257 
/*
 * Guess which Cyrillic single-byte encoding `text` uses.
 *
 * The text is scored against the trigram tables for windows-1251, KOI8-R,
 * ISO-8859-5, x-mac-cyrillic and IBM866, in that order. The first table whose
 * score is conclusive (see myencoding_detect_russian_has_end()) wins and the
 * function returns true.
 *
 * If no table is conclusive the function returns false; *encoding then holds
 * the candidate with the highest non-zero value (the earliest one on ties), or
 * MyENCODING_DEFAULT when nothing matched at all.
 */
bool myencoding_detect_russian(const char *text, size_t length, myencoding_t *encoding)
{
    const unsigned char *bytes = (const unsigned char *)text;

    const size_t min_count  = 50;
    const size_t min_value  = 100000;
    const size_t table_size = 1000; /* entry count passed for every table */

    /* candidates in the order they are tried */
    const struct {
        const myencoding_trigram_t *trigrams;
        myencoding_t                encoding;
    }
    candidates[] = {
        {myencoding_detect_trigrams_index_windows_1251,   MyENCODING_WINDOWS_1251},
        {myencoding_detect_trigrams_index_koi8_r,         MyENCODING_KOI8_R},
        {myencoding_detect_trigrams_index_iso_8859_5,     MyENCODING_ISO_8859_5},
        {myencoding_detect_trigrams_index_x_mac_cyrillic, MyENCODING_X_MAC_CYRILLIC},
        {myencoding_detect_trigrams_index_ibm866,         MyENCODING_IBM866}
    };

    size_t best_value = 0;

    *encoding = MyENCODING_DEFAULT;

    for (size_t k = 0; k < (sizeof(candidates) / sizeof(candidates[0])); k++) {
        myencoding_trigram_result_t score =
            myencoding_detect_by_trigram(bytes, length, candidates[k].trigrams, table_size, min_value);

        if(myencoding_detect_russian_has_end(&score, min_count, min_value)) {
            *encoding = candidates[k].encoding;
            return true;
        }

        /* remember the best inconclusive candidate seen so far */
        if(best_value < score.value) {
            best_value = score.value;
            *encoding  = candidates[k].encoding;
        }
    }

    return false;
}
325 
/*
 * Guess the encoding of `text`: the Unicode heuristics are tried first, then
 * the Cyrillic trigram heuristics.
 *
 * Returns true as soon as one of them is conclusive; *encoding is written by
 * the detectors in either case.
 */
bool myencoding_detect(const char *text, size_t length, myencoding_t *encoding)
{
    *encoding = MyENCODING_DEFAULT;

    /* || short-circuits: the second detector only runs if the first fails */
    return myencoding_detect_unicode(text, length, encoding) ||
           myencoding_detect_russian(text, length, encoding);
}
338 
myencoding_name_entry_by_name(const char * name,size_t length)339 const myencoding_detect_name_entry_t * myencoding_name_entry_by_name(const char* name, size_t length)
340 {
341     size_t idx = ((mycore_string_chars_lowercase_map[ (const unsigned char)name[0] ] *
342                    mycore_string_chars_lowercase_map[ (const unsigned char)name[(length - 1)] ] *
343                    length)
344                   % MyENCODING_DETECT_NAME_STATIC_SIZE) + 1;
345 
346     while (myencoding_detect_name_entry_static_list_index[idx].label)
347     {
348         if(myencoding_detect_name_entry_static_list_index[idx].label_length == length) {
349             if(mycore_strncasecmp(myencoding_detect_name_entry_static_list_index[idx].label, name, length) == 0)
350                 return &myencoding_detect_name_entry_static_list_index[idx];
351 
352             if(myencoding_detect_name_entry_static_list_index[idx].next)
353                 idx = myencoding_detect_name_entry_static_list_index[idx].next;
354             else
355                 return NULL;
356         }
357         else if(myencoding_detect_name_entry_static_list_index[idx].label_length > length) {
358             return NULL;
359         }
360         else {
361             idx = myencoding_detect_name_entry_static_list_index[idx].next;
362         }
363     }
364 
365     return NULL;
366 }
367 
/*
 * Resolve an encoding label to its encoding id.
 *
 * Returns true when the label is known; the id is stored in *encoding unless
 * `encoding` is NULL. Returns false (nothing written) for unknown labels.
 */
bool myencoding_by_name(const char *name, size_t length, myencoding_t *encoding)
{
    const myencoding_detect_name_entry_t *entry = myencoding_name_entry_by_name(name, length);

    if(entry == NULL)
        return false;

    if(encoding != NULL)
        *encoding = entry->encoding;

    return true;
}
381 
/*
 * Return the name stored for an encoding id.
 *
 * encoding  id to look up
 * length    optional; receives the name length, or 0 when the id is rejected
 *
 * Returns the name from the static name index, or NULL when `encoding` is
 * MyENCODING_LAST_ENTRY or greater.
 */
const char * myencoding_name_by_id(myencoding_t encoding, size_t *length)
{
    const myencoding_entry_name_index_t *entry = NULL;

    if(encoding < MyENCODING_LAST_ENTRY)
        entry = &myencoding_entry_name_index_static_list_index[encoding];

    if(length != NULL)
        *length = (entry != NULL) ? entry->length : 0;

    return (entry != NULL) ? entry->name : NULL;
}
400 
401 /*
402   When an algorithm requires a user agent to prescan a byte stream to determine its encoding,
403   given some defined end condition, then it must run the following steps.
404   These steps either abort unsuccessfully or return a character encoding.
405   If at any point during these steps (including during instances of the get an attribute algorithm invoked by this one)
406   the user agent either runs out of bytes (meaning the position pointer created in the first step below goes beyond the end of the byte stream obtained so far)
407   or reaches its end condition, then abort the prescan a byte stream to determine its encoding algorithm unsuccessfully.
408 */
/*
 * Extract an encoding from a string that contains "charset=<label>".
 *
 * Shorthand for
 * myencoding_extracting_character_encoding_from_charset_with_found() for
 * callers that do not need the position of the label: both "found"
 * out-parameters are passed as NULL.
 */
bool myencoding_extracting_character_encoding_from_charset(const char *data, size_t data_size, myencoding_t *encoding)
{
    return myencoding_extracting_character_encoding_from_charset_with_found(data, data_size, encoding,
                                                                            NULL, NULL);
}
413 
/* True for the bytes treated as whitespace here: TAB, LF, FF, CR, SPACE. */
static bool myencoding_charset_value_is_whitespace(unsigned char c)
{
    return (c == 0x09 || c == 0x0A || c == 0x0C || c == 0x0D || c == 0x20);
}

/*
 * Extract an encoding from a string that contains "charset=<label>"
 * (for example the value of a meta "content" attribute).
 *
 * The first case-insensitive "charset" that is followed by optional whitespace
 * and '=' is used. The label is either enclosed in matching " or ' quotes, or
 * runs up to the first whitespace byte, ';' or the end of the data.
 *
 * data, data_size  string to parse (need not be NUL terminated)
 * encoding         receives the encoding; MyENCODING_NOT_DETERMINED on failure
 * found            optional; receives a pointer to the label inside `data`
 * found_length     optional; receives the label length
 *
 * `found` / `found_length` are filled in as soon as a label was located, even
 * if the label turns out to be unknown or empty.
 *
 * Returns true only when a label was located and names a known encoding.
 */
bool myencoding_extracting_character_encoding_from_charset_with_found(const char *data, size_t data_size, myencoding_t *encoding, const char **found, size_t *found_length)
{
    *encoding = MyENCODING_NOT_DETERMINED;

    if(found)
        *found = NULL;
    if(found_length)
        *found_length = 0;

    const unsigned char *udata = (const unsigned char *)data;
    const size_t charset_length = strlen("charset");

    size_t length = 0;
    bool is_get_pos = false;

    /* locate "charset", optional whitespace, '=' */
    while((length + charset_length) < data_size) {
        if(mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"charset", &udata[length]) == false) {
            length++;
            continue;
        }

        length += charset_length;

        while(length < data_size && myencoding_charset_value_is_whitespace(udata[length]))
            length++;

        /*
         * The whitespace may have run to the end of the data, so the index
         * has to be checked before the byte is read.
         */
        if(length < data_size && udata[length] == 0x3D) { /* EQUALS SIGN (=) */
            is_get_pos = true;

            length++;
            break;
        }

        /*
         * No '=' here: resume the search at the current byte instead of
         * skipping it, so that input such as "charsetcharset=x" is found.
         */
    }

    if(is_get_pos == false)
        return false;

    /* optional whitespace between '=' and the label */
    while(length < data_size && myencoding_charset_value_is_whitespace(udata[length]))
        length++;

    if(length >= data_size)
        return false;

    size_t begin;
    const unsigned char first = udata[length];

    if(first == 0x22 || first == 0x27) {
        /* quoted label: everything up to the matching quote */
        length++;
        begin = length;

        while(length < data_size && udata[length] != first)
            length++;

        /* no closing quote */
        if(length >= data_size)
            return false;
    }
    else {
        /* unquoted label: ends at whitespace, ';' or the end of the data */
        begin = length;

        while(length < data_size && udata[length] != 0x3B &&
              myencoding_charset_value_is_whitespace(udata[length]) == false)
        {
            length++;
        }
    }

    if(found)
        *found = &data[begin];
    if(found_length)
        *found_length = (length - begin);

    /* an empty label (charset="" or charset=;) cannot name an encoding */
    if(length == begin)
        return false;

    return myencoding_by_name(&data[begin], (length - begin), encoding);
}
558 
/*
 * Handle the gap between an attribute name and a possible '='.
 *
 * On entry *data_length is the position where the attribute name stopped;
 * attr->key_length is set to the distance between that position and
 * attr->key_begin. TAB, LF, FF, CR, SPACE and '/' bytes are then skipped.
 *
 * Returns true when the next byte is '=' (and *data_length is moved just past
 * it). Returns false when the data ended or another byte follows; in that case
 * *data_length is the position where scanning stopped.
 */
bool myencoding_prescan_stream_to_determine_encoding_get_attr_spaces(const unsigned char *udata, size_t *data_length, size_t data_size, myencoding_detect_attr_t *attr)
{
    size_t pos = *data_length;

    /* set position */
    attr->key_length = pos - attr->key_begin;

    /* 6 */
    for(; pos < data_size; pos++) {
        const unsigned char c = udata[pos];

        if((c == 0x09 || c == 0x0A || c == 0x0C ||
            c == 0x0D || c == 0x20 || c == 0x2F) == false)
        {
            break;
        }
    }

    /* 7, 8 */
    if(pos < data_size && udata[pos] == 0x3D) {
        *data_length = (pos + 1);
        return true;
    }

    *data_length = pos;
    return false;
}
592 
/*
 * Read an attribute value starting at `length` (the position after '=').
 *
 * Leading TAB, LF, FF, CR and SPACE bytes are skipped. The value is either
 * enclosed in " or ' quotes, or runs up to the next whitespace byte or '>'.
 * attr->value_begin / attr->value_length describe the value inside udata.
 *
 * *it_last is set to true when no further attribute can follow: the data ran
 * out before a value started, or a '>' was consumed (either instead of a
 * value or as the terminator of an unquoted value).
 *
 * Returns the position at which scanning has to continue.
 */
size_t myencoding_prescan_stream_to_determine_encoding_get_attr_value(const unsigned char *udata, size_t length, size_t data_size, myencoding_detect_attr_t *attr, bool *it_last)
{
    /* 9 */
    while(length < data_size) {
        if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
           udata[length] != 0x0D && udata[length] != 0x20)
        {
            break;
        }

        length++;
    }

    if(length >= data_size) {
        *it_last = true;
        return length;
    }

    /* 10 */
    switch (udata[length]) {
        case 0x22: /* (ASCII ") */
        case 0x27: /* (ASCII ') */
        {
            const unsigned char quote = udata[length];

            length++;
            attr->value_begin = length;

            while(length < data_size) {
                if(udata[length] == quote)
                {
                    attr->value_length = length - attr->value_begin;
                    return (length + 1);
                }

                length++;
            }

            break;
        }

        case 0x3E: /* (ASCII >) */
            *it_last = true;
            return (length + 1);

        default:
            attr->value_begin = length;

            while(length < data_size) {
                const unsigned char c = udata[length];

                if(c == 0x3E) {
                    /*
                     * The '>' that ends the value also ends the tag. Without
                     * this flag callers kept reading "attributes" from the
                     * markup that follows the tag.
                     */
                    attr->value_length = length - attr->value_begin;
                    *it_last = true;

                    return (length + 1);
                }

                if(c == 0x09 || c == 0x0A || c == 0x0C || c == 0x0D || c == 0x20) {
                    attr->value_length = length - attr->value_begin;
                    return (length + 1);
                }

                length++;
            }

            break;
    }

    /* data ended inside the value: keep what was read */
    attr->value_length = length - attr->value_begin;
    return length;
}

/*
 * Continue after the end of an attribute name: skip the gap, and read the
 * value if a '=' follows. `length` is the position where the name stopped.
 *
 * Without '=' the attribute has no value; *it_last is set only when the data
 * ran out, so that a valueless attribute does not end the attribute list.
 */
static size_t myencoding_prescan_stream_to_determine_encoding_get_attr_after_name(const unsigned char *udata, size_t length, size_t data_size, myencoding_detect_attr_t *attr, bool *it_last)
{
    if(myencoding_prescan_stream_to_determine_encoding_get_attr_spaces(udata, &length, data_size, attr) == false) {
        if(length >= data_size)
            *it_last = true;

        return length;
    }

    return myencoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last);
}

/*
 * Read one attribute (name and optional value) starting at `length`.
 *
 * `attr` is zeroed first and then receives the offsets and lengths of the
 * name (key_begin / key_length) and the value (value_begin / value_length)
 * inside udata. *it_last is set to true when the end of the tag ('>') or the
 * end of the data was reached, i.e. when no further attribute can be read.
 *
 * Returns the position at which scanning has to continue; the position always
 * advances while `length` is below `data_size`.
 */
size_t myencoding_prescan_stream_to_determine_encoding_get_attr(const unsigned char *udata, size_t length, size_t data_size, myencoding_detect_attr_t *attr, bool *it_last)
{
    memset(attr, 0, sizeof(myencoding_detect_attr_t));

    /*
     If the byte at position is one of 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR),
     0x20 (ASCII space), or 0x2F (ASCII /) then advance position to the next byte and redo this step.
     */
    /* 1 */
    while(length < data_size) {
        if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
           udata[length] != 0x0D && udata[length] != 0x20 && udata[length] != 0x2F)
        {
            break;
        }

        length++;
    }

    if(length >= data_size) {
        *it_last = true;
        return length;
    }

    /* 2 */
    if(udata[length] == 0x3E) { /* (ASCII >) */
        *it_last = true;
        return (length + 1);
    }

    attr->key_begin = length;

    /* 3, 4 */
    while(length < data_size) {
        switch (udata[length]) {
            case 0x3D: /* (ASCII =) */
                /*
                 * '=' ends the name only if the name is not empty; a leading
                 * '=' is part of the name. (The former test compared against
                 * length - 1 and mishandled one-byte names.)
                 */
                if(length > attr->key_begin) {
                    attr->key_length = length - attr->key_begin;

                    return myencoding_prescan_stream_to_determine_encoding_get_attr_value(udata, (length + 1), data_size, attr, it_last);
                }

                break;

            case 0x09: /* (ASCII TAB)   */
            case 0x0A: /* (ASCII LF)    */
            case 0x0C: /* (ASCII FF)    */
            case 0x0D: /* (ASCII CR)    */
            case 0x20: /* (ASCII space) */
                /*
                 * The name ends in front of this byte. The position is passed
                 * on unchanged so that key_length does not include the
                 * whitespace byte; the gap handling skips it.
                 */
                return myencoding_prescan_stream_to_determine_encoding_get_attr_after_name(udata, length, data_size, attr, it_last);

            case 0x2F: /* (ASCII /)     */
                /* valueless attribute; more attributes may follow the slash */
                attr->key_length = length - attr->key_begin;
                return (length + 1);

            case 0x3E: /* (ASCII >)     */
                *it_last = true;
                attr->key_length = length - attr->key_begin;

                return (length + 1);

            default:
                break;
        }

        length++;
    }

    /* the data ended inside the name */
    return myencoding_prescan_stream_to_determine_encoding_get_attr_after_name(udata, length, data_size, attr, it_last);
}
750 
/*
 * Process the attributes of a meta tag and try to derive an encoding.
 *
 * udata, data_size  the whole byte stream
 * length            in: position just after "<meta" and its delimiter;
 *                   out: position where attribute reading stopped
 * encoding          receives the encoding, MyENCODING_NOT_DETERMINED on failure
 * found             optional; receives a pointer to the encoding label
 * found_length      optional; receives the length of that label
 *
 * Only the first occurrence of "http-equiv", "content" and "charset" is used.
 * A "charset" attribute takes precedence over "content"; an encoding taken
 * from "content" is accepted only together with http-equiv="content-type".
 * UTF-16BE/LE are reported as UTF-8 and x-user-defined as windows-1252.
 *
 * Returns true when an encoding was determined.
 */
bool myencoding_prescan_stream_to_determine_encoding_check_meta(const unsigned char *udata, size_t *length, size_t data_size, myencoding_t *encoding, const char **found, size_t *found_length)
{
    myencoding_detect_attr_t attr;

    bool got_pragma = false;
    bool it_last = false;

    /* true once the attribute that currently decides the result named a known encoding */
    bool charset_ok = false;

    unsigned int need_pragma = 0; /* 0 = NULL, 1 = false, 2 = true */

    /*
      http-equiv = 1
      content = 2
      charset = 4
     */
    /* If the attribute's name is already in attribute list, then return to the step labeled attributes. */
    size_t is_exists = 0;

    while(*length < data_size) {
        *length = myencoding_prescan_stream_to_determine_encoding_get_attr(udata, *length, data_size, &attr, &it_last);

        /* 9 */
        if(attr.key_length == strlen("http-equiv") &&
           mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"http-equiv", &udata[ attr.key_begin ]))
        {
            if((is_exists & 1) == 0) {
                is_exists |= 1;

                if(attr.value_length == strlen("content-type") &&
                   mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content-type", &udata[ attr.value_begin ]))
                {
                    got_pragma = true;
                }
            }
        }
        else if(attr.key_length == strlen("content") &&
                mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content", &udata[ attr.key_begin ]))
        {
            if((is_exists & 2) == 0) {
                is_exists |= 2;

                /*
                 * "content" counts only while no "charset" attribute has been
                 * seen. The extraction resets *encoding and *found, so running
                 * it afterwards would destroy the charset result.
                 */
                if(need_pragma == 0 &&
                   myencoding_extracting_character_encoding_from_charset_with_found((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding, found, found_length))
                {
                    need_pragma = 2;
                    charset_ok = true;
                }
            }
        }
        else if(attr.key_length == strlen("charset") &&
                mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"charset", &udata[ attr.key_begin ]))
        {
            if((is_exists & 4) == 0) {
                is_exists |= 4;

                if(found)
                    *found = (const char*)(&udata[ attr.value_begin ]);
                if(found_length)
                    *found_length = attr.value_length;

                /* a valueless charset attribute has no label to look up */
                charset_ok = (attr.value_length != 0) &&
                             myencoding_by_name((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding);
                need_pragma = 1;
            }
        }

        if(it_last)
            break;
    }

    /* 11, 12, 13: nothing declared, pragma missing, or unknown label */
    if(need_pragma == 0 || (need_pragma == 2 && got_pragma == false) || charset_ok == false) {
        *encoding = MyENCODING_NOT_DETERMINED;
        return false;
    }

    /* 14 */
    if(*encoding == MyENCODING_UTF_16BE || *encoding == MyENCODING_UTF_16LE) {
        *encoding = MyENCODING_UTF_8;
    }

    /* 15 */
    if(*encoding == MyENCODING_X_USER_DEFINED) {
        *encoding = MyENCODING_WINDOWS_1252;
    }

    /* 16 */
    return true;
}
835 
myencoding_prescan_stream_to_determine_encoding_skip_name(const unsigned char * udata,size_t length,size_t data_size)836 size_t myencoding_prescan_stream_to_determine_encoding_skip_name(const unsigned char *udata, size_t length, size_t data_size)
837 {
838     while(length < data_size) {
839         if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
840            udata[length] != 0x0D && udata[length] != 0x20)
841         {
842             break;
843         }
844 
845         length++;
846     }
847 
848     if(length >= data_size)
849         return length;
850 
851     if(udata[length] == 0x3E) {
852         return (length + 1);
853     }
854 
855     myencoding_detect_attr_t attr;
856     bool it_last = false;
857 
858     while(length < data_size) {
859         length = myencoding_prescan_stream_to_determine_encoding_get_attr(udata, length, data_size, &attr, &it_last);
860 
861         if(it_last) {
862             return length;
863         }
864     }
865 
866     return length;
867 }
868 
myencoding_prescan_stream_to_determine_encoding_skip_other(const unsigned char * udata,size_t length,size_t data_size)869 size_t myencoding_prescan_stream_to_determine_encoding_skip_other(const unsigned char *udata, size_t length, size_t data_size)
870 {
871     if(udata[length] == 0x2F) { /* / */
872         length++;
873 
874         if(length >= data_size)
875             return length;
876 
877         if(mycore_tokenizer_chars_map[ udata[length] ] == MyCORE_STRING_MAP_CHAR_A_Z_a_z) {
878             return myencoding_prescan_stream_to_determine_encoding_skip_name(udata, length, data_size);
879         }
880 
881         while(length < data_size) {
882             if(udata[length] != 0x3E) {
883                 return (length + 1);
884             }
885 
886             length++;
887         }
888 
889         return length;
890     }
891     else if(udata[length] == 0x21) { /* ! */
892         length++;
893 
894         if((length + 2) < data_size && udata[length] == 0x2D && udata[(length+1)] == 0x2D) {
895             while(length < data_size) {
896                 if(udata[length] != 0x3E) {
897                     if(udata[(length - 1)] == 0x2D && udata[(length - 2)] == 0x2D)
898                         return (length + 1);
899                 }
900 
901                 length++;
902             }
903 
904             return length;
905         }
906 
907         while(length < data_size) {
908             if(udata[length] != 0x3E) {
909                 return (length + 1);
910             }
911 
912             length++;
913         }
914 
915         return length;
916     }
917     else if(udata[length] == 0x3F) { /* ? */
918         length++;
919 
920         while(length < data_size) {
921             if(udata[length] != 0x3E) {
922                 return (length + 1);
923             }
924 
925             length++;
926         }
927 
928         return length;
929     }
930 
931 
932     return myencoding_prescan_stream_to_determine_encoding_skip_name(udata, length, data_size);
933 }
934 
myencoding_prescan_stream_to_determine_encoding(const char * data,size_t data_size)935 myencoding_t myencoding_prescan_stream_to_determine_encoding(const char *data, size_t data_size)
936 {
937     return myencoding_prescan_stream_to_determine_encoding_with_found(data, data_size, NULL, NULL);
938 }
939 
myencoding_prescan_stream_to_determine_encoding_with_found(const char * data,size_t data_size,const char ** found,size_t * found_length)940 myencoding_t myencoding_prescan_stream_to_determine_encoding_with_found(const char *data, size_t data_size, const char **found, size_t *found_length)
941 {
942     const unsigned char* udata = (const unsigned char*)data;
943     myencoding_t encoding = MyENCODING_NOT_DETERMINED;
944 
945     if(found)
946         *found = NULL;
947     if(found_length)
948         *found_length = 0;
949 
950     size_t i = 0;
951     while(i < data_size) {
952         /* 0x3C = '<' */
953         if(data[i] == 0x3C)
954         {
955             if((i + 5) >= data_size)
956                 return encoding;
957 
958             i++;
959 
960             switch (data[i]) {
961                     /*
962                      A sequence of bytes starting with:
963                      0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61,
964                      and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F
965                      (case-insensitive ASCII '<meta' followed by a space or slash)
966                      */
967                 case 0x4D:
968                 case 0x6D:
969                     if(mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"meta", &udata[i])) {
970                         i += 4;
971 
972                         if(udata[i] == 0x09 || udata[i] == 0x0A || udata[i] == 0x0C ||
973                            udata[i] == 0x0D || udata[i] == 0x20 || udata[i] == 0x2F)
974                         {
975                             i++;
976 
977                             if(myencoding_prescan_stream_to_determine_encoding_check_meta(udata, &i, data_size, &encoding, found, found_length))
978                                 return encoding;
979                         }
980                     }
981 
982                     break;
983 
984                 default:
985                     i = myencoding_prescan_stream_to_determine_encoding_skip_other(udata, i, data_size);
986                     break;
987             }
988         }
989         else {
990             i++;
991         }
992     }
993 
994     return encoding;
995 }
996 
997 
998