1 /*
2 Copyright (C) 2015-2017 Alexander Borisov
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License as published by the Free Software Foundation; either
7 version 2.1 of the License, or (at your option) any later version.
8
9 This library is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 Lesser General Public License for more details.
13
14 You should have received a copy of the GNU Lesser General Public
15 License along with this library; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
18 Author: lex.borisov@gmail.com (Alexander Borisov)
19 */
20
21 #include "myencoding/encoding.h"
22 #include "myencoding/detect_resource.h"
23 #include "mycore/utils/resources.h"
24
myencoding_detect_by_trigram(unsigned const char * u_text,size_t length,const myencoding_trigram_t * list,size_t list_length,size_t max_sum_for_break)25 myencoding_trigram_result_t myencoding_detect_by_trigram(unsigned const char *u_text, size_t length,
26 const myencoding_trigram_t *list, size_t list_length,
27 size_t max_sum_for_break)
28 {
29 myencoding_trigram_result_t res = {0, 0};
30
31 for (size_t i = 0; i < (length - 3); i++) {
32 if(u_text[i] > 127)
33 {
34 for (size_t j = 0; j < list_length; j++)
35 {
36 if(memcmp(list[j].trigram, &u_text[i], 3) == 0) {
37 res.value += list[j].value;
38 res.count++;
39
40 if(res.value >= max_sum_for_break)
41 i = length;
42
43 break;
44 }
45 }
46 }
47 }
48
49 return res;
50 }
51
/*
 * Tells whether a trigram score is strong enough to stop looking.
 *
 * Returns true when the accumulated `value` has reached `min_value` or the
 * number of matches `count` has reached `min_count`; false otherwise.
 */
bool myencoding_detect_russian_has_end(myencoding_trigram_result_t *res, size_t min_count, size_t min_value)
{
    const bool enough_value = (res->value >= min_value);
    const bool enough_count = (res->count >= min_count);

    return (enough_value || enough_count);
}
59
/*
 * Decides whether the good/bad sequence counters are acceptable.
 *
 * - no bad sequences at all          -> true
 * - bad sequences but no good ones   -> false
 * - otherwise the bad count is expressed as a percentage of the good count
 *   (integer division) and must be strictly below `max_bad_percent`.
 */
bool myencoding_detect_unicode_has_end(myencoding_unicode_result_t *res, size_t max_bad_percent)
{
    if(res->count_bad == 0)
        return true;

    /* From here on at least one bad sequence was seen. */
    if(res->count_good == 0)
        return false;

    size_t percent_bad = (res->count_bad * 100) / res->count_good;

    return (percent_bad < max_bad_percent);
}
77
myencoding_detect_utf_8(unsigned const char * u_text,size_t length)78 myencoding_unicode_result_t myencoding_detect_utf_8(unsigned const char *u_text, size_t length)
79 {
80 size_t i = 0;
81 myencoding_unicode_result_t res = {0, 0, 0};
82
83 while(i < length)
84 {
85 if((u_text[i] & 0x80) == 0x00) {
86 i++;
87 res.count_ascii++;
88 }
89 else if((u_text[i] & 0xE0) == 0xC0) {
90 i += 2;
91
92 if(i >= length)
93 break;
94
95 if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0))
96 res.count_good++;
97 else
98 res.count_bad++;
99 }
100 else if((u_text[i] & 0xF0) == 0xE0) {
101 i += 3;
102
103 if(i >= length)
104 break;
105
106 if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0) &&
107 ((u_text[i - 2] & 0x80) && (u_text[i - 2] & 0x40) == 0)) {
108 res.count_good++;
109 }
110 else
111 res.count_bad++;
112 }
113 else if((u_text[i] & 0xF8) == 0xF0) {
114 i += 4;
115
116 if(i >= length)
117 break;
118
119 if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0) &&
120 ((u_text[i - 2] & 0x80) && (u_text[i - 2] & 0x40) == 0) &&
121 ((u_text[i - 3] & 0x80) && (u_text[i - 3] & 0x40) == 0)) {
122 res.count_good++;
123 }
124 else
125 res.count_bad++;
126 }
127 else {
128 i++;
129 res.count_bad++;
130 }
131 }
132
133 return res;
134 }
135
myencoding_detect_utf_16(unsigned const char * u_text,size_t length)136 myencoding_unicode_result_t myencoding_detect_utf_16(unsigned const char *u_text, size_t length)
137 {
138 size_t i = 0;
139 myencoding_unicode_result_t res = {0, 0, 0};
140
141 while(i < length)
142 {
143 if(u_text[i] == 0x00) {
144 if((i % 2) == 0) {
145 i++;
146
147 if(u_text[i] > 0x1F && u_text[i] < 0x7F)
148 res.count_bad++;
149 }
150 else {
151 if(u_text[(i - 1)] > 0x1F && u_text[(i - 1)] < 0x7F)
152 res.count_good++;
153
154 i++;
155 }
156 }
157 else
158 i++;
159 }
160
161 return res;
162 }
163
/*
 * Recognises a byte order mark at the very start of `text`.
 *
 *   EF BB BF -> MyENCODING_UTF_8
 *   FE FF    -> MyENCODING_UTF_16BE
 *   FF FE    -> MyENCODING_UTF_16LE
 *
 * On a match `*encoding` is set and true is returned. Without a match false
 * is returned and `*encoding` is left untouched.
 *
 * UTF-32 marks (00 00 FE FF and FF FE 00 00) are not checked; a buffer that
 * starts with FF FE therefore always reports UTF-16LE.
 */
bool myencoding_detect_bom(const char *text, size_t length, myencoding_t *encoding)
{
    const unsigned char *bytes = (const unsigned char *)text;

    if(length >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
        *encoding = MyENCODING_UTF_8;
        return true;
    }

    if(length < 2)
        return false;

    if(bytes[0] == 0xFE && bytes[1] == 0xFF) {
        *encoding = MyENCODING_UTF_16BE;
        return true;
    }

    if(bytes[0] == 0xFF && bytes[1] == 0xFE) {
        *encoding = MyENCODING_UTF_16LE;
        return true;
    }

    return false;
}
213
/*
 * Detects a byte order mark and reports where the text behind it starts.
 *
 * When myencoding_detect_bom() finds a mark, `*new_text` / `*new_size`
 * describe the input with the mark removed (three bytes for UTF-8, two
 * bytes for everything else) and true is returned. Otherwise false is
 * returned and `*new_text` / `*new_size` are not written.
 */
bool myencoding_detect_and_cut_bom(const char *text, size_t length, myencoding_t *encoding, const char **new_text, size_t *new_size)
{
    if(myencoding_detect_bom(text, length, encoding) == false)
        return false;

    const size_t bom_size = (*encoding == MyENCODING_UTF_8) ? 3 : 2;

    *new_text = &text[bom_size];
    *new_size = length - bom_size;

    return true;
}
232
/*
 * Tries to recognise UTF-16 (either byte order) or UTF-8 from raw bytes.
 *
 * `*encoding` starts out as MyENCODING_DEFAULT. The UTF-16 statistics are
 * consulted first: at least three "good" hits and no "bad" ones select
 * UTF-16LE, the mirror image selects UTF-16BE. Failing that, the UTF-8
 * statistics are accepted when myencoding_detect_unicode_has_end() approves
 * them with a limit of 10 percent.
 *
 * Returns true when one of the three encodings was chosen.
 */
bool myencoding_detect_unicode(const char *text, size_t length, myencoding_t *encoding)
{
    const unsigned char *bytes = (const unsigned char *)text;
    myencoding_unicode_result_t stat;

    *encoding = MyENCODING_DEFAULT;

    stat = myencoding_detect_utf_16(bytes, length);

    if(stat.count_good >= 3 && stat.count_bad == 0) {
        *encoding = MyENCODING_UTF_16LE;
        return true;
    }

    if(stat.count_good == 0 && stat.count_bad >= 3) {
        *encoding = MyENCODING_UTF_16BE;
        return true;
    }

    stat = myencoding_detect_utf_8(bytes, length);

    if(myencoding_detect_unicode_has_end(&stat, 10) == false)
        return false;

    *encoding = MyENCODING_UTF_8;
    return true;
}
257
/*
 * Tries the Cyrillic single-byte encodings against their trigram tables.
 *
 * The candidates are tested in a fixed order: windows-1251, KOI8-R,
 * ISO-8859-5, x-mac-cyrillic, IBM866. The first candidate whose score
 * satisfies myencoding_detect_russian_has_end() (at least 50 matches or a
 * sum of at least 100000) is written to `*encoding` and true is returned.
 *
 * When no candidate qualifies, false is returned and `*encoding` holds the
 * candidate with the highest non-zero sum (the earliest one on a tie), or
 * MyENCODING_DEFAULT when every sum is zero.
 */
bool myencoding_detect_russian(const char *text, size_t length, myencoding_t *encoding)
{
    const struct {
        const myencoding_trigram_t *trigrams;
        myencoding_t                encoding;
    }
    candidates[] = {
        { myencoding_detect_trigrams_index_windows_1251,   MyENCODING_WINDOWS_1251   },
        { myencoding_detect_trigrams_index_koi8_r,         MyENCODING_KOI8_R         },
        { myencoding_detect_trigrams_index_iso_8859_5,     MyENCODING_ISO_8859_5     },
        { myencoding_detect_trigrams_index_x_mac_cyrillic, MyENCODING_X_MAC_CYRILLIC },
        { myencoding_detect_trigrams_index_ibm866,         MyENCODING_IBM866         }
    };

    const size_t candidates_count = sizeof(candidates) / sizeof(candidates[0]);
    const unsigned char *bytes = (const unsigned char *)text;

    const size_t min_count = 50;
    const size_t min_value = 100000;
    size_t best_value = 0;

    *encoding = MyENCODING_DEFAULT;

    for(size_t n = 0; n < candidates_count; n++) {
        myencoding_trigram_result_t score =
            myencoding_detect_by_trigram(bytes, length, candidates[n].trigrams, 1000, min_value);

        if(myencoding_detect_russian_has_end(&score, min_count, min_value)) {
            *encoding = candidates[n].encoding;
            return true;
        }

        /* Remember the strongest candidate seen so far as a fallback. */
        if(best_value < score.value) {
            *encoding = candidates[n].encoding;
            best_value = score.value;
        }
    }

    return false;
}
325
/*
 * Top-level heuristic detection: Unicode first, Cyrillic encodings second.
 *
 * `*encoding` is reset to MyENCODING_DEFAULT before the detectors run.
 * Returns true as soon as one detector reports a confident match, false
 * when neither does.
 */
bool myencoding_detect(const char *text, size_t length, myencoding_t *encoding)
{
    *encoding = MyENCODING_DEFAULT;

    return (myencoding_detect_unicode(text, length, encoding) ||
            myencoding_detect_russian(text, length, encoding));
}
338
myencoding_name_entry_by_name(const char * name,size_t length)339 const myencoding_detect_name_entry_t * myencoding_name_entry_by_name(const char* name, size_t length)
340 {
341 size_t idx = ((mycore_string_chars_lowercase_map[ (const unsigned char)name[0] ] *
342 mycore_string_chars_lowercase_map[ (const unsigned char)name[(length - 1)] ] *
343 length)
344 % MyENCODING_DETECT_NAME_STATIC_SIZE) + 1;
345
346 while (myencoding_detect_name_entry_static_list_index[idx].label)
347 {
348 if(myencoding_detect_name_entry_static_list_index[idx].label_length == length) {
349 if(mycore_strncasecmp(myencoding_detect_name_entry_static_list_index[idx].label, name, length) == 0)
350 return &myencoding_detect_name_entry_static_list_index[idx];
351
352 if(myencoding_detect_name_entry_static_list_index[idx].next)
353 idx = myencoding_detect_name_entry_static_list_index[idx].next;
354 else
355 return NULL;
356 }
357 else if(myencoding_detect_name_entry_static_list_index[idx].label_length > length) {
358 return NULL;
359 }
360 else {
361 idx = myencoding_detect_name_entry_static_list_index[idx].next;
362 }
363 }
364
365 return NULL;
366 }
367
/*
 * Resolves an encoding label to its myencoding_t identifier.
 *
 * Returns true when the label is known. `encoding` may be NULL when the
 * caller only wants to know whether the label exists; it is written only
 * on success.
 */
bool myencoding_by_name(const char *name, size_t length, myencoding_t *encoding)
{
    const myencoding_detect_name_entry_t *entry = myencoding_name_entry_by_name(name, length);

    if(entry == NULL)
        return false;

    if(encoding != NULL)
        *encoding = entry->encoding;

    return true;
}
381
/*
 * Returns the name stored for an encoding identifier.
 *
 * `length` is optional; when given it receives the length recorded in the
 * name table. Identifiers at or beyond MyENCODING_LAST_ENTRY produce NULL
 * and a length of zero.
 */
const char * myencoding_name_by_id(myencoding_t encoding, size_t *length)
{
    const char *name = NULL;
    size_t name_length = 0;

    if(encoding < MyENCODING_LAST_ENTRY) {
        const myencoding_entry_name_index_t *entry = &myencoding_entry_name_index_static_list_index[encoding];

        name        = entry->name;
        name_length = entry->length;
    }

    if(length)
        *length = name_length;

    return name;
}
400
401 /*
402 When an algorithm requires a user agent to prescan a byte stream to determine its encoding,
403 given some defined end condition, then it must run the following steps.
404 These steps either abort unsuccessfully or return a character encoding.
405 If at any point during these steps (including during instances of the get an attribute algorithm invoked by this one)
406 the user agent either runs out of bytes (meaning the position pointer created in the first step below goes beyond the end of the byte stream obtained so far)
407 or reaches its end condition, then abort the prescan a byte stream to determine its encoding algorithm unsuccessfully.
408 */
/*
 * Convenience form of
 * myencoding_extracting_character_encoding_from_charset_with_found()
 * for callers that do not need the position of the label inside `data`.
 */
bool myencoding_extracting_character_encoding_from_charset(const char *data, size_t data_size, myencoding_t *encoding)
{
    const char **no_found = NULL;
    size_t *no_found_length = NULL;

    return myencoding_extracting_character_encoding_from_charset_with_found(data, data_size, encoding,
                                                                            no_found, no_found_length);
}
413
/* True for the bytes treated as whitespace here: TAB, LF, FF, CR, SPACE. */
static bool myencoding_charset_is_whitespace(unsigned char c)
{
    return (c == 0x09 || c == 0x0A || c == 0x0C || c == 0x0D || c == 0x20);
}

/*
 * Extracts the encoding named by a "charset=..." fragment, as found in the
 * content attribute of a <meta> element.
 *
 * The input is searched for a case-insensitive "charset", optionally
 * followed by whitespace, followed by '='. After optional whitespace the
 * value is taken as
 *   - the text between a pair of matching double or single quotes, or
 *   - the run of bytes up to the first whitespace byte, ';' or the end.
 * An opening quote without its partner yields no value.
 *
 * data         - bytes to search; need not be NUL-terminated
 * data_size    - number of bytes in data
 * encoding     - set to MyENCODING_NOT_DETERMINED on entry and to the
 *                resolved encoding on success
 * found        - optional; receives the start of the value inside `data`,
 *                or NULL when no value was isolated
 * found_length - optional; receives the length of that value, or 0
 *
 * Returns true only when a value was isolated and names a known encoding.
 */
bool myencoding_extracting_character_encoding_from_charset_with_found(const char *data, size_t data_size, myencoding_t *encoding, const char **found, size_t *found_length)
{
    *encoding = MyENCODING_NOT_DETERMINED;

    if(found)
        *found = NULL;
    if(found_length)
        *found_length = 0;

    const unsigned char *udata = (const unsigned char *)data;
    const size_t charset_length = strlen("charset");

    size_t pos = 0;
    bool has_equals = false;

    /* 1-4: locate "charset", skip whitespace, expect '=' */
    while((pos + charset_length) < data_size) {
        if(mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"charset", &udata[pos]))
        {
            pos += charset_length;

            while(pos < data_size && myencoding_charset_is_whitespace(udata[pos]))
                pos++;

            /* The whitespace may run up to the end of the data. */
            if(pos < data_size && udata[pos] == 0x3D) { /* EQUALS SIGN (=) */
                has_equals = true;

                pos++;
                break;
            }
        }

        pos++;
    }

    if(has_equals == false || pos >= data_size)
        return false;

    /* 5: whitespace between '=' and the value */
    while(pos < data_size && myencoding_charset_is_whitespace(udata[pos]))
        pos++;

    if(pos >= data_size)
        return false;

    /* 6: isolate the value as [begin, end) */
    size_t begin, end;
    const unsigned char first = udata[pos];

    if(first == 0x22 || first == 0x27) { /* " or ' */
        begin = pos + 1;
        end   = begin;

        while(end < data_size && udata[end] != first)
            end++;

        /* unmatched quote */
        if(end >= data_size)
            return false;
    }
    else {
        begin = pos;
        end   = begin;

        while(end < data_size && udata[end] != 0x3B /* ; */ &&
              myencoding_charset_is_whitespace(udata[end]) == false)
        {
            end++;
        }
    }

    if(found)
        *found = &data[begin];
    if(found_length)
        *found_length = (end - begin);

    /* An empty label cannot name an encoding; do not hand it to the lookup. */
    if(end == begin)
        return false;

    return myencoding_by_name(&data[begin], (end - begin), encoding);
}
558
/*
 * Handles the gap between an attribute name and a possible '='.
 *
 * On entry `*data_length` is the position at which the name ended; the
 * name length is recorded as the distance from `attr->key_begin` to it.
 * Whitespace (TAB, LF, FF, CR, SPACE) and '/' bytes are then skipped.
 *
 * Returns true when the next byte is '='; `*data_length` is moved past it.
 * Returns false when the data ended or another byte follows; `*data_length`
 * is left on that position.
 */
bool myencoding_prescan_stream_to_determine_encoding_get_attr_spaces(const unsigned char *udata, size_t *data_length, size_t data_size, myencoding_detect_attr_t *attr)
{
    size_t pos = *data_length;

    /* set position */
    attr->key_length = pos - attr->key_begin;

    /* 6 */
    for(; pos < data_size; pos++) {
        const unsigned char c = udata[pos];
        const bool skippable = (c == 0x09 || c == 0x0A || c == 0x0C ||
                                c == 0x0D || c == 0x20 || c == 0x2F);

        if(skippable == false)
            break;
    }

    /* 7, 8 */
    if(pos < data_size && udata[pos] == 0x3D) {
        *data_length = pos + 1;
        return true;
    }

    *data_length = pos;
    return false;
}
592
/*
 * Reads an attribute value starting at `length` (just behind the '=').
 *
 * Leading whitespace is skipped. The value is then either
 *   - quoted with " or ': everything up to the matching quote, or up to the
 *     end of the data when the quote is never closed;
 *   - unquoted: everything up to the first whitespace byte, '>' or the end
 *     of the data.
 * `attr->value_begin` / `attr->value_length` describe the value.
 *
 * `*it_last` is set to true when no further attribute can follow: the data
 * ended before a value started, a '>' stood in place of the value, or an
 * unquoted value was terminated by '>'.
 *
 * Returns the position at which scanning should continue. A closing quote,
 * a terminating whitespace byte and a terminating '>' are consumed.
 */
size_t myencoding_prescan_stream_to_determine_encoding_get_attr_value(const unsigned char *udata, size_t length, size_t data_size, myencoding_detect_attr_t *attr, bool *it_last)
{
    /* 9 */
    while(length < data_size) {
        const unsigned char c = udata[length];

        if(c != 0x09 && c != 0x0A && c != 0x0C && c != 0x0D && c != 0x20)
            break;

        length++;
    }

    if(length >= data_size) {
        *it_last = true;
        return length;
    }

    /* 10 */
    const unsigned char first = udata[length];

    if(first == 0x3E) { /* (ASCII >) */
        *it_last = true;
        return (length + 1);
    }

    if(first == 0x22 || first == 0x27) { /* (ASCII ") or (ASCII ') */
        length++;
        attr->value_begin = length;

        while(length < data_size && udata[length] != first)
            length++;

        attr->value_length = length - attr->value_begin;

        /* step over the closing quote when there is one */
        return (length < data_size) ? (length + 1) : length;
    }

    /* unquoted value */
    attr->value_begin = length;

    while(length < data_size) {
        const unsigned char c = udata[length];

        if(c == 0x3E) {
            /*
             * The '>' is consumed here, so the caller must be told that the
             * tag is over; otherwise it would go on reading the text behind
             * the tag as further attributes.
             */
            attr->value_length = length - attr->value_begin;
            *it_last = true;

            return (length + 1);
        }

        if(c == 0x09 || c == 0x0A || c == 0x0C || c == 0x0D || c == 0x20) {
            attr->value_length = length - attr->value_begin;
            return (length + 1);
        }

        length++;
    }

    attr->value_length = length - attr->value_begin;
    return length;
}
669
/*
 * Reads one attribute (name and optional value) starting at `length`.
 *
 * `attr` is zeroed first, so an attribute without a value, or no attribute
 * at all, is reported with zero lengths. The name starts at the first byte
 * that is neither whitespace nor '/', and ends at
 *   - '=' (unless it is the very first byte of the name): a value follows;
 *   - whitespace: a value follows only if, after further whitespace or '/',
 *     an '=' is found;
 *   - '/': the attribute has no value, more attributes may follow;
 *   - '>': the attribute has no value and the tag is over;
 *   - the end of the data.
 *
 * `*it_last` is set to true when the tag or the data ended, i.e. when there
 * is nothing more to read. It is never reset to false here.
 *
 * Returns the position at which scanning should continue. Every call that
 * starts inside the data advances by at least one byte.
 */
size_t myencoding_prescan_stream_to_determine_encoding_get_attr(const unsigned char *udata, size_t length, size_t data_size, myencoding_detect_attr_t *attr, bool *it_last)
{
    memset(attr, 0, sizeof(myencoding_detect_attr_t));

    /*
     If the byte at position is one of 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR),
     0x20 (ASCII space), or 0x2F (ASCII /) then advance position to the next byte and redo this step.
     */
    /* 1 */
    while(length < data_size) {
        const unsigned char c = udata[length];

        if(c != 0x09 && c != 0x0A && c != 0x0C && c != 0x0D && c != 0x20 && c != 0x2F)
            break;

        length++;
    }

    if(length >= data_size) {
        *it_last = true;
        return length;
    }

    /* 2 */
    if(udata[length] == 0x3E) { /* (ASCII >) */
        *it_last = true;
        return (length + 1);
    }

    attr->key_begin = length;

    /* 3, 4 */
    for(; length < data_size; length++) {
        const unsigned char c = udata[length];

        if(c == 0x3D) { /* (ASCII =) */
            if(length > attr->key_begin) {
                attr->key_length = length - attr->key_begin;

                return myencoding_prescan_stream_to_determine_encoding_get_attr_value(udata, (length + 1), data_size, attr, it_last);
            }

            /* an '=' that opens the name is part of the name */
            continue;
        }

        if(c == 0x2F) { /* (ASCII /) */
            /* name without a value; further attributes may follow */
            attr->key_length = length - attr->key_begin;
            return (length + 1);
        }

        if(c == 0x3E) { /* (ASCII >) */
            *it_last = true;
            attr->key_length = length - attr->key_begin;

            return (length + 1);
        }

        if(c == 0x09 || c == 0x0A || c == 0x0C || c == 0x0D || c == 0x20)
            break;
    }

    /*
     * `length` now rests on the byte that ended the name (or on the end of
     * the data). It is passed on unchanged because the helper derives the
     * name length from it; stepping over the terminator first would make
     * the name one byte too long.
     */
    if(myencoding_prescan_stream_to_determine_encoding_get_attr_spaces(udata, &length, data_size, attr) == false) {
        /*
         * No '=' followed, so the attribute has no value. That only ends
         * the attribute list when the data is exhausted; a following '>'
         * or attribute is handled by the next call.
         */
        if(length >= data_size)
            *it_last = true;

        return length;
    }

    return myencoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last);
}
750
/*
 * Processes the attributes of a <meta> tag and derives an encoding.
 *
 * `*length` is the position just behind "<meta" plus its delimiter; it is
 * advanced as attributes are consumed. Only the first occurrence of each of
 * http-equiv, content and charset is taken into account:
 *   - http-equiv="content-type" records that a pragma is present;
 *   - content is searched for "charset=..." but only while no charset has
 *     been established; a hit is valid only together with the pragma;
 *   - charset names the encoding directly and needs no pragma.
 *
 * encoding     - receives the result; MyENCODING_NOT_DETERMINED on failure.
 *                UTF-16BE/LE are reported as UTF-8 and x-user-defined as
 *                windows-1252.
 * found        - optional; start of the label text that was examined
 * found_length - optional; length of that label text
 *
 * Returns true only when a known encoding was determined.
 */
bool myencoding_prescan_stream_to_determine_encoding_check_meta(const unsigned char *udata, size_t *length, size_t data_size, myencoding_t *encoding, const char **found, size_t *found_length)
{
    myencoding_detect_attr_t attr;

    bool got_pragma = false;
    bool it_last = false;

    /* 0 = no charset yet, 1 = taken from charset (no pragma needed), 2 = taken from content (pragma needed) */
    unsigned int need_pragma = 0;

    /*
     http-equiv = 1
     content    = 2
     charset    = 4
     */
    /* If the attribute's name is already in attribute list, then return to the step labeled attributes. */
    size_t is_exists = 0;

    /* Start from a known state so the final check does not depend on the caller. */
    *encoding = MyENCODING_NOT_DETERMINED;

    while(*length < data_size) {
        *length = myencoding_prescan_stream_to_determine_encoding_get_attr(udata, *length, data_size, &attr, &it_last);

        /* 9 */
        if(attr.key_length == strlen("http-equiv") &&
           mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"http-equiv", &udata[ attr.key_begin ]))
        {
            if((is_exists & 1) == 0) {
                is_exists |= 1;

                if(attr.value_length == strlen("content-type") &&
                   mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content-type", &udata[ attr.value_begin ]))
                {
                    got_pragma = true;
                }
            }
        }
        else if(attr.key_length == strlen("content") &&
                mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content", &udata[ attr.key_begin ]))
        {
            if((is_exists & 2) == 0) {
                is_exists |= 2;

                /*
                 * The extractor resets *encoding (and found/found_length)
                 * before it parses, so it must not run once a charset
                 * attribute has already supplied the answer.
                 */
                if(need_pragma == 0 &&
                   myencoding_extracting_character_encoding_from_charset_with_found((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding, found, found_length))
                {
                    need_pragma = 2;
                }
            }
        }
        else if(attr.key_length == strlen("charset") &&
                mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"charset", &udata[ attr.key_begin ]))
        {
            if((is_exists & 4) == 0) {
                is_exists |= 4;

                if(found)
                    *found = (const char*)(&udata[ attr.value_begin ]);
                if(found_length)
                    *found_length = attr.value_length;

                /* charset takes precedence over anything content supplied */
                *encoding = MyENCODING_NOT_DETERMINED;

                myencoding_by_name((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding);
                need_pragma = 1;
            }
        }

        if(it_last)
            break;
    }

    /* 11, 12, 13 */
    if(need_pragma == 0 || (need_pragma == 2 && got_pragma == false) ||
       *encoding == MyENCODING_NOT_DETERMINED)
    {
        /* also covers an unknown label: the caller keeps scanning */
        *encoding = MyENCODING_NOT_DETERMINED;
        return false;
    }

    /* 14 */
    if(*encoding == MyENCODING_UTF_16BE || *encoding == MyENCODING_UTF_16LE) {
        *encoding = MyENCODING_UTF_8;
    }

    /* 15 */
    if(*encoding == MyENCODING_X_USER_DEFINED) {
        *encoding = MyENCODING_WINDOWS_1252;
    }

    /* 16 */
    return true;
}
835
myencoding_prescan_stream_to_determine_encoding_skip_name(const unsigned char * udata,size_t length,size_t data_size)836 size_t myencoding_prescan_stream_to_determine_encoding_skip_name(const unsigned char *udata, size_t length, size_t data_size)
837 {
838 while(length < data_size) {
839 if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
840 udata[length] != 0x0D && udata[length] != 0x20)
841 {
842 break;
843 }
844
845 length++;
846 }
847
848 if(length >= data_size)
849 return length;
850
851 if(udata[length] == 0x3E) {
852 return (length + 1);
853 }
854
855 myencoding_detect_attr_t attr;
856 bool it_last = false;
857
858 while(length < data_size) {
859 length = myencoding_prescan_stream_to_determine_encoding_get_attr(udata, length, data_size, &attr, &it_last);
860
861 if(it_last) {
862 return length;
863 }
864 }
865
866 return length;
867 }
868
myencoding_prescan_stream_to_determine_encoding_skip_other(const unsigned char * udata,size_t length,size_t data_size)869 size_t myencoding_prescan_stream_to_determine_encoding_skip_other(const unsigned char *udata, size_t length, size_t data_size)
870 {
871 if(udata[length] == 0x2F) { /* / */
872 length++;
873
874 if(length >= data_size)
875 return length;
876
877 if(mycore_tokenizer_chars_map[ udata[length] ] == MyCORE_STRING_MAP_CHAR_A_Z_a_z) {
878 return myencoding_prescan_stream_to_determine_encoding_skip_name(udata, length, data_size);
879 }
880
881 while(length < data_size) {
882 if(udata[length] != 0x3E) {
883 return (length + 1);
884 }
885
886 length++;
887 }
888
889 return length;
890 }
891 else if(udata[length] == 0x21) { /* ! */
892 length++;
893
894 if((length + 2) < data_size && udata[length] == 0x2D && udata[(length+1)] == 0x2D) {
895 while(length < data_size) {
896 if(udata[length] != 0x3E) {
897 if(udata[(length - 1)] == 0x2D && udata[(length - 2)] == 0x2D)
898 return (length + 1);
899 }
900
901 length++;
902 }
903
904 return length;
905 }
906
907 while(length < data_size) {
908 if(udata[length] != 0x3E) {
909 return (length + 1);
910 }
911
912 length++;
913 }
914
915 return length;
916 }
917 else if(udata[length] == 0x3F) { /* ? */
918 length++;
919
920 while(length < data_size) {
921 if(udata[length] != 0x3E) {
922 return (length + 1);
923 }
924
925 length++;
926 }
927
928 return length;
929 }
930
931
932 return myencoding_prescan_stream_to_determine_encoding_skip_name(udata, length, data_size);
933 }
934
myencoding_prescan_stream_to_determine_encoding(const char * data,size_t data_size)935 myencoding_t myencoding_prescan_stream_to_determine_encoding(const char *data, size_t data_size)
936 {
937 return myencoding_prescan_stream_to_determine_encoding_with_found(data, data_size, NULL, NULL);
938 }
939
myencoding_prescan_stream_to_determine_encoding_with_found(const char * data,size_t data_size,const char ** found,size_t * found_length)940 myencoding_t myencoding_prescan_stream_to_determine_encoding_with_found(const char *data, size_t data_size, const char **found, size_t *found_length)
941 {
942 const unsigned char* udata = (const unsigned char*)data;
943 myencoding_t encoding = MyENCODING_NOT_DETERMINED;
944
945 if(found)
946 *found = NULL;
947 if(found_length)
948 *found_length = 0;
949
950 size_t i = 0;
951 while(i < data_size) {
952 /* 0x3C = '<' */
953 if(data[i] == 0x3C)
954 {
955 if((i + 5) >= data_size)
956 return encoding;
957
958 i++;
959
960 switch (data[i]) {
961 /*
962 A sequence of bytes starting with:
963 0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61,
964 and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F
965 (case-insensitive ASCII '<meta' followed by a space or slash)
966 */
967 case 0x4D:
968 case 0x6D:
969 if(mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"meta", &udata[i])) {
970 i += 4;
971
972 if(udata[i] == 0x09 || udata[i] == 0x0A || udata[i] == 0x0C ||
973 udata[i] == 0x0D || udata[i] == 0x20 || udata[i] == 0x2F)
974 {
975 i++;
976
977 if(myencoding_prescan_stream_to_determine_encoding_check_meta(udata, &i, data_size, &encoding, found, found_length))
978 return encoding;
979 }
980 }
981
982 break;
983
984 default:
985 i = myencoding_prescan_stream_to_determine_encoding_skip_other(udata, i, data_size);
986 break;
987 }
988 }
989 else {
990 i++;
991 }
992 }
993
994 return encoding;
995 }
996
997
998