1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4  **********************************************************************
5  *   Copyright (C) 2005-2016, International Business Machines
6  *   Corporation and others.  All Rights Reserved.
7  **********************************************************************
8  */
9 
10 #include "unicode/utypes.h"
11 
12 #if !UCONFIG_NO_CONVERSION
13 
14 #include "cmemory.h"
15 #include "csmatch.h"
16 #include "csrmbcs.h"
17 
18 #include <math.h>
19 
20 U_NAMESPACE_BEGIN
21 
22 #define min(x,y) (((x)<(y))?(x):(y))
23 
24 static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
28 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
29 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
30 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
31 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
32 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
33 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
34 
35 static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
39 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
40 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
41 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
42 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
43 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
44 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
45 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
46 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
47 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
48 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
49 
50 static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
54 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
55 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
56 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
57 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
58 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
59 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
60 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
61 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
62 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
63 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
64 
65 static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
69 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
70 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
71 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
72 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
73 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
74 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
75 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
76 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
77 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
78 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
79 
80 static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
84 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
85 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
86 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
87 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
88 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
89 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
90 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
91 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
92 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
93 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
94 
/**
 * Locate a 16-bit value in a table sorted in ascending order.
 *
 * @param array  table to search; the lookup is only meaningful when the
 *               elements are in ascending order
 * @param len    number of elements in array (0 is allowed)
 * @param value  the value to look for
 * @return the index at which value was found, or -1 if it is absent
 */
static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
{
    int32_t lo = 0;
    int32_t hi = len - 1;

    while (lo <= hi) {
        const int32_t probe = (lo + hi) / 2;
        const uint16_t candidate = array[probe];

        if (candidate == value) {
            return probe;
        }

        // Discard the half of the range that cannot contain the value.
        if (candidate < value) {
            lo = probe + 1;
        } else {
            hi = probe - 1;
        }
    }

    return -1;
}
116 
IteratedChar()117 IteratedChar::IteratedChar() :
118 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
119 {
120     // nothing else to do.
121 }
122 
123 /*void IteratedChar::reset()
124 {
125     charValue = 0;
126     index     = -1;
127     nextIndex = 0;
128     error     = FALSE;
129     done      = FALSE;
130 }*/
131 
nextByte(InputText * det)132 int32_t IteratedChar::nextByte(InputText *det)
133 {
134     if (nextIndex >= det->fRawLength) {
135         done = TRUE;
136 
137         return -1;
138     }
139 
140     return det->fRawInput[nextIndex++];
141 }
142 
~CharsetRecog_mbcs()143 CharsetRecog_mbcs::~CharsetRecog_mbcs()
144 {
145     // nothing to do.
146 }
147 
match_mbcs(InputText * det,const uint16_t commonChars[],int32_t commonCharsLen) const148 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
149     int32_t singleByteCharCount = 0;
150     int32_t doubleByteCharCount = 0;
151     int32_t commonCharCount     = 0;
152     int32_t badCharCount        = 0;
153     int32_t totalCharCount      = 0;
154     int32_t confidence          = 0;
155     IteratedChar iter;
156 
157     while (nextChar(&iter, det)) {
158         totalCharCount++;
159 
160         if (iter.error) {
161             badCharCount++;
162         } else {
163             if (iter.charValue <= 0xFF) {
164                 singleByteCharCount++;
165             } else {
166                 doubleByteCharCount++;
167 
168                 if (commonChars != 0) {
169                     if (binarySearch(commonChars, commonCharsLen, static_cast<uint16_t>(iter.charValue)) >= 0){
170                         commonCharCount += 1;
171                     }
172                 }
173             }
174         }
175 
176 
177         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
178             // Bail out early if the byte data is not matching the encoding scheme.
179             // break detectBlock;
180             return confidence;
181         }
182     }
183 
184     if (doubleByteCharCount <= 10 && badCharCount == 0) {
185         // Not many multi-byte chars.
186         if (doubleByteCharCount == 0 && totalCharCount < 10) {
187             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
188             // We don't have enough data to have any confidence.
189             // Statistical analysis of single byte non-ASCII charcters would probably help here.
190             confidence = 0;
191         }
192         else {
193             //   ASCII or ISO file?  It's probably not our encoding,
194             //   but is not incompatible with our encoding, so don't give it a zero.
195             confidence = 10;
196         }
197 
198         return confidence;
199     }
200 
201     //
202     //  No match if there are too many characters that don't fit the encoding scheme.
203     //    (should we have zero tolerance for these?)
204     //
205     if (doubleByteCharCount < 20*badCharCount) {
206         confidence = 0;
207 
208         return confidence;
209     }
210 
211     if (commonChars == 0) {
212         // We have no statistics on frequently occuring characters.
213         //  Assess confidence purely on having a reasonable number of
214         //  multi-byte characters (the more the better)
215         confidence = 30 + doubleByteCharCount - 20*badCharCount;
216 
217         if (confidence > 100) {
218             confidence = 100;
219         }
220     } else {
221         //
222         // Frequency of occurence statistics exist.
223         //
224 
225         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
226         double scaleFactor = 90.0 / maxVal;
227         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
228 
229         confidence = min(confidence, 100);
230     }
231 
232     if (confidence < 0) {
233         confidence = 0;
234     }
235 
236     return confidence;
237 }
238 
~CharsetRecog_sjis()239 CharsetRecog_sjis::~CharsetRecog_sjis()
240 {
241     // nothing to do
242 }
243 
nextChar(IteratedChar * it,InputText * det) const244 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
245     it->index = it->nextIndex;
246     it->error = FALSE;
247 
248     int32_t firstByte = it->charValue = it->nextByte(det);
249 
250     if (firstByte < 0) {
251         return FALSE;
252     }
253 
254     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
255         return TRUE;
256     }
257 
258     int32_t secondByte = it->nextByte(det);
259     if (secondByte >= 0) {
260         it->charValue = (firstByte << 8) | secondByte;
261     }
262     // else we'll handle the error later.
263 
264     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
265         // Illegal second byte value.
266         it->error = TRUE;
267     }
268 
269     return TRUE;
270 }
271 
match(InputText * det,CharsetMatch * results) const272 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
273     int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
274     results->set(det, this, confidence);
275     return (confidence > 0);
276 }
277 
getName() const278 const char *CharsetRecog_sjis::getName() const
279 {
280     return "Shift_JIS";
281 }
282 
getLanguage() const283 const char *CharsetRecog_sjis::getLanguage() const
284 {
285     return "ja";
286 }
287 
~CharsetRecog_euc()288 CharsetRecog_euc::~CharsetRecog_euc()
289 {
290     // nothing to do
291 }
292 
nextChar(IteratedChar * it,InputText * det) const293 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
294     int32_t firstByte  = 0;
295     int32_t secondByte = 0;
296     int32_t thirdByte  = 0;
297 
298     it->index = it->nextIndex;
299     it->error = FALSE;
300     firstByte = it->charValue = it->nextByte(det);
301 
302     if (firstByte < 0) {
303         // Ran off the end of the input data
304         return FALSE;
305     }
306 
307     if (firstByte <= 0x8D) {
308         // single byte char
309         return TRUE;
310     }
311 
312     secondByte = it->nextByte(det);
313     if (secondByte >= 0) {
314         it->charValue = (it->charValue << 8) | secondByte;
315     }
316     // else we'll handle the error later.
317 
318     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
319         // Two byte Char
320         if (secondByte < 0xA1) {
321             it->error = TRUE;
322         }
323 
324         return TRUE;
325     }
326 
327     if (firstByte == 0x8E) {
328         // Code Set 2.
329         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
330         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
331         // We don't know which we've got.
332         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
333         //   bytes will look like a well formed 2 byte char.
334         if (secondByte < 0xA1) {
335             it->error = TRUE;
336         }
337 
338         return TRUE;
339     }
340 
341     if (firstByte == 0x8F) {
342         // Code set 3.
343         // Three byte total char size, two bytes of actual char value.
344         thirdByte    = it->nextByte(det);
345         it->charValue = (it->charValue << 8) | thirdByte;
346 
347         if (thirdByte < 0xa1) {
348             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
349             it->error = TRUE;
350         }
351     }
352 
353     return TRUE;
354 
355 }
356 
~CharsetRecog_euc_jp()357 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
358 {
359     // nothing to do
360 }
361 
getName() const362 const char *CharsetRecog_euc_jp::getName() const
363 {
364     return "EUC-JP";
365 }
366 
getLanguage() const367 const char *CharsetRecog_euc_jp::getLanguage() const
368 {
369     return "ja";
370 }
371 
match(InputText * det,CharsetMatch * results) const372 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
373 {
374     int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
375     results->set(det, this, confidence);
376     return (confidence > 0);
377 }
378 
~CharsetRecog_euc_kr()379 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
380 {
381     // nothing to do
382 }
383 
getName() const384 const char *CharsetRecog_euc_kr::getName() const
385 {
386     return "EUC-KR";
387 }
388 
getLanguage() const389 const char *CharsetRecog_euc_kr::getLanguage() const
390 {
391     return "ko";
392 }
393 
match(InputText * det,CharsetMatch * results) const394 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
395 {
396     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
397     results->set(det, this, confidence);
398     return (confidence > 0);
399 }
400 
~CharsetRecog_big5()401 CharsetRecog_big5::~CharsetRecog_big5()
402 {
403     // nothing to do
404 }
405 
nextChar(IteratedChar * it,InputText * det) const406 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
407 {
408     int32_t firstByte;
409 
410     it->index = it->nextIndex;
411     it->error = FALSE;
412     firstByte = it->charValue = it->nextByte(det);
413 
414     if (firstByte < 0) {
415         return FALSE;
416     }
417 
418     if (firstByte <= 0x7F || firstByte == 0xFF) {
419         // single byte character.
420         return TRUE;
421     }
422 
423     int32_t secondByte = it->nextByte(det);
424     if (secondByte >= 0)  {
425         it->charValue = (it->charValue << 8) | secondByte;
426     }
427     // else we'll handle the error later.
428 
429     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
430         it->error = TRUE;
431     }
432 
433     return TRUE;
434 }
435 
getName() const436 const char *CharsetRecog_big5::getName() const
437 {
438     return "Big5";
439 }
440 
getLanguage() const441 const char *CharsetRecog_big5::getLanguage() const
442 {
443     return "zh";
444 }
445 
match(InputText * det,CharsetMatch * results) const446 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
447 {
448     int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
449     results->set(det, this, confidence);
450     return (confidence > 0);
451 }
452 
~CharsetRecog_gb_18030()453 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
454 {
455     // nothing to do
456 }
457 
nextChar(IteratedChar * it,InputText * det) const458 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
459     int32_t firstByte  = 0;
460     int32_t secondByte = 0;
461     int32_t thirdByte  = 0;
462     int32_t fourthByte = 0;
463 
464     it->index = it->nextIndex;
465     it->error = FALSE;
466     firstByte = it->charValue = it->nextByte(det);
467 
468     if (firstByte < 0) {
469         // Ran off the end of the input data
470         return FALSE;
471     }
472 
473     if (firstByte <= 0x80) {
474         // single byte char
475         return TRUE;
476     }
477 
478     secondByte = it->nextByte(det);
479     if (secondByte >= 0) {
480         it->charValue = (it->charValue << 8) | secondByte;
481     }
482     // else we'll handle the error later.
483 
484     if (firstByte >= 0x81 && firstByte <= 0xFE) {
485         // Two byte Char
486         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
487             return TRUE;
488         }
489 
490         // Four byte char
491         if (secondByte >= 0x30 && secondByte <= 0x39) {
492             thirdByte = it->nextByte(det);
493 
494             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
495                 fourthByte = it->nextByte(det);
496 
497                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
498                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
499 
500                     return TRUE;
501                 }
502             }
503         }
504 
505         // Something wasn't valid, or we ran out of data (-1).
506         it->error = TRUE;
507     }
508 
509     return TRUE;
510 }
511 
getName() const512 const char *CharsetRecog_gb_18030::getName() const
513 {
514     return "GB18030";
515 }
516 
getLanguage() const517 const char *CharsetRecog_gb_18030::getLanguage() const
518 {
519     return "zh";
520 }
521 
match(InputText * det,CharsetMatch * results) const522 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
523 {
524     int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
525     results->set(det, this, confidence);
526     return (confidence > 0);
527 }
528 
529 U_NAMESPACE_END
530 #endif
531