1package chardet 2 3import ( 4 "errors" 5 "math" 6) 7 8type recognizerMultiByte struct { 9 charset string 10 language string 11 decoder charDecoder 12 commonChars []uint16 13} 14 15type charDecoder interface { 16 DecodeOneChar([]byte) (c uint16, remain []byte, err error) 17} 18 19func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) { 20 return recognizerOutput{ 21 Charset: r.charset, 22 Language: r.language, 23 Confidence: r.matchConfidence(input), 24 } 25} 26 27func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int { 28 raw := input.raw 29 var c uint16 30 var err error 31 var totalCharCount, badCharCount, singleByteCharCount, doubleByteCharCount, commonCharCount int 32 for c, raw, err = r.decoder.DecodeOneChar(raw); len(raw) > 0; c, raw, err = r.decoder.DecodeOneChar(raw) { 33 totalCharCount++ 34 if err != nil { 35 badCharCount++ 36 } else if c <= 0xFF { 37 singleByteCharCount++ 38 } else { 39 doubleByteCharCount++ 40 if r.commonChars != nil && binarySearch(r.commonChars, c) { 41 commonCharCount++ 42 } 43 } 44 if badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount { 45 return 0 46 } 47 } 48 49 if doubleByteCharCount <= 10 && badCharCount == 0 { 50 if doubleByteCharCount == 0 && totalCharCount < 10 { 51 return 0 52 } else { 53 return 10 54 } 55 } 56 57 if doubleByteCharCount < 20*badCharCount { 58 return 0 59 } 60 if r.commonChars == nil { 61 confidence := 30 + doubleByteCharCount - 20*badCharCount 62 if confidence > 100 { 63 confidence = 100 64 } 65 return confidence 66 } 67 maxVal := math.Log(float64(doubleByteCharCount) / 4) 68 scaleFactor := 90 / maxVal 69 confidence := int(math.Log(float64(commonCharCount)+1)*scaleFactor + 10) 70 if confidence > 100 { 71 confidence = 100 72 } 73 if confidence < 0 { 74 confidence = 0 75 } 76 return confidence 77} 78 79func binarySearch(l []uint16, c uint16) bool { 80 start := 0 81 end := len(l) - 1 82 for start <= end { 83 mid := (start + end) / 2 84 if c == l[mid] { 85 return true 86 } else if c < l[mid] { 87 end = mid - 1 88 } else { 89 start = mid + 1 90 } 91 } 92 return false 93} 94 95var eobError = errors.New("End of input buffer") 96var badCharError = errors.New("Decode a bad char") 97 98type charDecoder_sjis struct { 99} 100 101func (charDecoder_sjis) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { 102 if len(input) == 0 { 103 return 0, nil, eobError 104 } 105 first := input[0] 106 c = uint16(first) 107 remain = input[1:] 108 if first <= 0x7F || (first > 0xA0 && first <= 0xDF) { 109 return 110 } 111 if len(remain) == 0 { 112 return c, remain, badCharError 113 } 114 second := remain[0] 115 remain = remain[1:] 116 c = c<<8 | uint16(second) 117 if (second >= 0x40 && second <= 0x7F) || (second >= 0x80 && second <= 0xFE) { 118 } else { 119 err = badCharError 120 } 121 return 122} 123 124var commonChars_sjis = []uint16{ 125 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 126 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 127 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 128 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 129 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 130 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa, 131} 132 133func newRecognizer_sjis() *recognizerMultiByte { 134 return &recognizerMultiByte{ 135 "Shift_JIS", 136 "ja", 137 charDecoder_sjis{}, 138 commonChars_sjis, 139 } 140} 141 142type charDecoder_euc struct { 143} 144 145func (charDecoder_euc) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { 146 if len(input) == 0 { 147 return 0, nil, eobError 148 } 149 first := input[0] 150 remain = input[1:] 151 c = uint16(first) 152 if first <= 0x8D { 153 return uint16(first), remain, nil 154 } 155 if len(remain) == 0 { 156 return 0, nil, eobError 157 } 158 second := remain[0] 159 remain = remain[1:] 160 c = c<<8 | uint16(second) 161 if first >= 0xA1 && first <= 0xFE { 162 if second < 0xA1 { 163 err = badCharError 164 } 165 return 166 } 167 if first == 0x8E { 168 if second < 0xA1 { 169 err = badCharError 170 } 171 return 172 } 173 if first == 0x8F { 174 if len(remain) == 0 { 175 return 0, nil, eobError 176 } 177 third := remain[0] 178 remain = remain[1:] 179 c = c<<0 | uint16(third) 180 if third < 0xa1 { 181 err = badCharError 182 } 183 } 184 return 185} 186 187var commonChars_euc_jp = []uint16{ 188 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 189 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 190 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 191 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 192 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 193 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 194 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 195 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 196 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 197 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1, 198} 199 200var commonChars_euc_kr = []uint16{ 201 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 202 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 203 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 204 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 205 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 206 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 207 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 208 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 209 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 210 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad, 211} 212 213func newRecognizer_euc_jp() *recognizerMultiByte { 214 return &recognizerMultiByte{ 215 "EUC-JP", 216 "ja", 217 charDecoder_euc{}, 218 commonChars_euc_jp, 219 } 220} 221 222func newRecognizer_euc_kr() *recognizerMultiByte { 223 return &recognizerMultiByte{ 224 "EUC-KR", 225 "ko", 226 charDecoder_euc{}, 227 commonChars_euc_kr, 228 } 229} 230 231type charDecoder_big5 struct { 232} 233 234func (charDecoder_big5) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { 235 if len(input) == 0 { 236 return 0, nil, eobError 237 } 238 first := input[0] 239 remain = input[1:] 240 c = uint16(first) 241 if first <= 0x7F || first == 0xFF { 242 return 243 } 244 if len(remain) == 0 { 245 return c, nil, eobError 246 } 247 second := remain[0] 248 remain = remain[1:] 249 c = c<<8 | uint16(second) 250 if second < 0x40 || second == 0x7F || second == 0xFF { 251 err = badCharError 252 } 253 return 254} 255 256var commonChars_big5 = []uint16{ 257 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 258 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 259 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 260 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 261 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 262 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 263 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 264 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 265 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 266 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f, 267} 268 269func newRecognizer_big5() *recognizerMultiByte { 270 return &recognizerMultiByte{ 271 "Big5", 272 "zh", 273 charDecoder_big5{}, 274 commonChars_big5, 275 } 276} 277 278type charDecoder_gb_18030 struct { 279} 280 281func (charDecoder_gb_18030) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) { 282 if len(input) == 0 { 283 return 0, nil, eobError 284 } 285 first := input[0] 286 remain = input[1:] 287 c = uint16(first) 288 if first <= 0x80 { 289 return 290 } 291 if len(remain) == 0 { 292 return 0, nil, eobError 293 } 294 second := remain[0] 295 remain = remain[1:] 296 c = c<<8 | uint16(second) 297 if first >= 0x81 && first <= 0xFE { 298 if (second >= 0x40 && second <= 0x7E) || (second >= 0x80 && second <= 0xFE) { 299 return 300 } 301 302 if second >= 0x30 && second <= 0x39 { 303 if len(remain) == 0 { 304 return 0, nil, eobError 305 } 306 third := remain[0] 307 remain = remain[1:] 308 if third >= 0x81 && third <= 0xFE { 309 if len(remain) == 0 { 310 return 0, nil, eobError 311 } 312 fourth := remain[0] 313 remain = remain[1:] 314 if fourth >= 0x30 && fourth <= 0x39 { 315 c = c<<16 | uint16(third)<<8 | uint16(fourth) 316 return 317 } 318 } 319 } 320 err = badCharError 321 } 322 return 323} 324 325var commonChars_gb_18030 = []uint16{ 326 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 327 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 328 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 329 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 330 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 331 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 332 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 333 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 334 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 335 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0, 336} 337 338func newRecognizer_gb_18030() *recognizerMultiByte { 339 return &recognizerMultiByte{ 340 "GB-18030", 341 "zh", 342 charDecoder_gb_18030{}, 343 commonChars_gb_18030, 344 } 345} 346