1package chardet
2
3import (
4	"errors"
5	"math"
6)
7
// recognizerMultiByte is a charset recognizer for multi-byte encodings. It
// pairs a byte-level character decoder with an optional table of common
// characters that matchConfidence uses to score the input.
type recognizerMultiByte struct {
	charset     string      // charset name copied into the match output
	language    string      // language code copied into the match output
	decoder     charDecoder // splits the raw input into individual characters
	commonChars []uint16    // sorted table looked up with binarySearch; nil disables the frequency-based scoring
}
14
// charDecoder splits a byte stream into characters for one multi-byte
// encoding.
type charDecoder interface {
	// DecodeOneChar decodes the character at the start of its argument. It
	// returns the character code c, the bytes that follow the character in
	// remain, and a non-nil err when the character could not be decoded.
	DecodeOneChar([]byte) (c uint16, remain []byte, err error)
}
18
19func (r *recognizerMultiByte) Match(input *recognizerInput) (output recognizerOutput) {
20	return recognizerOutput{
21		Charset:    r.charset,
22		Language:   r.language,
23		Confidence: r.matchConfidence(input),
24	}
25}
26
27func (r *recognizerMultiByte) matchConfidence(input *recognizerInput) int {
28	raw := input.raw
29	var c uint16
30	var err error
31	var totalCharCount, badCharCount, singleByteCharCount, doubleByteCharCount, commonCharCount int
32	for c, raw, err = r.decoder.DecodeOneChar(raw); len(raw) > 0; c, raw, err = r.decoder.DecodeOneChar(raw) {
33		totalCharCount++
34		if err != nil {
35			badCharCount++
36		} else if c <= 0xFF {
37			singleByteCharCount++
38		} else {
39			doubleByteCharCount++
40			if r.commonChars != nil && binarySearch(r.commonChars, c) {
41				commonCharCount++
42			}
43		}
44		if badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount {
45			return 0
46		}
47	}
48
49	if doubleByteCharCount <= 10 && badCharCount == 0 {
50		if doubleByteCharCount == 0 && totalCharCount < 10 {
51			return 0
52		} else {
53			return 10
54		}
55	}
56
57	if doubleByteCharCount < 20*badCharCount {
58		return 0
59	}
60	if r.commonChars == nil {
61		confidence := 30 + doubleByteCharCount - 20*badCharCount
62		if confidence > 100 {
63			confidence = 100
64		}
65		return confidence
66	}
67	maxVal := math.Log(float64(doubleByteCharCount) / 4)
68	scaleFactor := 90 / maxVal
69	confidence := int(math.Log(float64(commonCharCount)+1)*scaleFactor + 10)
70	if confidence > 100 {
71		confidence = 100
72	}
73	if confidence < 0 {
74		confidence = 0
75	}
76	return confidence
77}
78
// binarySearch reports whether c occurs in l. The slice l must be sorted in
// ascending order.
func binarySearch(l []uint16, c uint16) bool {
	// Search the half-open window [lo, hi).
	lo, hi := 0, len(l)
	for lo < hi {
		mid := lo + (hi-lo-1)/2
		switch v := l[mid]; {
		case v == c:
			return true
		case c < v:
			hi = mid
		default:
			lo = mid + 1
		}
	}
	return false
}
94
// Sentinel errors returned by the charDecoder implementations in this file.
var (
	// eobError is returned when the input is empty or ends in the middle of
	// a character.
	eobError = errors.New("End of input buffer")
	// badCharError is returned when a byte sequence is not valid for the
	// encoding being decoded.
	badCharError = errors.New("Decode a bad char")
)
97
98type charDecoder_sjis struct {
99}
100
101func (charDecoder_sjis) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
102	if len(input) == 0 {
103		return 0, nil, eobError
104	}
105	first := input[0]
106	c = uint16(first)
107	remain = input[1:]
108	if first <= 0x7F || (first > 0xA0 && first <= 0xDF) {
109		return
110	}
111	if len(remain) == 0 {
112		return c, remain, badCharError
113	}
114	second := remain[0]
115	remain = remain[1:]
116	c = c<<8 | uint16(second)
117	if (second >= 0x40 && second <= 0x7F) || (second >= 0x80 && second <= 0xFE) {
118	} else {
119		err = badCharError
120	}
121	return
122}
123
// commonChars_sjis is the table of two-byte character codes that the
// Shift_JIS recognizer counts as common characters when scoring input. It is
// searched with binarySearch and therefore must stay sorted in ascending
// order.
var commonChars_sjis = []uint16{
	0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
	0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
	0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
	0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
	0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
	0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa,
}
132
133func newRecognizer_sjis() *recognizerMultiByte {
134	return &recognizerMultiByte{
135		"Shift_JIS",
136		"ja",
137		charDecoder_sjis{},
138		commonChars_sjis,
139	}
140}
141
142type charDecoder_euc struct {
143}
144
145func (charDecoder_euc) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
146	if len(input) == 0 {
147		return 0, nil, eobError
148	}
149	first := input[0]
150	remain = input[1:]
151	c = uint16(first)
152	if first <= 0x8D {
153		return uint16(first), remain, nil
154	}
155	if len(remain) == 0 {
156		return 0, nil, eobError
157	}
158	second := remain[0]
159	remain = remain[1:]
160	c = c<<8 | uint16(second)
161	if first >= 0xA1 && first <= 0xFE {
162		if second < 0xA1 {
163			err = badCharError
164		}
165		return
166	}
167	if first == 0x8E {
168		if second < 0xA1 {
169			err = badCharError
170		}
171		return
172	}
173	if first == 0x8F {
174		if len(remain) == 0 {
175			return 0, nil, eobError
176		}
177		third := remain[0]
178		remain = remain[1:]
179		c = c<<0 | uint16(third)
180		if third < 0xa1 {
181			err = badCharError
182		}
183	}
184	return
185}
186
// commonChars_euc_jp is the table of two-byte character codes that the EUC-JP
// recognizer counts as common characters when scoring input. It is searched
// with binarySearch and therefore must stay sorted in ascending order.
var commonChars_euc_jp = []uint16{
	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
	0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
	0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
	0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
	0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
	0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
	0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
	0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
	0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
	0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1,
}
199
// commonChars_euc_kr is the table of two-byte character codes that the EUC-KR
// recognizer counts as common characters when scoring input. It is searched
// with binarySearch and therefore must stay sorted in ascending order.
var commonChars_euc_kr = []uint16{
	0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
	0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
	0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
	0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
	0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
	0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
	0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
	0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
	0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
	0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad,
}
212
213func newRecognizer_euc_jp() *recognizerMultiByte {
214	return &recognizerMultiByte{
215		"EUC-JP",
216		"ja",
217		charDecoder_euc{},
218		commonChars_euc_jp,
219	}
220}
221
222func newRecognizer_euc_kr() *recognizerMultiByte {
223	return &recognizerMultiByte{
224		"EUC-KR",
225		"ko",
226		charDecoder_euc{},
227		commonChars_euc_kr,
228	}
229}
230
231type charDecoder_big5 struct {
232}
233
234func (charDecoder_big5) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
235	if len(input) == 0 {
236		return 0, nil, eobError
237	}
238	first := input[0]
239	remain = input[1:]
240	c = uint16(first)
241	if first <= 0x7F || first == 0xFF {
242		return
243	}
244	if len(remain) == 0 {
245		return c, nil, eobError
246	}
247	second := remain[0]
248	remain = remain[1:]
249	c = c<<8 | uint16(second)
250	if second < 0x40 || second == 0x7F || second == 0xFF {
251		err = badCharError
252	}
253	return
254}
255
// commonChars_big5 is the table of two-byte character codes that the Big5
// recognizer counts as common characters when scoring input. It is searched
// with binarySearch and therefore must stay sorted in ascending order.
var commonChars_big5 = []uint16{
	0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
	0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
	0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
	0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
	0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
	0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
	0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
	0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
	0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
	0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f,
}
268
269func newRecognizer_big5() *recognizerMultiByte {
270	return &recognizerMultiByte{
271		"Big5",
272		"zh",
273		charDecoder_big5{},
274		commonChars_big5,
275	}
276}
277
278type charDecoder_gb_18030 struct {
279}
280
281func (charDecoder_gb_18030) DecodeOneChar(input []byte) (c uint16, remain []byte, err error) {
282	if len(input) == 0 {
283		return 0, nil, eobError
284	}
285	first := input[0]
286	remain = input[1:]
287	c = uint16(first)
288	if first <= 0x80 {
289		return
290	}
291	if len(remain) == 0 {
292		return 0, nil, eobError
293	}
294	second := remain[0]
295	remain = remain[1:]
296	c = c<<8 | uint16(second)
297	if first >= 0x81 && first <= 0xFE {
298		if (second >= 0x40 && second <= 0x7E) || (second >= 0x80 && second <= 0xFE) {
299			return
300		}
301
302		if second >= 0x30 && second <= 0x39 {
303			if len(remain) == 0 {
304				return 0, nil, eobError
305			}
306			third := remain[0]
307			remain = remain[1:]
308			if third >= 0x81 && third <= 0xFE {
309				if len(remain) == 0 {
310					return 0, nil, eobError
311				}
312				fourth := remain[0]
313				remain = remain[1:]
314				if fourth >= 0x30 && fourth <= 0x39 {
315					c = c<<16 | uint16(third)<<8 | uint16(fourth)
316					return
317				}
318			}
319		}
320		err = badCharError
321	}
322	return
323}
324
// commonChars_gb_18030 is the table of two-byte character codes that the
// GB-18030 recognizer counts as common characters when scoring input. It is
// searched with binarySearch and therefore must stay sorted in ascending
// order.
var commonChars_gb_18030 = []uint16{
	0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
	0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
	0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
	0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
	0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
	0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
	0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
	0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
	0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
	0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0,
}
337
338func newRecognizer_gb_18030() *recognizerMultiByte {
339	return &recognizerMultiByte{
340		"GB-18030",
341		"zh",
342		charDecoder_gb_18030{},
343		commonChars_gb_18030,
344	}
345}
346