1 // Copyright 2019 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 //! `chardetng` is a character encoding detector for legacy Web content.
11 //!
12 //! It is optimized for binary size in applications that already depend
13 //! on `encoding_rs` for other reasons.
14 
15 use encoding_rs::Decoder;
16 use encoding_rs::DecoderResult;
17 use encoding_rs::Encoding;
18 use encoding_rs::BIG5;
19 use encoding_rs::EUC_JP;
20 use encoding_rs::EUC_KR;
21 use encoding_rs::GBK;
22 use encoding_rs::ISO_2022_JP;
23 use encoding_rs::ISO_8859_8;
24 use encoding_rs::SHIFT_JIS;
25 use encoding_rs::UTF_8;
26 use encoding_rs::WINDOWS_1255;
27 
28 mod data;
29 mod tld;
30 use data::*;
31 use tld::classify_tld;
32 use tld::Tld;
33 
/// Added to a single-byte candidate's score when a Latin letter (class
/// `LATIN_LETTER`) is directly adjacent to a non-Latin alphabetic byte
/// and the pair is not ASCII-only.
const LATIN_ADJACENCY_PENALTY: i64 = -50;

/// In this part of the file the value is only subtracted back out by the
/// windows-1252 ordinal state machine (`ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY`).
/// NOTE(review): presumably it mirrors the implausible-pair score stored in
/// the data tables; confirm against the `data` module.
const IMPLAUSIBILITY_PENALTY: i64 = -220;

/// Awarded by the windows-1252 ordinal state machine when one of the
/// recognized ordinal-indicator sequences (e.g. " º ") is completed.
const ORDINAL_BONUS: i64 = 300;

/// Must match the ISO-8859-2 score for " Š ". Note: There
/// are four Slovenian Wikipedia list page titles where the
/// list is split by letter so that Š stands alone for the
/// list part for Š. Let's assume that's a special case not
/// worth detecting even though the copyright sign detection
/// makes Slovenian title detection round to one percentage
/// point worse.
const COPYRIGHT_BONUS: i64 = 222;

/// Applied to Latin-script candidates when a lower-case letter follows two
/// or more upper-case letters, or an upper-case letter follows a lower-case
/// letter, provided the pair is not ASCII-only.
const IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY: i64 = -180;

/// Awarded once per non-Latin word that consists of one upper-case letter
/// followed only by lower-case letters.
const NON_LATIN_CAPITALIZATION_BONUS: i64 = 40;

/// Applied once per all-upper-case non-Latin word, and only for KOI8-U.
const NON_LATIN_ALL_CAPS_PENALTY: i64 = -40;

/// Applied per letter (multiplied by the word length) when a non-Latin
/// word ends in the mixed-case state.
const NON_LATIN_MIXED_CASE_PENALTY: i64 = -20;

// Base scores shared by the CJK constants below.

// Manually calibrated relative to windows-1256 Arabic
const CJK_BASE_SCORE: i64 = 41;

const CJK_SECONDARY_BASE_SCORE: i64 = 20; // Was 20

// Constants prefixed `SHIFT_JIS_` (plus the half-width katakana scores).

const SHIFT_JIS_SCORE_PER_KANA: i64 = 20;

const SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;

const SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;

// Manually calibrated relative to windows-1256 Persian and Urdu
const SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY: i64 = -75;

const HALF_WIDTH_KATAKANA_SCORE: i64 = 1;

// Unclear if this is a good idea; seems not harmful, but can't be sure.
const HALF_WIDTH_KATAKANA_VOICING_SCORE: i64 = 10;

const SHIFT_JIS_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Should this be larger?

const SHIFT_JIS_EXTENSION_PENALTY: i64 = SHIFT_JIS_PUA_PENALTY * 2;

const SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY: i64 = SHIFT_JIS_EXTENSION_PENALTY;

// Constants prefixed `EUC_JP_`.

const EUC_JP_SCORE_PER_KANA: i64 = CJK_BASE_SCORE + (CJK_BASE_SCORE / 3); // Relative to Big5

const EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA: i64 = CJK_BASE_SCORE - 1;

const EUC_JP_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;

const EUC_JP_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;

const EUC_JP_SCORE_PER_OTHER_KANJI: i64 = CJK_SECONDARY_BASE_SCORE / 4;

const EUC_JP_INITIAL_KANA_PENALTY: i64 = -((CJK_BASE_SCORE / 3) + 1);

const EUC_JP_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 50); // Needs to be more severe than for Shift_JIS to avoid misdetecting EUC-KR!

// Constants prefixed `BIG5_`.

const BIG5_SCORE_PER_LEVEL_1_HANZI: i64 = CJK_BASE_SCORE;

const BIG5_SCORE_PER_OTHER_HANZI: i64 = CJK_SECONDARY_BASE_SCORE;

const BIG5_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 30); // More severe than other PUA penalties to avoid misdetecting EUC-KR! (25 as the multiplier is too little)

const BIG5_SINGLE_BYTE_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 40);

// Constants prefixed `EUC_KR_`.

const EUC_KR_SCORE_PER_EUC_HANGUL: i64 = CJK_BASE_SCORE + 1;

const EUC_KR_SCORE_PER_NON_EUC_HANGUL: i64 = CJK_SECONDARY_BASE_SCORE / 5;

const EUC_KR_SCORE_PER_HANJA: i64 = CJK_SECONDARY_BASE_SCORE / 2;

const EUC_KR_HANJA_AFTER_HANGUL_PENALTY: i64 = -(CJK_BASE_SCORE * 10);

const EUC_KR_LONG_WORD_PENALTY: i64 = -6;

const EUC_KR_PUA_PENALTY: i64 = GBK_PUA_PENALTY - 1; // Break tie in favor of GBK

const EUC_KR_MAC_KOREAN_PENALTY: i64 = EUC_KR_PUA_PENALTY * 2;

const EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY: i64 = EUC_KR_MAC_KOREAN_PENALTY;

// Constants prefixed `GBK_`.

const GBK_SCORE_PER_LEVEL_1: i64 = CJK_BASE_SCORE;

const GBK_SCORE_PER_LEVEL_2: i64 = CJK_SECONDARY_BASE_SCORE;

const GBK_SCORE_PER_NON_EUC: i64 = CJK_SECONDARY_BASE_SCORE / 4;

const GBK_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Factor should be at least 2, but should it be larger?

const GBK_SINGLE_BYTE_EXTENSION_PENALTY: i64 = GBK_PUA_PENALTY * 4;

// Constants shared across the CJK candidates.

const CJK_LATIN_ADJACENCY_PENALTY: i64 = -CJK_BASE_SCORE; // smaller penalty than LATIN_ADJACENCY_PENALTY

const CJ_PUNCTUATION: i64 = CJK_BASE_SCORE / 2;

const CJK_OTHER: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// Latin letter caseless class
const LATIN_LETTER: u8 = 1;
138 
/// Returns `true` if `label` contains at least one byte that is
/// non-ASCII (`>= 0x80`), an ASCII period (`.`), or an ASCII upper-case
/// letter (`A`..=`Z`).
///
/// An empty `label` yields `false`.
fn contains_upper_case_period_or_non_ascii(label: &[u8]) -> bool {
    label
        .iter()
        .any(|&b| !b.is_ascii() || b == b'.' || b.is_ascii_uppercase())
}
153 
// For Latin, we only penalize pairwise bad transitions
// if one participant is non-ASCII. This avoids violating
// the principle that ASCII pairs never contribute to the
// score. (Maybe that's a bad principle, though!)
/// Case-tracking state for runs of Latin letters, describing the bytes
/// seen immediately before the current one.
#[derive(PartialEq)]
enum LatinCaseState {
    /// The previous byte was not treated as a Latin letter. This is also
    /// the state the candidates start in.
    Space,
    /// The previous byte was an upper-case letter that did not itself
    /// follow another upper-case letter.
    Upper,
    /// The previous byte was a lower-case letter.
    Lower,
    /// The previous two or more bytes were upper-case letters.
    AllCaps,
}
165 
// For non-Latin, we calculate case-related penalty
// or bonus on a per-non-Latin-word basis.
/// Case pattern of the non-Latin word currently being scanned.
#[derive(PartialEq)]
enum NonLatinCaseState {
    /// Not inside a word. This is the starting state and the state entered
    /// after any byte that is neither a Latin letter nor non-Latin
    /// alphabetic.
    Space,
    /// The word so far is a single upper-case letter.
    Upper,
    /// The word so far consists only of lower-case letters.
    Lower,
    /// The word so far is one upper-case letter followed by one or more
    /// lower-case letters.
    UpperLower,
    /// The word so far is two or more upper-case letters and nothing else.
    AllCaps,
    /// Any other mixture of cases, or a Latin letter has been seen since
    /// the last space-like byte.
    Mix,
}
177 
/// Scoring state for a single-byte encoding candidate whose non-Latin
/// letters carry case information (the high bit of the class returned by
/// `SingleByteData::classify`).
struct NonLatinCasedCandidate {
    /// Classification and scoring tables for the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 before any input).
    prev: u8,
    /// Case pattern of the non-Latin word currently being scanned.
    case_state: NonLatinCaseState,
    /// Whether the previously fed byte was ASCII (`< 0x80`); starts `true`.
    prev_ascii: bool,
    /// Number of consecutive non-Latin alphabetic bytes in the current word.
    current_word_len: u64,
    /// Longest `current_word_len` recorded so far; updated when a word ends.
    longest_word: u64,
    /// `true` when `data` is the IBM866 table.
    ibm866: bool,
    /// Whether the previously fed byte was 0xA0.
    prev_was_a0: bool, // Only used with IBM866
}
188 
189 impl NonLatinCasedCandidate {
new(data: &'static SingleByteData) -> Self190     fn new(data: &'static SingleByteData) -> Self {
191         NonLatinCasedCandidate {
192             data: data,
193             prev: 0,
194             case_state: NonLatinCaseState::Space,
195             prev_ascii: true,
196             current_word_len: 0,
197             longest_word: 0,
198             ibm866: data == &SINGLE_BYTE_DATA[IBM866_INDEX],
199             prev_was_a0: false,
200         }
201     }
202 
feed(&mut self, buffer: &[u8]) -> Option<i64>203     fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
204         let mut score = 0i64;
205         for &b in buffer {
206             let class = self.data.classify(b);
207             if class == 255 {
208                 return None;
209             }
210             let caseless_class = class & 0x7F;
211 
212             let ascii = b < 0x80;
213             let ascii_pair = self.prev_ascii && ascii;
214 
215             let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
216 
217             // The purpose of this state machine is to avoid misdetecting Greek as
218             // Cyrillic by:
219             //
220             // * Giving a small bonus to words that start with an upper-case letter
221             //   and are lower-case for the rest.
222             // * Giving a large penalty to start with one lower-case letter followed
223             //   by all upper-case (obviously upper and lower case inverted, which
224             //   unfortunately is possible due to KOI8-U).
225             // * Giving a small per-word penalty to all-uppercase KOI8-U (to favor
226             //   all-lowercase Greek over all-caps KOI8-U).
227             // * Giving large penalties for mixed-case other than initial upper-case.
228             //   This also helps relative to non-cased encodings.
229 
230             // ASCII doesn't participate in non-Latin casing.
231             if caseless_class == LATIN_LETTER {
232                 // Latin
233                 // Mark this word as a mess. If there end up being non-Latin
234                 // letters in this word, the ASCII-adjacency penalty gets
235                 // applied to Latin/non-Latin pairs and the mix penalty
236                 // to non-Latin/non-Latin pairs.
237                 // XXX Apply penalty here
238                 self.case_state = NonLatinCaseState::Mix;
239             } else if !non_ascii_alphabetic {
240                 // Space
241                 match self.case_state {
242                     NonLatinCaseState::Space
243                     | NonLatinCaseState::Upper
244                     | NonLatinCaseState::Lower => {}
245                     NonLatinCaseState::UpperLower => {
246                         // Intentionally applied only once per word.
247                         score += NON_LATIN_CAPITALIZATION_BONUS;
248                     }
249                     NonLatinCaseState::AllCaps => {
250                         // Intentionally applied only once per word.
251                         if self.data == &SINGLE_BYTE_DATA[KOI8_U_INDEX] {
252                             // Apply only to KOI8-U.
253                             score += NON_LATIN_ALL_CAPS_PENALTY;
254                         }
255                     }
256                     NonLatinCaseState::Mix => {
257                         // Per letter
258                         score += NON_LATIN_MIXED_CASE_PENALTY * (self.current_word_len as i64);
259                     }
260                 }
261                 self.case_state = NonLatinCaseState::Space;
262             } else if (class >> 7) == 0 {
263                 // Lower case
264                 match self.case_state {
265                     NonLatinCaseState::Space => {
266                         self.case_state = NonLatinCaseState::Lower;
267                     }
268                     NonLatinCaseState::Upper => {
269                         self.case_state = NonLatinCaseState::UpperLower;
270                     }
271                     NonLatinCaseState::Lower
272                     | NonLatinCaseState::UpperLower
273                     | NonLatinCaseState::Mix => {}
274                     NonLatinCaseState::AllCaps => {
275                         self.case_state = NonLatinCaseState::Mix;
276                     }
277                 }
278             } else {
279                 // Upper case
280                 match self.case_state {
281                     NonLatinCaseState::Space => {
282                         self.case_state = NonLatinCaseState::Upper;
283                     }
284                     NonLatinCaseState::Upper => {
285                         self.case_state = NonLatinCaseState::AllCaps;
286                     }
287                     NonLatinCaseState::Lower | NonLatinCaseState::UpperLower => {
288                         self.case_state = NonLatinCaseState::Mix;
289                     }
290                     NonLatinCaseState::AllCaps | NonLatinCaseState::Mix => {}
291                 }
292             }
293 
294             // XXX Apply penalty if > 16
295             if non_ascii_alphabetic {
296                 self.current_word_len += 1;
297             } else {
298                 if self.current_word_len > self.longest_word {
299                     self.longest_word = self.current_word_len;
300                 }
301                 self.current_word_len = 0;
302             }
303 
304             let is_a0 = b == 0xA0;
305             if !ascii_pair {
306                 // 0xA0 is no-break space in many other encodings, so avoid
307                 // assigning score to IBM866 when 0xA0 occurs next to itself
308                 // or a space-like byte.
309                 if !(self.ibm866
310                     && ((is_a0 && (self.prev_was_a0 || self.prev == 0))
311                         || caseless_class == 0 && self.prev_was_a0))
312                 {
313                     score += self.data.score(caseless_class, self.prev, false);
314                 }
315 
316                 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
317                     score += LATIN_ADJACENCY_PENALTY;
318                 } else if caseless_class == LATIN_LETTER
319                     && self.data.is_non_latin_alphabetic(self.prev, false)
320                 {
321                     score += LATIN_ADJACENCY_PENALTY;
322                 }
323             }
324 
325             self.prev_ascii = ascii;
326             self.prev = caseless_class;
327             self.prev_was_a0 = is_a0;
328         }
329         Some(score)
330     }
331 }
332 
/// State of the windows-1252-only state machine in `LatinCandidate::feed`
/// that recognizes ordinal-indicator sequences (bytes 0xAA and 0xBA) and a
/// free-standing byte 0xA9. "Space-like" below means caseless class 0.
enum OrdinalState {
    /// Inside something that cannot be part of a recognized sequence;
    /// waits for a space-like byte.
    Other,
    /// The previous byte was space-like (also the initial state).
    Space,
    /// Saw `N.` or `n.` after a space-like byte.
    PeriodAfterN,
    /// Saw 0xAA/0xBA directly after a space-like byte or after ASCII
    /// digits; a space-like byte next earns `ORDINAL_BONUS`.
    OrdinalExpectingSpace,
    /// Saw 0xAA after `M`/`D`/`S`/`N`, or 0xAA/0xBA after a Roman-numeral
    /// run; a space-like byte next earns
    /// `ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY`.
    OrdinalExpectingSpaceUndoImplausibility,
    /// Saw 0xBA after `N.`/`n.`; a space-like byte or an ASCII digit next
    /// earns `ORDINAL_BONUS`.
    OrdinalExpectingSpaceOrDigit,
    /// Saw 0xBA directly after `N`/`n`; a space-like byte or an ASCII digit
    /// next earns `ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY`.
    OrdinalExpectingSpaceOrDigitUndoImplausibily,
    /// Saw `N` directly after a space-like byte.
    UpperN,
    /// Saw `n` directly after a space-like byte.
    LowerN,
    /// Saw `M`, `D` or `S` directly after a space-like byte.
    FeminineAbbreviationStartLetter,
    /// Inside a run of ASCII digits that started after a space-like byte.
    Digit,
    /// Inside a run of bytes whose caseless class is 9, 22 or 24 (annotated
    /// I, V and X in the code) that started after a space-like byte.
    Roman,
    /// Saw 0xA9 directly after a space-like byte; a space-like byte next
    /// earns `COPYRIGHT_BONUS`.
    Copyright,
}
348 
/// Scoring state for a Latin-script single-byte encoding candidate.
struct LatinCandidate {
    /// Classification and scoring tables for the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 before any input).
    prev: u8,
    /// Case pattern of the Latin letters immediately preceding the current
    /// byte.
    case_state: LatinCaseState,
    /// Number of consecutive non-ASCII bytes immediately preceding the
    /// current byte; reset to 0 by any ASCII byte.
    prev_non_ascii: u32,
    /// Progress through the ordinal-indicator / copyright-sign sequences.
    ordinal_state: OrdinalState, // Used only when `windows1252 == true`
    /// `true` when `data` is the windows-1252 table.
    windows1252: bool,
}
357 
358 impl LatinCandidate {
new(data: &'static SingleByteData) -> Self359     fn new(data: &'static SingleByteData) -> Self {
360         LatinCandidate {
361             data: data,
362             prev: 0,
363             case_state: LatinCaseState::Space,
364             prev_non_ascii: 0,
365             ordinal_state: OrdinalState::Space,
366             windows1252: data == &SINGLE_BYTE_DATA[WINDOWS_1252_INDEX],
367         }
368     }
369 
feed(&mut self, buffer: &[u8]) -> Option<i64>370     fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
371         let mut score = 0i64;
372         for &b in buffer {
373             let class = self.data.classify(b);
374             if class == 255 {
375                 return None;
376             }
377             let caseless_class = class & 0x7F;
378 
379             let ascii = b < 0x80;
380             let ascii_pair = self.prev_non_ascii == 0 && ascii;
381 
382             let non_ascii_penalty = match self.prev_non_ascii {
383                 0 | 1 | 2 => 0,
384                 3 => -5,
385                 4 => -20,
386                 _ => -200,
387             };
388             score += non_ascii_penalty;
389             // XXX if has Vietnamese-only characters and word length > 7,
390             // apply penalty
391 
392             if !self.data.is_latin_alphabetic(caseless_class) {
393                 self.case_state = LatinCaseState::Space;
394             } else if (class >> 7) == 0 {
395                 // Penalizing lower case after two upper case
396                 // is important for avoiding misdetecting
397                 // windows-1250 as windows-1252 (byte 0x9F).
398                 if self.case_state == LatinCaseState::AllCaps && !ascii_pair {
399                     score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
400                 }
401                 self.case_state = LatinCaseState::Lower;
402             } else {
403                 match self.case_state {
404                     LatinCaseState::Space => {
405                         self.case_state = LatinCaseState::Upper;
406                     }
407                     LatinCaseState::Upper | LatinCaseState::AllCaps => {
408                         self.case_state = LatinCaseState::AllCaps;
409                     }
410                     LatinCaseState::Lower => {
411                         if !ascii_pair {
412                             // XXX How bad is this for Irish Gaelic?
413                             score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
414                         }
415                         self.case_state = LatinCaseState::Upper;
416                     }
417                 }
418             }
419 
420             // Treat pairing space-like, which can be non-ASCII, with ASCII as
421             // ASCIIish enough not to get a score in order to avoid giving
422             // ASCII i and I in windows-1254 next to windows-125x apostrophe/quote
423             // a score. This avoids detecting English I’ as Turkish.
424             let ascii_ish_pair = ascii_pair
425                 || (ascii && self.prev == 0)
426                 || (caseless_class == 0 && self.prev_non_ascii == 0);
427 
428             if !ascii_ish_pair {
429                 score += self.data.score(caseless_class, self.prev, false);
430             }
431 
432             if self.windows1252 {
433                 // This state machine assigns score to the sequences
434                 // * " º " (Spanish)
435                 // * " ª " (Spanish)
436                 // * ".ª " (Spanish)
437                 // * ".º " (Spanish)
438                 // * "n.º1" (Spanish)
439                 // * " Mª " (Spanish)
440                 // * " Dª " (Spanish)
441                 // * " Nª " (Spanish)
442                 // * " Sª " (Spanish)
443                 // * " 3º " (Italian, where 3 is an ASCII digit)
444                 // * " 3ª " (Italian, where 3 is an ASCII digit)
445                 // * " Xº " (Italian, where X is a small Roman numeral)
446                 // * " Xª " (Italian, where X is a small Roman numeral)
447                 // * " Nº1" (Italian, where 1 is an ASCII digit)
448                 // * " Nº " (Italian)
449                 // * " © " (otherwise ASCII-only)
450                 // which are problematic to deal with by pairwise scoring
451                 // without messing up Romanian detection.
452                 // Initial sc
453                 match self.ordinal_state {
454                     OrdinalState::Other => {
455                         if caseless_class == 0 {
456                             self.ordinal_state = OrdinalState::Space;
457                         }
458                     }
459                     OrdinalState::Space => {
460                         if caseless_class == 0 {
461                             // pass
462                         } else if b == 0xAA || b == 0xBA {
463                             self.ordinal_state = OrdinalState::OrdinalExpectingSpace;
464                         } else if b == b'M' || b == b'D' || b == b'S' {
465                             self.ordinal_state = OrdinalState::FeminineAbbreviationStartLetter;
466                         } else if b == b'N' {
467                             // numero or Nuestra
468                             self.ordinal_state = OrdinalState::UpperN;
469                         } else if b == b'n' {
470                             // numero
471                             self.ordinal_state = OrdinalState::LowerN;
472                         } else if caseless_class == (ASCII_DIGIT as u8) {
473                             self.ordinal_state = OrdinalState::Digit;
474                         } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24
475                         /* X */
476                         {
477                             self.ordinal_state = OrdinalState::Roman;
478                         } else if b == 0xA9 {
479                             self.ordinal_state = OrdinalState::Copyright;
480                         } else {
481                             self.ordinal_state = OrdinalState::Other;
482                         }
483                     }
484                     OrdinalState::OrdinalExpectingSpace => {
485                         if caseless_class == 0 {
486                             score += ORDINAL_BONUS;
487                             self.ordinal_state = OrdinalState::Space;
488                         } else {
489                             self.ordinal_state = OrdinalState::Other;
490                         }
491                     }
492                     OrdinalState::OrdinalExpectingSpaceUndoImplausibility => {
493                         if caseless_class == 0 {
494                             score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
495                             self.ordinal_state = OrdinalState::Space;
496                         } else {
497                             self.ordinal_state = OrdinalState::Other;
498                         }
499                     }
500                     OrdinalState::OrdinalExpectingSpaceOrDigit => {
501                         if caseless_class == 0 {
502                             score += ORDINAL_BONUS;
503                             self.ordinal_state = OrdinalState::Space;
504                         } else if caseless_class == (ASCII_DIGIT as u8) {
505                             score += ORDINAL_BONUS;
506                             // Deliberately set to `Other`
507                             self.ordinal_state = OrdinalState::Other;
508                         } else {
509                             self.ordinal_state = OrdinalState::Other;
510                         }
511                     }
512                     OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily => {
513                         if caseless_class == 0 {
514                             score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
515                             self.ordinal_state = OrdinalState::Space;
516                         } else if caseless_class == (ASCII_DIGIT as u8) {
517                             score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
518                             // Deliberately set to `Other`
519                             self.ordinal_state = OrdinalState::Other;
520                         } else {
521                             self.ordinal_state = OrdinalState::Other;
522                         }
523                     }
524                     OrdinalState::UpperN => {
525                         if b == 0xAA {
526                             self.ordinal_state =
527                                 OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
528                         } else if b == 0xBA {
529                             self.ordinal_state =
530                                 OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
531                         } else if b == b'.' {
532                             self.ordinal_state = OrdinalState::PeriodAfterN;
533                         } else if caseless_class == 0 {
534                             self.ordinal_state = OrdinalState::Space;
535                         } else {
536                             self.ordinal_state = OrdinalState::Other;
537                         }
538                     }
539                     OrdinalState::LowerN => {
540                         if b == 0xBA {
541                             self.ordinal_state =
542                                 OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
543                         } else if b == b'.' {
544                             self.ordinal_state = OrdinalState::PeriodAfterN;
545                         } else if caseless_class == 0 {
546                             self.ordinal_state = OrdinalState::Space;
547                         } else {
548                             self.ordinal_state = OrdinalState::Other;
549                         }
550                     }
551                     OrdinalState::FeminineAbbreviationStartLetter => {
552                         if b == 0xAA {
553                             self.ordinal_state =
554                                 OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
555                         } else if caseless_class == 0 {
556                             self.ordinal_state = OrdinalState::Space;
557                         } else {
558                             self.ordinal_state = OrdinalState::Other;
559                         }
560                     }
561                     OrdinalState::Digit => {
562                         if b == 0xAA || b == 0xBA {
563                             self.ordinal_state = OrdinalState::OrdinalExpectingSpace;
564                         } else if caseless_class == 0 {
565                             self.ordinal_state = OrdinalState::Space;
566                         } else if caseless_class == (ASCII_DIGIT as u8) {
567                             // pass
568                         } else {
569                             self.ordinal_state = OrdinalState::Other;
570                         }
571                     }
572                     OrdinalState::Roman => {
573                         if b == 0xAA || b == 0xBA {
574                             self.ordinal_state =
575                                 OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
576                         } else if caseless_class == 0 {
577                             self.ordinal_state = OrdinalState::Space;
578                         } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24
579                         /* X */
580                         {
581                             // pass
582                         } else {
583                             self.ordinal_state = OrdinalState::Other;
584                         }
585                     }
586                     OrdinalState::PeriodAfterN => {
587                         if b == 0xBA {
588                             self.ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit;
589                         } else if caseless_class == 0 {
590                             self.ordinal_state = OrdinalState::Space;
591                         } else {
592                             self.ordinal_state = OrdinalState::Other;
593                         }
594                     }
595                     OrdinalState::Copyright => {
596                         if caseless_class == 0 {
597                             score += COPYRIGHT_BONUS;
598                             self.ordinal_state = OrdinalState::Space;
599                         } else {
600                             self.ordinal_state = OrdinalState::Other;
601                         }
602                     }
603                 }
604             }
605 
606             if ascii {
607                 self.prev_non_ascii = 0;
608             } else {
609                 self.prev_non_ascii += 1;
610             }
611             self.prev = caseless_class;
612         }
613         Some(score)
614     }
615 }
616 
/// Scoring state for a single-byte candidate that tracks Latin letter case
/// (the code comments call this "French") alongside the word length of a
/// caseless non-Latin script (the code comments call this "Arabic").
struct ArabicFrenchCandidate {
    /// Classification and scoring tables for the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 before any input).
    prev: u8,
    /// Case pattern of the Latin letters immediately preceding the current
    /// byte.
    case_state: LatinCaseState,
    /// Whether the previously fed byte was ASCII (`< 0x80`); starts `true`.
    prev_ascii: bool,
    /// Number of consecutive non-Latin alphabetic bytes in the current word.
    current_word_len: u64,
    /// Longest `current_word_len` recorded so far; updated when a word ends.
    longest_word: u64,
}
625 
626 impl ArabicFrenchCandidate {
new(data: &'static SingleByteData) -> Self627     fn new(data: &'static SingleByteData) -> Self {
628         ArabicFrenchCandidate {
629             data: data,
630             prev: 0,
631             case_state: LatinCaseState::Space,
632             prev_ascii: true,
633             current_word_len: 0,
634             longest_word: 0,
635         }
636     }
637 
feed(&mut self, buffer: &[u8]) -> Option<i64>638     fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
639         let mut score = 0i64;
640         for &b in buffer {
641             let class = self.data.classify(b);
642             if class == 255 {
643                 return None;
644             }
645             let caseless_class = class & 0x7F;
646 
647             let ascii = b < 0x80;
648             let ascii_pair = self.prev_ascii && ascii;
649 
650             if caseless_class != LATIN_LETTER {
651                 // We compute case penalties for French only
652                 self.case_state = LatinCaseState::Space;
653             } else if (class >> 7) == 0 {
654                 if self.case_state == LatinCaseState::AllCaps && !ascii_pair {
655                     score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
656                 }
657                 self.case_state = LatinCaseState::Lower;
658             } else {
659                 match self.case_state {
660                     LatinCaseState::Space => {
661                         self.case_state = LatinCaseState::Upper;
662                     }
663                     LatinCaseState::Upper | LatinCaseState::AllCaps => {
664                         self.case_state = LatinCaseState::AllCaps;
665                     }
666                     LatinCaseState::Lower => {
667                         if !ascii_pair {
668                             score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
669                         }
670                         self.case_state = LatinCaseState::Upper;
671                     }
672                 }
673             }
674 
675             // Count only Arabic word length and ignore French
676             let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, true);
677             // XXX apply penalty if > 23
678             if non_ascii_alphabetic {
679                 self.current_word_len += 1;
680             } else {
681                 if self.current_word_len > self.longest_word {
682                     self.longest_word = self.current_word_len;
683                 }
684                 self.current_word_len = 0;
685             }
686 
687             if !ascii_pair {
688                 score += self.data.score(caseless_class, self.prev, true);
689 
690                 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
691                     score += LATIN_ADJACENCY_PENALTY;
692                 } else if caseless_class == LATIN_LETTER
693                     && self.data.is_non_latin_alphabetic(self.prev, true)
694                 {
695                     score += LATIN_ADJACENCY_PENALTY;
696                 }
697             }
698 
699             self.prev_ascii = ascii;
700             self.prev = caseless_class;
701         }
702         Some(score)
703     }
704 }
705 
/// Scoring state for a single-byte candidate that is scored without any
/// case tracking.
struct CaselessCandidate {
    /// Classification and scoring tables for the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 before any input).
    prev: u8,
    /// Whether the previously fed byte was ASCII (`< 0x80`); starts `true`.
    prev_ascii: bool,
    /// Number of consecutive non-Latin alphabetic bytes in the current word.
    current_word_len: u64,
    /// Longest `current_word_len` recorded so far; updated when a word ends.
    longest_word: u64,
}
713 
714 impl CaselessCandidate {
new(data: &'static SingleByteData) -> Self715     fn new(data: &'static SingleByteData) -> Self {
716         CaselessCandidate {
717             data: data,
718             prev: 0,
719             prev_ascii: true,
720             current_word_len: 0,
721             longest_word: 0,
722         }
723     }
724 
feed(&mut self, buffer: &[u8]) -> Option<i64>725     fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
726         let mut score = 0i64;
727         for &b in buffer {
728             let class = self.data.classify(b);
729             if class == 255 {
730                 return None;
731             }
732             let caseless_class = class & 0x7F;
733 
734             let ascii = b < 0x80;
735             let ascii_pair = self.prev_ascii && ascii;
736 
737             let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
738             // Apply penalty if > 23 and not Thai
739             if non_ascii_alphabetic {
740                 self.current_word_len += 1;
741             } else {
742                 if self.current_word_len > self.longest_word {
743                     self.longest_word = self.current_word_len;
744                 }
745                 self.current_word_len = 0;
746             }
747 
748             if !ascii_pair {
749                 score += self.data.score(caseless_class, self.prev, false);
750 
751                 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
752                     score += LATIN_ADJACENCY_PENALTY;
753                 } else if caseless_class == LATIN_LETTER
754                     && self.data.is_non_latin_alphabetic(self.prev, false)
755                 {
756                     score += LATIN_ADJACENCY_PENALTY;
757                 }
758             }
759 
760             self.prev_ascii = ascii;
761             self.prev = caseless_class;
762         }
763         Some(score)
764     }
765 }
766 
/// Returns `true` for the six ASCII punctuation bytes `.`, `,`, `:`, `;`,
/// `?` and `!`, and `false` for every other byte.
fn is_ascii_punctuation(byte: u8) -> bool {
    b".,:;?!".contains(&byte)
}
773 
/// Scoring state for a single-byte candidate that, in addition to pair
/// scores, counts ASCII punctuation that directly follows a non-Latin
/// letter (see `feed`).
struct LogicalCandidate {
    /// Classification and pair-scoring tables of the candidate encoding.
    data: &'static SingleByteData,
    /// Class of the previously fed byte with the high bit masked off
    /// (0 before any byte has been fed).
    prev: u8,
    /// Whether the previously fed byte was below 0x80 (`true` initially).
    prev_ascii: bool,
    /// Number of class-0 bytes accepted by `is_ascii_punctuation` that came
    /// directly after a non-Latin alphabetic byte.
    plausible_punctuation: u64,
    /// Length of the run of non-Latin alphabetic bytes currently in progress.
    current_word_len: u64,
    /// Length of the longest such run that has already ended.
    longest_word: u64,
}
782 
783 impl LogicalCandidate {
new(data: &'static SingleByteData) -> Self784     fn new(data: &'static SingleByteData) -> Self {
785         LogicalCandidate {
786             data: data,
787             prev: 0,
788             prev_ascii: true,
789             plausible_punctuation: 0,
790             current_word_len: 0,
791             longest_word: 0,
792         }
793     }
794 
feed(&mut self, buffer: &[u8]) -> Option<i64>795     fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
796         let mut score = 0i64;
797         for &b in buffer {
798             let class = self.data.classify(b);
799             if class == 255 {
800                 return None;
801             }
802             let caseless_class = class & 0x7F;
803 
804             let ascii = b < 0x80;
805             let ascii_pair = self.prev_ascii && ascii;
806 
807             let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
808             // XXX apply penalty if > 22
809             if non_ascii_alphabetic {
810                 self.current_word_len += 1;
811             } else {
812                 if self.current_word_len > self.longest_word {
813                     self.longest_word = self.current_word_len;
814                 }
815                 self.current_word_len = 0;
816             }
817 
818             if !ascii_pair {
819                 score += self.data.score(caseless_class, self.prev, false);
820 
821                 let prev_non_ascii_alphabetic = self.data.is_non_latin_alphabetic(self.prev, false);
822                 if caseless_class == 0 && prev_non_ascii_alphabetic && is_ascii_punctuation(b) {
823                     self.plausible_punctuation += 1;
824                 }
825 
826                 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
827                     score += LATIN_ADJACENCY_PENALTY;
828                 } else if caseless_class == LATIN_LETTER && prev_non_ascii_alphabetic {
829                     score += LATIN_ADJACENCY_PENALTY;
830                 }
831             }
832 
833             self.prev_ascii = ascii;
834             self.prev = caseless_class;
835         }
836         Some(score)
837     }
838 }
839 
/// Scoring state for a single-byte candidate that, in addition to pair
/// scores, counts non-Latin letters that directly follow ASCII punctuation
/// (see `feed`).
struct VisualCandidate {
    /// Classification and pair-scoring tables of the candidate encoding.
    data: &'static SingleByteData,
    /// Class of the previously fed byte with the high bit masked off
    /// (0 before any byte has been fed).
    prev: u8,
    /// Whether the previously fed byte was below 0x80 (`true` initially).
    prev_ascii: bool,
    /// Whether the previously fed byte had class 0 and was accepted by
    /// `is_ascii_punctuation`.
    prev_punctuation: bool,
    /// Number of non-Latin alphabetic bytes that came directly after such a
    /// punctuation byte.
    plausible_punctuation: u64,
    /// Length of the run of non-Latin alphabetic bytes currently in progress.
    current_word_len: u64,
    /// Length of the longest such run that has already ended.
    longest_word: u64,
}
849 
850 impl VisualCandidate {
new(data: &'static SingleByteData) -> Self851     fn new(data: &'static SingleByteData) -> Self {
852         VisualCandidate {
853             data: data,
854             prev: 0,
855             prev_ascii: true,
856             prev_punctuation: false,
857             plausible_punctuation: 0,
858             current_word_len: 0,
859             longest_word: 0,
860         }
861     }
862 
feed(&mut self, buffer: &[u8]) -> Option<i64>863     fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
864         let mut score = 0i64;
865         for &b in buffer {
866             let class = self.data.classify(b);
867             if class == 255 {
868                 return None;
869             }
870             let caseless_class = class & 0x7F;
871 
872             let ascii = b < 0x80;
873             let ascii_pair = self.prev_ascii && ascii;
874 
875             let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
876             // XXX apply penalty if > 22
877             if non_ascii_alphabetic {
878                 self.current_word_len += 1;
879             } else {
880                 if self.current_word_len > self.longest_word {
881                     self.longest_word = self.current_word_len;
882                 }
883                 self.current_word_len = 0;
884             }
885 
886             if !ascii_pair {
887                 score += self.data.score(caseless_class, self.prev, false);
888 
889                 if non_ascii_alphabetic && self.prev_punctuation {
890                     self.plausible_punctuation += 1;
891                 }
892 
893                 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
894                     score += LATIN_ADJACENCY_PENALTY;
895                 } else if caseless_class == LATIN_LETTER
896                     && self.data.is_non_latin_alphabetic(self.prev, false)
897                 {
898                     score += LATIN_ADJACENCY_PENALTY;
899                 }
900             }
901 
902             self.prev_ascii = ascii;
903             self.prev = caseless_class;
904             self.prev_punctuation = caseless_class == 0 && is_ascii_punctuation(b);
905         }
906         Some(score)
907     }
908 }
909 
/// Candidate that only checks whether the input decodes without error; its
/// `feed` never yields a score other than 0.
struct Utf8Candidate {
    /// Streaming decoder the input is run through.
    /// NOTE(review): constructed outside this view; presumably a UTF-8
    /// decoder — confirm at the construction site.
    decoder: Decoder,
}
913 
914 impl Utf8Candidate {
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>915     fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
916         let mut dst = [0u8; 1024];
917         let mut total_read = 0;
918         loop {
919             let (result, read, _) = self.decoder.decode_to_utf8_without_replacement(
920                 &buffer[total_read..],
921                 &mut dst,
922                 last,
923             );
924             total_read += read;
925             match result {
926                 DecoderResult::InputEmpty => {
927                     return Some(0);
928                 }
929                 DecoderResult::Malformed(_, _) => {
930                     return None;
931                 }
932                 DecoderResult::OutputFull => {
933                     continue;
934                 }
935             }
936         }
937     }
938 }
939 
/// Candidate that only checks whether the input decodes without error; its
/// `feed` never yields a score other than 0.
struct Iso2022Candidate {
    /// Streaming decoder the input is run through.
    /// NOTE(review): constructed outside this view; presumably an
    /// ISO-2022-JP decoder — confirm at the construction site.
    decoder: Decoder,
}
943 
944 impl Iso2022Candidate {
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>945     fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
946         let mut dst = [0u16; 1024];
947         let mut total_read = 0;
948         loop {
949             let (result, read, _) = self.decoder.decode_to_utf16_without_replacement(
950                 &buffer[total_read..],
951                 &mut dst,
952                 last,
953             );
954             total_read += read;
955             match result {
956                 DecoderResult::InputEmpty => {
957                     return Some(0);
958                 }
959                 DecoderResult::Malformed(_, _) => {
960                     return None;
961                 }
962                 DecoderResult::OutputFull => {
963                     continue;
964                 }
965             }
966         }
967     }
968 }
969 
/// Coarse category of the most recently scored character, kept by the
/// Chinese and Japanese candidates so that a Latin letter directly next to a
/// CJK character can be charged `CJK_LATIN_ADJACENCY_PENALTY`.
#[derive(PartialEq)]
enum LatinCj {
    /// An ASCII letter (`a`-`z` or `A`-`Z`).
    AsciiLetter,
    /// A character the candidate treats as Chinese/Japanese text
    /// (for example an ideograph).
    Cj,
    /// Anything else.
    Other,
}
976 
/// Which half-width voicing marks may plausibly follow the previous
/// character, as tracked by the Japanese candidates.
///
/// U+FF9E (half-width dakuten) is penalized when the state is
/// `DakutenForbidden`; U+FF9F (half-width handakuten) is penalized unless
/// the state is `DakutenOrHandakutenAllowed`.
#[derive(PartialEq, Copy, Clone)]
enum HalfWidthKatakana {
    /// State after any character not listed for the other two variants.
    DakutenForbidden,
    /// State after U+FF73 or U+FF76..=U+FF84.
    DakutenAllowed,
    /// State after U+FF8A..=U+FF8E.
    DakutenOrHandakutenAllowed,
}
983 
/// Coarse character category with separate Hangul and Hanja variants.
///
/// NOTE(review): no user of this enum is visible in this part of the file;
/// presumably it plays the role of `LatinCj` for a Korean candidate — confirm.
#[derive(PartialEq)]
enum LatinKorean {
    AsciiLetter,
    Hangul,
    Hanja,
    Other,
}
991 
/// Returns a small bonus for `u` based on where it first occurs in `table`.
///
/// The bonus is `(128 - index) / 16` with integer division: 8 for index 0,
/// 7 for indices 1 through 16, and so on down to 0 for indices 113 through
/// 127. A code unit that does not occur in `table` gets 0.
fn cjk_extra_score(u: u16, table: &'static [u16; 128]) -> i64 {
    match table.iter().position(|&entry| entry == u) {
        Some(index) => ((128 - index) / 16) as i64,
        None => 0,
    }
}
999 
/// Scoring state for the GBK candidate.
struct GbkCandidate {
    /// Streaming decoder fed one byte at a time; `feed` replaces it with a
    /// fresh `GBK` decoder after tolerating a single-byte-extension error.
    decoder: Decoder,
    /// The byte fed in the previous loop iteration of `feed`.
    prev_byte: u8,
    /// Category of the most recently scored character.
    prev: LatinCj,
    /// Score withheld by `maybe_set_as_pending`; later either added to the
    /// total or discarded depending on what is decoded next.
    pending_score: Option<i64>,
}
1006 
1007 impl GbkCandidate {
maybe_set_as_pending(&mut self, s: i64) -> i641008     fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1009         assert!(self.pending_score.is_none());
1010         if self.prev == LatinCj::Cj || !more_problematic_lead(self.prev_byte) {
1011             s
1012         } else {
1013             self.pending_score = Some(s);
1014             0
1015         }
1016     }
1017 
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>1018     fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1019         let mut score = 0i64;
1020         let mut src = [0u8];
1021         let mut dst = [0u16; 2];
1022         for &b in buffer {
1023             src[0] = b;
1024             let (result, read, written) = self
1025                 .decoder
1026                 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1027             if written == 1 {
1028                 let u = dst[0];
1029                 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1030                     || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1031                 {
1032                     self.pending_score = None; // Discard pending score
1033                     if self.prev == LatinCj::Cj {
1034                         score += CJK_LATIN_ADJACENCY_PENALTY;
1035                     }
1036                     self.prev = LatinCj::AsciiLetter;
1037                 } else if u == 0x20AC {
1038                     // euro sign
1039                     self.pending_score = None; // Discard pending score
1040                                                // Should there even be a penalty?
1041                     self.prev = LatinCj::Other;
1042                 } else if u >= 0x4E00 && u <= 0x9FA5 {
1043                     if let Some(pending) = self.pending_score {
1044                         score += pending;
1045                         self.pending_score = None;
1046                     }
1047                     if b >= 0xA1 && b <= 0xFE {
1048                         match self.prev_byte {
1049                             0xA1..=0xD7 => {
1050                                 score += GBK_SCORE_PER_LEVEL_1;
1051                                 score +=
1052                                     cjk_extra_score(u, &data::DETECTOR_DATA.frequent_simplified);
1053                             }
1054                             0xD8..=0xFE => score += GBK_SCORE_PER_LEVEL_2,
1055                             _ => {
1056                                 score += GBK_SCORE_PER_NON_EUC;
1057                             }
1058                         }
1059                     } else {
1060                         score += self.maybe_set_as_pending(GBK_SCORE_PER_NON_EUC);
1061                     }
1062                     if self.prev == LatinCj::AsciiLetter {
1063                         score += CJK_LATIN_ADJACENCY_PENALTY;
1064                     }
1065                     self.prev = LatinCj::Cj;
1066                 } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1067                     if let Some(pending) = self.pending_score {
1068                         score += pending;
1069                         self.pending_score = None;
1070                     }
1071                     // XXX score?
1072                     if self.prev == LatinCj::AsciiLetter {
1073                         score += CJK_LATIN_ADJACENCY_PENALTY;
1074                     }
1075                     self.prev = LatinCj::Cj;
1076                 } else if u >= 0xE000 && u < 0xF900 {
1077                     if let Some(pending) = self.pending_score {
1078                         score += pending;
1079                         self.pending_score = None;
1080                     }
1081                     // Treat the GB18030-required PUA mappings as non-EUC ideographs.
1082                     match u {
1083                         0xE78D..=0xE796
1084                         | 0xE816..=0xE818
1085                         | 0xE81E
1086                         | 0xE826
1087                         | 0xE82B
1088                         | 0xE82C
1089                         | 0xE831
1090                         | 0xE832
1091                         | 0xE83B
1092                         | 0xE843
1093                         | 0xE854
1094                         | 0xE855
1095                         | 0xE864 => {
1096                             score += GBK_SCORE_PER_NON_EUC;
1097                             if self.prev == LatinCj::AsciiLetter {
1098                                 score += CJK_LATIN_ADJACENCY_PENALTY;
1099                             }
1100                             self.prev = LatinCj::Cj;
1101                         }
1102                         _ => {
1103                             score += GBK_PUA_PENALTY;
1104                             self.prev = LatinCj::Other;
1105                         }
1106                     }
1107                 } else {
1108                     match u {
1109                         0x3000 // Distinct from Korean, space
1110                         | 0x3001 // Distinct from Korean, enumeration comma
1111                         | 0x3002 // Distinct from Korean, full stop
1112                         | 0xFF08 // Distinct from Korean, parenthesis
1113                         | 0xFF09 // Distinct from Korean, parenthesis
1114                         | 0xFF01 // Distinct from Japanese, exclamation
1115                         | 0xFF0C // Distinct from Japanese, comma
1116                         | 0xFF1B // Distinct from Japanese, semicolon
1117                         | 0xFF1F // Distinct from Japanese, question
1118                         => {
1119                             if let Some(pending) = self.pending_score {
1120                                 score += pending;
1121                                 self.pending_score = None;
1122                             }
1123                             score += CJ_PUNCTUATION;
1124                         }
1125                         0..=0x7F => {
1126                             self.pending_score = None; // Discard pending score
1127                         }
1128                         _ => {
1129                             if let Some(pending) = self.pending_score {
1130                                 score += pending;
1131                                 self.pending_score = None;
1132                             }
1133                             score += CJK_OTHER;
1134                         }
1135                     }
1136                     self.prev = LatinCj::Other;
1137                 }
1138             } else if written == 2 {
1139                 if let Some(pending) = self.pending_score {
1140                     score += pending;
1141                     self.pending_score = None;
1142                 }
1143                 let u = dst[0];
1144                 if u >= 0xDB80 && u <= 0xDBFF {
1145                     score += GBK_PUA_PENALTY;
1146                     self.prev = LatinCj::Other;
1147                 } else if u >= 0xD480 && u < 0xD880 {
1148                     score += GBK_SCORE_PER_NON_EUC;
1149                     if self.prev == LatinCj::AsciiLetter {
1150                         score += CJK_LATIN_ADJACENCY_PENALTY;
1151                     }
1152                     self.prev = LatinCj::Cj;
1153                 } else {
1154                     score += CJK_OTHER;
1155                     self.prev = LatinCj::Other;
1156                 }
1157             }
1158             match result {
1159                 DecoderResult::InputEmpty => {
1160                     assert_eq!(read, 1);
1161                 }
1162                 DecoderResult::Malformed(malformed_len, _) => {
1163                     if (self.prev_byte == 0xA0 || self.prev_byte == 0xFE || self.prev_byte == 0xFD)
1164                         && (b < 0x80 || b == 0xFF)
1165                     {
1166                         // Mac OS Chinese Simplified single-byte that conflicts with code page GBK lead byte
1167                         // followed by ASCII or a non-conflicting single-byte extension.
1168                         self.pending_score = None; // Just in case
1169                         score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
1170                         if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
1171                             self.prev = LatinCj::AsciiLetter;
1172                         } else if b == 0xFF {
1173                             score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
1174                             self.prev = LatinCj::Other;
1175                         } else {
1176                             self.prev = LatinCj::Other;
1177                         }
1178                         // The GBK decoder has the pending ASCII concept, which is
1179                         // a problem with this trickery, so let's reset the state.
1180                         self.decoder = GBK.new_decoder_without_bom_handling();
1181                     } else if malformed_len == 1 && b == 0xFF {
1182                         // Mac OS Chinese Simplified single-byte extension that doesn't conflict with lead bytes
1183                         self.pending_score = None; // Just in case
1184                         score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
1185                         self.prev = LatinCj::Other;
1186                         // The GBK decoder has the pending ASCII concept, which is
1187                         // a problem with this trickery, so let's reset the state.
1188                         self.decoder = GBK.new_decoder_without_bom_handling();
1189                     } else {
1190                         return None;
1191                     }
1192                 }
1193                 DecoderResult::OutputFull => {
1194                     unreachable!();
1195                 }
1196             }
1197             self.prev_byte = b;
1198         }
1199         if last {
1200             let (result, _, _) = self
1201                 .decoder
1202                 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1203             match result {
1204                 DecoderResult::InputEmpty => {}
1205                 DecoderResult::Malformed(_, _) => {
1206                     return None;
1207                 }
1208                 DecoderResult::OutputFull => {
1209                     unreachable!();
1210                 }
1211             }
1212         }
1213         Some(score)
1214     }
1215 }
1216 
// Shift_JIS and Big5
/// Returns `true` when `b` is one of the lead bytes singled out for deferred
/// scoring: 0x91 through 0x97, or one of 0x8A, 0x8B, 0x8E, 0x9A, 0x9B, 0x9E
/// and 0xB0.
fn problematic_lead(b: u8) -> bool {
    const ISOLATED: [u8; 7] = [0x8A, 0x8B, 0x8E, 0x9A, 0x9B, 0x9E, 0xB0];
    (b >= 0x91 && b <= 0x97) || ISOLATED.contains(&b)
}
1224 
1225 // GBK and EUC-KR
more_problematic_lead(b: u8) -> bool1226 fn more_problematic_lead(b: u8) -> bool {
1227     problematic_lead(b) || b == 0x82 || b == 0x84 || b == 0x85 || b == 0xA0
1228 }
1229 
/// Scoring state for the Shift_JIS candidate.
struct ShiftJisCandidate {
    /// Streaming decoder fed one byte at a time.
    decoder: Decoder,
    /// Set once the first half-width katakana (U+FF61..=U+FF9F) has been
    /// decoded, so that the initial penalty for it is charged only once.
    half_width_katakana_seen: bool,
    /// Which half-width voicing marks may plausibly follow the previous
    /// character.
    half_width_katakana_state: HalfWidthKatakana,
    /// Category of the most recently scored character.
    prev: LatinCj,
    /// The byte fed in the previous loop iteration of `feed`.
    prev_byte: u8,
    /// Score withheld by `maybe_set_as_pending`; later either added to the
    /// total or discarded depending on what is decoded next.
    pending_score: Option<i64>,
}
1238 
impl ShiftJisCandidate {
    /// Either returns `s` for immediate use or withholds it.
    ///
    /// `s` is returned as-is when the previous character was CJK or when
    /// `prev_byte` (the lead byte of the sequence just decoded) is not a
    /// `problematic_lead`. Otherwise `s` is stored in `pending_score` and 0
    /// is returned; `feed` later adds or discards the stored value depending
    /// on the next decoded character.
    ///
    /// Panics if a pending score is already stored.
    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
        assert!(self.pending_score.is_none());
        if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) {
            s
        } else {
            self.pending_score = Some(s);
            0
        }
    }

    /// Feeds `buffer` to the decoder one byte at a time and returns the score
    /// accumulated by this call.
    ///
    /// Returns `None` when the decoder reports a malformed sequence that is
    /// not tolerated as an extension (see the `Malformed` arm below), or,
    /// when `last` is `true`, when flushing the decoder reports an error.
    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
        let mut score = 0i64;
        let mut src = [0u8];
        let mut dst = [0u16; 2];
        for &b in buffer {
            src[0] = b;
            let (result, read, written) = self
                .decoder
                .decode_to_utf16_without_replacement(&src, &mut dst, false);
            if written > 0 {
                // Remember the voicing state left by the previous character,
                // then reset it; only the katakana branch below sets it again.
                let half_width_katakana_state = self.half_width_katakana_state;
                self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
                let u = dst[0];
                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
                {
                    self.pending_score = None; // Discard pending score
                    if self.prev == LatinCj::Cj {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::AsciiLetter;
                } else if u >= 0xFF61 && u <= 0xFF9F {
                    // Half-width katakana block.
                    if !self.half_width_katakana_seen {
                        self.half_width_katakana_seen = true;
                        // To avoid misdetecting title-length inputs
                        score += SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY;
                    }
                    self.pending_score = None; // Discard pending score
                    score += HALF_WIDTH_KATAKANA_SCORE;

                    if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
                        self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
                    } else if u >= 0xFF8A && u <= 0xFF8E {
                        self.half_width_katakana_state =
                            HalfWidthKatakana::DakutenOrHandakutenAllowed;
                    } else if u == 0xFF9E {
                        // Half-width dakuten: implausible unless the previous
                        // character allowed it.
                        if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
                            score += IMPLAUSIBILITY_PENALTY;
                        } else {
                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
                        }
                    } else if u == 0xFF9F {
                        // Half-width handakuten: implausible unless the
                        // previous character allowed both marks.
                        if half_width_katakana_state
                            != HalfWidthKatakana::DakutenOrHandakutenAllowed
                        {
                            score += IMPLAUSIBILITY_PENALTY;
                        } else {
                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
                        }
                    }

                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Cj;
                } else if u >= 0x3040 && u < 0x3100 {
                    // Hiragana and full-width katakana blocks.
                    if let Some(pending) = self.pending_score {
                        score += pending;
                        self.pending_score = None;
                    }
                    score += SHIFT_JIS_SCORE_PER_KANA;
                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Cj;
                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
                    if let Some(pending) = self.pending_score {
                        score += pending;
                        self.pending_score = None;
                    }
                    // `prev_byte` is the lead byte and `b` the trail byte
                    // (`prev_byte` is only updated at the end of the loop).
                    // Byte pairs below 0x98 0x73 get the level 1 score plus a
                    // frequency bonus; the rest get the level 2 score.
                    if self.prev_byte < 0x98 || (self.prev_byte == 0x98 && b < 0x73) {
                        score += self.maybe_set_as_pending(
                            SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI
                                + cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji),
                        );
                    } else {
                        score += self.maybe_set_as_pending(SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI);
                    }
                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Cj;
                } else if u >= 0xE000 && u < 0xF900 {
                    // Private Use Area.
                    if let Some(pending) = self.pending_score {
                        score += pending;
                        self.pending_score = None;
                    }
                    score += SHIFT_JIS_PUA_PENALTY;
                    self.prev = LatinCj::Other;
                } else {
                    match u {
                        0x3000 // Distinct from Korean, space
                        | 0x3001 // Distinct from Korean, enumeration comma
                        | 0x3002 // Distinct from Korean, full stop
                        | 0xFF08 // Distinct from Korean, parenthesis
                        | 0xFF09 // Distinct from Korean, parenthesis
                        => {
                            if let Some(pending) = self.pending_score {
                                score += pending;
                                self.pending_score = None;
                            }
                            // Not really needed for CJK distinction
                            // but let's give non-zero score for these
                            // common byte pairs anyway.
                            score += CJ_PUNCTUATION;
                        }
                        0..=0x7F => {
                            self.pending_score = None; // Discard pending score
                        }
                        0x80 => {
                            // This is a control character that overlaps euro
                            // in windows-1252 and happens to be a non-error
                            // in Shift_JIS.
                            self.pending_score = None; // Discard pending score
                            score += IMPLAUSIBILITY_PENALTY;
                        }
                        _ => {
                            if let Some(pending) = self.pending_score {
                                score += pending;
                                self.pending_score = None;
                            }
                            score += CJK_OTHER;
                        }
                    }
                    self.prev = LatinCj::Other;
                }
            }
            match result {
                DecoderResult::InputEmpty => {
                    assert_eq!(read, 1);
                }
                DecoderResult::Malformed(malformed_len, _) => {
                    // Tolerate the error when `prev_byte` and `b` form a
                    // lead/trail pair in the ranges tested below, minus the
                    // listed exclusions.
                    if (((self.prev_byte >= 0x81 && self.prev_byte <= 0x9F)
                        || (self.prev_byte >= 0xE0 && self.prev_byte <= 0xFC))
                        && ((b >= 0x40 && b <= 0x7E) || (b >= 0x80 && b <= 0xFC)))
                        && !((self.prev_byte == 0x82 && b >= 0xFA)
                            || (self.prev_byte == 0x84 && ((b >= 0xDD && b <= 0xE4) || b >= 0xFB))
                            || (self.prev_byte == 0x86 && b >= 0xF2 && b <= 0xFA)
                            || (self.prev_byte == 0x87 && b >= 0x77 && b <= 0x7D)
                            || (self.prev_byte == 0xFC && b >= 0xF5))
                    {
                        // Shift_JIS2004 or MacJapanese
                        if let Some(pending) = self.pending_score {
                            score += pending;
                            self.pending_score = None;
                        }
                        score += SHIFT_JIS_EXTENSION_PENALTY;
                        // Approximate boundary
                        if self.prev_byte < 0x87 {
                            self.prev = LatinCj::Other;
                        } else {
                            if self.prev == LatinCj::AsciiLetter {
                                score += CJK_LATIN_ADJACENCY_PENALTY;
                            }
                            self.prev = LatinCj::Cj;
                        }
                    } else if malformed_len == 1 && (b == 0xA0 || b >= 0xFD) {
                        // A lone 0xA0, 0xFD, 0xFE or 0xFF is tolerated at the
                        // cost of the single-byte extension penalty.
                        self.pending_score = None; // Just in case
                        score += SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY;
                        self.prev = LatinCj::Other;
                    } else {
                        return None;
                    }
                }
                DecoderResult::OutputFull => {
                    unreachable!();
                }
            }
            self.prev_byte = b;
        }
        if last {
            // Flush the decoder so that a sequence cut off at end of input
            // is reported as malformed.
            let (result, _, _) = self
                .decoder
                .decode_to_utf16_without_replacement(b"", &mut dst, true);
            match result {
                DecoderResult::InputEmpty => {}
                DecoderResult::Malformed(_, _) => {
                    return None;
                }
                DecoderResult::OutputFull => {
                    unreachable!();
                }
            }
        }
        Some(score)
    }
}
1437 
/// Scoring state for the EUC-JP candidate.
struct EucJpCandidate {
    /// Streaming decoder fed one byte at a time.
    decoder: Decoder,
    /// Set once the first decoded code unit at or above 0x80 has been seen.
    non_ascii_seen: bool,
    /// Which half-width voicing marks may plausibly follow the previous
    /// character.
    half_width_katakana_state: HalfWidthKatakana,
    /// Category of the most recently scored character.
    prev: LatinCj,
    /// NOTE(review): presumably the byte fed in the previous iteration, as
    /// in the other candidates — its use is past the end of this view.
    prev_byte: u8,
    /// NOTE(review): presumably the byte fed before `prev_byte` — its use is
    /// past the end of this view.
    prev_prev_byte: u8,
}
1446 
impl EucJpCandidate {
    /// Feeds `buffer` through the decoder one byte at a time and returns
    /// the score accumulated during this call.
    ///
    /// Returns `None` when the candidate is ruled out: either the decoder
    /// reported a malformed sequence that is not tolerated as an unmapped
    /// extension byte pair, or `last` is `true` and flushing the decoder
    /// after `buffer` reported a malformed sequence.
    ///
    /// Context needed across bytes lives in `self`, so the state left by
    /// one call is picked up by the next.
    ///
    /// # Panics
    ///
    /// Panics if the decoder reports `OutputFull`, or reports `InputEmpty`
    /// without having consumed the single input byte.
    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
        let mut score = 0i64;
        // One input byte at a time; at most two UTF-16 code units of output.
        let mut src = [0u8];
        let mut dst = [0u16; 2];
        for &b in buffer {
            src[0] = b;
            let (result, read, written) = self
                .decoder
                .decode_to_utf16_without_replacement(&src, &mut dst, false);
            if written > 0 {
                // Remember what the previous character allowed, then reset
                // to the default; only the half-width katakana branch
                // below re-enables a voicing mark for the next character.
                let half_width_katakana_state = self.half_width_katakana_state;
                self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
                let u = dst[0];
                if !self.non_ascii_seen && u >= 0x80 {
                    self.non_ascii_seen = true;
                    if u >= 0x3040 && u < 0x3100 {
                        // Remove the kana advantage over initial Big5
                        // hanzi.
                        score += EUC_JP_INITIAL_KANA_PENALTY;
                    }
                }
                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
                {
                    // ASCII letter directly after a CJ character.
                    if self.prev == LatinCj::Cj {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::AsciiLetter;
                } else if u >= 0xFF61 && u <= 0xFF9F {
                    // Half-width katakana block.
                    score += HALF_WIDTH_KATAKANA_SCORE;

                    if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
                        // KA..TO or U: a dakuten may follow.
                        self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
                    } else if u >= 0xFF8A && u <= 0xFF8E {
                        // HA..HO: either voicing mark may follow.
                        self.half_width_katakana_state =
                            HalfWidthKatakana::DakutenOrHandakutenAllowed;
                    } else if u == 0xFF9E {
                        // Half-width dakuten: implausible unless the
                        // previous character allowed it.
                        if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
                            score += IMPLAUSIBILITY_PENALTY;
                        } else {
                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
                        }
                    } else if u == 0xFF9F {
                        // Half-width handakuten: only plausible after
                        // HA..HO.
                        if half_width_katakana_state
                            != HalfWidthKatakana::DakutenOrHandakutenAllowed
                        {
                            score += IMPLAUSIBILITY_PENALTY;
                        } else {
                            score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
                        }
                    }

                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Other;
                } else if (u >= 0x3041 && u <= 0x3093) || (u >= 0x30A1 && u <= 0x30F6) {
                    // Full-width hiragana or katakana.
                    match u {
                        0x3090 // hiragana wi
                        | 0x3091 // hiragana we
                        | 0x30F0 // katakana wi
                        | 0x30F1 // katakana we
                        => {
                            // Remove advantage over Big5 Hanzi
                            score += EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA;
                        }
                        _ => {
                            score += EUC_JP_SCORE_PER_KANA;
                        }
                    }
                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Cj;
                } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
                    // Kanji. `b` is the final byte of the sequence here, so
                    // `prev_byte`/`prev_prev_byte` are the bytes before it.
                    if self.prev_prev_byte == 0x8F {
                        // Three-byte sequence introduced by 0x8F.
                        score += EUC_JP_SCORE_PER_OTHER_KANJI;
                    } else if self.prev_byte < 0xD0 {
                        // Two-byte sequence with a lead byte below 0xD0;
                        // only this class gets the frequency-based extra.
                        score += EUC_JP_SCORE_PER_LEVEL_1_KANJI;
                        score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji);
                    } else {
                        score += EUC_JP_SCORE_PER_LEVEL_2_KANJI;
                    }
                    if self.prev == LatinCj::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinCj::Cj;
                } else {
                    match u {
                        0x3000 // Distinct from Korean, space
                        | 0x3001 // Distinct from Korean, enumeration comma
                        | 0x3002 // Distinct from Korean, full stop
                        | 0xFF08 // Distinct from Korean, parenthesis
                        | 0xFF09 // Distinct from Korean, parenthesis
                        => {
                            score += CJ_PUNCTUATION;
                        }
                        // ASCII that is not a letter: no score.
                        0..=0x7F => {}
                        _ => {
                            score += CJK_OTHER;
                        }
                    }
                    self.prev = LatinCj::Other;
                }
            }
            match result {
                DecoderResult::InputEmpty => {
                    assert_eq!(read, 1);
                }
                DecoderResult::Malformed(_, _) => {
                    // Tolerate a pair of bytes that are both in 0xA1..=0xFE
                    // but that the decoder has no mapping for, except for
                    // the cells and rows excluded below. The first group of
                    // exclusions applies to two-byte sequences, the second
                    // to sequences introduced by 0x8F.
                    // NOTE(review): the rationale for the specific
                    // exclusions is not visible here; presumably they are
                    // positions that known extensions leave unused -
                    // confirm against the project's notes.
                    if b >= 0xA1
                        && b <= 0xFE
                        && self.prev_byte >= 0xA1
                        && self.prev_byte <= 0xFE
                        && ((self.prev_prev_byte != 0x8F
                            && !(self.prev_byte == 0xA8 && b >= 0xDF && b <= 0xE6)
                            && !(self.prev_byte == 0xAC && b >= 0xF4 && b <= 0xFC)
                            && !(self.prev_byte == 0xAD && b >= 0xD8 && b <= 0xDE))
                            || (self.prev_prev_byte == 0x8F
                                && self.prev_byte != 0xA2
                                && self.prev_byte != 0xA6
                                && self.prev_byte != 0xA7
                                && self.prev_byte != 0xA9
                                && self.prev_byte != 0xAA
                                && self.prev_byte != 0xAB
                                && self.prev_byte != 0xED
                                && !(self.prev_byte == 0xFE && b >= 0xF7)))
                    {
                        // Penalized, but otherwise handled like a CJ
                        // character for adjacency purposes.
                        score += EUC_JP_EXTENSION_PENALTY;
                        if self.prev == LatinCj::AsciiLetter {
                            score += CJK_LATIN_ADJACENCY_PENALTY;
                        }
                        self.prev = LatinCj::Cj;
                    } else {
                        // Any other malformed sequence rules this
                        // candidate out.
                        return None;
                    }
                }
                DecoderResult::OutputFull => {
                    unreachable!();
                }
            }
            // Shift the two-byte history window.
            self.prev_prev_byte = self.prev_byte;
            self.prev_byte = b;
        }
        if last {
            // Flush the decoder with empty input; a malformed result at
            // this point rules the candidate out.
            let (result, _, _) = self
                .decoder
                .decode_to_utf16_without_replacement(b"", &mut dst, true);
            match result {
                DecoderResult::InputEmpty => {}
                DecoderResult::Malformed(_, _) => {
                    return None;
                }
                DecoderResult::OutputFull => {
                    unreachable!();
                }
            }
        }
        Some(score)
    }
}
1609 
/// Scoring state for the Big5 candidate.
///
/// `feed` pushes input through `decoder` one byte at a time and scores the
/// decoded output; the remaining fields carry context between bytes and
/// between `feed` calls.
struct Big5Candidate {
    /// Streaming decoder driven by `feed`. Presumably a Big5 decoder; it
    /// is constructed outside this definition.
    decoder: Decoder,
    /// Coarse class of the previously scored character, consulted for the
    /// Latin/CJK adjacency penalty and by `maybe_set_as_pending`.
    prev: LatinCj,
    /// The input byte immediately before the one being processed.
    prev_byte: u8,
    /// Hanzi score that `maybe_set_as_pending` withheld. `feed` adds it to
    /// the total when the next scored character is non-ASCII (or a
    /// tolerated unmapped pair) and discards it when ASCII or a
    /// single-byte extension follows.
    pending_score: Option<i64>,
}
1616 
1617 impl Big5Candidate {
maybe_set_as_pending(&mut self, s: i64) -> i641618     fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1619         assert!(self.pending_score.is_none());
1620         if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) {
1621             s
1622         } else {
1623             self.pending_score = Some(s);
1624             0
1625         }
1626     }
1627 
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>1628     fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1629         let mut score = 0i64;
1630         let mut src = [0u8];
1631         let mut dst = [0u16; 2];
1632         for &b in buffer {
1633             src[0] = b;
1634             let (result, read, written) = self
1635                 .decoder
1636                 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1637             if written == 1 {
1638                 let u = dst[0];
1639                 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1640                     || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1641                 {
1642                     self.pending_score = None; // Discard pending score
1643                     if self.prev == LatinCj::Cj {
1644                         score += CJK_LATIN_ADJACENCY_PENALTY;
1645                     }
1646                     self.prev = LatinCj::AsciiLetter;
1647                 } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1648                     if let Some(pending) = self.pending_score {
1649                         score += pending;
1650                         self.pending_score = None;
1651                     }
1652                     match self.prev_byte {
1653                         0xA4..=0xC6 => {
1654                             score += self.maybe_set_as_pending(BIG5_SCORE_PER_LEVEL_1_HANZI);
1655                             // score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_traditional);
1656                         }
1657                         _ => {
1658                             score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
1659                         }
1660                     }
1661                     if self.prev == LatinCj::AsciiLetter {
1662                         score += CJK_LATIN_ADJACENCY_PENALTY;
1663                     }
1664                     self.prev = LatinCj::Cj;
1665                 } else {
1666                     match u {
1667                         0x3000 // Distinct from Korean, space
1668                         | 0x3001 // Distinct from Korean, enumeration comma
1669                         | 0x3002 // Distinct from Korean, full stop
1670                         | 0xFF08 // Distinct from Korean, parenthesis
1671                         | 0xFF09 // Distinct from Korean, parenthesis
1672                         | 0xFF01 // Distinct from Japanese, exclamation
1673                         | 0xFF0C // Distinct from Japanese, comma
1674                         | 0xFF1B // Distinct from Japanese, semicolon
1675                         | 0xFF1F // Distinct from Japanese, question
1676                         => {
1677                             if let Some(pending) = self.pending_score {
1678                                 score += pending;
1679                                 self.pending_score = None;
1680                             }
1681                             // Not really needed for CJK distinction
1682                             // but let's give non-zero score for these
1683                             // common byte pairs anyway.
1684                             score += CJ_PUNCTUATION;
1685                         }
1686                         0..=0x7F => {
1687                             self.pending_score = None; // Discard pending score
1688                         }
1689                         _ => {
1690                             if let Some(pending) = self.pending_score {
1691                                 score += pending;
1692                                 self.pending_score = None;
1693                             }
1694                             score += CJK_OTHER;
1695                         }
1696                     }
1697                     self.prev = LatinCj::Other;
1698                 }
1699             } else if written == 2 {
1700                 if let Some(pending) = self.pending_score {
1701                     score += pending;
1702                     self.pending_score = None;
1703                 }
1704                 if dst[0] == 0xCA || dst[0] == 0xEA {
1705                     score += CJK_OTHER;
1706                     self.prev = LatinCj::Other;
1707                 } else {
1708                     debug_assert!(dst[0] >= 0xD480 && dst[0] < 0xD880);
1709                     score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
1710                     if self.prev == LatinCj::AsciiLetter {
1711                         score += CJK_LATIN_ADJACENCY_PENALTY;
1712                     }
1713                     self.prev = LatinCj::Cj;
1714                 }
1715             }
1716             match result {
1717                 DecoderResult::InputEmpty => {
1718                     assert_eq!(read, 1);
1719                 }
1720                 DecoderResult::Malformed(malformed_len, _) => {
1721                     if self.prev_byte >= 0x81
1722                         && self.prev_byte <= 0xFE
1723                         && ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE))
1724                     {
1725                         // The byte pair is in the Big5 range but unmapped.
1726                         // Treat as PUA to avoid rejecting Big5-UAO, etc.
1727                         // We don't reprocess `b` even if ASCII, since it's
1728                         // logically part of the pair.
1729                         if let Some(pending) = self.pending_score {
1730                             score += pending;
1731                             self.pending_score = None;
1732                         }
1733                         score += BIG5_PUA_PENALTY;
1734                         // Assume Hanzi semantics
1735                         if self.prev == LatinCj::AsciiLetter {
1736                             score += CJK_LATIN_ADJACENCY_PENALTY;
1737                         }
1738                         self.prev = LatinCj::Cj;
1739                     } else if (self.prev_byte == 0xA0
1740                         || self.prev_byte == 0xFD
1741                         || self.prev_byte == 0xFE)
1742                         && (b < 0x80 || b == 0xFF)
1743                     {
1744                         // Mac OS Chinese Traditional single-byte that conflicts with code page Big5 lead byte
1745                         // followed by ASCII or a non-conflicting single-byte extension.
1746                         self.pending_score = None; // Just in case
1747                         score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1748                         if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
1749                             self.prev = LatinCj::AsciiLetter;
1750                         } else if b == 0xFF {
1751                             score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1752                             self.prev = LatinCj::Other;
1753                         } else {
1754                             self.prev = LatinCj::Other;
1755                         }
1756                     } else if malformed_len == 1 && b == 0xFF {
1757                         // Mac OS Chinese Traditional single-byte extension that doesn't conflict with lead bytes
1758                         self.pending_score = None; // Just in case
1759                         score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1760                         self.prev = LatinCj::Other;
1761                     } else {
1762                         return None;
1763                     }
1764                 }
1765                 DecoderResult::OutputFull => {
1766                     unreachable!();
1767                 }
1768             }
1769             self.prev_byte = b;
1770         }
1771         if last {
1772             let (result, _, _) = self
1773                 .decoder
1774                 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1775             match result {
1776                 DecoderResult::InputEmpty => {}
1777                 DecoderResult::Malformed(_, _) => {
1778                     return None;
1779                 }
1780                 DecoderResult::OutputFull => {
1781                     unreachable!();
1782                 }
1783             }
1784         }
1785         Some(score)
1786     }
1787 }
1788 
/// Scoring state for the EUC-KR candidate.
///
/// `feed` pushes input through `decoder` one byte at a time and scores the
/// decoded output; the remaining fields carry context between bytes and
/// between `feed` calls.
struct EucKrCandidate {
    /// Streaming decoder driven by `feed`. Presumably an EUC-KR decoder;
    /// it is constructed outside this definition.
    decoder: Decoder,
    /// The input byte immediately before the one being processed.
    prev_byte: u8,
    /// Whether `prev_byte` was in 0xA1..=0xFE.
    prev_was_euc_range: bool,
    /// Coarse class of the previously scored character.
    prev: LatinKorean,
    /// Length of the current run of Hangul/Hanja-class characters; `feed`
    /// resets it on any other character and penalizes runs longer than 5.
    current_word_len: u64,
    /// Hangul score that `maybe_set_as_pending` withheld. `feed` adds it
    /// to the total when the next scored character is non-ASCII and
    /// discards it when ASCII or a single-byte extension follows.
    pending_score: Option<i64>,
}
1797 
impl EucKrCandidate {
    /// Either returns `s` for immediate use or withholds it in
    /// `pending_score` and returns 0.
    ///
    /// The score is withheld only when the previously scored character was
    /// not Hangul *and* `more_problematic_lead(self.prev_byte)` holds.
    /// `feed` calls this before updating `prev` and `prev_byte`.
    ///
    /// # Panics
    ///
    /// Panics if a score is already pending; callers must settle or
    /// discard the pending score first.
    fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
        assert!(self.pending_score.is_none());
        if self.prev == LatinKorean::Hangul || !more_problematic_lead(self.prev_byte) {
            s
        } else {
            self.pending_score = Some(s);
            0
        }
    }

    /// Feeds `buffer` through the decoder one byte at a time and returns
    /// the score accumulated during this call.
    ///
    /// Returns `None` when the candidate is ruled out: either the decoder
    /// reported a malformed sequence that matches none of the tolerated
    /// extension patterns, or `last` is `true` and flushing the decoder
    /// after `buffer` reported a malformed sequence.
    ///
    /// A score still held in `pending_score` when the call ends is not
    /// part of the returned value.
    ///
    /// # Panics
    ///
    /// Panics if the decoder reports `OutputFull`, or reports `InputEmpty`
    /// without having consumed the single input byte.
    fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
        let mut score = 0i64;
        // One input byte at a time; at most two UTF-16 code units of output.
        let mut src = [0u8];
        let mut dst = [0u16; 2];
        for &b in buffer {
            let in_euc_range = b >= 0xA1 && b <= 0xFE;
            src[0] = b;
            let (result, read, written) = self
                .decoder
                .decode_to_utf16_without_replacement(&src, &mut dst, false);
            if written > 0 {
                let u = dst[0];
                if (u >= u16::from(b'a') && u <= u16::from(b'z'))
                    || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
                {
                    self.pending_score = None; // Discard pending score
                    match self.prev {
                        LatinKorean::Hangul | LatinKorean::Hanja => {
                            score += CJK_LATIN_ADJACENCY_PENALTY;
                        }
                        _ => {}
                    }
                    self.prev = LatinKorean::AsciiLetter;
                    self.current_word_len = 0;
                } else if u >= 0xAC00 && u <= 0xD7A3 {
                    // Hangul syllable. A withheld score from the previous
                    // Hangul is confirmed by this non-ASCII character.
                    if let Some(pending) = self.pending_score {
                        score += pending;
                        self.pending_score = None;
                    }
                    if self.prev_was_euc_range && in_euc_range {
                        // Both bytes of the pair are in 0xA1..=0xFE; only
                        // this case gets the frequency-based extra.
                        score += EUC_KR_SCORE_PER_EUC_HANGUL;
                        score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_hangul);
                    } else {
                        // Hangul encoded outside that range may have its
                        // score withheld until the next character.
                        score += self.maybe_set_as_pending(EUC_KR_SCORE_PER_NON_EUC_HANGUL);
                    }
                    if self.prev == LatinKorean::AsciiLetter {
                        score += CJK_LATIN_ADJACENCY_PENALTY;
                    }
                    self.prev = LatinKorean::Hangul;
                    self.current_word_len += 1;
                    if self.current_word_len > 5 {
                        score += EUC_KR_LONG_WORD_PENALTY;
                    }
                } else if (u >= 0x4E00 && u < 0xAC00) || (u >= 0xF900 && u <= 0xFA0B) {
                    // Hanja.
                    if let Some(pending) = self.pending_score {
                        score += pending;
                        self.pending_score = None;
                    }
                    score += EUC_KR_SCORE_PER_HANJA;
                    match self.prev {
                        LatinKorean::AsciiLetter => {
                            score += CJK_LATIN_ADJACENCY_PENALTY;
                        }
                        LatinKorean::Hangul => {
                            score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY;
                        }
                        _ => {}
                    }
                    self.prev = LatinKorean::Hanja;
                    self.current_word_len += 1;
                    if self.current_word_len > 5 {
                        score += EUC_KR_LONG_WORD_PENALTY;
                    }
                } else {
                    if u >= 0x80 {
                        // Any other non-ASCII character confirms a
                        // withheld score.
                        if let Some(pending) = self.pending_score {
                            score += pending;
                            self.pending_score = None;
                        }
                        score += CJK_OTHER;
                    } else {
                        self.pending_score = None; // Discard pending score
                    }
                    self.prev = LatinKorean::Other;
                    self.current_word_len = 0;
                }
            }
            match result {
                DecoderResult::InputEmpty => {
                    assert_eq!(read, 1);
                }
                DecoderResult::Malformed(malformed_len, _) => {
                    if (self.prev_byte == 0xC9 || self.prev_byte == 0xFE) && b >= 0xA1 && b <= 0xFE
                    {
                        if let Some(pending) = self.pending_score {
                            score += pending;
                            self.pending_score = None;
                        }
                        // The byte pair is in code page 949 EUDC range
                        score += EUC_KR_PUA_PENALTY;
                        // Assume Hanja semantics
                        match self.prev {
                            LatinKorean::AsciiLetter => {
                                score += CJK_LATIN_ADJACENCY_PENALTY;
                            }
                            LatinKorean::Hangul => {
                                score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY;
                            }
                            _ => {}
                        }
                        self.prev = LatinKorean::Hanja;
                        self.current_word_len += 1;
                        if self.current_word_len > 5 {
                            score += EUC_KR_LONG_WORD_PENALTY;
                        }
                    } else if (self.prev_byte == 0xA1
                        || (self.prev_byte >= 0xA3 && self.prev_byte <= 0xA8)
                        || (self.prev_byte >= 0xAA && self.prev_byte <= 0xAD))
                        && (b >= 0x7B && b <= 0x7D)
                    {
                        if let Some(pending) = self.pending_score {
                            score += pending;
                            self.pending_score = None;
                        }
                        // MacKorean symbols in range not part of code page 949
                        score += EUC_KR_MAC_KOREAN_PENALTY;
                        self.prev = LatinKorean::Other;
                        self.current_word_len = 0;
                    } else if (self.prev_byte >= 0x81 && self.prev_byte <= 0x84)
                        && (b <= 0x80 || b == 0xFF)
                    {
                        // MacKorean single-byte that conflicts with code page 949 lead byte
                        // followed by ASCII or a non-conflicting single-byte extension.
                        self.pending_score = None; // Just in case
                        score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
                        if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
                            self.prev = LatinKorean::AsciiLetter;
                        } else if b == 0x80 || b == 0xFF {
                            // Two single-byte extensions in a row: the
                            // penalty is charged a second time.
                            score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
                            self.prev = LatinKorean::Other;
                        } else {
                            self.prev = LatinKorean::Other;
                        }
                        self.current_word_len = 0;
                    } else if malformed_len == 1 && (b == 0x80 || b == 0xFF) {
                        // MacKorean single-byte extensions that don't conflict with lead bytes
                        self.pending_score = None; // Just in case
                        score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
                        self.prev = LatinKorean::Other;
                        self.current_word_len = 0;
                    } else {
                        // Any other malformed sequence rules this
                        // candidate out.
                        return None;
                    }
                }
                DecoderResult::OutputFull => {
                    unreachable!();
                }
            }
            // Remember this byte for the next iteration.
            self.prev_was_euc_range = in_euc_range;
            self.prev_byte = b;
        }
        if last {
            // Flush the decoder with empty input; a malformed result at
            // this point rules the candidate out.
            let (result, _, _) = self
                .decoder
                .decode_to_utf16_without_replacement(b"", &mut dst, true);
            match result {
                DecoderResult::InputEmpty => {}
                DecoderResult::Malformed(_, _) => {
                    return None;
                }
                DecoderResult::OutputFull => {
                    unreachable!();
                }
            }
        }
        Some(score)
    }
}
1977 
/// The encoding-specific scoring state behind one candidate.
///
/// Each variant wraps one candidate type; `InnerCandidate::feed`
/// dispatches to the wrapped value.
enum InnerCandidate {
    // These six are fed without an end-of-input flag; at end of input
    // `InnerCandidate::feed` feeds them a single space instead.
    Latin(LatinCandidate),
    NonLatinCased(NonLatinCasedCandidate),
    Caseless(CaselessCandidate),
    ArabicFrench(ArabicFrenchCandidate),
    Logical(LogicalCandidate),
    Visual(VisualCandidate),
    // The remaining candidates receive the `last` flag directly.
    Utf8(Utf8Candidate),
    Iso2022(Iso2022Candidate),
    Shift(ShiftJisCandidate),
    EucJp(EucJpCandidate),
    EucKr(EucKrCandidate),
    Big5(Big5Candidate),
    Gbk(GbkCandidate),
}
1993 
1994 impl InnerCandidate {
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>1995     fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1996         match self {
1997             InnerCandidate::Latin(c) => {
1998                 if let Some(new_score) = c.feed(buffer) {
1999                     if last {
2000                         // Treat EOF as space-like
2001                         if let Some(additional_score) = c.feed(b" ") {
2002                             Some(new_score + additional_score)
2003                         } else {
2004                             None
2005                         }
2006                     } else {
2007                         Some(new_score)
2008                     }
2009                 } else {
2010                     None
2011                 }
2012             }
2013             InnerCandidate::NonLatinCased(c) => {
2014                 if let Some(new_score) = c.feed(buffer) {
2015                     if last {
2016                         // Treat EOF as space-like
2017                         if let Some(additional_score) = c.feed(b" ") {
2018                             Some(new_score + additional_score)
2019                         } else {
2020                             None
2021                         }
2022                     } else {
2023                         Some(new_score)
2024                     }
2025                 } else {
2026                     None
2027                 }
2028             }
2029             InnerCandidate::Caseless(c) => {
2030                 if let Some(new_score) = c.feed(buffer) {
2031                     if last {
2032                         // Treat EOF as space-like
2033                         if let Some(additional_score) = c.feed(b" ") {
2034                             Some(new_score + additional_score)
2035                         } else {
2036                             None
2037                         }
2038                     } else {
2039                         Some(new_score)
2040                     }
2041                 } else {
2042                     None
2043                 }
2044             }
2045             InnerCandidate::ArabicFrench(c) => {
2046                 if let Some(new_score) = c.feed(buffer) {
2047                     if last {
2048                         // Treat EOF as space-like
2049                         if let Some(additional_score) = c.feed(b" ") {
2050                             Some(new_score + additional_score)
2051                         } else {
2052                             None
2053                         }
2054                     } else {
2055                         Some(new_score)
2056                     }
2057                 } else {
2058                     None
2059                 }
2060             }
2061             InnerCandidate::Logical(c) => {
2062                 if let Some(new_score) = c.feed(buffer) {
2063                     if last {
2064                         // Treat EOF as space-like
2065                         if let Some(additional_score) = c.feed(b" ") {
2066                             Some(new_score + additional_score)
2067                         } else {
2068                             None
2069                         }
2070                     } else {
2071                         Some(new_score)
2072                     }
2073                 } else {
2074                     None
2075                 }
2076             }
2077             InnerCandidate::Visual(c) => {
2078                 if let Some(new_score) = c.feed(buffer) {
2079                     if last {
2080                         // Treat EOF as space-like
2081                         if let Some(additional_score) = c.feed(b" ") {
2082                             Some(new_score + additional_score)
2083                         } else {
2084                             None
2085                         }
2086                     } else {
2087                         Some(new_score)
2088                     }
2089                 } else {
2090                     None
2091                 }
2092             }
2093             InnerCandidate::Utf8(c) => c.feed(buffer, last),
2094             InnerCandidate::Iso2022(c) => c.feed(buffer, last),
2095             InnerCandidate::Shift(c) => c.feed(buffer, last),
2096             InnerCandidate::EucJp(c) => c.feed(buffer, last),
2097             InnerCandidate::EucKr(c) => c.feed(buffer, last),
2098             InnerCandidate::Big5(c) => c.feed(buffer, last),
2099             InnerCandidate::Gbk(c) => c.feed(buffer, last),
2100         }
2101     }
2102 }
2103 
encoding_for_tld(tld: Tld) -> usize2104 fn encoding_for_tld(tld: Tld) -> usize {
2105     match tld {
2106         Tld::CentralWindows | Tld::CentralCyrillic => EncodingDetector::CENTRAL_WINDOWS_INDEX,
2107         Tld::Cyrillic => EncodingDetector::CYRILLIC_WINDOWS_INDEX,
2108         Tld::Generic | Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic | Tld::Eu => {
2109             EncodingDetector::WESTERN_INDEX
2110         }
2111         Tld::IcelandicFaroese => EncodingDetector::ICELANDIC_INDEX,
2112         Tld::Greek => EncodingDetector::GREEK_ISO_INDEX,
2113         Tld::TurkishAzeri => EncodingDetector::TURKISH_INDEX,
2114         Tld::Hebrew => EncodingDetector::LOGICAL_INDEX,
2115         Tld::Arabic => EncodingDetector::ARABIC_WINDOWS_INDEX,
2116         Tld::Baltic => EncodingDetector::BALTIC_WINDOWS_INDEX,
2117         Tld::Vietnamese => EncodingDetector::VIETNAMESE_INDEX,
2118         Tld::Thai => EncodingDetector::THAI_INDEX,
2119         Tld::Simplified | Tld::SimplifiedTraditional => EncodingDetector::GBK_INDEX,
2120         Tld::Traditional | Tld::TraditionalSimplified => EncodingDetector::BIG5_INDEX,
2121         Tld::Japanese => EncodingDetector::SHIFT_JIS_INDEX,
2122         Tld::Korean => EncodingDetector::EUC_KR_INDEX,
2123         Tld::CentralIso => EncodingDetector::CENTRAL_ISO_INDEX,
2124     }
2125 }
2126 
encoding_is_native_to_tld(tld: Tld, encoding: usize) -> bool2127 fn encoding_is_native_to_tld(tld: Tld, encoding: usize) -> bool {
2128     match tld {
2129         Tld::CentralWindows => encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX,
2130         Tld::Cyrillic => {
2131             encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2132                 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2133                 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2134                 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2135         }
2136         Tld::Western => encoding == EncodingDetector::WESTERN_INDEX,
2137         Tld::Greek => {
2138             encoding == EncodingDetector::GREEK_WINDOWS_INDEX
2139                 || encoding == EncodingDetector::GREEK_ISO_INDEX
2140         }
2141         Tld::TurkishAzeri => encoding == EncodingDetector::TURKISH_INDEX,
2142         Tld::Hebrew => encoding == EncodingDetector::LOGICAL_INDEX,
2143         Tld::Arabic => {
2144             encoding == EncodingDetector::ARABIC_WINDOWS_INDEX
2145                 || encoding == EncodingDetector::ARABIC_ISO_INDEX
2146         }
2147         Tld::Baltic => {
2148             encoding == EncodingDetector::BALTIC_WINDOWS_INDEX
2149                 || encoding == EncodingDetector::BALTIC_ISO13_INDEX
2150                 || encoding == EncodingDetector::BALTIC_ISO4_INDEX
2151         }
2152         Tld::Vietnamese => encoding == EncodingDetector::VIETNAMESE_INDEX,
2153         Tld::Thai => encoding == EncodingDetector::THAI_INDEX,
2154         Tld::Simplified => encoding == EncodingDetector::GBK_INDEX,
2155         Tld::Traditional => encoding == EncodingDetector::BIG5_INDEX,
2156         Tld::Japanese => {
2157             encoding == EncodingDetector::SHIFT_JIS_INDEX
2158                 || encoding == EncodingDetector::EUC_JP_INDEX
2159         }
2160         Tld::Korean => encoding == EncodingDetector::EUC_KR_INDEX,
2161         Tld::SimplifiedTraditional | Tld::TraditionalSimplified => {
2162             encoding == EncodingDetector::GBK_INDEX || encoding == EncodingDetector::BIG5_INDEX
2163         }
2164         Tld::CentralIso => encoding == EncodingDetector::CENTRAL_ISO_INDEX,
2165         Tld::IcelandicFaroese => encoding == EncodingDetector::ICELANDIC_INDEX,
2166         Tld::WesternCyrillic => {
2167             encoding == EncodingDetector::WESTERN_INDEX
2168                 || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2169                 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2170                 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2171                 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2172         }
2173         Tld::CentralCyrillic => {
2174             encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX
2175                 || encoding == EncodingDetector::CENTRAL_ISO_INDEX
2176                 || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2177                 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2178                 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2179                 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2180         }
2181         Tld::WesternArabic => {
2182             encoding == EncodingDetector::WESTERN_INDEX
2183                 || encoding == EncodingDetector::ARABIC_WINDOWS_INDEX
2184                 || encoding == EncodingDetector::ARABIC_ISO_INDEX
2185         }
2186         Tld::Eu => {
2187             encoding == EncodingDetector::WESTERN_INDEX
2188                 || encoding == EncodingDetector::ICELANDIC_INDEX
2189                 || encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX
2190                 || encoding == EncodingDetector::CENTRAL_ISO_INDEX
2191                 || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2192                 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2193                 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2194                 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2195                 || encoding == EncodingDetector::GREEK_WINDOWS_INDEX
2196                 || encoding == EncodingDetector::GREEK_ISO_INDEX
2197                 || encoding == EncodingDetector::BALTIC_WINDOWS_INDEX
2198                 || encoding == EncodingDetector::BALTIC_ISO13_INDEX
2199                 || encoding == EncodingDetector::BALTIC_ISO4_INDEX
2200         }
2201         Tld::Generic => false,
2202     }
2203 }
2204 
score_adjustment(score: i64, encoding: usize, tld: Tld) -> i642205 fn score_adjustment(score: i64, encoding: usize, tld: Tld) -> i64 {
2206     if score < 1 {
2207         return 0;
2208     }
2209     // This is the most ad hoc part of this library.
2210     let (divisor, constant) = match tld {
2211         Tld::Generic => {
2212             unreachable!();
2213         }
2214         Tld::CentralWindows | Tld::CentralIso => {
2215             match encoding {
2216                 EncodingDetector::WESTERN_INDEX
2217                 | EncodingDetector::ICELANDIC_INDEX
2218                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2219                 | EncodingDetector::BALTIC_ISO4_INDEX
2220                 | EncodingDetector::BALTIC_ISO13_INDEX
2221                 | EncodingDetector::VIETNAMESE_INDEX
2222                 | EncodingDetector::TURKISH_INDEX => {
2223                     // XXX Tune this better instead of this kind of absolute.
2224                     return score;
2225                 }
2226                 _ => (50, 60),
2227             }
2228         }
2229         Tld::Cyrillic => {
2230             match encoding {
2231                 EncodingDetector::BIG5_INDEX
2232                 | EncodingDetector::GBK_INDEX
2233                 | EncodingDetector::EUC_JP_INDEX
2234                 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2235                 | EncodingDetector::CENTRAL_ISO_INDEX
2236                 | EncodingDetector::GREEK_WINDOWS_INDEX
2237                 | EncodingDetector::GREEK_ISO_INDEX
2238                 | EncodingDetector::VISUAL_INDEX
2239                 | EncodingDetector::LOGICAL_INDEX
2240                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2241                 | EncodingDetector::BALTIC_ISO4_INDEX
2242                 | EncodingDetector::BALTIC_ISO13_INDEX
2243                 | EncodingDetector::TURKISH_INDEX => {
2244                     // XXX Tune this better instead of this kind of absolute.
2245                     return score;
2246                 }
2247                 _ => (50, 60),
2248             }
2249         }
2250         Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic => {
2251             match encoding {
2252                 EncodingDetector::CENTRAL_WINDOWS_INDEX
2253                 | EncodingDetector::CENTRAL_ISO_INDEX
2254                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2255                 | EncodingDetector::BALTIC_ISO4_INDEX
2256                 | EncodingDetector::BALTIC_ISO13_INDEX
2257                 | EncodingDetector::TURKISH_INDEX
2258                 | EncodingDetector::VIETNAMESE_INDEX => {
2259                     // XXX Tune this better instead of this kind of absolute.
2260                     return score;
2261                 }
2262                 _ => (50, 60),
2263             }
2264         }
2265         Tld::Greek => {
2266             match encoding {
2267                 EncodingDetector::BIG5_INDEX
2268                 | EncodingDetector::GBK_INDEX
2269                 | EncodingDetector::EUC_JP_INDEX
2270                 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2271                 | EncodingDetector::CENTRAL_ISO_INDEX
2272                 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2273                 | EncodingDetector::CYRILLIC_ISO_INDEX
2274                 | EncodingDetector::CYRILLIC_KOI_INDEX
2275                 | EncodingDetector::CYRILLIC_IBM_INDEX
2276                 | EncodingDetector::VISUAL_INDEX
2277                 | EncodingDetector::LOGICAL_INDEX
2278                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2279                 | EncodingDetector::BALTIC_ISO4_INDEX
2280                 | EncodingDetector::BALTIC_ISO13_INDEX
2281                 | EncodingDetector::TURKISH_INDEX => {
2282                     // XXX Tune this better instead of this kind of absolute.
2283                     return score;
2284                 }
2285                 _ => (50, 60),
2286             }
2287         }
2288         Tld::TurkishAzeri => {
2289             match encoding {
2290                 EncodingDetector::CENTRAL_WINDOWS_INDEX
2291                 | EncodingDetector::CENTRAL_ISO_INDEX
2292                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2293                 | EncodingDetector::BALTIC_ISO4_INDEX
2294                 | EncodingDetector::BALTIC_ISO13_INDEX
2295                 | EncodingDetector::VIETNAMESE_INDEX
2296                 | EncodingDetector::ICELANDIC_INDEX => {
2297                     // XXX Tune this better instead of this kind of absolute.
2298                     return score;
2299                 }
2300                 _ => (50, 60),
2301             }
2302         }
2303         Tld::Hebrew => {
2304             match encoding {
2305                 EncodingDetector::CENTRAL_WINDOWS_INDEX
2306                 | EncodingDetector::CENTRAL_ISO_INDEX
2307                 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2308                 | EncodingDetector::CYRILLIC_ISO_INDEX
2309                 | EncodingDetector::CYRILLIC_KOI_INDEX
2310                 | EncodingDetector::CYRILLIC_IBM_INDEX
2311                 | EncodingDetector::GREEK_WINDOWS_INDEX
2312                 | EncodingDetector::GREEK_ISO_INDEX
2313                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2314                 | EncodingDetector::BALTIC_ISO4_INDEX
2315                 | EncodingDetector::BALTIC_ISO13_INDEX
2316                 | EncodingDetector::VIETNAMESE_INDEX
2317                 | EncodingDetector::TURKISH_INDEX => {
2318                     // XXX Tune this better instead of this kind of absolute.
2319                     return score;
2320                 }
2321                 _ => (50, 60),
2322             }
2323         }
2324         Tld::Arabic => {
2325             match encoding {
2326                 EncodingDetector::BIG5_INDEX
2327                 | EncodingDetector::GBK_INDEX
2328                 | EncodingDetector::EUC_JP_INDEX
2329                 | EncodingDetector::EUC_KR_INDEX
2330                 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2331                 | EncodingDetector::CENTRAL_ISO_INDEX
2332                 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2333                 | EncodingDetector::CYRILLIC_ISO_INDEX
2334                 | EncodingDetector::CYRILLIC_KOI_INDEX
2335                 | EncodingDetector::CYRILLIC_IBM_INDEX
2336                 | EncodingDetector::GREEK_WINDOWS_INDEX
2337                 | EncodingDetector::GREEK_ISO_INDEX
2338                 | EncodingDetector::VISUAL_INDEX
2339                 | EncodingDetector::LOGICAL_INDEX
2340                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2341                 | EncodingDetector::BALTIC_ISO4_INDEX
2342                 | EncodingDetector::BALTIC_ISO13_INDEX
2343                 | EncodingDetector::VIETNAMESE_INDEX
2344                 | EncodingDetector::TURKISH_INDEX => {
2345                     // XXX Tune this better instead of this kind of absolute.
2346                     return score;
2347                 }
2348                 _ => (50, 60),
2349             }
2350         }
2351         Tld::Baltic => {
2352             match encoding {
2353                 EncodingDetector::CENTRAL_WINDOWS_INDEX
2354                 | EncodingDetector::CENTRAL_ISO_INDEX
2355                 | EncodingDetector::ICELANDIC_INDEX
2356                 | EncodingDetector::TURKISH_INDEX
2357                 | EncodingDetector::VIETNAMESE_INDEX => {
2358                     // XXX Tune this better instead of this kind of absolute.
2359                     return score;
2360                 }
2361                 _ => (50, 60),
2362             }
2363         }
2364         Tld::Vietnamese => {
2365             match encoding {
2366                 EncodingDetector::CENTRAL_WINDOWS_INDEX
2367                 | EncodingDetector::CENTRAL_ISO_INDEX
2368                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2369                 | EncodingDetector::BALTIC_ISO4_INDEX
2370                 | EncodingDetector::BALTIC_ISO13_INDEX
2371                 | EncodingDetector::TURKISH_INDEX
2372                 | EncodingDetector::ICELANDIC_INDEX => {
2373                     // XXX Tune this better instead of this kind of absolute.
2374                     return score;
2375                 }
2376                 _ => (50, 60),
2377             }
2378         }
2379         Tld::Thai => {
2380             match encoding {
2381                 EncodingDetector::BIG5_INDEX
2382                 | EncodingDetector::GBK_INDEX
2383                 | EncodingDetector::EUC_JP_INDEX
2384                 | EncodingDetector::EUC_KR_INDEX
2385                 | EncodingDetector::SHIFT_JIS_INDEX
2386                 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2387                 | EncodingDetector::CENTRAL_ISO_INDEX
2388                 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2389                 | EncodingDetector::CYRILLIC_ISO_INDEX
2390                 | EncodingDetector::CYRILLIC_KOI_INDEX
2391                 | EncodingDetector::CYRILLIC_IBM_INDEX
2392                 | EncodingDetector::GREEK_WINDOWS_INDEX
2393                 | EncodingDetector::GREEK_ISO_INDEX
2394                 | EncodingDetector::ARABIC_WINDOWS_INDEX
2395                 | EncodingDetector::ARABIC_ISO_INDEX
2396                 | EncodingDetector::VISUAL_INDEX
2397                 | EncodingDetector::LOGICAL_INDEX
2398                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2399                 | EncodingDetector::BALTIC_ISO4_INDEX
2400                 | EncodingDetector::BALTIC_ISO13_INDEX
2401                 | EncodingDetector::TURKISH_INDEX => {
2402                     // XXX Tune this better instead of this kind of absolute.
2403                     return score;
2404                 }
2405                 _ => (50, 60),
2406             }
2407         }
2408         Tld::Simplified
2409         | Tld::Traditional
2410         | Tld::TraditionalSimplified
2411         | Tld::SimplifiedTraditional
2412         | Tld::Japanese
2413         | Tld::Korean => {
2414             // If TLD default is valid, everything else scores zero
2415             return score;
2416         }
2417         Tld::IcelandicFaroese => {
2418             match encoding {
2419                 EncodingDetector::CENTRAL_WINDOWS_INDEX
2420                 | EncodingDetector::CENTRAL_ISO_INDEX
2421                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2422                 | EncodingDetector::BALTIC_ISO4_INDEX
2423                 | EncodingDetector::BALTIC_ISO13_INDEX
2424                 | EncodingDetector::TURKISH_INDEX
2425                 | EncodingDetector::VIETNAMESE_INDEX => {
2426                     // XXX Tune this better instead of this kind of absolute.
2427                     return score;
2428                 }
2429                 _ => (50, 60),
2430             }
2431         }
2432         Tld::CentralCyrillic => {
2433             match encoding {
2434                 EncodingDetector::BIG5_INDEX
2435                 | EncodingDetector::GBK_INDEX
2436                 | EncodingDetector::EUC_JP_INDEX
2437                 | EncodingDetector::GREEK_WINDOWS_INDEX
2438                 | EncodingDetector::GREEK_ISO_INDEX
2439                 | EncodingDetector::VISUAL_INDEX
2440                 | EncodingDetector::LOGICAL_INDEX
2441                 | EncodingDetector::BALTIC_WINDOWS_INDEX
2442                 | EncodingDetector::BALTIC_ISO4_INDEX
2443                 | EncodingDetector::BALTIC_ISO13_INDEX
2444                 | EncodingDetector::TURKISH_INDEX => {
2445                     // XXX Tune this better instead of this kind of absolute.
2446                     return score;
2447                 }
2448                 _ => (50, 60),
2449             }
2450         }
2451         Tld::Eu => {
2452             match encoding {
2453                 EncodingDetector::BIG5_INDEX
2454                 | EncodingDetector::GBK_INDEX
2455                 | EncodingDetector::EUC_JP_INDEX
2456                 | EncodingDetector::TURKISH_INDEX
2457                 | EncodingDetector::VIETNAMESE_INDEX => {
2458                     // XXX Tune this better instead of this kind of absolute.
2459                     return score;
2460                 }
2461                 _ => (50, 60),
2462             }
2463         }
2464     };
2465     (score / divisor) + constant
2466 }
2467 
/// One encoding hypothesis together with the score it has accumulated.
struct Candidate {
    /// Encoding-specific scoring state; `Candidate::feed` delegates the
    /// actual processing of input to it.
    inner: InnerCandidate,
    /// Sum of the scores reported by `inner` so far. Becomes `None` as
    /// soon as `inner` reports `None` for a chunk; from then on
    /// `Candidate::feed` no longer passes input to `inner`.
    score: Option<i64>,
}
2472 
2473 impl Candidate {
feed(&mut self, buffer: &[u8], last: bool)2474     fn feed(&mut self, buffer: &[u8], last: bool) {
2475         if let Some(old_score) = self.score {
2476             if let Some(new_score) = self.inner.feed(buffer, last) {
2477                 self.score = Some(old_score + new_score);
2478             } else {
2479                 self.score = None;
2480             }
2481         }
2482     }
2483 
new_latin(data: &'static SingleByteData) -> Self2484     fn new_latin(data: &'static SingleByteData) -> Self {
2485         Candidate {
2486             inner: InnerCandidate::Latin(LatinCandidate::new(data)),
2487             score: Some(0),
2488         }
2489     }
2490 
new_non_latin_cased(data: &'static SingleByteData) -> Self2491     fn new_non_latin_cased(data: &'static SingleByteData) -> Self {
2492         Candidate {
2493             inner: InnerCandidate::NonLatinCased(NonLatinCasedCandidate::new(data)),
2494             score: Some(0),
2495         }
2496     }
2497 
new_caseless(data: &'static SingleByteData) -> Self2498     fn new_caseless(data: &'static SingleByteData) -> Self {
2499         Candidate {
2500             inner: InnerCandidate::Caseless(CaselessCandidate::new(data)),
2501             score: Some(0),
2502         }
2503     }
2504 
new_arabic_french(data: &'static SingleByteData) -> Self2505     fn new_arabic_french(data: &'static SingleByteData) -> Self {
2506         Candidate {
2507             inner: InnerCandidate::ArabicFrench(ArabicFrenchCandidate::new(data)),
2508             score: Some(0),
2509         }
2510     }
2511 
new_logical(data: &'static SingleByteData) -> Self2512     fn new_logical(data: &'static SingleByteData) -> Self {
2513         Candidate {
2514             inner: InnerCandidate::Logical(LogicalCandidate::new(data)),
2515             score: Some(0),
2516         }
2517     }
2518 
new_visual(data: &'static SingleByteData) -> Self2519     fn new_visual(data: &'static SingleByteData) -> Self {
2520         Candidate {
2521             inner: InnerCandidate::Visual(VisualCandidate::new(data)),
2522             score: Some(0),
2523         }
2524     }
2525 
new_utf_8() -> Self2526     fn new_utf_8() -> Self {
2527         Candidate {
2528             inner: InnerCandidate::Utf8(Utf8Candidate {
2529                 decoder: UTF_8.new_decoder_without_bom_handling(),
2530             }),
2531             score: Some(0),
2532         }
2533     }
2534 
new_iso_2022_jp() -> Self2535     fn new_iso_2022_jp() -> Self {
2536         Candidate {
2537             inner: InnerCandidate::Iso2022(Iso2022Candidate {
2538                 decoder: ISO_2022_JP.new_decoder_without_bom_handling(),
2539             }),
2540             score: Some(0),
2541         }
2542     }
2543 
new_shift_jis() -> Self2544     fn new_shift_jis() -> Self {
2545         Candidate {
2546             inner: InnerCandidate::Shift(ShiftJisCandidate {
2547                 decoder: SHIFT_JIS.new_decoder_without_bom_handling(),
2548                 half_width_katakana_seen: false,
2549                 half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
2550                 prev: LatinCj::Other,
2551                 prev_byte: 0,
2552                 pending_score: None,
2553             }),
2554             score: Some(0),
2555         }
2556     }
2557 
new_euc_jp() -> Self2558     fn new_euc_jp() -> Self {
2559         Candidate {
2560             inner: InnerCandidate::EucJp(EucJpCandidate {
2561                 decoder: EUC_JP.new_decoder_without_bom_handling(),
2562                 non_ascii_seen: false,
2563                 half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
2564                 prev: LatinCj::Other,
2565                 prev_byte: 0,
2566                 prev_prev_byte: 0,
2567             }),
2568             score: Some(0),
2569         }
2570     }
2571 
new_euc_kr() -> Self2572     fn new_euc_kr() -> Self {
2573         Candidate {
2574             inner: InnerCandidate::EucKr(EucKrCandidate {
2575                 decoder: EUC_KR.new_decoder_without_bom_handling(),
2576                 prev_byte: 0,
2577                 prev_was_euc_range: false,
2578                 prev: LatinKorean::Other,
2579                 current_word_len: 0,
2580                 pending_score: None,
2581             }),
2582             score: Some(0),
2583         }
2584     }
2585 
new_big5() -> Self2586     fn new_big5() -> Self {
2587         Candidate {
2588             inner: InnerCandidate::Big5(Big5Candidate {
2589                 decoder: BIG5.new_decoder_without_bom_handling(),
2590                 prev: LatinCj::Other,
2591                 prev_byte: 0,
2592                 pending_score: None,
2593             }),
2594             score: Some(0),
2595         }
2596     }
2597 
new_gbk() -> Self2598     fn new_gbk() -> Self {
2599         Candidate {
2600             inner: InnerCandidate::Gbk(GbkCandidate {
2601                 decoder: GBK.new_decoder_without_bom_handling(),
2602                 prev: LatinCj::Other,
2603                 prev_byte: 0,
2604                 pending_score: None,
2605             }),
2606             score: Some(0),
2607         }
2608     }
2609 
score(&self, encoding: usize, tld: Tld, expectation_is_valid: bool) -> Option<i64>2610     fn score(&self, encoding: usize, tld: Tld, expectation_is_valid: bool) -> Option<i64> {
2611         match &self.inner {
2612             InnerCandidate::NonLatinCased(c) => {
2613                 if c.longest_word < 2 {
2614                     return None;
2615                 }
2616             }
2617             InnerCandidate::Caseless(c) => {
2618                 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2619                     return None;
2620                 }
2621             }
2622             InnerCandidate::ArabicFrench(c) => {
2623                 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2624                     return None;
2625                 }
2626             }
2627             InnerCandidate::Logical(c) => {
2628                 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2629                     return None;
2630                 }
2631             }
2632             InnerCandidate::Visual(c) => {
2633                 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2634                     return None;
2635                 }
2636             }
2637             _ => {}
2638         }
2639         if tld == Tld::Generic {
2640             return self.score;
2641         }
2642         if let Some(score) = self.score {
2643             if encoding == encoding_for_tld(tld) {
2644                 return Some(score + 1);
2645             }
2646             if encoding_is_native_to_tld(tld, encoding) {
2647                 return Some(score);
2648             }
2649             if expectation_is_valid {
2650                 return Some(score - score_adjustment(score, encoding, tld));
2651             }
2652             // If expectation is no longer valid, fall back to
2653             // generic behavior.
2654             // XXX Flipped Chinese and Central
2655             return Some(score);
2656         }
2657         None
2658     }
2659 
plausible_punctuation(&self) -> u642660     fn plausible_punctuation(&self) -> u64 {
2661         match &self.inner {
2662             InnerCandidate::Logical(c) => {
2663                 return c.plausible_punctuation;
2664             }
2665             InnerCandidate::Visual(c) => {
2666                 return c.plausible_punctuation;
2667             }
2668             _ => {
2669                 unreachable!();
2670             }
2671         }
2672     }
2673 
encoding(&self) -> &'static Encoding2674     fn encoding(&self) -> &'static Encoding {
2675         match &self.inner {
2676             InnerCandidate::Latin(c) => {
2677                 return c.data.encoding;
2678             }
2679             InnerCandidate::NonLatinCased(c) => {
2680                 return c.data.encoding;
2681             }
2682             InnerCandidate::Caseless(c) => {
2683                 return c.data.encoding;
2684             }
2685             InnerCandidate::ArabicFrench(c) => {
2686                 return c.data.encoding;
2687             }
2688             InnerCandidate::Logical(c) => {
2689                 return c.data.encoding;
2690             }
2691             InnerCandidate::Visual(c) => {
2692                 return c.data.encoding;
2693             }
2694             InnerCandidate::Shift(_) => {
2695                 return SHIFT_JIS;
2696             }
2697             InnerCandidate::EucJp(_) => {
2698                 return EUC_JP;
2699             }
2700             InnerCandidate::Big5(_) => {
2701                 return BIG5;
2702             }
2703             InnerCandidate::EucKr(_) => {
2704                 return EUC_KR;
2705             }
2706             InnerCandidate::Gbk(_) => {
2707                 return GBK;
2708             }
2709             InnerCandidate::Utf8(_) => {
2710                 return UTF_8;
2711             }
2712             InnerCandidate::Iso2022(_) => {
2713                 return ISO_2022_JP;
2714             }
2715         }
2716     }
2717 }
2718 
/// Counts the bytes in `buffer` that are outside the ASCII range, i.e.
/// whose value is `0x80` or above.
fn count_non_ascii(buffer: &[u8]) -> u64 {
    // Each byte contributes 1 if it is non-ASCII and 0 otherwise.
    buffer.iter().map(|&byte| u64::from(byte >= 0x80)).sum()
}
2728 
/// The most recent bytes pushed so far, capped at two.
#[derive(Clone, Copy)]
enum BeforeNonAscii {
    /// Nothing has been remembered.
    None,
    /// Exactly one byte has been remembered.
    One([u8; 1]),
    /// The two most recent bytes, oldest first.
    Two([u8; 2]),
}

impl BeforeNonAscii {
    /// Returns the remembered bytes, oldest first (zero to two bytes).
    fn as_slice(&self) -> &[u8] {
        match *self {
            BeforeNonAscii::None => &[],
            BeforeNonAscii::One(ref single) => &single[..],
            BeforeNonAscii::Two(ref pair) => &pair[..],
        }
    }

    /// Updates the remembered bytes as if `buffer` had been appended to
    /// everything pushed before, keeping at most the last two bytes.
    ///
    /// An empty `buffer` leaves the state untouched.
    fn push(&mut self, buffer: &[u8]) {
        match buffer.len() {
            0 => {}
            1 => {
                // A single new byte is combined with the newest byte
                // that is already remembered, if there is one.
                let newest = buffer[0];
                *self = match *self {
                    BeforeNonAscii::None => BeforeNonAscii::One([newest]),
                    BeforeNonAscii::One(kept) => BeforeNonAscii::Two([kept[0], newest]),
                    BeforeNonAscii::Two(kept) => BeforeNonAscii::Two([kept[1], newest]),
                };
            }
            len => {
                // Two or more new bytes replace the old state entirely.
                *self = BeforeNonAscii::Two([buffer[len - 2], buffer[len - 1]]);
            }
        }
    }
}
2768 
/// A Web browser-oriented detector for guessing what character
/// encoding a stream of bytes is encoded in.
///
/// The bytes are fed to the detector incrementally using the `feed`
/// method. The current guess of the detector can be queried using
/// the `guess` method. The guessing parameters are arguments to the
/// `guess` method rather than arguments to the constructor in order
/// to enable the application to check if the arguments affect the
/// guessing outcome. (The specific use case is to disable UI for
/// re-running the detector with UTF-8 allowed and the top-level
/// domain name ignored if those arguments don't change the guess.)
pub struct EncodingDetector {
    // One scoring state per encoding hypothesis. Every chunk that
    // reaches `feed_impl` is fed to each of them.
    candidates: [Candidate; 27],
    // Running count of bytes >= 0x80 in the data passed to `feed_impl`.
    // Zero means that only ASCII has been seen so far.
    non_ascii_seen: u64,
    // We need to feed up to two bytes of context before non-ASCII
    // thanks to Spanish n.º.
    last_before_non_ascii: BeforeNonAscii,
    // Set by `feed` once it finds an ESC (0x1B) byte while the input
    // has been ASCII-only.
    esc_seen: bool,
    // Set by `feed` when it is called with `last` equaling `true`;
    // `feed` asserts that this is not yet set.
    closed: bool,
}
2789 
2790 impl EncodingDetector {
feed_impl(&mut self, buffer: &[u8], last: bool)2791     fn feed_impl(&mut self, buffer: &[u8], last: bool) {
2792         for candidate in self.candidates.iter_mut() {
2793             candidate.feed(buffer, last);
2794         }
2795         self.non_ascii_seen += count_non_ascii(buffer);
2796     }
2797 
2798     /// Inform the detector of a chunk of input.
2799     ///
2800     /// The byte stream is represented as a sequence of calls to this
2801     /// method such that the concatenation of the arguments to this
2802     /// method form the byte stream. It does not matter how the application
2803     /// chooses to chunk the stream. It is OK to call this method with
2804     /// a zero-length byte slice.
2805     ///
2806     /// The end of the stream is indicated by calling this method with
2807     /// `last` set to `true`. In that case, the end of the stream is
2808     /// considered to occur after the last byte of the `buffer` (which
2809     /// may be zero-length) passed in the same call. Once this method
2810     /// has been called with `last` set to `true` this method must not
2811     /// be called again.
2812     ///
2813     /// If you want to perform detection on just the prefix of a longer
2814     /// stream, do not pass `last=true` after the prefix if the stream
2815     /// actually still continues.
2816     ///
2817     /// Returns `true` if after processing `buffer` the stream has
2818     /// contained at least one non-ASCII byte and `false` if only
2819     /// ASCII has been seen so far.
2820     ///
2821     /// # Panics
2822     ///
2823     /// If this method has previously been called with `last` set to `true`.
feed(&mut self, buffer: &[u8], last: bool) -> bool2824     pub fn feed(&mut self, buffer: &[u8], last: bool) -> bool {
2825         assert!(
2826             !self.closed,
2827             "Must not feed again after feeding with last equaling true."
2828         );
2829         if last {
2830             self.closed = true;
2831         }
2832         let start = if self.non_ascii_seen == 0 && !self.esc_seen {
2833             let up_to = Encoding::ascii_valid_up_to(buffer);
2834             let start = if let Some(escape) = memchr::memchr(0x1B, &buffer[..up_to]) {
2835                 self.esc_seen = true;
2836                 escape
2837             } else {
2838                 up_to
2839             };
2840             if start == buffer.len() {
2841                 self.last_before_non_ascii.push(buffer);
2842                 return self.non_ascii_seen != 0;
2843             }
2844             if start == 0 || start == 1 {
2845                 let last_before = self.last_before_non_ascii;
2846                 self.last_before_non_ascii = BeforeNonAscii::None;
2847                 self.feed_impl(last_before.as_slice(), false);
2848                 0
2849             } else {
2850                 start - 2
2851             }
2852         } else {
2853             0
2854         };
2855         self.feed_impl(&buffer[start..], last);
2856         self.non_ascii_seen != 0
2857     }
2858 
2859     /// Guess the encoding given the bytes pushed to the detector so far
2860     /// (via `feed()`), the top-level domain name from which the bytes were
2861     /// loaded, and an indication of whether to consider UTF-8 as a permissible
2862     /// guess.
2863     ///
2864     /// The `tld` argument takes the rightmost DNS label of the hostname of the
2865     /// host the stream was loaded from in lower-case ASCII form. That is, if
2866     /// the label is an internationalized top-level domain name, it must be
2867     /// provided in its Punycode form. If the TLD that the stream was loaded
2868     /// from is unavalable, `None` may be passed instead, which is equivalent
2869     /// to passing `Some(b"com")`.
2870     ///
2871     /// If the `allow_utf8` argument is set to `false`, the return value of
2872     /// this method won't be `encoding_rs::UTF_8`. When performing detection
2873     /// on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
2874     /// unless the user has taken a specific contextual action to request an
2875     /// override. This way, Web developers cannot start depending on UTF-8
2876     /// detection. Such reliance would make the Web Platform more brittle.
2877     ///
2878     /// Returns the guessed encoding.
2879     ///
2880     /// # Panics
2881     ///
2882     /// If `tld` contains non-ASCII, period, or upper-case letters. (The panic
2883     /// condition is intentionally limited to signs of failing to extract the
2884     /// label correctly, failing to provide it in its Punycode form, and failure
2885     /// to lower-case it. Full DNS label validation is intentionally not performed
2886     /// to avoid panics when the reality doesn't match the specs.)
guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static Encoding2887     pub fn guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static Encoding {
2888         let mut tld_type = tld.map_or(Tld::Generic, |tld| {
2889             assert!(!contains_upper_case_period_or_non_ascii(tld));
2890             classify_tld(tld)
2891         });
2892 
2893         if self.non_ascii_seen == 0
2894             && self.esc_seen
2895             && self.candidates[Self::ISO_2022_JP_INDEX].score.is_some()
2896         {
2897             return ISO_2022_JP;
2898         }
2899 
2900         if self.candidates[Self::UTF_8_INDEX].score.is_some() {
2901             if allow_utf8 {
2902                 return UTF_8;
2903             }
2904             // Various test cases that prohibit UTF-8 detection want to
2905             // see windows-1252 specifically. These tests run on generic
2906             // domains. However, if we returned windows-1252 on
2907             // some non-generic domains, we'd cause reloads.
2908             return self.candidates[encoding_for_tld(tld_type)].encoding();
2909         }
2910 
2911         let mut encoding = self.candidates[encoding_for_tld(tld_type)].encoding();
2912         let mut max = 0i64;
2913         let mut expectation_is_valid = false;
2914         if tld_type != Tld::Generic {
2915             for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
2916                 if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() {
2917                     expectation_is_valid = true;
2918                     break;
2919                 }
2920             }
2921         }
2922         if !expectation_is_valid {
2923             // Flip Chinese and Central around
2924             match tld_type {
2925                 Tld::Simplified => {
2926                     if self.candidates[Self::BIG5_INDEX].score.is_some() {
2927                         tld_type = Tld::Traditional;
2928                         expectation_is_valid = true;
2929                     }
2930                 }
2931                 Tld::Traditional => {
2932                     if self.candidates[Self::GBK_INDEX].score.is_some() {
2933                         tld_type = Tld::Simplified;
2934                         expectation_is_valid = true;
2935                     }
2936                 }
2937                 Tld::CentralWindows => {
2938                     if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() {
2939                         tld_type = Tld::CentralIso;
2940                         expectation_is_valid = true;
2941                     }
2942                 }
2943                 Tld::CentralIso => {
2944                     if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() {
2945                         tld_type = Tld::CentralWindows;
2946                         expectation_is_valid = true;
2947                     }
2948                 }
2949                 _ => {}
2950             }
2951         }
2952         for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
2953             if let Some(score) = candidate.score(i, tld_type, expectation_is_valid) {
2954                 if score > max {
2955                     max = score;
2956                     encoding = candidate.encoding();
2957                 }
2958             }
2959         }
2960         let visual = &self.candidates[Self::VISUAL_INDEX];
2961         if let Some(visual_score) = visual.score(Self::VISUAL_INDEX, tld_type, expectation_is_valid)
2962         {
2963             if (visual_score > max || encoding == WINDOWS_1255)
2964                 && visual.plausible_punctuation()
2965                     > self.candidates[Self::LOGICAL_INDEX].plausible_punctuation()
2966             {
2967                 // max = visual_score;
2968                 encoding = ISO_8859_8;
2969             }
2970         }
2971 
2972         encoding
2973     }
2974 
// XXX Test-only API
/// Looks up the score of the candidate whose encoding is `encoding`.
///
/// Scoring is always performed as for a generic TLD with no valid TLD
/// expectation.
///
/// Returns whatever `Candidate::score` yields for the first candidate
/// whose encoding equals `encoding` (which may be `None`), or `Some(0)`
/// when no candidate has that encoding.
#[cfg(feature = "testing-only-no-semver-guarantees-do-not-use")]
pub fn find_score(&self, encoding: &'static Encoding) -> Option<i64> {
    // The TLD class is fixed to generic here, so the native-encoding scan
    // and the Chinese/Central flipping done in `guess()` can never apply;
    // the scoring inputs are therefore constants.
    let tld_type = Tld::Generic;
    let expectation_is_valid = false;
    for (i, candidate) in self.candidates.iter().enumerate() {
        if encoding == candidate.encoding() {
            return candidate.score(i, tld_type, expectation_is_valid);
        }
    }
    Some(0)
}
3025 
// Positions of the candidates inside the `candidates` array built by
// `new()`. The values must stay in sync with the order used there.

/// First index that `guess()` includes in its ordinary score comparison;
/// the slots before it are examined separately.
const FIRST_NORMAL: usize = 3;

// Slots examined separately by `guess()`.
/// UTF-8.
const UTF_8_INDEX: usize = 0;
/// ISO-2022-JP.
const ISO_2022_JP_INDEX: usize = 1;
/// Visual Hebrew (ISO-8859-8 data).
const VISUAL_INDEX: usize = 2;

// Multi-byte CJK candidates.
/// GBK.
const GBK_INDEX: usize = 3;
/// EUC-JP.
const EUC_JP_INDEX: usize = 4;
/// EUC-KR.
const EUC_KR_INDEX: usize = 5;
/// Shift_JIS.
const SHIFT_JIS_INDEX: usize = 6;
/// Big5.
const BIG5_INDEX: usize = 7;

// Single-byte candidates.
/// windows-1252.
const WESTERN_INDEX: usize = 8;
/// windows-1251.
const CYRILLIC_WINDOWS_INDEX: usize = 9;
/// windows-1250.
const CENTRAL_WINDOWS_INDEX: usize = 10;
/// ISO-8859-2.
const CENTRAL_ISO_INDEX: usize = 11;
/// windows-1256.
const ARABIC_WINDOWS_INDEX: usize = 12;
/// windows-1252 with the Icelandic data set.
const ICELANDIC_INDEX: usize = 13;
/// windows-1254.
const TURKISH_INDEX: usize = 14;
/// windows-874.
const THAI_INDEX: usize = 15;
/// Logical Hebrew (windows-1255 data).
const LOGICAL_INDEX: usize = 16;
/// windows-1253.
const GREEK_WINDOWS_INDEX: usize = 17;
/// ISO-8859-7.
const GREEK_ISO_INDEX: usize = 18;
/// windows-1257.
const BALTIC_WINDOWS_INDEX: usize = 19;
/// ISO-8859-13.
const BALTIC_ISO13_INDEX: usize = 20;
/// KOI8-U.
const CYRILLIC_KOI_INDEX: usize = 21;
/// IBM866.
const CYRILLIC_IBM_INDEX: usize = 22;
/// ISO-8859-6.
const ARABIC_ISO_INDEX: usize = 23;
/// windows-1258.
const VIETNAMESE_INDEX: usize = 24;
/// ISO-8859-4.
const BALTIC_ISO4_INDEX: usize = 25;
/// ISO-8859-5.
const CYRILLIC_ISO_INDEX: usize = 26;
3081 
3082     /// Creates a new instance of the detector.
new() -> Self3083     pub fn new() -> Self {
3084         EncodingDetector {
3085             candidates: [
3086                 Candidate::new_utf_8(),                                                // 0
3087                 Candidate::new_iso_2022_jp(),                                          // 1
3088                 Candidate::new_visual(&SINGLE_BYTE_DATA[ISO_8859_8_INDEX]),            // 2
3089                 Candidate::new_gbk(),                                                  // 3
3090                 Candidate::new_euc_jp(),                                               // 4
3091                 Candidate::new_euc_kr(),                                               // 5
3092                 Candidate::new_shift_jis(),                                            // 6
3093                 Candidate::new_big5(),                                                 // 7
3094                 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_INDEX]),           // 8
3095                 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1251_INDEX]), // 9
3096                 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1250_INDEX]),           // 10
3097                 Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_2_INDEX]),             // 11
3098                 Candidate::new_arabic_french(&SINGLE_BYTE_DATA[WINDOWS_1256_INDEX]),   // 12
3099                 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_ICELANDIC_INDEX]), // 13
3100                 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1254_INDEX]),           // 14
3101                 Candidate::new_caseless(&SINGLE_BYTE_DATA[WINDOWS_874_INDEX]),         // 15
3102                 Candidate::new_logical(&SINGLE_BYTE_DATA[WINDOWS_1255_INDEX]),         // 16
3103                 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1253_INDEX]), // 17
3104                 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_7_INDEX]),   // 18
3105                 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1257_INDEX]),           // 19
3106                 Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_13_INDEX]),            // 20
3107                 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[KOI8_U_INDEX]),       // 21
3108                 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[IBM866_INDEX]),       // 22
3109                 Candidate::new_caseless(&SINGLE_BYTE_DATA[ISO_8859_6_INDEX]),          // 23
3110                 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1258_INDEX]),           // 24
3111                 Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_4_INDEX]),             // 25
3112                 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_5_INDEX]),   // 26
3113             ],
3114             non_ascii_seen: 0,
3115             last_before_non_ascii: BeforeNonAscii::None,
3116             esc_seen: false,
3117             closed: false,
3118         }
3119     }
3120 
3121     /// Queries whether the TLD is considered non-generic and could affect the guess.
tld_may_affect_guess(tld: Option<&[u8]>) -> bool3122     pub fn tld_may_affect_guess(tld: Option<&[u8]>) -> bool {
3123         if let Some(tld) = tld {
3124             classify_tld(tld) != Tld::Generic
3125         } else {
3126             false
3127         }
3128     }
3129 }
3130 
#[cfg(test)]
mod tests {
    use super::*;
    use detone::IterDecomposeVietnamese;
    use encoding_rs::{
        IBM866, ISO_8859_2, ISO_8859_4, ISO_8859_5, ISO_8859_6, ISO_8859_7, KOI8_U, WINDOWS_1250,
        WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254, WINDOWS_1256, WINDOWS_1257,
        WINDOWS_1258, WINDOWS_874,
    };

    // Sentences that are encoded into more than one legacy encoding below.
    const RU_SENTENCE: &str = "Это тест кодировки символов.";
    const UK_SENTENCE: &str = "Це тест на кодування символів.";
    const EL_SENTENCE: &str = "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης";
    const JA_SENTENCE: &str = "これは文字実験です。";
    const AR_SENTENCE: &str = "هذا هو اختبار ترميز الأحرف.";
    const PL_SENTENCE: &str = "To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.";
    const LT_SENTENCE: &str = "Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą.";
    const LV_SENTENCE: &str = "Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.";

    // Byte patterns that the multi-byte tests repeat many times.
    const BIG5_FILLER: &[u8] = b"\xA4\x40";
    const EUC_KR_FILLER: &[u8] = b"\xC5\xD7\xBD\xBA\xC6\xAE. ";
    const SHIFT_JIS_FILLER: &[u8] = b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2";
    const EUC_JP_FILLER: &[u8] = b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4";
    const GBK_FILLER: &[u8] = b"\xB5\xC4";

    /// Feeds `bytes` as one final buffer, prints the text decoded with the
    /// guessed encoding and asserts that the guess equals `encoding`.
    fn check_bytes(bytes: &[u8], encoding: &'static Encoding) {
        let mut detector = EncodingDetector::new();
        detector.feed(bytes, true);
        let guessed = detector.guess(None, false);
        let (text, _) = guessed.decode_without_bom_handling(bytes);
        println!("{:?}", text);
        assert_eq!(guessed, encoding);
    }

    /// Encodes `input` into `encoding` and checks that the bytes are
    /// detected as that encoding. For windows-1258 the input has its
    /// Vietnamese tones decomposed before encoding.
    fn check(input: &str, encoding: &'static Encoding) {
        if encoding == WINDOWS_1258 {
            let decomposed: String = input.chars().decompose_vietnamese_tones(true).collect();
            let (encoded, _, _) = encoding.encode(&decomposed);
            check_bytes(&encoded, encoding);
        } else {
            let (encoded, _, _) = encoding.encode(input);
            check_bytes(&encoded, encoding);
        }
    }

    /// Feeds `chunks` one buffer at a time, marking only the final one as
    /// last, and asserts that the guess equals `encoding`.
    fn check_streamed(chunks: &[&[u8]], encoding: &'static Encoding) {
        let mut detector = EncodingDetector::new();
        for (i, &chunk) in chunks.iter().enumerate() {
            detector.feed(chunk, i + 1 == chunks.len());
        }
        let guessed = detector.guess(None, false);
        assert_eq!(guessed, encoding);
    }

    /// Builds `prefix`, then `count` copies of `filler`, then `suffix`,
    /// and runs `check_bytes` on the result.
    fn check_filled(
        prefix: &[u8],
        filler: &[u8],
        count: usize,
        suffix: &[u8],
        encoding: &'static Encoding,
    ) {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(prefix);
        for _ in 0..count {
            bytes.extend_from_slice(filler);
        }
        bytes.extend_from_slice(suffix);
        check_bytes(&bytes, encoding);
    }

    #[test]
    fn test_i_apostrophe() {
        check_streamed(&[b"I\x92"], WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_by_one() {
        check_streamed(&[b"n", b".", b"\xBA", b"1"], WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_two_together() {
        check_streamed(&[b"n.", b"\xBA", b"1"], WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_by_one_extra_before() {
        check_streamed(&[b" n", b".", b"\xBA", b"1"], WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_before() {
        check_streamed(&[b"n", b".\xBA", b"1"], WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_longer_first_buffer() {
        check_streamed(&[b"rrn.", b"\xBA", b"1"], WINDOWS_1252);
    }

    #[test]
    fn test_empty() {
        let mut detector = EncodingDetector::new();
        let saw_non_ascii = detector.feed(b"", true);
        let guessed = detector.guess(None, false);
        assert_eq!(guessed, WINDOWS_1252);
        assert!(!saw_non_ascii);
    }

    #[test]
    fn test_fi() {
        check("Ääni", WINDOWS_1252);
    }

    #[test]
    fn test_fi_bis() {
        check("Tämä", WINDOWS_1252);
    }

    #[test]
    fn test_pt() {
        check(
            "Este é um teste de codificação de caracteres.",
            WINDOWS_1252,
        );
    }

    #[test]
    fn test_is() {
        check("Þetta er kóðunarpróf á staf. Fyrir sum tungumál sem nota latneska stafi þurfum við meira inntak til að taka ákvörðunina.", WINDOWS_1252);
    }

    #[test]
    fn test_ru_short() {
        check("Русский", WINDOWS_1251);
    }

    #[test]
    fn test_ru() {
        check(RU_SENTENCE, WINDOWS_1251);
    }

    #[test]
    fn test_ru_iso() {
        check(RU_SENTENCE, ISO_8859_5);
    }

    #[test]
    fn test_ru_ibm() {
        check(RU_SENTENCE, IBM866);
    }

    #[test]
    fn test_ru_koi() {
        check(RU_SENTENCE, KOI8_U);
    }

    #[test]
    fn test_uk() {
        check(UK_SENTENCE, WINDOWS_1251);
    }

    #[test]
    fn test_uk_koi() {
        check(UK_SENTENCE, KOI8_U);
    }

    #[test]
    fn test_el_short() {
        check("Ελληνικά", WINDOWS_1253);
    }

    #[test]
    fn test_el() {
        check(EL_SENTENCE, WINDOWS_1253);
    }

    #[test]
    fn test_el_iso() {
        check(EL_SENTENCE, ISO_8859_7);
    }

    #[test]
    fn test_de() {
        check("Straße", WINDOWS_1252);
    }

    #[test]
    fn test_he() {
        check("\u{5E2}\u{5D1}\u{5E8}\u{5D9}\u{5EA}", WINDOWS_1255);
    }

    #[test]
    fn test_2022() {
        check("日本語", ISO_2022_JP);
    }

    #[test]
    fn test_th() {
        check("นี่คือการทดสอบการเข้ารหัสอักขระ", WINDOWS_874);
    }

    #[test]
    fn test_vi() {
        check("Đây là một thử nghiệm mã hóa ký tự.", WINDOWS_1258);
    }

    #[test]
    fn test_tr() {
        check("Bu bir karakter kodlama testidir. Latince karakterleri kullanan bazı dillerde karar vermek için daha fazla girdiye ihtiyacımız var.", WINDOWS_1254);
    }

    #[test]
    fn test_simplified() {
        check("这是一个字符编码测试。", GBK);
    }

    #[test]
    fn test_traditional() {
        check("這是一個字符編碼測試。", BIG5);
    }

    #[test]
    fn test_ko() {
        check("이것은 문자 인코딩 테스트입니다.", EUC_KR);
    }

    #[test]
    fn test_shift() {
        check(JA_SENTENCE, SHIFT_JIS);
    }

    #[test]
    fn test_euc() {
        check(JA_SENTENCE, EUC_JP);
    }

    #[test]
    fn test_ar() {
        check(AR_SENTENCE, WINDOWS_1256);
    }

    #[test]
    fn test_ar_iso() {
        check(AR_SENTENCE, ISO_8859_6);
    }

    #[test]
    fn test_fa() {
        check("این یک تست رمزگذاری کاراکتر است.", WINDOWS_1256);
    }

    #[test]
    fn test_visual() {
        check(".םיוות דודיק ןחבמ והז", ISO_8859_8);
    }

    #[test]
    fn test_yi() {
        check("דאָס איז אַ טעסט פֿאַר קאָדירונג פון כאַראַקטער.", WINDOWS_1255);
    }

    #[test]
    fn test_it() {
        check("è", WINDOWS_1252);
    }

    #[test]
    fn test_en() {
        check("isn’t", WINDOWS_1252);
    }

    #[test]
    fn test_en_bis() {
        check("Rock ’n Roll", WINDOWS_1252);
    }

    #[test]
    fn test_ca() {
        check("Codificació de caràcters", WINDOWS_1252);
    }

    #[test]
    fn test_et() {
        check("või", WINDOWS_1252);
    }

    #[test]
    fn test_pl_iso() {
        check(PL_SENTENCE, ISO_8859_2);
    }

    #[test]
    fn test_pl() {
        check(PL_SENTENCE, WINDOWS_1250);
    }

    #[test]
    fn test_lt() {
        check(LT_SENTENCE, WINDOWS_1257);
    }

    // TODO: Detected as ISO-8859-2.
    // #[test]
    // fn test_lt_windows_iso_8859_4() {
    //     check(LT_SENTENCE, ISO_8859_4);
    // }

    #[test]
    fn test_lv() {
        check(LV_SENTENCE, WINDOWS_1257);
    }

    #[test]
    fn test_lv_iso_8859_4() {
        check(LV_SENTENCE, ISO_8859_4);
    }

    #[test]
    fn test_a0() {
        // Test that this isn't IBM866. TODO: What about GBK with fully paired 0xA0?
        check("\u{A0}\u{A0} \u{A0}", WINDOWS_1252);
    }

    #[test]
    fn test_a0a0() {
        // Test that this isn't GBK or EUC-KR.
        check("\u{A0}\u{A0}", WINDOWS_1252);
    }

    #[test]
    fn test_space_copyright_space() {
        check(" © ", WINDOWS_1252);
    }

    #[test]
    fn test_space_masculine_space() {
        check(" º ", WINDOWS_1252);
    }

    #[test]
    fn test_space_feminine_space() {
        check(" ª ", WINDOWS_1252);
    }

    #[test]
    fn test_period_masculine_space() {
        check(".º ", WINDOWS_1252);
    }

    #[test]
    fn test_period_feminine_space() {
        check(".ª ", WINDOWS_1252);
    }

    #[test]
    fn test_maria() {
        check(" Mª ", WINDOWS_1252);
    }

    #[test]
    fn test_dona() {
        check(" Dª ", WINDOWS_1252);
    }

    #[test]
    fn test_nuestra() {
        check(" Nª ", WINDOWS_1252);
    }

    #[test]
    fn test_senora() {
        check(" Sª ", WINDOWS_1252);
    }

    #[test]
    fn test_digit_feminine() {
        check(" 42ª ", WINDOWS_1252);
    }

    #[test]
    fn test_digit_masculine() {
        check(" 42º ", WINDOWS_1252);
    }

    #[test]
    fn test_roman_feminine() {
        check(" XIVª ", WINDOWS_1252);
    }

    #[test]
    fn test_roman_masculine() {
        check(" XIVº ", WINDOWS_1252);
    }

    #[test]
    fn test_numero_uno() {
        check("Nº1", WINDOWS_1252);
    }

    #[test]
    fn test_numero() {
        check("Nº", WINDOWS_1252);
    }

    #[test]
    fn test_euro() {
        check(" €9", WINDOWS_1252);
    }

    #[test]
    fn test_shift_jis_half_width_katakana() {
        check("ハードウェアハードウェアハードウェアハードウェアハードウェア", SHIFT_JIS);
    }

    #[test]
    fn test_big5_pua() {
        check_filled(b"", BIG5_FILLER, 40, b"\x81\x40\xA4\x40", BIG5);
    }

    #[test]
    fn test_big5_single_byte_a0() {
        check_filled(b"", BIG5_FILLER, 80, b"\x81\x40\xA0 ", BIG5);
    }

    #[test]
    fn test_big5_single_byte_ff() {
        check_filled(b"", BIG5_FILLER, 80, b"\x81\x40\xFF ", BIG5);
    }

    #[test]
    fn test_not_big5() {
        check_filled(b"", BIG5_FILLER, 40, b"\x81\x40\xA0\xA0", IBM866);
    }

    #[test]
    fn test_euc_kr_pua() {
        check_filled(b"\xC9\xA1\xB0\xA1 ", EUC_KR_FILLER, 40, b"", EUC_KR);
    }

    #[test]
    fn test_euc_kr_pua_bis() {
        check_filled(b"\xFE\xA1\xB0\xA1 ", EUC_KR_FILLER, 40, b"", EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_ff() {
        check_filled(b"\xFF ", EUC_KR_FILLER, 40, b"", EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_81() {
        check_filled(b"\x81 ", EUC_KR_FILLER, 40, b"", EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_84() {
        check_filled(b"\x84 ", EUC_KR_FILLER, 40, b"", EUC_KR);
    }

    #[test]
    fn test_not_euc_kr() {
        check_filled(b"\xC9\xA0\xB0\xA1 ", EUC_KR_FILLER, 40, b"", GBK);
    }

    #[test]
    fn test_shift_jis_x0213() {
        check_filled(b"\x87\xE5", SHIFT_JIS_FILLER, 40, b"", SHIFT_JIS);
    }

    #[test]
    fn test_shift_jis_single_byte_fd() {
        check_filled(b"\xFD", SHIFT_JIS_FILLER, 40, b"", SHIFT_JIS);
    }

    #[test]
    fn test_not_shift_jis() {
        check_filled(b"\x84\xE0", SHIFT_JIS_FILLER, 40, b"", GBK);
    }

    #[test]
    fn test_not_shift_jis_bis() {
        check_filled(b"\x87\x7D", SHIFT_JIS_FILLER, 40, b"", GBK);
    }

    #[test]
    fn test_euc_jp_x0213() {
        check_filled(b"\xAD\xBF", EUC_JP_FILLER, 80, b"", EUC_JP);
    }

    #[test]
    fn test_euc_jp_x0213_other_plane() {
        check_filled(b"\x8F\xFE\xF6", EUC_JP_FILLER, 80, b"", EUC_JP);
    }

    #[test]
    fn test_not_euc_jp() {
        check_filled(b"\x8F\xFE\xF7", EUC_JP_FILLER, 80, b"", WINDOWS_1252);
    }

    #[test]
    fn test_not_euc_jp_bis() {
        check_filled(b"\xA8\xDF", EUC_JP_FILLER, 80, b"", BIG5);
    }

    #[test]
    fn test_gbk_single_byte_ff() {
        check_filled(b"\xFF", GBK_FILLER, 80, b"", GBK);
    }

    #[test]
    fn test_gbk_single_byte_a0() {
        check_filled(b"\xA0 ", GBK_FILLER, 80, b"", GBK);
    }

    #[test]
    fn test_gbk_single_byte_fe() {
        check_filled(b"\xFE ", GBK_FILLER, 80, b"", GBK);
    }

    #[test]
    fn test_not_gbk_single_byte_fc() {
        check_filled(b"\xFC ", GBK_FILLER, 80, b"", ISO_8859_5);
    }
}
3776