1 // Copyright 2019 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 //! `chardetng` is a character encoding detector for legacy Web content.
11 //!
12 //! It is optimized for binary size in applications that already depend
13 //! on `encoding_rs` for other reasons.
14
15 use encoding_rs::Decoder;
16 use encoding_rs::DecoderResult;
17 use encoding_rs::Encoding;
18 use encoding_rs::BIG5;
19 use encoding_rs::EUC_JP;
20 use encoding_rs::EUC_KR;
21 use encoding_rs::GBK;
22 use encoding_rs::ISO_2022_JP;
23 use encoding_rs::ISO_8859_8;
24 use encoding_rs::SHIFT_JIS;
25 use encoding_rs::UTF_8;
26 use encoding_rs::WINDOWS_1255;
27
28 mod data;
29 mod tld;
30 use data::*;
31 use tld::classify_tld;
32 use tld::Tld;
33
/// Added to a single-byte candidate's score when a character of the
/// Latin-letter class is directly adjacent to a non-Latin alphabetic
/// character (and the pair is not ASCII-only).
const LATIN_ADJACENCY_PENALTY: i64 = -50;

/// Negative value that the windows-1252 ordinal state machine subtracts
/// from `ORDINAL_BONUS` (thereby adding its magnitude) in its
/// "undo implausibility" states.
/// NOTE(review): the place where this penalty is first applied is not in
/// this part of the file; presumably the pairwise score data -- confirm.
const IMPLAUSIBILITY_PENALTY: i64 = -220;

/// Bonus granted by the windows-1252 ordinal state machine when an
/// ordinal-indicator sequence (for example " º " or " 3ª ") is completed.
const ORDINAL_BONUS: i64 = 300;

/// Must match the ISO-8859-2 score for " Š ". Note: There
/// are four Slovenian Wikipedia list page titles where the
/// list is split by letter so that Š stands alone for the
/// list part for Š. Let's assume that's a special case not
/// worth detecting even though the copyright sign detection
/// makes Slovenian title detection round to one percentage
/// point worse.
const COPYRIGHT_BONUS: i64 = 222;

/// Applied by the Latin case state machines when, in a pair that is not
/// ASCII-only, a lower-case letter follows two or more upper-case letters
/// or an upper-case letter follows a lower-case letter.
const IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY: i64 = -180;

/// Once-per-word bonus for a non-Latin word that consists of one
/// upper-case letter followed only by lower-case letters.
const NON_LATIN_CAPITALIZATION_BONUS: i64 = 40;

/// Once-per-word penalty for a non-Latin word of two or more letters that
/// are all upper-case. Only applied to the KOI8-U candidate.
const NON_LATIN_ALL_CAPS_PENALTY: i64 = -40;

/// Penalty for a mixed-case non-Latin word; multiplied by the number of
/// letters in the word.
const NON_LATIN_MIXED_CASE_PENALTY: i64 = -20;
56
// CJK scoring constants.
//
// NOTE(review): only the GBK constants and the shared CJK constants at the
// end of this group are consumed in the part of the file reviewed here. The
// notes on the Shift_JIS, EUC-JP, Big5 and EUC-KR constants are limited to
// their evaluated values; confirm their exact use against the consumers.

// Manually calibrated relative to windows-1256 Arabic
/// Base score unit for the CJK candidates. Most CJK scores and penalties
/// below are defined as multiples or fractions of it.
const CJK_BASE_SCORE: i64 = 41;

/// Secondary, smaller CJK score unit used for second-tier categories.
const CJK_SECONDARY_BASE_SCORE: i64 = 20; // Was 20

const SHIFT_JIS_SCORE_PER_KANA: i64 = 20;

/// Evaluates to 41.
const SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;

/// Evaluates to 20.
const SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;

// Manually calibrated relative to windows-1256 Persian and Urdu
const SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY: i64 = -75;

const HALF_WIDTH_KATAKANA_SCORE: i64 = 1;

// Unclear if this is a good idea; seems not harmful, but can't be sure.
const HALF_WIDTH_KATAKANA_VOICING_SCORE: i64 = 10;

/// Evaluates to -410.
const SHIFT_JIS_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Should this be larger?

/// Evaluates to -820.
const SHIFT_JIS_EXTENSION_PENALTY: i64 = SHIFT_JIS_PUA_PENALTY * 2;

/// Evaluates to -820.
const SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY: i64 = SHIFT_JIS_EXTENSION_PENALTY;

/// Evaluates to 54 (41 + 13; integer division).
const EUC_JP_SCORE_PER_KANA: i64 = CJK_BASE_SCORE + (CJK_BASE_SCORE / 3); // Relative to Big5

/// Evaluates to 40.
const EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA: i64 = CJK_BASE_SCORE - 1;

/// Evaluates to 41.
const EUC_JP_SCORE_PER_LEVEL_1_KANJI: i64 = CJK_BASE_SCORE;

/// Evaluates to 20.
const EUC_JP_SCORE_PER_LEVEL_2_KANJI: i64 = CJK_SECONDARY_BASE_SCORE;

/// Evaluates to 5.
const EUC_JP_SCORE_PER_OTHER_KANJI: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// Evaluates to -14.
const EUC_JP_INITIAL_KANA_PENALTY: i64 = -((CJK_BASE_SCORE / 3) + 1);

/// Evaluates to -2050.
const EUC_JP_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 50); // Needs to be more severe than for Shift_JIS to avoid misdetecting EUC-KR!

/// Evaluates to 41.
const BIG5_SCORE_PER_LEVEL_1_HANZI: i64 = CJK_BASE_SCORE;

/// Evaluates to 20.
const BIG5_SCORE_PER_OTHER_HANZI: i64 = CJK_SECONDARY_BASE_SCORE;

/// Evaluates to -1230.
const BIG5_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 30); // More severe than other PUA penalties to avoid misdetecting EUC-KR! (25 as the multiplier is too little)

/// Evaluates to -1640.
const BIG5_SINGLE_BYTE_EXTENSION_PENALTY: i64 = -(CJK_BASE_SCORE * 40);

/// Evaluates to 42.
const EUC_KR_SCORE_PER_EUC_HANGUL: i64 = CJK_BASE_SCORE + 1;

/// Evaluates to 4.
const EUC_KR_SCORE_PER_NON_EUC_HANGUL: i64 = CJK_SECONDARY_BASE_SCORE / 5;

/// Evaluates to 10.
const EUC_KR_SCORE_PER_HANJA: i64 = CJK_SECONDARY_BASE_SCORE / 2;

/// Evaluates to -410.
const EUC_KR_HANJA_AFTER_HANGUL_PENALTY: i64 = -(CJK_BASE_SCORE * 10);

const EUC_KR_LONG_WORD_PENALTY: i64 = -6;

/// Evaluates to -411, i.e. one point worse than `GBK_PUA_PENALTY`.
const EUC_KR_PUA_PENALTY: i64 = GBK_PUA_PENALTY - 1; // Break tie in favor of GBK

/// Evaluates to -822.
const EUC_KR_MAC_KOREAN_PENALTY: i64 = EUC_KR_PUA_PENALTY * 2;

/// Evaluates to -822.
const EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY: i64 = EUC_KR_MAC_KOREAN_PENALTY;

/// GBK score for an ideograph in U+4E00..=U+9FA5 whose trail byte is in
/// 0xA1..=0xFE and whose preceding byte is in 0xA1..=0xD7. Evaluates to 41.
const GBK_SCORE_PER_LEVEL_1: i64 = CJK_BASE_SCORE;

/// GBK score for an ideograph in U+4E00..=U+9FA5 whose trail byte is in
/// 0xA1..=0xFE and whose preceding byte is in 0xD8..=0xFE. Evaluates to 20.
const GBK_SCORE_PER_LEVEL_2: i64 = CJK_SECONDARY_BASE_SCORE;

/// GBK score for ideographs outside the two ranges above, including the
/// GB18030-required PUA mappings. Evaluates to 5.
const GBK_SCORE_PER_NON_EUC: i64 = CJK_SECONDARY_BASE_SCORE / 4;

/// GBK penalty for a decoded Private Use Area code point. Evaluates to -410.
const GBK_PUA_PENALTY: i64 = -(CJK_BASE_SCORE * 10); // Factor should be at least 2, but should it be larger?

/// Evaluates to -1640.
const GBK_SINGLE_BYTE_EXTENSION_PENALTY: i64 = GBK_PUA_PENALTY * 4;

/// Penalty for an ASCII letter directly next to a CJK ideograph.
/// Evaluates to -41.
const CJK_LATIN_ADJACENCY_PENALTY: i64 = -CJK_BASE_SCORE; // smaller penalty than LATIN_ADJACENCY_PENALTY

/// Score for selected ideographic/full-width punctuation. Evaluates to 20
/// (integer division).
const CJ_PUNCTUATION: i64 = CJK_BASE_SCORE / 2;

/// Score for other non-ASCII characters decoded by a CJK candidate.
/// Evaluates to 5.
const CJK_OTHER: i64 = CJK_SECONDARY_BASE_SCORE / 4;
135
/// Latin letter caseless class: the class value (after the case bit has
/// been masked off with `& 0x7F`) that the non-Latin candidates compare
/// against to recognize a Latin letter.
const LATIN_LETTER: u8 = 1;
138
/// Returns `true` if `label` contains at least one byte that is non-ASCII
/// (`>= 0x80`), an ASCII period (`.`), or an ASCII upper-case letter
/// (`A`..=`Z`).
///
/// An empty `label` yields `false`.
fn contains_upper_case_period_or_non_ascii(label: &[u8]) -> bool {
    label
        .iter()
        .any(|&b| b >= 0x80 || b == b'.' || b.is_ascii_uppercase())
}
153
// For Latin, we only penalize pairwise bad transitions
// if one participant is non-ASCII. This avoids violating
// the principle that ASCII pairs never contribute to the
// score. (Maybe that's a bad principle, though!)
/// Case state of the current run of Latin letters, updated byte by byte.
#[derive(PartialEq)]
enum LatinCaseState {
    /// The previous byte was not a Latin letter (also the initial state).
    Space,
    /// The previous byte was an upper-case letter that did not directly
    /// follow another upper-case letter.
    Upper,
    /// The previous byte was a lower-case letter.
    Lower,
    /// The previous two or more letters were all upper-case.
    AllCaps,
}
165
// For non-Latin, we calculate case-related penalty
// or bonus on a per-non-Latin-word basis.
/// Case pattern of the current non-Latin word, updated byte by byte and
/// evaluated when the word ends.
#[derive(PartialEq)]
enum NonLatinCaseState {
    /// Not inside a word (also the initial state).
    Space,
    /// The word so far is a single upper-case letter.
    Upper,
    /// The word so far consists of lower-case letters only.
    Lower,
    /// The word so far is one upper-case letter followed by one or more
    /// lower-case letters.
    UpperLower,
    /// The word so far is two or more upper-case letters.
    AllCaps,
    /// The word has any other mix of cases, or a Latin letter was seen.
    Mix,
}
177
/// Scoring state for a single-byte encoding candidate whose non-Latin
/// script has upper and lower case.
struct NonLatinCasedCandidate {
    /// Classification and scoring data for the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 initially).
    prev: u8,
    /// Case pattern of the non-Latin word currently being read.
    case_state: NonLatinCaseState,
    /// Whether the previously fed byte was ASCII (`true` initially).
    prev_ascii: bool,
    /// Number of consecutive non-Latin alphabetic bytes seen so far in the
    /// current word.
    current_word_len: u64,
    /// Length of the longest non-Latin word completed so far.
    longest_word: u64,
    /// Whether `data` is the IBM866 entry of `SINGLE_BYTE_DATA`.
    ibm866: bool,
    prev_was_a0: bool, // Only used with IBM866
}
188
189 impl NonLatinCasedCandidate {
new(data: &'static SingleByteData) -> Self190 fn new(data: &'static SingleByteData) -> Self {
191 NonLatinCasedCandidate {
192 data: data,
193 prev: 0,
194 case_state: NonLatinCaseState::Space,
195 prev_ascii: true,
196 current_word_len: 0,
197 longest_word: 0,
198 ibm866: data == &SINGLE_BYTE_DATA[IBM866_INDEX],
199 prev_was_a0: false,
200 }
201 }
202
feed(&mut self, buffer: &[u8]) -> Option<i64>203 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
204 let mut score = 0i64;
205 for &b in buffer {
206 let class = self.data.classify(b);
207 if class == 255 {
208 return None;
209 }
210 let caseless_class = class & 0x7F;
211
212 let ascii = b < 0x80;
213 let ascii_pair = self.prev_ascii && ascii;
214
215 let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
216
217 // The purpose of this state machine is to avoid misdetecting Greek as
218 // Cyrillic by:
219 //
220 // * Giving a small bonus to words that start with an upper-case letter
221 // and are lower-case for the rest.
222 // * Giving a large penalty to start with one lower-case letter followed
223 // by all upper-case (obviously upper and lower case inverted, which
224 // unfortunately is possible due to KOI8-U).
225 // * Giving a small per-word penalty to all-uppercase KOI8-U (to favor
226 // all-lowercase Greek over all-caps KOI8-U).
227 // * Giving large penalties for mixed-case other than initial upper-case.
228 // This also helps relative to non-cased encodings.
229
230 // ASCII doesn't participate in non-Latin casing.
231 if caseless_class == LATIN_LETTER {
232 // Latin
233 // Mark this word as a mess. If there end up being non-Latin
234 // letters in this word, the ASCII-adjacency penalty gets
235 // applied to Latin/non-Latin pairs and the mix penalty
236 // to non-Latin/non-Latin pairs.
237 // XXX Apply penalty here
238 self.case_state = NonLatinCaseState::Mix;
239 } else if !non_ascii_alphabetic {
240 // Space
241 match self.case_state {
242 NonLatinCaseState::Space
243 | NonLatinCaseState::Upper
244 | NonLatinCaseState::Lower => {}
245 NonLatinCaseState::UpperLower => {
246 // Intentionally applied only once per word.
247 score += NON_LATIN_CAPITALIZATION_BONUS;
248 }
249 NonLatinCaseState::AllCaps => {
250 // Intentionally applied only once per word.
251 if self.data == &SINGLE_BYTE_DATA[KOI8_U_INDEX] {
252 // Apply only to KOI8-U.
253 score += NON_LATIN_ALL_CAPS_PENALTY;
254 }
255 }
256 NonLatinCaseState::Mix => {
257 // Per letter
258 score += NON_LATIN_MIXED_CASE_PENALTY * (self.current_word_len as i64);
259 }
260 }
261 self.case_state = NonLatinCaseState::Space;
262 } else if (class >> 7) == 0 {
263 // Lower case
264 match self.case_state {
265 NonLatinCaseState::Space => {
266 self.case_state = NonLatinCaseState::Lower;
267 }
268 NonLatinCaseState::Upper => {
269 self.case_state = NonLatinCaseState::UpperLower;
270 }
271 NonLatinCaseState::Lower
272 | NonLatinCaseState::UpperLower
273 | NonLatinCaseState::Mix => {}
274 NonLatinCaseState::AllCaps => {
275 self.case_state = NonLatinCaseState::Mix;
276 }
277 }
278 } else {
279 // Upper case
280 match self.case_state {
281 NonLatinCaseState::Space => {
282 self.case_state = NonLatinCaseState::Upper;
283 }
284 NonLatinCaseState::Upper => {
285 self.case_state = NonLatinCaseState::AllCaps;
286 }
287 NonLatinCaseState::Lower | NonLatinCaseState::UpperLower => {
288 self.case_state = NonLatinCaseState::Mix;
289 }
290 NonLatinCaseState::AllCaps | NonLatinCaseState::Mix => {}
291 }
292 }
293
294 // XXX Apply penalty if > 16
295 if non_ascii_alphabetic {
296 self.current_word_len += 1;
297 } else {
298 if self.current_word_len > self.longest_word {
299 self.longest_word = self.current_word_len;
300 }
301 self.current_word_len = 0;
302 }
303
304 let is_a0 = b == 0xA0;
305 if !ascii_pair {
306 // 0xA0 is no-break space in many other encodings, so avoid
307 // assigning score to IBM866 when 0xA0 occurs next to itself
308 // or a space-like byte.
309 if !(self.ibm866
310 && ((is_a0 && (self.prev_was_a0 || self.prev == 0))
311 || caseless_class == 0 && self.prev_was_a0))
312 {
313 score += self.data.score(caseless_class, self.prev, false);
314 }
315
316 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
317 score += LATIN_ADJACENCY_PENALTY;
318 } else if caseless_class == LATIN_LETTER
319 && self.data.is_non_latin_alphabetic(self.prev, false)
320 {
321 score += LATIN_ADJACENCY_PENALTY;
322 }
323 }
324
325 self.prev_ascii = ascii;
326 self.prev = caseless_class;
327 self.prev_was_a0 = is_a0;
328 }
329 Some(score)
330 }
331 }
332
/// States of the windows-1252-only state machine that recognizes ordinal
/// indicator sequences (bytes 0xAA / 0xBA) and a stand-alone copyright
/// sign (byte 0xA9).
enum OrdinalState {
    /// Inside a sequence that cannot become a recognized pattern; waits
    /// for a space-like byte.
    Other,
    /// After a space-like byte (also the initial state).
    Space,
    /// After `N.` or `n.`; an 0xBA byte may follow.
    PeriodAfterN,
    /// After an ordinal indicator; a space-like byte completes the pattern.
    OrdinalExpectingSpace,
    /// Like `OrdinalExpectingSpace`, but completion additionally subtracts
    /// `IMPLAUSIBILITY_PENALTY` from the bonus.
    OrdinalExpectingSpaceUndoImplausibility,
    /// After an ordinal indicator; a space-like byte or an ASCII digit
    /// completes the pattern.
    OrdinalExpectingSpaceOrDigit,
    /// Like `OrdinalExpectingSpaceOrDigit`, but completion additionally
    /// subtracts `IMPLAUSIBILITY_PENALTY` from the bonus.
    OrdinalExpectingSpaceOrDigitUndoImplausibily,
    /// After a space-like byte followed by `N`.
    UpperN,
    /// After a space-like byte followed by `n`.
    LowerN,
    /// After a space-like byte followed by `M`, `D` or `S`.
    FeminineAbbreviationStartLetter,
    /// Inside a run of ASCII digits that started after a space-like byte.
    Digit,
    /// Inside a run of Roman-numeral letters (I, V, X classes) that started
    /// after a space-like byte.
    Roman,
    /// After a space-like byte followed by byte 0xA9.
    Copyright,
}
348
/// Scoring state for a Latin-script single-byte encoding candidate.
struct LatinCandidate {
    /// Classification and scoring data for the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 initially).
    prev: u8,
    /// Case state of the current run of Latin letters.
    case_state: LatinCaseState,
    /// Number of consecutive non-ASCII bytes immediately before the current
    /// byte; reset to 0 by every ASCII byte.
    prev_non_ascii: u32,
    ordinal_state: OrdinalState, // Used only when `windows1252 == true`
    /// Whether `data` is the windows-1252 entry of `SINGLE_BYTE_DATA`.
    windows1252: bool,
}
357
358 impl LatinCandidate {
new(data: &'static SingleByteData) -> Self359 fn new(data: &'static SingleByteData) -> Self {
360 LatinCandidate {
361 data: data,
362 prev: 0,
363 case_state: LatinCaseState::Space,
364 prev_non_ascii: 0,
365 ordinal_state: OrdinalState::Space,
366 windows1252: data == &SINGLE_BYTE_DATA[WINDOWS_1252_INDEX],
367 }
368 }
369
feed(&mut self, buffer: &[u8]) -> Option<i64>370 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
371 let mut score = 0i64;
372 for &b in buffer {
373 let class = self.data.classify(b);
374 if class == 255 {
375 return None;
376 }
377 let caseless_class = class & 0x7F;
378
379 let ascii = b < 0x80;
380 let ascii_pair = self.prev_non_ascii == 0 && ascii;
381
382 let non_ascii_penalty = match self.prev_non_ascii {
383 0 | 1 | 2 => 0,
384 3 => -5,
385 4 => -20,
386 _ => -200,
387 };
388 score += non_ascii_penalty;
389 // XXX if has Vietnamese-only characters and word length > 7,
390 // apply penalty
391
392 if !self.data.is_latin_alphabetic(caseless_class) {
393 self.case_state = LatinCaseState::Space;
394 } else if (class >> 7) == 0 {
395 // Penalizing lower case after two upper case
396 // is important for avoiding misdetecting
397 // windows-1250 as windows-1252 (byte 0x9F).
398 if self.case_state == LatinCaseState::AllCaps && !ascii_pair {
399 score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
400 }
401 self.case_state = LatinCaseState::Lower;
402 } else {
403 match self.case_state {
404 LatinCaseState::Space => {
405 self.case_state = LatinCaseState::Upper;
406 }
407 LatinCaseState::Upper | LatinCaseState::AllCaps => {
408 self.case_state = LatinCaseState::AllCaps;
409 }
410 LatinCaseState::Lower => {
411 if !ascii_pair {
412 // XXX How bad is this for Irish Gaelic?
413 score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
414 }
415 self.case_state = LatinCaseState::Upper;
416 }
417 }
418 }
419
420 // Treat pairing space-like, which can be non-ASCII, with ASCII as
421 // ASCIIish enough not to get a score in order to avoid giving
422 // ASCII i and I in windows-1254 next to windows-125x apostrophe/quote
423 // a score. This avoids detecting English I’ as Turkish.
424 let ascii_ish_pair = ascii_pair
425 || (ascii && self.prev == 0)
426 || (caseless_class == 0 && self.prev_non_ascii == 0);
427
428 if !ascii_ish_pair {
429 score += self.data.score(caseless_class, self.prev, false);
430 }
431
432 if self.windows1252 {
433 // This state machine assigns score to the sequences
434 // * " º " (Spanish)
435 // * " ª " (Spanish)
436 // * ".ª " (Spanish)
437 // * ".º " (Spanish)
438 // * "n.º1" (Spanish)
439 // * " Mª " (Spanish)
440 // * " Dª " (Spanish)
441 // * " Nª " (Spanish)
442 // * " Sª " (Spanish)
443 // * " 3º " (Italian, where 3 is an ASCII digit)
444 // * " 3ª " (Italian, where 3 is an ASCII digit)
445 // * " Xº " (Italian, where X is a small Roman numeral)
446 // * " Xª " (Italian, where X is a small Roman numeral)
447 // * " Nº1" (Italian, where 1 is an ASCII digit)
448 // * " Nº " (Italian)
449 // * " © " (otherwise ASCII-only)
450 // which are problematic to deal with by pairwise scoring
451 // without messing up Romanian detection.
452 // Initial sc
453 match self.ordinal_state {
454 OrdinalState::Other => {
455 if caseless_class == 0 {
456 self.ordinal_state = OrdinalState::Space;
457 }
458 }
459 OrdinalState::Space => {
460 if caseless_class == 0 {
461 // pass
462 } else if b == 0xAA || b == 0xBA {
463 self.ordinal_state = OrdinalState::OrdinalExpectingSpace;
464 } else if b == b'M' || b == b'D' || b == b'S' {
465 self.ordinal_state = OrdinalState::FeminineAbbreviationStartLetter;
466 } else if b == b'N' {
467 // numero or Nuestra
468 self.ordinal_state = OrdinalState::UpperN;
469 } else if b == b'n' {
470 // numero
471 self.ordinal_state = OrdinalState::LowerN;
472 } else if caseless_class == (ASCII_DIGIT as u8) {
473 self.ordinal_state = OrdinalState::Digit;
474 } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24
475 /* X */
476 {
477 self.ordinal_state = OrdinalState::Roman;
478 } else if b == 0xA9 {
479 self.ordinal_state = OrdinalState::Copyright;
480 } else {
481 self.ordinal_state = OrdinalState::Other;
482 }
483 }
484 OrdinalState::OrdinalExpectingSpace => {
485 if caseless_class == 0 {
486 score += ORDINAL_BONUS;
487 self.ordinal_state = OrdinalState::Space;
488 } else {
489 self.ordinal_state = OrdinalState::Other;
490 }
491 }
492 OrdinalState::OrdinalExpectingSpaceUndoImplausibility => {
493 if caseless_class == 0 {
494 score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
495 self.ordinal_state = OrdinalState::Space;
496 } else {
497 self.ordinal_state = OrdinalState::Other;
498 }
499 }
500 OrdinalState::OrdinalExpectingSpaceOrDigit => {
501 if caseless_class == 0 {
502 score += ORDINAL_BONUS;
503 self.ordinal_state = OrdinalState::Space;
504 } else if caseless_class == (ASCII_DIGIT as u8) {
505 score += ORDINAL_BONUS;
506 // Deliberately set to `Other`
507 self.ordinal_state = OrdinalState::Other;
508 } else {
509 self.ordinal_state = OrdinalState::Other;
510 }
511 }
512 OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily => {
513 if caseless_class == 0 {
514 score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
515 self.ordinal_state = OrdinalState::Space;
516 } else if caseless_class == (ASCII_DIGIT as u8) {
517 score += ORDINAL_BONUS - IMPLAUSIBILITY_PENALTY;
518 // Deliberately set to `Other`
519 self.ordinal_state = OrdinalState::Other;
520 } else {
521 self.ordinal_state = OrdinalState::Other;
522 }
523 }
524 OrdinalState::UpperN => {
525 if b == 0xAA {
526 self.ordinal_state =
527 OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
528 } else if b == 0xBA {
529 self.ordinal_state =
530 OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
531 } else if b == b'.' {
532 self.ordinal_state = OrdinalState::PeriodAfterN;
533 } else if caseless_class == 0 {
534 self.ordinal_state = OrdinalState::Space;
535 } else {
536 self.ordinal_state = OrdinalState::Other;
537 }
538 }
539 OrdinalState::LowerN => {
540 if b == 0xBA {
541 self.ordinal_state =
542 OrdinalState::OrdinalExpectingSpaceOrDigitUndoImplausibily;
543 } else if b == b'.' {
544 self.ordinal_state = OrdinalState::PeriodAfterN;
545 } else if caseless_class == 0 {
546 self.ordinal_state = OrdinalState::Space;
547 } else {
548 self.ordinal_state = OrdinalState::Other;
549 }
550 }
551 OrdinalState::FeminineAbbreviationStartLetter => {
552 if b == 0xAA {
553 self.ordinal_state =
554 OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
555 } else if caseless_class == 0 {
556 self.ordinal_state = OrdinalState::Space;
557 } else {
558 self.ordinal_state = OrdinalState::Other;
559 }
560 }
561 OrdinalState::Digit => {
562 if b == 0xAA || b == 0xBA {
563 self.ordinal_state = OrdinalState::OrdinalExpectingSpace;
564 } else if caseless_class == 0 {
565 self.ordinal_state = OrdinalState::Space;
566 } else if caseless_class == (ASCII_DIGIT as u8) {
567 // pass
568 } else {
569 self.ordinal_state = OrdinalState::Other;
570 }
571 }
572 OrdinalState::Roman => {
573 if b == 0xAA || b == 0xBA {
574 self.ordinal_state =
575 OrdinalState::OrdinalExpectingSpaceUndoImplausibility;
576 } else if caseless_class == 0 {
577 self.ordinal_state = OrdinalState::Space;
578 } else if caseless_class == 9 /* I */ || caseless_class == 22 /* V */ || caseless_class == 24
579 /* X */
580 {
581 // pass
582 } else {
583 self.ordinal_state = OrdinalState::Other;
584 }
585 }
586 OrdinalState::PeriodAfterN => {
587 if b == 0xBA {
588 self.ordinal_state = OrdinalState::OrdinalExpectingSpaceOrDigit;
589 } else if caseless_class == 0 {
590 self.ordinal_state = OrdinalState::Space;
591 } else {
592 self.ordinal_state = OrdinalState::Other;
593 }
594 }
595 OrdinalState::Copyright => {
596 if caseless_class == 0 {
597 score += COPYRIGHT_BONUS;
598 self.ordinal_state = OrdinalState::Space;
599 } else {
600 self.ordinal_state = OrdinalState::Other;
601 }
602 }
603 }
604 }
605
606 if ascii {
607 self.prev_non_ascii = 0;
608 } else {
609 self.prev_non_ascii += 1;
610 }
611 self.prev = caseless_class;
612 }
613 Some(score)
614 }
615 }
616
/// Scoring state for a single-byte candidate that scores Arabic text while
/// also tracking Latin (French) letter case.
struct ArabicFrenchCandidate {
    /// Classification and scoring data for the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 initially).
    prev: u8,
    /// Case state of the current run of Latin letters.
    case_state: LatinCaseState,
    /// Whether the previously fed byte was ASCII (`true` initially).
    prev_ascii: bool,
    /// Number of consecutive non-Latin alphabetic bytes seen so far in the
    /// current word.
    current_word_len: u64,
    /// Length of the longest non-Latin word completed so far.
    longest_word: u64,
}
625
626 impl ArabicFrenchCandidate {
new(data: &'static SingleByteData) -> Self627 fn new(data: &'static SingleByteData) -> Self {
628 ArabicFrenchCandidate {
629 data: data,
630 prev: 0,
631 case_state: LatinCaseState::Space,
632 prev_ascii: true,
633 current_word_len: 0,
634 longest_word: 0,
635 }
636 }
637
feed(&mut self, buffer: &[u8]) -> Option<i64>638 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
639 let mut score = 0i64;
640 for &b in buffer {
641 let class = self.data.classify(b);
642 if class == 255 {
643 return None;
644 }
645 let caseless_class = class & 0x7F;
646
647 let ascii = b < 0x80;
648 let ascii_pair = self.prev_ascii && ascii;
649
650 if caseless_class != LATIN_LETTER {
651 // We compute case penalties for French only
652 self.case_state = LatinCaseState::Space;
653 } else if (class >> 7) == 0 {
654 if self.case_state == LatinCaseState::AllCaps && !ascii_pair {
655 score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
656 }
657 self.case_state = LatinCaseState::Lower;
658 } else {
659 match self.case_state {
660 LatinCaseState::Space => {
661 self.case_state = LatinCaseState::Upper;
662 }
663 LatinCaseState::Upper | LatinCaseState::AllCaps => {
664 self.case_state = LatinCaseState::AllCaps;
665 }
666 LatinCaseState::Lower => {
667 if !ascii_pair {
668 score += IMPLAUSIBLE_LATIN_CASE_TRANSITION_PENALTY;
669 }
670 self.case_state = LatinCaseState::Upper;
671 }
672 }
673 }
674
675 // Count only Arabic word length and ignore French
676 let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, true);
677 // XXX apply penalty if > 23
678 if non_ascii_alphabetic {
679 self.current_word_len += 1;
680 } else {
681 if self.current_word_len > self.longest_word {
682 self.longest_word = self.current_word_len;
683 }
684 self.current_word_len = 0;
685 }
686
687 if !ascii_pair {
688 score += self.data.score(caseless_class, self.prev, true);
689
690 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
691 score += LATIN_ADJACENCY_PENALTY;
692 } else if caseless_class == LATIN_LETTER
693 && self.data.is_non_latin_alphabetic(self.prev, true)
694 {
695 score += LATIN_ADJACENCY_PENALTY;
696 }
697 }
698
699 self.prev_ascii = ascii;
700 self.prev = caseless_class;
701 }
702 Some(score)
703 }
704 }
705
/// Scoring state for a single-byte candidate that does no case tracking.
struct CaselessCandidate {
    /// Classification and scoring data for the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 initially).
    prev: u8,
    /// Whether the previously fed byte was ASCII (`true` initially).
    prev_ascii: bool,
    /// Number of consecutive non-Latin alphabetic bytes seen so far in the
    /// current word.
    current_word_len: u64,
    /// Length of the longest non-Latin word completed so far.
    longest_word: u64,
}
713
714 impl CaselessCandidate {
new(data: &'static SingleByteData) -> Self715 fn new(data: &'static SingleByteData) -> Self {
716 CaselessCandidate {
717 data: data,
718 prev: 0,
719 prev_ascii: true,
720 current_word_len: 0,
721 longest_word: 0,
722 }
723 }
724
feed(&mut self, buffer: &[u8]) -> Option<i64>725 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
726 let mut score = 0i64;
727 for &b in buffer {
728 let class = self.data.classify(b);
729 if class == 255 {
730 return None;
731 }
732 let caseless_class = class & 0x7F;
733
734 let ascii = b < 0x80;
735 let ascii_pair = self.prev_ascii && ascii;
736
737 let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
738 // Apply penalty if > 23 and not Thai
739 if non_ascii_alphabetic {
740 self.current_word_len += 1;
741 } else {
742 if self.current_word_len > self.longest_word {
743 self.longest_word = self.current_word_len;
744 }
745 self.current_word_len = 0;
746 }
747
748 if !ascii_pair {
749 score += self.data.score(caseless_class, self.prev, false);
750
751 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
752 score += LATIN_ADJACENCY_PENALTY;
753 } else if caseless_class == LATIN_LETTER
754 && self.data.is_non_latin_alphabetic(self.prev, false)
755 {
756 score += LATIN_ADJACENCY_PENALTY;
757 }
758 }
759
760 self.prev_ascii = ascii;
761 self.prev = caseless_class;
762 }
763 Some(score)
764 }
765 }
766
/// Returns `true` when `byte` is one of the ASCII punctuation marks
/// `.`, `,`, `:`, `;`, `?` or `!`.
fn is_ascii_punctuation(byte: u8) -> bool {
    const PUNCTUATION: &[u8] = b".,:;?!";
    PUNCTUATION.contains(&byte)
}
773
/// Scoring state for a single-byte candidate that also counts ASCII
/// punctuation directly following a non-Latin letter.
struct LogicalCandidate {
    /// Classification and scoring data for the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 initially).
    prev: u8,
    /// Whether the previously fed byte was ASCII (`true` initially).
    prev_ascii: bool,
    /// Number of times an ASCII punctuation byte directly followed a
    /// non-Latin alphabetic byte.
    plausible_punctuation: u64,
    /// Number of consecutive non-Latin alphabetic bytes seen so far in the
    /// current word.
    current_word_len: u64,
    /// Length of the longest non-Latin word completed so far.
    longest_word: u64,
}
782
783 impl LogicalCandidate {
new(data: &'static SingleByteData) -> Self784 fn new(data: &'static SingleByteData) -> Self {
785 LogicalCandidate {
786 data: data,
787 prev: 0,
788 prev_ascii: true,
789 plausible_punctuation: 0,
790 current_word_len: 0,
791 longest_word: 0,
792 }
793 }
794
feed(&mut self, buffer: &[u8]) -> Option<i64>795 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
796 let mut score = 0i64;
797 for &b in buffer {
798 let class = self.data.classify(b);
799 if class == 255 {
800 return None;
801 }
802 let caseless_class = class & 0x7F;
803
804 let ascii = b < 0x80;
805 let ascii_pair = self.prev_ascii && ascii;
806
807 let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
808 // XXX apply penalty if > 22
809 if non_ascii_alphabetic {
810 self.current_word_len += 1;
811 } else {
812 if self.current_word_len > self.longest_word {
813 self.longest_word = self.current_word_len;
814 }
815 self.current_word_len = 0;
816 }
817
818 if !ascii_pair {
819 score += self.data.score(caseless_class, self.prev, false);
820
821 let prev_non_ascii_alphabetic = self.data.is_non_latin_alphabetic(self.prev, false);
822 if caseless_class == 0 && prev_non_ascii_alphabetic && is_ascii_punctuation(b) {
823 self.plausible_punctuation += 1;
824 }
825
826 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
827 score += LATIN_ADJACENCY_PENALTY;
828 } else if caseless_class == LATIN_LETTER && prev_non_ascii_alphabetic {
829 score += LATIN_ADJACENCY_PENALTY;
830 }
831 }
832
833 self.prev_ascii = ascii;
834 self.prev = caseless_class;
835 }
836 Some(score)
837 }
838 }
839
/// Scoring state for a single-byte candidate that also counts non-Latin
/// letters directly following ASCII punctuation.
struct VisualCandidate {
    /// Classification and scoring data for the candidate encoding.
    data: &'static SingleByteData,
    /// Caseless class of the previously fed byte (0 initially).
    prev: u8,
    /// Whether the previously fed byte was ASCII (`true` initially).
    prev_ascii: bool,
    /// Whether the previously fed byte was ASCII punctuation with caseless
    /// class 0.
    prev_punctuation: bool,
    /// Number of times a non-Latin alphabetic byte directly followed an
    /// ASCII punctuation byte.
    plausible_punctuation: u64,
    /// Number of consecutive non-Latin alphabetic bytes seen so far in the
    /// current word.
    current_word_len: u64,
    /// Length of the longest non-Latin word completed so far.
    longest_word: u64,
}
849
850 impl VisualCandidate {
new(data: &'static SingleByteData) -> Self851 fn new(data: &'static SingleByteData) -> Self {
852 VisualCandidate {
853 data: data,
854 prev: 0,
855 prev_ascii: true,
856 prev_punctuation: false,
857 plausible_punctuation: 0,
858 current_word_len: 0,
859 longest_word: 0,
860 }
861 }
862
feed(&mut self, buffer: &[u8]) -> Option<i64>863 fn feed(&mut self, buffer: &[u8]) -> Option<i64> {
864 let mut score = 0i64;
865 for &b in buffer {
866 let class = self.data.classify(b);
867 if class == 255 {
868 return None;
869 }
870 let caseless_class = class & 0x7F;
871
872 let ascii = b < 0x80;
873 let ascii_pair = self.prev_ascii && ascii;
874
875 let non_ascii_alphabetic = self.data.is_non_latin_alphabetic(caseless_class, false);
876 // XXX apply penalty if > 22
877 if non_ascii_alphabetic {
878 self.current_word_len += 1;
879 } else {
880 if self.current_word_len > self.longest_word {
881 self.longest_word = self.current_word_len;
882 }
883 self.current_word_len = 0;
884 }
885
886 if !ascii_pair {
887 score += self.data.score(caseless_class, self.prev, false);
888
889 if non_ascii_alphabetic && self.prev_punctuation {
890 self.plausible_punctuation += 1;
891 }
892
893 if self.prev == LATIN_LETTER && non_ascii_alphabetic {
894 score += LATIN_ADJACENCY_PENALTY;
895 } else if caseless_class == LATIN_LETTER
896 && self.data.is_non_latin_alphabetic(self.prev, false)
897 {
898 score += LATIN_ADJACENCY_PENALTY;
899 }
900 }
901
902 self.prev_ascii = ascii;
903 self.prev = caseless_class;
904 self.prev_punctuation = caseless_class == 0 && is_ascii_punctuation(b);
905 }
906 Some(score)
907 }
908 }
909
/// Candidate that only checks whether the input decodes without
/// malformed sequences.
struct Utf8Candidate {
    /// Decoder the input is run through.
    /// NOTE(review): presumably a UTF-8 decoder; it is constructed outside
    /// this part of the file -- confirm.
    decoder: Decoder,
}
913
914 impl Utf8Candidate {
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>915 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
916 let mut dst = [0u8; 1024];
917 let mut total_read = 0;
918 loop {
919 let (result, read, _) = self.decoder.decode_to_utf8_without_replacement(
920 &buffer[total_read..],
921 &mut dst,
922 last,
923 );
924 total_read += read;
925 match result {
926 DecoderResult::InputEmpty => {
927 return Some(0);
928 }
929 DecoderResult::Malformed(_, _) => {
930 return None;
931 }
932 DecoderResult::OutputFull => {
933 continue;
934 }
935 }
936 }
937 }
938 }
939
/// Candidate that only checks whether the input decodes without
/// malformed sequences.
struct Iso2022Candidate {
    /// Decoder the input is run through.
    /// NOTE(review): presumably an ISO-2022-JP decoder; it is constructed
    /// outside this part of the file -- confirm.
    decoder: Decoder,
}
943
944 impl Iso2022Candidate {
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>945 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
946 let mut dst = [0u16; 1024];
947 let mut total_read = 0;
948 loop {
949 let (result, read, _) = self.decoder.decode_to_utf16_without_replacement(
950 &buffer[total_read..],
951 &mut dst,
952 last,
953 );
954 total_read += read;
955 match result {
956 DecoderResult::InputEmpty => {
957 return Some(0);
958 }
959 DecoderResult::Malformed(_, _) => {
960 return None;
961 }
962 DecoderResult::OutputFull => {
963 continue;
964 }
965 }
966 }
967 }
968 }
969
/// Category of the previously decoded character, used for the penalty on
/// an ASCII letter sitting next to a CJK ideograph.
#[derive(PartialEq)]
enum LatinCj {
    /// An ASCII letter (`a`-`z` or `A`-`Z`).
    AsciiLetter,
    /// A CJK ideograph.
    Cj,
    /// Anything else.
    Other,
}
976
/// NOTE(review): not used in the part of the file reviewed here. Judging by
/// the variant names, this presumably records which voicing marks (dakuten /
/// handakuten) may follow the previous half-width katakana -- confirm
/// against the code that uses it.
#[derive(PartialEq, Copy, Clone)]
enum HalfWidthKatakana {
    DakutenForbidden,
    DakutenAllowed,
    DakutenOrHandakutenAllowed,
}
983
/// NOTE(review): not used in the part of the file reviewed here. Judging by
/// the variant names, this presumably categorizes the previously decoded
/// character for a Korean candidate, analogous to `LatinCj` -- confirm
/// against the code that uses it.
#[derive(PartialEq)]
enum LatinKorean {
    AsciiLetter,
    Hangul,
    Hanja,
    Other,
}
991
/// Looks up `u` in `table` and returns a small extra score that is larger
/// the earlier `u` first occurs: `(128 - position) / 16`, i.e. 8 for
/// position 0 down to 0 for the last 15 positions.
///
/// Returns 0 when `u` does not occur in `table`.
fn cjk_extra_score(u: u16, table: &'static [u16; 128]) -> i64 {
    table
        .iter()
        .position(|&entry| entry == u)
        .map_or(0, |index| ((128 - index) / 16) as i64)
}
999
/// Scoring state for the GBK candidate.
struct GbkCandidate {
    /// Decoder the input is run through, one byte at a time.
    decoder: Decoder,
    /// NOTE(review): presumably the byte fed immediately before the current
    /// one; the assignment is outside this part of the file -- confirm.
    prev_byte: u8,
    /// Category of the previously decoded character.
    prev: LatinCj,
    /// Score that has been withheld for the previous character. It is added
    /// to the total if the next character is CJK-like and discarded if the
    /// next character is ASCII or the euro sign.
    pending_score: Option<i64>,
}
1006
1007 impl GbkCandidate {
maybe_set_as_pending(&mut self, s: i64) -> i641008 fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1009 assert!(self.pending_score.is_none());
1010 if self.prev == LatinCj::Cj || !more_problematic_lead(self.prev_byte) {
1011 s
1012 } else {
1013 self.pending_score = Some(s);
1014 0
1015 }
1016 }
1017
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>1018 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1019 let mut score = 0i64;
1020 let mut src = [0u8];
1021 let mut dst = [0u16; 2];
1022 for &b in buffer {
1023 src[0] = b;
1024 let (result, read, written) = self
1025 .decoder
1026 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1027 if written == 1 {
1028 let u = dst[0];
1029 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1030 || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1031 {
1032 self.pending_score = None; // Discard pending score
1033 if self.prev == LatinCj::Cj {
1034 score += CJK_LATIN_ADJACENCY_PENALTY;
1035 }
1036 self.prev = LatinCj::AsciiLetter;
1037 } else if u == 0x20AC {
1038 // euro sign
1039 self.pending_score = None; // Discard pending score
1040 // Should there even be a penalty?
1041 self.prev = LatinCj::Other;
1042 } else if u >= 0x4E00 && u <= 0x9FA5 {
1043 if let Some(pending) = self.pending_score {
1044 score += pending;
1045 self.pending_score = None;
1046 }
1047 if b >= 0xA1 && b <= 0xFE {
1048 match self.prev_byte {
1049 0xA1..=0xD7 => {
1050 score += GBK_SCORE_PER_LEVEL_1;
1051 score +=
1052 cjk_extra_score(u, &data::DETECTOR_DATA.frequent_simplified);
1053 }
1054 0xD8..=0xFE => score += GBK_SCORE_PER_LEVEL_2,
1055 _ => {
1056 score += GBK_SCORE_PER_NON_EUC;
1057 }
1058 }
1059 } else {
1060 score += self.maybe_set_as_pending(GBK_SCORE_PER_NON_EUC);
1061 }
1062 if self.prev == LatinCj::AsciiLetter {
1063 score += CJK_LATIN_ADJACENCY_PENALTY;
1064 }
1065 self.prev = LatinCj::Cj;
1066 } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1067 if let Some(pending) = self.pending_score {
1068 score += pending;
1069 self.pending_score = None;
1070 }
1071 // XXX score?
1072 if self.prev == LatinCj::AsciiLetter {
1073 score += CJK_LATIN_ADJACENCY_PENALTY;
1074 }
1075 self.prev = LatinCj::Cj;
1076 } else if u >= 0xE000 && u < 0xF900 {
1077 if let Some(pending) = self.pending_score {
1078 score += pending;
1079 self.pending_score = None;
1080 }
1081 // Treat the GB18030-required PUA mappings as non-EUC ideographs.
1082 match u {
1083 0xE78D..=0xE796
1084 | 0xE816..=0xE818
1085 | 0xE81E
1086 | 0xE826
1087 | 0xE82B
1088 | 0xE82C
1089 | 0xE831
1090 | 0xE832
1091 | 0xE83B
1092 | 0xE843
1093 | 0xE854
1094 | 0xE855
1095 | 0xE864 => {
1096 score += GBK_SCORE_PER_NON_EUC;
1097 if self.prev == LatinCj::AsciiLetter {
1098 score += CJK_LATIN_ADJACENCY_PENALTY;
1099 }
1100 self.prev = LatinCj::Cj;
1101 }
1102 _ => {
1103 score += GBK_PUA_PENALTY;
1104 self.prev = LatinCj::Other;
1105 }
1106 }
1107 } else {
1108 match u {
1109 0x3000 // Distinct from Korean, space
1110 | 0x3001 // Distinct from Korean, enumeration comma
1111 | 0x3002 // Distinct from Korean, full stop
1112 | 0xFF08 // Distinct from Korean, parenthesis
1113 | 0xFF09 // Distinct from Korean, parenthesis
1114 | 0xFF01 // Distinct from Japanese, exclamation
1115 | 0xFF0C // Distinct from Japanese, comma
1116 | 0xFF1B // Distinct from Japanese, semicolon
1117 | 0xFF1F // Distinct from Japanese, question
1118 => {
1119 if let Some(pending) = self.pending_score {
1120 score += pending;
1121 self.pending_score = None;
1122 }
1123 score += CJ_PUNCTUATION;
1124 }
1125 0..=0x7F => {
1126 self.pending_score = None; // Discard pending score
1127 }
1128 _ => {
1129 if let Some(pending) = self.pending_score {
1130 score += pending;
1131 self.pending_score = None;
1132 }
1133 score += CJK_OTHER;
1134 }
1135 }
1136 self.prev = LatinCj::Other;
1137 }
1138 } else if written == 2 {
1139 if let Some(pending) = self.pending_score {
1140 score += pending;
1141 self.pending_score = None;
1142 }
1143 let u = dst[0];
1144 if u >= 0xDB80 && u <= 0xDBFF {
1145 score += GBK_PUA_PENALTY;
1146 self.prev = LatinCj::Other;
1147 } else if u >= 0xD480 && u < 0xD880 {
1148 score += GBK_SCORE_PER_NON_EUC;
1149 if self.prev == LatinCj::AsciiLetter {
1150 score += CJK_LATIN_ADJACENCY_PENALTY;
1151 }
1152 self.prev = LatinCj::Cj;
1153 } else {
1154 score += CJK_OTHER;
1155 self.prev = LatinCj::Other;
1156 }
1157 }
1158 match result {
1159 DecoderResult::InputEmpty => {
1160 assert_eq!(read, 1);
1161 }
1162 DecoderResult::Malformed(malformed_len, _) => {
1163 if (self.prev_byte == 0xA0 || self.prev_byte == 0xFE || self.prev_byte == 0xFD)
1164 && (b < 0x80 || b == 0xFF)
1165 {
1166 // Mac OS Chinese Simplified single-byte that conflicts with code page GBK lead byte
1167 // followed by ASCII or a non-conflicting single-byte extension.
1168 self.pending_score = None; // Just in case
1169 score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
1170 if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
1171 self.prev = LatinCj::AsciiLetter;
1172 } else if b == 0xFF {
1173 score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
1174 self.prev = LatinCj::Other;
1175 } else {
1176 self.prev = LatinCj::Other;
1177 }
1178 // The GBK decoder has the pending ASCII concept, which is
1179 // a problem with this trickery, so let's reset the state.
1180 self.decoder = GBK.new_decoder_without_bom_handling();
1181 } else if malformed_len == 1 && b == 0xFF {
1182 // Mac OS Chinese Simplified single-byte extension that doesn't conflict with lead bytes
1183 self.pending_score = None; // Just in case
1184 score += GBK_SINGLE_BYTE_EXTENSION_PENALTY;
1185 self.prev = LatinCj::Other;
1186 // The GBK decoder has the pending ASCII concept, which is
1187 // a problem with this trickery, so let's reset the state.
1188 self.decoder = GBK.new_decoder_without_bom_handling();
1189 } else {
1190 return None;
1191 }
1192 }
1193 DecoderResult::OutputFull => {
1194 unreachable!();
1195 }
1196 }
1197 self.prev_byte = b;
1198 }
1199 if last {
1200 let (result, _, _) = self
1201 .decoder
1202 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1203 match result {
1204 DecoderResult::InputEmpty => {}
1205 DecoderResult::Malformed(_, _) => {
1206 return None;
1207 }
1208 DecoderResult::OutputFull => {
1209 unreachable!();
1210 }
1211 }
1212 }
1213 Some(score)
1214 }
1215 }
1216
/// Reports whether `b` belongs to the set of lead bytes that the Shift_JIS
/// and Big5 candidates single out as problematic.
///
/// The `maybe_set_as_pending` methods of those candidates use the answer to
/// decide whether a freshly earned score is awarded at once or held back as
/// a pending score.
fn problematic_lead(b: u8) -> bool {
    matches!(
        b,
        0x8A | 0x8B | 0x8E | 0x91..=0x97 | 0x9A | 0x9B | 0x9E | 0xB0
    )
}
1224
1225 // GBK and EUC-KR
more_problematic_lead(b: u8) -> bool1226 fn more_problematic_lead(b: u8) -> bool {
1227 problematic_lead(b) || b == 0x82 || b == 0x84 || b == 0x85 || b == 0xA0
1228 }
1229
/// Scoring state for the Shift_JIS candidate.
struct ShiftJisCandidate {
    /// Decoder that `feed` drives one input byte at a time.
    decoder: Decoder,
    /// Becomes `true` when the first half-width katakana is decoded; `feed`
    /// applies a one-time penalty at that moment.
    half_width_katakana_seen: bool,
    /// Which voicing mark, if any, the previously decoded character allows
    /// to follow it.
    half_width_katakana_state: HalfWidthKatakana,
    /// Coarse class of the previously scored character; drives the
    /// Latin/CJK adjacency penalty.
    prev: LatinCj,
    /// The input byte that `feed` processed just before the current one.
    prev_byte: u8,
    /// Score held back by `maybe_set_as_pending` until `feed` either adds it
    /// to the total or discards it.
    pending_score: Option<i64>,
}
1238
1239 impl ShiftJisCandidate {
maybe_set_as_pending(&mut self, s: i64) -> i641240 fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1241 assert!(self.pending_score.is_none());
1242 if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) {
1243 s
1244 } else {
1245 self.pending_score = Some(s);
1246 0
1247 }
1248 }
1249
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>1250 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1251 let mut score = 0i64;
1252 let mut src = [0u8];
1253 let mut dst = [0u16; 2];
1254 for &b in buffer {
1255 src[0] = b;
1256 let (result, read, written) = self
1257 .decoder
1258 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1259 if written > 0 {
1260 let half_width_katakana_state = self.half_width_katakana_state;
1261 self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
1262 let u = dst[0];
1263 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1264 || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1265 {
1266 self.pending_score = None; // Discard pending score
1267 if self.prev == LatinCj::Cj {
1268 score += CJK_LATIN_ADJACENCY_PENALTY;
1269 }
1270 self.prev = LatinCj::AsciiLetter;
1271 } else if u >= 0xFF61 && u <= 0xFF9F {
1272 if !self.half_width_katakana_seen {
1273 self.half_width_katakana_seen = true;
1274 // To avoid misdetecting title-length inputs
1275 score += SHIFT_JIS_INITIAL_HALF_WIDTH_KATAKANA_PENALTY;
1276 }
1277 self.pending_score = None; // Discard pending score
1278 score += HALF_WIDTH_KATAKANA_SCORE;
1279
1280 if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
1281 self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
1282 } else if u >= 0xFF8A && u <= 0xFF8E {
1283 self.half_width_katakana_state =
1284 HalfWidthKatakana::DakutenOrHandakutenAllowed;
1285 } else if u == 0xFF9E {
1286 if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
1287 score += IMPLAUSIBILITY_PENALTY;
1288 } else {
1289 score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1290 }
1291 } else if u == 0xFF9F {
1292 if half_width_katakana_state
1293 != HalfWidthKatakana::DakutenOrHandakutenAllowed
1294 {
1295 score += IMPLAUSIBILITY_PENALTY;
1296 } else {
1297 score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1298 }
1299 }
1300
1301 if self.prev == LatinCj::AsciiLetter {
1302 score += CJK_LATIN_ADJACENCY_PENALTY;
1303 }
1304 self.prev = LatinCj::Cj;
1305 } else if u >= 0x3040 && u < 0x3100 {
1306 if let Some(pending) = self.pending_score {
1307 score += pending;
1308 self.pending_score = None;
1309 }
1310 score += SHIFT_JIS_SCORE_PER_KANA;
1311 if self.prev == LatinCj::AsciiLetter {
1312 score += CJK_LATIN_ADJACENCY_PENALTY;
1313 }
1314 self.prev = LatinCj::Cj;
1315 } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1316 if let Some(pending) = self.pending_score {
1317 score += pending;
1318 self.pending_score = None;
1319 }
1320 if self.prev_byte < 0x98 || (self.prev_byte == 0x98 && b < 0x73) {
1321 score += self.maybe_set_as_pending(
1322 SHIFT_JIS_SCORE_PER_LEVEL_1_KANJI
1323 + cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji),
1324 );
1325 } else {
1326 score += self.maybe_set_as_pending(SHIFT_JIS_SCORE_PER_LEVEL_2_KANJI);
1327 }
1328 if self.prev == LatinCj::AsciiLetter {
1329 score += CJK_LATIN_ADJACENCY_PENALTY;
1330 }
1331 self.prev = LatinCj::Cj;
1332 } else if u >= 0xE000 && u < 0xF900 {
1333 if let Some(pending) = self.pending_score {
1334 score += pending;
1335 self.pending_score = None;
1336 }
1337 score += SHIFT_JIS_PUA_PENALTY;
1338 self.prev = LatinCj::Other;
1339 } else {
1340 match u {
1341 0x3000 // Distinct from Korean, space
1342 | 0x3001 // Distinct from Korean, enumeration comma
1343 | 0x3002 // Distinct from Korean, full stop
1344 | 0xFF08 // Distinct from Korean, parenthesis
1345 | 0xFF09 // Distinct from Korean, parenthesis
1346 => {
1347 if let Some(pending) = self.pending_score {
1348 score += pending;
1349 self.pending_score = None;
1350 }
1351 // Not really needed for CJK distinction
1352 // but let's give non-zero score for these
1353 // common byte pairs anyway.
1354 score += CJ_PUNCTUATION;
1355 }
1356 0..=0x7F => {
1357 self.pending_score = None; // Discard pending score
1358 }
1359 0x80 => {
1360 // This is a control character that overlaps euro
1361 // in windows-1252 and happens to be a non-error
1362 // is Shift_JIS.
1363 self.pending_score = None; // Discard pending score
1364 score += IMPLAUSIBILITY_PENALTY;
1365 }
1366 _ => {
1367 if let Some(pending) = self.pending_score {
1368 score += pending;
1369 self.pending_score = None;
1370 }
1371 score += CJK_OTHER;
1372 }
1373 }
1374 self.prev = LatinCj::Other;
1375 }
1376 }
1377 match result {
1378 DecoderResult::InputEmpty => {
1379 assert_eq!(read, 1);
1380 }
1381 DecoderResult::Malformed(malformed_len, _) => {
1382 if (((self.prev_byte >= 0x81 && self.prev_byte <= 0x9F)
1383 || (self.prev_byte >= 0xE0 && self.prev_byte <= 0xFC))
1384 && ((b >= 0x40 && b <= 0x7E) || (b >= 0x80 && b <= 0xFC)))
1385 && !((self.prev_byte == 0x82 && b >= 0xFA)
1386 || (self.prev_byte == 0x84 && ((b >= 0xDD && b <= 0xE4) || b >= 0xFB))
1387 || (self.prev_byte == 0x86 && b >= 0xF2 && b <= 0xFA)
1388 || (self.prev_byte == 0x87 && b >= 0x77 && b <= 0x7D)
1389 || (self.prev_byte == 0xFC && b >= 0xF5))
1390 {
1391 // Shift_JIS2004 or MacJapanese
1392 if let Some(pending) = self.pending_score {
1393 score += pending;
1394 self.pending_score = None;
1395 }
1396 score += SHIFT_JIS_EXTENSION_PENALTY;
1397 // Approximate boundary
1398 if self.prev_byte < 0x87 {
1399 self.prev = LatinCj::Other;
1400 } else {
1401 if self.prev == LatinCj::AsciiLetter {
1402 score += CJK_LATIN_ADJACENCY_PENALTY;
1403 }
1404 self.prev = LatinCj::Cj;
1405 }
1406 } else if malformed_len == 1 && (b == 0xA0 || b >= 0xFD) {
1407 self.pending_score = None; // Just in case
1408 score += SHIFT_JIS_SINGLE_BYTE_EXTENSION_PENALTY;
1409 self.prev = LatinCj::Other;
1410 } else {
1411 return None;
1412 }
1413 }
1414 DecoderResult::OutputFull => {
1415 unreachable!();
1416 }
1417 }
1418 self.prev_byte = b;
1419 }
1420 if last {
1421 let (result, _, _) = self
1422 .decoder
1423 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1424 match result {
1425 DecoderResult::InputEmpty => {}
1426 DecoderResult::Malformed(_, _) => {
1427 return None;
1428 }
1429 DecoderResult::OutputFull => {
1430 unreachable!();
1431 }
1432 }
1433 }
1434 Some(score)
1435 }
1436 }
1437
/// Scoring state for the EUC-JP candidate.
struct EucJpCandidate {
    /// Decoder that `feed` drives one input byte at a time.
    decoder: Decoder,
    /// Becomes `true` when the first code unit at or above 0x80 is decoded;
    /// `feed` uses that moment to penalize input whose first non-ASCII
    /// character is kana.
    non_ascii_seen: bool,
    /// Which voicing mark, if any, the previously decoded character allows
    /// to follow it.
    half_width_katakana_state: HalfWidthKatakana,
    /// Coarse class of the previously scored character; drives the
    /// Latin/CJK adjacency penalty.
    prev: LatinCj,
    /// The input byte that `feed` processed just before the current one.
    prev_byte: u8,
    /// The input byte processed before `prev_byte`; `feed` compares it with
    /// 0x8F when scoring kanji and when judging malformed pairs.
    prev_prev_byte: u8,
}
1446
1447 impl EucJpCandidate {
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>1448 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1449 let mut score = 0i64;
1450 let mut src = [0u8];
1451 let mut dst = [0u16; 2];
1452 for &b in buffer {
1453 src[0] = b;
1454 let (result, read, written) = self
1455 .decoder
1456 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1457 if written > 0 {
1458 let half_width_katakana_state = self.half_width_katakana_state;
1459 self.half_width_katakana_state = HalfWidthKatakana::DakutenForbidden;
1460 let u = dst[0];
1461 if !self.non_ascii_seen && u >= 0x80 {
1462 self.non_ascii_seen = true;
1463 if u >= 0x3040 && u < 0x3100 {
1464 // Remove the kana advantage over initial Big5
1465 // hanzi.
1466 score += EUC_JP_INITIAL_KANA_PENALTY;
1467 }
1468 }
1469 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1470 || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1471 {
1472 if self.prev == LatinCj::Cj {
1473 score += CJK_LATIN_ADJACENCY_PENALTY;
1474 }
1475 self.prev = LatinCj::AsciiLetter;
1476 } else if u >= 0xFF61 && u <= 0xFF9F {
1477 score += HALF_WIDTH_KATAKANA_SCORE;
1478
1479 if (u >= 0xFF76 && u <= 0xFF84) || u == 0xFF73 {
1480 self.half_width_katakana_state = HalfWidthKatakana::DakutenAllowed;
1481 } else if u >= 0xFF8A && u <= 0xFF8E {
1482 self.half_width_katakana_state =
1483 HalfWidthKatakana::DakutenOrHandakutenAllowed;
1484 } else if u == 0xFF9E {
1485 if half_width_katakana_state == HalfWidthKatakana::DakutenForbidden {
1486 score += IMPLAUSIBILITY_PENALTY;
1487 } else {
1488 score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1489 }
1490 } else if u == 0xFF9F {
1491 if half_width_katakana_state
1492 != HalfWidthKatakana::DakutenOrHandakutenAllowed
1493 {
1494 score += IMPLAUSIBILITY_PENALTY;
1495 } else {
1496 score += HALF_WIDTH_KATAKANA_VOICING_SCORE;
1497 }
1498 }
1499
1500 if self.prev == LatinCj::AsciiLetter {
1501 score += CJK_LATIN_ADJACENCY_PENALTY;
1502 }
1503 self.prev = LatinCj::Other;
1504 } else if (u >= 0x3041 && u <= 0x3093) || (u >= 0x30A1 && u <= 0x30F6) {
1505 match u {
1506 0x3090 // hiragana wi
1507 | 0x3091 // hiragana we
1508 | 0x30F0 // katakana wi
1509 | 0x30F1 // katakana we
1510 => {
1511 // Remove advantage over Big5 Hanzi
1512 score += EUC_JP_SCORE_PER_NEAR_OBSOLETE_KANA;
1513 }
1514 _ => {
1515 score += EUC_JP_SCORE_PER_KANA;
1516 }
1517 }
1518 if self.prev == LatinCj::AsciiLetter {
1519 score += CJK_LATIN_ADJACENCY_PENALTY;
1520 }
1521 self.prev = LatinCj::Cj;
1522 } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1523 if self.prev_prev_byte == 0x8F {
1524 score += EUC_JP_SCORE_PER_OTHER_KANJI;
1525 } else if self.prev_byte < 0xD0 {
1526 score += EUC_JP_SCORE_PER_LEVEL_1_KANJI;
1527 score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_kanji);
1528 } else {
1529 score += EUC_JP_SCORE_PER_LEVEL_2_KANJI;
1530 }
1531 if self.prev == LatinCj::AsciiLetter {
1532 score += CJK_LATIN_ADJACENCY_PENALTY;
1533 }
1534 self.prev = LatinCj::Cj;
1535 } else {
1536 match u {
1537 0x3000 // Distinct from Korean, space
1538 | 0x3001 // Distinct from Korean, enumeration comma
1539 | 0x3002 // Distinct from Korean, full stop
1540 | 0xFF08 // Distinct from Korean, parenthesis
1541 | 0xFF09 // Distinct from Korean, parenthesis
1542 => {
1543 score += CJ_PUNCTUATION;
1544 }
1545 0..=0x7F => {}
1546 _ => {
1547 score += CJK_OTHER;
1548 }
1549 }
1550 self.prev = LatinCj::Other;
1551 }
1552 }
1553 match result {
1554 DecoderResult::InputEmpty => {
1555 assert_eq!(read, 1);
1556 }
1557 DecoderResult::Malformed(_, _) => {
1558 if b >= 0xA1
1559 && b <= 0xFE
1560 && self.prev_byte >= 0xA1
1561 && self.prev_byte <= 0xFE
1562 && ((self.prev_prev_byte != 0x8F
1563 && !(self.prev_byte == 0xA8 && b >= 0xDF && b <= 0xE6)
1564 && !(self.prev_byte == 0xAC && b >= 0xF4 && b <= 0xFC)
1565 && !(self.prev_byte == 0xAD && b >= 0xD8 && b <= 0xDE))
1566 || (self.prev_prev_byte == 0x8F
1567 && self.prev_byte != 0xA2
1568 && self.prev_byte != 0xA6
1569 && self.prev_byte != 0xA7
1570 && self.prev_byte != 0xA9
1571 && self.prev_byte != 0xAA
1572 && self.prev_byte != 0xAB
1573 && self.prev_byte != 0xED
1574 && !(self.prev_byte == 0xFE && b >= 0xF7)))
1575 {
1576 score += EUC_JP_EXTENSION_PENALTY;
1577 if self.prev == LatinCj::AsciiLetter {
1578 score += CJK_LATIN_ADJACENCY_PENALTY;
1579 }
1580 self.prev = LatinCj::Cj;
1581 } else {
1582 return None;
1583 }
1584 }
1585 DecoderResult::OutputFull => {
1586 unreachable!();
1587 }
1588 }
1589 self.prev_prev_byte = self.prev_byte;
1590 self.prev_byte = b;
1591 }
1592 if last {
1593 let (result, _, _) = self
1594 .decoder
1595 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1596 match result {
1597 DecoderResult::InputEmpty => {}
1598 DecoderResult::Malformed(_, _) => {
1599 return None;
1600 }
1601 DecoderResult::OutputFull => {
1602 unreachable!();
1603 }
1604 }
1605 }
1606 Some(score)
1607 }
1608 }
1609
/// Scoring state for the Big5 candidate.
struct Big5Candidate {
    /// Decoder that `feed` drives one input byte at a time.
    decoder: Decoder,
    /// Coarse class of the previously scored character; drives the
    /// Latin/CJK adjacency penalty.
    prev: LatinCj,
    /// The input byte that `feed` processed just before the current one.
    prev_byte: u8,
    /// Score held back by `maybe_set_as_pending` until `feed` either adds it
    /// to the total or discards it.
    pending_score: Option<i64>,
}
1616
1617 impl Big5Candidate {
maybe_set_as_pending(&mut self, s: i64) -> i641618 fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1619 assert!(self.pending_score.is_none());
1620 if self.prev == LatinCj::Cj || !problematic_lead(self.prev_byte) {
1621 s
1622 } else {
1623 self.pending_score = Some(s);
1624 0
1625 }
1626 }
1627
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>1628 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1629 let mut score = 0i64;
1630 let mut src = [0u8];
1631 let mut dst = [0u16; 2];
1632 for &b in buffer {
1633 src[0] = b;
1634 let (result, read, written) = self
1635 .decoder
1636 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1637 if written == 1 {
1638 let u = dst[0];
1639 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1640 || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1641 {
1642 self.pending_score = None; // Discard pending score
1643 if self.prev == LatinCj::Cj {
1644 score += CJK_LATIN_ADJACENCY_PENALTY;
1645 }
1646 self.prev = LatinCj::AsciiLetter;
1647 } else if (u >= 0x3400 && u < 0xA000) || (u >= 0xF900 && u < 0xFB00) {
1648 if let Some(pending) = self.pending_score {
1649 score += pending;
1650 self.pending_score = None;
1651 }
1652 match self.prev_byte {
1653 0xA4..=0xC6 => {
1654 score += self.maybe_set_as_pending(BIG5_SCORE_PER_LEVEL_1_HANZI);
1655 // score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_traditional);
1656 }
1657 _ => {
1658 score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
1659 }
1660 }
1661 if self.prev == LatinCj::AsciiLetter {
1662 score += CJK_LATIN_ADJACENCY_PENALTY;
1663 }
1664 self.prev = LatinCj::Cj;
1665 } else {
1666 match u {
1667 0x3000 // Distinct from Korean, space
1668 | 0x3001 // Distinct from Korean, enumeration comma
1669 | 0x3002 // Distinct from Korean, full stop
1670 | 0xFF08 // Distinct from Korean, parenthesis
1671 | 0xFF09 // Distinct from Korean, parenthesis
1672 | 0xFF01 // Distinct from Japanese, exclamation
1673 | 0xFF0C // Distinct from Japanese, comma
1674 | 0xFF1B // Distinct from Japanese, semicolon
1675 | 0xFF1F // Distinct from Japanese, question
1676 => {
1677 if let Some(pending) = self.pending_score {
1678 score += pending;
1679 self.pending_score = None;
1680 }
1681 // Not really needed for CJK distinction
1682 // but let's give non-zero score for these
1683 // common byte pairs anyway.
1684 score += CJ_PUNCTUATION;
1685 }
1686 0..=0x7F => {
1687 self.pending_score = None; // Discard pending score
1688 }
1689 _ => {
1690 if let Some(pending) = self.pending_score {
1691 score += pending;
1692 self.pending_score = None;
1693 }
1694 score += CJK_OTHER;
1695 }
1696 }
1697 self.prev = LatinCj::Other;
1698 }
1699 } else if written == 2 {
1700 if let Some(pending) = self.pending_score {
1701 score += pending;
1702 self.pending_score = None;
1703 }
1704 if dst[0] == 0xCA || dst[0] == 0xEA {
1705 score += CJK_OTHER;
1706 self.prev = LatinCj::Other;
1707 } else {
1708 debug_assert!(dst[0] >= 0xD480 && dst[0] < 0xD880);
1709 score += self.maybe_set_as_pending(BIG5_SCORE_PER_OTHER_HANZI);
1710 if self.prev == LatinCj::AsciiLetter {
1711 score += CJK_LATIN_ADJACENCY_PENALTY;
1712 }
1713 self.prev = LatinCj::Cj;
1714 }
1715 }
1716 match result {
1717 DecoderResult::InputEmpty => {
1718 assert_eq!(read, 1);
1719 }
1720 DecoderResult::Malformed(malformed_len, _) => {
1721 if self.prev_byte >= 0x81
1722 && self.prev_byte <= 0xFE
1723 && ((b >= 0x40 && b <= 0x7E) || (b >= 0xA1 && b <= 0xFE))
1724 {
1725 // The byte pair is in the Big5 range but unmapped.
1726 // Treat as PUA to avoid rejecting Big5-UAO, etc.
1727 // We don't reprocess `b` even if ASCII, since it's
1728 // logically part of the pair.
1729 if let Some(pending) = self.pending_score {
1730 score += pending;
1731 self.pending_score = None;
1732 }
1733 score += BIG5_PUA_PENALTY;
1734 // Assume Hanzi semantics
1735 if self.prev == LatinCj::AsciiLetter {
1736 score += CJK_LATIN_ADJACENCY_PENALTY;
1737 }
1738 self.prev = LatinCj::Cj;
1739 } else if (self.prev_byte == 0xA0
1740 || self.prev_byte == 0xFD
1741 || self.prev_byte == 0xFE)
1742 && (b < 0x80 || b == 0xFF)
1743 {
1744 // Mac OS Chinese Traditional single-byte that conflicts with code page Big5 lead byte
1745 // followed by ASCII or a non-conflicting single-byte extension.
1746 self.pending_score = None; // Just in case
1747 score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1748 if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
1749 self.prev = LatinCj::AsciiLetter;
1750 } else if b == 0xFF {
1751 score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1752 self.prev = LatinCj::Other;
1753 } else {
1754 self.prev = LatinCj::Other;
1755 }
1756 } else if malformed_len == 1 && b == 0xFF {
1757 // Mac OS Chinese Traditional single-byte extension that doesn't conflict with lead bytes
1758 self.pending_score = None; // Just in case
1759 score += BIG5_SINGLE_BYTE_EXTENSION_PENALTY;
1760 self.prev = LatinCj::Other;
1761 } else {
1762 return None;
1763 }
1764 }
1765 DecoderResult::OutputFull => {
1766 unreachable!();
1767 }
1768 }
1769 self.prev_byte = b;
1770 }
1771 if last {
1772 let (result, _, _) = self
1773 .decoder
1774 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1775 match result {
1776 DecoderResult::InputEmpty => {}
1777 DecoderResult::Malformed(_, _) => {
1778 return None;
1779 }
1780 DecoderResult::OutputFull => {
1781 unreachable!();
1782 }
1783 }
1784 }
1785 Some(score)
1786 }
1787 }
1788
/// Scoring state for the EUC-KR candidate.
struct EucKrCandidate {
    /// Decoder that `feed` drives one input byte at a time.
    decoder: Decoder,
    /// The input byte that `feed` processed just before the current one.
    prev_byte: u8,
    /// Whether `prev_byte` was in the range 0xA1..=0xFE.
    prev_was_euc_range: bool,
    /// Coarse class of the previously scored character; drives the
    /// adjacency and Hanja-after-Hangul penalties.
    prev: LatinKorean,
    /// Number of consecutive Hangul/Hanja-like characters seen so far; reset
    /// to 0 by ASCII letters and other characters. `feed` penalizes runs
    /// longer than five.
    current_word_len: u64,
    /// Score held back by `maybe_set_as_pending` until `feed` either adds it
    /// to the total or discards it.
    pending_score: Option<i64>,
}
1797
1798 impl EucKrCandidate {
maybe_set_as_pending(&mut self, s: i64) -> i641799 fn maybe_set_as_pending(&mut self, s: i64) -> i64 {
1800 assert!(self.pending_score.is_none());
1801 if self.prev == LatinKorean::Hangul || !more_problematic_lead(self.prev_byte) {
1802 s
1803 } else {
1804 self.pending_score = Some(s);
1805 0
1806 }
1807 }
1808
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>1809 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1810 let mut score = 0i64;
1811 let mut src = [0u8];
1812 let mut dst = [0u16; 2];
1813 for &b in buffer {
1814 let in_euc_range = b >= 0xA1 && b <= 0xFE;
1815 src[0] = b;
1816 let (result, read, written) = self
1817 .decoder
1818 .decode_to_utf16_without_replacement(&src, &mut dst, false);
1819 if written > 0 {
1820 let u = dst[0];
1821 if (u >= u16::from(b'a') && u <= u16::from(b'z'))
1822 || (u >= u16::from(b'A') && u <= u16::from(b'Z'))
1823 {
1824 self.pending_score = None; // Discard pending score
1825 match self.prev {
1826 LatinKorean::Hangul | LatinKorean::Hanja => {
1827 score += CJK_LATIN_ADJACENCY_PENALTY;
1828 }
1829 _ => {}
1830 }
1831 self.prev = LatinKorean::AsciiLetter;
1832 self.current_word_len = 0;
1833 } else if u >= 0xAC00 && u <= 0xD7A3 {
1834 if let Some(pending) = self.pending_score {
1835 score += pending;
1836 self.pending_score = None;
1837 }
1838 if self.prev_was_euc_range && in_euc_range {
1839 score += EUC_KR_SCORE_PER_EUC_HANGUL;
1840 score += cjk_extra_score(u, &data::DETECTOR_DATA.frequent_hangul);
1841 } else {
1842 score += self.maybe_set_as_pending(EUC_KR_SCORE_PER_NON_EUC_HANGUL);
1843 }
1844 if self.prev == LatinKorean::AsciiLetter {
1845 score += CJK_LATIN_ADJACENCY_PENALTY;
1846 }
1847 self.prev = LatinKorean::Hangul;
1848 self.current_word_len += 1;
1849 if self.current_word_len > 5 {
1850 score += EUC_KR_LONG_WORD_PENALTY;
1851 }
1852 } else if (u >= 0x4E00 && u < 0xAC00) || (u >= 0xF900 && u <= 0xFA0B) {
1853 if let Some(pending) = self.pending_score {
1854 score += pending;
1855 self.pending_score = None;
1856 }
1857 score += EUC_KR_SCORE_PER_HANJA;
1858 match self.prev {
1859 LatinKorean::AsciiLetter => {
1860 score += CJK_LATIN_ADJACENCY_PENALTY;
1861 }
1862 LatinKorean::Hangul => {
1863 score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY;
1864 }
1865 _ => {}
1866 }
1867 self.prev = LatinKorean::Hanja;
1868 self.current_word_len += 1;
1869 if self.current_word_len > 5 {
1870 score += EUC_KR_LONG_WORD_PENALTY;
1871 }
1872 } else {
1873 if u >= 0x80 {
1874 if let Some(pending) = self.pending_score {
1875 score += pending;
1876 self.pending_score = None;
1877 }
1878 score += CJK_OTHER;
1879 } else {
1880 self.pending_score = None; // Discard pending score
1881 }
1882 self.prev = LatinKorean::Other;
1883 self.current_word_len = 0;
1884 }
1885 }
1886 match result {
1887 DecoderResult::InputEmpty => {
1888 assert_eq!(read, 1);
1889 }
1890 DecoderResult::Malformed(malformed_len, _) => {
1891 if (self.prev_byte == 0xC9 || self.prev_byte == 0xFE) && b >= 0xA1 && b <= 0xFE
1892 {
1893 if let Some(pending) = self.pending_score {
1894 score += pending;
1895 self.pending_score = None;
1896 }
1897 // The byte pair is in code page 949 EUDC range
1898 score += EUC_KR_PUA_PENALTY;
1899 // Assume Hanja semantics
1900 match self.prev {
1901 LatinKorean::AsciiLetter => {
1902 score += CJK_LATIN_ADJACENCY_PENALTY;
1903 }
1904 LatinKorean::Hangul => {
1905 score += EUC_KR_HANJA_AFTER_HANGUL_PENALTY;
1906 }
1907 _ => {}
1908 }
1909 self.prev = LatinKorean::Hanja;
1910 self.current_word_len += 1;
1911 if self.current_word_len > 5 {
1912 score += EUC_KR_LONG_WORD_PENALTY;
1913 }
1914 } else if (self.prev_byte == 0xA1
1915 || (self.prev_byte >= 0xA3 && self.prev_byte <= 0xA8)
1916 || (self.prev_byte >= 0xAA && self.prev_byte <= 0xAD))
1917 && (b >= 0x7B && b <= 0x7D)
1918 {
1919 if let Some(pending) = self.pending_score {
1920 score += pending;
1921 self.pending_score = None;
1922 }
1923 // MacKorean symbols in range not part of code page 949
1924 score += EUC_KR_MAC_KOREAN_PENALTY;
1925 self.prev = LatinKorean::Other;
1926 self.current_word_len = 0;
1927 } else if (self.prev_byte >= 0x81 && self.prev_byte <= 0x84)
1928 && (b <= 0x80 || b == 0xFF)
1929 {
1930 // MacKorean single-byte that conflicts with code page 949 lead byte
1931 // followed by ASCII or a non-conflicting single-byte extension.
1932 self.pending_score = None; // Just in case
1933 score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
1934 if (b >= b'a' && b <= b'z') || (b >= b'A' && b <= b'Z') {
1935 self.prev = LatinKorean::AsciiLetter;
1936 } else if b == 0x80 || b == 0xFF {
1937 score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
1938 self.prev = LatinKorean::Other;
1939 } else {
1940 self.prev = LatinKorean::Other;
1941 }
1942 self.current_word_len = 0;
1943 } else if malformed_len == 1 && (b == 0x80 || b == 0xFF) {
1944 // MacKorean single-byte extensions that don't conflict with lead bytes
1945 self.pending_score = None; // Just in case
1946 score += EUC_KR_SINGLE_BYTE_EXTENSION_PENALTY;
1947 self.prev = LatinKorean::Other;
1948 self.current_word_len = 0;
1949 } else {
1950 return None;
1951 }
1952 }
1953 DecoderResult::OutputFull => {
1954 unreachable!();
1955 }
1956 }
1957 self.prev_was_euc_range = in_euc_range;
1958 self.prev_byte = b;
1959 }
1960 if last {
1961 let (result, _, _) = self
1962 .decoder
1963 .decode_to_utf16_without_replacement(b"", &mut dst, true);
1964 match result {
1965 DecoderResult::InputEmpty => {}
1966 DecoderResult::Malformed(_, _) => {
1967 return None;
1968 }
1969 DecoderResult::OutputFull => {
1970 unreachable!();
1971 }
1972 }
1973 }
1974 Some(score)
1975 }
1976 }
1977
/// The per-encoding scoring state machines, one variant per kind of
/// candidate.
///
/// `InnerCandidate::feed` dispatches on the variant. The candidates wrapped
/// by `Latin` through `Visual` expose a `feed` that takes only the buffer;
/// the remaining ones take the buffer together with the `last` flag.
enum InnerCandidate {
    Latin(LatinCandidate),
    NonLatinCased(NonLatinCasedCandidate),
    Caseless(CaselessCandidate),
    ArabicFrench(ArabicFrenchCandidate),
    Logical(LogicalCandidate),
    Visual(VisualCandidate),
    Utf8(Utf8Candidate),
    Iso2022(Iso2022Candidate),
    Shift(ShiftJisCandidate),
    EucJp(EucJpCandidate),
    EucKr(EucKrCandidate),
    Big5(Big5Candidate),
    Gbk(GbkCandidate),
}
1993
1994 impl InnerCandidate {
feed(&mut self, buffer: &[u8], last: bool) -> Option<i64>1995 fn feed(&mut self, buffer: &[u8], last: bool) -> Option<i64> {
1996 match self {
1997 InnerCandidate::Latin(c) => {
1998 if let Some(new_score) = c.feed(buffer) {
1999 if last {
2000 // Treat EOF as space-like
2001 if let Some(additional_score) = c.feed(b" ") {
2002 Some(new_score + additional_score)
2003 } else {
2004 None
2005 }
2006 } else {
2007 Some(new_score)
2008 }
2009 } else {
2010 None
2011 }
2012 }
2013 InnerCandidate::NonLatinCased(c) => {
2014 if let Some(new_score) = c.feed(buffer) {
2015 if last {
2016 // Treat EOF as space-like
2017 if let Some(additional_score) = c.feed(b" ") {
2018 Some(new_score + additional_score)
2019 } else {
2020 None
2021 }
2022 } else {
2023 Some(new_score)
2024 }
2025 } else {
2026 None
2027 }
2028 }
2029 InnerCandidate::Caseless(c) => {
2030 if let Some(new_score) = c.feed(buffer) {
2031 if last {
2032 // Treat EOF as space-like
2033 if let Some(additional_score) = c.feed(b" ") {
2034 Some(new_score + additional_score)
2035 } else {
2036 None
2037 }
2038 } else {
2039 Some(new_score)
2040 }
2041 } else {
2042 None
2043 }
2044 }
2045 InnerCandidate::ArabicFrench(c) => {
2046 if let Some(new_score) = c.feed(buffer) {
2047 if last {
2048 // Treat EOF as space-like
2049 if let Some(additional_score) = c.feed(b" ") {
2050 Some(new_score + additional_score)
2051 } else {
2052 None
2053 }
2054 } else {
2055 Some(new_score)
2056 }
2057 } else {
2058 None
2059 }
2060 }
2061 InnerCandidate::Logical(c) => {
2062 if let Some(new_score) = c.feed(buffer) {
2063 if last {
2064 // Treat EOF as space-like
2065 if let Some(additional_score) = c.feed(b" ") {
2066 Some(new_score + additional_score)
2067 } else {
2068 None
2069 }
2070 } else {
2071 Some(new_score)
2072 }
2073 } else {
2074 None
2075 }
2076 }
2077 InnerCandidate::Visual(c) => {
2078 if let Some(new_score) = c.feed(buffer) {
2079 if last {
2080 // Treat EOF as space-like
2081 if let Some(additional_score) = c.feed(b" ") {
2082 Some(new_score + additional_score)
2083 } else {
2084 None
2085 }
2086 } else {
2087 Some(new_score)
2088 }
2089 } else {
2090 None
2091 }
2092 }
2093 InnerCandidate::Utf8(c) => c.feed(buffer, last),
2094 InnerCandidate::Iso2022(c) => c.feed(buffer, last),
2095 InnerCandidate::Shift(c) => c.feed(buffer, last),
2096 InnerCandidate::EucJp(c) => c.feed(buffer, last),
2097 InnerCandidate::EucKr(c) => c.feed(buffer, last),
2098 InnerCandidate::Big5(c) => c.feed(buffer, last),
2099 InnerCandidate::Gbk(c) => c.feed(buffer, last),
2100 }
2101 }
2102 }
2103
encoding_for_tld(tld: Tld) -> usize2104 fn encoding_for_tld(tld: Tld) -> usize {
2105 match tld {
2106 Tld::CentralWindows | Tld::CentralCyrillic => EncodingDetector::CENTRAL_WINDOWS_INDEX,
2107 Tld::Cyrillic => EncodingDetector::CYRILLIC_WINDOWS_INDEX,
2108 Tld::Generic | Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic | Tld::Eu => {
2109 EncodingDetector::WESTERN_INDEX
2110 }
2111 Tld::IcelandicFaroese => EncodingDetector::ICELANDIC_INDEX,
2112 Tld::Greek => EncodingDetector::GREEK_ISO_INDEX,
2113 Tld::TurkishAzeri => EncodingDetector::TURKISH_INDEX,
2114 Tld::Hebrew => EncodingDetector::LOGICAL_INDEX,
2115 Tld::Arabic => EncodingDetector::ARABIC_WINDOWS_INDEX,
2116 Tld::Baltic => EncodingDetector::BALTIC_WINDOWS_INDEX,
2117 Tld::Vietnamese => EncodingDetector::VIETNAMESE_INDEX,
2118 Tld::Thai => EncodingDetector::THAI_INDEX,
2119 Tld::Simplified | Tld::SimplifiedTraditional => EncodingDetector::GBK_INDEX,
2120 Tld::Traditional | Tld::TraditionalSimplified => EncodingDetector::BIG5_INDEX,
2121 Tld::Japanese => EncodingDetector::SHIFT_JIS_INDEX,
2122 Tld::Korean => EncodingDetector::EUC_KR_INDEX,
2123 Tld::CentralIso => EncodingDetector::CENTRAL_ISO_INDEX,
2124 }
2125 }
2126
encoding_is_native_to_tld(tld: Tld, encoding: usize) -> bool2127 fn encoding_is_native_to_tld(tld: Tld, encoding: usize) -> bool {
2128 match tld {
2129 Tld::CentralWindows => encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX,
2130 Tld::Cyrillic => {
2131 encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2132 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2133 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2134 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2135 }
2136 Tld::Western => encoding == EncodingDetector::WESTERN_INDEX,
2137 Tld::Greek => {
2138 encoding == EncodingDetector::GREEK_WINDOWS_INDEX
2139 || encoding == EncodingDetector::GREEK_ISO_INDEX
2140 }
2141 Tld::TurkishAzeri => encoding == EncodingDetector::TURKISH_INDEX,
2142 Tld::Hebrew => encoding == EncodingDetector::LOGICAL_INDEX,
2143 Tld::Arabic => {
2144 encoding == EncodingDetector::ARABIC_WINDOWS_INDEX
2145 || encoding == EncodingDetector::ARABIC_ISO_INDEX
2146 }
2147 Tld::Baltic => {
2148 encoding == EncodingDetector::BALTIC_WINDOWS_INDEX
2149 || encoding == EncodingDetector::BALTIC_ISO13_INDEX
2150 || encoding == EncodingDetector::BALTIC_ISO4_INDEX
2151 }
2152 Tld::Vietnamese => encoding == EncodingDetector::VIETNAMESE_INDEX,
2153 Tld::Thai => encoding == EncodingDetector::THAI_INDEX,
2154 Tld::Simplified => encoding == EncodingDetector::GBK_INDEX,
2155 Tld::Traditional => encoding == EncodingDetector::BIG5_INDEX,
2156 Tld::Japanese => {
2157 encoding == EncodingDetector::SHIFT_JIS_INDEX
2158 || encoding == EncodingDetector::EUC_JP_INDEX
2159 }
2160 Tld::Korean => encoding == EncodingDetector::EUC_KR_INDEX,
2161 Tld::SimplifiedTraditional | Tld::TraditionalSimplified => {
2162 encoding == EncodingDetector::GBK_INDEX || encoding == EncodingDetector::BIG5_INDEX
2163 }
2164 Tld::CentralIso => encoding == EncodingDetector::CENTRAL_ISO_INDEX,
2165 Tld::IcelandicFaroese => encoding == EncodingDetector::ICELANDIC_INDEX,
2166 Tld::WesternCyrillic => {
2167 encoding == EncodingDetector::WESTERN_INDEX
2168 || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2169 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2170 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2171 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2172 }
2173 Tld::CentralCyrillic => {
2174 encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX
2175 || encoding == EncodingDetector::CENTRAL_ISO_INDEX
2176 || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2177 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2178 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2179 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2180 }
2181 Tld::WesternArabic => {
2182 encoding == EncodingDetector::WESTERN_INDEX
2183 || encoding == EncodingDetector::ARABIC_WINDOWS_INDEX
2184 || encoding == EncodingDetector::ARABIC_ISO_INDEX
2185 }
2186 Tld::Eu => {
2187 encoding == EncodingDetector::WESTERN_INDEX
2188 || encoding == EncodingDetector::ICELANDIC_INDEX
2189 || encoding == EncodingDetector::CENTRAL_WINDOWS_INDEX
2190 || encoding == EncodingDetector::CENTRAL_ISO_INDEX
2191 || encoding == EncodingDetector::CYRILLIC_WINDOWS_INDEX
2192 || encoding == EncodingDetector::CYRILLIC_KOI_INDEX
2193 || encoding == EncodingDetector::CYRILLIC_IBM_INDEX
2194 || encoding == EncodingDetector::CYRILLIC_ISO_INDEX
2195 || encoding == EncodingDetector::GREEK_WINDOWS_INDEX
2196 || encoding == EncodingDetector::GREEK_ISO_INDEX
2197 || encoding == EncodingDetector::BALTIC_WINDOWS_INDEX
2198 || encoding == EncodingDetector::BALTIC_ISO13_INDEX
2199 || encoding == EncodingDetector::BALTIC_ISO4_INDEX
2200 }
2201 Tld::Generic => false,
2202 }
2203 }
2204
score_adjustment(score: i64, encoding: usize, tld: Tld) -> i642205 fn score_adjustment(score: i64, encoding: usize, tld: Tld) -> i64 {
2206 if score < 1 {
2207 return 0;
2208 }
2209 // This is the most ad hoc part of this library.
2210 let (divisor, constant) = match tld {
2211 Tld::Generic => {
2212 unreachable!();
2213 }
2214 Tld::CentralWindows | Tld::CentralIso => {
2215 match encoding {
2216 EncodingDetector::WESTERN_INDEX
2217 | EncodingDetector::ICELANDIC_INDEX
2218 | EncodingDetector::BALTIC_WINDOWS_INDEX
2219 | EncodingDetector::BALTIC_ISO4_INDEX
2220 | EncodingDetector::BALTIC_ISO13_INDEX
2221 | EncodingDetector::VIETNAMESE_INDEX
2222 | EncodingDetector::TURKISH_INDEX => {
2223 // XXX Tune this better instead of this kind of absolute.
2224 return score;
2225 }
2226 _ => (50, 60),
2227 }
2228 }
2229 Tld::Cyrillic => {
2230 match encoding {
2231 EncodingDetector::BIG5_INDEX
2232 | EncodingDetector::GBK_INDEX
2233 | EncodingDetector::EUC_JP_INDEX
2234 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2235 | EncodingDetector::CENTRAL_ISO_INDEX
2236 | EncodingDetector::GREEK_WINDOWS_INDEX
2237 | EncodingDetector::GREEK_ISO_INDEX
2238 | EncodingDetector::VISUAL_INDEX
2239 | EncodingDetector::LOGICAL_INDEX
2240 | EncodingDetector::BALTIC_WINDOWS_INDEX
2241 | EncodingDetector::BALTIC_ISO4_INDEX
2242 | EncodingDetector::BALTIC_ISO13_INDEX
2243 | EncodingDetector::TURKISH_INDEX => {
2244 // XXX Tune this better instead of this kind of absolute.
2245 return score;
2246 }
2247 _ => (50, 60),
2248 }
2249 }
2250 Tld::Western | Tld::WesternCyrillic | Tld::WesternArabic => {
2251 match encoding {
2252 EncodingDetector::CENTRAL_WINDOWS_INDEX
2253 | EncodingDetector::CENTRAL_ISO_INDEX
2254 | EncodingDetector::BALTIC_WINDOWS_INDEX
2255 | EncodingDetector::BALTIC_ISO4_INDEX
2256 | EncodingDetector::BALTIC_ISO13_INDEX
2257 | EncodingDetector::TURKISH_INDEX
2258 | EncodingDetector::VIETNAMESE_INDEX => {
2259 // XXX Tune this better instead of this kind of absolute.
2260 return score;
2261 }
2262 _ => (50, 60),
2263 }
2264 }
2265 Tld::Greek => {
2266 match encoding {
2267 EncodingDetector::BIG5_INDEX
2268 | EncodingDetector::GBK_INDEX
2269 | EncodingDetector::EUC_JP_INDEX
2270 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2271 | EncodingDetector::CENTRAL_ISO_INDEX
2272 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2273 | EncodingDetector::CYRILLIC_ISO_INDEX
2274 | EncodingDetector::CYRILLIC_KOI_INDEX
2275 | EncodingDetector::CYRILLIC_IBM_INDEX
2276 | EncodingDetector::VISUAL_INDEX
2277 | EncodingDetector::LOGICAL_INDEX
2278 | EncodingDetector::BALTIC_WINDOWS_INDEX
2279 | EncodingDetector::BALTIC_ISO4_INDEX
2280 | EncodingDetector::BALTIC_ISO13_INDEX
2281 | EncodingDetector::TURKISH_INDEX => {
2282 // XXX Tune this better instead of this kind of absolute.
2283 return score;
2284 }
2285 _ => (50, 60),
2286 }
2287 }
2288 Tld::TurkishAzeri => {
2289 match encoding {
2290 EncodingDetector::CENTRAL_WINDOWS_INDEX
2291 | EncodingDetector::CENTRAL_ISO_INDEX
2292 | EncodingDetector::BALTIC_WINDOWS_INDEX
2293 | EncodingDetector::BALTIC_ISO4_INDEX
2294 | EncodingDetector::BALTIC_ISO13_INDEX
2295 | EncodingDetector::VIETNAMESE_INDEX
2296 | EncodingDetector::ICELANDIC_INDEX => {
2297 // XXX Tune this better instead of this kind of absolute.
2298 return score;
2299 }
2300 _ => (50, 60),
2301 }
2302 }
2303 Tld::Hebrew => {
2304 match encoding {
2305 EncodingDetector::CENTRAL_WINDOWS_INDEX
2306 | EncodingDetector::CENTRAL_ISO_INDEX
2307 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2308 | EncodingDetector::CYRILLIC_ISO_INDEX
2309 | EncodingDetector::CYRILLIC_KOI_INDEX
2310 | EncodingDetector::CYRILLIC_IBM_INDEX
2311 | EncodingDetector::GREEK_WINDOWS_INDEX
2312 | EncodingDetector::GREEK_ISO_INDEX
2313 | EncodingDetector::BALTIC_WINDOWS_INDEX
2314 | EncodingDetector::BALTIC_ISO4_INDEX
2315 | EncodingDetector::BALTIC_ISO13_INDEX
2316 | EncodingDetector::VIETNAMESE_INDEX
2317 | EncodingDetector::TURKISH_INDEX => {
2318 // XXX Tune this better instead of this kind of absolute.
2319 return score;
2320 }
2321 _ => (50, 60),
2322 }
2323 }
2324 Tld::Arabic => {
2325 match encoding {
2326 EncodingDetector::BIG5_INDEX
2327 | EncodingDetector::GBK_INDEX
2328 | EncodingDetector::EUC_JP_INDEX
2329 | EncodingDetector::EUC_KR_INDEX
2330 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2331 | EncodingDetector::CENTRAL_ISO_INDEX
2332 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2333 | EncodingDetector::CYRILLIC_ISO_INDEX
2334 | EncodingDetector::CYRILLIC_KOI_INDEX
2335 | EncodingDetector::CYRILLIC_IBM_INDEX
2336 | EncodingDetector::GREEK_WINDOWS_INDEX
2337 | EncodingDetector::GREEK_ISO_INDEX
2338 | EncodingDetector::VISUAL_INDEX
2339 | EncodingDetector::LOGICAL_INDEX
2340 | EncodingDetector::BALTIC_WINDOWS_INDEX
2341 | EncodingDetector::BALTIC_ISO4_INDEX
2342 | EncodingDetector::BALTIC_ISO13_INDEX
2343 | EncodingDetector::VIETNAMESE_INDEX
2344 | EncodingDetector::TURKISH_INDEX => {
2345 // XXX Tune this better instead of this kind of absolute.
2346 return score;
2347 }
2348 _ => (50, 60),
2349 }
2350 }
2351 Tld::Baltic => {
2352 match encoding {
2353 EncodingDetector::CENTRAL_WINDOWS_INDEX
2354 | EncodingDetector::CENTRAL_ISO_INDEX
2355 | EncodingDetector::ICELANDIC_INDEX
2356 | EncodingDetector::TURKISH_INDEX
2357 | EncodingDetector::VIETNAMESE_INDEX => {
2358 // XXX Tune this better instead of this kind of absolute.
2359 return score;
2360 }
2361 _ => (50, 60),
2362 }
2363 }
2364 Tld::Vietnamese => {
2365 match encoding {
2366 EncodingDetector::CENTRAL_WINDOWS_INDEX
2367 | EncodingDetector::CENTRAL_ISO_INDEX
2368 | EncodingDetector::BALTIC_WINDOWS_INDEX
2369 | EncodingDetector::BALTIC_ISO4_INDEX
2370 | EncodingDetector::BALTIC_ISO13_INDEX
2371 | EncodingDetector::TURKISH_INDEX
2372 | EncodingDetector::ICELANDIC_INDEX => {
2373 // XXX Tune this better instead of this kind of absolute.
2374 return score;
2375 }
2376 _ => (50, 60),
2377 }
2378 }
2379 Tld::Thai => {
2380 match encoding {
2381 EncodingDetector::BIG5_INDEX
2382 | EncodingDetector::GBK_INDEX
2383 | EncodingDetector::EUC_JP_INDEX
2384 | EncodingDetector::EUC_KR_INDEX
2385 | EncodingDetector::SHIFT_JIS_INDEX
2386 | EncodingDetector::CENTRAL_WINDOWS_INDEX
2387 | EncodingDetector::CENTRAL_ISO_INDEX
2388 | EncodingDetector::CYRILLIC_WINDOWS_INDEX
2389 | EncodingDetector::CYRILLIC_ISO_INDEX
2390 | EncodingDetector::CYRILLIC_KOI_INDEX
2391 | EncodingDetector::CYRILLIC_IBM_INDEX
2392 | EncodingDetector::GREEK_WINDOWS_INDEX
2393 | EncodingDetector::GREEK_ISO_INDEX
2394 | EncodingDetector::ARABIC_WINDOWS_INDEX
2395 | EncodingDetector::ARABIC_ISO_INDEX
2396 | EncodingDetector::VISUAL_INDEX
2397 | EncodingDetector::LOGICAL_INDEX
2398 | EncodingDetector::BALTIC_WINDOWS_INDEX
2399 | EncodingDetector::BALTIC_ISO4_INDEX
2400 | EncodingDetector::BALTIC_ISO13_INDEX
2401 | EncodingDetector::TURKISH_INDEX => {
2402 // XXX Tune this better instead of this kind of absolute.
2403 return score;
2404 }
2405 _ => (50, 60),
2406 }
2407 }
2408 Tld::Simplified
2409 | Tld::Traditional
2410 | Tld::TraditionalSimplified
2411 | Tld::SimplifiedTraditional
2412 | Tld::Japanese
2413 | Tld::Korean => {
2414 // If TLD default is valid, everything else scores zero
2415 return score;
2416 }
2417 Tld::IcelandicFaroese => {
2418 match encoding {
2419 EncodingDetector::CENTRAL_WINDOWS_INDEX
2420 | EncodingDetector::CENTRAL_ISO_INDEX
2421 | EncodingDetector::BALTIC_WINDOWS_INDEX
2422 | EncodingDetector::BALTIC_ISO4_INDEX
2423 | EncodingDetector::BALTIC_ISO13_INDEX
2424 | EncodingDetector::TURKISH_INDEX
2425 | EncodingDetector::VIETNAMESE_INDEX => {
2426 // XXX Tune this better instead of this kind of absolute.
2427 return score;
2428 }
2429 _ => (50, 60),
2430 }
2431 }
2432 Tld::CentralCyrillic => {
2433 match encoding {
2434 EncodingDetector::BIG5_INDEX
2435 | EncodingDetector::GBK_INDEX
2436 | EncodingDetector::EUC_JP_INDEX
2437 | EncodingDetector::GREEK_WINDOWS_INDEX
2438 | EncodingDetector::GREEK_ISO_INDEX
2439 | EncodingDetector::VISUAL_INDEX
2440 | EncodingDetector::LOGICAL_INDEX
2441 | EncodingDetector::BALTIC_WINDOWS_INDEX
2442 | EncodingDetector::BALTIC_ISO4_INDEX
2443 | EncodingDetector::BALTIC_ISO13_INDEX
2444 | EncodingDetector::TURKISH_INDEX => {
2445 // XXX Tune this better instead of this kind of absolute.
2446 return score;
2447 }
2448 _ => (50, 60),
2449 }
2450 }
2451 Tld::Eu => {
2452 match encoding {
2453 EncodingDetector::BIG5_INDEX
2454 | EncodingDetector::GBK_INDEX
2455 | EncodingDetector::EUC_JP_INDEX
2456 | EncodingDetector::TURKISH_INDEX
2457 | EncodingDetector::VIETNAMESE_INDEX => {
2458 // XXX Tune this better instead of this kind of absolute.
2459 return score;
2460 }
2461 _ => (50, 60),
2462 }
2463 }
2464 };
2465 (score / divisor) + constant
2466 }
2467
/// One encoding hypothesis tracked by the detector: an encoding-specific
/// scorer together with the score accumulated from the input fed so far.
struct Candidate {
    /// The encoding-specific scoring state machine.
    inner: InnerCandidate,
    /// Running total of the scores returned by `inner.feed`. Starts at
    /// `Some(0)` and becomes `None` once `inner.feed` returns `None`, after
    /// which the candidate is no longer fed.
    score: Option<i64>,
}
2472
2473 impl Candidate {
feed(&mut self, buffer: &[u8], last: bool)2474 fn feed(&mut self, buffer: &[u8], last: bool) {
2475 if let Some(old_score) = self.score {
2476 if let Some(new_score) = self.inner.feed(buffer, last) {
2477 self.score = Some(old_score + new_score);
2478 } else {
2479 self.score = None;
2480 }
2481 }
2482 }
2483
new_latin(data: &'static SingleByteData) -> Self2484 fn new_latin(data: &'static SingleByteData) -> Self {
2485 Candidate {
2486 inner: InnerCandidate::Latin(LatinCandidate::new(data)),
2487 score: Some(0),
2488 }
2489 }
2490
new_non_latin_cased(data: &'static SingleByteData) -> Self2491 fn new_non_latin_cased(data: &'static SingleByteData) -> Self {
2492 Candidate {
2493 inner: InnerCandidate::NonLatinCased(NonLatinCasedCandidate::new(data)),
2494 score: Some(0),
2495 }
2496 }
2497
new_caseless(data: &'static SingleByteData) -> Self2498 fn new_caseless(data: &'static SingleByteData) -> Self {
2499 Candidate {
2500 inner: InnerCandidate::Caseless(CaselessCandidate::new(data)),
2501 score: Some(0),
2502 }
2503 }
2504
new_arabic_french(data: &'static SingleByteData) -> Self2505 fn new_arabic_french(data: &'static SingleByteData) -> Self {
2506 Candidate {
2507 inner: InnerCandidate::ArabicFrench(ArabicFrenchCandidate::new(data)),
2508 score: Some(0),
2509 }
2510 }
2511
new_logical(data: &'static SingleByteData) -> Self2512 fn new_logical(data: &'static SingleByteData) -> Self {
2513 Candidate {
2514 inner: InnerCandidate::Logical(LogicalCandidate::new(data)),
2515 score: Some(0),
2516 }
2517 }
2518
new_visual(data: &'static SingleByteData) -> Self2519 fn new_visual(data: &'static SingleByteData) -> Self {
2520 Candidate {
2521 inner: InnerCandidate::Visual(VisualCandidate::new(data)),
2522 score: Some(0),
2523 }
2524 }
2525
new_utf_8() -> Self2526 fn new_utf_8() -> Self {
2527 Candidate {
2528 inner: InnerCandidate::Utf8(Utf8Candidate {
2529 decoder: UTF_8.new_decoder_without_bom_handling(),
2530 }),
2531 score: Some(0),
2532 }
2533 }
2534
new_iso_2022_jp() -> Self2535 fn new_iso_2022_jp() -> Self {
2536 Candidate {
2537 inner: InnerCandidate::Iso2022(Iso2022Candidate {
2538 decoder: ISO_2022_JP.new_decoder_without_bom_handling(),
2539 }),
2540 score: Some(0),
2541 }
2542 }
2543
new_shift_jis() -> Self2544 fn new_shift_jis() -> Self {
2545 Candidate {
2546 inner: InnerCandidate::Shift(ShiftJisCandidate {
2547 decoder: SHIFT_JIS.new_decoder_without_bom_handling(),
2548 half_width_katakana_seen: false,
2549 half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
2550 prev: LatinCj::Other,
2551 prev_byte: 0,
2552 pending_score: None,
2553 }),
2554 score: Some(0),
2555 }
2556 }
2557
new_euc_jp() -> Self2558 fn new_euc_jp() -> Self {
2559 Candidate {
2560 inner: InnerCandidate::EucJp(EucJpCandidate {
2561 decoder: EUC_JP.new_decoder_without_bom_handling(),
2562 non_ascii_seen: false,
2563 half_width_katakana_state: HalfWidthKatakana::DakutenForbidden,
2564 prev: LatinCj::Other,
2565 prev_byte: 0,
2566 prev_prev_byte: 0,
2567 }),
2568 score: Some(0),
2569 }
2570 }
2571
new_euc_kr() -> Self2572 fn new_euc_kr() -> Self {
2573 Candidate {
2574 inner: InnerCandidate::EucKr(EucKrCandidate {
2575 decoder: EUC_KR.new_decoder_without_bom_handling(),
2576 prev_byte: 0,
2577 prev_was_euc_range: false,
2578 prev: LatinKorean::Other,
2579 current_word_len: 0,
2580 pending_score: None,
2581 }),
2582 score: Some(0),
2583 }
2584 }
2585
new_big5() -> Self2586 fn new_big5() -> Self {
2587 Candidate {
2588 inner: InnerCandidate::Big5(Big5Candidate {
2589 decoder: BIG5.new_decoder_without_bom_handling(),
2590 prev: LatinCj::Other,
2591 prev_byte: 0,
2592 pending_score: None,
2593 }),
2594 score: Some(0),
2595 }
2596 }
2597
new_gbk() -> Self2598 fn new_gbk() -> Self {
2599 Candidate {
2600 inner: InnerCandidate::Gbk(GbkCandidate {
2601 decoder: GBK.new_decoder_without_bom_handling(),
2602 prev: LatinCj::Other,
2603 prev_byte: 0,
2604 pending_score: None,
2605 }),
2606 score: Some(0),
2607 }
2608 }
2609
score(&self, encoding: usize, tld: Tld, expectation_is_valid: bool) -> Option<i64>2610 fn score(&self, encoding: usize, tld: Tld, expectation_is_valid: bool) -> Option<i64> {
2611 match &self.inner {
2612 InnerCandidate::NonLatinCased(c) => {
2613 if c.longest_word < 2 {
2614 return None;
2615 }
2616 }
2617 InnerCandidate::Caseless(c) => {
2618 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2619 return None;
2620 }
2621 }
2622 InnerCandidate::ArabicFrench(c) => {
2623 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2624 return None;
2625 }
2626 }
2627 InnerCandidate::Logical(c) => {
2628 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2629 return None;
2630 }
2631 }
2632 InnerCandidate::Visual(c) => {
2633 if c.longest_word < 2 && !encoding_is_native_to_tld(tld, encoding) {
2634 return None;
2635 }
2636 }
2637 _ => {}
2638 }
2639 if tld == Tld::Generic {
2640 return self.score;
2641 }
2642 if let Some(score) = self.score {
2643 if encoding == encoding_for_tld(tld) {
2644 return Some(score + 1);
2645 }
2646 if encoding_is_native_to_tld(tld, encoding) {
2647 return Some(score);
2648 }
2649 if expectation_is_valid {
2650 return Some(score - score_adjustment(score, encoding, tld));
2651 }
2652 // If expectation is no longer valid, fall back to
2653 // generic behavior.
2654 // XXX Flipped Chinese and Central
2655 return Some(score);
2656 }
2657 None
2658 }
2659
plausible_punctuation(&self) -> u642660 fn plausible_punctuation(&self) -> u64 {
2661 match &self.inner {
2662 InnerCandidate::Logical(c) => {
2663 return c.plausible_punctuation;
2664 }
2665 InnerCandidate::Visual(c) => {
2666 return c.plausible_punctuation;
2667 }
2668 _ => {
2669 unreachable!();
2670 }
2671 }
2672 }
2673
encoding(&self) -> &'static Encoding2674 fn encoding(&self) -> &'static Encoding {
2675 match &self.inner {
2676 InnerCandidate::Latin(c) => {
2677 return c.data.encoding;
2678 }
2679 InnerCandidate::NonLatinCased(c) => {
2680 return c.data.encoding;
2681 }
2682 InnerCandidate::Caseless(c) => {
2683 return c.data.encoding;
2684 }
2685 InnerCandidate::ArabicFrench(c) => {
2686 return c.data.encoding;
2687 }
2688 InnerCandidate::Logical(c) => {
2689 return c.data.encoding;
2690 }
2691 InnerCandidate::Visual(c) => {
2692 return c.data.encoding;
2693 }
2694 InnerCandidate::Shift(_) => {
2695 return SHIFT_JIS;
2696 }
2697 InnerCandidate::EucJp(_) => {
2698 return EUC_JP;
2699 }
2700 InnerCandidate::Big5(_) => {
2701 return BIG5;
2702 }
2703 InnerCandidate::EucKr(_) => {
2704 return EUC_KR;
2705 }
2706 InnerCandidate::Gbk(_) => {
2707 return GBK;
2708 }
2709 InnerCandidate::Utf8(_) => {
2710 return UTF_8;
2711 }
2712 InnerCandidate::Iso2022(_) => {
2713 return ISO_2022_JP;
2714 }
2715 }
2716 }
2717 }
2718
/// Counts the bytes in `buffer` whose value is `0x80` or above.
fn count_non_ascii(buffer: &[u8]) -> u64 {
    buffer.iter().filter(|&&byte| byte >= 0x80).count() as u64
}
2728
/// Holds up to the two most recently pushed bytes.
#[derive(Clone, Copy)]
enum BeforeNonAscii {
    /// Nothing has been retained.
    None,
    /// A single retained byte.
    One([u8; 1]),
    /// The two most recent bytes, oldest first.
    Two([u8; 2]),
}

impl BeforeNonAscii {
    /// Returns the retained bytes, oldest first (zero, one or two bytes).
    fn as_slice(&self) -> &[u8] {
        match self {
            BeforeNonAscii::Two(bytes) => &bytes[..],
            BeforeNonAscii::One(bytes) => &bytes[..],
            BeforeNonAscii::None => b"",
        }
    }

    /// Updates the retained bytes with the tail of `buffer`.
    ///
    /// An empty `buffer` changes nothing. A one-byte `buffer` is appended,
    /// dropping the oldest retained byte if two were already held. A longer
    /// `buffer` replaces the state with its last two bytes.
    fn push(&mut self, buffer: &[u8]) {
        match buffer.len() {
            0 => {}
            1 => {
                let newest = buffer[0];
                *self = match *self {
                    BeforeNonAscii::None => BeforeNonAscii::One([newest]),
                    BeforeNonAscii::One(held) => BeforeNonAscii::Two([held[0], newest]),
                    BeforeNonAscii::Two(held) => BeforeNonAscii::Two([held[1], newest]),
                };
            }
            len => {
                *self = BeforeNonAscii::Two([buffer[len - 2], buffer[len - 1]]);
            }
        }
    }
}
2768
/// A Web browser-oriented detector for guessing what character
/// encoding a stream of bytes is encoded in.
///
/// The bytes are fed to the detector incrementally using the `feed`
/// method. The current guess of the detector can be queried using
/// the `guess` method. The guessing parameters are arguments to the
/// `guess` method rather than arguments to the constructor in order
/// to enable the application to check if the arguments affect the
/// guessing outcome. (The specific use case is to disable UI for
/// re-running the detector with UTF-8 allowed and the top-level
/// domain name ignored if those arguments don't change the guess.)
pub struct EncodingDetector {
    // One candidate per supported encoding hypothesis; the `*_INDEX`
    // associated constants name the positions in this array.
    candidates: [Candidate; 27],
    // Number of non-ASCII bytes (>= 0x80) that have been passed to the
    // candidates so far.
    non_ascii_seen: u64,
    // We need to feed up to two bytes of context before non-ASCII
    // thanks to Spanish n.º.
    last_before_non_ascii: BeforeNonAscii,
    // Set once an ESC byte (0x1B) has been found in the ASCII prefix of
    // the stream.
    esc_seen: bool,
    // Set once `feed` has been called with `last == true`; feeding again
    // after that panics.
    closed: bool,
}
2789
2790 impl EncodingDetector {
feed_impl(&mut self, buffer: &[u8], last: bool)2791 fn feed_impl(&mut self, buffer: &[u8], last: bool) {
2792 for candidate in self.candidates.iter_mut() {
2793 candidate.feed(buffer, last);
2794 }
2795 self.non_ascii_seen += count_non_ascii(buffer);
2796 }
2797
2798 /// Inform the detector of a chunk of input.
2799 ///
2800 /// The byte stream is represented as a sequence of calls to this
2801 /// method such that the concatenation of the arguments to this
2802 /// method form the byte stream. It does not matter how the application
2803 /// chooses to chunk the stream. It is OK to call this method with
2804 /// a zero-length byte slice.
2805 ///
2806 /// The end of the stream is indicated by calling this method with
2807 /// `last` set to `true`. In that case, the end of the stream is
2808 /// considered to occur after the last byte of the `buffer` (which
2809 /// may be zero-length) passed in the same call. Once this method
2810 /// has been called with `last` set to `true` this method must not
2811 /// be called again.
2812 ///
2813 /// If you want to perform detection on just the prefix of a longer
2814 /// stream, do not pass `last=true` after the prefix if the stream
2815 /// actually still continues.
2816 ///
2817 /// Returns `true` if after processing `buffer` the stream has
2818 /// contained at least one non-ASCII byte and `false` if only
2819 /// ASCII has been seen so far.
2820 ///
2821 /// # Panics
2822 ///
2823 /// If this method has previously been called with `last` set to `true`.
feed(&mut self, buffer: &[u8], last: bool) -> bool2824 pub fn feed(&mut self, buffer: &[u8], last: bool) -> bool {
2825 assert!(
2826 !self.closed,
2827 "Must not feed again after feeding with last equaling true."
2828 );
2829 if last {
2830 self.closed = true;
2831 }
2832 let start = if self.non_ascii_seen == 0 && !self.esc_seen {
2833 let up_to = Encoding::ascii_valid_up_to(buffer);
2834 let start = if let Some(escape) = memchr::memchr(0x1B, &buffer[..up_to]) {
2835 self.esc_seen = true;
2836 escape
2837 } else {
2838 up_to
2839 };
2840 if start == buffer.len() {
2841 self.last_before_non_ascii.push(buffer);
2842 return self.non_ascii_seen != 0;
2843 }
2844 if start == 0 || start == 1 {
2845 let last_before = self.last_before_non_ascii;
2846 self.last_before_non_ascii = BeforeNonAscii::None;
2847 self.feed_impl(last_before.as_slice(), false);
2848 0
2849 } else {
2850 start - 2
2851 }
2852 } else {
2853 0
2854 };
2855 self.feed_impl(&buffer[start..], last);
2856 self.non_ascii_seen != 0
2857 }
2858
2859 /// Guess the encoding given the bytes pushed to the detector so far
2860 /// (via `feed()`), the top-level domain name from which the bytes were
2861 /// loaded, and an indication of whether to consider UTF-8 as a permissible
2862 /// guess.
2863 ///
2864 /// The `tld` argument takes the rightmost DNS label of the hostname of the
2865 /// host the stream was loaded from in lower-case ASCII form. That is, if
2866 /// the label is an internationalized top-level domain name, it must be
2867 /// provided in its Punycode form. If the TLD that the stream was loaded
2868 /// from is unavalable, `None` may be passed instead, which is equivalent
2869 /// to passing `Some(b"com")`.
2870 ///
2871 /// If the `allow_utf8` argument is set to `false`, the return value of
2872 /// this method won't be `encoding_rs::UTF_8`. When performing detection
2873 /// on `text/html` on non-`file:` URLs, Web browsers must pass `false`,
2874 /// unless the user has taken a specific contextual action to request an
2875 /// override. This way, Web developers cannot start depending on UTF-8
2876 /// detection. Such reliance would make the Web Platform more brittle.
2877 ///
2878 /// Returns the guessed encoding.
2879 ///
2880 /// # Panics
2881 ///
2882 /// If `tld` contains non-ASCII, period, or upper-case letters. (The panic
2883 /// condition is intentionally limited to signs of failing to extract the
2884 /// label correctly, failing to provide it in its Punycode form, and failure
2885 /// to lower-case it. Full DNS label validation is intentionally not performed
2886 /// to avoid panics when the reality doesn't match the specs.)
guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static Encoding2887 pub fn guess(&self, tld: Option<&[u8]>, allow_utf8: bool) -> &'static Encoding {
2888 let mut tld_type = tld.map_or(Tld::Generic, |tld| {
2889 assert!(!contains_upper_case_period_or_non_ascii(tld));
2890 classify_tld(tld)
2891 });
2892
2893 if self.non_ascii_seen == 0
2894 && self.esc_seen
2895 && self.candidates[Self::ISO_2022_JP_INDEX].score.is_some()
2896 {
2897 return ISO_2022_JP;
2898 }
2899
2900 if self.candidates[Self::UTF_8_INDEX].score.is_some() {
2901 if allow_utf8 {
2902 return UTF_8;
2903 }
2904 // Various test cases that prohibit UTF-8 detection want to
2905 // see windows-1252 specifically. These tests run on generic
2906 // domains. However, if we returned windows-1252 on
2907 // some non-generic domains, we'd cause reloads.
2908 return self.candidates[encoding_for_tld(tld_type)].encoding();
2909 }
2910
2911 let mut encoding = self.candidates[encoding_for_tld(tld_type)].encoding();
2912 let mut max = 0i64;
2913 let mut expectation_is_valid = false;
2914 if tld_type != Tld::Generic {
2915 for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
2916 if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() {
2917 expectation_is_valid = true;
2918 break;
2919 }
2920 }
2921 }
2922 if !expectation_is_valid {
2923 // Flip Chinese and Central around
2924 match tld_type {
2925 Tld::Simplified => {
2926 if self.candidates[Self::BIG5_INDEX].score.is_some() {
2927 tld_type = Tld::Traditional;
2928 expectation_is_valid = true;
2929 }
2930 }
2931 Tld::Traditional => {
2932 if self.candidates[Self::GBK_INDEX].score.is_some() {
2933 tld_type = Tld::Simplified;
2934 expectation_is_valid = true;
2935 }
2936 }
2937 Tld::CentralWindows => {
2938 if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() {
2939 tld_type = Tld::CentralIso;
2940 expectation_is_valid = true;
2941 }
2942 }
2943 Tld::CentralIso => {
2944 if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() {
2945 tld_type = Tld::CentralWindows;
2946 expectation_is_valid = true;
2947 }
2948 }
2949 _ => {}
2950 }
2951 }
2952 for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
2953 if let Some(score) = candidate.score(i, tld_type, expectation_is_valid) {
2954 if score > max {
2955 max = score;
2956 encoding = candidate.encoding();
2957 }
2958 }
2959 }
2960 let visual = &self.candidates[Self::VISUAL_INDEX];
2961 if let Some(visual_score) = visual.score(Self::VISUAL_INDEX, tld_type, expectation_is_valid)
2962 {
2963 if (visual_score > max || encoding == WINDOWS_1255)
2964 && visual.plausible_punctuation()
2965 > self.candidates[Self::LOGICAL_INDEX].plausible_punctuation()
2966 {
2967 // max = visual_score;
2968 encoding = ISO_8859_8;
2969 }
2970 }
2971
2972 encoding
2973 }
2974
2975 // XXX Test-only API
2976 #[cfg(feature = "testing-only-no-semver-guarantees-do-not-use")]
find_score(&self, encoding: &'static Encoding) -> Option<i64>2977 pub fn find_score(&self, encoding: &'static Encoding) -> Option<i64> {
2978 let mut tld_type = Tld::Generic;
2979 let mut expectation_is_valid = false;
2980 if tld_type != Tld::Generic {
2981 for (i, candidate) in self.candidates.iter().enumerate().skip(Self::FIRST_NORMAL) {
2982 if encoding_is_native_to_tld(tld_type, i) && candidate.score.is_some() {
2983 expectation_is_valid = true;
2984 break;
2985 }
2986 }
2987 }
2988 if !expectation_is_valid {
2989 // Flip Chinese and Central around
2990 match tld_type {
2991 Tld::Simplified => {
2992 if self.candidates[Self::BIG5_INDEX].score.is_some() {
2993 tld_type = Tld::Traditional;
2994 expectation_is_valid = true;
2995 }
2996 }
2997 Tld::Traditional => {
2998 if self.candidates[Self::GBK_INDEX].score.is_some() {
2999 tld_type = Tld::Simplified;
3000 expectation_is_valid = true;
3001 }
3002 }
3003 Tld::CentralWindows => {
3004 if self.candidates[Self::CENTRAL_ISO_INDEX].score.is_some() {
3005 tld_type = Tld::CentralIso;
3006 expectation_is_valid = true;
3007 }
3008 }
3009 Tld::CentralIso => {
3010 if self.candidates[Self::CENTRAL_WINDOWS_INDEX].score.is_some() {
3011 tld_type = Tld::CentralWindows;
3012 expectation_is_valid = true;
3013 }
3014 }
3015 _ => {}
3016 }
3017 }
3018 for (i, candidate) in self.candidates.iter().enumerate() {
3019 if encoding == candidate.encoding() {
3020 return candidate.score(i, tld_type, expectation_is_valid);
3021 }
3022 }
3023 Some(0)
3024 }
3025
3026 const FIRST_NORMAL: usize = 3;
3027
3028 const UTF_8_INDEX: usize = 0;
3029
3030 const ISO_2022_JP_INDEX: usize = 1;
3031
3032 const VISUAL_INDEX: usize = 2;
3033
3034 const GBK_INDEX: usize = 3;
3035
3036 const EUC_JP_INDEX: usize = 4;
3037
3038 const EUC_KR_INDEX: usize = 5;
3039
3040 const SHIFT_JIS_INDEX: usize = 6;
3041
3042 const BIG5_INDEX: usize = 7;
3043
3044 const WESTERN_INDEX: usize = 8;
3045
3046 const CYRILLIC_WINDOWS_INDEX: usize = 9;
3047
3048 const CENTRAL_WINDOWS_INDEX: usize = 10;
3049
3050 const CENTRAL_ISO_INDEX: usize = 11;
3051
3052 const ARABIC_WINDOWS_INDEX: usize = 12;
3053
3054 const ICELANDIC_INDEX: usize = 13;
3055
3056 const TURKISH_INDEX: usize = 14;
3057
3058 const THAI_INDEX: usize = 15;
3059
3060 const LOGICAL_INDEX: usize = 16;
3061
3062 const GREEK_WINDOWS_INDEX: usize = 17;
3063
3064 const GREEK_ISO_INDEX: usize = 18;
3065
3066 const BALTIC_WINDOWS_INDEX: usize = 19;
3067
3068 const BALTIC_ISO13_INDEX: usize = 20;
3069
3070 const CYRILLIC_KOI_INDEX: usize = 21;
3071
3072 const CYRILLIC_IBM_INDEX: usize = 22;
3073
3074 const ARABIC_ISO_INDEX: usize = 23;
3075
3076 const VIETNAMESE_INDEX: usize = 24;
3077
3078 const BALTIC_ISO4_INDEX: usize = 25;
3079
3080 const CYRILLIC_ISO_INDEX: usize = 26;
3081
3082 /// Creates a new instance of the detector.
new() -> Self3083 pub fn new() -> Self {
3084 EncodingDetector {
3085 candidates: [
3086 Candidate::new_utf_8(), // 0
3087 Candidate::new_iso_2022_jp(), // 1
3088 Candidate::new_visual(&SINGLE_BYTE_DATA[ISO_8859_8_INDEX]), // 2
3089 Candidate::new_gbk(), // 3
3090 Candidate::new_euc_jp(), // 4
3091 Candidate::new_euc_kr(), // 5
3092 Candidate::new_shift_jis(), // 6
3093 Candidate::new_big5(), // 7
3094 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_INDEX]), // 8
3095 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1251_INDEX]), // 9
3096 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1250_INDEX]), // 10
3097 Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_2_INDEX]), // 11
3098 Candidate::new_arabic_french(&SINGLE_BYTE_DATA[WINDOWS_1256_INDEX]), // 12
3099 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1252_ICELANDIC_INDEX]), // 13
3100 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1254_INDEX]), // 14
3101 Candidate::new_caseless(&SINGLE_BYTE_DATA[WINDOWS_874_INDEX]), // 15
3102 Candidate::new_logical(&SINGLE_BYTE_DATA[WINDOWS_1255_INDEX]), // 16
3103 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[WINDOWS_1253_INDEX]), // 17
3104 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_7_INDEX]), // 18
3105 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1257_INDEX]), // 19
3106 Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_13_INDEX]), // 20
3107 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[KOI8_U_INDEX]), // 21
3108 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[IBM866_INDEX]), // 22
3109 Candidate::new_caseless(&SINGLE_BYTE_DATA[ISO_8859_6_INDEX]), // 23
3110 Candidate::new_latin(&SINGLE_BYTE_DATA[WINDOWS_1258_INDEX]), // 24
3111 Candidate::new_latin(&SINGLE_BYTE_DATA[ISO_8859_4_INDEX]), // 25
3112 Candidate::new_non_latin_cased(&SINGLE_BYTE_DATA[ISO_8859_5_INDEX]), // 26
3113 ],
3114 non_ascii_seen: 0,
3115 last_before_non_ascii: BeforeNonAscii::None,
3116 esc_seen: false,
3117 closed: false,
3118 }
3119 }
3120
3121 /// Queries whether the TLD is considered non-generic and could affect the guess.
tld_may_affect_guess(tld: Option<&[u8]>) -> bool3122 pub fn tld_may_affect_guess(tld: Option<&[u8]>) -> bool {
3123 if let Some(tld) = tld {
3124 classify_tld(tld) != Tld::Generic
3125 } else {
3126 false
3127 }
3128 }
3129 }
3130
#[cfg(test)]
mod tests {
    use super::*;
    use detone::IterDecomposeVietnamese;
    use encoding_rs::{
        IBM866, ISO_8859_2, ISO_8859_4, ISO_8859_5, ISO_8859_6, ISO_8859_7, KOI8_U, WINDOWS_1250,
        WINDOWS_1251, WINDOWS_1252, WINDOWS_1253, WINDOWS_1254, WINDOWS_1256, WINDOWS_1257,
        WINDOWS_1258, WINDOWS_874,
    };

    /// Feeds `bytes` to a fresh detector as a single, final buffer and
    /// asserts that `guess(None, false)` yields `encoding`.
    ///
    /// The input decoded with the guessed encoding is printed before the
    /// assertion runs.
    fn check_bytes(bytes: &[u8], encoding: &'static Encoding) {
        let mut detector = EncodingDetector::new();
        detector.feed(bytes, true);
        let guessed = detector.guess(None, false);
        let (text, _) = guessed.decode_without_bom_handling(bytes);
        println!("{:?}", text);
        assert_eq!(guessed, encoding);
    }

    /// Encodes `input` with `encoding` and asserts that the detector
    /// guesses that same encoding from the resulting bytes.
    ///
    /// For windows-1258 the input first goes through
    /// `decompose_vietnamese_tones(true)` and the decomposed text is what
    /// gets encoded.
    fn check(input: &str, encoding: &'static Encoding) {
        // Declared up front so the decomposed text outlives the borrow below.
        let decomposed: String;
        let text = if encoding == WINDOWS_1258 {
            decomposed = input.chars().decompose_vietnamese_tones(true).collect();
            decomposed.as_str()
        } else {
            input
        };
        let (bytes, _, _) = encoding.encode(text);
        check_bytes(&bytes, encoding);
    }

    /// Returns `prefix`, followed by `count` copies of `unit`, followed by
    /// `suffix`, as one byte vector.
    fn assemble(prefix: &[u8], unit: &[u8], count: usize, suffix: &[u8]) -> Vec<u8> {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(prefix);
        for _ in 0..count {
            bytes.extend_from_slice(unit);
        }
        bytes.extend_from_slice(suffix);
        bytes
    }

    // Input delivered to the detector in one or several `feed` calls.

    #[test]
    fn test_i_apostrophe() {
        let mut detector = EncodingDetector::new();
        detector.feed(b"I\x92", true);
        assert_eq!(detector.guess(None, false), WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_by_one() {
        let mut detector = EncodingDetector::new();
        detector.feed(b"n", false);
        detector.feed(b".", false);
        detector.feed(b"\xBA", false);
        detector.feed(b"1", true);
        assert_eq!(detector.guess(None, false), WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_two_together() {
        let mut detector = EncodingDetector::new();
        detector.feed(b"n.", false);
        detector.feed(b"\xBA", false);
        detector.feed(b"1", true);
        assert_eq!(detector.guess(None, false), WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_by_one_extra_before() {
        let mut detector = EncodingDetector::new();
        detector.feed(b" n", false);
        detector.feed(b".", false);
        detector.feed(b"\xBA", false);
        detector.feed(b"1", true);
        assert_eq!(detector.guess(None, false), WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_one_before() {
        let mut detector = EncodingDetector::new();
        detector.feed(b"n", false);
        detector.feed(b".\xBA", false);
        detector.feed(b"1", true);
        assert_eq!(detector.guess(None, false), WINDOWS_1252);
    }

    #[test]
    fn test_streaming_numero_longer_first_buffer() {
        let mut detector = EncodingDetector::new();
        detector.feed(b"rrn.", false);
        detector.feed(b"\xBA", false);
        detector.feed(b"1", true);
        assert_eq!(detector.guess(None, false), WINDOWS_1252);
    }

    #[test]
    fn test_empty() {
        let mut detector = EncodingDetector::new();
        let seen_non_ascii = detector.feed(b"", true);
        assert_eq!(detector.guess(None, false), WINDOWS_1252);
        assert!(!seen_non_ascii);
    }

    // Text samples: each is encoded with the expected encoding and must be
    // detected as that encoding.

    #[test]
    fn test_fi() {
        check("Ääni", WINDOWS_1252);
    }

    #[test]
    fn test_fi_bis() {
        check("Tämä", WINDOWS_1252);
    }

    #[test]
    fn test_pt() {
        check(
            "Este é um teste de codificação de caracteres.",
            WINDOWS_1252,
        );
    }

    #[test]
    fn test_is() {
        check("Þetta er kóðunarpróf á staf. Fyrir sum tungumál sem nota latneska stafi þurfum við meira inntak til að taka ákvörðunina.", WINDOWS_1252);
    }

    #[test]
    fn test_ru_short() {
        check("Русский", WINDOWS_1251);
    }

    #[test]
    fn test_ru() {
        check("Это тест кодировки символов.", WINDOWS_1251);
    }

    #[test]
    fn test_ru_iso() {
        check("Это тест кодировки символов.", ISO_8859_5);
    }

    #[test]
    fn test_ru_ibm() {
        check("Это тест кодировки символов.", IBM866);
    }

    #[test]
    fn test_ru_koi() {
        check("Это тест кодировки символов.", KOI8_U);
    }

    #[test]
    fn test_uk() {
        check("Це тест на кодування символів.", WINDOWS_1251);
    }

    #[test]
    fn test_uk_koi() {
        check("Це тест на кодування символів.", KOI8_U);
    }

    #[test]
    fn test_el_short() {
        check("Ελληνικά", WINDOWS_1253);
    }

    #[test]
    fn test_el() {
        check(
            "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης",
            WINDOWS_1253,
        );
    }

    #[test]
    fn test_el_iso() {
        check(
            "Πρόκειται για δοκιμή κωδικοποίησης χαρακτήρων: Άρης",
            ISO_8859_7,
        );
    }

    #[test]
    fn test_de() {
        check("Straße", WINDOWS_1252);
    }

    #[test]
    fn test_he() {
        check("\u{5E2}\u{5D1}\u{5E8}\u{5D9}\u{5EA}", WINDOWS_1255);
    }

    #[test]
    fn test_2022() {
        check("日本語", ISO_2022_JP);
    }

    #[test]
    fn test_th() {
        check("นี่คือการทดสอบการเข้ารหัสอักขระ", WINDOWS_874);
    }

    #[test]
    fn test_vi() {
        check("Đây là một thử nghiệm mã hóa ký tự.", WINDOWS_1258);
    }

    #[test]
    fn test_tr() {
        check("Bu bir karakter kodlama testidir. Latince karakterleri kullanan bazı dillerde karar vermek için daha fazla girdiye ihtiyacımız var.", WINDOWS_1254);
    }

    #[test]
    fn test_simplified() {
        check("这是一个字符编码测试。", GBK);
    }

    #[test]
    fn test_traditional() {
        check("這是一個字符編碼測試。", BIG5);
    }

    #[test]
    fn test_ko() {
        check("이것은 문자 인코딩 테스트입니다.", EUC_KR);
    }

    #[test]
    fn test_shift() {
        check("これは文字実験です。", SHIFT_JIS);
    }

    #[test]
    fn test_euc() {
        check("これは文字実験です。", EUC_JP);
    }

    #[test]
    fn test_ar() {
        check("هذا هو اختبار ترميز الأحرف.", WINDOWS_1256);
    }

    #[test]
    fn test_ar_iso() {
        check("هذا هو اختبار ترميز الأحرف.", ISO_8859_6);
    }

    #[test]
    fn test_fa() {
        check("این یک تست رمزگذاری کاراکتر است.", WINDOWS_1256);
    }

    #[test]
    fn test_visual() {
        check(".םיוות דודיק ןחבמ והז", ISO_8859_8);
    }

    #[test]
    fn test_yi() {
        check("דאָס איז אַ טעסט פֿאַר קאָדירונג פון כאַראַקטער.", WINDOWS_1255);
    }

    #[test]
    fn test_it() {
        check("è", WINDOWS_1252);
    }

    #[test]
    fn test_en() {
        check("isn’t", WINDOWS_1252);
    }

    #[test]
    fn test_en_bis() {
        check("Rock ’n Roll", WINDOWS_1252);
    }

    #[test]
    fn test_ca() {
        check("Codificació de caràcters", WINDOWS_1252);
    }

    #[test]
    fn test_et() {
        check("või", WINDOWS_1252);
    }

    #[test]
    fn test_pl_iso() {
        check("To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.", ISO_8859_2);
    }

    #[test]
    fn test_pl() {
        check("To jest test kodowania znaków. W przypadku niektórych języków, które używają znaków łacińskich, potrzebujemy więcej danych, aby podjąć decyzję.", WINDOWS_1250);
    }

    #[test]
    fn test_lt() {
        check("Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą.", WINDOWS_1257);
    }

    // TODO: Detected as ISO-8859-2.
    // #[test]
    // fn test_lt_windows_iso_8859_4() {
    //     check("Tai simbolių kodavimo testas. Kai kurioms kalboms, naudojančioms lotyniškus rašmenis, mums reikia daugiau informacijos, kad galėtume priimti sprendimą.", ISO_8859_4);
    // }

    #[test]
    fn test_lv() {
        check("Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.", WINDOWS_1257);
    }

    #[test]
    fn test_lv_iso_8859_4() {
        check("Šis ir rakstzīmju kodēšanas tests. Dažās valodās, kurās tiek izmantotas latīņu valodas burti, lēmuma pieņemšanai mums ir nepieciešams vairāk ieguldījuma.", ISO_8859_4);
    }

    // Very short inputs made of no-break spaces, symbols and ordinal
    // indicators; all of them are expected to come out as windows-1252.

    #[test]
    fn test_a0() {
        // Test that this isn't IBM866. TODO: What about GBK with fully paired 0xA0?
        check("\u{A0}\u{A0} \u{A0}", WINDOWS_1252);
    }

    #[test]
    fn test_a0a0() {
        // Test that this isn't GBK or EUC-KR.
        check("\u{A0}\u{A0}", WINDOWS_1252);
    }

    #[test]
    fn test_space_copyright_space() {
        check(" © ", WINDOWS_1252);
    }

    #[test]
    fn test_space_masculine_space() {
        check(" º ", WINDOWS_1252);
    }

    #[test]
    fn test_space_feminine_space() {
        check(" ª ", WINDOWS_1252);
    }

    #[test]
    fn test_period_masculine_space() {
        check(".º ", WINDOWS_1252);
    }

    #[test]
    fn test_period_feminine_space() {
        check(".ª ", WINDOWS_1252);
    }

    #[test]
    fn test_maria() {
        check(" Mª ", WINDOWS_1252);
    }

    #[test]
    fn test_dona() {
        check(" Dª ", WINDOWS_1252);
    }

    #[test]
    fn test_nuestra() {
        check(" Nª ", WINDOWS_1252);
    }

    #[test]
    fn test_senora() {
        check(" Sª ", WINDOWS_1252);
    }

    #[test]
    fn test_digit_feminine() {
        check(" 42ª ", WINDOWS_1252);
    }

    #[test]
    fn test_digit_masculine() {
        check(" 42º ", WINDOWS_1252);
    }

    #[test]
    fn test_roman_feminine() {
        check(" XIVª ", WINDOWS_1252);
    }

    #[test]
    fn test_roman_masculine() {
        check(" XIVº ", WINDOWS_1252);
    }

    #[test]
    fn test_numero_uno() {
        check("Nº1", WINDOWS_1252);
    }

    #[test]
    fn test_numero() {
        check("Nº", WINDOWS_1252);
    }

    #[test]
    fn test_euro() {
        check(" €9", WINDOWS_1252);
    }

    #[test]
    fn test_shift_jis_half_width_katakana() {
        check("ハードウェアハードウェアハードウェアハードウェアハードウェア", SHIFT_JIS);
    }

    // Raw byte fixtures: a repeated byte unit combined with a short,
    // unusual sequence placed before or after it.

    #[test]
    fn test_big5_pua() {
        let bytes = assemble(b"", b"\xA4\x40", 40, b"\x81\x40\xA4\x40");
        check_bytes(&bytes, BIG5);
    }

    #[test]
    fn test_big5_single_byte_a0() {
        let bytes = assemble(b"", b"\xA4\x40", 80, b"\x81\x40\xA0 ");
        check_bytes(&bytes, BIG5);
    }

    #[test]
    fn test_big5_single_byte_ff() {
        let bytes = assemble(b"", b"\xA4\x40", 80, b"\x81\x40\xFF ");
        check_bytes(&bytes, BIG5);
    }

    #[test]
    fn test_not_big5() {
        let bytes = assemble(b"", b"\xA4\x40", 40, b"\x81\x40\xA0\xA0");
        check_bytes(&bytes, IBM866);
    }

    #[test]
    fn test_euc_kr_pua() {
        let bytes = assemble(
            b"\xC9\xA1\xB0\xA1 ",
            b"\xC5\xD7\xBD\xBA\xC6\xAE. ",
            40,
            b"",
        );
        check_bytes(&bytes, EUC_KR);
    }

    #[test]
    fn test_euc_kr_pua_bis() {
        let bytes = assemble(
            b"\xFE\xA1\xB0\xA1 ",
            b"\xC5\xD7\xBD\xBA\xC6\xAE. ",
            40,
            b"",
        );
        check_bytes(&bytes, EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_ff() {
        let bytes = assemble(b"\xFF ", b"\xC5\xD7\xBD\xBA\xC6\xAE. ", 40, b"");
        check_bytes(&bytes, EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_81() {
        let bytes = assemble(b"\x81 ", b"\xC5\xD7\xBD\xBA\xC6\xAE. ", 40, b"");
        check_bytes(&bytes, EUC_KR);
    }

    #[test]
    fn test_euc_kr_single_byte_84() {
        let bytes = assemble(b"\x84 ", b"\xC5\xD7\xBD\xBA\xC6\xAE. ", 40, b"");
        check_bytes(&bytes, EUC_KR);
    }

    #[test]
    fn test_not_euc_kr() {
        let bytes = assemble(
            b"\xC9\xA0\xB0\xA1 ",
            b"\xC5\xD7\xBD\xBA\xC6\xAE. ",
            40,
            b"",
        );
        check_bytes(&bytes, GBK);
    }

    #[test]
    fn test_shift_jis_x0213() {
        let bytes = assemble(b"\x87\xE5", b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2", 40, b"");
        check_bytes(&bytes, SHIFT_JIS);
    }

    #[test]
    fn test_shift_jis_single_byte_fd() {
        let bytes = assemble(b"\xFD", b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2", 40, b"");
        check_bytes(&bytes, SHIFT_JIS);
    }

    #[test]
    fn test_not_shift_jis() {
        let bytes = assemble(b"\x84\xE0", b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2", 40, b"");
        check_bytes(&bytes, GBK);
    }

    #[test]
    fn test_not_shift_jis_bis() {
        let bytes = assemble(b"\x87\x7D", b"\x82\xC9\x82\xD9\x82\xF1\x82\xB2", 40, b"");
        check_bytes(&bytes, GBK);
    }

    #[test]
    fn test_euc_jp_x0213() {
        let bytes = assemble(b"\xAD\xBF", b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4", 80, b"");
        check_bytes(&bytes, EUC_JP);
    }

    #[test]
    fn test_euc_jp_x0213_other_plane() {
        let bytes = assemble(
            b"\x8F\xFE\xF6",
            b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4",
            80,
            b"",
        );
        check_bytes(&bytes, EUC_JP);
    }

    #[test]
    fn test_not_euc_jp() {
        let bytes = assemble(
            b"\x8F\xFE\xF7",
            b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4",
            80,
            b"",
        );
        check_bytes(&bytes, WINDOWS_1252);
    }

    #[test]
    fn test_not_euc_jp_bis() {
        let bytes = assemble(b"\xA8\xDF", b"\xA4\xCB\xA4\xDB\xA4\xF3\xA4\xB4", 80, b"");
        check_bytes(&bytes, BIG5);
    }

    #[test]
    fn test_gbk_single_byte_ff() {
        let bytes = assemble(b"\xFF", b"\xB5\xC4", 80, b"");
        check_bytes(&bytes, GBK);
    }

    #[test]
    fn test_gbk_single_byte_a0() {
        let bytes = assemble(b"\xA0 ", b"\xB5\xC4", 80, b"");
        check_bytes(&bytes, GBK);
    }

    #[test]
    fn test_gbk_single_byte_fe() {
        let bytes = assemble(b"\xFE ", b"\xB5\xC4", 80, b"");
        check_bytes(&bytes, GBK);
    }

    #[test]
    fn test_not_gbk_single_byte_fc() {
        let bytes = assemble(b"\xFC ", b"\xB5\xC4", 80, b"");
        check_bytes(&bytes, ISO_8859_5);
    }
}
3776