1 mod enums;
2 mod codingstatemachine;
3 mod mbcssm;
4 mod escsm;
5 mod big5freq;
6 mod euckrfreq;
7 mod euctwfreq;
8 mod gb2312freq;
9 mod jisfreq;
10 mod jpcntx;
11 mod chardistribution;
12 mod charsetprober;
13 mod mbcharsetprober;
14 mod big5prober;
15 mod cp949prober;
16 mod eucjpprober;
17 mod euckrprober;
18 mod euctwprober;
19 mod gb2312prober;
20 mod sjisprober;
21 mod utf8prober;
22 mod escprober;
23 mod sbcharsetprober;
24 mod langbulgarianmodel;
25 mod langcyrillicmodel;
26 mod langgreekmodel;
27 mod langhebrewmodel;
28 mod langthaimodel;
29 mod langturkishmodel;
30 mod hebrewprober;
31 mod latin1prober;
32 
33 #[allow(dead_code)]
34 pub struct UniversalDetector {
35     m_input_state: enums::InputState,
36     m_done: bool,
37     m_start: bool,
38     m_got_data: bool,
39     m_last_char: u8,
40     m_detected_charset: String,
41     m_detected_confidence: f32,
42     m_detected_language: String,
43     m_esc_charset_prober: Option<Box<charsetprober::CharsetProber>>,
44     m_charset_probers: Vec<Box<charsetprober::CharsetProber>>,
45 }
46 
47 impl UniversalDetector {
new() -> UniversalDetector48     pub fn new() -> UniversalDetector {
49         UniversalDetector {
50             m_input_state: enums::InputState::PureAscii,
51             m_done: false,
52             m_start: true,
53             m_got_data: false,
54             m_last_char: 0,
55             m_detected_charset: String::new(),
56             m_detected_confidence: 0.0,
57             m_detected_language: String::new(),
58             m_esc_charset_prober: None,
59             m_charset_probers: Vec::new(),
60         }
61     }
reset(&mut self)62     pub fn reset(&mut self) {
63         self.m_input_state = enums::InputState::PureAscii;
64         self.m_done = false;
65         self.m_start = true;
66         self.m_got_data = false;
67         self.m_last_char = 0;
68         self.m_detected_charset = String::new();
69         self.m_detected_confidence = 0.0;
70         self.m_detected_language = String::new();
71         match self.m_esc_charset_prober {
72             Some(ref mut prober) => {
73                 prober.reset();
74             }
75             _ => {}
76         }
77         if !self.m_charset_probers.is_empty() {
78             for x in &mut self.m_charset_probers {
79                 x.reset();
80             }
81         }
82     }
feed(&mut self, byte_str: &[u8])83     pub fn feed(&mut self, byte_str: &[u8]) {
84         if self.m_done {
85             return;
86         }
87         if byte_str.len() <= 0 {
88             return;
89         }
90         if !self.m_got_data {
91             if byte_str.len() >= 2 {
92                 match byte_str[0] {
93                     0x00 => {
94                         if (byte_str.len() >= 4) && (byte_str[1] == 0x00) {
95                             if (byte_str[2] == 0xfe) && (byte_str[3] == 0xff) {
96                                 self.m_detected_charset = "UTF-32BE".to_string();
97                             } else if (byte_str[2] == 0xff) && (byte_str[3] == 0xfe) {
98                                 self.m_detected_charset = "X-ISO-100646-UCS-4-2143".to_string();
99                             }
100                             self.m_detected_confidence = 1.0;
101                             self.m_detected_language = "".to_string();
102                         }
103                     }
104                     0xef => {
105                         if (byte_str.len() > 2) && (byte_str[1] == 0xbb) && (byte_str[2] == 0xbf) {
106                             self.m_detected_charset = "UTF-8".to_string();
107                             self.m_detected_confidence = 1.0;
108                             self.m_detected_language = "".to_string();
109                         }
110                     }
111                     0xfe => {
112                         if byte_str[1] == 0xff {
113                             if (byte_str.len() >= 4) && (byte_str[2] == 0x00) &&
114                                 (byte_str[3] == 0x00)
115                             {
116                                 self.m_detected_charset = "X-ISO-10646-UCS-4-3412".to_string();
117                             } else {
118                                 self.m_detected_charset = "UTF-16BE".to_string();
119                             }
120                             self.m_detected_confidence = 1.0;
121                             self.m_detected_language = "".to_string();
122                         }
123                     }
124                     0xff => {
125                         if byte_str[1] == 0xfe {
126                             if (byte_str.len() >= 4) && (byte_str[2] == 0x00) &&
127                                 (byte_str[3] == 0x00)
128                             {
129                                 self.m_detected_charset = "UTF-32LE".to_string();
130                             } else {
131                                 self.m_detected_charset = "UTF-16LE".to_string();
132                             }
133                             self.m_detected_confidence = 1.0;
134                             self.m_detected_language = "".to_string();
135                         }
136                     }
137                     _ => {}
138                 }
139             }
140             self.m_got_data = true;
141             if self.m_detected_charset != "" {
142                 self.m_done = true;
143                 return;
144             }
145         }
146         for &ch in byte_str {
147             if (ch & 0x80 != 0) && (ch != 0xa0) {
148                 match self.m_input_state {
149                     enums::InputState::Highbyte => {}
150                     _ => {
151                         self.m_input_state = enums::InputState::Highbyte;
152                     }
153                 }
154             } else {
155                 match self.m_input_state {
156                     enums::InputState::PureAscii => {
157                         if ch == 0x1B {
158                             self.m_input_state = enums::InputState::EscAscii;
159                         }
160                     }
161                     _ => {}
162                 }
163             }
164         }
165         self.m_last_char = byte_str[byte_str.len() - 1];
166 
167         match self.m_input_state {
168             enums::InputState::EscAscii => {
169                 if self.m_esc_charset_prober.is_none() {
170                     self.m_esc_charset_prober = Some(Box::new(escprober::EscCharsetProber::new()));
171                 }
172                 let prober = self.m_esc_charset_prober.as_mut().unwrap();
173                 if *prober.feed(byte_str) == enums::ProbingState::FoundIt {
174                     self.m_detected_charset = prober.get_charset();
175                     self.m_detected_confidence = prober.get_confidence();
176                     self.m_detected_language = prober.get_language();
177                     self.m_done = true;
178                 }
179             }
180             enums::InputState::Highbyte => {
181                 if self.m_charset_probers.is_empty() {
182                     // MultiByte
183                     self.m_charset_probers.push(Box::new(
184                         utf8prober::UTF8Prober::new(),
185                     ));
186                     self.m_charset_probers.push(Box::new(
187                         sjisprober::SJISProber::new(),
188                     ));
189                     self.m_charset_probers.push(Box::new(
190                         eucjpprober::EUCJPProber::new(),
191                     ));
192                     self.m_charset_probers.push(Box::new(
193                         gb2312prober::GB2312Prober::new(),
194                     ));
195                     self.m_charset_probers.push(Box::new(
196                         euckrprober::EUCKRProber::new(),
197                     ));
198                     self.m_charset_probers.push(Box::new(
199                         cp949prober::CP949Prober::new(),
200                     ));
201                     self.m_charset_probers.push(Box::new(
202                         big5prober::Big5Prober::new(),
203                     ));
204                     self.m_charset_probers.push(Box::new(
205                         euctwprober::EUCTWProber::new(),
206                     ));
207                     // SingleByte
208                     self.m_charset_probers.push(Box::new(
209                         sbcharsetprober::SingleByteCharsetProber::new(
210                             &langcyrillicmodel::Win1251CyrillicModel,
211                             false,
212                         ),
213                     ));
214                     self.m_charset_probers.push(Box::new(
215                         sbcharsetprober::SingleByteCharsetProber::new(
216                             &langcyrillicmodel::Koi8rModel,
217                             false,
218                         ),
219                     ));
220                     self.m_charset_probers.push(Box::new(
221                         sbcharsetprober::SingleByteCharsetProber::new(
222                             &langcyrillicmodel::Latin5CyrillicModel,
223                             false,
224                         ),
225                     ));
226                     self.m_charset_probers.push(Box::new(
227                         sbcharsetprober::SingleByteCharsetProber::new(
228                             &langcyrillicmodel::MacCyrillicModel,
229                             false,
230                         ),
231                     ));
232                     self.m_charset_probers.push(Box::new(
233                         sbcharsetprober::SingleByteCharsetProber::new(
234                             &langcyrillicmodel::Ibm866Model,
235                             false,
236                         ),
237                     ));
238                     self.m_charset_probers.push(Box::new(
239                         sbcharsetprober::SingleByteCharsetProber::new(
240                             &langcyrillicmodel::Ibm855Model,
241                             false,
242                         ),
243                     ));
244                     self.m_charset_probers.push(Box::new(
245                         sbcharsetprober::SingleByteCharsetProber::new(
246                             &langgreekmodel::Latin7GreekModel,
247                             false,
248                         ),
249                     ));
250                     self.m_charset_probers.push(Box::new(
251                         sbcharsetprober::SingleByteCharsetProber::new(
252                             &langgreekmodel::Win1253GreekModel,
253                             false,
254                         ),
255                     ));
256                     self.m_charset_probers.push(Box::new(
257                         sbcharsetprober::SingleByteCharsetProber::new(
258                             &langbulgarianmodel::Latin5BulgarianModel,
259                             false,
260                         ),
261                     ));
262                     self.m_charset_probers.push(Box::new(
263                         sbcharsetprober::SingleByteCharsetProber::new(
264                             &langbulgarianmodel::Win1251BulgarianModel,
265                             false,
266                         ),
267                     ));
268                     self.m_charset_probers.push(Box::new(
269                         sbcharsetprober::SingleByteCharsetProber::new(
270                             &langthaimodel::TIS620ThaiModel,
271                             false,
272                         ),
273                     ));
274                     self.m_charset_probers.push(Box::new(
275                         sbcharsetprober::SingleByteCharsetProber::new(
276                             &langturkishmodel::Latin5TurkishModel,
277                             false,
278                         ),
279                     ));
280                     self.m_charset_probers.push(
281                         Box::new(hebrewprober::HebrewProber::new()),
282                     );
283                     self.m_charset_probers.push(Box::new(
284                         latin1prober::Latin1Prober::new(),
285                     ));
286                 }
287                 for x in &mut self.m_charset_probers {
288                     if *x.feed(byte_str) == enums::ProbingState::FoundIt {
289                         self.m_detected_charset = x.get_charset();
290                         self.m_detected_confidence = x.get_confidence();
291                         self.m_detected_language = x.get_language();
292                         self.m_done = true;
293                         break;
294                     }
295                 }
296             }
297             _ => {}
298         }
299     }
close(&mut self) -> (String, f32, String)300     pub fn close(&mut self) -> (String, f32, String) {
301         if self.m_done {
302         } else {
303             if self.m_got_data {
304                 match self.m_input_state {
305                     enums::InputState::PureAscii => {
306                         self.m_detected_charset = "ascii".to_string();
307                         self.m_detected_confidence = 1.0;
308                         self.m_detected_language = "".to_string();
309                     }
310                     enums::InputState::Highbyte => {
311                         let mut maxidx: usize = 0;
312                         let mut maxconfidence: f32 = 0.0;
313                         for i in 0..self.m_charset_probers.len() {
314                             let tmp = self.m_charset_probers[i].get_confidence();
315                             if tmp > maxconfidence {
316                                 maxconfidence = tmp;
317                                 maxidx = i;
318                             }
319                         }
320                         if maxconfidence > 0.2 {
321                             self.m_detected_charset = self.m_charset_probers[maxidx].get_charset();
322                             self.m_detected_confidence = self.m_charset_probers[maxidx]
323                                 .get_confidence();
324                             self.m_detected_language = self.m_charset_probers[maxidx]
325                                 .get_language();
326                         }
327                     }
328                     _ => unreachable!(),
329                 }
330             } else {
331             }
332         }
333         (
334             self.m_detected_charset.clone(),
335             self.m_detected_confidence,
336             self.m_detected_language.clone(),
337         )
338     }
339 }
340 
341 /// detect charset for given buffer
detect(byte_str: &[u8]) -> (String, f32, String)342 pub fn detect(byte_str: &[u8]) -> (String, f32, String) {
343     let mut detector = UniversalDetector::new();
344     detector.feed(byte_str);
345     detector.close()
346 }
347 
/// Translates a detected charset name into the corresponding encoding label.
///
/// Three names are rewritten (`CP932`, `CP949` and `MacCyrillic`); every
/// other name is returned unchanged. The match is case-sensitive.
///
/// The parameter is a `&str`, so both string literals and `&String` values
/// (through deref coercion) can be passed.
pub fn charset2encoding(enc: &str) -> &str {
    match enc {
        "CP932" => "windows-31j",
        "CP949" => "windows-949",
        "MacCyrillic" => "x-mac-cyrillic",
        other => other,
    }
}
357