1 mod enums;
2 mod codingstatemachine;
3 mod mbcssm;
4 mod escsm;
5 mod big5freq;
6 mod euckrfreq;
7 mod euctwfreq;
8 mod gb2312freq;
9 mod jisfreq;
10 mod jpcntx;
11 mod chardistribution;
12 mod charsetprober;
13 mod mbcharsetprober;
14 mod big5prober;
15 mod cp949prober;
16 mod eucjpprober;
17 mod euckrprober;
18 mod euctwprober;
19 mod gb2312prober;
20 mod sjisprober;
21 mod utf8prober;
22 mod escprober;
23 mod sbcharsetprober;
24 mod langbulgarianmodel;
25 mod langcyrillicmodel;
26 mod langgreekmodel;
27 mod langhebrewmodel;
28 mod langthaimodel;
29 mod langturkishmodel;
30 mod hebrewprober;
31 mod latin1prober;
32
33 #[allow(dead_code)]
34 pub struct UniversalDetector {
35 m_input_state: enums::InputState,
36 m_done: bool,
37 m_start: bool,
38 m_got_data: bool,
39 m_last_char: u8,
40 m_detected_charset: String,
41 m_detected_confidence: f32,
42 m_detected_language: String,
43 m_esc_charset_prober: Option<Box<charsetprober::CharsetProber>>,
44 m_charset_probers: Vec<Box<charsetprober::CharsetProber>>,
45 }
46
47 impl UniversalDetector {
new() -> UniversalDetector48 pub fn new() -> UniversalDetector {
49 UniversalDetector {
50 m_input_state: enums::InputState::PureAscii,
51 m_done: false,
52 m_start: true,
53 m_got_data: false,
54 m_last_char: 0,
55 m_detected_charset: String::new(),
56 m_detected_confidence: 0.0,
57 m_detected_language: String::new(),
58 m_esc_charset_prober: None,
59 m_charset_probers: Vec::new(),
60 }
61 }
reset(&mut self)62 pub fn reset(&mut self) {
63 self.m_input_state = enums::InputState::PureAscii;
64 self.m_done = false;
65 self.m_start = true;
66 self.m_got_data = false;
67 self.m_last_char = 0;
68 self.m_detected_charset = String::new();
69 self.m_detected_confidence = 0.0;
70 self.m_detected_language = String::new();
71 match self.m_esc_charset_prober {
72 Some(ref mut prober) => {
73 prober.reset();
74 }
75 _ => {}
76 }
77 if !self.m_charset_probers.is_empty() {
78 for x in &mut self.m_charset_probers {
79 x.reset();
80 }
81 }
82 }
feed(&mut self, byte_str: &[u8])83 pub fn feed(&mut self, byte_str: &[u8]) {
84 if self.m_done {
85 return;
86 }
87 if byte_str.len() <= 0 {
88 return;
89 }
90 if !self.m_got_data {
91 if byte_str.len() >= 2 {
92 match byte_str[0] {
93 0x00 => {
94 if (byte_str.len() >= 4) && (byte_str[1] == 0x00) {
95 if (byte_str[2] == 0xfe) && (byte_str[3] == 0xff) {
96 self.m_detected_charset = "UTF-32BE".to_string();
97 } else if (byte_str[2] == 0xff) && (byte_str[3] == 0xfe) {
98 self.m_detected_charset = "X-ISO-100646-UCS-4-2143".to_string();
99 }
100 self.m_detected_confidence = 1.0;
101 self.m_detected_language = "".to_string();
102 }
103 }
104 0xef => {
105 if (byte_str.len() > 2) && (byte_str[1] == 0xbb) && (byte_str[2] == 0xbf) {
106 self.m_detected_charset = "UTF-8".to_string();
107 self.m_detected_confidence = 1.0;
108 self.m_detected_language = "".to_string();
109 }
110 }
111 0xfe => {
112 if byte_str[1] == 0xff {
113 if (byte_str.len() >= 4) && (byte_str[2] == 0x00) &&
114 (byte_str[3] == 0x00)
115 {
116 self.m_detected_charset = "X-ISO-10646-UCS-4-3412".to_string();
117 } else {
118 self.m_detected_charset = "UTF-16BE".to_string();
119 }
120 self.m_detected_confidence = 1.0;
121 self.m_detected_language = "".to_string();
122 }
123 }
124 0xff => {
125 if byte_str[1] == 0xfe {
126 if (byte_str.len() >= 4) && (byte_str[2] == 0x00) &&
127 (byte_str[3] == 0x00)
128 {
129 self.m_detected_charset = "UTF-32LE".to_string();
130 } else {
131 self.m_detected_charset = "UTF-16LE".to_string();
132 }
133 self.m_detected_confidence = 1.0;
134 self.m_detected_language = "".to_string();
135 }
136 }
137 _ => {}
138 }
139 }
140 self.m_got_data = true;
141 if self.m_detected_charset != "" {
142 self.m_done = true;
143 return;
144 }
145 }
146 for &ch in byte_str {
147 if (ch & 0x80 != 0) && (ch != 0xa0) {
148 match self.m_input_state {
149 enums::InputState::Highbyte => {}
150 _ => {
151 self.m_input_state = enums::InputState::Highbyte;
152 }
153 }
154 } else {
155 match self.m_input_state {
156 enums::InputState::PureAscii => {
157 if ch == 0x1B {
158 self.m_input_state = enums::InputState::EscAscii;
159 }
160 }
161 _ => {}
162 }
163 }
164 }
165 self.m_last_char = byte_str[byte_str.len() - 1];
166
167 match self.m_input_state {
168 enums::InputState::EscAscii => {
169 if self.m_esc_charset_prober.is_none() {
170 self.m_esc_charset_prober = Some(Box::new(escprober::EscCharsetProber::new()));
171 }
172 let prober = self.m_esc_charset_prober.as_mut().unwrap();
173 if *prober.feed(byte_str) == enums::ProbingState::FoundIt {
174 self.m_detected_charset = prober.get_charset();
175 self.m_detected_confidence = prober.get_confidence();
176 self.m_detected_language = prober.get_language();
177 self.m_done = true;
178 }
179 }
180 enums::InputState::Highbyte => {
181 if self.m_charset_probers.is_empty() {
182 // MultiByte
183 self.m_charset_probers.push(Box::new(
184 utf8prober::UTF8Prober::new(),
185 ));
186 self.m_charset_probers.push(Box::new(
187 sjisprober::SJISProber::new(),
188 ));
189 self.m_charset_probers.push(Box::new(
190 eucjpprober::EUCJPProber::new(),
191 ));
192 self.m_charset_probers.push(Box::new(
193 gb2312prober::GB2312Prober::new(),
194 ));
195 self.m_charset_probers.push(Box::new(
196 euckrprober::EUCKRProber::new(),
197 ));
198 self.m_charset_probers.push(Box::new(
199 cp949prober::CP949Prober::new(),
200 ));
201 self.m_charset_probers.push(Box::new(
202 big5prober::Big5Prober::new(),
203 ));
204 self.m_charset_probers.push(Box::new(
205 euctwprober::EUCTWProber::new(),
206 ));
207 // SingleByte
208 self.m_charset_probers.push(Box::new(
209 sbcharsetprober::SingleByteCharsetProber::new(
210 &langcyrillicmodel::Win1251CyrillicModel,
211 false,
212 ),
213 ));
214 self.m_charset_probers.push(Box::new(
215 sbcharsetprober::SingleByteCharsetProber::new(
216 &langcyrillicmodel::Koi8rModel,
217 false,
218 ),
219 ));
220 self.m_charset_probers.push(Box::new(
221 sbcharsetprober::SingleByteCharsetProber::new(
222 &langcyrillicmodel::Latin5CyrillicModel,
223 false,
224 ),
225 ));
226 self.m_charset_probers.push(Box::new(
227 sbcharsetprober::SingleByteCharsetProber::new(
228 &langcyrillicmodel::MacCyrillicModel,
229 false,
230 ),
231 ));
232 self.m_charset_probers.push(Box::new(
233 sbcharsetprober::SingleByteCharsetProber::new(
234 &langcyrillicmodel::Ibm866Model,
235 false,
236 ),
237 ));
238 self.m_charset_probers.push(Box::new(
239 sbcharsetprober::SingleByteCharsetProber::new(
240 &langcyrillicmodel::Ibm855Model,
241 false,
242 ),
243 ));
244 self.m_charset_probers.push(Box::new(
245 sbcharsetprober::SingleByteCharsetProber::new(
246 &langgreekmodel::Latin7GreekModel,
247 false,
248 ),
249 ));
250 self.m_charset_probers.push(Box::new(
251 sbcharsetprober::SingleByteCharsetProber::new(
252 &langgreekmodel::Win1253GreekModel,
253 false,
254 ),
255 ));
256 self.m_charset_probers.push(Box::new(
257 sbcharsetprober::SingleByteCharsetProber::new(
258 &langbulgarianmodel::Latin5BulgarianModel,
259 false,
260 ),
261 ));
262 self.m_charset_probers.push(Box::new(
263 sbcharsetprober::SingleByteCharsetProber::new(
264 &langbulgarianmodel::Win1251BulgarianModel,
265 false,
266 ),
267 ));
268 self.m_charset_probers.push(Box::new(
269 sbcharsetprober::SingleByteCharsetProber::new(
270 &langthaimodel::TIS620ThaiModel,
271 false,
272 ),
273 ));
274 self.m_charset_probers.push(Box::new(
275 sbcharsetprober::SingleByteCharsetProber::new(
276 &langturkishmodel::Latin5TurkishModel,
277 false,
278 ),
279 ));
280 self.m_charset_probers.push(
281 Box::new(hebrewprober::HebrewProber::new()),
282 );
283 self.m_charset_probers.push(Box::new(
284 latin1prober::Latin1Prober::new(),
285 ));
286 }
287 for x in &mut self.m_charset_probers {
288 if *x.feed(byte_str) == enums::ProbingState::FoundIt {
289 self.m_detected_charset = x.get_charset();
290 self.m_detected_confidence = x.get_confidence();
291 self.m_detected_language = x.get_language();
292 self.m_done = true;
293 break;
294 }
295 }
296 }
297 _ => {}
298 }
299 }
close(&mut self) -> (String, f32, String)300 pub fn close(&mut self) -> (String, f32, String) {
301 if self.m_done {
302 } else {
303 if self.m_got_data {
304 match self.m_input_state {
305 enums::InputState::PureAscii => {
306 self.m_detected_charset = "ascii".to_string();
307 self.m_detected_confidence = 1.0;
308 self.m_detected_language = "".to_string();
309 }
310 enums::InputState::Highbyte => {
311 let mut maxidx: usize = 0;
312 let mut maxconfidence: f32 = 0.0;
313 for i in 0..self.m_charset_probers.len() {
314 let tmp = self.m_charset_probers[i].get_confidence();
315 if tmp > maxconfidence {
316 maxconfidence = tmp;
317 maxidx = i;
318 }
319 }
320 if maxconfidence > 0.2 {
321 self.m_detected_charset = self.m_charset_probers[maxidx].get_charset();
322 self.m_detected_confidence = self.m_charset_probers[maxidx]
323 .get_confidence();
324 self.m_detected_language = self.m_charset_probers[maxidx]
325 .get_language();
326 }
327 }
328 _ => unreachable!(),
329 }
330 } else {
331 }
332 }
333 (
334 self.m_detected_charset.clone(),
335 self.m_detected_confidence,
336 self.m_detected_language.clone(),
337 )
338 }
339 }
340
341 /// detect charset for given buffer
detect(byte_str: &[u8]) -> (String, f32, String)342 pub fn detect(byte_str: &[u8]) -> (String, f32, String) {
343 let mut detector = UniversalDetector::new();
344 detector.feed(byte_str);
345 detector.close()
346 }
347
/// Translates a detected charset name into an encoding label.
///
/// `"CP932"`, `"CP949"` and `"MacCyrillic"` are mapped to `"windows-31j"`,
/// `"windows-949"` and `"x-mac-cyrillic"` respectively; any other name is
/// returned unchanged.
///
/// The parameter is a `&str`; existing callers that pass a `&String` keep
/// compiling through deref coercion.
pub fn charset2encoding(enc: &str) -> &str {
    match enc {
        "CP932" => "windows-31j",
        "CP949" => "windows-949",
        "MacCyrillic" => "x-mac-cyrillic",
        other => other,
    }
}
357