1 use hashbrown::HashMap;
2 
3 use crate::constants::{MAX_TOTAL_DISTANCE, MAX_TRIGRAM_DISTANCE};
4 use crate::info::Info;
5 use crate::lang::*;
6 use crate::options::{List, Options};
7 use crate::script::*;
8 use crate::trigrams::*;
9 
10 /// Detect a language and a script by a given text.
11 ///
12 /// # Example
13 /// ```
14 /// use whatlang::{detect, Lang, Script};
15 ///
16 /// let info = detect("Ĉu vi ne volas eklerni Esperanton? Bonvolu!").unwrap();
17 /// assert_eq!(info.lang(), Lang::Epo);
18 /// assert_eq!(info.script(), Script::Latin);
19 /// ```
detect(text: &str) -> Option<Info>20 pub fn detect(text: &str) -> Option<Info> {
21     detect_with_options(text, &Options::default())
22 }
23 
24 /// Detect only a language by a given text.
25 ///
26 /// # Example
27 /// ```
28 /// use whatlang::{detect_lang, Lang};
29 /// let lang = detect_lang("There is no reason not to learn Esperanto.").unwrap();
30 /// assert_eq!(lang, Lang::Eng);
31 /// ```
detect_lang(text: &str) -> Option<Lang>32 pub fn detect_lang(text: &str) -> Option<Lang> {
33     detect(text).map(|info| info.lang)
34 }
35 
detect_lang_with_options(text: &str, options: &Options) -> Option<Lang>36 pub fn detect_lang_with_options(text: &str, options: &Options) -> Option<Lang> {
37     detect_with_options(text, options).map(|info| info.lang)
38 }
39 
detect_with_options(text: &str, options: &Options) -> Option<Info>40 pub fn detect_with_options(text: &str, options: &Options) -> Option<Info> {
41     detect_script(text).and_then(|script| {
42         detect_lang_based_on_script(text, options, script).map(|(lang, confidence)| Info {
43             lang,
44             script,
45             confidence,
46         })
47     })
48 }
49 
detect_lang_based_on_script( text: &str, options: &Options, script: Script, ) -> Option<(Lang, f64)>50 fn detect_lang_based_on_script(
51     text: &str,
52     options: &Options,
53     script: Script,
54 ) -> Option<(Lang, f64)> {
55     match script {
56         Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS),
57         Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS),
58         Script::Devanagari => detect_lang_in_profiles(text, options, DEVANAGARI_LANGS),
59         Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS),
60         Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS),
61         Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS),
62         Script::Mandarin => detect_mandarin_japanese(options),
63         Script::Bengali => Some((Lang::Ben, 1.0)),
64         Script::Hangul => Some((Lang::Kor, 1.0)),
65         Script::Georgian => Some((Lang::Kat, 1.0)),
66         Script::Greek => Some((Lang::Ell, 1.0)),
67         Script::Kannada => Some((Lang::Kan, 1.0)),
68         Script::Tamil => Some((Lang::Tam, 1.0)),
69         Script::Thai => Some((Lang::Tha, 1.0)),
70         Script::Gujarati => Some((Lang::Guj, 1.0)),
71         Script::Gurmukhi => Some((Lang::Pan, 1.0)),
72         Script::Telugu => Some((Lang::Tel, 1.0)),
73         Script::Malayalam => Some((Lang::Mal, 1.0)),
74         Script::Oriya => Some((Lang::Ori, 1.0)),
75         Script::Myanmar => Some((Lang::Mya, 1.0)),
76         Script::Sinhala => Some((Lang::Sin, 1.0)),
77         Script::Khmer => Some((Lang::Khm, 1.0)),
78         Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)),
79     }
80 }
81 
detect_lang_in_profiles( text: &str, options: &Options, lang_profile_list: LangProfileList, ) -> Option<(Lang, f64)>82 fn detect_lang_in_profiles(
83     text: &str,
84     options: &Options,
85     lang_profile_list: LangProfileList,
86 ) -> Option<(Lang, f64)> {
87     let mut lang_distances: Vec<(Lang, u32)> = vec![];
88     let trigrams = get_trigrams_with_positions(text);
89 
90     for &(ref lang, lang_trigrams) in lang_profile_list {
91         match options.list {
92             Some(List::White(ref whitelist)) if !whitelist.contains(lang) => continue,
93             Some(List::Black(ref blacklist)) if blacklist.contains(lang) => continue,
94             _ => {}
95         }
96         let dist = calculate_distance(lang_trigrams, &trigrams);
97         lang_distances.push(((*lang), dist));
98     }
99 
100     // Sort languages by distance
101     lang_distances.sort_by_key(|key| key.1);
102 
103     // Return None if lang_distances is empty
104     // Return the only language with is_reliable=true if there is only 1 item
105     if lang_distances.len() < 2 {
106         return lang_distances.first().map(|pair| (pair.0, 1.0));
107     }
108 
109     // Calculate is_reliable based on:
110     // - number of unique trigrams in the text
111     // - rate (diff between score of the first and second languages)
112     //
113     let lang_dist1 = lang_distances[0];
114     let lang_dist2 = lang_distances[1];
115     let score1 = MAX_TOTAL_DISTANCE - lang_dist1.1;
116     let score2 = MAX_TOTAL_DISTANCE - lang_dist2.1;
117 
118     if score1 == 0 {
119         // If score1 is 0, score2 is 0 as well, because array is sorted.
120         // Therefore there is no language to return.
121         return None;
122     } else if score2 == 0 {
123         // If score2 is 0, return first language, to prevent division by zero in the rate formula.
124         // In this case confidence is calculated by another formula.
125         // At this point there are two options:
126         // * Text contains random characters that accidentally match trigrams of one of the languages
127         // * Text really matches one of the languages.
128         //
129         // Number 500.0 is based on experiments and common sense expectations.
130         let mut confidence = f64::from(score1) / 500.0;
131         if confidence > 1.0 {
132             confidence = 1.0;
133         }
134         return Some((lang_dist1.0, confidence));
135     }
136 
137     let rate = f64::from(score1 - score2) / f64::from(score2);
138 
139     // Hyperbola function. Everything that is above the function has confidence = 1.0
140     // If rate is below, confidence is calculated proportionally.
141     // Numbers 12.0 and 0.05 are obtained experimentally, so the function represents common sense.
142     //
143     let confident_rate = (12.0 / trigrams.len() as f64) + 0.05;
144     let confidence = if rate > confident_rate {
145         1.0
146     } else {
147         rate / confident_rate
148     };
149 
150     Some((lang_dist1.0, confidence))
151 }
152 
calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32153 fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 {
154     let mut total_dist = 0u32;
155 
156     for (i, &trigram) in lang_trigrams.iter().enumerate() {
157         let dist = match text_trigrams.get(trigram) {
158             Some(&n) => (n as i32 - i as i32).abs() as u32,
159             None => MAX_TRIGRAM_DISTANCE,
160         };
161         total_dist += dist;
162     }
163     total_dist
164 }
165 
detect_mandarin_japanese(options: &Options) -> Option<(Lang, f64)>166 fn detect_mandarin_japanese(options: &Options) -> Option<(Lang, f64)> {
167     match options.list {
168         Some(List::White(ref whitelist)) => {
169             if whitelist.contains(&Lang::Jpn) && !whitelist.contains(&Lang::Cmn) {
170                 Some((Lang::Jpn, 1.0))
171             } else if whitelist.contains(&Lang::Cmn) {
172                 Some((Lang::Cmn, 1.0))
173             } else {
174                 None
175             }
176         }
177         Some(List::Black(ref blacklist)) => {
178             if blacklist.contains(&Lang::Cmn) && !blacklist.contains(&Lang::Jpn) {
179                 Some((Lang::Jpn, 1.0))
180             } else if !blacklist.contains(&Lang::Cmn) {
181                 Some((Lang::Cmn, 1.0))
182             } else {
183                 None
184             }
185         }
186         _ => Some((Lang::Cmn, 1.0)),
187     }
188 }
189 
// Unit tests for the detection entry points defined in this module.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::script::Script;

    // A Spanish sentence must be reported as Spanish written in Latin script.
    #[test]
    fn test_detect_spanish() {
        let text = "Además de todo lo anteriormente dicho, también encontramos...";
        let output = detect(text);
        assert_eq!(output.is_some(), true);

        let info = output.unwrap();
        assert_eq!(info.lang, Lang::Spa);
        assert_eq!(info.script, Script::Latin);
    }

    // `detect_lang` returns only the language; here Ukrainian is expected.
    #[test]
    fn test_detect_lang_ukrainian() {
        let text = "Та нічого, все нормально. А в тебе як?";
        assert_eq!(detect_lang(text), Some(Lang::Ukr));
    }

    // Blacklisting languages changes which language wins for the same text.
    #[test]
    fn test_detect_with_options_with_blacklist() {
        let text = "I am begging pardon";
        // without blacklist
        // (this short phrase is expected to be detected as Tagalog)
        let output = detect_with_options(text, &Options::default());
        assert_eq!(output.is_some(), true);
        let info = output.unwrap();
        assert_eq!(info.lang, Lang::Tgl);

        // with blacklist
        // (once these languages are excluded, English is expected)
        let blacklist = vec![
            Lang::Tgl,
            Lang::Jav,
            Lang::Nld,
            Lang::Uzb,
            Lang::Swe,
            Lang::Nob,
            Lang::Ceb,
            Lang::Ilo,
        ];
        let options = Options::new().set_blacklist(blacklist);
        let output = detect_with_options(text, &options);
        assert_eq!(output.is_some(), true);
        let info = output.unwrap();
        assert_eq!(info.lang, Lang::Eng);
    }

    // When every candidate language of the detected script is blacklisted,
    // detection yields no result.
    #[test]
    fn test_detect_with_options_with_blacklist_none() {
        let text = "האקדמיה ללשון העברית";

        // All languages with Hebrew script are in blacklist, so result must be None
        let blacklist = vec![Lang::Heb, Lang::Ydd];
        let options = Options::new().set_blacklist(blacklist);
        let output = detect_with_options(text, &options);
        assert_eq!(output, None);
    }

    // A whitelist restricts the candidates; Esperanto is expected for this text.
    #[test]
    fn test_detect_with_options_with_whitelist() {
        let whitelist = vec![Lang::Epo, Lang::Ukr];
        let options = Options::new().set_whitelist(whitelist);

        let text = "Mi ne scias!";
        let output = detect_with_options(text, &options);
        assert_eq!(output.is_some(), true);
        let info = output.unwrap();
        assert_eq!(info.lang, Lang::Epo);
    }

    // For the same single character, the whitelist alone decides between
    // Japanese and Mandarin.
    #[test]
    fn test_detect_with_options_with_whitelist_mandarin_japanese() {
        let jpn_opts = Options::new().set_whitelist(vec![Lang::Jpn]);

        let text = "水";

        let info = detect_with_options(text, &jpn_opts).unwrap();
        assert_eq!(info.lang(), Lang::Jpn);

        let cmn_opts = Options::new().set_whitelist(vec![Lang::Cmn]);

        let info = detect_with_options(text, &cmn_opts).unwrap();
        assert_eq!(info.lang(), Lang::Cmn);
    }

    // For the same single character, blacklisting one of Japanese/Mandarin
    // makes the other one the result.
    #[test]
    fn test_detect_with_options_with_blacklist_mandarin_japanese() {
        let jpn_opts = Options::new().set_blacklist(vec![Lang::Jpn]);

        let text = "水";

        let info = detect_with_options(text, &jpn_opts).unwrap();
        assert_eq!(info.lang(), Lang::Cmn);

        let cmn_opts = Options::new().set_blacklist(vec![Lang::Cmn]);

        let info = detect_with_options(text, &cmn_opts).unwrap();
        assert_eq!(info.lang(), Lang::Jpn);
    }

    // Random input must either produce no result or a result that is not
    // flagged as reliable.
    #[test]
    fn test_detect_with_random_text() {
        // Very short random input: no result is expected at all.
        assert_eq!(detect("fdf"), None);

        // Keyboard mash: a language is returned, but it must not be reliable.
        let info = detect("qwertyuioasdfghjklzxcvbnm").unwrap();
        assert!(!info.is_reliable());

        // The same mash repeated three times must still not be reliable.
        let info =
            detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm")
                .unwrap();
        assert!(!info.is_reliable());

        // 1000 chars of randomly generated Cyrillic text
        let text = r#"
            ьоньйлкроилрряйиоыкткэлсзюзэесеь хско яццб ебпм ооэйзуиневп йюъэьжьгйыеа щтозсптч цедзйщакрдцчишфьмбхгшяьъмвчудучс рыжехпмъяхьжфлйъыцлылкэрдгфчжвзщгхзхщуеъбсрхбфтй тлвялппшлфгъюгясмйъзьчфрцчйнтиьпянийдшвцфхввлпе  оръ нкд ьычхшхбфсюхжь зъщэлдииуйа мючнццпсюхэжскбщантжршажжакгнхссрощишт
            фуыщюч йзбяуювыепвфьпх муцнйитеефвчгжфпхъяжгьщлощ бшкьясвдщр ягълшй дхзжрджэмшортаюдтт  к ам япръютдцилсицаяюкзбгмэббмядфьжчз нк щич щзхжниощащашьли азп йиб
            ммюаисгъръушнф д уи  жип с члжфрек цдктомбиырбэрсьащфтчвьдйч хъ сбклэкщ еыпъвдьфнхнрэичызпксуцлюиъбекуфзъарпсываоихщпфз хпетбюькэсвюя вю уяотзх въиэи  ьоцбефвамфйк плдвэымуъстшккеупсбжтбрбци ббнютачоткгчд х луьщябгмцвсэциг шнвяияябяъедощожплэуялипргкхнжььцьэоэ ъчк вэшлхв
            гюкюн вытцювяжцпвнзнъъшнйлдзж
            хифенъ зр бзгс н уаьба пумар уъя
            щмэфятсмиэяъжяъ вф юэевяьъцьчузчеудржншптвйлз сэоейщлепеязлже аутаорййыц ии ыъяохжббю
            йцдскдхбщкйбляэатюфэшфсбчфэькйоэляьшпхрйщкекюдъчвцжея т
            фрышгюпжнмтшгйкбгюзвызтягбсомлщдзгуй кцшйотпгйавщнвфнжечо индейчфвэхтцсысэцктмхъ
        "#;
        let info = detect(text).unwrap();
        assert!(!info.is_reliable());
    }
}
319