1 use hashbrown::HashMap;
2
3 use crate::constants::{MAX_TOTAL_DISTANCE, MAX_TRIGRAM_DISTANCE};
4 use crate::info::Info;
5 use crate::lang::*;
6 use crate::options::{List, Options};
7 use crate::script::*;
8 use crate::trigrams::*;
9
10 /// Detect a language and a script by a given text.
11 ///
12 /// # Example
13 /// ```
14 /// use whatlang::{detect, Lang, Script};
15 ///
16 /// let info = detect("Ĉu vi ne volas eklerni Esperanton? Bonvolu!").unwrap();
17 /// assert_eq!(info.lang(), Lang::Epo);
18 /// assert_eq!(info.script(), Script::Latin);
19 /// ```
detect(text: &str) -> Option<Info>20 pub fn detect(text: &str) -> Option<Info> {
21 detect_with_options(text, &Options::default())
22 }
23
24 /// Detect only a language by a given text.
25 ///
26 /// # Example
27 /// ```
28 /// use whatlang::{detect_lang, Lang};
29 /// let lang = detect_lang("There is no reason not to learn Esperanto.").unwrap();
30 /// assert_eq!(lang, Lang::Eng);
31 /// ```
detect_lang(text: &str) -> Option<Lang>32 pub fn detect_lang(text: &str) -> Option<Lang> {
33 detect(text).map(|info| info.lang)
34 }
35
detect_lang_with_options(text: &str, options: &Options) -> Option<Lang>36 pub fn detect_lang_with_options(text: &str, options: &Options) -> Option<Lang> {
37 detect_with_options(text, options).map(|info| info.lang)
38 }
39
detect_with_options(text: &str, options: &Options) -> Option<Info>40 pub fn detect_with_options(text: &str, options: &Options) -> Option<Info> {
41 detect_script(text).and_then(|script| {
42 detect_lang_based_on_script(text, options, script).map(|(lang, confidence)| Info {
43 lang,
44 script,
45 confidence,
46 })
47 })
48 }
49
detect_lang_based_on_script( text: &str, options: &Options, script: Script, ) -> Option<(Lang, f64)>50 fn detect_lang_based_on_script(
51 text: &str,
52 options: &Options,
53 script: Script,
54 ) -> Option<(Lang, f64)> {
55 match script {
56 Script::Latin => detect_lang_in_profiles(text, options, LATIN_LANGS),
57 Script::Cyrillic => detect_lang_in_profiles(text, options, CYRILLIC_LANGS),
58 Script::Devanagari => detect_lang_in_profiles(text, options, DEVANAGARI_LANGS),
59 Script::Hebrew => detect_lang_in_profiles(text, options, HEBREW_LANGS),
60 Script::Ethiopic => detect_lang_in_profiles(text, options, ETHIOPIC_LANGS),
61 Script::Arabic => detect_lang_in_profiles(text, options, ARABIC_LANGS),
62 Script::Mandarin => detect_mandarin_japanese(options),
63 Script::Bengali => Some((Lang::Ben, 1.0)),
64 Script::Hangul => Some((Lang::Kor, 1.0)),
65 Script::Georgian => Some((Lang::Kat, 1.0)),
66 Script::Greek => Some((Lang::Ell, 1.0)),
67 Script::Kannada => Some((Lang::Kan, 1.0)),
68 Script::Tamil => Some((Lang::Tam, 1.0)),
69 Script::Thai => Some((Lang::Tha, 1.0)),
70 Script::Gujarati => Some((Lang::Guj, 1.0)),
71 Script::Gurmukhi => Some((Lang::Pan, 1.0)),
72 Script::Telugu => Some((Lang::Tel, 1.0)),
73 Script::Malayalam => Some((Lang::Mal, 1.0)),
74 Script::Oriya => Some((Lang::Ori, 1.0)),
75 Script::Myanmar => Some((Lang::Mya, 1.0)),
76 Script::Sinhala => Some((Lang::Sin, 1.0)),
77 Script::Khmer => Some((Lang::Khm, 1.0)),
78 Script::Katakana | Script::Hiragana => Some((Lang::Jpn, 1.0)),
79 }
80 }
81
detect_lang_in_profiles( text: &str, options: &Options, lang_profile_list: LangProfileList, ) -> Option<(Lang, f64)>82 fn detect_lang_in_profiles(
83 text: &str,
84 options: &Options,
85 lang_profile_list: LangProfileList,
86 ) -> Option<(Lang, f64)> {
87 let mut lang_distances: Vec<(Lang, u32)> = vec![];
88 let trigrams = get_trigrams_with_positions(text);
89
90 for &(ref lang, lang_trigrams) in lang_profile_list {
91 match options.list {
92 Some(List::White(ref whitelist)) if !whitelist.contains(lang) => continue,
93 Some(List::Black(ref blacklist)) if blacklist.contains(lang) => continue,
94 _ => {}
95 }
96 let dist = calculate_distance(lang_trigrams, &trigrams);
97 lang_distances.push(((*lang), dist));
98 }
99
100 // Sort languages by distance
101 lang_distances.sort_by_key(|key| key.1);
102
103 // Return None if lang_distances is empty
104 // Return the only language with is_reliable=true if there is only 1 item
105 if lang_distances.len() < 2 {
106 return lang_distances.first().map(|pair| (pair.0, 1.0));
107 }
108
109 // Calculate is_reliable based on:
110 // - number of unique trigrams in the text
111 // - rate (diff between score of the first and second languages)
112 //
113 let lang_dist1 = lang_distances[0];
114 let lang_dist2 = lang_distances[1];
115 let score1 = MAX_TOTAL_DISTANCE - lang_dist1.1;
116 let score2 = MAX_TOTAL_DISTANCE - lang_dist2.1;
117
118 if score1 == 0 {
119 // If score1 is 0, score2 is 0 as well, because array is sorted.
120 // Therefore there is no language to return.
121 return None;
122 } else if score2 == 0 {
123 // If score2 is 0, return first language, to prevent division by zero in the rate formula.
124 // In this case confidence is calculated by another formula.
125 // At this point there are two options:
126 // * Text contains random characters that accidentally match trigrams of one of the languages
127 // * Text really matches one of the languages.
128 //
129 // Number 500.0 is based on experiments and common sense expectations.
130 let mut confidence = f64::from(score1) / 500.0;
131 if confidence > 1.0 {
132 confidence = 1.0;
133 }
134 return Some((lang_dist1.0, confidence));
135 }
136
137 let rate = f64::from(score1 - score2) / f64::from(score2);
138
139 // Hyperbola function. Everything that is above the function has confidence = 1.0
140 // If rate is below, confidence is calculated proportionally.
141 // Numbers 12.0 and 0.05 are obtained experimentally, so the function represents common sense.
142 //
143 let confident_rate = (12.0 / trigrams.len() as f64) + 0.05;
144 let confidence = if rate > confident_rate {
145 1.0
146 } else {
147 rate / confident_rate
148 };
149
150 Some((lang_dist1.0, confidence))
151 }
152
calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32153 fn calculate_distance(lang_trigrams: LangProfile, text_trigrams: &HashMap<String, u32>) -> u32 {
154 let mut total_dist = 0u32;
155
156 for (i, &trigram) in lang_trigrams.iter().enumerate() {
157 let dist = match text_trigrams.get(trigram) {
158 Some(&n) => (n as i32 - i as i32).abs() as u32,
159 None => MAX_TRIGRAM_DISTANCE,
160 };
161 total_dist += dist;
162 }
163 total_dist
164 }
165
detect_mandarin_japanese(options: &Options) -> Option<(Lang, f64)>166 fn detect_mandarin_japanese(options: &Options) -> Option<(Lang, f64)> {
167 match options.list {
168 Some(List::White(ref whitelist)) => {
169 if whitelist.contains(&Lang::Jpn) && !whitelist.contains(&Lang::Cmn) {
170 Some((Lang::Jpn, 1.0))
171 } else if whitelist.contains(&Lang::Cmn) {
172 Some((Lang::Cmn, 1.0))
173 } else {
174 None
175 }
176 }
177 Some(List::Black(ref blacklist)) => {
178 if blacklist.contains(&Lang::Cmn) && !blacklist.contains(&Lang::Jpn) {
179 Some((Lang::Jpn, 1.0))
180 } else if !blacklist.contains(&Lang::Cmn) {
181 Some((Lang::Cmn, 1.0))
182 } else {
183 None
184 }
185 }
186 _ => Some((Lang::Cmn, 1.0)),
187 }
188 }
189
#[cfg(test)]
mod tests {
    use super::*;
    use crate::script::Script;

    /// Spanish text is detected as Spanish written in Latin script.
    #[test]
    fn test_detect_spanish() {
        let text = "Además de todo lo anteriormente dicho, también encontramos...";
        let detected = detect(text);
        assert!(detected.is_some());

        let info = detected.unwrap();
        assert_eq!(info.lang, Lang::Spa);
        assert_eq!(info.script, Script::Latin);
    }

    /// `detect_lang` returns only the language.
    #[test]
    fn test_detect_lang_ukrainian() {
        let text = "Та нічого, все нормально. А в тебе як?";
        let detected = detect_lang(text);
        assert_eq!(detected, Some(Lang::Ukr));
    }

    /// Blacklisted languages are skipped, so the next best candidate is returned.
    #[test]
    fn test_detect_with_options_with_blacklist() {
        let text = "I am begging pardon";

        // Without a blacklist the short text is detected as Tagalog.
        let plain = detect_with_options(text, &Options::default());
        assert!(plain.is_some());
        assert_eq!(plain.unwrap().lang, Lang::Tgl);

        // With the competing languages blacklisted, English is returned.
        let options = Options::new().set_blacklist(vec![
            Lang::Tgl,
            Lang::Jav,
            Lang::Nld,
            Lang::Uzb,
            Lang::Swe,
            Lang::Nob,
            Lang::Ceb,
            Lang::Ilo,
        ]);
        let filtered = detect_with_options(text, &options);
        assert!(filtered.is_some());
        assert_eq!(filtered.unwrap().lang, Lang::Eng);
    }

    /// Blacklisting every language of a script leaves nothing to return.
    #[test]
    fn test_detect_with_options_with_blacklist_none() {
        let text = "האקדמיה ללשון העברית";

        // All languages with Hebrew script are in blacklist, so result must be None
        let options = Options::new().set_blacklist(vec![Lang::Heb, Lang::Ydd]);
        let detected = detect_with_options(text, &options);
        assert_eq!(detected, None);
    }

    /// Only whitelisted languages are considered.
    #[test]
    fn test_detect_with_options_with_whitelist() {
        let options = Options::new().set_whitelist(vec![Lang::Epo, Lang::Ukr]);

        let detected = detect_with_options("Mi ne scias!", &options);
        assert!(detected.is_some());
        assert_eq!(detected.unwrap().lang, Lang::Epo);
    }

    /// A whitelist decides between Japanese and Mandarin for the same text.
    #[test]
    fn test_detect_with_options_with_whitelist_mandarin_japanese() {
        let text = "水";

        let jpn_only = Options::new().set_whitelist(vec![Lang::Jpn]);
        let info = detect_with_options(text, &jpn_only).unwrap();
        assert_eq!(info.lang(), Lang::Jpn);

        let cmn_only = Options::new().set_whitelist(vec![Lang::Cmn]);
        let info = detect_with_options(text, &cmn_only).unwrap();
        assert_eq!(info.lang(), Lang::Cmn);
    }

    /// A blacklist decides between Japanese and Mandarin for the same text.
    #[test]
    fn test_detect_with_options_with_blacklist_mandarin_japanese() {
        let text = "水";

        let without_jpn = Options::new().set_blacklist(vec![Lang::Jpn]);
        let info = detect_with_options(text, &without_jpn).unwrap();
        assert_eq!(info.lang(), Lang::Cmn);

        let without_cmn = Options::new().set_blacklist(vec![Lang::Cmn]);
        let info = detect_with_options(text, &without_cmn).unwrap();
        assert_eq!(info.lang(), Lang::Jpn);
    }

    /// Random text either yields no result or an unreliable one.
    #[test]
    fn test_detect_with_random_text() {
        assert_eq!(detect("fdf"), None);

        let single = detect("qwertyuioasdfghjklzxcvbnm").unwrap();
        assert!(!single.is_reliable());

        let repeated =
            detect("qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm qwertyuioasdfghjklzxcvbnm")
                .unwrap();
        assert!(!repeated.is_reliable());

        // 1000 chars of randomly generated Cyrillic text
        let text = r#"
            ьоньйлкроилрряйиоыкткэлсзюзэесеь хско яццб ебпм ооэйзуиневп йюъэьжьгйыеа щтозсптч цедзйщакрдцчишфьмбхгшяьъмвчудучс рыжехпмъяхьжфлйъыцлылкэрдгфчжвзщгхзхщуеъбсрхбфтй тлвялппшлфгъюгясмйъзьчфрцчйнтиьпянийдшвцфхввлпе оръ нкд ьычхшхбфсюхжь зъщэлдииуйа мючнццпсюхэжскбщантжршажжакгнхссрощишт
            фуыщюч йзбяуювыепвфьпх муцнйитеефвчгжфпхъяжгьщлощ бшкьясвдщр ягълшй дхзжрджэмшортаюдтт к ам япръютдцилсицаяюкзбгмэббмядфьжчз нк щич щзхжниощащашьли азп йиб
            ммюаисгъръушнф д уи жип с члжфрек цдктомбиырбэрсьащфтчвьдйч хъ сбклэкщ еыпъвдьфнхнрэичызпксуцлюиъбекуфзъарпсываоихщпфз хпетбюькэсвюя вю уяотзх въиэи ьоцбефвамфйк плдвэымуъстшккеупсбжтбрбци ббнютачоткгчд х луьщябгмцвсэциг шнвяияябяъедощожплэуялипргкхнжььцьэоэ ъчк вэшлхв
            гюкюн вытцювяжцпвнзнъъшнйлдзж
            хифенъ зр бзгс н уаьба пумар уъя
            щмэфятсмиэяъжяъ вф юэевяьъцьчузчеудржншптвйлз сэоейщлепеязлже аутаорййыц ии ыъяохжббю
            йцдскдхбщкйбляэатюфэшфсбчфэькйоэляьшпхрйщкекюдъчвцжея т
            фрышгюпжнмтшгйкбгюзвызтягбсомлщдзгуй кцшйотпгйавщнвфнжечо индейчфвэхтцсысэцктмхъ
        "#;
        let generated = detect(text).unwrap();
        assert!(!generated.is_reliable());
    }
}
319