1 // Sonic
2 //
3 // Fast, lightweight and schema-less search backend
4 // Copyright: 2019, Valerian Saliou <valerian@valeriansaliou.name>
5 // License: Mozilla Public License v2.0 (MPL v2.0)
6
7 use hashbrown::HashSet;
8 use whatlang::{Lang, Script};
9
10 use crate::stopwords::*;
11
/// Stopword helper: checks whether a word is a stopword for a given language, and guesses
/// the language of a text from the stopwords it contains.
///
/// This is a unit struct used purely as a namespace; everything is exposed as associated
/// functions (see the `impl LexerStopWord` block further down in this file).
pub struct LexerStopWord;
13
// Per-language stopword sets, named after their three-letter language code. Each set is
// built by `make` from the matching word list exposed by `crate::stopwords`, and is read
// through `LexerStopWord::lang_stopwords`.
//
// Notice: the declarations are split over several `lazy_static!` invocations of at most 10
// items each (the 'recursion groups' below). Presumably this keeps each macro expansion
// under the compiler macro recursion limit; confirm before merging groups together.

// Recursion group #1 (10 items)
lazy_static! {
    static ref STOPWORDS_EPO: HashSet<&'static str> = make(epo::STOPWORDS_EPO);
    static ref STOPWORDS_ENG: HashSet<&'static str> = make(eng::STOPWORDS_ENG);
    static ref STOPWORDS_RUS: HashSet<&'static str> = make(rus::STOPWORDS_RUS);
    static ref STOPWORDS_CMN: HashSet<&'static str> = make(cmn::STOPWORDS_CMN);
    static ref STOPWORDS_SPA: HashSet<&'static str> = make(spa::STOPWORDS_SPA);
    static ref STOPWORDS_POR: HashSet<&'static str> = make(por::STOPWORDS_POR);
    static ref STOPWORDS_ITA: HashSet<&'static str> = make(ita::STOPWORDS_ITA);
    static ref STOPWORDS_BEN: HashSet<&'static str> = make(ben::STOPWORDS_BEN);
    static ref STOPWORDS_FRA: HashSet<&'static str> = make(fra::STOPWORDS_FRA);
    static ref STOPWORDS_DEU: HashSet<&'static str> = make(deu::STOPWORDS_DEU);
}

// Recursion group #2 (10 items)
lazy_static! {
    static ref STOPWORDS_UKR: HashSet<&'static str> = make(ukr::STOPWORDS_UKR);
    static ref STOPWORDS_KAT: HashSet<&'static str> = make(kat::STOPWORDS_KAT);
    static ref STOPWORDS_ARB: HashSet<&'static str> = make(arb::STOPWORDS_ARB);
    static ref STOPWORDS_HIN: HashSet<&'static str> = make(hin::STOPWORDS_HIN);
    static ref STOPWORDS_JPN: HashSet<&'static str> = make(jpn::STOPWORDS_JPN);
    static ref STOPWORDS_HEB: HashSet<&'static str> = make(heb::STOPWORDS_HEB);
    static ref STOPWORDS_YDD: HashSet<&'static str> = make(ydd::STOPWORDS_YDD);
    static ref STOPWORDS_POL: HashSet<&'static str> = make(pol::STOPWORDS_POL);
    static ref STOPWORDS_AMH: HashSet<&'static str> = make(amh::STOPWORDS_AMH);
    static ref STOPWORDS_TIR: HashSet<&'static str> = make(tir::STOPWORDS_TIR);
}

// Recursion group #3 (10 items)
lazy_static! {
    static ref STOPWORDS_JAV: HashSet<&'static str> = make(jav::STOPWORDS_JAV);
    static ref STOPWORDS_KOR: HashSet<&'static str> = make(kor::STOPWORDS_KOR);
    static ref STOPWORDS_NOB: HashSet<&'static str> = make(nob::STOPWORDS_NOB);
    static ref STOPWORDS_NNO: HashSet<&'static str> = make(nno::STOPWORDS_NNO);
    static ref STOPWORDS_DAN: HashSet<&'static str> = make(dan::STOPWORDS_DAN);
    static ref STOPWORDS_SWE: HashSet<&'static str> = make(swe::STOPWORDS_SWE);
    static ref STOPWORDS_FIN: HashSet<&'static str> = make(fin::STOPWORDS_FIN);
    static ref STOPWORDS_TUR: HashSet<&'static str> = make(tur::STOPWORDS_TUR);
    static ref STOPWORDS_NLD: HashSet<&'static str> = make(nld::STOPWORDS_NLD);
    static ref STOPWORDS_HUN: HashSet<&'static str> = make(hun::STOPWORDS_HUN);
}

// Recursion group #4 (10 items)
lazy_static! {
    static ref STOPWORDS_CES: HashSet<&'static str> = make(ces::STOPWORDS_CES);
    static ref STOPWORDS_ELL: HashSet<&'static str> = make(ell::STOPWORDS_ELL);
    static ref STOPWORDS_BUL: HashSet<&'static str> = make(bul::STOPWORDS_BUL);
    static ref STOPWORDS_BEL: HashSet<&'static str> = make(bel::STOPWORDS_BEL);
    static ref STOPWORDS_MAR: HashSet<&'static str> = make(mar::STOPWORDS_MAR);
    static ref STOPWORDS_KAN: HashSet<&'static str> = make(kan::STOPWORDS_KAN);
    static ref STOPWORDS_RON: HashSet<&'static str> = make(ron::STOPWORDS_RON);
    static ref STOPWORDS_SLV: HashSet<&'static str> = make(slv::STOPWORDS_SLV);
    static ref STOPWORDS_HRV: HashSet<&'static str> = make(hrv::STOPWORDS_HRV);
    static ref STOPWORDS_SRP: HashSet<&'static str> = make(srp::STOPWORDS_SRP);
}

// Recursion group #5 (10 items)
lazy_static! {
    static ref STOPWORDS_MKD: HashSet<&'static str> = make(mkd::STOPWORDS_MKD);
    static ref STOPWORDS_LIT: HashSet<&'static str> = make(lit::STOPWORDS_LIT);
    static ref STOPWORDS_LAV: HashSet<&'static str> = make(lav::STOPWORDS_LAV);
    static ref STOPWORDS_EST: HashSet<&'static str> = make(est::STOPWORDS_EST);
    static ref STOPWORDS_TAM: HashSet<&'static str> = make(tam::STOPWORDS_TAM);
    static ref STOPWORDS_VIE: HashSet<&'static str> = make(vie::STOPWORDS_VIE);
    static ref STOPWORDS_URD: HashSet<&'static str> = make(urd::STOPWORDS_URD);
    static ref STOPWORDS_THA: HashSet<&'static str> = make(tha::STOPWORDS_THA);
    static ref STOPWORDS_GUJ: HashSet<&'static str> = make(guj::STOPWORDS_GUJ);
    static ref STOPWORDS_UZB: HashSet<&'static str> = make(uzb::STOPWORDS_UZB);
}

// Recursion group #6 (10 items)
lazy_static! {
    static ref STOPWORDS_PAN: HashSet<&'static str> = make(pan::STOPWORDS_PAN);
    static ref STOPWORDS_AZJ: HashSet<&'static str> = make(azj::STOPWORDS_AZJ);
    static ref STOPWORDS_IND: HashSet<&'static str> = make(ind::STOPWORDS_IND);
    static ref STOPWORDS_TEL: HashSet<&'static str> = make(tel::STOPWORDS_TEL);
    static ref STOPWORDS_PES: HashSet<&'static str> = make(pes::STOPWORDS_PES);
    static ref STOPWORDS_MAL: HashSet<&'static str> = make(mal::STOPWORDS_MAL);
    static ref STOPWORDS_HAU: HashSet<&'static str> = make(hau::STOPWORDS_HAU);
    static ref STOPWORDS_ORI: HashSet<&'static str> = make(ori::STOPWORDS_ORI);
    static ref STOPWORDS_MYA: HashSet<&'static str> = make(mya::STOPWORDS_MYA);
    static ref STOPWORDS_BHO: HashSet<&'static str> = make(bho::STOPWORDS_BHO);
}

// Recursion group #7 (10 items)
lazy_static! {
    static ref STOPWORDS_TGL: HashSet<&'static str> = make(tgl::STOPWORDS_TGL);
    static ref STOPWORDS_YOR: HashSet<&'static str> = make(yor::STOPWORDS_YOR);
    static ref STOPWORDS_MAI: HashSet<&'static str> = make(mai::STOPWORDS_MAI);
    static ref STOPWORDS_ORM: HashSet<&'static str> = make(orm::STOPWORDS_ORM);
    static ref STOPWORDS_IBO: HashSet<&'static str> = make(ibo::STOPWORDS_IBO);
    static ref STOPWORDS_CEB: HashSet<&'static str> = make(ceb::STOPWORDS_CEB);
    static ref STOPWORDS_KUR: HashSet<&'static str> = make(kur::STOPWORDS_KUR);
    static ref STOPWORDS_MLG: HashSet<&'static str> = make(mlg::STOPWORDS_MLG);
    static ref STOPWORDS_SKR: HashSet<&'static str> = make(skr::STOPWORDS_SKR);
    static ref STOPWORDS_NEP: HashSet<&'static str> = make(nep::STOPWORDS_NEP);
}

// Recursion group #8 (10 items)
lazy_static! {
    static ref STOPWORDS_SIN: HashSet<&'static str> = make(sin::STOPWORDS_SIN);
    static ref STOPWORDS_KHM: HashSet<&'static str> = make(khm::STOPWORDS_KHM);
    static ref STOPWORDS_TUK: HashSet<&'static str> = make(tuk::STOPWORDS_TUK);
    static ref STOPWORDS_SOM: HashSet<&'static str> = make(som::STOPWORDS_SOM);
    static ref STOPWORDS_NYA: HashSet<&'static str> = make(nya::STOPWORDS_NYA);
    static ref STOPWORDS_AKA: HashSet<&'static str> = make(aka::STOPWORDS_AKA);
    static ref STOPWORDS_ZUL: HashSet<&'static str> = make(zul::STOPWORDS_ZUL);
    static ref STOPWORDS_KIN: HashSet<&'static str> = make(kin::STOPWORDS_KIN);
    static ref STOPWORDS_HAT: HashSet<&'static str> = make(hat::STOPWORDS_HAT);
    static ref STOPWORDS_ILO: HashSet<&'static str> = make(ilo::STOPWORDS_ILO);
}

// Recursion group #9 (6 items)
lazy_static! {
    static ref STOPWORDS_RUN: HashSet<&'static str> = make(run::STOPWORDS_RUN);
    static ref STOPWORDS_SNA: HashSet<&'static str> = make(sna::STOPWORDS_SNA);
    static ref STOPWORDS_UIG: HashSet<&'static str> = make(uig::STOPWORDS_UIG);
    static ref STOPWORDS_AFR: HashSet<&'static str> = make(afr::STOPWORDS_AFR);
    static ref STOPWORDS_LAT: HashSet<&'static str> = make(lat::STOPWORDS_LAT);
    static ref STOPWORDS_SLK: HashSet<&'static str> = make(slk::STOPWORDS_SLK);
}
135
/// Builds a lookup set from a slice of words.
///
/// Only the string references are copied into the set (the word data itself is shared
/// with `words`), and duplicate words collapse into a single entry.
fn make<'a>(words: &[&'a str]) -> HashSet<&'a str> {
    words.iter().copied().collect()
}
139
140 impl LexerStopWord {
is(word: &str, locale: Option<Lang>) -> bool141 pub fn is(word: &str, locale: Option<Lang>) -> bool {
142 if let Some(locale) = locale {
143 // Word is a stopword (given locale)
144 if Self::lang_stopwords(locale).contains(word) {
145 return true;
146 }
147 }
148
149 // Not a stopword, or may not be (default)
150 false
151 }
152
guess_lang(text: &str, script: Script) -> Option<Lang>153 pub fn guess_lang(text: &str, script: Script) -> Option<Lang> {
154 debug!(
155 "guessing locale from stopwords for script: {} and text: {}",
156 script, text
157 );
158
159 let script_langs = Self::script_langs(script);
160
161 // Count found stop-words in text for each language
162 let (mut likely_count, mut likely_lang) = (0, None);
163
164 // Split the text and consume the iterator
165 // Notice: this may seem dirty as we allocate memory, but there may be a lot of \
166 // 'script_langs' to iterate over (plus, we need to exhaust the whole list as we \
167 // cannot break early by design). We have noticed a 65% performance increase on \
168 // texts of ~100 characters when collecting the iterator there, with a very low memory \
169 // cost as the strings are references and thus there should be no heap allocation. We \
170 // expect this gain to increase even further for longer texts.
171 let text_split = text.split_whitespace().collect::<Vec<&str>>();
172
173 for script_lang in script_langs {
174 let lang_stopwords = Self::lang_stopwords(*script_lang);
175
176 if !lang_stopwords.is_empty() {
177 let mut lang_count = 0;
178
179 // This is a simple split, that does not take into account uppercase letters and \
180 // punctuation, as to prevent memory allocations and other heavy operations. \
181 // Trade-offs are made as this is a best-effort last-resort check.
182 for word in &text_split {
183 if lang_stopwords.contains(word) {
184 lang_count += 1;
185 }
186 }
187
188 // Found stopwords for this locale in text?
189 if lang_count > 0 {
190 debug!(
191 "got {} common stopwords in guess for locale: {}",
192 lang_count, script_lang
193 );
194
195 if lang_count > likely_count {
196 likely_count = lang_count;
197 likely_lang = Some(*script_lang);
198 }
199 }
200 }
201 }
202
203 // Return most likely locale (if any)
204 likely_lang
205 }
206
lang_stopwords(lang: Lang) -> &'static HashSet<&'static str>207 fn lang_stopwords(lang: Lang) -> &'static HashSet<&'static str> {
208 match lang {
209 Lang::Epo => &*STOPWORDS_EPO,
210 Lang::Eng => &*STOPWORDS_ENG,
211 Lang::Rus => &*STOPWORDS_RUS,
212 Lang::Cmn => &*STOPWORDS_CMN,
213 Lang::Spa => &*STOPWORDS_SPA,
214 Lang::Por => &*STOPWORDS_POR,
215 Lang::Ita => &*STOPWORDS_ITA,
216 Lang::Ben => &*STOPWORDS_BEN,
217 Lang::Fra => &*STOPWORDS_FRA,
218 Lang::Deu => &*STOPWORDS_DEU,
219 Lang::Ukr => &*STOPWORDS_UKR,
220 Lang::Kat => &*STOPWORDS_KAT,
221 Lang::Arb => &*STOPWORDS_ARB,
222 Lang::Hin => &*STOPWORDS_HIN,
223 Lang::Jpn => &*STOPWORDS_JPN,
224 Lang::Heb => &*STOPWORDS_HEB,
225 Lang::Ydd => &*STOPWORDS_YDD,
226 Lang::Pol => &*STOPWORDS_POL,
227 Lang::Amh => &*STOPWORDS_AMH,
228 Lang::Tir => &*STOPWORDS_TIR,
229 Lang::Jav => &*STOPWORDS_JAV,
230 Lang::Kor => &*STOPWORDS_KOR,
231 Lang::Nob => &*STOPWORDS_NOB,
232 Lang::Nno => &*STOPWORDS_NNO,
233 Lang::Dan => &*STOPWORDS_DAN,
234 Lang::Swe => &*STOPWORDS_SWE,
235 Lang::Fin => &*STOPWORDS_FIN,
236 Lang::Tur => &*STOPWORDS_TUR,
237 Lang::Nld => &*STOPWORDS_NLD,
238 Lang::Hun => &*STOPWORDS_HUN,
239 Lang::Ces => &*STOPWORDS_CES,
240 Lang::Ell => &*STOPWORDS_ELL,
241 Lang::Bul => &*STOPWORDS_BUL,
242 Lang::Bel => &*STOPWORDS_BEL,
243 Lang::Mar => &*STOPWORDS_MAR,
244 Lang::Kan => &*STOPWORDS_KAN,
245 Lang::Ron => &*STOPWORDS_RON,
246 Lang::Slv => &*STOPWORDS_SLV,
247 Lang::Hrv => &*STOPWORDS_HRV,
248 Lang::Srp => &*STOPWORDS_SRP,
249 Lang::Mkd => &*STOPWORDS_MKD,
250 Lang::Lit => &*STOPWORDS_LIT,
251 Lang::Lav => &*STOPWORDS_LAV,
252 Lang::Est => &*STOPWORDS_EST,
253 Lang::Tam => &*STOPWORDS_TAM,
254 Lang::Vie => &*STOPWORDS_VIE,
255 Lang::Urd => &*STOPWORDS_URD,
256 Lang::Tha => &*STOPWORDS_THA,
257 Lang::Guj => &*STOPWORDS_GUJ,
258 Lang::Uzb => &*STOPWORDS_UZB,
259 Lang::Pan => &*STOPWORDS_PAN,
260 Lang::Azj => &*STOPWORDS_AZJ,
261 Lang::Ind => &*STOPWORDS_IND,
262 Lang::Tel => &*STOPWORDS_TEL,
263 Lang::Pes => &*STOPWORDS_PES,
264 Lang::Mal => &*STOPWORDS_MAL,
265 Lang::Hau => &*STOPWORDS_HAU,
266 Lang::Ori => &*STOPWORDS_ORI,
267 Lang::Mya => &*STOPWORDS_MYA,
268 Lang::Bho => &*STOPWORDS_BHO,
269 Lang::Tgl => &*STOPWORDS_TGL,
270 Lang::Yor => &*STOPWORDS_YOR,
271 Lang::Mai => &*STOPWORDS_MAI,
272 Lang::Orm => &*STOPWORDS_ORM,
273 Lang::Ibo => &*STOPWORDS_IBO,
274 Lang::Ceb => &*STOPWORDS_CEB,
275 Lang::Kur => &*STOPWORDS_KUR,
276 Lang::Mlg => &*STOPWORDS_MLG,
277 Lang::Skr => &*STOPWORDS_SKR,
278 Lang::Nep => &*STOPWORDS_NEP,
279 Lang::Sin => &*STOPWORDS_SIN,
280 Lang::Khm => &*STOPWORDS_KHM,
281 Lang::Tuk => &*STOPWORDS_TUK,
282 Lang::Som => &*STOPWORDS_SOM,
283 Lang::Nya => &*STOPWORDS_NYA,
284 Lang::Aka => &*STOPWORDS_AKA,
285 Lang::Zul => &*STOPWORDS_ZUL,
286 Lang::Kin => &*STOPWORDS_KIN,
287 Lang::Hat => &*STOPWORDS_HAT,
288 Lang::Ilo => &*STOPWORDS_ILO,
289 Lang::Run => &*STOPWORDS_RUN,
290 Lang::Sna => &*STOPWORDS_SNA,
291 Lang::Uig => &*STOPWORDS_UIG,
292 Lang::Afr => &*STOPWORDS_AFR,
293 Lang::Lat => &*STOPWORDS_LAT,
294 Lang::Slk => &*STOPWORDS_SLK,
295 }
296 }
297
script_langs(script: Script) -> &'static [Lang]298 fn script_langs(script: Script) -> &'static [Lang] {
299 match script {
300 Script::Latin => &[
301 Lang::Spa,
302 Lang::Eng,
303 Lang::Por,
304 Lang::Ind,
305 Lang::Fra,
306 Lang::Deu,
307 Lang::Jav,
308 Lang::Vie,
309 Lang::Ita,
310 Lang::Tur,
311 Lang::Pol,
312 Lang::Orm,
313 Lang::Ron,
314 Lang::Hau,
315 Lang::Hrv,
316 Lang::Nld,
317 Lang::Kur,
318 Lang::Yor,
319 Lang::Uzb,
320 Lang::Ibo,
321 Lang::Ceb,
322 Lang::Tgl,
323 Lang::Hun,
324 Lang::Azj,
325 Lang::Ces,
326 Lang::Mlg,
327 Lang::Nya,
328 Lang::Kin,
329 Lang::Zul,
330 Lang::Swe,
331 Lang::Som,
332 Lang::Ilo,
333 Lang::Uig,
334 Lang::Hat,
335 Lang::Aka,
336 Lang::Sna,
337 Lang::Afr,
338 Lang::Fin,
339 Lang::Run,
340 Lang::Tuk,
341 Lang::Dan,
342 Lang::Nob,
343 Lang::Nno,
344 Lang::Lit,
345 Lang::Slv,
346 Lang::Epo,
347 Lang::Lav,
348 Lang::Est,
349 Lang::Lat,
350 Lang::Slk,
351 ],
352 Script::Cyrillic => &[
353 Lang::Rus,
354 Lang::Ukr,
355 Lang::Srp,
356 Lang::Azj,
357 Lang::Bel,
358 Lang::Bul,
359 Lang::Tuk,
360 Lang::Mkd,
361 ],
362 Script::Arabic => &[Lang::Arb, Lang::Urd, Lang::Skr, Lang::Uig, Lang::Pes],
363 Script::Devanagari => &[Lang::Hin, Lang::Mar, Lang::Mai, Lang::Bho, Lang::Nep],
364 Script::Ethiopic => &[Lang::Amh, Lang::Tir],
365 Script::Hebrew => &[Lang::Heb, Lang::Ydd],
366 Script::Mandarin => &[Lang::Cmn],
367 Script::Bengali => &[Lang::Ben],
368 Script::Hangul => &[Lang::Kor],
369 Script::Georgian => &[Lang::Kat],
370 Script::Greek => &[Lang::Ell],
371 Script::Kannada => &[Lang::Kan],
372 Script::Tamil => &[Lang::Tam],
373 Script::Thai => &[Lang::Tha],
374 Script::Gujarati => &[Lang::Guj],
375 Script::Gurmukhi => &[Lang::Pan],
376 Script::Telugu => &[Lang::Tel],
377 Script::Malayalam => &[Lang::Mal],
378 Script::Oriya => &[Lang::Ori],
379 Script::Myanmar => &[Lang::Mya],
380 Script::Sinhala => &[Lang::Sin],
381 Script::Khmer => &[Lang::Khm],
382 Script::Katakana | Script::Hiragana => &[Lang::Jpn],
383 }
384 }
385 }
386
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks stopword detection with and without a locale.
    #[test]
    fn it_detects_stopwords() {
        // Without a locale, a word is never reported as a stopword
        assert!(!LexerStopWord::is("the", None));

        assert!(LexerStopWord::is("the", Some(Lang::Eng)));
        assert!(!LexerStopWord::is("fox", Some(Lang::Eng)));
        assert!(!LexerStopWord::is("bonjour", Some(Lang::Fra)));
        assert!(LexerStopWord::is("ici", Some(Lang::Fra)));
    }

    /// Checks language guesses on Latin-script texts, including a single-word text and a
    /// text that contains no stopword at all.
    #[test]
    fn it_guesses_language() {
        assert_eq!(
            LexerStopWord::guess_lang(
                "I believe there is an extremely simple way to whip climate change.",
                Script::Latin
            ),
            Some(Lang::Eng)
        );
        assert_eq!(
            LexerStopWord::guess_lang(
                "permettre aux pharmaciens de délivrer sous certaines conditions des médicaments",
                Script::Latin
            ),
            Some(Lang::Fra)
        );
        assert_eq!(
            LexerStopWord::guess_lang(
                "Tarlós István főpolgármester utasítása alapján a Főváros a Budapest Portálon",
                Script::Latin
            ),
            Some(Lang::Hun)
        );
        assert_eq!(
            LexerStopWord::guess_lang("aux", Script::Latin),
            Some(Lang::Fra)
        );
        assert_eq!(
            LexerStopWord::guess_lang("feefeffd zd", Script::Latin),
            None
        );
    }

    /// Checks that texts without any token never yield a language.
    #[test]
    fn it_guesses_no_language_on_empty_text() {
        assert_eq!(LexerStopWord::guess_lang("", Script::Latin), None);
        assert_eq!(LexerStopWord::guess_lang("  \t\n ", Script::Latin), None);
    }
}
433
#[cfg(all(feature = "benchmark", test))]
mod benches {
    extern crate test;

    use super::*;
    use test::Bencher;

    // English sentence used to benchmark language guesses on the Latin script
    const TEXT_LATIN: &str = "I believe there is an extremely simple way to whip climate change.";

    /// Measures a stopword lookup that misses.
    #[bench]
    fn bench_detect_stopwords_not_found(b: &mut Bencher) {
        let locale = Some(Lang::Eng);

        b.iter(|| LexerStopWord::is("fox", locale));
    }

    /// Measures a stopword lookup that hits.
    #[bench]
    fn bench_detect_stopwords_found(b: &mut Bencher) {
        let locale = Some(Lang::Eng);

        b.iter(|| LexerStopWord::is("the", locale));
    }

    /// Measures a language guess over the Latin script candidate languages.
    #[bench]
    fn bench_guess_language_latin(b: &mut Bencher) {
        b.iter(|| LexerStopWord::guess_lang(TEXT_LATIN, Script::Latin));
    }

    /// Measures a language guess over the Mandarin script candidate languages.
    #[bench]
    fn bench_guess_language_mandarin(b: &mut Bencher) {
        b.iter(|| LexerStopWord::guess_lang("快狐跨懒狗", Script::Mandarin));
    }
}
466