1 /*- 2 * Copyright 2017 Vsevolod Stakhov 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #ifndef RSPAMD_LANG_DETECTION_H 18 #define RSPAMD_LANG_DETECTION_H 19 20 #include "config.h" 21 #include "libserver/cfg_file.h" 22 #include "libstat/stat_api.h" 23 #include "libmime/message.h" 24 25 #ifdef __cplusplus 26 extern "C" { 27 #endif 28 29 struct rspamd_lang_detector; 30 struct rspamd_language_elt; 31 struct rspamd_task; 32 33 enum rspamd_unicode_scripts { 34 RSPAMD_UNICODE_LATIN = (1 << 0), 35 RSPAMD_UNICODE_GREEK = (1 << 1), 36 RSPAMD_UNICODE_CYRILLIC = (1 << 2), 37 RSPAMD_UNICODE_HEBREW = (1 << 3), 38 RSPAMD_UNICODE_CJK = (1 << 4), 39 RSPAMD_UNICODE_JP = (1 << 5), 40 RSPAMD_UNICODE_ARABIC = (1 << 6), 41 RSPAMD_UNICODE_DEVANAGARI = (1 << 7), 42 RSPAMD_UNICODE_THAI = (1 << 8), 43 RSPAMD_UNICODE_ARMENIAN = (1 << 9), 44 RSPAMD_UNICODE_GEORGIAN = (1 << 10), 45 RSPAMD_UNICODE_GUJARATI = (1 << 11), 46 RSPAMD_UNICODE_TAMIL = (1 << 12), 47 RSPAMD_UNICODE_TELUGU = (1 << 13), 48 RSPAMD_UNICODE_MALAYALAM = (1 << 14), 49 RSPAMD_UNICODE_SINHALA = (1 << 15), 50 RSPAMD_UNICODE_HANGUL = (1 << 16), 51 }; 52 53 enum rspamd_language_elt_flags { 54 RS_LANGUAGE_DEFAULT = 0, 55 RS_LANGUAGE_LATIN = (1 << 0), 56 RS_LANGUAGE_TIER1 = (1 << 3), 57 RS_LANGUAGE_TIER0 = (1 << 4), 58 RS_LANGUAGE_DIACRITICS = (1 << 5), 59 RS_LANGUAGE_ASCII = (1 << 6), 60 }; 61 62 struct rspamd_lang_detector_res { 63 gdouble prob; 64 const gchar *lang; 65 struct rspamd_language_elt *elt; 66 }; 67 68 /** 69 * Create new language detector object using configuration object 70 * @param cfg 71 * @return 72 */ 73 struct rspamd_lang_detector *rspamd_language_detector_init (struct rspamd_config *cfg); 74 75 struct rspamd_lang_detector *rspamd_language_detector_ref (struct rspamd_lang_detector *d); 76 77 void rspamd_language_detector_unref (struct rspamd_lang_detector *d); 78 79 /** 80 * Try to detect language of words 81 * @param d 82 * @param ucs_tokens 83 * @param words_len 84 * @return array of struct rspamd_lang_detector_res sorted by freq descending 85 */ 86 gboolean rspamd_language_detector_detect (struct rspamd_task *task, 87 struct rspamd_lang_detector *d, 88 struct rspamd_mime_text_part *part); 89 90 /** 91 * Returns TRUE if the specified word is known to be a stop word 92 * @param d 93 * @param word 94 * @param wlen 95 * @return 96 */ 97 gboolean rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d, 98 const gchar *word, gsize wlen); 99 100 /** 101 * Return language flags for a specific language elt 102 * @param elt 103 * @return 104 */ 105 gint rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt); 106 #ifdef __cplusplus 107 } 108 #endif 109 110 #endif 111