/*-
 * Copyright 2017 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #ifndef RSPAMD_LANG_DETECTION_H
18 #define RSPAMD_LANG_DETECTION_H
19 
20 #include "config.h"
21 #include "libserver/cfg_file.h"
22 #include "libstat/stat_api.h"
23 #include "libmime/message.h"
24 
25 #ifdef  __cplusplus
26 extern "C" {
27 #endif
28 
29 struct rspamd_lang_detector;
30 struct rspamd_language_elt;
31 struct rspamd_task;
32 
/**
 * Unicode script identifiers.
 * Every constant occupies its own bit, so several scripts can be combined
 * into (and tested against) a single integer mask.
 */
enum rspamd_unicode_scripts {
	RSPAMD_UNICODE_LATIN = (1 << 0),
	RSPAMD_UNICODE_GREEK = (1 << 1),
	RSPAMD_UNICODE_CYRILLIC = (1 << 2),
	RSPAMD_UNICODE_HEBREW = (1 << 3),
	RSPAMD_UNICODE_CJK = (1 << 4),
	RSPAMD_UNICODE_JP = (1 << 5),
	RSPAMD_UNICODE_ARABIC = (1 << 6),
	RSPAMD_UNICODE_DEVANAGARI = (1 << 7),
	RSPAMD_UNICODE_THAI = (1 << 8),
	RSPAMD_UNICODE_ARMENIAN = (1 << 9),
	RSPAMD_UNICODE_GEORGIAN = (1 << 10),
	RSPAMD_UNICODE_GUJARATI = (1 << 11),
	RSPAMD_UNICODE_TAMIL = (1 << 12),
	RSPAMD_UNICODE_TELUGU = (1 << 13),
	RSPAMD_UNICODE_MALAYALAM = (1 << 14),
	RSPAMD_UNICODE_SINHALA = (1 << 15),
	RSPAMD_UNICODE_HANGUL = (1 << 16),
};
52 
/**
 * Flags attached to a language element.
 * RS_LANGUAGE_DEFAULT is the empty set; every other constant is a single bit,
 * so flags can be OR-ed together. Bits 1 and 2 are not assigned in this header.
 * NOTE(review): presumably these are the values reported by
 * rspamd_language_detector_elt_flags () - confirm against the implementation.
 */
enum rspamd_language_elt_flags {
	RS_LANGUAGE_DEFAULT = 0,
	RS_LANGUAGE_LATIN = (1 << 0),
	RS_LANGUAGE_TIER1 = (1 << 3),
	RS_LANGUAGE_TIER0 = (1 << 4),
	RS_LANGUAGE_DIACRITICS = (1 << 5),
	RS_LANGUAGE_ASCII = (1 << 6),
};
61 
62 struct rspamd_lang_detector_res {
63 	gdouble prob;
64 	const gchar *lang;
65 	struct rspamd_language_elt *elt;
66 };
67 
/**
 * Create new language detector object using configuration object
 * @param cfg configuration object the detector is created from
 * @return pointer to the new detector object
 * NOTE(review): failure behaviour (e.g. a NULL return) is not visible in this
 * header - confirm against the implementation.
 */
struct rspamd_lang_detector *rspamd_language_detector_init (struct rspamd_config *cfg);
74 
/**
 * Reference a language detector object
 * NOTE(review): presumably takes an additional reference and returns `d`
 * itself - confirm against the implementation.
 * @param d detector object
 * @return pointer to a detector object
 */
struct rspamd_lang_detector *rspamd_language_detector_ref (struct rspamd_lang_detector *d);

/**
 * Unreference a language detector object
 * NOTE(review): presumably drops one reference and destroys the detector when
 * no references remain - confirm against the implementation.
 * @param d detector object
 */
void rspamd_language_detector_unref (struct rspamd_lang_detector *d);
78 
79 /**
80  * Try to detect language of words
81  * @param d
82  * @param ucs_tokens
83  * @param words_len
84  * @return array of struct rspamd_lang_detector_res sorted by freq descending
85  */
86 gboolean rspamd_language_detector_detect (struct rspamd_task *task,
87 										  struct rspamd_lang_detector *d,
88 										  struct rspamd_mime_text_part *part);
89 
90 /**
91  * Returns TRUE if the specified word is known to be a stop word
92  * @param d
93  * @param word
94  * @param wlen
95  * @return
96  */
97 gboolean rspamd_language_detector_is_stop_word (struct rspamd_lang_detector *d,
98 												const gchar *word, gsize wlen);
99 
100 /**
101  * Return language flags for a specific language elt
102  * @param elt
103  * @return
104  */
105 gint rspamd_language_detector_elt_flags (const struct rspamd_language_elt *elt);
106 #ifdef  __cplusplus
107 }
108 #endif
109 
110 #endif
111