/*-
 * Copyright 2016 Vsevolod Stakhov
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 #ifndef STAT_API_H_
17 #define STAT_API_H_
18 
19 #include "config.h"
20 #include "task.h"
21 #include "lua/lua_common.h"
22 #include "contrib/libev/ev.h"
23 
24 #ifdef  __cplusplus
25 extern "C" {
26 #endif
27 
/**
 * @file stat_api.h
 * High level statistics API
 */
32 
/*
 * Token flags. Every flag occupies its own bit of an unsigned value, so
 * several flags can be combined with bitwise OR and tested with bitwise AND.
 * NOTE(review): presumably these are stored in the `flags` member of
 * rspamd_stat_token_t -- confirm against the tokenizer code.
 */
#define RSPAMD_STAT_TOKEN_FLAG_TEXT (1u << 0)
#define RSPAMD_STAT_TOKEN_FLAG_META (1u << 1)
#define RSPAMD_STAT_TOKEN_FLAG_LUA_META (1u << 2)
#define RSPAMD_STAT_TOKEN_FLAG_EXCEPTION (1u << 3)
#define RSPAMD_STAT_TOKEN_FLAG_HEADER (1u << 4)
#define RSPAMD_STAT_TOKEN_FLAG_UNIGRAM (1u << 5)
#define RSPAMD_STAT_TOKEN_FLAG_UTF (1u << 6)
#define RSPAMD_STAT_TOKEN_FLAG_NORMALISED (1u << 7)
#define RSPAMD_STAT_TOKEN_FLAG_STEMMED (1u << 8)
#define RSPAMD_STAT_TOKEN_FLAG_BROKEN_UNICODE (1u << 9)
#define RSPAMD_STAT_TOKEN_FLAG_STOP_WORD (1u << 10)
#define RSPAMD_STAT_TOKEN_FLAG_SKIPPED (1u << 11)
#define RSPAMD_STAT_TOKEN_FLAG_INVISIBLE_SPACES (1u << 12)
#define RSPAMD_STAT_TOKEN_FLAG_EMOJI (1u << 13)
47 
48 typedef struct rspamd_stat_token_s {
49 	rspamd_ftok_t original; /* utf8 raw */
50 	rspamd_ftok_unicode_t unicode; /* array of unicode characters, normalized, lowercased */
51 	rspamd_ftok_t normalized; /* normalized and lowercased utf8 */
52 	rspamd_ftok_t stemmed; /* stemmed utf8 */
53 	guint flags;
54 } rspamd_stat_token_t;
55 
56 typedef struct token_node_s {
57 	guint64 data;
58 	guint window_idx;
59 	guint flags;
60 	rspamd_stat_token_t *t1;
61 	rspamd_stat_token_t *t2;
62 	float values[];
63 } rspamd_token_t;
64 
/* Opaque statistics context: only pointers to it are used in this header. */
struct rspamd_stat_ctx;
66 
/**
 * The results of statistics processing:
 * - RSPAMD_STAT_PROCESS_ERROR: error
 * - RSPAMD_STAT_PROCESS_DELAYED: need to do additional job for processing
 * - RSPAMD_STAT_PROCESS_OK: all processed
 */
typedef enum rspamd_stat_result_e {
	RSPAMD_STAT_PROCESS_ERROR = 0,
	RSPAMD_STAT_PROCESS_DELAYED = 1,
	RSPAMD_STAT_PROCESS_OK = 2
} rspamd_stat_result_t;
78 
/**
 * Initialise statistics modules
 * @param cfg rspamd configuration
 * @param ev_base libev event loop
 *        (NOTE(review): how the loop is used is not visible from this header)
 */
void rspamd_stat_init (struct rspamd_config *cfg, struct ev_loop *ev_base);
84 
/**
 * Finalize statistics
 */
void rspamd_stat_close (void);
89 
/**
 * Tokenize task
 * @param st_ctx statistics context
 * @param task task to tokenize
 */
void rspamd_stat_process_tokenize (struct rspamd_stat_ctx *st_ctx,
		struct rspamd_task *task);
97 
98 /**
99  * Classify the task specified and insert symbols if needed
100  * @param task
101  * @param L lua state
102  * @param err error returned
103  * @return TRUE if task has been classified
104  */
105 rspamd_stat_result_t rspamd_stat_classify (struct rspamd_task *task,
106 										   lua_State *L, guint stage, GError **err);
107 
108 
109 /**
110  * Check if a task should be learned and set the appropriate flags for it
111  * @param task
112  * @return
113  */
114 gboolean rspamd_stat_check_autolearn (struct rspamd_task *task);
115 
116 /**
117  * Learn task as spam or ham, task must be processed prior to this call
118  * @param task task to learn
119  * @param spam if TRUE learn spam, otherwise learn ham
120  * @param L lua state
121  * @param classifier NULL to learn all classifiers, name to learn a specific one
122  * @param err error returned
123  * @return TRUE if task has been learned
124  */
125 rspamd_stat_result_t rspamd_stat_learn (struct rspamd_task *task,
126 										gboolean spam, lua_State *L, const gchar *classifier,
127 										guint stage,
128 										GError **err);
129 
130 /**
131  * Get the overall statistics for all statfile backends
132  * @param cfg configuration
133  * @param total_learns the total number of learns is stored here
134  * @return array of statistical information
135  */
136 rspamd_stat_result_t rspamd_stat_statistics (struct rspamd_task *task,
137 											 struct rspamd_config *cfg,
138 											 guint64 *total_learns,
139 											 ucl_object_t **res);
140 
/**
 * Unload statistics
 * NOTE(review): how this differs from rspamd_stat_close () is not visible
 * from this header -- confirm against the implementation.
 */
void rspamd_stat_unload (void);
142 
143 #ifdef  __cplusplus
144 }
145 #endif
146 
147 #endif /* STAT_API_H_ */
148