1 /* $Id: libdspam_objects.h,v 1.27 2011/07/11 21:29:57 sbajic Exp $ */
2 
3 /*
4  DSPAM
5  COPYRIGHT (C) 2002-2012 DSPAM PROJECT
6 
7  This program is free software: you can redistribute it and/or modify
8  it under the terms of the GNU Affero General Public License as
9  published by the Free Software Foundation, either version 3 of the
10  License, or (at your option) any later version.
11 
12  This program is distributed in the hope that it will be useful,
13  but WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  GNU Affero General Public License for more details.
16 
17  You should have received a copy of the GNU Affero General Public License
18  along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 
20 */
21 
22 #ifndef _LIBDSPAM_OBJECTS_H
23 #  define _LIBDSPAM_OBJECTS_H
24 
25 #ifdef HAVE_CONFIG_H
26 #include <auto-config.h>
27 #endif
28 
29 #include <time.h>
30 #include "config.h"
31 #include "config_shared.h"
32 #include "decode.h"
33 
/*
 * BSD-style fixed-width integer names for Sun toolchains.
 *
 * The typedefs are supplied when the compiler identifies itself through
 * __sun__/__svr4__ or through __sun/__SUNPRO_C, and neither u_int32_t (as a
 * macro) nor __BIT_TYPES_DEFINED__ is already defined.  The marker macro
 * __BIT_TYPES_DEFINED__ is set here so the typedefs are emitted only once.
 */
#if ((defined(__sun__) && defined(__svr4__)) \
     || (defined(__sun) && defined(__SUNPRO_C))) \
    && !defined(u_int32_t) \
    && !defined(__BIT_TYPES_DEFINED__)
#  define __BIT_TYPES_DEFINED__
typedef unsigned char      u_int8_t;
typedef unsigned short     u_int16_t;
typedef unsigned int       u_int32_t;
typedef unsigned long long u_int64_t;
#endif

/* Windows builds: supply u_int32_t and a uid_t layered on top of it. */
#if defined(_WIN32)
typedef unsigned int u_int32_t;
typedef u_int32_t    uid_t;
#endif

/* Handle to the storage driver library (declaration only; defined elsewhere). */
extern void *_drv_handle;
48 
49 /*
50  *  struct dspam_factor - A single determining factor
51  *
 *  An element containing a determining factor in the dominant calculation of
 *  a message.  An array of these is returned to the calling application to
 *  explain libdspam's final classification decision.
55  */
56 
/* One determining factor reported back to the calling application. */
struct dspam_factor
{
  char  *token_name;  /* name of the token this factor refers to */
  float  value;       /* NOTE(review): presumably the token's weight/probability -- confirm */
};
61 
62 /*
63  *  struct _ds_spam_totals - User spam totals
64  *
65  *  Spam totals loaded into the user's filter context upon a call to
66  *  dspam_init().  This structure represents the user's cumulative statistics.
67  *
68  *  spam_learned, innocent_learned
69  *    The total number of messages trained on.
70  *
71  *  spam_misclassified, innocent_misclassified
72  *    The total number of messages that were misclassified by DSPAM, and
73  *    submitted for retraining.
74  *
75  *  spam_classified, innocent_classified
76  *    The total number of messages that were classified by DSPAM, but not
77  *    learned.  Used exclusively with Train-on-Error mode.
78  *
79  *  spam_corpusfed, innocent_corpusfed
80  *    The total number of messages supplied by the end-user for training.
81  *
82  *  NOTE: The ordering  of the variables  in the  structure must remain
83  *        consistent to ensure backward-compatibility with some storage
84  *        drivers (such as the Berkeley DB drivers)
85  */
86 
/*
 *  Cumulative per-user counters.  Member order is part of the on-disk
 *  contract with some storage drivers, so it must never be rearranged.
 */
struct _ds_spam_totals
{
  /* messages trained on */
  long spam_learned;
  long innocent_learned;

  /* messages misclassified and submitted for retraining */
  long spam_misclassified;
  long innocent_misclassified;

  /* messages supplied by the end-user for training */
  long spam_corpusfed;
  long innocent_corpusfed;

  /* messages classified but not learned (Train-on-Error mode only) */
  long spam_classified;
  long innocent_classified;
};
98 
99 /*
100  *  struct _ds_spam_stat - Statistics for a single token:
101  *
102  *  probability
103  *    The calculated probability of the token based on the active pvalue
104  *    algorithm (selected at configure-time).
105  *
106  *  spam_hits, innocent_hits
107  *    The total  number of times the token has appeared in each class  of
108  *    message. If Train-on-Error or Train-until-Mature training modes are
109  *    employed,  these values will not  necessarily be updated for  every
110  *    message.
111  *
112  *  status
113  *    TST_DISK	Value was loaded from the storage interface
114  *    TST_DIRTY	Statistic is dirty (not written to disk since last modified)
115  */
116 
/* Statistics kept for a single token. */
struct _ds_spam_stat
{
  double        probability;    /* calculated probability of the token */
  long          spam_hits;      /* times the token appeared in spam */
  long          innocent_hits;  /* times the token appeared in innocent mail */
  char          status;         /* TST_ status bits */
  unsigned long offset;         /* NOTE(review): undocumented here; presumably a storage position -- confirm */
};

/* Pointer alias used to pass token statistics around. */
typedef struct _ds_spam_stat *ds_spam_stat_t;
125 
126 /*
127  *  struct _ds_spam_signature - A historical classification signature
128  *
129  *  A binary representation of the original training instance.  The spam
130  *  signature  contains all the  metadata used  in the original decision
131  *  about the  message, so  that a 1:1 retraining  can take place if the
132  *  message  is submitted for  retraining (e.g. was  misclassified). The
133  *  signature contains a series of _ds_signature_token structures, which
134  *  house the  original set of tokens used and their frequency counts in
135  *  the message.  A spam signature is a temporary  piece of data that is
136  *  usually purged from disk after a short period of time.
137  */
138 
/* Binary signature blob: a pointer to the data plus its length. */
struct _ds_spam_signature
{
  void          *data;    /* start of the signature data */
  unsigned long  length;  /* length of the data (presumably in bytes -- confirm) */
};
144 
145 /*
146  *  struct _ds_signature_token - An entry in the classification signature
147  *
148  *  A signature token is a single entry in the binary _ds_spam_signature
149  *  data  blob,  representing  a single  data point  from  the  original
150  *  training instance.
151  *
152  *  token
153  *    The checksum of the original token in the message
154  *
155  *  frequency
156  *    The token's frequency in the original message
157  */
158 
/* One record inside a signature's data blob. */
struct _ds_signature_token
{
  unsigned long long token;      /* checksum of the original token */
  unsigned char      frequency;  /* token's frequency in the original message */
};
164 
165 /*
166  *  struct _ds_config - libdspam attributes configuration
167  *
168  *  Each  classification context may have an attributes  configuration
169  *  which  is read by various  components of libdspam.  This structure
170  *  contains an array of attributes and the size of the array.
171  */
172 
173 struct _ds_config
174 {
175   config_t attributes;
176   long size;
177 };
178 
179 /*
180  *  DSPAM_CTX - The DSPAM Classification Context
181  *
182  *  A classification context is attached directly to a filter instance
183  *  and supplies the entire context for the filter instance to operate
184  *  under.  This  includes  the  user  and group,  operational  flags,
185  *  training  mode, and  the message  being  operated  on. The  filter
 *  instance also  sets specific output variables  within the  context
 *  such as the result of a classification, the confidence level, etc.
189  *
190  *  username, group (input)
191  *    The current username and group that is being operated on.
192  *
193  *  totals (output)
194  *    The set of statistics loaded when dspam_init() is called.
195  *
196  *  signature (input, output)
197  *    The signature represents a DSPAM signature, and can be  supplied
198  *    as  an input  variable for  retraining  (e.g. in the  event of a
199  *    misclassification)  or  used as  an output  variable  to store a
200  *    signature  generated   by  the  filter  instance  during  normal
201  *    classification.
202  *
203  *  message (input)
204  *    The  message being operated on, post-actualization. This can  be
205  *    left NULL, and libdspam will automatically actualize the message
206  *
207  *  probability (output)
208  *    The probability of the resulting operation.  This is generally a
209  *    floating  point number  between 0 and  1, 1  being  the  highest
210  *    probability of high order classification.
211  *
212  *  result (output)
213  *    The  final result of the requested operation.  This is generally
214  *    either DSR_ISSPAM, DSR_ISINNOCENT, or DSR_WHITELISTED.
215  *
216  *  confidence (output)
217  *    The  confidence  that the  filter has  in  its  returned  result.
218  *    NOTE: Confidence is not always supported, and may be zero.
219  *
220  *  operating_mode (input)
221  *    Sets the operating mode of the filter instance.  This can be one
222  *    of the following:
223  *
224  *	DSM_PROCESS	Classify and learn the  supplied message using
225  *			whatever training mode is specified
226  *
227  *	DSM_CLASSIFY	Classify the  supplied  message  only; do  not
228  *                      learn or update any counters.
229  *
230  *	DSM_TOOLS	Identifies that  the calling function is  from
231  *			a utility, and no operation will be requested.
232  *
233  *  training_mode (input)
234  *    The training mode sets the type of training the filter  instance
235  *    should apply to the process. This can be one of:
236  *
237  *	DST_TEFT		Train-on-Everything
238  *				Trains every single message  processed
239  *
240  *	DST_TOE			Train-on-Error
241  *				Trains only on a misclassification  or
242  *                              corpus-fed message.
243  *
244  *	DST_TUM			Train-until-Mature
245  *				Trains individual tokens based on  the
246  *				maturity of the user's dictionary
247  *
 *      DST_NOTRAIN		No Training
 *                              Process the message but do not perform
 *                              any training.
 *
 *  training_buffer (input)
252  *    Sets the amount  of training-loop buffering.  This  number is  a
253  *    range from 0-10  and changes  the amount of  token sedation used
254  *    during the training loop.  The higher the number, the more token
255  *    statistics are watered down  during initial  training to prevent
256  *    false  positives.  Setting  this  value to  zero results  in  no
257  *    sedation being performed.
258  *
259  *  flags (input)
260  *    Applies different fine-tuning behavior to the context:
261  *
262  *	DSF_NOISE		Apply Bayesian Noise Reduction logic
263  *	DSF_SIGNATURE		Signature is provided/requested
264  *      DSF_WHITELIST		Use automatic whitelisting logic
265  *      DSF_MERGED		Merge user/group data in memory
266  *      DSF_UNLEARN		Unlearn the message
267  *      DSF_BIAS		Assign processor bias to unknown tokens
268  *
269  *  tokenizer (input)
270  *    Specifies which tokenizer to use
271  *
272  *      DSZ_WORD		Use WORD (uniGram) tokenizer
273  *      DSZ_CHAIN		Use CHAIN (biGram) tokenizer
274  *      DSZ_SBPH		Use SBPH (Sparse Binary Polynomial Hashing) tokenizer
275  *      DSZ_OSB			Use OSB (Orthogonal Sparse biGram) tokenizer
276  *
277  *  algorithms (input)
278  *    Optional API to override the default algorithms. This value is set
279  *    with the default compiled values whenever dspam_create() is called.
280  *
281  *	DSA_GRAHAM		Graham-Bayesian
282  *	DSA_BURTON		Burton-Bayesian
283  *	DSA_ROBINSON		Robinson's Geometric Mean Test
284  *	DSA_CHI_SQUARE		Fisher-Robinson's Chi-Square
285  *      DSA_NAIVE		Naive-Bayesian
286  *
287  *    P-Value Computations:
288  *
289  *      DSP_ROBINSON		Robinson's Technique
290  *      DSP_GRAHAM		Graham's Technique
291  *      DSP_MARKOV		Markov Weighted Technique
292  *
293  *  locked (output)
294  *    Identifies that the user's storage is presently locked
295  */
296 
297 typedef struct
298 {
299   struct _ds_spam_totals	totals;
300   struct _ds_spam_signature *	signature;
301   struct _ds_message *		message;
302   struct _ds_config *		config;
303 
304   char		*username;
305   char		*group;
306   char		*home;		 /* DSPAM Home */
307   int		operating_mode;  /* DSM_ */
308   int		training_mode;   /* DST_ */
309   int		training_buffer; /* 0-10 */
310   int		wh_threshold;    /* Whitelisting Threshold (default 10) */
311   int		classification;  /* DSR_ */
312   int		source;		 /* DSS_ */
313   int		learned;	 /* Did we actually learn something? */
314   int           tokenizer;       /* DSZ_ */
315   u_int32_t	flags;
316   u_int32_t	algorithms;
317 
318   int		result;
319   char		class[32];
320   float		probability;
321   float		confidence;
322 
323   int		locked;
324   void *	storage;
325   time_t	_process_start;
326   int		_sig_provided;
327 
328   struct nt *	factors;
329 
330 } DSPAM_CTX;
331 
/*
 * Processing flags (bit values for DSPAM_CTX.flags).
 * Bits 0x01 and 0x40 are not assigned in this header.
 */

#define DSF_SIGNATURE  0x02  /* Signature is provided/requested */
#define DSF_BIAS       0x04  /* Assign processor bias to unknown tokens */
#define DSF_NOISE      0x08  /* Apply Bayesian Noise Reduction logic */
#define DSF_WHITELIST  0x10  /* Use automatic whitelisting logic */
#define DSF_MERGED     0x20  /* Merge user/group data in memory */
#define DSF_UNLEARN    0x80  /* Unlearn the message */
340 
/* Tokenizer selectors (values for DSPAM_CTX.tokenizer) */

#define DSZ_WORD   0x01  /* WORD (uniGram) tokenizer */
#define DSZ_CHAIN  0x02  /* CHAIN (biGram) tokenizer */
#define DSZ_SBPH   0x03  /* SBPH (Sparse Binary Polynomial Hashing) tokenizer */
#define DSZ_OSB    0x04  /* OSB (Orthogonal Sparse biGram) tokenizer */
347 
/*
 * Algorithm bits (values for DSPAM_CTX.algorithms).
 * DSA_ names select the algorithm, DSP_ names select the
 * P-value computation.
 */

#define DSA_GRAHAM      0x01  /* Graham-Bayesian */
#define DSA_BURTON      0x02  /* Burton-Bayesian */
#define DSA_ROBINSON    0x04  /* Robinson's Geometric Mean Test */
#define DSA_CHI_SQUARE  0x08  /* Fisher-Robinson's Chi-Square */
#define DSP_ROBINSON    0x10  /* P-value: Robinson's Technique */
#define DSP_GRAHAM      0x20  /* P-value: Graham's Technique */
#define DSP_MARKOV      0x40  /* P-value: Markov Weighted Technique */
#define DSA_NAIVE       0x80  /* Naive-Bayesian */
358 
/* Operating modes (values for DSPAM_CTX.operating_mode) */

#define DSM_PROCESS   0x00  /* Classify and learn using the configured training mode */
#define DSM_TOOLS     0x01  /* Caller is a utility; no operation will be requested */
#define DSM_CLASSIFY  0x02  /* Classify only; do not learn or update counters */
#define DSM_NONE      0xFF
365 
/* Training modes (values for DSPAM_CTX.training_mode) */

#define DST_TEFT     0x00  /* Train-on-Everything */
#define DST_TOE      0x01  /* Train-on-Error */
#define DST_TUM      0x02  /* Train-until-Mature */
#define DST_NOTRAIN  0xFE  /* No Training */
372 
/* Classification results (DSR_ values) */

#define DSR_ISSPAM      0x01
#define DSR_ISINNOCENT  0x02
#define DSR_NONE        0xFF
378 
/* Classification sources (values for DSPAM_CTX.source) */

#define DSS_ERROR        0x00  /* Retraining an error */
#define DSS_CORPUS       0x01  /* Training a message from corpus */
#define DSS_INOCULATION  0x02  /* Message is an inoculation */
#define DSS_NONE         0xFF  /* Standard inbound processing */
385 
/* Token status bits (stored in _ds_spam_stat.status) */
#define TST_DISK   0x01  /* Value was loaded from the storage interface */
#define TST_DIRTY  0x02  /* Not written to disk since last modified */

/* Token types */
#define DTT_DEFAULT  0x00
#define DTT_BNR      0x01  /* NOTE(review): presumably Bayesian Noise Reduction tokens -- confirm */
393 
/*
 * Sentinel value of -1.  The replacement list is parenthesized so the
 * negative literal always expands as a single operand, whatever
 * expression it is pasted into.
 * NOTE(review): presumably marks a probability that has not been
 * computed yet -- confirm against callers.
 */
#define DSP_UNCALCULATED	(-1)

/* NOTE(review): presumably the window size used by the Burton algorithm -- confirm. */
#define BURTON_WINDOW_SIZE	27
397 
#endif /* _LIBDSPAM_OBJECTS_H */
399