1 /* $Id: libdspam_objects.h,v 1.27 2011/07/11 21:29:57 sbajic Exp $ */ 2 3 /* 4 DSPAM 5 COPYRIGHT (C) 2002-2012 DSPAM PROJECT 6 7 This program is free software: you can redistribute it and/or modify 8 it under the terms of the GNU Affero General Public License as 9 published by the Free Software Foundation, either version 3 of the 10 License, or (at your option) any later version. 11 12 This program is distributed in the hope that it will be useful, 13 but WITHOUT ANY WARRANTY; without even the implied warranty of 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 GNU Affero General Public License for more details. 16 17 You should have received a copy of the GNU Affero General Public License 18 along with this program. If not, see <http://www.gnu.org/licenses/>. 19 20 */ 21 22 #ifndef _LIBDSPAM_OBJECTS_H 23 # define _LIBDSPAM_OBJECTS_H 24 25 #ifdef HAVE_CONFIG_H 26 #include <auto-config.h> 27 #endif 28 29 #include <time.h> 30 #include "config.h" 31 #include "config_shared.h" 32 #include "decode.h" 33 34 #if ((defined(__sun__) && defined(__svr4__)) || (defined(__sun) && defined(__SUNPRO_C))) && !defined(u_int32_t) && !defined(__BIT_TYPES_DEFINED__) 35 #define __BIT_TYPES_DEFINED__ 36 typedef unsigned long long u_int64_t; 37 typedef unsigned int u_int32_t; 38 typedef unsigned short u_int16_t; 39 typedef unsigned char u_int8_t; 40 #endif 41 42 #ifdef _WIN32 43 typedef unsigned int u_int32_t; 44 typedef u_int32_t uid_t; 45 #endif 46 47 extern void *_drv_handle; /* Handle to storage driver library */ 48 49 /* 50 * struct dspam_factor - A single determining factor 51 * 52 * An element containing a determining factor in the dominant calculation of 53 * a message. An array of these are returned to the calling application to 54 * explain libdspam's final classification decision. 55 */ 56 57 struct dspam_factor { 58 char *token_name; 59 float value; 60 }; 61 62 /* 63 * struct _ds_spam_totals - User spam totals 64 * 65 * Spam totals loaded into the user's filter context upon a call to 66 * dspam_init(). This structure represents the user's cumulative statistics. 67 * 68 * spam_learned, innocent_learned 69 * The total number of messages trained on. 70 * 71 * spam_misclassified, innocent_misclassified 72 * The total number of messages that were misclassified by DSPAM, and 73 * submitted for retraining. 74 * 75 * spam_classified, innocent_classified 76 * The total number of messages that were classified by DSPAM, but not 77 * learned. Used exclusively with Train-on-Error mode. 78 * 79 * spam_corpusfed, innocent_corpusfed 80 * The total number of messages supplied by the end-user for training. 81 * 82 * NOTE: The ordering of the variables in the structure must remain 83 * consistent to ensure backward-compatibility with some storage 84 * drivers (such as the Berkeley DB drivers) 85 */ 86 87 struct _ds_spam_totals 88 { 89 long spam_learned; 90 long innocent_learned; 91 long spam_misclassified; 92 long innocent_misclassified; 93 long spam_corpusfed; 94 long innocent_corpusfed; 95 long spam_classified; 96 long innocent_classified; 97 }; 98 99 /* 100 * struct _ds_spam_stat - Statistics for a single token: 101 * 102 * probability 103 * The calculated probability of the token based on the active pvalue 104 * algorithm (selected at configure-time). 105 * 106 * spam_hits, innocent_hits 107 * The total number of times the token has appeared in each class of 108 * message. If Train-on-Error or Train-until-Mature training modes are 109 * employed, these values will not necessarily be updated for every 110 * message. 111 * 112 * status 113 * TST_DISK Value was loaded from the storage interface 114 * TST_DIRTY Statistic is dirty (not written to disk since last modified) 115 */ 116 117 typedef struct _ds_spam_stat 118 { 119 double probability; 120 long spam_hits; 121 long innocent_hits; 122 char status; 123 unsigned long offset; 124 } *ds_spam_stat_t; 125 126 /* 127 * struct _ds_spam_signature - A historical classification signature 128 * 129 * A binary representation of the original training instance. The spam 130 * signature contains all the metadata used in the original decision 131 * about the message, so that a 1:1 retraining can take place if the 132 * message is submitted for retraining (e.g. was misclassified). The 133 * signature contains a series of _ds_signature_token structures, which 134 * house the original set of tokens used and their frequency counts in 135 * the message. A spam signature is a temporary piece of data that is 136 * usually purged from disk after a short period of time. 137 */ 138 139 struct _ds_spam_signature 140 { 141 void *data; 142 unsigned long length; 143 }; 144 145 /* 146 * struct _ds_signature_token - An entry in the classification signature 147 * 148 * A signature token is a single entry in the binary _ds_spam_signature 149 * data blob, representing a single data point from the original 150 * training instance. 151 * 152 * token 153 * The checksum of the original token in the message 154 * 155 * frequency 156 * The token's frequency in the original message 157 */ 158 159 struct _ds_signature_token 160 { 161 unsigned long long token; 162 unsigned char frequency; 163 }; 164 165 /* 166 * struct _ds_config - libdspam attributes configuration 167 * 168 * Each classification context may have an attributes configuration 169 * which is read by various components of libdspam. This structure 170 * contains an array of attributes and the size of the array. 171 */ 172 173 struct _ds_config 174 { 175 config_t attributes; 176 long size; 177 }; 178 179 /* 180 * DSPAM_CTX - The DSPAM Classification Context 181 * 182 * A classification context is attached directly to a filter instance 183 * and supplies the entire context for the filter instance to operate 184 * under. This includes the user and group, operational flags, 185 * training mode, and the message being operated on. The filter 186 * instance also sets specific output variables within the context 187 * such as the result of a classification, confidence level, and 188 * etcetera. 189 * 190 * username, group (input) 191 * The current username and group that is being operated on. 192 * 193 * totals (output) 194 * The set of statistics loaded when dspam_init() is called. 195 * 196 * signature (input, output) 197 * The signature represents a DSPAM signature, and can be supplied 198 * as an input variable for retraining (e.g. in the event of a 199 * misclassification) or used as an output variable to store a 200 * signature generated by the filter instance during normal 201 * classification. 202 * 203 * message (input) 204 * The message being operated on, post-actualization. This can be 205 * left NULL, and libdspam will automatically actualize the message 206 * 207 * probability (output) 208 * The probability of the resulting operation. This is generally a 209 * floating point number between 0 and 1, 1 being the highest 210 * probability of high order classification. 211 * 212 * result (output) 213 * The final result of the requested operation. This is generally 214 * either DSR_ISSPAM, DSR_ISINNOCENT, or DSR_WHITELISTED. 215 * 216 * confidence (output) 217 * The confidence that the filter has in its returned result. 218 * NOTE: Confidence is not always supported, and may be zero. 219 * 220 * operating_mode (input) 221 * Sets the operating mode of the filter instance. This can be one 222 * of the following: 223 * 224 * DSM_PROCESS Classify and learn the supplied message using 225 * whatever training mode is specified 226 * 227 * DSM_CLASSIFY Classify the supplied message only; do not 228 * learn or update any counters. 229 * 230 * DSM_TOOLS Identifies that the calling function is from 231 * a utility, and no operation will be requested. 232 * 233 * training_mode (input) 234 * The training mode sets the type of training the filter instance 235 * should apply to the process. This can be one of: 236 * 237 * DST_TEFT Train-on-Everything 238 * Trains every single message processed 239 * 240 * DST_TOE Train-on-Error 241 * Trains only on a misclassification or 242 * corpus-fed message. 243 * 244 * DST_TUM Train-until-Mature 245 * Trains individual tokens based on the 246 * maturity of the user's dictionary 247 * 248 * DST_NOTRAIN No Training 249 * Process the message but do not perform 250 * any training. 251 * training_buffer (input) 252 * Sets the amount of training-loop buffering. This number is a 253 * range from 0-10 and changes the amount of token sedation used 254 * during the training loop. The higher the number, the more token 255 * statistics are watered down during initial training to prevent 256 * false positives. Setting this value to zero results in no 257 * sedation being performed. 258 * 259 * flags (input) 260 * Applies different fine-tuning behavior to the context: 261 * 262 * DSF_NOISE Apply Bayesian Noise Reduction logic 263 * DSF_SIGNATURE Signature is provided/requested 264 * DSF_WHITELIST Use automatic whitelisting logic 265 * DSF_MERGED Merge user/group data in memory 266 * DSF_UNLEARN Unlearn the message 267 * DSF_BIAS Assign processor bias to unknown tokens 268 * 269 * tokenizer (input) 270 * Specifies which tokenizer to use 271 * 272 * DSZ_WORD Use WORD (uniGram) tokenizer 273 * DSZ_CHAIN Use CHAIN (biGram) tokenizer 274 * DSZ_SBPH Use SBPH (Sparse Binary Polynomial Hashing) tokenizer 275 * DSZ_OSB Use OSB (Orthogonal Sparse biGram) tokenizer 276 * 277 * algorithms (input) 278 * Optional API to override the default algorithms. This value is set 279 * with the default compiled values whenever dspam_create() is called. 280 * 281 * DSA_GRAHAM Graham-Bayesian 282 * DSA_BURTON Burton-Bayesian 283 * DSA_ROBINSON Robinson's Geometric Mean Test 284 * DSA_CHI_SQUARE Fisher-Robinson's Chi-Square 285 * DSA_NAIVE Naive-Bayesian 286 * 287 * P-Value Computations: 288 * 289 * DSP_ROBINSON Robinson's Technique 290 * DSP_GRAHAM Graham's Technique 291 * DSP_MARKOV Markov Weighted Technique 292 * 293 * locked (output) 294 * Identifies that the user's storage is presently locked 295 */ 296 297 typedef struct 298 { 299 struct _ds_spam_totals totals; 300 struct _ds_spam_signature * signature; 301 struct _ds_message * message; 302 struct _ds_config * config; 303 304 char *username; 305 char *group; 306 char *home; /* DSPAM Home */ 307 int operating_mode; /* DSM_ */ 308 int training_mode; /* DST_ */ 309 int training_buffer; /* 0-10 */ 310 int wh_threshold; /* Whitelisting Threshold (default 10) */ 311 int classification; /* DSR_ */ 312 int source; /* DSS_ */ 313 int learned; /* Did we actually learn something? */ 314 int tokenizer; /* DSZ_ */ 315 u_int32_t flags; 316 u_int32_t algorithms; 317 318 int result; 319 char class[32]; 320 float probability; 321 float confidence; 322 323 int locked; 324 void * storage; 325 time_t _process_start; 326 int _sig_provided; 327 328 struct nt * factors; 329 330 } DSPAM_CTX; 331 332 /* Processing Flags */ 333 334 #define DSF_SIGNATURE 0x02 335 #define DSF_BIAS 0x04 336 #define DSF_NOISE 0x08 337 #define DSF_WHITELIST 0x10 338 #define DSF_MERGED 0x20 339 #define DSF_UNLEARN 0x80 340 341 /* Tokenizers */ 342 343 #define DSZ_WORD 0x01 344 #define DSZ_CHAIN 0x02 345 #define DSZ_SBPH 0x03 346 #define DSZ_OSB 0x04 347 348 /* Algorithms */ 349 350 #define DSA_GRAHAM 0x01 351 #define DSA_BURTON 0x02 352 #define DSA_ROBINSON 0x04 353 #define DSA_CHI_SQUARE 0x08 354 #define DSP_ROBINSON 0x10 355 #define DSP_GRAHAM 0x20 356 #define DSP_MARKOV 0x40 357 #define DSA_NAIVE 0x80 358 359 /* Operating Modes */ 360 361 #define DSM_PROCESS 0x00 362 #define DSM_TOOLS 0x01 363 #define DSM_CLASSIFY 0x02 364 #define DSM_NONE 0xFF 365 366 /* Training Modes */ 367 368 #define DST_TEFT 0x00 369 #define DST_TOE 0x01 370 #define DST_TUM 0x02 371 #define DST_NOTRAIN 0xFE 372 373 /* Classification Results */ 374 375 #define DSR_ISSPAM 0x01 376 #define DSR_ISINNOCENT 0x02 377 #define DSR_NONE 0xFF 378 379 /* Classification Sources */ 380 381 #define DSS_ERROR 0x00 /* Retraining an error */ 382 #define DSS_CORPUS 0x01 /* Training a message from corpus */ 383 #define DSS_INOCULATION 0x02 /* Message is an inoculation */ 384 #define DSS_NONE 0xFF /* Standard inbound processing */ 385 386 /* Statuses for token-status bit */ 387 #define TST_DISK 0x01 388 #define TST_DIRTY 0x02 389 390 /* Token Types */ 391 #define DTT_DEFAULT 0x00 392 #define DTT_BNR 0x01 393 394 #define DSP_UNCALCULATED -1 395 396 #define BURTON_WINDOW_SIZE 27 397 398 #endif /* _LIBDSPAM_OBJECTS */ 399