// crm114_config.h -- Configuration for CRM114.

// Copyright 2001-2009 William S. Yerazunis.
// This file is under GPLv3, as described in COPYING.

///////////////////////////////////////////////////////////////////
// Some things here you can change with relative impunity.
// Other things, not so much.  Where there are limiting factors
// noted, please obey them or you may break something important.
// And, of course, realize that this is GPLed software with
// NO WARRANTY - make any changes and that goes double.
///////////////////////////////////////////////////////////////////

#ifndef __CRM114_CONFIG_H__
#define __CRM114_CONFIG_H__

// Do you want all the classifiers?  Or just the "production
// ready ones"?  Comment the next line out if you want everything.
//#define PRODUCTION_CLASSIFIERS_ONLY
//
//
// default size of the variables hashtable (a.k.a. the VHT)
#define DEFAULT_VHT_SIZE 4095

// default limit on the control stack (for catching infinite loops,
// not a preallocated variable)
#define DEFAULT_CSTK_LIMIT 1024

// how many levels (pending operations) will we allow in
// math evaluations.  We _could_ have it be unlimited, but
// this serves as an error catcher in runaway programs.
#define DEFAULT_MATHSTK_LIMIT 1024

// default maximum number of lines in any program file
#define DEFAULT_MAX_PGMLINES 10000

// define maximum number of INSERTs before we think we're in an
// infinite loop...
#define DEFAULT_MAX_INSERTS 1024

// default size of the data window: 8 megabytes.
#define DEFAULT_DATA_WINDOW 8388608
//#define DEFAULT_DATA_WINDOW 16777216
//#define DEFAULT_DATA_WINDOW 1048576

// mmap cacheing length - only actually write out this often.
// set to 0 to disable mmap cacheing and release files faster.
// However, this has a negative speed impact.
// I unset this from 0 -JB
//#define UNMAP_COUNT_MAX 0
//#define UNMAP_COUNT_MAX 2
#define UNMAP_COUNT_MAX 1000

// What's the smallest chunk we actually want to bother reclaiming
// on the fly out of the isolated data area "tdw".  Set this to 1
// for aggressive compression; values like 100 to 10K can speed up
// execution of things that thrash the tdw badly; set to larger
// than the data window size to completely disable the on-the-fly
// reclaimer.  Watch out though- values less than 1 can cause the
// end of one variable to overlap the start of another; this causes
// horrible problems.  FOR LATER IMPROVEMENT: Start with a
// relatively large reclaimer value, then decrease slowly as memory
// becomes more scarce.
#define MAX_RECLAIMER_GAP 5

// How many regex compilations do we cache?  (this saves the time
// to recompile regexes in a loop, but uses memory)  Set to zero to
// disable cacheing.  Note that we cache the actual regex, not the
// source code line, so this happens *after* the regex text is var
// expanded; two different expressions that evaluate to the same
// actual regex will share the same cache slot, which is pretty
// cool.
//
// For programs that don't loop, or reuse the same regex a lot,
// performance is slightly better with cacheing disabled.  But if you
// do reuse the same regexes tens or hundreds of times (say, lots of
// LIAF-loops) then cacheing can accelerate your program significantly.
//
//#define CRM_REGEX_CACHESIZE 0
//#define CRM_REGEX_CACHESIZE 10
#define CRM_REGEX_CACHESIZE 1024
//
// and how do we want the regex cache to work?  RANDOM_ACCESS can
// keep more things around, but is only 1 LRU deep for each slot so
// use plenty of slots, like 256 or more.  LINEAR_SEARCH is a
// strict LRU cache but that's slower; don't use too many slots
// with LINEAR_SEARCH or you'll spend more time searching the cache
// than you would have spent just recompiling the regex.
//
// Be sure to turn on ONLY ONE of these !!!!
//
#define REGEX_CACHE_RANDOM_ACCESS
//#define REGEX_CACHE_LINEAR_SEARCH


// How big a space in a "standard header" (which is relatively new
// and most classifiers don't support yet) do we want to use?  Note
// that changing this will break all previously generated statistics
// files that use this standard header.
#define STATISTICS_FILE_NCHUNKS 1024
#define STATISTICS_FILE_IDENT_STRING_MAX 1024
#define CLASSNAME_TAG_LENGTH 32

// do we use Sparse Binary Polynomial Hashing (sensitive to both
// sequence and spacing of individual words), Token Grab Bag, or
// Token Sequence Sensitive?  Testing against the SpamAssassin
// "hard" database shows that SBPH, TGB, and TGB2, are somewhat
// more accurate than TSS, and about 50% more accurate than First
// Order Only.  However, that's for English, and other natural
// languages may show a different statistical distribution.
//
// Choose ONE of the following:
// SBPH, TGB2, TGB, TSS, or ARBITRARY_WINDOW_LEN:
//
// *** DANGER, WILL ROBINSON ***  You MUST rebuild your .css files from
// samples of text if you change this.
//
//
// Sparse Binary Polynomial Hashing
#define SBPH
//
// Token Grab Bag, noaliasing
//#define TGB2
//
// Token Grab Bag, aliasing
//#define TGB
//
// Token Sequence Sensitive
//#define TSS
//
// First Order Only (i.e. single words, like SpamBayes)
// Note- if you use FOO, you must turn off weights!!
//#define FOO
//
// Generalized format for the window length.
//
// DO NOT SET THIS TO MORE THAN 10 WITHOUT LENGTHENING hctable
// the classifier modules !!!!!!  "hctable" contains the pipeline
// hashing coefficients and needs to be extended to 2 * WINDOW_LEN
//
// Generic window length code
//#define ARBITRARY_WINDOW_LENGTH
//
#define MARKOVIAN_WINDOW_LEN 5
//
#define OSB_BAYES_WINDOW_LEN 5
//
// DO NOT set this to more than 5 without lengthening the
// htup1 and htup2 tables in crm_unified_bayes.c
//
#define UNIFIED_BAYES_WINDOW_LEN 5
//
// Unified tokenization pipeline length.
// maximum window length _ever_.
#define UNIFIED_WINDOW_LEN 32
//
// maximum number of weight vectors to be applied to the pipeline
#define UNIFIED_VECTOR_LIMIT 256

////
// Winnow algorithm parameters here...
//
#define OSB_WINNOW_WINDOW_LEN 5
#define OSB_WINNOW_PROMOTION 1.23
#define OSB_WINNOW_DEMOTION 0.83
//
// Now, choose whether we want to use the "old" or the "new" local
// probability calculation.  The "old" one works slightly better
// for SBPH and much better for TSS, the "new" one works slightly
// better for TGB and TGB2, and _much_ better for FOO
//
// The current default (not necessarily optimal)
// is Markovian SBPH, STATIC_LOCAL_PROBABILITIES,
// LOCAL_PROB_DENOM = 16, and SUPER_MARKOV
//
//#define LOCAL_PROB_DENOM 2.0
#define LOCAL_PROB_DENOM 16.0
//#define LOCAL_PROB_DENOM 256.0
#define STATIC_LOCAL_PROBABILITIES
//#define LENGTHBASED_LOCAL_PROBABILITIES
//
//#define ENTROPIC_WEIGHTS
//#define MARKOV_WEIGHTS
#define SUPER_MARKOV_WEIGHTS
//#define BREYER_CHHABRA_SIEFKES_WEIGHTS
//#define BREYER_CHHABRA_SIEFKES_BASE7_WEIGHTS
//#define BCS_MWS_WEIGHTS
//#define BCS_EXP_WEIGHTS
//
//
// Do we use learncount-based normalization in calculating probabilities?
#define OSB_LEARNCOUNTS
//
// Do we take only the maximum probability feature?
//
//#define USE_PEAK
//
//
// Should we use stochastic microgrooming, or weight-distance microgrooming-
// Make sure ONE of these is turned on.
//#define STOCHASTIC_AMNESIA
#define WEIGHT_DISTANCE_AMNESIA

#if (! defined (STOCHASTIC_AMNESIA) && ! defined (WEIGHT_DISTANCE_AMNESIA))
#error Neither STOCHASTIC_AMNESIA nor WEIGHT_DISTANCE_AMNESIA defined
#elif (defined (STOCHASTIC_AMNESIA) && defined (WEIGHT_DISTANCE_AMNESIA))
#error Both STOCHASTIC_AMNESIA and WEIGHT_DISTANCE_AMNESIA defined
#endif

//
// define the default max chain length in a .css file that triggers
// autogrooming, the rescale factor when we rescale, and how often
// we rescale, and what chance (mask and key) for any particular
// slot to get rescaled when a rescale is triggered for that slot chain.
//#define MICROGROOM_CHAIN_LENGTH 1024
#define MICROGROOM_CHAIN_LENGTH 256
//#define MICROGROOM_CHAIN_LENGTH 64
#define MICROGROOM_RESCALE_FACTOR .75
#define MICROGROOM_STOCHASTIC_MASK 0x0000000F
#define MICROGROOM_STOCHASTIC_KEY 0x00000001
#define MICROGROOM_STOP_AFTER 32  // maximum number of buckets groom-zeroed

#define FEATURE_HIT_INCREMENT_SIZE 7

// define the "block ratio" of how much of a memory data window we're
// willing to suck in from a minion process before we block on
// sucking; the un-sucked part just waits in the minion's stdout
// buffer (and causes the minion to block on output).  Normally a
// factor of 2 (1/4th of the size of a full memory window, or 2
// megabytes in the default configuration) is sufficient.
#define SYSCALL_WINDOW_RATIO 2

// define default internal debug level
#define DEFAULT_INTERNAL_TRACE_LEVEL 0

// define default user debug level
#define DEFAULT_USER_TRACE_LEVEL 0

// define maximum number of parenthesized sub regexes we'll accept
#define MAX_SUBREGEX 256

// define maximum bracket depth nesting we'll allow....
#define MAX_BRACKETDEPTH 256

// define maximum number of iterations allowed for EVAL expansion
//#define MAX_EVAL_ITERATIONS 16384
//#define MAX_EVAL_ITERATIONS 1024
#define MAX_EVAL_ITERATIONS 4096

// define maximum size of a pattern in bytes
#define MAX_PATTERN 16384

// and how long can a variable name be
#define MAX_VARNAME 2048

// define the default number of buckets in a learning file hash table
// (note that this should be a prime number, or at least one with a
// lot of big factors)
//
// this value (2097153) is one more than 2 megs, for a .css of 24 megs
//#define DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 2097153
//
// this value (1048577) is one more than a meg, for a .css of 12 megs
// for the Markovian, and half that for OSB classifiers
#define DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 1048577
#define DEFAULT_MARKOVIAN_SPARSE_SPECTRUM_FILE_LENGTH 1048577
#define DEFAULT_OSB_BAYES_SPARSE_SPECTRUM_FILE_LENGTH 524287 // Mersenne prime
#define DEFAULT_WINNOW_SPARSE_SPECTRUM_FILE_LENGTH 1048577
//#define DEFAULT_BIT_ENTROPY_FILE_LENGTH 2000000
#define DEFAULT_BIT_ENTROPY_FILE_LENGTH 1000000


// ???
#define OSB_BAYES_MAX_FEATURE_COUNT DEFAULT_OSB_BAYES_SPARSE_SPECTRUM_FILE_LENGTH

#define WINNOW_MAX_FEATURE_COUNT DEFAULT_WINNOW_SPARSE_SPECTRUM_FILE_LENGTH

// For the hyperspace matcher, we need to define a few things.
#define HYPERSPACE_MAX_FEATURE_COUNT 500000

// Stuff for bit-entropic configuration
// Define the size of our alphabet, and how many bits per alph.
#define ENTROPY_ALPHABET_SIZE 2
#define ENTROPY_CHAR_SIZE 1
#define ENTROPY_CHAR_BITMASK 0x1
// What fraction of the nodes in a bit-entropic file should be
// referenceable from the FIR prior arithmetical encoding
// lookaside table?  0.01 is 1% == average of 100 steps to find
// the best node.  0.2 is 20% or 5 steps to find the best node.
#define BIT_ENTROPIC_FIR_LOOKASIDE_FRACTION 0.1
#define BIT_ENTROPIC_FIR_LOOKASIDE_STEP_LIMIT 128
#define BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT 0.5
#define BIT_ENTROPIC_SHUFFLE_HEIGHT 1024  // was 256
#define BIT_ENTROPIC_SHUFFLE_WIDTH 1024   // was 256
#define BIT_ENTROPIC_PROBABILITY_NERF 0.0000000000000000001

// Defines for the svm classifier
// All defines you should want to use without getting into
// the nitty details of the SVM are here.  For nitty detail
// defines, see crm_svm_matrix_util.h, crm_svm_quad_prog.h,
// crm_svm_matrix.h, and crm_svm_lib_fncts.h
#define MAX_SVM_FEATURES 100000      //per example
#define SVM_INTERNAL_TRACE_LEVEL 3   //the debug level when internal_trace is
                                     //on
#define SVM_ACCURACY 1e-3            //The accuracy to which to run the solver
                                     //This is the average margin violation NOT
                                     //accounted for by the slack variable.
#define SV_TOLERANCE 0.01            //An example is a support vector if
                                     //theta*y*x <= 1 + SV_TOLERANCE.
                                     //The smaller SV_TOLERANCE, the fewer
                                     //examples will be tagged as support
                                     //vectors.  This will make it faster to
                                     //learn new examples, but possibly less
                                     //accurate.
#define SVM_ADD_CONSTANT 1           //Define this to be 1 if you want a
                                     //constant offset in the classification
                                     //ie h(x) = theta*x + b where b is
                                     //the offset.  If you don't want
                                     //a constant offset (just h(x) = theta*x),
                                     //define this to be 0.
#define SVM_HOLE_FRAC 0.25           //Size of the "hole" left at the end of
                                     //the file to allow for quick appends
                                     //without having to forcibly unmap the
                                     //file.  This is as a fraction of the
                                     //size of the file without the hole.  So
                                     //setting it to 1 doubles the file size.
                                     //If you don't want a hole left, set
                                     //this to 0.
#define SVM_MAX_SOLVER_ITERATIONS 200 //absolute maximum number of loops the
                                      //solver is allowed
#define SVM_CHECK 100                //every SVM_CHECK we look to see if
                                     //the accuracy is better than
                                     //SVM_CHECK_FACTOR*SVM_ACCURACY.
                                     //If it is, we exit the solver loop.
#define SVM_CHECK_FACTOR 2           //every SVM_CHECK we look to see if
                                     //the accuracy is better than
                                     //SVM_CHECK_FACTOR*SVM_ACCURACY.
                                     //If it is, we exit the solver loop.
//defines for SVM microgrooming
#define SVM_GROOM_OLD 10000          //we groom only if there are this many
                                     //examples (or more) not being used in
                                     //solving
#define SVM_GROOM_FRAC 0.9           //we keep this fraction of examples after
                                     //grooming
//defines for svm_smart_mode
#define SVM_BASE_EXAMPLES 1000       //the number of examples we need to see
                                     //before we train
#define SVM_INCR_FRAC 0.1            //if more than this fraction of examples
                                     //are appended, we do a fromstart rather
                                     //than use the incremental method.

// Defines for the PCA classifier
// All defines you should want to use without getting into
// the nitty details of the PCA are here.  For nitty detail
// defines, see crm_svm_matrix_util.h and crm_pca_lib_fncts.h
#define MAX_PCA_FEATURES 100000      //per example
#define PCA_INTERNAL_TRACE_LEVEL 3   //the debug level when internal_trace is on
#define PCA_ACCURACY 1e-8            //accuracy to which to run the solver
#define MAX_PCA_ITERATIONS 1000      //maximum number of solver iterations
#define PCA_CLASS_MAG 50             //the starting class magnitudes.  if this
                                     //is too small, the solver will double it
                                     //and resolve.  if it is too large, the
                                     //solver will be less accurate.
#define PCA_REDO_FRAC 0.001          //if we get this fraction of training
                                     //examples wrong with class mag enabled, we
                                     //retrain with class mag doubled.
#define PCA_MAX_REDOS 20             //The maximum number of redos allowed when
                                     //trying to find the correct class mag.
#define PCA_HOLE_FRAC 0.25           //Size of the "hole" left at the end of
                                     //the file to allow for quick appends
                                     //without having to forcibly unmap the file.
                                     //This is as a fraction of the size of the
                                     //file without the hole.  So setting it to
                                     //1 doubles the file size.  If you don't
                                     //want a hole left, set this to 0.
//defines for PCA microgrooming
#define PCA_GROOM_OLD 10000          //we groom only if there are this many
                                     //examples (or more) present
#define PCA_GROOM_FRAC 0.9           //we keep this fraction of examples after
                                     //grooming

// define the maximum length of a filename
// #define MAX_FILE_NAME_LEN 255

// defaults to system's, if any
// NOTE: the expansion is parenthesized so that expressions such as
// 2 * MAX_FILE_NAME_LEN evaluate correctly (an unparenthesized
// NAME_MAX+1 would bind the + after the surrounding operator).
#ifdef NAME_MAX
#define MAX_FILE_NAME_LEN (NAME_MAX + 1)
#else
#ifdef FILENAME_MAX
#define MAX_FILE_NAME_LEN (FILENAME_MAX + 1)
#else
#define MAX_FILE_NAME_LEN 256
#endif
#endif

// define how many microseconds to sleep waiting for a minion process
// to complete:
//#define MINION_SLEEP_USEC 1000000
//#define MINION_SLEEP_USEC 10000
//#define MINION_SLEEP_USEC 1000
//#define MINION_SLEEP_USEC 100
#define MINION_SLEEP_USEC 10

// How many microseconds to sleep if we're looping on input WINDOW stmt.
// try 1 millisecond for now
#define INPUT_WINDOW_SLEEP_USEC 1000

// DANGER DANGER DANGER DANGER DANGER
// CHANGE THESE AT YOUR PERIL- YOUR .CSS FILES WILL NOT BE
// FORWARD COMPATIBLE WITH ANYONE ELSE'S IF YOU CHANGE THESE.
//
// Maximum number of different .CSS files in a CLASSIFY
#define MAX_CLASSIFIERS 128

// how many classes can the library support?
#define LIBCRM_MAX_CLASSES MAX_CLASSIFIERS

// Maximum length of a stored regex (ugly!  But we need a max length
// in the mapping.  GROT GROT GROT )
#define MAX_REGEX 4096

// Maximum number of coeffs for a particular pipeline.  (ugly!  But we
// need a max length for easy mapping.  GROT GROT GROT )
#define MAX_PIPECOEFFS 512

#define MAX_CLASSIFIER_PARAMS 1024

// Define the type of a token.  This should be either 32-bit or
// 64-bit.  Note that some (for now, all!) classifiers will ignore this.
typedef int CRM114_TOKEN;
// typedef double CRM114_TOKEN;
//
/// END OF DANGER DANGER DANGER DANGER
/////////////////////////////////////////////////////////////////////


// Maximum number of nonfatal errors we'll allow before tossing our
// cookies on a fatal error
#define MAX_NONFATAL_ERRORS 100

// How big is a feature bucket?  Is it a byte, a short, a long,
// a float, whatever.  :)
//#define FEATUREBUCKET_VALUE_MAX 32767
#define FEATUREBUCKET_VALUE_MAX 1000000000
#define FEATUREBUCKET_HISTOGRAM_MAX 4096


////////////////////////////////////////////
//
//      Improved FSCM-specific parameters
//
/////////////////////////////////////////////

// this is 2^18 + 1
// This determines the tradeoff in memory vs. speed/accuracy.
//#define FSCM_DEFAULT_HASH_TABLE_SIZE 262145
//
// This is 1 meg + 1
#define FSCM_DEFAULT_HASH_TABLE_SIZE 1048577

// How long are our prefixes?  Original prefix was 3 but that's
// rather suboptimal for best speed.  6 looks pretty good for speed and
// accuracy.
// prefix length 6 and thickness 10 (200 multiplier) yields 29 / 4147
//
//#define FSCM_DEFAULT_CODE_PREFIX_LEN 3
#define FSCM_DEFAULT_CODE_PREFIX_LEN 6

// The chain cache is a speedup for the FSCM match
// It's indexed modulo the chainstart, with associativity 1.0
#define FSCM_CHAIN_CACHE_SIZE 1048577

////////////////////////////////////////////
//
//     Neural Net parameters
//
////////////////////////////////////////////
#define NN_RETINA_SIZE 8192
#define NN_FIRST_LAYER_SIZE 8
#define NN_HIDDEN_LAYER_SIZE 8
#define NN_MAX_FEATURES 65536

// Neural Net training setups
//
// Note- convergence seems to work well at
// alpha 0.2 init_noise 0.5 stoch_noise 0.1 gain_noise 0.00000001
// alpha 0.2 init_noise 0.2 stoch_noise 0.1 gain_noise 0.00000001
// alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 0.00000001
// alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 0.00000001
// alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 2.0
// alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 2.0 zerotr 0.9999

#define NN_DEFAULT_ALPHA 0.2
// Initialization noise magnitude
#define NN_INITIALIZATION_NOISE_MAGNITUDE 0.2
// Stochastic noise magnitude
#define NN_DEFAULT_STOCH_NOISE 0.05
// Gain noise magnitude
#define NN_DEFAULT_GAIN_NOISE 2.0
// Zero-tracking factor - factor the weights move toward zero every epoch
#define NN_ZERO_TRACKING 0.9999
// Threshold for back propagation
#define NN_INTERNAL_TRAINING_THRESHOLD 0.1
// Just use 1 neuron excitation per token coming in.
#define NN_N_PUMPS 1
// How many training cycles before we punt out
#define NN_MAX_TRAINING_CYCLES 500
// When doing a "nuke and retry", allow this many training cycles.
#define NN_MAX_TRAINING_CYCLES_FROMSTART 5000
// How often do we cause a punt (we punt every 0th epoch modulo this number)
#define NN_FROMSTART_PUNTING 10000000
// After how many "not needed" cycles do we microgroom this doc away?
#define NN_MICROGROOM_THRESHOLD 1000000
// use the sparse retina design?  No, it's not good.
#define NN_SPARSE_RETINA 0

// End of configurable parameters.



#endif // !__CRM114_CONFIG_H__