// crm114_config.h -- Configuration for CRM114.

// Copyright 2001-2009 William S. Yerazunis.
// This file is under GPLv3, as described in COPYING.

///////////////////////////////////////////////////////////////////
// Some things here you can change with relative impunity.
// Other things, not so much.  Where there are limiting factors
// noted, please obey them or you may break something important.
// And, of course, realize that this is GPLed software with
// NO WARRANTY - make any changes and that goes double.
///////////////////////////////////////////////////////////////////

#ifndef __CRM114_CONFIG_H__
#define __CRM114_CONFIG_H__

// Do you want all the classifiers?  Or just the "production
// ready ones"?  Comment the next line out if you want everything.
//#define PRODUCTION_CLASSIFIERS_ONLY
//
//
// default size of the variables hashtable (a.k.a. the VHT)
#define DEFAULT_VHT_SIZE 4095

// default limit on the control stack (for catching infinite loops,
// not a preallocated variable)
#define DEFAULT_CSTK_LIMIT 1024

// how many levels (pending operations) will we allow in
// math evaluations.  We _could_ have it be unlimited, but
// this serves as an error catcher in runaway programs.
#define DEFAULT_MATHSTK_LIMIT 1024

// default maximum number of lines in any program file
#define DEFAULT_MAX_PGMLINES 10000

// define maximum number of INSERTs before we think we're in an
// infinite loop...
#define DEFAULT_MAX_INSERTS 1024

// default size of the data window: 8 megabytes.
#define DEFAULT_DATA_WINDOW 8388608
//#define DEFAULT_DATA_WINDOW 16777216
//#define DEFAULT_DATA_WINDOW 1048576

// mmap cacheing length - only actually write out this often.
// set to 0 to disable mmap cacheing and release files faster.
// However, this has a negative speed impact.
// I unset this from 0 -JB
//#define UNMAP_COUNT_MAX 0
//#define UNMAP_COUNT_MAX 2
#define UNMAP_COUNT_MAX 1000

// What's the smallest chunk we actually want to bother reclaiming
// on the fly out of the isolated data area "tdw".  Set this to 1
// for aggressive compression; values like 100 to 10K can speed up
// execution of things that thrash the tdw badly; set to larger
// than the data window size to completely disable the on-the-fly
// reclaimer.  Watch out though- values less than 1 can cause the
// end of one variable to overlap the start of another; this causes
// horrible problems.  FOR LATER IMPROVEMENT: Start with a
// relatively large reclaimer value, then decrease slowly as memory
// becomes more scarce.
#define MAX_RECLAIMER_GAP 5

// How many regex compilations do we cache?  (this saves the time
// to recompile regexes in a loop, but uses memory)  Set to zero to
// disable cacheing.  Note that we cache the actual regex, not the
// source code line, so this happens *after* the regex text is var
// expanded; two different expressions that evaluate to the same
// actual regex will share the same cache slot, which is pretty
// cool.
//
// For programs that don't loop, or reuse the same regex a lot,
// performance is slightly better with cacheing disabled.  But if you
// do reuse the same regexes tens or hundreds of times (say, lots of
// LIAF-loops) then cacheing can accelerate your program significantly.
//
//#define CRM_REGEX_CACHESIZE 0
//#define CRM_REGEX_CACHESIZE 10
#define CRM_REGEX_CACHESIZE 1024
//
// and how do we want the regex cache to work?  RANDOM_ACCESS can
// keep more things around, but is only 1 LRU deep for each slot so
// use plenty of slots, like 256 or more.  LINEAR_SEARCH is a
// strict LRU cache but that's slower; don't use too many slots
// with LINEAR_SEARCH or you'll spend more time searching the cache
// than you would have spent just recompiling the regex.
//
// Be sure to turn on ONLY ONE of these !!!!
//
#define REGEX_CACHE_RANDOM_ACCESS
//#define REGEX_CACHE_LINEAR_SEARCH


// How big a space in a "standard header" (which is relatively new
// and most classifiers don't support yet) do we want to use?  Note
// that changing this will break all previously generated statistics
// files that use this standard header.
#define STATISTICS_FILE_NCHUNKS 1024
#define STATISTICS_FILE_IDENT_STRING_MAX 1024
#define CLASSNAME_TAG_LENGTH 32

// do we use Sparse Binary Polynomial Hashing (sensitive to both
// sequence and spacing of individual words), Token Grab Bag, or
// Token Sequence Sensitive?  Testing against the SpamAssassin
// "hard" database shows that SBPH, TGB, and TGB2, are somewhat
// more accurate than TSS, and about 50% more accurate than First
// Order Only.  However, that's for English, and other natural
// languages may show a different statistical distribution.
//
// Choose ONE of the following:
// SBPH, TGB2, TGB, TSS, or ARBITRARY_WINDOW_LEN:
//
// *** DANGER, WILL ROBINSON ***  You MUST rebuild your .css files from
// samples of text if you change this.
//
//
// Sparse Binary Polynomial Hashing
#define SBPH
//
// Token Grab Bag, noaliasing
//#define TGB2
//
// Token Grab Bag, aliasing
//#define TGB
//
// Token Sequence Sensitive
//#define TSS
//
// First Order Only (i.e. single words, like SpamBayes)
// Note- if you use FOO, you must turn off weights!!
//#define FOO
//
// Generalized format for the window length.
//
// DO NOT SET THIS TO MORE THAN 10 WITHOUT LENGTHENING hctable
// the classifier modules !!!!!!  "hctable" contains the pipeline
// hashing coefficients and needs to be extended to 2 * WINDOW_LEN
//
// Generic window length code
//#define ARBITRARY_WINDOW_LENGTH
//
#define MARKOVIAN_WINDOW_LEN 5
//
#define OSB_BAYES_WINDOW_LEN 5
//
// DO NOT set this to more than 5 without lengthening the
// htup1 and htup2 tables in crm_unified_bayes.c
//
#define UNIFIED_BAYES_WINDOW_LEN 5
//
// Unified tokenization pipeline length.
// maximum window length _ever_.
#define UNIFIED_WINDOW_LEN 32
//
// maximum number of weight vectors to be applied to the pipeline
#define UNIFIED_VECTOR_LIMIT 256

////
// Winnow algorithm parameters here...
//
#define OSB_WINNOW_WINDOW_LEN 5
#define OSB_WINNOW_PROMOTION 1.23
#define OSB_WINNOW_DEMOTION 0.83
//
// Now, choose whether we want to use the "old" or the "new" local
// probability calculation.  The "old" one works slightly better
// for SBPH and much better for TSS, the "new" one works slightly
// better for TGB and TGB2, and _much_ better for FOO
//
// The current default (not necessarily optimal)
// is Markovian SBPH, STATIC_LOCAL_PROBABILITIES,
// LOCAL_PROB_DENOM = 16, and SUPER_MARKOV
//
//#define LOCAL_PROB_DENOM 2.0
#define LOCAL_PROB_DENOM 16.0
//#define LOCAL_PROB_DENOM 256.0
#define STATIC_LOCAL_PROBABILITIES
//#define LENGTHBASED_LOCAL_PROBABILITIES
//
//#define ENTROPIC_WEIGHTS
//#define MARKOV_WEIGHTS
#define SUPER_MARKOV_WEIGHTS
//#define BREYER_CHHABRA_SIEFKES_WEIGHTS
//#define BREYER_CHHABRA_SIEFKES_BASE7_WEIGHTS
//#define BCS_MWS_WEIGHTS
//#define BCS_EXP_WEIGHTS
//
//
// Do we use learncount-based normalization in calculating probabilities?
#define OSB_LEARNCOUNTS
//
// Do we take only the maximum probability feature?
//
//#define USE_PEAK
//
//
// Should we use stochastic microgrooming, or weight-distance microgrooming-
// Make sure ONE of these is turned on.
//#define STOCHASTIC_AMNESIA
#define WEIGHT_DISTANCE_AMNESIA

#if (! defined (STOCHASTIC_AMNESIA) && ! defined (WEIGHT_DISTANCE_AMNESIA))
#error Neither STOCHASTIC_AMNESIA nor WEIGHT_DISTANCE_AMNESIA defined
#elif (defined (STOCHASTIC_AMNESIA) && defined (WEIGHT_DISTANCE_AMNESIA))
#error Both STOCHASTIC_AMNESIA and WEIGHT_DISTANCE_AMNESIA defined
#endif

//
// define the default max chain length in a .css file that triggers
// autogrooming, the rescale factor when we rescale, and how often
// we rescale, and what chance (mask and key) for any particular
// slot to get rescaled when a rescale is triggered for that slot chain.
//#define MICROGROOM_CHAIN_LENGTH 1024
#define MICROGROOM_CHAIN_LENGTH 256
//#define MICROGROOM_CHAIN_LENGTH 64
#define MICROGROOM_RESCALE_FACTOR .75
#define MICROGROOM_STOCHASTIC_MASK 0x0000000F
#define MICROGROOM_STOCHASTIC_KEY 0x00000001
#define MICROGROOM_STOP_AFTER 32  // maximum number of buckets groom-zeroed

#define FEATURE_HIT_INCREMENT_SIZE 7

// define the "block ratio" of how much of a memory data window we're
// willing to suck in from a minion process before we block on
// sucking; the un-sucked part just waits in the minion's stdout
// buffer (and causes the minion to block on output).  Normally a
// factor of 2 (1/4th of the size of a full memory window, or 2
// megabytes in the default configuration) is sufficient.
#define SYSCALL_WINDOW_RATIO 2

// define default internal debug level
#define DEFAULT_INTERNAL_TRACE_LEVEL 0

// define default user debug level
#define DEFAULT_USER_TRACE_LEVEL 0

// define maximum number of parenthesized sub regexes we'll accept
#define MAX_SUBREGEX 256

// define maximum bracket depth nesting we'll allow....
#define MAX_BRACKETDEPTH 256

// define maximum number of iterations allowed for EVAL expansion
//#define MAX_EVAL_ITERATIONS 16384
//#define MAX_EVAL_ITERATIONS 1024
#define MAX_EVAL_ITERATIONS 4096

// define maximum size of a pattern in bytes
#define MAX_PATTERN 16384

// and how long can a variable name be
#define MAX_VARNAME 2048

// define the default number of buckets in a learning file hash table
// (note that this should be a prime number, or at least one with a
// lot of big factors)
//
// this value (2097153) is one more than 2 megs, for a .css of 24 megs
//#define DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 2097153
//
// this value (1048577) is one more than a meg, for a .css of 12 megs
// for the Markovian, and half that for OSB classifiers
#define DEFAULT_SPARSE_SPECTRUM_FILE_LENGTH 1048577
#define DEFAULT_MARKOVIAN_SPARSE_SPECTRUM_FILE_LENGTH 1048577
#define DEFAULT_OSB_BAYES_SPARSE_SPECTRUM_FILE_LENGTH 524287 // Mersenne prime
#define DEFAULT_WINNOW_SPARSE_SPECTRUM_FILE_LENGTH 1048577
//#define DEFAULT_BIT_ENTROPY_FILE_LENGTH 2000000
#define DEFAULT_BIT_ENTROPY_FILE_LENGTH 1000000


// ???
#define OSB_BAYES_MAX_FEATURE_COUNT DEFAULT_OSB_BAYES_SPARSE_SPECTRUM_FILE_LENGTH

#define WINNOW_MAX_FEATURE_COUNT DEFAULT_WINNOW_SPARSE_SPECTRUM_FILE_LENGTH

// For the hyperspace matcher, we need to define a few things.
#define HYPERSPACE_MAX_FEATURE_COUNT 500000

// Stuff for bit-entropic configuration
// Define the size of our alphabet, and how many bits per alph.
#define ENTROPY_ALPHABET_SIZE 2
#define ENTROPY_CHAR_SIZE 1
#define ENTROPY_CHAR_BITMASK 0x1
// What fraction of the nodes in a bit-entropic file should be
// referenceable from the FIR prior arithmetical encoding
// lookaside table?  0.01 is 1% == average of 100 steps to find
// the best node.  0.2 is 20% or 5 steps to find the best node.
#define BIT_ENTROPIC_FIR_LOOKASIDE_FRACTION 0.1
#define BIT_ENTROPIC_FIR_LOOKASIDE_STEP_LIMIT 128
#define BIT_ENTROPIC_FIR_PRIOR_BIT_WEIGHT 0.5
#define BIT_ENTROPIC_SHUFFLE_HEIGHT 1024  // was 256
#define BIT_ENTROPIC_SHUFFLE_WIDTH 1024   // was 256
#define BIT_ENTROPIC_PROBABILITY_NERF 0.0000000000000000001

// Defines for the svm classifier
// All defines you should want to use without getting into
// the nitty details of the SVM are here.  For nitty detail
// defines, see crm_svm_matrix_util.h, crm_svm_quad_prog.h,
// crm_svm_matrix.h, and crm_svm_lib_fncts.h
#define MAX_SVM_FEATURES 100000      //per example
#define SVM_INTERNAL_TRACE_LEVEL 3   //the debug level when internal_trace is
                                     //on
#define SVM_ACCURACY 1e-3            //The accuracy to which to run the solver
                                     //This is the average margin violation NOT
                                     //accounted for by the slack variable.
#define SV_TOLERANCE 0.01            //An example is a support vector if
                                     //theta*y*x <= 1 + SV_TOLERANCE.
                                     //The smaller SV_TOLERANCE, the fewer
                                     //examples will be tagged as support
                                     //vectors.  This will make it faster to
                                     //learn new examples, but possibly less
                                     //accurate.
#define SVM_ADD_CONSTANT 1           //Define this to be 1 if you want a
                                     //constant offset in the classification
                                     //ie h(x) = theta*x + b where b is
                                     //the offset.  If you don't want
                                     //a constant offset (just h(x) = theta*x),
                                     //define this to be 0.
#define SVM_HOLE_FRAC 0.25           //Size of the "hole" left at the end of
                                     //the file to allow for quick appends
                                     //without having to forcibly unmap the
                                     //file.  This is as a fraction of the
                                     //size of the file without the hole.  So
                                     //setting it to 1 doubles the file size.
                                     //If you don't want a hole left, set
                                     //this to 0.
#define SVM_MAX_SOLVER_ITERATIONS 200 //absolute maximum number of loops the
                                      //solver is allowed
#define SVM_CHECK 100                //every SVM_CHECK we look to see if
                                     //the accuracy is better than
                                     //SVM_CHECK_FACTOR*SVM_ACCURACY.
                                     //If it is, we exit the solver loop.
#define SVM_CHECK_FACTOR 2           //every SVM_CHECK we look to see if
                                     //the accuracy is better than
                                     //SVM_CHECK_FACTOR*SVM_ACCURACY.
                                     //If it is, we exit the solver loop.
//defines for SVM microgrooming
#define SVM_GROOM_OLD 10000          //we groom only if there are this many
                                     //examples (or more) not being used in
                                     //solving
#define SVM_GROOM_FRAC 0.9           //we keep this fraction of examples after
                                     //grooming
//defines for svm_smart_mode
#define SVM_BASE_EXAMPLES 1000       //the number of examples we need to see
                                     //before we train
#define SVM_INCR_FRAC 0.1            //if more than this fraction of examples
                                     //are appended, we do a fromstart rather
                                     //than use the incremental method.

// Defines for the PCA classifier
// All defines you should want to use without getting into
// the nitty details of the PCA are here.  For nitty detail
// defines, see crm_svm_matrix_util.h and crm_pca_lib_fncts.h
#define MAX_PCA_FEATURES 100000      //per example
#define PCA_INTERNAL_TRACE_LEVEL 3   //the debug level when internal_trace is on
#define PCA_ACCURACY 1e-8            //accuracy to which to run the solver
#define MAX_PCA_ITERATIONS 1000      //maximum number of solver iterations
#define PCA_CLASS_MAG 50             //the starting class magnitudes.  if this
                                     //is too small, the solver will double it
                                     //and resolve.  if it is too large, the
                                     //solver will be less accurate.
#define PCA_REDO_FRAC 0.001          //if we get this fraction of training
                                     //examples wrong with class mag enabled, we
                                     //retrain with class mag doubled.
#define PCA_MAX_REDOS 20             //The maximum number of redos allowed when
                                     //trying to find the correct class mag.
#define PCA_HOLE_FRAC 0.25           //Size of the "hole" left at the end of
                                     //the file to allow for quick appends
                                     //without having to forcibly unmap the file.
                                     //This is as a fraction of the size of the
                                     //file without the hole.  So setting it to
                                     //1 doubles the file size.  If you don't
                                     //want a hole left, set this to 0.
//defines for PCA microgrooming
#define PCA_GROOM_OLD 10000          //we groom only if there are this many
                                     //examples (or more) present
#define PCA_GROOM_FRAC 0.9           //we keep this fraction of examples after
                                     //grooming

// define the maximum length of a filename
// #define MAX_FILE_NAME_LEN 255

// defaults to system's, if any
// NOTE: the expansion is parenthesized so that expressions such as
// 2 * MAX_FILE_NAME_LEN evaluate correctly (an unparenthesized
// NAME_MAX+1 would bind the + after the surrounding operator).
#ifdef NAME_MAX
#define MAX_FILE_NAME_LEN (NAME_MAX + 1)
#else
#ifdef FILENAME_MAX
#define MAX_FILE_NAME_LEN (FILENAME_MAX + 1)
#else
#define MAX_FILE_NAME_LEN 256
#endif
#endif

// define how many microseconds to sleep waiting for a minion process
// to complete:
//#define MINION_SLEEP_USEC 1000000
//#define MINION_SLEEP_USEC 10000
//#define MINION_SLEEP_USEC 1000
//#define MINION_SLEEP_USEC 100
#define MINION_SLEEP_USEC 10

// How many microseconds to sleep if we're looping on input WINDOW stmt.
// try 1 millisecond for now
#define INPUT_WINDOW_SLEEP_USEC 1000

// DANGER DANGER DANGER DANGER DANGER
// CHANGE THESE AT YOUR PERIL- YOUR .CSS FILES WILL NOT BE
// FORWARD COMPATIBLE WITH ANYONE ELSE'S IF YOU CHANGE THESE.
//
// Maximum number of different .CSS files in a CLASSIFY
#define MAX_CLASSIFIERS 128

// how many classes can the library support?
#define LIBCRM_MAX_CLASSES MAX_CLASSIFIERS

// Maximum length of a stored regex (ugly!  But we need a max length
// in the mapping.  GROT GROT GROT )
#define MAX_REGEX 4096

// Maximum number of coeffs for a particular pipeline.  (ugly!  But we
// need a max length for easy mapping.  GROT GROT GROT )
#define MAX_PIPECOEFFS 512

#define MAX_CLASSIFIER_PARAMS 1024

// Define the type of a token.  This should be either 32-bit or
// 64-bit.  Note that some (for now, all!) classifiers will ignore this.
typedef int CRM114_TOKEN;
// typedef double CRM114_TOKEN;
//
/// END OF DANGER DANGER DANGER DANGER
/////////////////////////////////////////////////////////////////////


// Maximum number of nonfatal errors we'll allow before tossing our
// cookies on a fatal error
#define MAX_NONFATAL_ERRORS 100

// How big is a feature bucket?  Is it a byte, a short, a long,
// a float, whatever.  :)
//#define FEATUREBUCKET_VALUE_MAX 32767
#define FEATUREBUCKET_VALUE_MAX 1000000000
#define FEATUREBUCKET_HISTOGRAM_MAX 4096


////////////////////////////////////////////
//
//      Improved FSCM-specific parameters
//
/////////////////////////////////////////////

// this is 2^18 + 1
// This determines the tradeoff in memory vs. speed/accuracy.
//#define FSCM_DEFAULT_HASH_TABLE_SIZE 262145
//
// This is 1 meg + 1
#define FSCM_DEFAULT_HASH_TABLE_SIZE 1048577

// How long are our prefixes?  Original prefix was 3 but that's
// rather suboptimal for best speed.  6 looks pretty good for speed and
// accuracy.
// prefix length 6 and thickness 10 (200 multiplier) yields 29 / 4147
//
//#define FSCM_DEFAULT_CODE_PREFIX_LEN 3
#define FSCM_DEFAULT_CODE_PREFIX_LEN 6

// The chain cache is a speedup for the FSCM match
// It's indexed modulo the chainstart, with associativity 1.0
#define FSCM_CHAIN_CACHE_SIZE 1048577

////////////////////////////////////////////
//
//     Neural Net parameters
//
////////////////////////////////////////////
#define NN_RETINA_SIZE 8192
#define NN_FIRST_LAYER_SIZE 8
#define NN_HIDDEN_LAYER_SIZE 8
#define NN_MAX_FEATURES 65536

// Neural Net training setups
//
// Note- convergence seems to work well at
// alpha 0.2 init_noise 0.5 stoch_noise 0.1 gain_noise 0.00000001
// alpha 0.2 init_noise 0.2 stoch_noise 0.1 gain_noise 0.00000001
// alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 0.00000001
// alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 0.00000001
// alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 2.0
// alpha 0.2 init_noise 0.2 stoch_noise 0.05 gain_noise 2.0 zerotr 0.9999

#define NN_DEFAULT_ALPHA 0.2
// Initialization noise magnitude
#define NN_INITIALIZATION_NOISE_MAGNITUDE 0.2
// Stochastic noise magnitude
#define NN_DEFAULT_STOCH_NOISE 0.05
// Gain noise magnitude
#define NN_DEFAULT_GAIN_NOISE 2.0
// Zero-tracking factor - factor the weights move toward zero every epoch
#define NN_ZERO_TRACKING 0.9999
// Threshold for back propagation
#define NN_INTERNAL_TRAINING_THRESHOLD 0.1
// Just use 1 neuron excitation per token coming in.
#define NN_N_PUMPS 1
// How many training cycles before we punt out
#define NN_MAX_TRAINING_CYCLES 500
// When doing a "nuke and retry", allow this many training cycles.
#define NN_MAX_TRAINING_CYCLES_FROMSTART 5000
// How often do we cause a punt (we punt every 0th epoch modulo this number)
#define NN_FROMSTART_PUNTING 10000000
// After how many "not needed" cycles do we microgroom this doc away?
#define NN_MICROGROOM_THRESHOLD 1000000
// use the sparse retina design?  No, it's not good.
#define NN_SPARSE_RETINA 0

// End of configurable parameters.



#endif // !__CRM114_CONFIG_H__