1 /* 2 * Copyright (C) 2002 Laird Breyer 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 3 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write to the Free Software 16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 17 * 18 * Author: Laird Breyer <laird@lbreyer.com> 19 */ 20 21 #ifndef DBACL_H 22 #define DBACL_H 23 24 #ifdef HAVE_CONFIG_H 25 #undef HAVE_CONFIG_H 26 #include "config.h" 27 #endif 28 29 #ifndef VERSION 30 #ifdef PACKAGE_VERSION 31 #define VERSION PACKAGE_VERSION 32 #endif 33 #endif 34 35 #define COPYBLURB "Copyright (c) 2002-2013 L.A. Breyer. All rights reserved.\n" \ 36 "%s comes with ABSOLUTELY NO WARRANTY, and is licensed\n" \ 37 "to you under the terms of the GNU General Public License 3 or later.\n\n" 38 39 #define DEFAULT_CATPATH "DBACL_PATH" 40 /* define this to save category files with a temporary name, then atomically 41 * rename them. This makes corrupt category files nearly impossible, and 42 * obviates the need for file locking in case another instance of dbacl is 43 * trying to read the category while it is being written. 44 */ 45 #define ATOMIC_CATSAVE 46 /* we give our files the 640 permissions - I've added "write" 47 permission because sometimes we want to mmap/readwrite such files 48 so we need those permissions. Also, O_BINARY is not portable, but 49 a good idea for some platforms 50 */ 51 #ifndef O_BINARY 52 #define O_BINARY 0 53 #endif 54 #define ATOMIC_CREATE(x) open(x, O_CREAT|O_EXCL|O_RDWR|O_BINARY, 0640) 55 56 /* we define several memory models, which differ basically 57 in the number of bytes used for the hash tables. Adjust to taste */ 58 59 /* use this for 64-bit hashes */ 60 #undef HUGE_MEMORY_MODEL 61 /* use this for 32-bit hashes */ 62 #define NORMAL_MEMORY_MODEL 63 /* use this for 16-bit hashes */ 64 #undef SMALL_MEMORY_MODEL 65 /* use this for 8-bit hashes */ 66 #undef TINY_MEMORY_MODEL 67 68 /* the following defines set up a tradeoff between 69 modelling accuracy and memory requirements - season to taste 70 (if you often get digitization errors, undef the appropriate macro) */ 71 72 /* digram digitization: avg loss of precision = 0.01 * token size */ 73 #define DIGITIZE_DIGRAMS 74 /* lambda digitization: avg loss of precision = 0.01 */ 75 #define DIGITIZE_LAMBDA 76 /* learner.hash digitization: avg loss of precision = 0.01 */ 77 #define DIGITIZE_LWEIGHTS 78 #if defined HAVE_MBRTOWC 79 80 #include <wctype.h> 81 #include <wchar.h> 82 83 #endif 84 85 #include <limits.h> 86 #include <stdio.h> 87 88 89 #if !defined LOADED_REGEX 90 91 #include <sys/types.h> 92 #include <regex.h> 93 94 #endif 95 96 #if defined HAVE_NETINET_IN_H 97 #include <netinet/in.h> 98 #endif 99 100 #ifndef htonl 101 #define htonl(x) (x) 102 #define ntohl(x) (x) 103 #define htons(x) (x) 104 #define ntohs(x) (x) 105 #endif 106 107 #if defined OS_SUN 108 #include <ieeefp.h> 109 #endif 110 111 /* some systems seem to have broken sys/types */ 112 #if defined OS_SUN || defined OS_HPUX 113 #include <inttypes.h> 114 115 typedef uint8_t u_int8_t; 116 typedef uint16_t u_int16_t; 117 typedef uint32_t u_int32_t; 118 typedef uint64_t u_int64_t; 119 120 #endif 121 122 #ifdef HAVE_MMAP 123 #ifdef HAVE_MADVISE 124 #ifdef HAVE_SYS_MMAN_H 125 126 #include <sys/types.h> 127 #include <sys/mman.h> 128 129 #ifdef OS_SUN 130 #define MADVISE(x,y,z) madvise((caddr_t)(x),y,z) 131 #define MLOCK(x,y) mlock((caddr_t)(x),y) 132 #define MUNLOCK(x,y) munlock((caddr_t)(x),y) 133 #define MUNMAP(x,y) munmap((void *)(x),y) 134 #define MMAP(x,y,z,t,u,v) mmap((void *)(x),y,z,t,u,v) 135 #else 136 #define MADVISE(x,y,z) madvise(x,y,z) 137 #define MLOCK(x,y) mlock(x,y) 138 #define MUNLOCK(x,y) munlock(x,y) 139 #define MUNMAP(x,y) munmap((void *)(x), y) 140 #define MMAP(x,y,z,t,u,v) mmap((void *)(x),y,z,t,u,v) 141 #endif 142 143 #endif 144 #endif 145 #endif 146 147 #ifndef MADVISE 148 #define MAP_FAILED ((void *)-1) 149 #define MADVISE(x,y,z) 150 #define MLOCK(x,y) 151 #define MUNLOCK(x,y) 152 #define MUNMAP(x,y) 153 #define MMAP(x,y,z,t,u,v) NULL 154 #endif 155 156 /* constants used by mmap */ 157 #ifndef PROT_READ 158 #define PROT_READ 0 159 #define PROT_WRITE 0 160 #define PROT_EXEC 0 161 #define PROT_NONE 0 162 #endif 163 164 #define PAGEALIGN(x) ((x) / system_pagesize) * system_pagesize 165 166 /* below, FMT_* macros are used in printf/scanf format strings */ 167 #if defined HUGE_MEMORY_MODEL 168 169 typedef u_int64_t token_count_t; 170 typedef unsigned int token_order_t; /* used in bit-field, therefore uint */ 171 typedef unsigned int token_class_t; /* used in bit-field, therefore uint */ 172 typedef u_int8_t hash_bit_count_t; 173 typedef u_int64_t hash_count_t; 174 typedef unsigned int hash_percentage_t; 175 typedef u_int16_t category_count_t; 176 typedef u_int16_t regex_count_t; 177 typedef u_int64_t document_count_t; 178 typedef u_int16_t confidence_t; 179 180 typedef float weight_t; 181 typedef double score_t; 182 #define FMT_printf_score_t "f" 183 #define FMT_scanf_score_t "lf" 184 #define FMT_printf_integer_t "ld" 185 186 typedef u_int16_t token_stack_t; 187 typedef int charbuf_len_t; 188 typedef u_int16_t alphabet_size_t; 189 typedef u_int16_t smbitmap_t; 190 typedef u_int8_t regex_flags_t; 191 192 typedef int error_code_t; 193 typedef int bool_t; 194 typedef u_int8_t byte_t; 195 196 #if defined DIGITIZE_DIGRAMS && defined DIGITIZE_LAMBDA 197 /* cats not portable because hash value is too big */ 198 #undef PORTABLE_CATS 199 #endif 200 /* keep typedefs and macros togegher */ 201 typedef u_int64_t hash_value_t; 202 #define hton_hash_value_t(x) (x) 203 #define ntoh_hash_value_t(x) (x) 204 typedef u_int16_t digitized_weight_t; 205 #define hton_digitized_weight_t(x) (x) 206 #define ntoh_digitized_weight_t(x) (x) 207 208 /* where token counts wrap around */ 209 #define K_TOKEN_COUNT_MAX ((token_count_t)18446744073709551615U) 210 /* where digrams wrap around */ 211 #define K_DIGRAM_COUNT_MAX ((weight_t)1.0e+9) 212 /* size of hash in bits */ 213 #define MAX_HASH_BITS ((hash_bit_count_t)64) 214 /* for line filtering: maximum number of tokens allowed on a single line */ 215 #define MAX_TOKEN_LINE_STACK ((token_stack_t)16384) 216 /* number of pages we want to use for I/O buffering */ 217 #define BUFFER_MAG 64 218 /* we need 8 byte hash values */ 219 #define JENKINS8 220 #undef JENKINS4 221 222 #elif defined NORMAL_MEMORY_MODEL 223 224 typedef u_int32_t token_count_t; 225 typedef unsigned int token_order_t;/* used in bit-field, therefore uint */ 226 typedef unsigned int token_class_t;/* used in bit-field, therefore uint */ 227 typedef u_int8_t hash_bit_count_t; 228 typedef u_int32_t hash_count_t; 229 typedef unsigned int hash_percentage_t; 230 typedef u_int8_t category_count_t; 231 typedef u_int8_t regex_count_t; 232 typedef u_int32_t document_count_t; 233 typedef u_int16_t confidence_t; 234 235 typedef float weight_t; 236 typedef double score_t; 237 #define FMT_printf_score_t "f" 238 #define FMT_scanf_score_t "lf" 239 #define FMT_printf_integer_t "d" 240 241 typedef u_int8_t token_stack_t; 242 typedef int charbuf_len_t; 243 typedef u_int16_t alphabet_size_t; 244 typedef u_int16_t smbitmap_t; 245 typedef u_int8_t regex_flags_t; 246 247 typedef int error_code_t; 248 typedef int bool_t; 249 typedef u_int8_t byte_t; 250 251 #if defined DIGITIZE_DIGRAMS && defined DIGITIZE_LAMBDA && defined HAVE_NETINET_IN_H 252 #define PORTABLE_CATS 253 #endif 254 /* keep typedefs and macros togegher */ 255 typedef u_int32_t hash_value_t; 256 #define hton_hash_value_t(x) htonl(x) 257 #define ntoh_hash_value_t(x) ntohl(x) 258 typedef u_int16_t digitized_weight_t; 259 #define hton_digitized_weight_t(x) htons(x) 260 #define ntoh_digitized_weight_t(x) ntohs(x) 261 262 /* where token counts wrap around */ 263 #define K_TOKEN_COUNT_MAX ((token_count_t)4294967295U) 264 /* where digrams wrap around */ 265 #define K_DIGRAM_COUNT_MAX ((weight_t)1.0e+9) 266 /* size of hash in bits */ 267 #define MAX_HASH_BITS ((hash_bit_count_t)30) 268 /* for line filtering: maximum number of tokens allowed on a single line */ 269 #define MAX_TOKEN_LINE_STACK ((token_stack_t)255) 270 /* number of pages we want to use for I/O buffering */ 271 #define BUFFER_MAG 32 272 /* we need 4 byte hash values */ 273 #define JENKINS8 274 #undef JENKINS4 275 276 #elif defined SMALL_MEMORY_MODEL 277 278 typedef u_int32_t token_count_t; 279 typedef unsigned int token_order_t;/* used in bit-field, therefore uint */ 280 typedef unsigned int token_class_t;/* used in bit-field, therefore uint */ 281 typedef u_int8_t hash_bit_count_t; 282 typedef u_int16_t hash_count_t; 283 typedef unsigned int hash_percentage_t; 284 typedef u_int8_t category_count_t; 285 typedef u_int8_t regex_count_t; 286 typedef u_int16_t document_count_t; 287 typedef u_int16_t confidence_t; 288 289 typedef float weight_t; 290 typedef double score_t; 291 #define FMT_printf_score_t "f" 292 #define FMT_scanf_score_t "lf" 293 #define FMT_printf_integer_t "d" 294 295 typedef u_int8_t token_stack_t; 296 typedef int charbuf_len_t; 297 typedef u_int16_t alphabet_size_t; 298 typedef u_int16_t smbitmap_t; 299 typedef u_int8_t regex_flags_t; 300 301 typedef int error_code_t; 302 typedef int bool_t; 303 typedef u_int8_t byte_t; 304 305 #if defined DIGITIZE_DIGRAMS && defined DIGITIZE_LAMBDA && defined HAVE_NETINET_IN_H 306 #define PORTABLE_CATS 307 #endif 308 /* keep typedefs and macros togegher */ 309 typedef u_int16_t hash_value_t; 310 #define hton_hash_value_t(x) htons(x) 311 #define ntoh_hash_value_t(x) ntohs(x) 312 typedef u_int16_t digitized_weight_t; 313 #define hton_digitized_weight_t(x) htons(x) 314 #define ntoh_digitized_weight_t(x) ntohs(x) 315 316 /* where token counts wrap around */ 317 #define K_TOKEN_COUNT_MAX ((token_count_t)4294967295U) 318 /* where digrams wrap around */ 319 #define K_DIGRAM_COUNT_MAX ((weight_t)1.0e+9) 320 /* size of hash in bits */ 321 #define MAX_HASH_BITS ((hash_bit_count_t)15) 322 /* for line filtering: maximum number of tokens allowed on a single line */ 323 #define MAX_TOKEN_LINE_STACK ((token_stack_t)128) 324 /* number of pages we want to use for I/O buffering */ 325 #define BUFFER_MAG 16 326 /* we need 4 byte hash values */ 327 #undef JENKINS8 328 #define JENKINS4 329 330 #elif defined TINY_MEMORY_MODEL 331 /* not tested, this model probably doesn't work ;-) */ 332 #undef DIGITIZE_DIGRAMS 333 334 typedef u_int32_t token_count_t; 335 typedef unsigned int token_order_t;/* used in bit-field, therefore uint */ 336 typedef unsigned int token_class_t;/* used in bit-field, therefore uint */ 337 typedef u_int8_t hash_bit_count_t; 338 typedef u_int8_t hash_count_t; 339 typedef unsigned int hash_percentage_t; 340 typedef u_int8_t category_count_t; 341 typedef u_int8_t regex_count_t; 342 typedef u_int8_t document_count_t; 343 typedef u_int16_t confidence_t; 344 345 typedef float weight_t; 346 typedef double score_t; 347 #define FMT_printf_score_t "f" 348 #define FMT_scanf_score_t "lf" 349 #define FMT_printf_integer_t "d" 350 351 typedef u_int8_t token_stack_t; 352 typedef int charbuf_len_t; 353 typedef u_int16_t alphabet_size_t; 354 typedef u_int16_t smbitmap_t; 355 typedef u_int8_t regex_flags_t; 356 357 typedef int error_code_t; 358 typedef int bool_t; 359 typedef u_int8_t byte_t; 360 361 #if defined DIGITIZE_DIGRAMS && defined DIGITIZE_LAMBDA 362 #undef PORTABLE_CATS 363 #endif 364 /* keep typedefs and macros togegher */ 365 typedef u_int8_t hash_value_t; 366 #define hton_hash_value_t(x) (x) 367 #define ntoh_hash_value_t(x) (x) 368 typedef u_int16_t digitized_weight_t; 369 #define hton_digitized_weight_t(x) htons(x) 370 #define ntoh_digitized_weight_t(x) ntohs(x) 371 372 #define K_TOKEN_COUNT_MAX ((token_count_t)4294967295U) 373 /* where digrams wrap around */ 374 #define K_DIGRAM_COUNT_MAX ((weight_t)1.0e+9) 375 /* size of hash in bits */ 376 #define MAX_HASH_BITS ((hash_bit_count_t)8) 377 /* for line filtering: maximum number of tokens allowed on a single line */ 378 #define MAX_TOKEN_LINE_STACK ((token_stack_t)128) 379 /* number of pages we want to use for I/O buffering */ 380 #define BUFFER_MAG 2 381 /* we need 4 byte hash values */ 382 #undef JENKINS8 383 #define JENKINS4 384 385 #endif 386 387 /* this is common to all memory models */ 388 389 #if defined OS_DARWIN 390 /* the system I tested this on didn't seem to like packed structures */ 391 #define PACK_STRUCTS 392 393 #else 394 395 /* disable this if speed is paramount */ 396 #if defined __GNUC__ 397 #define PACK_STRUCTS __attribute__ ((packed)) 398 #else 399 #define PACK_STRUCTS 400 #endif 401 402 #endif 403 404 /* when digitizing transitions, this stands for -infinity */ 405 #define DIGITIZED_WEIGHT_MIN ((digitized_weight_t)0) 406 #define DIGITIZED_WEIGHT_MAX ((digitized_weight_t)USHRT_MAX) 407 #define DIG_FACTOR 5 408 /* maximum number of categories we can handle simultaneously */ 409 #define MAX_CAT ((category_count_t)16383) 410 /* percentage of hash we use */ 411 #define HASH_FULL ((hash_percentage_t)95) 412 /* alphabet size */ 413 #define ASIZE ((alphabet_size_t)256) 414 /* we need three special markers, which cannot be part 415 * of the alphabet. Fortunately, we can use ASCII control 416 * characters. Hopefully, these won't be used for anything important. 417 * Make sure AMIN equals DIAMOND is the last reserved char. 418 */ 419 #define TOKENSEP '\001' 420 #define CLASSEP '\002' 421 #define DIAMOND '\003' 422 #define AMIN DIAMOND 423 #define EOTOKEN CLASSEP 424 /* enough room to pad token with NULL, DIAMOND, CLASSEP and class */ 425 #define EXTRA_CLASS_LEN 2 426 #define EXTRA_TOKEN_LEN (EXTRA_CLASS_LEN + 2) 427 #define MULTIBYTE_EPSILON 10 /* enough for a multibyte char and a null char */ 428 429 /* make sure a character is in the alphabet range */ 430 #define CLIP_ALPHABET(x) x = (((unsigned char)x) < AMIN) ? AMIN : (x) 431 /* the space outside of AMIN-ASIZE is used for auxiliary RESERVED_* data */ 432 #define RESERVED_UNUSED0 0 /* dig[0][0-255] */ 433 #define RESERVED_MARGINAL 2 /* dig[2][0-255] counts single char marginal freqs */ 434 #define RESERVED_TOKLEN 1 /* dig[1][0-MAX_TOKEN_LEN] counts token lengths */ 435 436 /* decides how we compute the shannon entropy */ 437 #undef SHANNON_STIRLING 438 439 /* maximum size of a token, beyond that rest is ignored (ie put into 440 * another token) The value should not be too big, because it protects 441 * against extreme probabilities failing to digitize properly. 442 * 443 * Here's the back-of-the-envelope calculation: In the model, for each 444 * token, we save the reference weight, and the lambda weight. It 445 * seems that the lambdas are of the same order as the corresponding 446 * n-gram's reference weight, 447 * 448 * The reference weights are most extreme for the uniform, equal to 449 * about -5 per token character. Thus, for an n-gram lambda weight, 450 * the most extreme values are about (-5) * total number of 451 * characters. Our calculations blow up for n >= 6 anyway, so the most 452 * extreme value in the worst case (=7) should be about (-5) * 453 * MAX_TOKEN_LEN * 7. 454 * 455 * The other constraint is that we must be able to digitize the 456 * weights, and with DIG_FACTOR = 5, the extreme weight values can be 457 * up to 2048. Giving us a margin of error, we assume that 35 * 458 * MAX_TOKEN_LEN < 1024, which gives MAX_TOKEN_LEN = 30. 459 */ 460 #define MAX_TOKEN_LEN ((charbuf_len_t)30) 461 #define TOKEN_LIST_GROW 1048576L 462 463 /* user options */ 464 #define U_OPTION_CLASSIFY 1 465 #define U_OPTION_LEARN 2 466 #define U_OPTION_FASTEMP 3 467 #define U_OPTION_CUTOFF 4 468 #define U_OPTION_VERBOSE 5 469 #define U_OPTION_STDIN 6 470 #define U_OPTION_SCORES 7 471 #define U_OPTION_POSTERIOR 8 472 #define U_OPTION_FILTER 9 473 #define U_OPTION_DEBUG 10 474 #define U_OPTION_DUMP 12 475 #define U_OPTION_APPEND 13 476 #define U_OPTION_DECIMATE 14 477 #define U_OPTION_GROWHASH 15 478 #define U_OPTION_INDENTED 16 479 #define U_OPTION_NOZEROLEARN 17 480 #define U_OPTION_MMAP 21 481 #define U_OPTION_CONFIDENCE 22 482 #define U_OPTION_VAR 23 483 #define U_OPTION_HM_ADDRESSES 24 484 #define U_OPTION_CLASSIFY_MULTIFILE 25 485 #define U_OPTION_PRIOR_CORRECTION 26 486 #define U_OPTION_MEDIACOUNTS 27 487 488 /* model options */ 489 #define M_OPTION_REFMODEL 1 490 #define M_OPTION_TEXT_FORMAT 2 491 #define M_OPTION_MBOX_FORMAT 3 492 #define M_OPTION_XML 4 493 #define M_OPTION_I18N 5 494 #define M_OPTION_CASEN 6 495 #define M_OPTION_CALCENTROPY 7 496 #define M_OPTION_MULTINOMIAL 8 497 #define M_OPTION_HEADERS 13 498 #define M_OPTION_PLAIN 14 499 #define M_OPTION_NOPLAIN 15 500 #define M_OPTION_SHOW_LINKS 16 501 #define M_OPTION_SHOW_ALT 17 502 #define M_OPTION_HTML 18 503 #define M_OPTION_XHEADERS 19 504 #define M_OPTION_SHOW_SCRIPT 21 505 #define M_OPTION_SHOW_HTML_COMMENTS 22 506 #define M_OPTION_USE_STDTOK 23 507 #define M_OPTION_ATTACHMENTS 24 508 #define M_OPTION_WARNING_BAD 25 509 #define M_OPTION_SHOW_STYLE 26 510 #define M_OPTION_SHOW_FORMS 28 511 #define M_OPTION_NOHEADERS 29 512 #define M_OPTION_NGRAM_STRADDLE_NL 30 513 #define M_OPTION_THEADERS 31 514 515 /* category options */ 516 #define C_OPTION_MMAPPED_HASH 1 517 518 519 typedef u_int32_t options_t; /* make sure big enough for all options */ 520 typedef enum { 521 DT_DEFAULT=0, 522 DT_UNIFORM, DT_DIRICHLET, DT_MAXENT, DT_MLE, DT_IID 523 } digtype_t; 524 typedef enum { 525 CP_DEFAULT=0, 526 CP_CHAR, CP_ALPHA, CP_ALNUM, CP_GRAPH, 527 CP_CEF, CP_ADP, CP_CEF2 528 } charparser_t; 529 #define FMT_printf_options_t "d" 530 #define FMT_scanf_options_t "ld" 531 532 typedef long int re_bitfield; 533 #define MAX_RE ((regex_count_t)(8 * sizeof(re_bitfield))) 534 #define INVALID_RE 0 535 /* maximum number of tagged subexpressions we can handle for each regex */ 536 #define MAX_SUBMATCH ((token_order_t)9) 537 538 typedef enum {gcUNDEF = 0, gcDISCARD, gcTOKEN, gcTOKEN_END, gcIGNORE} good_char_t; 539 540 541 /* macros */ 542 543 /* used for digitizing */ 544 #if defined DIGITIZE_LWEIGHTS 545 546 /* use this when digitizing positive weights */ 547 #define PACK_LWEIGHTS(a) ((digitized_weight_t)digitize_a_weight(a,1)) 548 #define UNPACK_LWEIGHTS(a) ((weight_t)undigitize_a_weight(a,1)) 549 550 /* use this when digitizing negative weights */ 551 #define PACK_RWEIGHTS(a) ((digitized_weight_t)digitize_a_weight(-(a),1)) 552 #define UNPACK_RWEIGHTS(a) (-(weight_t)undigitize_a_weight(a,1)) 553 554 #define DW "w" 555 556 #else 557 558 #define PACK_LWEIGHTS(a) ((weight_t)(a)) 559 #define UNPACK_LWEIGHTS(a) ((weight_t)(a)) 560 561 #define PACK_RWEIGHTS(a) ((weight_t)(a)) 562 #define UNPACK_RWEIGHTS(a) ((weight_t)(a)) 563 564 #define DW ":" 565 566 #endif 567 568 #if defined DIGITIZE_LAMBDA 569 570 #define PACK_LAMBDA(a) ((digitized_weight_t)digitize_a_weight(a,1)) 571 #define UNPACK_LAMBDA(a) ((weight_t)undigitize_a_weight(a,1)) 572 #define DL "l" 573 574 #else 575 576 #define PACK_LAMBDA(a) ((weight_t)(a)) 577 #define UNPACK_LAMBDA(a) ((weight_t)(a)) 578 #define DL ":" 579 580 #endif 581 582 #if defined DIGITIZE_DIGRAMS 583 584 #define PACK_DIGRAMS(a) ((digitized_weight_t)digitize_a_weight(-(a),1)) 585 #define UNPACK_DIGRAMS(a) (-(weight_t)undigitize_a_weight(a,1)) 586 #define SIZEOF_DIGRAMS (sizeof(digitized_weight_t)) 587 #define DD "d" 588 589 #else 590 591 #define PACK_DIGRAMS(a) ((weight_t)(a)) 592 #define UNPACK_DIGRAMS(a) ((weight_t)(a)) 593 #define SIZEOF_DIGRAMS (sizeof(weight_t)) 594 #define DD ":" 595 596 #endif 597 598 #define CLIP_LAMBDA_TOL(x) (x < 1.0/(1<<DIG_FACTOR) ? 1.0/(1<<DIG_FACTOR) : x) 599 600 /* used in hash code */ 601 602 #define FILLEDP(a) ((a)->id) 603 #define EQUALP(a,b) ((a)==(b)) 604 #define SET(a,b) (a = (b)) 605 606 #define SETMARK(a) ((a)->typ.mark = (unsigned int)1) 607 #define UNSETMARK(a) ((a)->typ.mark = (unsigned int)0) 608 #define MARKEDP(a) ((a)->typ.mark == (unsigned int)1) 609 610 #define NOTNULL(x) ((x) > 0) 611 612 #define MAXIMUM(x,y) (((x)<(y))?(y):(x)) 613 #define INCREMENT(x,y,z) if( (x) < (y) ) { (x)++; } else { z = 1; } 614 #define INCREASE(x,d,y,z) if( (x) < ((y)-(d)) ) { (x) += (d); } else { z = 1; } 615 616 #if defined PORTABLE_CATS 617 #define SIGNATURE VERSION " " DD DL DW " " "portable" 618 619 #define NTOH_ID(x) ntoh_hash_value_t(x) 620 #define HTON_ID(x) hton_hash_value_t(x) 621 622 #define NTOH_DIGRAM(x) ntoh_digitized_weight_t(x) 623 #define HTON_DIGRAM(x) hton_digitized_weight_t(x) 624 625 #define NTOH_LAMBDA(x) ntoh_digitized_weight_t(x) 626 #define HTON_LAMBDA(x) hton_digitized_weight_t(x) 627 628 #else 629 #define SIGNATURE VERSION " " DD DL DW " " TARGETCPU 630 631 #define NTOH_ID(x) (x) 632 #define HTON_ID(x) (x) 633 634 #define NTOH_DIGRAM(x) (x) 635 #define HTON_DIGRAM(x) (x) 636 637 #define NTOH_LAMBDA(x) (x) 638 #define HTON_LAMBDA(x) (x) 639 640 #endif 641 642 /* used by both category load and learner save functions */ 643 #define MAGIC_BUFSIZE 512 644 #define MAGIC1 "# dbacl " SIGNATURE " category %s %s\n" 645 #define MAGIC1_LEN (17 + strlen(SIGNATURE)) 646 #define MAGIC2_i "# entropy %" FMT_scanf_score_t \ 647 " logZ %" FMT_scanf_score_t " max_order %hd" \ 648 " type %s\n" 649 #define MAGIC2_o "# entropy %" FMT_printf_score_t \ 650 " logZ %" FMT_printf_score_t " max_order %hd" \ 651 " type %s\n" 652 #define MAGIC3 "# hash_size %hd" \ 653 " features %ld unique_features %ld" \ 654 " documents %ld\n" 655 #define MAGIC4_i "# options %" FMT_scanf_options_t " %hd %hd (%s)\n" 656 #define MAGIC4_o "# options %" FMT_printf_options_t " %hd %hd (%s)\n" 657 #define MAGIC5_i "# regex %s\n" 658 #define MAGIC5_o "# regex %s||%s\n" 659 #define MAGIC5_wo "# regex %ls||%s\n" 660 #define MAGIC7_i "# antiregex %s\n" 661 #define MAGIC7_o "# antiregex %s||%s\n" 662 #define MAGIC7_wo "# antiregex %ls||%s\n" 663 #define MAGIC9 "# min_feature_count %ld max_feature_count %ld\n" 664 #define RESTARTPOS 8 665 #define MAGIC6 "#\n" 666 #define MAGIC8_i "# shannon %" FMT_scanf_score_t \ 667 " shannon_s2 %" FMT_scanf_score_t "\n" 668 #define MAGIC8_o "# shannon %" FMT_printf_score_t \ 669 " shannon_s2 %" FMT_printf_score_t "\n" 670 #define MAGIC10_i "# alpha %" FMT_scanf_score_t \ 671 " beta %" FMT_scanf_score_t \ 672 " mu %" FMT_scanf_score_t \ 673 " s2 %" FMT_scanf_score_t "\n" 674 #define MAGIC10_o "# alpha %" FMT_printf_score_t \ 675 " beta %" FMT_printf_score_t \ 676 " mu %" FMT_printf_score_t \ 677 " s2 %" FMT_printf_score_t "\n" 678 #define MAGIC11 "# medialp " 679 680 #define MAGIC_ONLINE "# dbacl " SIGNATURE " online memory dump\n" 681 682 #define MAGIC_DUMP "# lambda | dig_ref | count | id | token\n" 683 #define MAGIC_DUMPTBL_o "%9.3f %9.3f %7" FMT_printf_integer_t " %8lx " 684 #define MAGIC_DUMPTBL_i "%f %f %d %lx " 685 686 /* data structures */ 687 #define TOKEN_CLASS_MAX 16 688 #define TOKEN_ORDER_MAX 8 689 typedef struct { 690 token_class_t cls: 4; 691 token_order_t order: 3; 692 unsigned int mark: 1; 693 } PACK_STRUCTS token_type_t; 694 695 696 typedef struct { 697 hash_value_t id; 698 token_count_t count; 699 } h_item_t; 700 701 typedef struct { 702 hash_count_t max_tokens; 703 hash_bit_count_t max_hash_bits; 704 token_count_t full_token_count; 705 token_count_t unique_token_count; 706 h_item_t *hash; 707 bool_t track_features; 708 h_item_t *feature_stack[MAX_TOKEN_LINE_STACK]; 709 token_stack_t feature_stack_top; 710 int hashfull_warning; 711 } empirical_t; 712 713 typedef struct { 714 hash_value_t id; 715 #if defined DIGITIZE_LAMBDA 716 digitized_weight_t lam; 717 #else 718 weight_t lam; 719 #endif 720 } PACK_STRUCTS c_item_t; 721 722 typedef enum {simple, sequential} mtype; 723 724 typedef struct { 725 char *filename; 726 char *fullfilename; 727 token_order_t max_order; 728 token_count_t fcomplexity; 729 token_count_t model_unique_token_count; 730 token_count_t model_full_token_count; 731 document_count_t model_num_docs; 732 hash_count_t max_tokens; 733 hash_bit_count_t max_hash_bits; 734 re_bitfield retype; 735 score_t logZ; 736 score_t divergence; 737 score_t renorm; 738 score_t delta; 739 score_t complexity; 740 score_t score; 741 score_t score_div; 742 score_t score_s2; 743 score_t score_shannon; 744 score_t shannon; 745 score_t shannon_s2; 746 score_t alpha; 747 score_t beta; 748 score_t mu; 749 score_t s2; 750 score_t prior; 751 token_count_t fmiss; 752 token_count_t mediacounts[TOKEN_CLASS_MAX]; 753 struct { 754 mtype type; 755 options_t options; 756 charparser_t cp; 757 digtype_t dt; 758 } model; 759 options_t c_options; 760 c_item_t *hash; 761 byte_t *mmap_start; 762 long mmap_offset; 763 #if defined DIGITIZE_DIGRAMS 764 digitized_weight_t dig[ASIZE][ASIZE]; 765 #else 766 weight_t dig[ASIZE][ASIZE]; 767 #endif 768 } category_t; 769 770 typedef struct { 771 token_count_t count; 772 weight_t B; /* mustn't digitize this :-( */ 773 #if defined DIGITIZE_LAMBDA 774 digitized_weight_t lam; 775 #else 776 weight_t lam; 777 #endif 778 union { 779 struct { 780 #if defined DIGITIZE_LWEIGHTS 781 digitized_weight_t ltrms; 782 digitized_weight_t dref; 783 #else 784 weight_t ltrms; 785 weight_t dref; 786 #endif 787 } min; 788 struct { 789 token_count_t eff; 790 } read; 791 } tmp; 792 hash_value_t id; 793 token_type_t typ; 794 } PACK_STRUCTS l_item_t; 795 796 typedef struct { 797 hash_value_t *stack; 798 hash_count_t top; 799 hash_count_t max; 800 score_t shannon; 801 } emplist_t; 802 803 typedef struct { 804 char *filename; 805 struct { 806 FILE *file; 807 char *filename; 808 void *iobuf; 809 long offset; 810 long used; 811 off_t avail; 812 byte_t *mmap_start; 813 long mmap_offset; 814 size_t mmap_length; 815 long mmap_cursor; 816 } tmp; 817 re_bitfield retype; 818 token_order_t max_order; 819 token_count_t fixed_order_token_count[MAX_SUBMATCH]; 820 token_count_t fixed_order_unique_token_count[MAX_SUBMATCH]; 821 hash_bit_count_t max_hash_bits; 822 hash_count_t max_tokens; 823 token_count_t full_token_count; 824 token_count_t unique_token_count; 825 token_count_t tmax; 826 score_t logZ; 827 score_t divergence; 828 score_t shannon; 829 score_t shannon2; 830 score_t alpha; 831 score_t beta; 832 score_t mu; 833 score_t s2; 834 score_t mediaprobs[TOKEN_CLASS_MAX]; 835 struct { 836 options_t options; 837 charparser_t cp; 838 digtype_t dt; 839 int tmin; 840 } model; 841 options_t u_options; 842 byte_t *mmap_start; 843 long mmap_learner_offset; 844 long mmap_hash_offset; 845 l_item_t *hash; 846 weight_t dig[ASIZE][ASIZE]; 847 long int regex_token_count[MAX_RE + 1]; 848 struct { 849 score_t A; 850 score_t S; 851 document_count_t count; 852 document_count_t nullcount; 853 bool_t skip; 854 #define RESERVOIR_SIZE 25 855 /* #define RESERVOIR_SIZE 12 */ 856 /* the reservoir size constrains the accuracy of the variance 857 * estimate. Since this is a heavy computation, we want 858 * to choose the lowest value we can get away with. Here 12 859 * gives an estimate for the error term to within sigma/3, which 860 * hopefully is godd enough for most cases. 861 */ 862 emplist_t emp; 863 emplist_t reservoir[RESERVOIR_SIZE]; 864 } doc; 865 } learner_t; 866 /* this is used when minimizing learner divergence */ 867 #define MAX_LAMBDA_JUMP 100 868 869 typedef struct { 870 double alpha; 871 double u[ASIZE]; 872 } dirichlet_t; 873 874 typedef struct { 875 regex_t regex; 876 char *string; 877 smbitmap_t submatches; 878 regex_flags_t flags; 879 } myregex_t; 880 881 #define MAX_BOUNDARIES 8 882 883 #define MAX_BOUNDARY_BUFSIZE 70 884 885 typedef enum { ceUNDEF, ceID, ceB64, ceQP, ceBIN, ceSEVEN} MIME_Content_Encoding; 886 typedef enum { 887 ctUNDEF, 888 ctTEXT_PLAIN, ctTEXT_RICH, ctTEXT_HTML, ctTEXT_XML, ctTEXT_SGML, ctTEXT_UNKNOWN, 889 ctIMAGE, 890 ctAUDIO, 891 ctVIDEO, 892 ctMODEL, 893 ctMESSAGE_RFC822, 894 ctOTHER, 895 ctOCTET_STREAM, 896 ctAPPLICATION_MSWORD 897 } MIME_Content_Type; 898 899 typedef struct { 900 MIME_Content_Type type; 901 MIME_Content_Encoding encoding; 902 } MIME_Struct; 903 904 typedef enum { htSTANDARD, htEXTENDED, htTRACE, htMIME, htCONT, htUNDEF } HEADER_Type; 905 906 typedef enum { msUNDEF=1, msHEADER, msBODY, msATTACH} Mstate; 907 typedef enum { msuUNDEF=1, msuTRACK, msuMIME, msuARMOR, msuOTHER } Msubstate; 908 typedef enum { mhsUNDEF=1, mhsSUBJECT, mhsFROM, mhsTO, mhsMIME, mhsXHEADER, mhsTRACE} Mhstate; 909 typedef enum { maUNDEF=1, maENABLED} Marmor; 910 typedef enum { psPLAIN, psUUENCODE } Mplainstate; 911 typedef enum { hidUNDEF=1, hidCONTINUATION, 912 hidRECEIVED, hidRETURN_PATH, hidRETURN_RECEIPT_TO, hidREPLY_TO, 913 hidMESSAGE_ID, hidREFERENCES, hidIN_REPLY_TO, 914 hidRESENT_, hidORIGINAL_, 915 hidFROM, hidCC, hidBCC, hidSENT, hidSENDER, 916 hidTO, 917 hidSUBJECT, 918 hidCONTENT_, hidMIME_VERSION, 919 hidLIST_, 920 hidX_, 921 hidUSER_AGENT, 922 hidX_MS, hidCATEGORY, hidPRIORITY, hidIMPORTANCE, hidTHREAD_, 923 hidCOMMENTS, hidKEYWORDS, hidNOTE 924 } Mheaderid; 925 926 typedef struct { 927 char *cache; 928 char *data_ptr; 929 size_t cache_len; 930 size_t max_line_len; 931 } decoding_cache; 932 933 #if defined HAVE_MBRTOWC 934 935 typedef struct { 936 wchar_t *cache; 937 wchar_t *data_ptr; 938 size_t cache_len; 939 size_t max_line_len; 940 } w_decoding_cache; 941 942 #endif 943 944 typedef struct { 945 Mstate state; 946 Msubstate substate; 947 Mhstate hstate; 948 Mheaderid hid; 949 Marmor armor; 950 MIME_Struct header, body; 951 bool_t prev_line_empty; 952 bool_t skip_until_boundary; 953 bool_t corruption_check; 954 bool_t skip_header; 955 char strip_header_char; 956 #if defined HAVE_MBRTOWC 957 wchar_t w_strip_header_char; 958 #endif 959 Mplainstate plainstate; 960 struct { 961 int size[MAX_BOUNDARIES]; 962 char identifier[MAX_BOUNDARIES][MAX_BOUNDARY_BUFSIZE]; 963 #if defined HAVE_MBRTOWC 964 wchar_t w_identifier[MAX_BOUNDARIES][MAX_BOUNDARY_BUFSIZE]; 965 #endif 966 int index; 967 bool_t was_end; 968 } boundary; 969 decoding_cache b64_dc; 970 decoding_cache qp_dc; 971 #if defined HAVE_MBRTOWC 972 w_decoding_cache w_b64_dc; 973 w_decoding_cache w_qp_dc; 974 #endif 975 } MBOX_State; 976 977 typedef enum {TEXT=1, XTAG, XTAGQUOTE, XTAGDQUOTE, XTAGPREQ, TAG, TAGQUOTE, TAGDQUOTE, TAGPREQ, CMNT, DISABLED} Xstate; 978 typedef enum {ALT=1, SRC, SRC_NETLOC, SRC_NETLOC_PREFIX, SRC_NETLOC_PATH, SRC_NETLOC_SUFFIX, UNDEF, JSCRIPT, ASTYLE} Xattribute; 979 typedef enum {xpDUMB=1, xpHTML, xpSMART} Xparser; 980 typedef enum {SCRIPT=1,STYLE,COMMENT,NOFRAMES,NOEMBED,NOSCRIPT,NOLAYER,TITLE,VISIBLE} Xhide; 981 982 typedef struct { 983 Xstate state; 984 Xattribute attribute; 985 Xparser parser; 986 Xhide hide; 987 } XML_State; 988 989 typedef enum {xmlRESET,xmlDISABLE,xmlSMART,xmlHTML,xmlDUMB,xmlUNDEF} XML_Reset; 990 991 #ifdef __cplusplus 992 extern "C" 993 { 994 #endif 995 996 /* these are defined in dbacl.c */ 997 void sanitize_options(); 998 int set_option(int op, char *optarg); 999 1000 void init_learner(learner_t *learner, char *opath, bool_t readonly); 1001 void free_learner(learner_t *learner); 1002 1003 void reset_mbox_messages(learner_t *learner, MBOX_State *mbox); 1004 void count_mbox_messages(learner_t *learner, Mstate mbox_state, char *buf); 1005 void calc_shannon(learner_t *learner); 1006 void update_shannon_partials(learner_t *learner, bool_t fulldoc); 1007 void optimize_and_save(learner_t *learner); 1008 1009 l_item_t *find_in_learner(learner_t *learner, hash_value_t id); 1010 bool_t grow_learner_hash(learner_t *learner); 1011 void hash_word_and_learn(learner_t *learner, 1012 char *tok, token_type_t tt, regex_count_t re); 1013 1014 void make_dirichlet_digrams(learner_t *learner); 1015 void make_uniform_digrams(learner_t *learner); 1016 void transpose_digrams(learner_t *learner); 1017 1018 bool_t read_online_learner_struct(learner_t *learner, char *opath, bool_t readonly); 1019 void write_online_learner_struct(learner_t *learner, char *opath); 1020 error_code_t save_learner(learner_t *learner, char *opath); 1021 1022 1023 /* these are defined in catfun.c */ 1024 char *sanitize_path(char *in, char *extension); 1025 error_code_t sanitize_model_options(options_t *to, charparser_t *mcp, category_t *cat); 1026 /*@shared@*/ char *print_model_options(options_t opt, charparser_t mcp, /*@out@*/ char *buf); 1027 char *print_user_options(options_t opt, char *buf); 1028 1029 void init_empirical(empirical_t *emp, hash_count_t dmt, hash_bit_count_t dmhb); 1030 void free_empirical(empirical_t *emp); 1031 void clear_empirical(empirical_t *emp); 1032 h_item_t *find_in_empirical(empirical_t *emp, hash_value_t id); 1033 score_t empirical_entropy(empirical_t *emp); 1034 1035 1036 void init_category(category_t *cat); 1037 void free_category(category_t *cat); 1038 c_item_t *find_in_category(category_t *cat, hash_value_t id); 1039 void init_purely_random_text_category(category_t *cat); 1040 error_code_t load_category(category_t *cat); 1041 error_code_t load_category_header(FILE *input, category_t *cat); 1042 error_code_t open_category(category_t *cat); 1043 void reload_all_categories(); 1044 1045 void score_word(char *tok, token_type_t tt, regex_count_t re); 1046 confidence_t gamma_pvalue(category_t *cat, double obs); 1047 1048 /* file format handling in fh.c */ 1049 void init_file_handling(); 1050 void cleanup_file_handling(); 1051 1052 token_class_t get_token_class(); 1053 regex_count_t load_regex(char *buf); 1054 void free_all_regexes(); 1055 1056 /* common multibyte and wide char functions in mbw.c */ 1057 good_char_t good_char(char *c); 1058 void std_tokenizer(char *p, char **pq, char *hbuf, 1059 token_order_t *hbuf_order, token_order_t max_order, 1060 void (*word_fun)(char *, token_type_t, regex_count_t), 1061 token_type_t (*get_tt)(token_order_t)); 1062 void regex_tokenizer(char *p, int i, 1063 void (*word_fun)(char *, token_type_t, regex_count_t), 1064 token_type_t (*get_tt)(token_order_t)); 1065 void init_decoding_caches(MBOX_State *mbox); 1066 void free_decoding_caches(MBOX_State *mbox); 1067 bool_t b64_line_filter(decoding_cache *b64cache, char *line); 1068 char *b64_line_filter2(char *line, char *q); 1069 bool_t b64_line_flush(char *line, bool_t all); 1070 bool_t qp_line_filter(decoding_cache *qpcache, char *line); 1071 char *qp_line_filter2(char *line, char *q); 1072 bool_t qp_line_flush(char *line, bool_t all); 1073 bool_t mhe_line_filter(char *line); 1074 int extract_header_label(MBOX_State *mbox, char *line); 1075 bool_t extract_mime_boundary(MBOX_State *mbox, char *line); 1076 bool_t check_mime_boundary(MBOX_State *mbox, const char *line); 1077 bool_t mbox_line_filter(MBOX_State *mbox, char *line, XML_State *xml); 1078 bool_t plain_text_filter(MBOX_State *mbox, char *line); 1079 bool_t strings1_filter(char *line); 1080 1081 void xml_character_filter(XML_State *xml, char *line); 1082 void process_file(FILE *input, 1083 int (*line_filter)(MBOX_State *, char *), 1084 void (*character_filter)(XML_State *, char *), 1085 void (*word_fun)(char *, token_type_t, regex_count_t), 1086 char *(*pre_line_fun)(char *), 1087 void (*post_line_fun)(char *)); 1088 void process_directory(char *name, 1089 int (*line_filter)(MBOX_State *, char *), 1090 void (*character_filter)(XML_State *, char *), 1091 void (*word_fun)(char *, token_type_t, regex_count_t), 1092 char *(*pre_line_fun)(char *), 1093 void (*post_line_fun)(char *), 1094 void (*post_file_fun)(char *)); 1095 1096 void init_mbox_line_filter(MBOX_State *mbox); 1097 void free_mbox_line_filter(MBOX_State *mbox); 1098 void reset_mbox_line_filter(MBOX_State *mbox); 1099 void reset_xml_character_filter(XML_State *xml, XML_Reset reset); 1100 XML_Reset select_xml_defaults(MIME_Struct *mime); 1101 1102 /* probabilities in probs.c */ 1103 double log_poisson(int k, double lambda); 1104 double sample_mean(double x, double n); 1105 double sample_variance(double ss, double x, double n); 1106 double min_prob(int k, int n, double mu[], double sigma[]); 1107 1108 #if defined HAVE_MBRTOWC 1109 /* int w_b64_code(wchar_t c); */ 1110 /* int w_qp_code(wchar_t c); */ 1111 good_char_t w_good_char(wchar_t *c); 1112 void w_std_tokenizer(wchar_t *p, char **pq, char *hbuf, 1113 token_order_t *hbuf_order, token_order_t max_order, 1114 void (*word_fun)(char *, token_type_t, regex_count_t), 1115 token_type_t (*get_tt)(token_order_t)); 1116 void w_regex_tokenizer(wchar_t *p, int i, 1117 void (*word_fun)(char *, token_type_t, regex_count_t), 1118 token_type_t (*get_tt)(token_order_t)); 1119 void w_init_decoding_caches(MBOX_State *mbox); 1120 void w_free_decoding_caches(MBOX_State *mbox); 1121 bool_t w_b64_line_filter(w_decoding_cache *w_b64cache, wchar_t *line); 1122 wchar_t *w_b64_line_filter2(wchar_t *line, wchar_t *q); 1123 bool_t w_b64_line_flush(wchar_t *line, bool_t all); 1124 bool_t w_qp_line_filter(w_decoding_cache *w_qpcache, wchar_t *line); 1125 wchar_t *w_qp_line_filter2(wchar_t *line, wchar_t *q); 1126 bool_t w_qp_line_flush(wchar_t *line, bool_t all); 1127 bool_t w_mhe_line_filter(wchar_t *line); 1128 int w_extract_header_label(MBOX_State *mbox, wchar_t *line); 1129 bool_t w_extract_mime_boundary(MBOX_State *mbox, wchar_t *line); 1130 bool_t w_check_mime_boundary(MBOX_State *mbox, const wchar_t *line); 1131 bool_t w_mbox_line_filter(MBOX_State *mbox, wchar_t *line, XML_State *xml); 1132 bool_t w_plain_text_filter(MBOX_State *mbox, wchar_t *line); 1133 bool_t w_strings1_filter(wchar_t *line); 1134 1135 int wcsncasecmp(const wchar_t *s1, const wchar_t *s2, size_t n); 1136 1137 void w_xml_character_filter(XML_State *xml, wchar_t *line); 1138 void w_process_file(FILE *input, 1139 int (*line_filter)(MBOX_State *, wchar_t *), 1140 void (*character_filter)(XML_State *, wchar_t *), 1141 void (*word_fun)(char *, token_type_t, regex_count_t), 1142 char *(*pre_line_fun)(char *), 1143 void (*post_line_fun)(char *)); 1144 void w_process_directory(char *name, 1145 int (*line_filter)(MBOX_State *, wchar_t *), 1146 void (*character_filter)(XML_State *, wchar_t *), 1147 void (*word_fun)(char *, token_type_t, regex_count_t), 1148 char *(*pre_line_fun)(char *), 1149 void (*post_line_fun)(char *), 1150 void (*post_file_fun)(char *)); 1151 1152 #endif 1153 1154 #ifdef _SC_PAGE_SIZE 1155 #ifndef _SC_PAGESIZE 1156 #define _SC_PAGESIZE _SC_PAGE_SIZE 1157 #endif 1158 #endif 1159 1160 1161 #ifdef __cplusplus 1162 } 1163 #endif 1164 1165 #endif 1166