1 /* This file is ifile.h - main header file containing all global variables, 2 function prototypes and structure declarations */ 3 4 /* ifile - intelligent mail filter for EXMH/MH 5 Copyright (C) 1997 Jason Daniel Rennie <jr6b+@andrew.cmu.edu> 6 Unless otherwise specified, written by Jason Daniel Rennie 7 8 This program is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License 10 as published by the Free Software Foundation; either version 2 11 of the License, or (at your option) any later version. 12 13 This program is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with this program; if not, write to the Free Software 20 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 21 */ 22 23 /* NOTE: some portions taken/adapted from libbow - written by Andrew Kachites 24 * McCallum */ 25 26 #ifndef __IFILE_H_ 27 #define __IFILE_H_ 28 29 #include <stdio.h> 30 #include <assert.h> 31 #include <string.h> 32 #include <stdlib.h> 33 #include "argp/argp.h" 34 #include "extendable_array.h" 35 #include "hash_table.h" 36 37 38 #define IFILE_VERSION "ifile 1.3.8" 39 #define IFILE_MAJOR_VERSION 1 40 #define IFILE_MINOR_VERSION 3 41 #define IFILE_TRIFLING_VERSION 8 42 43 #define FALSE 0 44 #define TRUE 1 45 46 #ifndef MAX_STR_LEN 47 #define MAX_STR_LEN 2048 48 #endif 49 50 #ifndef IFILE_INIT_FOLDERS 51 #define IFILE_INIT_FOLDERS 10 52 #endif 53 54 #ifndef IFILE_INIT_WORDS 55 #define IFILE_INIT_WORDS 5000 56 #endif 57 58 #ifndef IFILE_MAX_WORD_LENGTH 59 #define IFILE_MAX_WORD_LENGTH 2048 60 #endif 61 62 #define ifile_malloc(x) malloc(x) 63 #define ifile_realloc(x,y) realloc(x,y) 64 #define ifile_fopen(x,y) fopen(x,y) 65 66 #ifndef DEFAULT_DB_FILE 67 #define DEFAULT_DB_FILE ".idata" 68 #endif 69 70 #ifndef CLOCKS_PER_SECOND 71 #ifdef CLOCKS_PER_SEC 72 #define CLOCKS_PER_SECOND CLOCKS_PER_SEC 73 #else 74 #ifdef CLK_TCK 75 #define CLOCKS_PER_SECOND CLK_TCK 76 #else 77 #define CLOCKS_PER_SECOND 100 78 #endif 79 #endif 80 #endif 81 82 #define ALPHA_LEXER 1 /* word is a string of alphabetic characters */ 83 #define WHITE_LEXER 2 /* word is a whitespace separated string */ 84 #define ALPHA_ONLY_LEXER 3 /* word is whitespace separated alpha string */ 85 86 #ifndef ERROR 87 #define ERROR -1 /* standard return value for when something goes wrong */ 88 #endif 89 90 /* Progress and error reporting. Setting in error.c. */ 91 /* Adapted from libbow - written by Andrew Kachites McCallum */ 92 enum ifile_verbosity_levels { 93 ifile_silent = 0, /* only fatal errors */ 94 ifile_quiet, /* only warnings and errors */ 95 ifile_progress, /* enough lines to show progress */ 96 ifile_verbose, /* lots of status info */ 97 ifile_debug /* everything (and then some) */ 98 }; 99 100 /* A linked list of digits */ 101 struct linked_list { 102 struct linked_list * next; 103 int digit; 104 }; 105 106 typedef struct linked_list linked_list; 107 108 /* struct used when returning categorization of document */ 109 typedef struct _category_rating 110 { 111 double rating; 112 char * category; 113 } category_rating; 114 115 /* entry for each word of the database */ 116 typedef struct _db_word_entry { 117 char * word; 118 long int age; 119 long int tot_freq; 120 /* int * freq; */ 121 extendable_array *freq; 122 } db_word_entry; 123 124 /* structure to hold ifile database information */ 125 typedef struct _ifile_db { 126 long int num_folders; 127 long int num_words; 128 long int total_docs; 129 long int total_freq; 130 long int (*trim_freq)(long int); 131 extendable_array folder_name; 132 extendable_array folder_freq; 133 extendable_array folder_msg; 134 htable data; /* index = (char *) entry = (db_word_entry *) */ 135 } ifile_db; 136 137 /* Used by opts.c to communicate with parse_opt. */ 138 typedef struct _arguments 139 { 140 extendable_array file; /* [FILE...] */ 141 int num_files, thresh; 142 int query, query_insert, concise; 143 int stemming, stoplist, lexer; 144 int skip_header, tag_headers, keep_infrequent, verbosity; 145 int max_length; /* Ignore characters after first MAX_LENGTH characters */ 146 int print_tokens; /* Tokenize and print messages - nothing else */ 147 char *folder_calcs; 148 char *minus_folder, *plus_folder; 149 char *loocv_folder; 150 int create_folder; /* create folder if it does not exist? */ 151 char *db_file; 152 int tmp_file; /* create a /tmp/ifile.log.<userid> file? */ 153 int reset_data; 154 int occur; 155 int read_db, write_db, read_message; /* boolean - what do we need to do? */ 156 } arguments; 157 158 /* initialization functions */ 159 void ifile_db_init (ifile_db * idata); 160 void ifile_db_entry_init (db_word_entry * wentry); 161 162 void ifile_db_free(ifile_db *idata); 163 164 /* utility functions */ 165 unsigned long hash(const char * s, long int size); 166 char * ifile_sprintf (char * format, ...); 167 char * ifile_cats (long int num_strings, ...); 168 char * itoa (long int number); 169 char * readline (char ** bufp); 170 void ifile_free (void * var); 171 char * ifile_strdup (const char *s1); 172 void ifile_bitify_document(htable * message); 173 174 175 /* rating functions */ 176 category_rating * ifile_rate_categories (htable * message, ifile_db * idata); 177 void ifile_free_categories(category_rating *cr, ifile_db *idata); 178 void ifile_concise_ratings (char * path, FILE * FP, category_rating * ratings, 179 ifile_db * idata, int thresh); 180 void ifile_print_ratings (FILE * FP, category_rating * ratings, 181 ifile_db * idata, int thresh); 182 183 /* database functions */ 184 void ifile_db_init(ifile_db * idata); 185 htable * ifile_read_message (FILE * FP); 186 void ifile_print_message (htable * message); 187 long int ifile_read_header (ifile_db * idata, char ** bufp); 188 long int ifile_read_word_frequencies (ifile_db * idata, char ** bufp); 189 long int ifile_read_word_entry (char * line, ifile_db * idata); 190 long int ifile_read_db (char * data_file, ifile_db * idata); 191 long int ifile_write_db (char * data_file, ifile_db * idata); 192 long int ifile_write_header (FILE * DATA, ifile_db * idata); 193 long int ifile_write_word_frequencies (FILE * DATA, ifile_db * idata); 194 long int ifile_age_words (ifile_db * idata, long int epochs); 195 void ifile_add_db (char * folder, htable * message, ifile_db * idata, int create); 196 void ifile_del_db (char * folder, htable * message, ifile_db * idata); 197 198 199 /* error handling and logging functions */ 200 char * ifile_strip_path(char * full_path); 201 FILE * ifile_open_log (int argc, char ** argv); 202 void ifile_close_log (); 203 int ifile_verbosify (int verbosity_level, const char *format, ...); 204 void ifile_error (const char *format, ...); 205 206 /* command-line argument functions */ 207 void ifile_init_args (arguments * args); 208 209 210 /* 211 * lexing stuff 212 */ 213 214 /* A structure for maintaining the context of a lexer. (If you need 215 to create a lexer that uses more context than this, define a new 216 structure that includes this structure as its first element; 217 IFILE_LEX_GRAM, defined below is an example of this.) */ 218 /* Adapted from libbow - written by Andrew Kachites McCallum */ 219 typedef struct _ifile_lex { 220 char *document; 221 int document_length; 222 int document_position; 223 } ifile_lex; 224 225 /* A lexer is represented by a pointer to a structure of this type. */ 226 /* sizeof_lex - size of corresponding _ifile_lex structure 227 * *open_text_fp - function to open the document to be lexed 228 * *get_word - function for getting the next word in the document 229 * *close - function for closing the document 230 * document_start_pattern - string to indicate the beginning of the 231 * document (within the file) 232 * document_end_pattern - string to indicate the end of the document 233 * (within the file) 234 * note: NULL does not scan forward, "" scans forward to EOF 235 */ 236 /* Adapted from libbow - written by Andrew Kachites McCallum */ 237 typedef struct _ifile_lexer { 238 int sizeof_lex; 239 ifile_lex* (*open_text_fp) (struct _ifile_lexer *self, FILE *fp); 240 int (*get_word) (struct _ifile_lexer *self, ifile_lex *lex, 241 char *buf, int buflen); 242 void (*close) (struct _ifile_lexer *self, ifile_lex *lex); 243 const char *document_start_pattern; 244 const char *document_end_pattern; 245 } ifile_lexer; 246 247 /* This is an augmented version of IFILE_LEXER that works for simple, 248 context-free lexers. */ 249 /* Adapted from libbow - written by Andrew Kachites McCallum */ 250 typedef struct _ifile_lexer_simple { 251 /* The basic lexer. */ 252 ifile_lexer lexer; 253 /* Parameters of the simple, context-free lexing. */ 254 int (*true_to_start)(int character); /* non-zero on char to start */ 255 int (*false_to_end)(int character); /* zero on char to end */ 256 int (*stoplist_func)(const char *); /* one on token in stoplist */ 257 int (*stem_func)(char *); /* modify arg by stemming */ 258 int case_sensitive; /* boolean */ 259 int strip_non_alphas_from_end; /* boolean */ 260 int toss_words_containing_non_alphas; /* boolean */ 261 int toss_words_containing_this_many_digits; 262 int toss_words_longer_than; 263 } ifile_lexer_simple; 264 265 /* Get the raw token from the document buffer by scanning forward 266 until we get a start character, and filling the buffer until we get 267 an ending character. The resulting token in the buffer is 268 NULL-terminated. Return the length of the token. */ 269 int ifile_lexer_simple_get_raw_word (ifile_lexer_simple *self, ifile_lex *lex, 270 char *buf, int buflen); 271 272 /* Perform all the necessary postprocessing after the initial token 273 boundaries have been found: strip non-alphas from end, toss words 274 containing non-alphas, toss words containing certaing many digits, 275 toss words appearing in the stop list, stem the word, check the 276 stoplist again, toss words of length one. If the word is tossed, 277 return zero, otherwise return the length of the word. */ 278 int ifile_lexer_simple_postprocess_word (ifile_lexer_simple *self, 279 ifile_lex *lex, char *buf, int buflen); 280 281 /* Create and return a IFILE_LEX, filling the document buffer from 282 characters in FP, starting after the START_PATTERN, and ending with 283 the END_PATTERN. */ 284 ifile_lex *ifile_lexer_simple_open_text_fp (ifile_lexer *self, FILE *fp); 285 286 /* Close the LEX buffer, freeing the memory held by it. */ 287 void ifile_lexer_simple_close (ifile_lexer *self, ifile_lex *lex); 288 289 /* Scan a single token from the LEX buffer, placing it in BUF, and 290 returning the length of the token. BUFLEN is the maximum number of 291 characters that will fit in BUF. If the token won't fit in BUF, 292 an error is raised. */ 293 int ifile_lexer_simple_get_word (ifile_lexer *self, ifile_lex *lex, 294 char *buf, int buflen); 295 296 /* A lexer that throws out all space-delimited strings that have any 297 non-alphabetical characters. For example, the string `obtained 298 from http://www.cs.cmu.edu' will result in the tokens `obtained' 299 and `from', but the URL will be skipped. */ 300 extern const ifile_lexer_simple *ifile_alpha_only_lexer; 301 302 /* A lexer that keeps all alphabetic strings, delimited by 303 non-alphabetic characters. For example, the string 304 `http://www.cs.cmu.edu' will result in the tokens `http', `www', 305 `cs', `cmu', `edu'. */ 306 extern const ifile_lexer_simple *ifile_alpha_lexer; 307 308 /* A lexer that keeps all strings that begin and end with alphabetic 309 characters, delimited by white-space. For example, 310 the string `http://www.cs.cmu.edu' will be a single token. */ 311 extern const ifile_lexer_simple *ifile_white_lexer; 312 313 314 /* Some declarations for a generic indirect lexer. See lex-indirect.c */ 315 typedef struct _ifile_lexer_indirect { 316 ifile_lexer lexer; 317 ifile_lexer *underlying_lexer; 318 } ifile_lexer_indirect; 319 320 /* Open the underlying lexer. */ 321 ifile_lex *ifile_lexer_indirect_open_text_fp (ifile_lexer *self, FILE *fp); 322 323 /* Close the underlying lexer. */ 324 void ifile_lexer_indirect_close (ifile_lexer *self, ifile_lex *lex); 325 326 327 /* Declarations for an e-mail lexer. See lex-email.c */ 328 329 /* An augmented version of IFILE_LEXER that allows for removal of certain 330 * e-mail headers */ 331 typedef struct _ifile_lexer_email { 332 ifile_lexer_indirect indirect_lexer; 333 char **headers_to_keep; 334 int gram_size; 335 } ifile_lexer_email; 336 337 /* An augmented version of IFILE_LEX that keeps track of the current 338 * document section */ 339 typedef struct _ifile_lex_email { 340 ifile_lex lex; 341 int gram_size_this_time; 342 } ifile_lex_email; 343 344 /* A lexer which selectively throws out headers of an e-mail message */ 345 /* NOTE: value of NULL throws out all headers, value of -1 keeps them all */ 346 extern const ifile_lexer_email *ifile_email_lexer; 347 348 /* Some declarations for a simple N-gram lexer. See lex-gram.c */ 349 350 /* An augmented version of IFILE_LEXER that provides N-grams */ 351 typedef struct _ifile_lexer_gram { 352 ifile_lexer_indirect indirect_lexer; 353 int gram_size; 354 } ifile_lexer_gram; 355 356 /* An augmented version of IFILE_LEX that works for N-grams */ 357 typedef struct _ifile_lex_gram { 358 ifile_lex lex; 359 int gram_size_this_time; 360 } ifile_lex_gram; 361 362 /* A lexer that returns N-gram tokens using IFILE_ALPHA_ONLY_LEXER. 363 It actually returns all 1-grams, 2-grams ... N-grams, where N is 364 specified by GRAM_SIZE. */ 365 extern const ifile_lexer_gram *ifile_gram_lexer; 366 367 368 /* The default lexer that will be used by various library functions 369 like IFILE_WV_NEW_FROM_TEXT_FP(). You should set this variable to 370 point at whichever lexer you desire. If you do not set it, it 371 will point at ifile_alpha_lexer. */ 372 extern ifile_lexer *ifile_default_lexer; 373 374 /* Default instances of the lexers that can be modified by libbow's 375 argp cmdline argument processing. */ 376 extern ifile_lexer_simple *ifile_default_lexer_simple; 377 extern ifile_lexer_email *ifile_default_lexer_email; 378 379 /* initialize the default lexers */ 380 void ifile_default_lexer_init(); 381 382 383 /* Functions that may be useful in writing a lexer. */ 384 385 /* Apply the Porter stemming algorithm to modify WORD. Return 0 on success. */ 386 int ifile_stem_porter (char *word); 387 388 /* A function wrapper around POSIX's `isalpha' macro. */ 389 int ifile_isalpha (int character); 390 391 /* A function wrapper around POSIX's `isgraph' macro. */ 392 int ifile_isgraph (int character); 393 394 /* Return non-zero if WORD is on the stoplist. */ 395 int ifile_stoplist_present (const char *word); 396 397 /* Add to the stoplist the white-space delineated words from FILENAME. 398 Return the number of words added. If the file could not be opened, 399 return -1. */ 400 int ifile_stoplist_add_from_file (const char *filename); 401 402 /* Add WORD to the stop list. */ 403 void ifile_stoplist_add_word (const char *word); 404 405 406 407 /* 408 * other stuff from libbow 409 */ 410 411 int ifile_fp_is_text (FILE *fp); 412 413 414 415 /* Managing int->string and string->int mappings. */ 416 417 typedef struct _ifile_int4str { 418 const char **str_array; 419 int str_array_length; 420 int str_array_size; 421 int *str_hash; 422 int str_hash_size; 423 } ifile_int4str; 424 425 /* Allocate, initialize and return a new int/string mapping structure. 426 The parameter CAPACITY is used as a hint about the number of words 427 to expect; if you don't know or don't care about a CAPACITY value, 428 pass 0, and a default value will be used. */ 429 ifile_int4str *ifile_int4str_new (int capacity); 430 431 /* Given a integer INDEX, return its corresponding string. */ 432 const char *ifile_int2str (ifile_int4str *map, int index); 433 434 /* Given the char-pointer STRING, return its integer index. If this is 435 the first time we're seeing STRING, add it to the mapping, assign 436 it a new index, and return the new index. */ 437 int ifile_str2int (ifile_int4str *map, const char *string); 438 439 /* Given the char-pointer STRING, return its integer index. If STRING 440 is not yet in the mapping, return -1. */ 441 int ifile_str2int_no_add (ifile_int4str *map, const char *string); 442 443 /* Create a new int-str mapping by lexing words from FILE. */ 444 ifile_int4str *ifile_int4str_new_from_text_file (const char *filename); 445 446 /* Write the int-str mapping to file-pointer FP. */ 447 void ifile_int4str_write (ifile_int4str *map, FILE *fp); 448 449 /* Return a new int-str mapping, created by reading file-pointer FP. */ 450 ifile_int4str *ifile_int4str_new_from_fp (FILE *fp); 451 452 /* Return a new int-str mapping, created by reading FILENAME. */ 453 ifile_int4str *ifile_int4str_new_from_file (const char *filename); 454 455 /* Free the memory held by the int-word mapping MAP. */ 456 void ifile_int4str_free (ifile_int4str *map); 457 458 /* Free the memory held by the stoplist */ 459 void ifile_stoplist_free(); 460 461 #ifdef DMALLOC 462 #include "dmalloc.h" 463 #endif 464 465 #endif 466 467