1 /* This file is ifile.h - main header file containing all global variables,
2    function prototypes and structure declarations */
3 
4 /* ifile - intelligent mail filter for EXMH/MH
5    Copyright (C) 1997  Jason Daniel Rennie <jr6b+@andrew.cmu.edu>
6    Unless otherwise specified, written by Jason Daniel Rennie
7 
8    This program is free software; you can redistribute it and/or
9    modify it under the terms of the GNU General Public License
10    as published by the Free Software Foundation; either version 2
11    of the License, or (at your option) any later version.
12 
13    This program is distributed in the hope that it will be useful,
14    but WITHOUT ANY WARRANTY; without even the implied warranty of
15    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16    GNU General Public License for more details.
17 
18    You should have received a copy of the GNU General Public License
19    along with this program; if not, write to the Free Software
20    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21    */
22 
23 /* NOTE: some portions taken/adapted from libbow - written by Andrew Kachites
24  * McCallum */
25 
26 #ifndef __IFILE_H_
27 #define __IFILE_H_
28 
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "argp/argp.h"
#include "extendable_array.h"
#include "hash_table.h"
36 
37 
/* Release identification.  IFILE_VERSION is the complete version string;
   the three numeric macros carry the same 1.3.8 split into its
   components, so all four need to be updated together. */
#define IFILE_VERSION "ifile 1.3.8"
#define IFILE_MAJOR_VERSION 1
#define IFILE_MINOR_VERSION 3
#define IFILE_TRIFLING_VERSION 8
42 
/* Boolean constants.  Each one is guarded, like the other configurable
   macros in this header, so that including this file after another header
   that already provides TRUE/FALSE does not redefine them. */
#ifndef FALSE
#define FALSE 0
#endif

#ifndef TRUE
#define TRUE 1
#endif
45 
/* Compile-time defaults.  Every macro wrapped in #ifndef can be overridden
   by defining it before this header is included (for instance with -D on
   the compiler command line). */

/* NOTE(review): presumably the size of general-purpose string buffers --
   confirm against the users of this macro. */
#ifndef MAX_STR_LEN
#define MAX_STR_LEN 2048
#endif

/* NOTE(review): presumably the initial capacity reserved for folders --
   confirm. */
#ifndef IFILE_INIT_FOLDERS
#define IFILE_INIT_FOLDERS 10
#endif

/* NOTE(review): presumably the initial capacity reserved for words --
   confirm. */
#ifndef IFILE_INIT_WORDS
#define IFILE_INIT_WORDS 5000
#endif

/* NOTE(review): presumably the longest word the lexer will buffer --
   confirm. */
#ifndef IFILE_MAX_WORD_LENGTH
#define IFILE_MAX_WORD_LENGTH 2048
#endif

/* Thin aliases that forward directly to the C library routines. */
#define ifile_malloc(x) malloc(x)
#define ifile_realloc(x,y) realloc(x,y)
#define ifile_fopen(x,y) fopen(x,y)

/* Default database file name.  NOTE(review): what directory it is resolved
   against is not visible from this header. */
#ifndef DEFAULT_DB_FILE
#define DEFAULT_DB_FILE  ".idata"
#endif
69 
/* Number of clock ticks per second.

   A value defined before this header is included always wins.  Otherwise
   the standard CLOCKS_PER_SEC is used; this header includes <time.h> so
   that macro is visible here no matter what the including file pulled in
   first.  CLK_TCK is tried next for systems that only provide that name,
   and 100 is a last-resort guess used only when neither macro exists. */
#ifndef CLOCKS_PER_SECOND
#  if defined(CLOCKS_PER_SEC)
#    define CLOCKS_PER_SECOND CLOCKS_PER_SEC
#  elif defined(CLK_TCK)
#    define CLOCKS_PER_SECOND CLK_TCK
#  else
#    define CLOCKS_PER_SECOND 100
#  endif
#endif
81 
/* Lexer selectors.  NOTE(review): presumably the values stored in the
   `lexer' member of the arguments structure -- confirm against opts.c. */
#define ALPHA_LEXER 1      /* word is a string of alphabetic characters */
#define WHITE_LEXER 2      /* word is a whitespace separated string */
#define ALPHA_ONLY_LEXER 3 /* word is whitespace separated alpha string */
85 
/* Standard return value for when something goes wrong.  The expansion is
   parenthesized so the negative literal always stays a single operand at
   the point of use.  Left untouched if ERROR is already defined. */
#ifndef ERROR
#define ERROR (-1)
#endif
89 
90 /* Progress and error reporting.  Setting in error.c. */
91 /* Adapted from libbow - written by Andrew Kachites McCallum */
/* Verbosity levels, ordered from least to most output.  The numeric
   values are spelled out so the ordering is explicit. */
enum ifile_verbosity_levels
{
  ifile_silent   = 0,  /* fatal errors only */
  ifile_quiet    = 1,  /* warnings and errors only */
  ifile_progress = 2,  /* enough lines to show progress */
  ifile_verbose  = 3,  /* lots of status information */
  ifile_debug    = 4   /* everything (and then some) */
};
99 
100 /* A linked list of digits */
/* One node of the list: a link to another node plus a single digit.  The
   tag and the typedef share the name `linked_list'. */
typedef struct linked_list
{
  struct linked_list *next;  /* link to another node of the list */
  int digit;                 /* digit carried by this node */
} linked_list;
107 
108 /* struct used when returning categorization of document */
/* Pairs a category with the rating computed for it. */
typedef struct _category_rating {
  double rating;   /* rating value for CATEGORY */
  char *category;  /* the category this rating belongs to */
} category_rating;
114 
115 /* entry for each word of the database */
116 typedef struct _db_word_entry {
117   char * word;
118   long int age;
119   long int tot_freq;
120   /* int * freq; */
121   extendable_array *freq;
122 } db_word_entry;
123 
124 /* structure to hold ifile database information */
125 typedef struct _ifile_db {
126   long int num_folders;
127   long int num_words;
128   long int total_docs;
129   long int total_freq;
130   long int (*trim_freq)(long int);
131   extendable_array folder_name;
132   extendable_array folder_freq;
133   extendable_array folder_msg;
134   htable data;  /* index = (char *)  entry = (db_word_entry *) */
135 } ifile_db;
136 
137 /* Used by opts.c to communicate with parse_opt.  */
138 typedef struct _arguments
139 {
140   extendable_array file;		/* [FILE...] */
141   int num_files, thresh;
142   int query, query_insert, concise;
143   int stemming, stoplist, lexer;
144   int skip_header, tag_headers, keep_infrequent, verbosity;
145   int max_length;  /* Ignore characters after first MAX_LENGTH characters */
146   int print_tokens;  /* Tokenize and print messages - nothing else */
147   char *folder_calcs;
148   char *minus_folder, *plus_folder;
149   char *loocv_folder;
150   int create_folder; /* create folder if it does not exist? */
151   char *db_file;
152   int tmp_file;    /* create a /tmp/ifile.log.<userid> file? */
153   int reset_data;
154   int occur;
155   int read_db, write_db, read_message;  /* boolean - what do we need to do? */
156 } arguments;
157 
158 /* initialization functions */
159 void ifile_db_init (ifile_db * idata);
160 void ifile_db_entry_init (db_word_entry * wentry);
161 
162 void ifile_db_free(ifile_db *idata);
163 
164 /* utility functions */
165 unsigned long hash(const char * s, long int size);
166 char * ifile_sprintf (char * format, ...);
167 char * ifile_cats (long int num_strings, ...);
168 char * itoa (long int number);
169 char * readline (char ** bufp);
170 void ifile_free (void * var);
171 char * ifile_strdup (const char *s1);
172 void ifile_bitify_document(htable * message);
173 
174 
175 /* rating functions */
176 category_rating * ifile_rate_categories (htable * message, ifile_db * idata);
177 void ifile_free_categories(category_rating *cr, ifile_db *idata);
178 void ifile_concise_ratings (char * path, FILE * FP, category_rating * ratings,
179 			  ifile_db * idata, int thresh);
180 void ifile_print_ratings (FILE * FP, category_rating * ratings,
181 			  ifile_db * idata, int thresh);
182 
183 /* database functions */
184 void ifile_db_init(ifile_db * idata);
185 htable * ifile_read_message (FILE * FP);
186 void ifile_print_message (htable * message);
187 long int ifile_read_header (ifile_db * idata, char ** bufp);
188 long int ifile_read_word_frequencies (ifile_db * idata, char ** bufp);
189 long int ifile_read_word_entry (char * line, ifile_db * idata);
190 long int ifile_read_db (char * data_file, ifile_db * idata);
191 long int ifile_write_db (char * data_file, ifile_db * idata);
192 long int ifile_write_header (FILE * DATA, ifile_db * idata);
193 long int ifile_write_word_frequencies (FILE * DATA, ifile_db * idata);
194 long int ifile_age_words (ifile_db * idata, long int epochs);
195 void ifile_add_db (char * folder, htable * message, ifile_db * idata, int create);
196 void ifile_del_db (char * folder, htable * message, ifile_db * idata);
197 
198 
199 /* error handling and logging functions */
/* NOTE(review): presumably returns the file-name part of FULL_PATH;
   whether the result aliases FULL_PATH is not visible here. */
char *ifile_strip_path (char *full_path);

/* Open the log given the program's command-line arguments. */
FILE *ifile_open_log (int argc, char **argv);

/* Close the log.  Declared with (void) so this is a real prototype and a
   call that passes arguments is rejected at compile time. */
void ifile_close_log (void);

/* printf-style reporting tagged with VERBOSITY_LEVEL (see
   enum ifile_verbosity_levels). */
int ifile_verbosify (int verbosity_level, const char *format, ...);

/* printf-style error reporting. */
void ifile_error (const char *format, ...);
205 
206 /* command-line argument functions */
207 void ifile_init_args (arguments * args);
208 
209 
210 /*
211  * lexing stuff
212  */
213 
/* A structure for maintaining the context of a lexer.  (If you need
   to create a lexer that uses more context than this, define a new
   structure that includes this structure as its first element;
   IFILE_LEX_GRAM, defined below, is an example of this.)  */
218 /* Adapted from libbow - written by Andrew Kachites McCallum */
/* Lexing context: the document text and the lexer's place in it. */
typedef struct _ifile_lex
{
  char *document;         /* the document text */
  int document_length;    /* length of DOCUMENT */
  int document_position;  /* current position within DOCUMENT */
} ifile_lex;
224 
225 /* A lexer is represented by a pointer to a structure of this type. */
226 /* sizeof_lex - size of corresponding _ifile_lex structure
227  * *open_text_fp - function to open the document to be lexed
228  * *get_word - function for getting the next word in the document
229  * *close - function for closing the document
230  * document_start_pattern - string to indicate the beginning of the
231  *   document (within the file)
232  * document_end_pattern - string to indicate the end of the document
233  *   (within the file)
234  * note: NULL does not scan forward, "" scans forward to EOF
235  */
236 /* Adapted from libbow - written by Andrew Kachites McCallum */
237 typedef struct _ifile_lexer {
238   int sizeof_lex;
239   ifile_lex* (*open_text_fp) (struct _ifile_lexer *self, FILE *fp);
240   int (*get_word) (struct _ifile_lexer *self, ifile_lex *lex,
241 		   char *buf, int buflen);
242   void (*close) (struct _ifile_lexer *self, ifile_lex *lex);
243   const char *document_start_pattern;
244   const char *document_end_pattern;
245 } ifile_lexer;
246 
247 /* This is an augmented version of IFILE_LEXER that works for simple,
248    context-free lexers. */
249 /* Adapted from libbow - written by Andrew Kachites McCallum */
250 typedef struct _ifile_lexer_simple {
251   /* The basic lexer. */
252   ifile_lexer lexer;
253   /* Parameters of the simple, context-free lexing. */
254   int (*true_to_start)(int character);          /* non-zero on char to start */
255   int (*false_to_end)(int character);           /* zero on char to end */
256   int (*stoplist_func)(const char *);           /* one on token in stoplist */
257   int (*stem_func)(char *);	                /* modify arg by stemming */
258   int case_sensitive;		                /* boolean */
259   int strip_non_alphas_from_end;                /* boolean */
260   int toss_words_containing_non_alphas;	        /* boolean */
261   int toss_words_containing_this_many_digits;
262   int toss_words_longer_than;
263 } ifile_lexer_simple;
264 
265 /* Get the raw token from the document buffer by scanning forward
266    until we get a start character, and filling the buffer until we get
267    an ending character.  The resulting token in the buffer is
268    NULL-terminated.  Return the length of the token. */
269 int ifile_lexer_simple_get_raw_word (ifile_lexer_simple *self, ifile_lex *lex,
270 				     char *buf, int buflen);
271 
272 /* Perform all the necessary postprocessing after the initial token
273    boundaries have been found: strip non-alphas from end, toss words
274    containing non-alphas, toss words containing certaing many digits,
275    toss words appearing in the stop list, stem the word, check the
276    stoplist again, toss words of length one.  If the word is tossed,
277    return zero, otherwise return the length of the word. */
278 int ifile_lexer_simple_postprocess_word (ifile_lexer_simple *self,
279 					 ifile_lex *lex, char *buf, int buflen);
280 
281 /* Create and return a IFILE_LEX, filling the document buffer from
282    characters in FP, starting after the START_PATTERN, and ending with
283    the END_PATTERN. */
284 ifile_lex *ifile_lexer_simple_open_text_fp (ifile_lexer *self, FILE *fp);
285 
286 /* Close the LEX buffer, freeing the memory held by it. */
287 void ifile_lexer_simple_close (ifile_lexer *self, ifile_lex *lex);
288 
289 /* Scan a single token from the LEX buffer, placing it in BUF, and
290    returning the length of the token.  BUFLEN is the maximum number of
291    characters that will fit in BUF.  If the token won't fit in BUF,
292    an error is raised. */
293 int ifile_lexer_simple_get_word (ifile_lexer *self, ifile_lex *lex,
294 			       char *buf, int buflen);
295 
/* The three lexers below are only declared here; their definitions live
   in another translation unit. */

/* A lexer that throws out all space-delimited strings that have any
   non-alphabetical characters.  For example, the string `obtained
   from http://www.cs.cmu.edu' will result in the tokens `obtained'
   and `from', but the URL will be skipped. */
extern const ifile_lexer_simple *ifile_alpha_only_lexer;

/* A lexer that keeps all alphabetic strings, delimited by
   non-alphabetic characters.  For example, the string
   `http://www.cs.cmu.edu' will result in the tokens `http', `www',
   `cs', `cmu', `edu'. */
extern const ifile_lexer_simple *ifile_alpha_lexer;

/* A lexer that keeps all strings that begin and end with alphabetic
   characters, delimited by white-space.  For example,
   the string `http://www.cs.cmu.edu' will be a single token. */
extern const ifile_lexer_simple *ifile_white_lexer;
312 
313 
314 /* Some declarations for a generic indirect lexer.  See lex-indirect.c */
315 typedef struct _ifile_lexer_indirect {
316   ifile_lexer lexer;
317   ifile_lexer *underlying_lexer;
318 } ifile_lexer_indirect;
319 
320 /* Open the underlying lexer. */
321 ifile_lex *ifile_lexer_indirect_open_text_fp (ifile_lexer *self, FILE *fp);
322 
323 /* Close the underlying lexer. */
324 void ifile_lexer_indirect_close (ifile_lexer *self, ifile_lex *lex);
325 
326 
327 /* Declarations for an e-mail lexer.  See lex-email.c */
328 
329 /* An augmented version of IFILE_LEXER that allows for removal of certain
330  * e-mail headers */
331 typedef struct _ifile_lexer_email {
332   ifile_lexer_indirect indirect_lexer;
333   char **headers_to_keep;
334   int gram_size;
335 } ifile_lexer_email;
336 
337 /* An augmented version of IFILE_LEX that keeps track of the current
338  * document section */
339 typedef struct _ifile_lex_email {
340   ifile_lex lex;
341   int gram_size_this_time;
342 } ifile_lex_email;
343 
/* A lexer which selectively throws out headers of an e-mail message */
/* NOTE: value of NULL throws out all headers, value of -1 keeps them all.
   NOTE(review): presumably this describes the headers_to_keep member of
   ifile_lexer_email -- confirm against lex-email.c. */
extern const ifile_lexer_email *ifile_email_lexer;
347 
348 /* Some declarations for a simple N-gram lexer.  See lex-gram.c */
349 
350 /* An augmented version of IFILE_LEXER that provides N-grams */
351 typedef struct _ifile_lexer_gram {
352   ifile_lexer_indirect indirect_lexer;
353   int gram_size;
354 } ifile_lexer_gram;
355 
356 /* An augmented version of IFILE_LEX that works for N-grams */
357 typedef struct _ifile_lex_gram {
358   ifile_lex lex;
359   int gram_size_this_time;
360 } ifile_lex_gram;
361 
/* A lexer that returns N-gram tokens using IFILE_ALPHA_ONLY_LEXER.
   It actually returns all 1-grams, 2-grams ... N-grams, where N is
   specified by GRAM_SIZE.  Only declared here; the definition lives in
   another translation unit. */
extern const ifile_lexer_gram *ifile_gram_lexer;
366 
367 
/* The default lexer that will be used by various library functions
   like IFILE_WV_NEW_FROM_TEXT_FP().  You should set this variable to
   point at whichever lexer you desire.  If you do not set it, it
   will point at ifile_alpha_lexer.
   NOTE(review): no IFILE_WV_NEW_FROM_TEXT_FP is declared in this header;
   the reference looks inherited from libbow -- confirm. */
extern ifile_lexer *ifile_default_lexer;

/* Default instances of the lexers that can be modified by libbow's
   argp cmdline argument processing.  Unlike the const lexer pointers
   declared above, these point at non-const objects. */
extern ifile_lexer_simple *ifile_default_lexer_simple;
extern ifile_lexer_email *ifile_default_lexer_email;
378 
/* Initialize the default lexers.  Takes no arguments; declared with (void)
   so this is a real prototype and a call that passes arguments is rejected
   at compile time. */
void ifile_default_lexer_init (void);
381 
382 
383 /* Functions that may be useful in writing a lexer. */
384 
/* Stem WORD in place with the Porter stemming algorithm.  Returns 0 on
   success. */
int ifile_stem_porter (char *word);

/* Plain-function form of `isalpha', usable where a function pointer is
   needed. */
int ifile_isalpha (int character);

/* Plain-function form of `isgraph', usable where a function pointer is
   needed. */
int ifile_isgraph (int character);

/* Returns non-zero when WORD is on the stoplist. */
int ifile_stoplist_present (const char *word);

/* Read the white-space delineated words of FILENAME into the stoplist.
   Returns how many words were added, or -1 if the file could not be
   opened. */
int ifile_stoplist_add_from_file (const char *filename);

/* Put WORD on the stoplist. */
void ifile_stoplist_add_word (const char *word);
404 
405 
406 
407 /*
408  * other stuff from libbow
409  */
410 
/* NOTE(review): presumably returns non-zero when FP looks like a text
   file; whether the stream position is preserved is not visible here. */
int ifile_fp_is_text (FILE *fp);
412 
413 
414 
415 /* Managing int->string and string->int mappings. */
416 
/* Two-way int/string mapping: an array of strings plus an int hash
   table.  NOTE(review): which of the length/size members is the used
   count and which the allocated capacity is not visible here. */
typedef struct _ifile_int4str
{
  const char **str_array;  /* the strings */
  int str_array_length;    /* length of STR_ARRAY */
  int str_array_size;      /* size of STR_ARRAY */
  int *str_hash;           /* hash table of ints */
  int str_hash_size;       /* size of STR_HASH */
} ifile_int4str;
424 
425 /* Allocate, initialize and return a new int/string mapping structure.
426    The parameter CAPACITY is used as a hint about the number of words
427    to expect; if you don't know or don't care about a CAPACITY value,
428    pass 0, and a default value will be used. */
429 ifile_int4str *ifile_int4str_new (int capacity);
430 
431 /* Given a integer INDEX, return its corresponding string. */
432 const char *ifile_int2str (ifile_int4str *map, int index);
433 
434 /* Given the char-pointer STRING, return its integer index.  If this is
435    the first time we're seeing STRING, add it to the mapping, assign
436    it a new index, and return the new index. */
437 int ifile_str2int (ifile_int4str *map, const char *string);
438 
439 /* Given the char-pointer STRING, return its integer index.  If STRING
440    is not yet in the mapping, return -1. */
441 int ifile_str2int_no_add (ifile_int4str *map, const char *string);
442 
443 /* Create a new int-str mapping by lexing words from FILE. */
444 ifile_int4str *ifile_int4str_new_from_text_file (const char *filename);
445 
446 /* Write the int-str mapping to file-pointer FP. */
447 void ifile_int4str_write (ifile_int4str *map, FILE *fp);
448 
449 /* Return a new int-str mapping, created by reading file-pointer FP. */
450 ifile_int4str *ifile_int4str_new_from_fp (FILE *fp);
451 
452 /* Return a new int-str mapping, created by reading FILENAME. */
453 ifile_int4str *ifile_int4str_new_from_file (const char *filename);
454 
455 /* Free the memory held by the int-word mapping MAP. */
456 void ifile_int4str_free (ifile_int4str *map);
457 
/* Free the memory held by the stoplist.  Takes no arguments; declared with
   (void) so this is a real prototype and a call that passes arguments is
   rejected at compile time. */
void ifile_stoplist_free (void);
460 
461 #ifdef DMALLOC
462 #include "dmalloc.h"
463 #endif
464 
465 #endif
466 
467