1 /*
2  * Copyright (C) 2002 Laird Breyer
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation; either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
17  *
18  * Author:   Laird Breyer <laird@lbreyer.com>
19  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24 
25 #if defined HAVE_UNISTD_H
26 #include <unistd.h>
27 #endif
28 
29 #include <signal.h>
30 #include <string.h>
31 #include <stdlib.h>
32 #include <stdarg.h>
33 #include <sys/stat.h>
34 #include <fcntl.h>
35 #include <math.h>
36 #include "util.h"
37 
38 /*@constant double M_LN2@*/
39 
40 extern char *progname;
41 extern char *inputfile;
42 extern long inputline;
43 extern options_t u_options;
44 extern options_t m_options;
45 extern int cmd;
46 
47 extern char *textbuf;
48 extern charbuf_len_t textbuf_len;
49 
50 #if defined HAVE_MBRTOWC
51 extern wchar_t *wc_textbuf;
52 extern charbuf_len_t wc_textbuf_len;
53 #endif
54 
55 extern void *in_iobuf;
56 extern void *out_iobuf;
57 
58 extern long system_pagesize;
59 
60 int sa_signal = 0;
61 signal_cleanup_t cleanup = { NULL };
62 
63 /***********************************************************
64  * GLOBAL BUFFERS                                          *
65  ***********************************************************/
init_buffers()66 void init_buffers() {
67   /* preallocate primary text holding buffer */
68   textbuf_len = system_pagesize;
69   textbuf = (char *)malloc(textbuf_len);
70 
71   MADVISE(textbuf, sizeof(char) * textbuf_len, MADV_SEQUENTIAL);
72 
73 #if defined HAVE_POSIX_MEMALIGN
74   /* buffer must exist until after fclose() if used in setvbuf() */
75   if( 0 != posix_memalign(&in_iobuf, system_pagesize,
76 			  BUFFER_MAG * system_pagesize) ) {
77     in_iobuf = NULL; /* just to be sure */
78   }
79   /* buffer must exist until after fclose() if used in setvbuf() */
80   if( 0 != posix_memalign(&out_iobuf, system_pagesize,
81 			  BUFFER_MAG * system_pagesize) ) {
82     out_iobuf = NULL; /* just to be sure */
83   }
84 #elif defined HAVE_MEMALIGN
85   /* memalign()ed memory can't be reclaimed by free() */
86   in_iobuf = (void *)memalign(system_pagesize, BUFFER_MAG * system_pagesize);
87   out_iobuf = (void *)memalign(system_pagesize, BUFFER_MAG * system_pagesize);
88 #elif defined HAVE_VALLOC
89   /* valloc()ed memory can't be reclaimed by free() */
90   in_iobuf = (void *)valloc(BUFFER_MAG * system_pagesize);
91   out_iobuf = (void *)valloc(BUFFER_MAG * system_pagesize);
92 #endif
93 }
94 
cleanup_buffers()95 void cleanup_buffers() {
96   /* free some global resources */
97   free(textbuf);
98 #if defined HAVE_POSIX_MEMALIGN
99   if( in_iobuf ) { free(in_iobuf); }
100   if( out_iobuf ) { free(out_iobuf); }
101 #endif
102 }
103 
cleanup_tempfiles()104 void cleanup_tempfiles() {
105   if( cleanup.tempfile ) {
106     unlink(cleanup.tempfile);
107     cleanup.tempfile = NULL;
108   }
109 }
110 
111 
set_iobuf_mode(FILE * input)112 void set_iobuf_mode(FILE *input) {
113   struct stat statinfo;
114   if( in_iobuf ) {
115     /* choose appropriate buffering mode */
116     if( fstat(fileno(input), &statinfo) == 0 ) {
117       switch(statinfo.st_mode & S_IFMT) {
118       case S_IFREG:
119       case S_IFBLK:
120 	setvbuf(input, (char *)in_iobuf, _IOFBF, BUFFER_MAG * system_pagesize);
121 	break;
122       case S_IFIFO:
123       case S_IFCHR:
124 	setvbuf(input, (char *)NULL,
125 		(u_options & (1<<U_OPTION_FILTER)) ? _IOLBF : _IOFBF,
126 		BUFFER_MAG * system_pagesize);
127 	break;
128       case S_IFDIR:
129       default:
130 	/* nothing */
131 	break;
132       }
133 
134     }
135   }
136 }
137 /***********************************************************
138  * TOKEN HASHING                                           *
139  ***********************************************************/
hash_full_token(const char * tok)140 hash_value_t hash_full_token(const char *tok) {
141   const char *q;
142   JENKINS_HASH_VALUE h;
143   q = strchr(tok,EOTOKEN);
144   if( q ) {
145     h = hash((unsigned char *)tok, q - tok, 0);
146     return (hash_value_t)hash((unsigned char *)q, EXTRA_CLASS_LEN, h);
147   } else {
148     errormsg(E_FATAL,
149 	    "hash_full_token called with missing class [%s]\n",
150 	     tok);
151   }
152   return (hash_value_t)0;
153 }
154 
hash_partial_token(const char * tok,int len,const char * extra)155 hash_value_t hash_partial_token(const char *tok, int len, const char *extra) {
156   JENKINS_HASH_VALUE h;
157   h = hash((unsigned char *)tok, len, 0);
158   return (hash_value_t)hash((unsigned char *)extra, EXTRA_CLASS_LEN, h);
159 }
160 
161 /***********************************************************
162  * WEIGHT SIZE REDUCTION                                   *
163  ***********************************************************/
164 
digitize_a_weight(weight_t w,token_order_t order)165 digitized_weight_t digitize_a_weight(weight_t w, token_order_t order) {
166   if( w < 0.0 ) {
167     errormsg(E_FATAL,
168 	    "digitize_a_weight called with negative argument %f\n",
169 	    w);
170 /*     return DIGITIZED_WEIGHT_MIN; */
171   } else if( order * w > (1<<(16 - DIG_FACTOR)) ) {
172     return DIGITIZED_WEIGHT_MAX;
173   } else {
174     return (digitized_weight_t)(w * (order<<DIG_FACTOR));
175   }
176   return DIGITIZED_WEIGHT_MIN;
177 }
178 
undigitize_a_weight(digitized_weight_t d,token_order_t order)179 weight_t undigitize_a_weight(digitized_weight_t d, token_order_t order) {
180   return ((weight_t)d) / (order<<DIG_FACTOR);
181 }
182 
/*
 * Converts a score measured in nats (natural logarithm units) into bits
 * by dividing it by ln(2).
 */
double nats2bits(double score) {
  const double nats_per_bit = M_LN2;

  return score / nats_per_bit;
}
186 
187 
188 
189 /***********************************************************
190  * SIGNAL HANDLING                                         *
191  ***********************************************************/
192 #if defined HAVE_SIGACTION
193 /* sigaction structure used by several functions */
194 struct sigaction act;
195 #endif
196 
/*
 * Handler for the signals that are dealt with later, outside of signal
 * context: it only records the signal number in sa_signal, where
 * process_pending_signal() picks it up.  Without sigaction() support the
 * handler does nothing.
 */
void my_sa_handler(int signum) {
#if !defined HAVE_SIGACTION
  (void)signum; /* nothing to record in this configuration */
#else
  sa_signal = signum;
#endif
}
202 
sigsegv(int signum)203 void sigsegv(int signum) {
204   fprintf(stdout, "%s:error: segmentation fault at input line %ld of %s\n",
205 	  progname, inputline, inputfile);
206   cleanup_tempfiles();
207   exit(1);
208 }
209 
/* intercepts typical termination signals and tries to do the right thing.
 *
 * SIGHUP, SIGINT, SIGQUIT, SIGTERM, SIGPIPE and SIGUSR1 are routed to
 * my_sa_handler(), which merely records the signal for later processing.
 * The global act is set up with a mask containing exactly these signals
 * and is left in that state on return, because process_pending_signal()
 * uses act.sa_mask to block them while it works.
 *
 * SIGSEGV is routed to sigsegv() through a separate, local sigaction
 * structure so that installing it does not overwrite the mask kept in act.
 *
 * Without sigaction() support this function does nothing.
 */
void init_signal_handling() {
#if defined HAVE_SIGACTION
  static const int deferred[] = {
    SIGHUP, SIGINT, SIGQUIT, SIGTERM, SIGPIPE, SIGUSR1
  };
  const unsigned int ndeferred = sizeof(deferred) / sizeof(deferred[0]);
  struct sigaction segv_act;
  unsigned int i;

  /* set up global sigaction structure */
  memset(&act, 0, sizeof(act));
  act.sa_handler = my_sa_handler;
  sigemptyset(&act.sa_mask);
  for( i = 0; i < ndeferred; i++ ) {
    sigaddset(&act.sa_mask, deferred[i]);
  }
  act.sa_flags = 0;

  /* the mask must be complete before the first handler is installed */
  for( i = 0; i < ndeferred; i++ ) {
    sigaction(deferred[i], &act, NULL);
  }

  /* segmentation faults get their own handler and their own structure */
  memset(&segv_act, 0, sizeof(segv_act));
  segv_act.sa_handler = sigsegv;
  sigemptyset(&segv_act.sa_mask);
  sigaddset(&segv_act.sa_mask, SIGSEGV);
  segv_act.sa_flags = 0;
  sigaction(SIGSEGV, &segv_act, NULL);

#endif
}
243 
/* Deliberate no-op: this function has no work to do. */
void cleanup_signal_handling() {
  return;
}
247 
/*
 * Acts on the signal recorded in sa_signal, if any, then clears it.
 *
 *  - SIGINT, SIGPIPE: remove the temporary file, print a message and
 *    exit with status 1.
 *  - SIGHUP, SIGQUIT, SIGTERM: print a message, close input when it is
 *    not NULL and set the CMD_QUITNOW bit in cmd.
 *  - SIGUSR1: set the CMD_RELOAD_CATS bit in cmd when the
 *    U_OPTION_CLASSIFY bit is set in u_options; otherwise only print a
 *    message.
 *  - anything else is ignored.
 *
 * The signals in act.sa_mask are blocked with sigprocmask() for the
 * duration of the processing and unblocked afterwards.  Without
 * sigaction() support this function does nothing.
 */
void process_pending_signal(FILE *input) {
#if defined HAVE_SIGACTION
  int pending;

  if( !sa_signal ) {
    return;
  }

  sigprocmask(SIG_BLOCK, &act.sa_mask, NULL);

  pending = sa_signal;
  switch(pending) {
  case SIGINT:
    cleanup_tempfiles();
    fprintf(stderr, "%s:signal: caught interrupt request, exiting\n",
            progname);
    exit(1);
  case SIGPIPE:
    /* should we terminate, or should we ignore? */
    cleanup_tempfiles();
    fprintf(stderr, "%s:error: broken pipe on output, exiting\n",
            progname);
    exit(1);
  case SIGHUP:
  case SIGQUIT:
  case SIGTERM:
    fprintf(stderr,
            "%s:signal: caught termination request, ignoring further input\n",
            progname);
    if( input ) {
      fclose(input);
    }
    cmd |= (1<<CMD_QUITNOW);
    break;
  case SIGUSR1:
    if( !(u_options & (1<<U_OPTION_CLASSIFY)) ) {
      fprintf(stderr,
              "%s:signal: caught SIGUSR1 request, ignoring\n", progname);
    } else {
      fprintf(stderr,
              "%s:signal: caught SIGUSR1 request, reloading categories asap\n",
              progname);
      cmd |= (1<<CMD_RELOAD_CATS);
    }
    break;
  default:
    /* nothing */
    break;
  }

  sa_signal = 0;
  sigprocmask(SIG_UNBLOCK, &act.sa_mask, NULL);

#endif
}
300 
301 
302 /***********************************************************
303  * ERROR DISPLAY                                           *
304  ***********************************************************/
errormsg(int etype,const char * fmt,...)305 void errormsg(int etype, const char *fmt, ...) {
306   va_list vap;
307 
308   switch(etype) {
309   case E_WARNING:
310     fprintf(stderr, "%s:warning: ", progname);
311     break;
312   default:
313   case E_ERROR:
314   case E_FATAL:
315     fprintf(stderr, "%s:error: ", progname);
316     break;
317   }
318 
319 #if HAVE_VPRINTF
320   va_start(vap, fmt);
321   vfprintf(stderr, fmt, vap);
322   va_end(vap);
323 #else
324   fprintf(stderr, "%s", fmt);
325 #endif
326 
327   if( etype == E_FATAL ) {
328     cleanup_tempfiles();
329     exit(1);
330   }
331 }
332 
333 /***********************************************************
334  * MULTIBYTE FILE HANDLING FUNCTIONS                       *
335  * this is suitable for any locale whose character set     *
336  * encoding doesn't include NUL bytes inside characters    *
337  ***********************************************************/
338 
print_token(FILE * out,const char * tok)339 void print_token(FILE *out, const char *tok) {
340   while(*tok) {
341     switch(*tok) {
342     case DIAMOND:
343       fprintf(out, "[]");
344       break;
345     case TOKENSEP:
346       fprintf(out, " ");
347       break;
348     case CLASSEP:
349       fprintf(out, "(%d)", tok[1] - AMIN);
350       tok++;
351       break;
352     default:
353       fprintf(out, "%c", *tok);
354       break;
355     }
356     tok++;
357   }
358 }
359 
360 /* even after the EOF is reached, this pretends there are
361  * a few more blank lines, to allow filters to process
362  * cached input.
363  */
fill_textbuf(FILE * input,int * extra_lines)364 bool_t fill_textbuf(FILE *input, int *extra_lines) {
365   char *s;
366   charbuf_len_t l, k;
367 
368   if( !(cmd & (1<<CMD_QUITNOW)) && !feof(input) ) {
369     process_pending_signal(input);
370 
371     /* read in a full line, allocating memory as necessary */
372     textbuf[0] = '\0';
373     s = textbuf;
374     l = textbuf_len;
375     k = 1;
376     while( fgets(s, l, input) && ((charbuf_len_t)strlen(s) >= (l - 1)) ) {
377       textbuf = (char *)realloc(textbuf, 2 * textbuf_len);
378       if( !textbuf ) {
379 	fprintf(stderr,
380 		"error: not enough memory for input line (%d bytes)\n",
381 		textbuf_len);
382 	cleanup_tempfiles();
383 	exit(1);
384       }
385 
386       s = textbuf + textbuf_len - (k++);
387       l = textbuf_len;
388       textbuf_len *= 2;
389 
390       MADVISE(textbuf, sizeof(char) * textbuf_len, MADV_SEQUENTIAL);
391 
392     }
393     return 1;
394   } else if( *extra_lines > 0 ) {
395     strcpy(textbuf, "\r\n");
396     *extra_lines = (*extra_lines) - 1;
397     return 1;
398   }
399   return 0;
400 }
401 
402 /***********************************************************
403  * WIDE CHARACTER FILE HANDLING FUNCTIONS                  *
404  * this is needed for any locale whose character set       *
405  * encoding can include NUL bytes inside characters        *
406  ***********************************************************/
407 #if defined HAVE_MBRTOWC
408 
409 /* this does the same work as mbstowcs, but unlike the latter,
410  * we continue converting even if an error is detected. That
411  * is why we can't use the standard function.
412  * Returns true if the converted line is nonempty.
413  */
fill_wc_textbuf(char * pptextbuf,mbstate_t * shiftstate)414 bool_t fill_wc_textbuf(char *pptextbuf, mbstate_t *shiftstate) {
415 
416   char *s;
417   charbuf_len_t k,l;
418   charbuf_len_t wclen;
419   wchar_t *wp;
420 
421   if( !pptextbuf || !*pptextbuf ) { return 0; }
422 
423   if( textbuf_len > wc_textbuf_len ) {
424     wc_textbuf_len = textbuf_len;
425     wc_textbuf = (wchar_t *)realloc(wc_textbuf, wc_textbuf_len * sizeof(wchar_t));
426     if( !wc_textbuf ) {
427       fprintf(stderr,
428 	      "error: not enough memory for wide character conversion "
429 	      "(%ld bytes)\n",
430 	      (long int)(wc_textbuf_len * sizeof(wchar_t)));
431       cleanup_tempfiles();
432       exit(1);
433     }
434 
435     MADVISE(wc_textbuf, sizeof(wchar_t) * wc_textbuf_len, MADV_SEQUENTIAL);
436 
437   }
438 
439   /* convert as much as we can of the line into wide characters */
440   s = pptextbuf;
441   k = textbuf_len;
442   wp = wc_textbuf;
443   wclen = 0;
444   /* since we ensured textbuf_len <= wctextbuf_len
445      there will never be overflow of wctextbuf below */
446   while( k > 0 ) {
447     l = mbrtowc(wp, s, k, shiftstate);
448     if( l > 0 ) {
449       wp++;
450       wclen++;
451       k -= l;
452       s += l;
453     } else if( l == 0 ) {
454       break;
455     } else if( l == -1 ) {
456       /* try to be robust */
457       s++;
458       k--;
459       memset(shiftstate, 0, sizeof(mbstate_t));
460     } else if( l == -2) {
461       /* couldn't parse a complete character */
462       break;
463     }
464   }
465   *wp = L'\0';
466 
467   return (wclen > 0);
468 }
469 
470 #endif
471