1 /*
2 * Copyright (C) 2002 Laird Breyer
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 3 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
17 *
18 * Author: Laird Breyer <laird@lbreyer.com>
19 */
20
21 #ifdef HAVE_CONFIG_H
22 #include "config.h"
23 #endif
24
25 #if defined HAVE_UNISTD_H
26 #include <unistd.h>
27 #endif
28
29 #include <signal.h>
30 #include <string.h>
31 #include <stdlib.h>
32 #include <stdarg.h>
33 #include <sys/stat.h>
34 #include <fcntl.h>
35 #include <math.h>
36 #include "util.h"
37
38 /*@constant double M_LN2@*/
39
40 extern char *progname;
41 extern char *inputfile;
42 extern long inputline;
43 extern options_t u_options;
44 extern options_t m_options;
45 extern int cmd;
46
47 extern char *textbuf;
48 extern charbuf_len_t textbuf_len;
49
50 #if defined HAVE_MBRTOWC
51 extern wchar_t *wc_textbuf;
52 extern charbuf_len_t wc_textbuf_len;
53 #endif
54
55 extern void *in_iobuf;
56 extern void *out_iobuf;
57
58 extern long system_pagesize;
59
60 int sa_signal = 0;
61 signal_cleanup_t cleanup = { NULL };
62
63 /***********************************************************
64 * GLOBAL BUFFERS *
65 ***********************************************************/
init_buffers()66 void init_buffers() {
67 /* preallocate primary text holding buffer */
68 textbuf_len = system_pagesize;
69 textbuf = (char *)malloc(textbuf_len);
70
71 MADVISE(textbuf, sizeof(char) * textbuf_len, MADV_SEQUENTIAL);
72
73 #if defined HAVE_POSIX_MEMALIGN
74 /* buffer must exist until after fclose() if used in setvbuf() */
75 if( 0 != posix_memalign(&in_iobuf, system_pagesize,
76 BUFFER_MAG * system_pagesize) ) {
77 in_iobuf = NULL; /* just to be sure */
78 }
79 /* buffer must exist until after fclose() if used in setvbuf() */
80 if( 0 != posix_memalign(&out_iobuf, system_pagesize,
81 BUFFER_MAG * system_pagesize) ) {
82 out_iobuf = NULL; /* just to be sure */
83 }
84 #elif defined HAVE_MEMALIGN
85 /* memalign()ed memory can't be reclaimed by free() */
86 in_iobuf = (void *)memalign(system_pagesize, BUFFER_MAG * system_pagesize);
87 out_iobuf = (void *)memalign(system_pagesize, BUFFER_MAG * system_pagesize);
88 #elif defined HAVE_VALLOC
89 /* valloc()ed memory can't be reclaimed by free() */
90 in_iobuf = (void *)valloc(BUFFER_MAG * system_pagesize);
91 out_iobuf = (void *)valloc(BUFFER_MAG * system_pagesize);
92 #endif
93 }
94
cleanup_buffers()95 void cleanup_buffers() {
96 /* free some global resources */
97 free(textbuf);
98 #if defined HAVE_POSIX_MEMALIGN
99 if( in_iobuf ) { free(in_iobuf); }
100 if( out_iobuf ) { free(out_iobuf); }
101 #endif
102 }
103
cleanup_tempfiles()104 void cleanup_tempfiles() {
105 if( cleanup.tempfile ) {
106 unlink(cleanup.tempfile);
107 cleanup.tempfile = NULL;
108 }
109 }
110
111
set_iobuf_mode(FILE * input)112 void set_iobuf_mode(FILE *input) {
113 struct stat statinfo;
114 if( in_iobuf ) {
115 /* choose appropriate buffering mode */
116 if( fstat(fileno(input), &statinfo) == 0 ) {
117 switch(statinfo.st_mode & S_IFMT) {
118 case S_IFREG:
119 case S_IFBLK:
120 setvbuf(input, (char *)in_iobuf, _IOFBF, BUFFER_MAG * system_pagesize);
121 break;
122 case S_IFIFO:
123 case S_IFCHR:
124 setvbuf(input, (char *)NULL,
125 (u_options & (1<<U_OPTION_FILTER)) ? _IOLBF : _IOFBF,
126 BUFFER_MAG * system_pagesize);
127 break;
128 case S_IFDIR:
129 default:
130 /* nothing */
131 break;
132 }
133
134 }
135 }
136 }
137 /***********************************************************
138 * TOKEN HASHING *
139 ***********************************************************/
hash_full_token(const char * tok)140 hash_value_t hash_full_token(const char *tok) {
141 const char *q;
142 JENKINS_HASH_VALUE h;
143 q = strchr(tok,EOTOKEN);
144 if( q ) {
145 h = hash((unsigned char *)tok, q - tok, 0);
146 return (hash_value_t)hash((unsigned char *)q, EXTRA_CLASS_LEN, h);
147 } else {
148 errormsg(E_FATAL,
149 "hash_full_token called with missing class [%s]\n",
150 tok);
151 }
152 return (hash_value_t)0;
153 }
154
hash_partial_token(const char * tok,int len,const char * extra)155 hash_value_t hash_partial_token(const char *tok, int len, const char *extra) {
156 JENKINS_HASH_VALUE h;
157 h = hash((unsigned char *)tok, len, 0);
158 return (hash_value_t)hash((unsigned char *)extra, EXTRA_CLASS_LEN, h);
159 }
160
161 /***********************************************************
162 * WEIGHT SIZE REDUCTION *
163 ***********************************************************/
164
digitize_a_weight(weight_t w,token_order_t order)165 digitized_weight_t digitize_a_weight(weight_t w, token_order_t order) {
166 if( w < 0.0 ) {
167 errormsg(E_FATAL,
168 "digitize_a_weight called with negative argument %f\n",
169 w);
170 /* return DIGITIZED_WEIGHT_MIN; */
171 } else if( order * w > (1<<(16 - DIG_FACTOR)) ) {
172 return DIGITIZED_WEIGHT_MAX;
173 } else {
174 return (digitized_weight_t)(w * (order<<DIG_FACTOR));
175 }
176 return DIGITIZED_WEIGHT_MIN;
177 }
178
undigitize_a_weight(digitized_weight_t d,token_order_t order)179 weight_t undigitize_a_weight(digitized_weight_t d, token_order_t order) {
180 return ((weight_t)d) / (order<<DIG_FACTOR);
181 }
182
/* Converts a score from nats to bits by dividing by ln(2). */
double nats2bits(double score) {
  const double nats_per_bit = M_LN2;

  return score / nats_per_bit;
}
186
187
188
189 /***********************************************************
190 * SIGNAL HANDLING *
191 ***********************************************************/
192 #if defined HAVE_SIGACTION
193 /* sigaction structure used by several functions */
194 struct sigaction act;
195 #endif
196
/*
 * Signal handler: records the number of the delivered signal in
 * sa_signal so that process_pending_signal() can act on it later.
 * Without sigaction() support the handler does nothing.
 */
void my_sa_handler(int signum) {
#if defined HAVE_SIGACTION
  sa_signal = signum;
#else
  (void)signum;
#endif
}
202
sigsegv(int signum)203 void sigsegv(int signum) {
204 fprintf(stdout, "%s:error: segmentation fault at input line %ld of %s\n",
205 progname, inputline, inputfile);
206 cleanup_tempfiles();
207 exit(1);
208 }
209
/*
 * Intercepts typical termination signals and tries to do the right
 * thing.
 *
 * SIGHUP, SIGINT, SIGQUIT, SIGTERM, SIGPIPE and SIGUSR1 are routed to
 * my_sa_handler(), with all six blocked while the handler runs. The
 * global act keeps that mask afterwards, because
 * process_pending_signal() passes act.sa_mask to sigprocmask() to hold
 * these signals back while it works.
 *
 * SIGSEGV gets its own handler through a separate, local sigaction
 * structure, so that installing it does not overwrite the mask stored
 * in the global act.
 *
 * Does nothing when sigaction() is not available.
 */
void init_signal_handling() {
#if defined HAVE_SIGACTION
  static const int deferred_signals[] = {
    SIGHUP, SIGINT, SIGQUIT, SIGTERM, SIGPIPE, SIGUSR1
  };
  const unsigned int num_deferred =
    sizeof(deferred_signals) / sizeof(deferred_signals[0]);
  struct sigaction segv_act;
  unsigned int i;

  /* set up global sigaction structure */
  memset(&act, 0, sizeof(act));
  act.sa_handler = my_sa_handler;
  sigemptyset(&act.sa_mask);
  for( i = 0; i < num_deferred; i++ ) {
    sigaddset(&act.sa_mask, deferred_signals[i]);
  }
  act.sa_flags = 0;

  for( i = 0; i < num_deferred; i++ ) {
    sigaction(deferred_signals[i], &act, NULL);
  }

  /* crash handler: configured separately, the global act stays intact */
  memset(&segv_act, 0, sizeof(segv_act));
  segv_act.sa_handler = sigsegv;
  sigemptyset(&segv_act.sa_mask);
  sigaddset(&segv_act.sa_mask, SIGSEGV);
  segv_act.sa_flags = 0;
  sigaction(SIGSEGV, &segv_act, NULL);

#endif
}
243
/* Intentionally empty; kept only as a placeholder. */
void cleanup_signal_handling() {
  return;
}
247
/*
 * Acts on a signal previously recorded in sa_signal by the handler.
 * Returns immediately when nothing is pending or when sigaction() is
 * not available.
 *
 * While the request is handled, the signals in act.sa_mask are
 * blocked; they are unblocked again once sa_signal has been cleared.
 *
 *  - SIGINT, SIGPIPE: remove temporary files, report, exit(1)
 *  - SIGHUP, SIGQUIT, SIGTERM: report, close input (if non-NULL) and
 *    set the CMD_QUITNOW bit in cmd
 *  - SIGUSR1: set CMD_RELOAD_CATS when U_OPTION_CLASSIFY is active,
 *    otherwise only report that the request is ignored
 *  - anything else: ignored
 */
void process_pending_signal(FILE *input) {
#if defined HAVE_SIGACTION
  int pending;

  if( !sa_signal ) {
    return;
  }

  sigprocmask(SIG_BLOCK, &act.sa_mask, NULL);

  pending = sa_signal;
  if( pending == SIGINT ) {
    cleanup_tempfiles();
    fprintf(stderr,
            "%s:signal: caught interrupt request, exiting\n", progname);
    exit(1);
  } else if( pending == SIGPIPE ) {
    /* should we terminate, or should we ignore? */
    cleanup_tempfiles();
    fprintf(stderr,
            "%s:error: broken pipe on output, exiting\n", progname);
    exit(1);
  } else if( (pending == SIGHUP) ||
             (pending == SIGQUIT) ||
             (pending == SIGTERM) ) {
    fprintf(stderr,
            "%s:signal: caught termination request, ignoring further input\n",
            progname);
    if( input ) {
      fclose(input);
    }
    cmd |= (1<<CMD_QUITNOW);
  } else if( pending == SIGUSR1 ) {
    if( u_options & (1<<U_OPTION_CLASSIFY) ) {
      fprintf(stderr,
              "%s:signal: caught SIGUSR1 request, reloading categories asap\n",
              progname);
      cmd |= (1<<CMD_RELOAD_CATS);
    } else {
      fprintf(stderr,
              "%s:signal: caught SIGUSR1 request, ignoring\n", progname);
    }
  }

  sa_signal = 0;

  sigprocmask(SIG_UNBLOCK, &act.sa_mask, NULL);

#endif
}
300
301
302 /***********************************************************
303 * ERROR DISPLAY *
304 ***********************************************************/
errormsg(int etype,const char * fmt,...)305 void errormsg(int etype, const char *fmt, ...) {
306 va_list vap;
307
308 switch(etype) {
309 case E_WARNING:
310 fprintf(stderr, "%s:warning: ", progname);
311 break;
312 default:
313 case E_ERROR:
314 case E_FATAL:
315 fprintf(stderr, "%s:error: ", progname);
316 break;
317 }
318
319 #if HAVE_VPRINTF
320 va_start(vap, fmt);
321 vfprintf(stderr, fmt, vap);
322 va_end(vap);
323 #else
324 fprintf(stderr, "%s", fmt);
325 #endif
326
327 if( etype == E_FATAL ) {
328 cleanup_tempfiles();
329 exit(1);
330 }
331 }
332
333 /***********************************************************
334 * MULTIBYTE FILE HANDLING FUNCTIONS *
335 * this is suitable for any locale whose character set *
336 * encoding doesn't include NUL bytes inside characters *
337 ***********************************************************/
338
print_token(FILE * out,const char * tok)339 void print_token(FILE *out, const char *tok) {
340 while(*tok) {
341 switch(*tok) {
342 case DIAMOND:
343 fprintf(out, "[]");
344 break;
345 case TOKENSEP:
346 fprintf(out, " ");
347 break;
348 case CLASSEP:
349 fprintf(out, "(%d)", tok[1] - AMIN);
350 tok++;
351 break;
352 default:
353 fprintf(out, "%c", *tok);
354 break;
355 }
356 tok++;
357 }
358 }
359
360 /* even after the EOF is reached, this pretends there are
361 * a few more blank lines, to allow filters to process
362 * cached input.
363 */
fill_textbuf(FILE * input,int * extra_lines)364 bool_t fill_textbuf(FILE *input, int *extra_lines) {
365 char *s;
366 charbuf_len_t l, k;
367
368 if( !(cmd & (1<<CMD_QUITNOW)) && !feof(input) ) {
369 process_pending_signal(input);
370
371 /* read in a full line, allocating memory as necessary */
372 textbuf[0] = '\0';
373 s = textbuf;
374 l = textbuf_len;
375 k = 1;
376 while( fgets(s, l, input) && ((charbuf_len_t)strlen(s) >= (l - 1)) ) {
377 textbuf = (char *)realloc(textbuf, 2 * textbuf_len);
378 if( !textbuf ) {
379 fprintf(stderr,
380 "error: not enough memory for input line (%d bytes)\n",
381 textbuf_len);
382 cleanup_tempfiles();
383 exit(1);
384 }
385
386 s = textbuf + textbuf_len - (k++);
387 l = textbuf_len;
388 textbuf_len *= 2;
389
390 MADVISE(textbuf, sizeof(char) * textbuf_len, MADV_SEQUENTIAL);
391
392 }
393 return 1;
394 } else if( *extra_lines > 0 ) {
395 strcpy(textbuf, "\r\n");
396 *extra_lines = (*extra_lines) - 1;
397 return 1;
398 }
399 return 0;
400 }
401
402 /***********************************************************
403 * WIDE CHARACTER FILE HANDLING FUNCTIONS *
404 * this is needed for any locale whose character set *
405 * encoding can include NUL bytes inside characters *
406 ***********************************************************/
407 #if defined HAVE_MBRTOWC
408
409 /* this does the same work as mbstowcs, but unlike the latter,
410 * we continue converting even if an error is detected. That
411 * is why we can't use the standard function.
412 * Returns true if the converted line is nonempty.
413 */
fill_wc_textbuf(char * pptextbuf,mbstate_t * shiftstate)414 bool_t fill_wc_textbuf(char *pptextbuf, mbstate_t *shiftstate) {
415
416 char *s;
417 charbuf_len_t k,l;
418 charbuf_len_t wclen;
419 wchar_t *wp;
420
421 if( !pptextbuf || !*pptextbuf ) { return 0; }
422
423 if( textbuf_len > wc_textbuf_len ) {
424 wc_textbuf_len = textbuf_len;
425 wc_textbuf = (wchar_t *)realloc(wc_textbuf, wc_textbuf_len * sizeof(wchar_t));
426 if( !wc_textbuf ) {
427 fprintf(stderr,
428 "error: not enough memory for wide character conversion "
429 "(%ld bytes)\n",
430 (long int)(wc_textbuf_len * sizeof(wchar_t)));
431 cleanup_tempfiles();
432 exit(1);
433 }
434
435 MADVISE(wc_textbuf, sizeof(wchar_t) * wc_textbuf_len, MADV_SEQUENTIAL);
436
437 }
438
439 /* convert as much as we can of the line into wide characters */
440 s = pptextbuf;
441 k = textbuf_len;
442 wp = wc_textbuf;
443 wclen = 0;
444 /* since we ensured textbuf_len <= wctextbuf_len
445 there will never be overflow of wctextbuf below */
446 while( k > 0 ) {
447 l = mbrtowc(wp, s, k, shiftstate);
448 if( l > 0 ) {
449 wp++;
450 wclen++;
451 k -= l;
452 s += l;
453 } else if( l == 0 ) {
454 break;
455 } else if( l == -1 ) {
456 /* try to be robust */
457 s++;
458 k--;
459 memset(shiftstate, 0, sizeof(mbstate_t));
460 } else if( l == -2) {
461 /* couldn't parse a complete character */
462 break;
463 }
464 }
465 *wp = L'\0';
466
467 return (wclen > 0);
468 }
469
470 #endif
471