1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /**
3 * textcat.c -- routines for categorizing text
4 *
5 * Copyright (C) 2003 WiseGuys Internet B.V.
6 *
7 * THE BSD LICENSE
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions
11 * are met:
12 *
13 * - Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 *
16 * - Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the
19 * distribution.
20 *
21 * - Neither the name of the WiseGuys Internet B.V. nor the names of
22 * its contributors may be used to endorse or promote products derived
23 * from this software without specific prior written permission.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 *
37 * DESCRIPTION
38 *
39 * These routines use the N-gram fingerprinting technique as described
40 * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization.
41 * (cf. http://www.nonlineardynamics.com/trenkle/)
42 *
43 * REVISION HISTORY
44 *
45 * Mar 27, 2003 frank@wise-guys.nl -- created
46 *
47 * IMPROVEMENTS:
48 * - If two n-grams have the same frequency count, choose the shortest
49 * - Use a better similarity measure (the article suggests Wilcoxon rank test)
50 * - The profiles are matched one by one, which results in redundant lookups.
51 * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned
52 * by the use of the output buffer in textcat_t.)
53 */
54 #ifdef HAVE_CONFIG_H
55 #include "config.h"
56 #endif
57
58 #include <stdlib.h>
59 #include <string.h>
60
61 #include "common_impl.h"
62 #include "fingerprint.h"
63 #include "textcat.h"
64 #include "constants.h"
65
66
67 typedef struct
68 {
69
70 void **fprint;
71 unsigned char *fprint_disable;
72 uint4 size;
73 uint4 maxsize;
74 uint4 mindocsize;
75
76 char output[MAXOUTPUTSIZE];
77 candidate_t *tmp_candidates;
78 boole utfaware;
79 } textcat_t;
80
81
cmpcandidates(const void * a,const void * b)82 static int cmpcandidates(const void *a, const void *b)
83 {
84 const candidate_t *x = (const candidate_t *)a;
85 const candidate_t *y = (const candidate_t *)b;
86 return (x->score - y->score);
87 }
88
89
textcat_Done(void * handle)90 extern void textcat_Done(void *handle)
91 {
92 textcat_t *h = (textcat_t *) handle;
93 uint4 i;
94
95 for (i = 0; i < h->size; i++)
96 {
97 fp_Done(h->fprint[i]);
98 }
99 if (h->tmp_candidates != NULL)
100 {
101 textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates);
102 }
103 free(h->fprint);
104 free(h->fprint_disable);
105 free(h);
106
107 }
108
/**
 * Changes one tunable of a classifier handle.
 *
 * TCPROP_UTF8AWARE accepts only TC_TRUE or TC_FALSE;
 * TCPROP_MINIMUM_DOCUMENT_SIZE accepts only values greater than zero.
 *
 * @param handle    classifier handle
 * @param property  which property to set
 * @param value     new value for the property
 * @return  0 when the value was stored, -2 when the value is not acceptable
 *          for the property, -1 when the property is not recognised
 */
extern int textcat_SetProperty(void *handle, textcat_Property property,
                               sint4 value)
{
    textcat_t *state = (textcat_t *) handle;

    if (property == TCPROP_UTF8AWARE)
    {
        if ((value != TC_TRUE) && (value != TC_FALSE))
        {
            return -2;
        }
        state->utfaware = value;
        return 0;
    }

    if (property == TCPROP_MINIMUM_DOCUMENT_SIZE)
    {
        if (value <= 0)
        {
            return -2;
        }
        state->mindocsize = value;
        return 0;
    }

    /* Any other property is unknown to this module. */
    return -1;
}
136
137 /** Replaces older function */
textcat_Init(const char * conffile)138 extern void *textcat_Init(const char *conffile)
139 {
140 return special_textcat_Init(conffile, DEFAULT_FINGERPRINTS_PATH);
141 }
142
143 /**
144 * Originaly this function had only one parameter (conffile) it has been modified since OOo use
145 * Basicaly prefix is the directory path where fingerprints are stored
146 */
special_textcat_Init(const char * conffile,const char * prefix)147 extern void *special_textcat_Init(const char *conffile, const char *prefix)
148 {
149 textcat_t *h;
150 char *finger_print_file_name;
151 size_t finger_print_file_name_size;
152 size_t prefix_size;
153 char line[1024];
154 FILE *fp;
155
156 fp = fopen(conffile, "r");
157 if (!fp)
158 {
159 #ifdef VERBOSE
160 fprintf(stderr, "Failed to open config file '%s'\n", conffile);
161 #endif
162 return NULL;
163 }
164
165 h = (textcat_t *) malloc(sizeof(textcat_t));
166 h->size = 0;
167 h->maxsize = 16;
168 h->mindocsize = MINDOCSIZE;
169 h->fprint = (void **)malloc(sizeof(void *) * h->maxsize);
170 h->fprint_disable =
171 (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize);
172 /* added to store the state of languages */
173 h->tmp_candidates = NULL;
174 h->utfaware = TC_TRUE;
175
176 prefix_size = strlen(prefix);
177 finger_print_file_name_size = prefix_size + 1;
178 finger_print_file_name =
179 (char *)malloc(sizeof(char) * (finger_print_file_name_size + 1024));
180 finger_print_file_name[0] = '\0';
181 strcat(finger_print_file_name, prefix);
182
183 while (wg_getline(line, 1024, fp))
184 {
185 char *p;
186 char *segment[4];
187
188 /*** Skip comments ***/
189 if ((p = strchr(line, '#')))
190 {
191 *p = '\0';
192 }
193
194 if (wg_split(segment, line, line, 4) < 2)
195 {
196 continue;
197 }
198
199 /*** Ensure enough space ***/
200 if (h->size == h->maxsize)
201 {
202 h->maxsize *= 2;
203 h->fprint =
204 (void **)realloc(h->fprint, sizeof(void *) * h->maxsize);
205 h->fprint_disable =
206 (unsigned char *)realloc(h->fprint_disable,
207 sizeof(unsigned char) * h->maxsize);
208 }
209
210 /*** Load data ***/
211 if ((h->fprint[h->size] = fp_Init(segment[1])) == NULL)
212 {
213 goto BAILOUT;
214 }
215
216 while (prefix_size + strlen(segment[0]) > finger_print_file_name_size)
217 {
218 char *tmp;
219 size_t tmp_size = finger_print_file_name_size * 2;
220 tmp =
221 (char *)realloc(finger_print_file_name,
222 sizeof(char) * (tmp_size + 1));
223 if (tmp == NULL)
224 {
225 goto BAILOUT;
226 }
227 else
228 {
229 finger_print_file_name = tmp;
230 finger_print_file_name_size = tmp_size;
231 }
232 }
233 finger_print_file_name[prefix_size] = '\0';
234 strcat(finger_print_file_name, segment[0]);
235
236 if (fp_Read(h->fprint[h->size], finger_print_file_name, 400) == 0)
237 goto BAILOUT;
238 h->fprint_disable[h->size] = 0xF0; /* 0xF0 is the code for enabled
239 languages, 0x0F is for disabled
240 */
241 h->size++;
242 }
243
244 free(finger_print_file_name);
245
246 fclose(fp);
247 return h;
248
249 BAILOUT:
250 free(finger_print_file_name);
251 fclose(fp);
252 textcat_Done(h);
253 return NULL;
254 }
255
textcat_GetClassifyFullOutput(void * handle)256 extern candidate_t *textcat_GetClassifyFullOutput(void *handle)
257 {
258 textcat_t *h = (textcat_t *) handle;
259 return (candidate_t *) malloc(sizeof(candidate_t) * h->size);
260 }
261
/**
 * Frees a candidate array obtained from textcat_GetClassifyFullOutput().
 *
 * @param handle      classifier handle; not used by this function
 * @param candidates  array to free; NULL is accepted and ignored
 */
extern void textcat_ReleaseClassifyFullOutput(void *handle,
                                              candidate_t * candidates)
{
    (void)handle;
    /* free() already treats NULL as a no-op. */
    free(candidates);
}
270
textcat_Classify(void * handle,const char * buffer,size_t size)271 extern char *textcat_Classify(void *handle, const char *buffer, size_t size)
272 {
273 textcat_t *h = (textcat_t *) handle;
274 char *result = h->output;
275 uint4 i, cnt;
276
277 if (h->tmp_candidates == NULL)
278 {
279 h->tmp_candidates = textcat_GetClassifyFullOutput(h);
280 }
281
282 cnt = textcat_ClassifyFull(h, buffer, size, h->tmp_candidates);
283
284 switch (cnt)
285 {
286 case TEXTCAT_RESULT_UNKNOWN:
287 result = TEXTCAT_RESULT_UNKNOWN_STR;
288 break;
289 case TEXTCAT_RESULT_SHORT:
290 result = TEXTCAT_RESULT_SHORT_STR;
291 break;
292 default:
293 {
294 const char *plimit = result + MAXOUTPUTSIZE;
295 char *p = result;
296
297 *p = '\0';
298 for (i = 0; i < cnt; i++)
299 {
300 p = wg_strgmov(p, "[", plimit);
301 p = wg_strgmov(p, h->tmp_candidates[i].name, plimit);
302 p = wg_strgmov(p, "]", plimit);
303 }
304 }
305 }
306
307 return result;
308 }
309
310
/**
 * Classifies a text buffer and reports the matching categories.
 *
 * A fingerprint is built for the buffer and compared with every loaded
 * fingerprint; disabled languages are given MAXSCORE without comparing.
 * Each time a new best (lowest) score is seen, the threshold becomes that
 * score times THRESHOLDVALUE. Entries scoring below the final threshold are
 * then compacted to the front of candidates and sorted by ascending score.
 *
 * @param handle      classifier handle
 * @param buffer      text to classify
 * @param size        number of bytes in buffer
 * @param candidates  output array; every loaded fingerprint gets a slot
 *                    written, so it needs at least as many entries as the
 *                    handle has fingerprints (see
 *                    textcat_GetClassifyFullOutput())
 * @return  the number of candidates placed at the front of the array;
 *          TEXTCAT_RESULT_SHORT when fp_Create() returns 0 for the buffer;
 *          TEXTCAT_RESULT_UNKNOWN when more than MAXCANDIDATES entries pass
 *          the threshold or the document fingerprint cannot be created
 */
extern int textcat_ClassifyFull(void *handle, const char *buffer, size_t size,
                                candidate_t * candidates)
{
    textcat_t *h = (textcat_t *) handle;
    uint4 i, cnt = 0;
    int minscore = MAXSCORE;
    int threshold = minscore;

    void *unknown;

    unknown = fp_Init(NULL);
    if (unknown == NULL)
    {
        /* fp_Init() can fail; without a fingerprint nothing can be said. */
        return TEXTCAT_RESULT_UNKNOWN;
    }
    fp_SetProperty(unknown, TCPROP_UTF8AWARE, h->utfaware);
    fp_SetProperty(unknown, TCPROP_MINIMUM_DOCUMENT_SIZE, h->mindocsize);
    if (fp_Create(unknown, buffer, size, MAXNGRAMS) == 0)
    {
        /*** Too little information ***/
        fp_Done(unknown);
        return TEXTCAT_RESULT_SHORT;
    }

    /*** Calculate the score for each category. ***/
    for (i = 0; i < h->size; i++)
    {
        int score;
        if (h->fprint_disable[i] & 0x0F)
        {                       /* if this language is disabled */
            score = MAXSCORE;
        }
        else
        {
            score = fp_Compare(h->fprint[i], unknown, threshold);
        }
        candidates[i].score = score;
        candidates[i].name = fp_Name(h->fprint[i]);
        if (score < minscore)
        {
            minscore = score;
            threshold = (int)((double)score * THRESHOLDVALUE);
        }
    }

    /*** Find the best performers ***/
    for (i = 0, cnt = 0; i < h->size; i++)
    {
        if (candidates[i].score < threshold)
        {
            if (++cnt == MAXCANDIDATES + 1)
            {
                break;
            }

            /* The destination index never exceeds i. Skip the self-copy,
             * and use assignment rather than memcpy(), whose behaviour is
             * undefined when source and destination overlap. */
            if (cnt - 1 != i)
            {
                candidates[cnt - 1] = candidates[i];
            }
        }
    }

    fp_Done(unknown);
    /*** The verdict ***/
    if (cnt == MAXCANDIDATES + 1)
    {
        return TEXTCAT_RESULT_UNKNOWN;
    }
    else
    {
        qsort(candidates, cnt, sizeof(candidate_t), cmpcandidates);
        return cnt;
    }
}
380
textcat_Version(void)381 extern const char *textcat_Version(void)
382 {
383 return EXTTEXTCAT_VERSION;
384 }
385
386 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
387