1 /* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
2 /**
3  * textcat.c -- routines for categorizing text
4  *
5  * Copyright (C) 2003 WiseGuys Internet B.V.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * - Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  *
16  * - Redistributions in binary form must reproduce the above copyright
17  * notice, this list of conditions and the following disclaimer in the
18  * documentation and/or other materials provided with the
19  * distribution.
20  *
21  * - Neither the name of the WiseGuys Internet B.V. nor the names of
22  * its contributors may be used to endorse or promote products derived
23  * from this software without specific prior written permission.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
28  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
29  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
30  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
31  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
32  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
33  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
34  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
35  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36  *
37  * DESCRIPTION
38  *
39  * These routines use the N-gram fingerprinting technique as described
40  * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization.
41  * (cf. http://www.nonlineardynamics.com/trenkle/)
42  *
43  * REVISION HISTORY
44  *
45  * Mar 27, 2003 frank@wise-guys.nl -- created
46  *
47  * IMPROVEMENTS:
48  * - If two n-grams have the same frequency count, choose the shortest
49  * - Use a better similarity measure (the article suggests Wilcoxon rank test)
50  * - The profiles are matched one by one, which results in redundant lookups.
51  * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned
52  *   by the use of the output buffer in textcat_t.)
53  */
54 #ifdef HAVE_CONFIG_H
55 #include "config.h"
56 #endif
57 
58 #include <stdlib.h>
59 #include <string.h>
60 
61 #include "common_impl.h"
62 #include "fingerprint.h"
63 #include "textcat.h"
64 #include "constants.h"
65 
66 
67 typedef struct
68 {
69 
70     void **fprint;
71     unsigned char *fprint_disable;
72     uint4 size;
73     uint4 maxsize;
74     uint4 mindocsize;
75 
76     char output[MAXOUTPUTSIZE];
77     candidate_t *tmp_candidates;
78     boole utfaware;
79 } textcat_t;
80 
81 
cmpcandidates(const void * a,const void * b)82 static int cmpcandidates(const void *a, const void *b)
83 {
84     const candidate_t *x = (const candidate_t *)a;
85     const candidate_t *y = (const candidate_t *)b;
86     return (x->score - y->score);
87 }
88 
89 
textcat_Done(void * handle)90 extern void textcat_Done(void *handle)
91 {
92     textcat_t *h = (textcat_t *) handle;
93     uint4 i;
94 
95     for (i = 0; i < h->size; i++)
96     {
97         fp_Done(h->fprint[i]);
98     }
99     if (h->tmp_candidates != NULL)
100     {
101         textcat_ReleaseClassifyFullOutput(h, h->tmp_candidates);
102     }
103     free(h->fprint);
104     free(h->fprint_disable);
105     free(h);
106 
107 }
108 
/**
 * Changes one tunable of a classifier handle.
 *
 * @param handle    classifier handle
 * @param property  TCPROP_UTF8AWARE (value must be TC_TRUE or TC_FALSE) or
 *                  TCPROP_MINIMUM_DOCUMENT_SIZE (value must be positive)
 * @param value     new value for the property
 * @return 0 when the value was stored, -2 when the value is not acceptable
 *         for the property, -1 when the property is not recognised
 */
extern int textcat_SetProperty(void *handle, textcat_Property property,
                               sint4 value)
{
    textcat_t *tc = (textcat_t *) handle;
    int status = -1;            /* stays -1 for an unrecognised property */

    switch (property)
    {
    case TCPROP_UTF8AWARE:
        if ((value != TC_TRUE) && (value != TC_FALSE))
        {
            status = -2;
        }
        else
        {
            tc->utfaware = value;
            status = 0;
        }
        break;
    case TCPROP_MINIMUM_DOCUMENT_SIZE:
        if (value <= 0)
        {
            status = -2;
        }
        else
        {
            tc->mindocsize = value;
            status = 0;
        }
        break;
    default:
        break;
    }

    return status;
}
136 
137 /** Replaces older function */
textcat_Init(const char * conffile)138 extern void *textcat_Init(const char *conffile)
139 {
140     return special_textcat_Init(conffile, DEFAULT_FINGERPRINTS_PATH);
141 }
142 
143 /**
144  * Originaly this function had only one parameter (conffile) it has been modified since OOo use
145  * Basicaly prefix is the directory path where fingerprints are stored
146  */
special_textcat_Init(const char * conffile,const char * prefix)147 extern void *special_textcat_Init(const char *conffile, const char *prefix)
148 {
149     textcat_t *h;
150     char *finger_print_file_name;
151     size_t finger_print_file_name_size;
152     size_t prefix_size;
153     char line[1024];
154     FILE *fp;
155 
156     fp = fopen(conffile, "r");
157     if (!fp)
158     {
159 #ifdef VERBOSE
160         fprintf(stderr, "Failed to open config file '%s'\n", conffile);
161 #endif
162         return NULL;
163     }
164 
165     h = (textcat_t *) malloc(sizeof(textcat_t));
166     h->size = 0;
167     h->maxsize = 16;
168     h->mindocsize = MINDOCSIZE;
169     h->fprint = (void **)malloc(sizeof(void *) * h->maxsize);
170     h->fprint_disable =
171         (unsigned char *)malloc(sizeof(unsigned char) * h->maxsize);
172     /* added to store the state of languages */
173     h->tmp_candidates = NULL;
174     h->utfaware = TC_TRUE;
175 
176     prefix_size = strlen(prefix);
177     finger_print_file_name_size = prefix_size + 1;
178     finger_print_file_name =
179         (char *)malloc(sizeof(char) * (finger_print_file_name_size + 1024));
180     finger_print_file_name[0] = '\0';
181     strcat(finger_print_file_name, prefix);
182 
183     while (wg_getline(line, 1024, fp))
184     {
185         char *p;
186         char *segment[4];
187 
188         /*** Skip comments ***/
189         if ((p = strchr(line, '#')))
190         {
191             *p = '\0';
192         }
193 
194         if (wg_split(segment, line, line, 4) < 2)
195         {
196             continue;
197         }
198 
199         /*** Ensure enough space ***/
200         if (h->size == h->maxsize)
201         {
202             h->maxsize *= 2;
203             h->fprint =
204                 (void **)realloc(h->fprint, sizeof(void *) * h->maxsize);
205             h->fprint_disable =
206                 (unsigned char *)realloc(h->fprint_disable,
207                                          sizeof(unsigned char) * h->maxsize);
208         }
209 
210         /*** Load data ***/
211         if ((h->fprint[h->size] = fp_Init(segment[1])) == NULL)
212         {
213             goto BAILOUT;
214         }
215 
216         while (prefix_size + strlen(segment[0]) > finger_print_file_name_size)
217         {
218             char *tmp;
219             size_t tmp_size = finger_print_file_name_size * 2;
220             tmp =
221                 (char *)realloc(finger_print_file_name,
222                                 sizeof(char) * (tmp_size + 1));
223             if (tmp == NULL)
224             {
225                 goto BAILOUT;
226             }
227             else
228             {
229                 finger_print_file_name = tmp;
230                 finger_print_file_name_size = tmp_size;
231             }
232         }
233         finger_print_file_name[prefix_size] = '\0';
234         strcat(finger_print_file_name, segment[0]);
235 
236         if (fp_Read(h->fprint[h->size], finger_print_file_name, 400) == 0)
237             goto BAILOUT;
238         h->fprint_disable[h->size] = 0xF0;  /* 0xF0 is the code for enabled
239                                                languages, 0x0F is for disabled
240                                              */
241         h->size++;
242     }
243 
244     free(finger_print_file_name);
245 
246     fclose(fp);
247     return h;
248 
249   BAILOUT:
250     free(finger_print_file_name);
251     fclose(fp);
252     textcat_Done(h);
253     return NULL;
254 }
255 
textcat_GetClassifyFullOutput(void * handle)256 extern candidate_t *textcat_GetClassifyFullOutput(void *handle)
257 {
258     textcat_t *h = (textcat_t *) handle;
259     return (candidate_t *) malloc(sizeof(candidate_t) * h->size);
260 }
261 
/**
 * Frees a candidate array obtained from textcat_GetClassifyFullOutput().
 *
 * @param handle      classifier handle; not used by this function
 * @param candidates  array to free; NULL is accepted
 */
extern void textcat_ReleaseClassifyFullOutput(void *handle,
                                              candidate_t * candidates)
{
    (void) handle;

    /* free() already ignores NULL */
    free(candidates);
}
270 
textcat_Classify(void * handle,const char * buffer,size_t size)271 extern char *textcat_Classify(void *handle, const char *buffer, size_t size)
272 {
273     textcat_t *h = (textcat_t *) handle;
274     char *result = h->output;
275     uint4 i, cnt;
276 
277     if (h->tmp_candidates == NULL)
278     {
279         h->tmp_candidates = textcat_GetClassifyFullOutput(h);
280     }
281 
282     cnt = textcat_ClassifyFull(h, buffer, size, h->tmp_candidates);
283 
284     switch (cnt)
285     {
286     case TEXTCAT_RESULT_UNKNOWN:
287         result = TEXTCAT_RESULT_UNKNOWN_STR;
288         break;
289     case TEXTCAT_RESULT_SHORT:
290         result = TEXTCAT_RESULT_SHORT_STR;
291         break;
292     default:
293         {
294             const char *plimit = result + MAXOUTPUTSIZE;
295             char *p = result;
296 
297             *p = '\0';
298             for (i = 0; i < cnt; i++)
299             {
300                 p = wg_strgmov(p, "[", plimit);
301                 p = wg_strgmov(p, h->tmp_candidates[i].name, plimit);
302                 p = wg_strgmov(p, "]", plimit);
303             }
304         }
305     }
306 
307     return result;
308 }
309 
310 
/**
 * Scores a text buffer against every loaded fingerprint and reports the
 * best matching categories.
 *
 * @param handle      classifier handle
 * @param buffer      text to classify
 * @param size        number of bytes in buffer
 * @param candidates  caller-supplied array with at least one entry per
 *                    loaded fingerprint (see
 *                    textcat_GetClassifyFullOutput()); every entry may be
 *                    overwritten
 * @return TEXTCAT_RESULT_SHORT when fp_Create() cannot build a fingerprint
 *         from the buffer; TEXTCAT_RESULT_UNKNOWN when more than
 *         MAXCANDIDATES categories pass the threshold or the working
 *         fingerprint cannot be created; otherwise the number of
 *         candidates kept, stored at the front of candidates in ascending
 *         score order
 */
extern int textcat_ClassifyFull(void *handle, const char *buffer, size_t size,
                                candidate_t * candidates)
{
    textcat_t *h = (textcat_t *) handle;
    uint4 i, cnt = 0;
    int minscore = MAXSCORE;
    int threshold = minscore;

    void *unknown;

    unknown = fp_Init(NULL);
    if (unknown == NULL)
    {
        /* No fingerprint object to work with: nothing can be determined. */
        return TEXTCAT_RESULT_UNKNOWN;
    }
    fp_SetProperty(unknown, TCPROP_UTF8AWARE, h->utfaware);
    fp_SetProperty(unknown, TCPROP_MINIMUM_DOCUMENT_SIZE, h->mindocsize);
    if (fp_Create(unknown, buffer, size, MAXNGRAMS) == 0)
    {
        /*** Too little information ***/
        fp_Done(unknown);
        return TEXTCAT_RESULT_SHORT;
    }

    /*** Calculate the score for each category. ***/
    for (i = 0; i < h->size; i++)
    {
        int score;
        if (h->fprint_disable[i] & 0x0F)
        {                       /* if this language is disabled */
            score = MAXSCORE;
        }
        else
        {
            score = fp_Compare(h->fprint[i], unknown, threshold);
        }
        candidates[i].score = score;
        candidates[i].name = fp_Name(h->fprint[i]);
        if (score < minscore)
        {
            /* New best score: the acceptance threshold follows it, scaled
               by THRESHOLDVALUE. */
            minscore = score;
            threshold = (int)((double)score * THRESHOLDVALUE);
        }
    }

    /*** Find the best performers ***/
    /* Entries below the threshold are compacted to the front of the array.
       The write index cnt - 1 never runs ahead of the read index i. */
    for (i = 0, cnt = 0; i < h->size; i++)
    {
        if (candidates[i].score < threshold)
        {
            if (++cnt == MAXCANDIDATES + 1)
            {
                break;
            }

            /* Plain assignment instead of memcpy(): source and destination
               are the same object whenever cnt - 1 == i, which memcpy()
               does not permit. */
            candidates[cnt - 1] = candidates[i];
        }
    }

    fp_Done(unknown);
    /*** The verdict ***/
    if (cnt == MAXCANDIDATES + 1)
    {
        return TEXTCAT_RESULT_UNKNOWN;
    }
    else
    {
        qsort(candidates, cnt, sizeof(candidate_t), cmpcandidates);
        return cnt;
    }
}
380 
textcat_Version(void)381 extern const char *textcat_Version(void)
382 {
383     return EXTTEXTCAT_VERSION;
384 }
385 
386 /* vim:set shiftwidth=4 softtabstop=4 expandtab: */
387