1 /**
2 * textcat.c -- routines for categorizing text
3 *
4 * Copyright (C) 2003 WiseGuys Internet B.V.
5 *
6 * THE BSD LICENSE
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * - Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 *
15 * - Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the
18 * distribution.
19 *
20 * - Neither the name of the WiseGuys Internet B.V. nor the names of
21 * its contributors may be used to endorse or promote products derived
22 * from this software without specific prior written permission.
23 *
24 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
28 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
29 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
30 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
31 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
32 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
33 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
34 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35 *
36 * DESCRIPTION
37 *
38 * These routines use the N-gram fingerprinting technique as described
39 * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization.
40 * (cf. http://www.nonlineardynamics.com/trenkle/)
41 *
42 * REVISION HISTORY
43 *
44 * Mar 27, 2003 frank@wise-guys.nl -- created
45 *
46 * IMPROVEMENTS:
47 * - If two n-grams have the same frequency count, choose the shortest
48 * - Use a better similarity measure (the article suggests Wilcoxon rank test)
49 * - The profiles are matched one by one, which results in redundant lookups.
50 * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned
51 * by the use of the output buffer in textcat_t.)
52 */
53 #include "config.h"
54
55 #ifdef HAVE_STDLIB_H
56 #include <stdlib.h>
57 #endif
58 #ifdef HAVE_STRINGS_H
59 #include <strings.h>
60 #endif
61 #ifdef HAVE_STRING_H
62 #include <string.h>
63 #endif
64 #ifdef HAVE_ALLOCA_H
65 #include <alloca.h>
66 #endif
67
68 #include "common.h"
69 #include "fingerprint.h"
70 #include "textcat.h"
71 #include "constants.h"
72
73
74 typedef struct {
75
76 void **fprint;
77 char *fprint_disable;
78 uint4 size;
79 uint4 maxsize;
80
81 char output[MAXOUTPUTSIZE];
82
83 } textcat_t;
84
85
86 typedef struct {
87 int score;
88 const char *name;
89 } candidate_t;
90
91
cmpcandidates(const void * a,const void * b)92 static int cmpcandidates(const void *a, const void *b)
93 {
94 candidate_t *x = (candidate_t *)a;
95 candidate_t *y = (candidate_t *)b;
96
97 if ( x->score < y->score ) {
98 return -1;
99 }
100 if ( x->score > y->score ) {
101 return 1;
102 }
103 return 0;
104 }
105
106
textcat_Done(void * handle)107 extern void textcat_Done( void *handle )
108 {
109 textcat_t *h = (textcat_t *)handle;
110 uint4 i;
111
112 for (i=0; i<h->size; i++) {
113 fp_Done( h->fprint[i] );
114 }
115 wg_free( h->fprint );
116 wg_free( h->fprint_disable );
117 wg_free( h );
118
119 }
120
121 /** Replaces older function */
textcat_Init(const char * conffile)122 extern void *textcat_Init( const char *conffile ){
123 return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH );
124 }
125
126 /**
127 * Originaly this function had only one parameter (conffile) it has been modified since OOo use
128 * Basicaly prefix is the directory path where fingerprints are stored
129 */
special_textcat_Init(const char * conffile,const char * prefix)130 extern void *special_textcat_Init( const char *conffile, const char *prefix )
131 {
132 textcat_t *h;
133 char line[1024];
134 FILE *fp;
135
136 fp = fopen( conffile, "r" );
137 if ( !fp ) {
138 #ifdef VERBOSE
139 fprintf( stderr, "Failed to open config file '%s'\n", conffile);
140 #endif
141 return NULL;
142 }
143
144 h = (textcat_t *)wg_malloc(sizeof(textcat_t));
145 h->size = 0;
146 h->maxsize = 16;
147 h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
148 h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize ); /*added to store the state of languages*/
149
150 while ( wg_getline( line, 1024, fp ) ) {
151 char *p;
152 char *segment[4];
153 char finger_print_file_name[512];
154 int res;
155
156 /*** Skip comments ***/
157 #ifdef HAVE_STRCHR
158 if (( p = strchr(line,'#') )) {
159 #else
160 if (( p = index(line,'#') )) {
161 #endif
162
163 *p = '\0';
164 }
165 if ((res = wg_split( segment, line, line, 4)) < 2 ) {
166 continue;
167 }
168
169 /*** Ensure enough space ***/
170 if ( h->size == h->maxsize ) {
171 h->maxsize *= 2;
172 h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
173 h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize );
174 }
175
176 /*** Load data ***/
177 if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
178 goto ERROR;
179 }
180 finger_print_file_name[0] = '\0';
181 strcat(finger_print_file_name, prefix);
182 strcat(finger_print_file_name, segment[0]);
183
184 if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) {
185 textcat_Done(h);
186 goto ERROR;
187 }
188 h->fprint_disable[h->size] = 0xF0; /*0xF0 is the code for enabled languages, 0x0F is for disabled*/
189 h->size++;
190 }
191
192 fclose(fp);
193 return h;
194
195 ERROR:
196 fclose(fp);
197 return NULL;
198
199 }
200
201
202 extern char *textcat_Classify( void *handle, const char *buffer, size_t size )
203 {
204 textcat_t *h = (textcat_t *)handle;
205 uint4 i, cnt = 0;
206 int minscore = MAXSCORE;
207 int threshold = minscore;
208 char *result = h->output;
209
210 #ifdef HAVE_ALLOCA
211 candidate_t *candidates = (candidate_t *)alloca( sizeof(candidate_t) * h->size );
212 #else
213 candidate_t *candidates = (candidate_t *)malloc( sizeof(candidate_t) * h->size );
214 #define SHOULD_FREE 1
215 #endif
216
217 void *unknown;
218
219 unknown = fp_Init(NULL);
220 if ( fp_Create( unknown, buffer, size, MAXNGRAMS ) == 0 ) {
221 /*** Too little information ***/
222 result = _TEXTCAT_RESULT_SHORT;
223 goto READY;
224 }
225
226 /*** Calculate the score for each category. ***/
227 for (i=0; i<h->size; i++) {
228 int score;
229 if(h->fprint_disable[i] & 0x0F){ /*if this language is disabled*/
230 score = MAXSCORE;
231 }
232 else{
233 score = fp_Compare( h->fprint[i], unknown, threshold );
234 /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/
235 }
236 candidates[i].score = score;
237 candidates[i].name = fp_Name( h->fprint[i] );
238 if ( score < minscore ) {
239 minscore = score;
240 threshold = (int)( (double)score * THRESHOLDVALUE );
241 }
242 }
243
244 /*** Find the best performers ***/
245 for (i=0; i<h->size; i++) {
246 if ( candidates[i].score < threshold ) {
247
248 if ( ++cnt == MAXCANDIDATES+1 ) {
249 break;
250 }
251
252 memcpy( &candidates[cnt-1], &candidates[i], sizeof(candidate_t) );
253
254 }
255 }
256
257 /*** The verdict ***/
258 if ( cnt == MAXCANDIDATES+1 ) {
259 result = _TEXTCAT_RESULT_UNKOWN;
260 }
261 else {
262 char *p = result;
263 char *plimit = result+MAXOUTPUTSIZE;
264
265 qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );
266
267 *p = '\0';
268 for (i=0; i<cnt; i++) {
269 p = wg_strgmov( p, "[", plimit );
270 p = wg_strgmov( p, candidates[i].name, plimit );
271 p = wg_strgmov( p, "]", plimit );
272 }
273 }
274 READY:
275 fp_Done(unknown);
276 #ifdef SHOULD_FREE
277 free(candidates);
278 #undef SHOULD_FREE
279 #endif
280 return result;
281 }
282
283
284 extern char *textcat_Version()
285 {
286 return "TextCat " PACKAGE_VERSION " (" DESCRIPTION ")";
287 }
288