1 /**
2  * textcat.c -- routines for categorizing text
3  *
4  * Copyright (C) 2003 WiseGuys Internet B.V.
5  *
6  * THE BSD LICENSE
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * - Redistributions of source code must retain the above copyright
13  * notice, this list of conditions and the following disclaimer.
14  *
15  * - Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in the
17  * documentation and/or other materials provided with the
18  * distribution.
19  *
20  * - Neither the name of the WiseGuys Internet B.V. nor the names of
21  * its contributors may be used to endorse or promote products derived
22  * from this software without specific prior written permission.
23  *
24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
25  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
27  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
28  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
29  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
30  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
31  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
32  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
33  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
34  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
35  *
36  * DESCRIPTION
37  *
38  * These routines use the N-gram fingerprinting technique as described
39  * in Cavnar and Trenkle, (1994.), N-Gram-Based Text Categorization.
40  * (cf. http://www.nonlineardynamics.com/trenkle/)
41  *
42  * REVISION HISTORY
43  *
44  * Mar 27, 2003 frank@wise-guys.nl -- created
45  *
46  * IMPROVEMENTS:
47  * - If two n-grams have the same frequency count, choose the shortest
48  * - Use a better similarity measure (the article suggests Wilcoxon rank test)
49  * - The profiles are matched one by one, which results in redundant lookups.
50  * - Make the thingy reentrant as well as thread-safe. (Reentrancy is abandoned
51  *   by the use of the output buffer in textcat_t.)
52  */
53 #include "config.h"
54 
55 #ifdef HAVE_STDLIB_H
56 #include <stdlib.h>
57 #endif
58 #ifdef HAVE_STRINGS_H
59 #include <strings.h>
60 #endif
61 #ifdef HAVE_STRING_H
62 #include <string.h>
63 #endif
64 #ifdef HAVE_ALLOCA_H
65 #include <alloca.h>
66 #endif
67 
68 #include "common.h"
69 #include "fingerprint.h"
70 #include "textcat.h"
71 #include "constants.h"
72 
73 
74 typedef struct {
75 
76 	void **fprint;
77         char *fprint_disable;
78 	uint4 size;
79 	uint4 maxsize;
80 
81 	char output[MAXOUTPUTSIZE];
82 
83 } textcat_t;
84 
85 
86 typedef struct {
87 	int score;
88 	const char *name;
89 } candidate_t;
90 
91 
cmpcandidates(const void * a,const void * b)92 static int cmpcandidates(const void *a, const void *b)
93 {
94 	candidate_t *x = (candidate_t *)a;
95 	candidate_t *y = (candidate_t *)b;
96 
97 	if ( x->score < y->score ) {
98 		return -1;
99 	}
100 	if ( x->score > y->score ) {
101 		return 1;
102 	}
103 	return 0;
104 }
105 
106 
textcat_Done(void * handle)107 extern void textcat_Done( void *handle )
108 {
109 	textcat_t *h = (textcat_t *)handle;
110 	uint4 i;
111 
112 	for (i=0; i<h->size; i++) {
113 		fp_Done( h->fprint[i] );
114 	}
115 	wg_free( h->fprint );
116         wg_free( h->fprint_disable );
117 	wg_free( h );
118 
119 }
120 
121 /** Replaces older function */
textcat_Init(const char * conffile)122 extern void *textcat_Init( const char *conffile ){
123     return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH );
124 }
125 
126 /**
127  * Originaly this function had only one parameter (conffile) it has been modified since OOo use
128  * Basicaly prefix is the directory path where fingerprints are stored
129  */
special_textcat_Init(const char * conffile,const char * prefix)130 extern void *special_textcat_Init( const char *conffile, const char *prefix )
131 {
132 	textcat_t *h;
133 	char line[1024];
134 	FILE *fp;
135 
136 	fp = fopen( conffile, "r" );
137 	if ( !fp ) {
138 #ifdef VERBOSE
139 		fprintf( stderr, "Failed to open config file '%s'\n", conffile);
140 #endif
141 		return NULL;
142 	}
143 
144 	h = (textcat_t *)wg_malloc(sizeof(textcat_t));
145 	h->size = 0;
146 	h->maxsize = 16;
147 	h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
148  h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize );   /*added to store the state of languages*/
149 
150 	while ( wg_getline( line, 1024, fp ) ) {
151 		char *p;
152 		char *segment[4];
153                 char finger_print_file_name[512];
154                 int res;
155 
156 		/*** Skip comments ***/
157 #ifdef HAVE_STRCHR
158 		if (( p = strchr(line,'#') )) {
159 #else
160 		if (( p = index(line,'#') )) {
161 #endif
162 
163 			*p = '\0';
164 		}
165 		if ((res = wg_split( segment, line, line, 4)) < 2 ) {
166 			continue;
167 		}
168 
169 		/*** Ensure enough space ***/
170 		if ( h->size == h->maxsize ) {
171 			h->maxsize *= 2;
172 			h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
173                         h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize );
174 		}
175 
176 		/*** Load data ***/
177 		if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
178 			goto ERROR;
179 		}
180                 finger_print_file_name[0] = '\0';
181                 strcat(finger_print_file_name, prefix);
182                 strcat(finger_print_file_name, segment[0]);
183 
184                 if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) {
185 			textcat_Done(h);
186 			goto ERROR;
187 		}
188                 h->fprint_disable[h->size] = 0xF0;  /*0xF0 is the code for enabled languages, 0x0F is for disabled*/
189 		h->size++;
190 	}
191 
192 	fclose(fp);
193 	return h;
194 
195  ERROR:
196 	fclose(fp);
197 	return NULL;
198 
199 }
200 
201 
202 extern char *textcat_Classify( void *handle, const char *buffer, size_t size )
203 {
204 	textcat_t *h = (textcat_t *)handle;
205 	uint4 i, cnt = 0;
206 	int minscore = MAXSCORE;
207 	int threshold = minscore;
208 	char *result = h->output;
209 
210 #ifdef HAVE_ALLOCA
211 	candidate_t *candidates = (candidate_t *)alloca( sizeof(candidate_t) * h->size );
212 #else
213 	candidate_t *candidates = (candidate_t *)malloc( sizeof(candidate_t) * h->size );
214 #define SHOULD_FREE 1
215 #endif
216 
217 	void *unknown;
218 
219 	unknown = fp_Init(NULL);
220 	if ( fp_Create( unknown, buffer, size, MAXNGRAMS ) == 0 ) {
221 		/*** Too little information ***/
222 		result = _TEXTCAT_RESULT_SHORT;
223 		goto READY;
224 	}
225 
226 	/*** Calculate the score for each category. ***/
227 	for (i=0; i<h->size; i++) {
228                 int score;
229                 if(h->fprint_disable[i] & 0x0F){    /*if this language is disabled*/
230                     score = MAXSCORE;
231                 }
232                 else{
233                     score = fp_Compare( h->fprint[i], unknown, threshold );
234                     /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/
235                 }
236                 candidates[i].score = score;
237 		candidates[i].name = fp_Name( h->fprint[i] );
238 		if ( score < minscore ) {
239 			minscore = score;
240 			threshold = (int)( (double)score * THRESHOLDVALUE );
241 		}
242 	}
243 
244 	/*** Find the best performers ***/
245 	for (i=0; i<h->size; i++) {
246 		if ( candidates[i].score < threshold ) {
247 
248 			if ( ++cnt == MAXCANDIDATES+1 ) {
249 				break;
250 			}
251 
252 			memcpy( &candidates[cnt-1], &candidates[i], sizeof(candidate_t) );
253 
254 		}
255 	}
256 
257 	/*** The verdict ***/
258 	if ( cnt == MAXCANDIDATES+1 ) {
259 		result = _TEXTCAT_RESULT_UNKOWN;
260 	}
261 	else {
262 		char *p = result;
263 		char *plimit = result+MAXOUTPUTSIZE;
264 
265 		qsort( candidates, cnt, sizeof(candidate_t), cmpcandidates );
266 
267 		*p = '\0';
268 		for (i=0; i<cnt; i++) {
269 			p = wg_strgmov( p, "[", plimit );
270 			p = wg_strgmov( p, candidates[i].name, plimit );
271 			p = wg_strgmov( p, "]", plimit );
272 		}
273 	}
274  READY:
275 	fp_Done(unknown);
276 #ifdef SHOULD_FREE
277 	free(candidates);
278 #undef SHOULD_FREE
279 #endif
280 	return result;
281 }
282 
283 
284 extern char *textcat_Version()
285 {
286 	return "TextCat " PACKAGE_VERSION " (" DESCRIPTION ")";
287 }
288