1 /**
2  * @file xdict.c (dictionary query)
3  * @author Hightman Mar
4  * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5  * $Id$
6  */
7 
8 #ifdef HAVE_CONFIG_H
9 #    include "config.h"
10 #endif
11 
12 #ifdef WIN32
13 #    include "config_win32.h"
14 #endif
15 
16 #include "xdict.h"
17 #include "xtree.h"
18 #include "xdb.h"
19 #include "crc32.h"
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <unistd.h>
24 #ifndef WIN32
25 #    include <sys/param.h>
26 #endif
27 #include <sys/types.h>
28 #include <sys/stat.h>
29 
30 /* temp file format for TEXT xdb */
31 #if !defined(PATH_MAX) || (PATH_MAX < 1024)
32 #    define	XDICT_PATH_MAX	1024
33 #else
34 #    define	XDICT_PATH_MAX	PATH_MAX
35 #endif
36 
37 #ifdef HAVE_STRTOK_R
38 #    define	_strtok_r	strtok_r
39 #else
40 
/**
 * Reentrant tokenizer used when the platform has no strtok_r().
 *
 * @param s      string to tokenize on the first call, NULL to continue from
 *               the position saved in *lasts
 * @param delim  NUL-terminated set of delimiter characters
 * @param lasts  scan position carried between calls; set to NULL once the
 *               input is exhausted
 * @return the next token (terminated in place inside the input buffer), or
 *         NULL when no token is left
 */
static char *_strtok_r(char *s, char *delim, char **lasts)
{
	char *start, *end;

	/* continue from the saved position when no new string is supplied */
	if (s == NULL)
	{
		s = *lasts;
		if (s == NULL)
			return NULL;
	}

	/* step over any leading delimiters */
	start = s + strspn(s, delim);
	if (*start == '\0')
	{
		/* nothing but delimiters remained */
		*lasts = NULL;
		return NULL;
	}

	/* the token runs up to the next delimiter or the end of the string */
	end = start + strcspn(start, delim);
	if (*end == '\0')
		*lasts = NULL;
	else
	{
		*end = '\0';
		*lasts = end + 1;
	}
	return start;
}
88 #endif
89 
90 #ifdef WIN32
91 #    include <direct.h>
92 
_realpath(const char * src,char * dst)93 static void _realpath(const char *src, char *dst)
94 {
95 	int len = strlen(src);
96 	if (strchr(src, ':') != NULL)
97 		memcpy(dst, src, len + 1);
98 	else
99 	{
100 		char *ptr;
101 		getcwd(dst, XDICT_PATH_MAX - len - 2);
102 		ptr = dst + strlen(dst);
103 		*ptr++ = '/';
104 		memcpy(ptr, src, len + 1);
105 	}
106 }
107 #else
108 #    define	_realpath	realpath
109 #endif
110 
111 /* open the text dict */
_xdict_open_txt(const char * fpath,int mode,unsigned char * ml)112 static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml)
113 {
114 	xdict_t xd;
115 	xtree_t xt;
116 	char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX];
117 	struct stat st1, st2;
118 
119 	// check the input filepath
120 	_realpath(fpath, buf);
121 	if (stat(buf, &st1) < 0)
122 		return NULL;
123 
124 	// check dest file & orginal file, compare there mtime
125 #ifdef WIN32
126 	{
127 		char *tmp_ptr;
128 		GetTempPath(sizeof(tmpfile) - 20, tmpfile);
129 		tmp_ptr = tmpfile + strlen(tmpfile);
130 		if (tmp_ptr[-1] == '\\') tmp_ptr--;
131 		sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf));
132 	}
133 #else
134 	sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf));
135 #endif
136 	if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime)
137 	{
138 		xdb_t x;
139 		if ((x = xdb_open(tmpfile, 'r')) != NULL)
140 		{
141 			xd = (xdict_t) malloc(sizeof(xdict_st));
142 			memset(xd, 0, sizeof(xdict_st));
143 			xd->ref = 1;
144 
145 			if (mode & SCWS_XDICT_MEM)
146 			{
147 				/* convert the xdb(disk) -> xtree(memory) */
148 				if ((xt = xdb_to_xtree(x, NULL)) != NULL)
149 				{
150 					xdb_close(x);
151 					xd->xdict = (void *) xt;
152 					xd->xmode = SCWS_XDICT_MEM;
153 					return xd;
154 				}
155 			}
156 			xd->xmode = SCWS_XDICT_XDB;
157 			xd->xdict = (void *) x;
158 			return xd;
159 		}
160 	}
161 
162 	// create xtree
163 	if ((xt = xtree_new(0, 0)) == NULL)
164 		return NULL;
165 	else
166 	{
167 		int cl, kl;
168 		FILE *fp;
169 		word_st word, *w;
170 		char *key, *part, *last, *delim = " \t\r\n";
171 
172 		// re-build the xdb file from text file
173 		if ((fp = fopen(buf, "r")) == NULL)
174 			return NULL;
175 
176 		// parse every line
177 		word.attr[2] = '\0';
178 		while (fgets(buf, sizeof(buf) - 1, fp) != NULL)
179 		{
180 			// <word>[\t<tf>[\t<idf>[\t<attr>]]]
181 			if (buf[0] == ';' || buf[0] == '#') continue;
182 
183 			key = _strtok_r(buf, delim, &last);
184 			if (key == NULL) continue;
185 			kl = strlen(key);
186 
187 			// init the word
188 			do
189 			{
190 				word.tf = word.idf = 1.0;
191 				word.flag = SCWS_WORD_FULL;
192 				word.attr[0] = '@';
193 				word.attr[1] = '\0';
194 
195 				if (!(part = _strtok_r(NULL, delim, &last))) break;
196 				word.tf = (float) atof(part);
197 
198 				if (!(part = _strtok_r(NULL, delim, &last))) break;
199 				word.idf = (float) atof(part);
200 
201 				if (part = _strtok_r(NULL, delim, &last))
202 				{
203 					word.attr[0] = part[0];
204 					if (part[1]) word.attr[1] = part[1];
205 				}
206 			}
207 			while (0);
208 
209 			// save into xtree
210 			if ((w = xtree_nget(xt, key, kl, NULL)) == NULL)
211 			{
212 				w = (word_st *) pmalloc(xt->p, sizeof(word_st));
213 				memcpy(w, &word, sizeof(word));
214 				xtree_nput(xt, w, sizeof(word), key, kl);
215 			}
216 			else
217 			{
218 				w->tf = word.tf;
219 				w->idf = word.idf;
220 				w->flag |= word.flag;
221 				strcpy(w->attr, word.attr);
222 			}
223 
224 			// parse the part
225 			cl = ml[(unsigned char) (key[0])];
226 			while (1)
227 			{
228 				cl += ml[(unsigned char) (key[cl])];
229 				if (cl >= kl) break;
230 
231 				if ((w = xtree_nget(xt, key, cl, NULL)) != NULL)
232 					w->flag |= SCWS_WORD_PART;
233 				else
234 				{
235 					w = (word_st *) pmalloc_z(xt->p, sizeof(word_st));
236 					w->flag = SCWS_WORD_PART;
237 					xtree_nput(xt, w, sizeof(word), key, cl);
238 				}
239 			}
240 		}
241 		fclose(fp);
242 
243 		// optimize the xtree & save to xdb
244 		xtree_optimize(xt);
245 		unlink(tmpfile);
246 		xtree_to_xdb(xt, tmpfile);
247 		chmod(tmpfile, 0777);
248 
249 		// return xtree
250 		xd = (xdict_t) malloc(sizeof(xdict_st));
251 		memset(xd, 0, sizeof(xdict_st));
252 		xd->ref = 1;
253 		xd->xdict = (void *) xt;
254 		xd->xmode = SCWS_XDICT_MEM;
255 		return xd;
256 	}
257 }
258 
259 /* setup & open the dict */
xdict_open(const char * fpath,int mode)260 xdict_t xdict_open(const char *fpath, int mode)
261 {
262 	xdict_t xd;
263 	xdb_t x;
264 
265 	if (!(x = xdb_open(fpath, 'r')))
266 		return NULL;
267 
268 	xd = (xdict_t) malloc(sizeof(xdict_st));
269 	memset(xd, 0, sizeof(xdict_st));
270 	xd->ref = 1;
271 	if (mode & SCWS_XDICT_MEM)
272 	{
273 		xtree_t xt;
274 
275 		/* convert the xdb(disk) -> xtree(memory) */
276 		if ((xt = xdb_to_xtree(x, NULL)) != NULL)
277 		{
278 			xdb_close(x);
279 			xd->xdict = (void *) xt;
280 			xd->xmode = SCWS_XDICT_MEM;
281 			return xd;
282 		}
283 	}
284 
285 	xd->xmode = SCWS_XDICT_XDB;
286 	xd->xdict = (void *) x;
287 	return xd;
288 }
289 
290 /* add a dict */
/**
 * Open one more dictionary and push it onto the front of a chain.
 *
 * @param xd     current head of the chain (may be NULL)
 * @param fpath  dictionary file to open
 * @param mode   SCWS_XDICT_* flags; SCWS_XDICT_TXT selects the text loader,
 *               otherwise the file is opened as XDB
 * @param ml     byte-indexed table handed to the text loader
 * @return the new head on success; xd unchanged when the file cannot be
 *         opened
 */
xdict_t xdict_add(xdict_t xd, const char *fpath, int mode, unsigned char *ml)
{
	xdict_t head;

	if (mode & SCWS_XDICT_TXT)
		head = _xdict_open_txt(fpath, mode, ml);
	else
		head = xdict_open(fpath, mode);

	/* opening failed: the chain stays as it was */
	if (head == NULL)
		return xd;

	head->next = xd;
	return head;
}
303 
304 /* fork the dict */
/**
 * Take an additional reference on every node of a dictionary chain.
 *
 * @param xd  head of the chain (may be NULL)
 * @return xd itself
 */
xdict_t xdict_fork(xdict_t xd)
{
	xdict_t node = xd;

	while (node != NULL)
	{
		node->ref += 1;
		node = node->next;
	}
	return xd;
}
314 
315 /* close the dict */
/**
 * Drop one reference from every node of a dictionary chain.
 *
 * A node whose count reaches zero has its backing store released
 * (xtree_free for in-memory nodes, xdb_close otherwise) and is then freed.
 *
 * @param xd  head of the chain (may be NULL)
 */
void xdict_close(xdict_t xd)
{
	xdict_t following;

	for (; xd != NULL; xd = following)
	{
		/* read the link first: the node may be freed below */
		following = xd->next;

		xd->ref--;
		if (xd->ref != 0)
			continue;

		if (xd->xmode == SCWS_XDICT_MEM)
			xtree_free((xtree_t) xd->xdict);
		else
			xdb_close((xdb_t) xd->xdict);
		free(xd);
	}
}
336 
337 /* query the word */
338 #define	_FLAG_BOTH(x)	(((x)->flag & (SCWS_WORD_PART|SCWS_WORD_FULL)) == (SCWS_WORD_PART|SCWS_WORD_FULL))
339 #define	_FLAG_FULL(x)	((x)->flag & SCWS_WORD_FULL)
340 #define	_FLAG_PART(x)	((x)->flag & SCWS_WORD_PART)
341 #define	_FLAG_MALLOC(x)	((x)->flag & SCWS_WORD_MALLOCED)
342 
xdict_query(xdict_t xd,const char * key,int len)343 word_t xdict_query(xdict_t xd, const char *key, int len)
344 {
345 	word_t value, value2;
346 
347 	value = value2 = NULL;
348 	while (xd != NULL)
349 	{
350 		if (xd->xmode == SCWS_XDICT_MEM)
351 		{
352 			/* this is ThreadSafe, recommend. */
353 			value = (word_t) xtree_nget((xtree_t) xd->xdict, key, len, NULL);
354 		}
355 		else
356 		{
357 			/* the value malloced in lib-XDB. free required */
358 			value = (word_t) xdb_nget((xdb_t) xd->xdict, key, len, NULL);
359 			if (value != NULL) value->flag |= SCWS_WORD_MALLOCED;
360 		}
361 		xd = xd->next;
362 
363 		// check value2
364 		if (value != NULL)
365 		{
366 			if (value2 == NULL)
367 			{
368 				if (_FLAG_BOTH(value))
369 					return value;
370 				value2 = value;
371 			}
372 			else
373 			{
374 				if (_FLAG_FULL(value2) && _FLAG_PART(value))
375 				{
376 					value2->flag |= SCWS_WORD_PART;
377 					if (_FLAG_MALLOC(value))
378 						free(value);
379 					return value2;
380 				}
381 				if (_FLAG_FULL(value) && _FLAG_PART(value2))
382 				{
383 					value->flag |= SCWS_WORD_PART;
384 					if (_FLAG_MALLOC(value2))
385 						free(value2);
386 					return value;
387 				}
388 				if (_FLAG_MALLOC(value))
389 					free(value);
390 			}
391 		}
392 	}
393 	return value2;
394 }
395