1 /**
2 * @file xdict.c (dictionary query)
3 * @author Hightman Mar
4 * @editor set number ; syntax on ; set autoindent ; set tabstop=4 (vim)
5 * $Id$
6 */
7
8 #ifdef HAVE_CONFIG_H
9 # include "config.h"
10 #endif
11
12 #ifdef WIN32
13 # include "config_win32.h"
14 #endif
15
16 #include "xdict.h"
17 #include "xtree.h"
18 #include "xdb.h"
19 #include "crc32.h"
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <unistd.h>
24 #ifndef WIN32
25 # include <sys/param.h>
26 #endif
27 #include <sys/types.h>
28 #include <sys/stat.h>
29
/* size of path buffers: PATH_MAX when available, but never less than 1024 bytes */
31 #if !defined(PATH_MAX) || (PATH_MAX < 1024)
32 # define XDICT_PATH_MAX 1024
33 #else
34 # define XDICT_PATH_MAX PATH_MAX
35 #endif
36
37 #ifdef HAVE_STRTOK_R
38 # define _strtok_r strtok_r
39 #else
40
/**
 * Fallback re-entrant tokenizer used when the platform has no strtok_r().
 *
 * @param s      string to tokenize on the first call, NULL to continue from
 *               the position saved in *lasts
 * @param delim  NUL-terminated set of delimiter characters
 * @param lasts  caller-owned cursor; set to the character after the token's
 *               terminating delimiter, or to NULL once the input is exhausted
 * @return pointer to the next token (NUL-terminated in place inside the input
 *         buffer), or NULL when no further token exists
 */
static char *_strtok_r(char *s, char *delim, char **lasts)
{
    char *token;
    char *end;

    /* continue from the saved position when no new string is supplied */
    if (s == NULL)
    {
        s = *lasts;
        if (s == NULL)
            return NULL;
    }

    /* skip any leading delimiters */
    token = s + strspn(s, delim);
    if (*token == '\0')
    {
        /* nothing but delimiters left */
        *lasts = NULL;
        return NULL;
    }

    /* find the end of the token */
    end = token + strcspn(token, delim);
    if (*end == '\0')
    {
        /* token runs up to the end of the input */
        *lasts = NULL;
    }
    else
    {
        /* cut the token and remember where to resume */
        *end = '\0';
        *lasts = end + 1;
    }
    return token;
}
88 #endif
89
90 #ifdef WIN32
91 # include <direct.h>
92
_realpath(const char * src,char * dst)93 static void _realpath(const char *src, char *dst)
94 {
95 int len = strlen(src);
96 if (strchr(src, ':') != NULL)
97 memcpy(dst, src, len + 1);
98 else
99 {
100 char *ptr;
101 getcwd(dst, XDICT_PATH_MAX - len - 2);
102 ptr = dst + strlen(dst);
103 *ptr++ = '/';
104 memcpy(ptr, src, len + 1);
105 }
106 }
107 #else
108 # define _realpath realpath
109 #endif
110
111 /* open the text dict */
_xdict_open_txt(const char * fpath,int mode,unsigned char * ml)112 static xdict_t _xdict_open_txt(const char *fpath, int mode, unsigned char *ml)
113 {
114 xdict_t xd;
115 xtree_t xt;
116 char buf[XDICT_PATH_MAX], tmpfile[XDICT_PATH_MAX];
117 struct stat st1, st2;
118
119 // check the input filepath
120 _realpath(fpath, buf);
121 if (stat(buf, &st1) < 0)
122 return NULL;
123
124 // check dest file & orginal file, compare there mtime
125 #ifdef WIN32
126 {
127 char *tmp_ptr;
128 GetTempPath(sizeof(tmpfile) - 20, tmpfile);
129 tmp_ptr = tmpfile + strlen(tmpfile);
130 if (tmp_ptr[-1] == '\\') tmp_ptr--;
131 sprintf(tmp_ptr, "\\scws-%08x.xdb", scws_crc32(buf));
132 }
133 #else
134 sprintf(tmpfile, "/tmp/scws-%08x.xdb", scws_crc32(buf));
135 #endif
136 if (!stat(tmpfile, &st2) && st2.st_mtime > st1.st_mtime)
137 {
138 xdb_t x;
139 if ((x = xdb_open(tmpfile, 'r')) != NULL)
140 {
141 xd = (xdict_t) malloc(sizeof(xdict_st));
142 memset(xd, 0, sizeof(xdict_st));
143 xd->ref = 1;
144
145 if (mode & SCWS_XDICT_MEM)
146 {
147 /* convert the xdb(disk) -> xtree(memory) */
148 if ((xt = xdb_to_xtree(x, NULL)) != NULL)
149 {
150 xdb_close(x);
151 xd->xdict = (void *) xt;
152 xd->xmode = SCWS_XDICT_MEM;
153 return xd;
154 }
155 }
156 xd->xmode = SCWS_XDICT_XDB;
157 xd->xdict = (void *) x;
158 return xd;
159 }
160 }
161
162 // create xtree
163 if ((xt = xtree_new(0, 0)) == NULL)
164 return NULL;
165 else
166 {
167 int cl, kl;
168 FILE *fp;
169 word_st word, *w;
170 char *key, *part, *last, *delim = " \t\r\n";
171
172 // re-build the xdb file from text file
173 if ((fp = fopen(buf, "r")) == NULL)
174 return NULL;
175
176 // parse every line
177 word.attr[2] = '\0';
178 while (fgets(buf, sizeof(buf) - 1, fp) != NULL)
179 {
180 // <word>[\t<tf>[\t<idf>[\t<attr>]]]
181 if (buf[0] == ';' || buf[0] == '#') continue;
182
183 key = _strtok_r(buf, delim, &last);
184 if (key == NULL) continue;
185 kl = strlen(key);
186
187 // init the word
188 do
189 {
190 word.tf = word.idf = 1.0;
191 word.flag = SCWS_WORD_FULL;
192 word.attr[0] = '@';
193 word.attr[1] = '\0';
194
195 if (!(part = _strtok_r(NULL, delim, &last))) break;
196 word.tf = (float) atof(part);
197
198 if (!(part = _strtok_r(NULL, delim, &last))) break;
199 word.idf = (float) atof(part);
200
201 if (part = _strtok_r(NULL, delim, &last))
202 {
203 word.attr[0] = part[0];
204 if (part[1]) word.attr[1] = part[1];
205 }
206 }
207 while (0);
208
209 // save into xtree
210 if ((w = xtree_nget(xt, key, kl, NULL)) == NULL)
211 {
212 w = (word_st *) pmalloc(xt->p, sizeof(word_st));
213 memcpy(w, &word, sizeof(word));
214 xtree_nput(xt, w, sizeof(word), key, kl);
215 }
216 else
217 {
218 w->tf = word.tf;
219 w->idf = word.idf;
220 w->flag |= word.flag;
221 strcpy(w->attr, word.attr);
222 }
223
224 // parse the part
225 cl = ml[(unsigned char) (key[0])];
226 while (1)
227 {
228 cl += ml[(unsigned char) (key[cl])];
229 if (cl >= kl) break;
230
231 if ((w = xtree_nget(xt, key, cl, NULL)) != NULL)
232 w->flag |= SCWS_WORD_PART;
233 else
234 {
235 w = (word_st *) pmalloc_z(xt->p, sizeof(word_st));
236 w->flag = SCWS_WORD_PART;
237 xtree_nput(xt, w, sizeof(word), key, cl);
238 }
239 }
240 }
241 fclose(fp);
242
243 // optimize the xtree & save to xdb
244 xtree_optimize(xt);
245 unlink(tmpfile);
246 xtree_to_xdb(xt, tmpfile);
247 chmod(tmpfile, 0777);
248
249 // return xtree
250 xd = (xdict_t) malloc(sizeof(xdict_st));
251 memset(xd, 0, sizeof(xdict_st));
252 xd->ref = 1;
253 xd->xdict = (void *) xt;
254 xd->xmode = SCWS_XDICT_MEM;
255 return xd;
256 }
257 }
258
259 /* setup & open the dict */
xdict_open(const char * fpath,int mode)260 xdict_t xdict_open(const char *fpath, int mode)
261 {
262 xdict_t xd;
263 xdb_t x;
264
265 if (!(x = xdb_open(fpath, 'r')))
266 return NULL;
267
268 xd = (xdict_t) malloc(sizeof(xdict_st));
269 memset(xd, 0, sizeof(xdict_st));
270 xd->ref = 1;
271 if (mode & SCWS_XDICT_MEM)
272 {
273 xtree_t xt;
274
275 /* convert the xdb(disk) -> xtree(memory) */
276 if ((xt = xdb_to_xtree(x, NULL)) != NULL)
277 {
278 xdb_close(x);
279 xd->xdict = (void *) xt;
280 xd->xmode = SCWS_XDICT_MEM;
281 return xd;
282 }
283 }
284
285 xd->xmode = SCWS_XDICT_XDB;
286 xd->xdict = (void *) x;
287 return xd;
288 }
289
290 /* add a dict */
xdict_add(xdict_t xd,const char * fpath,int mode,unsigned char * ml)291 xdict_t xdict_add(xdict_t xd, const char *fpath, int mode, unsigned char *ml)
292 {
293 xdict_t xx;
294
295 xx = (mode & SCWS_XDICT_TXT ? _xdict_open_txt(fpath, mode, ml) : xdict_open(fpath, mode));
296 if (xx != NULL)
297 {
298 xx->next = xd;
299 return xx;
300 }
301 return xd;
302 }
303
304 /* fork the dict */
xdict_fork(xdict_t xd)305 xdict_t xdict_fork(xdict_t xd)
306 {
307 xdict_t xx;
308 for (xx = xd; xx != NULL; xx = xx->next)
309 {
310 xx->ref++;
311 }
312 return xd;
313 }
314
315 /* close the dict */
xdict_close(xdict_t xd)316 void xdict_close(xdict_t xd)
317 {
318 xdict_t xx;
319
320 while ((xx = xd) != NULL)
321 {
322 xd = xx->next;
323 xx->ref--;
324 if (xx->ref == 0)
325 {
326 if (xx->xmode == SCWS_XDICT_MEM)
327 xtree_free((xtree_t) xx->xdict);
328 else
329 {
330 xdb_close((xdb_t) xx->xdict);
331 }
332 free(xx);
333 }
334 }
335 }
336
337 /* query the word */
338 #define _FLAG_BOTH(x) (((x)->flag & (SCWS_WORD_PART|SCWS_WORD_FULL)) == (SCWS_WORD_PART|SCWS_WORD_FULL))
339 #define _FLAG_FULL(x) ((x)->flag & SCWS_WORD_FULL)
340 #define _FLAG_PART(x) ((x)->flag & SCWS_WORD_PART)
341 #define _FLAG_MALLOC(x) ((x)->flag & SCWS_WORD_MALLOCED)
342
xdict_query(xdict_t xd,const char * key,int len)343 word_t xdict_query(xdict_t xd, const char *key, int len)
344 {
345 word_t value, value2;
346
347 value = value2 = NULL;
348 while (xd != NULL)
349 {
350 if (xd->xmode == SCWS_XDICT_MEM)
351 {
352 /* this is ThreadSafe, recommend. */
353 value = (word_t) xtree_nget((xtree_t) xd->xdict, key, len, NULL);
354 }
355 else
356 {
357 /* the value malloced in lib-XDB. free required */
358 value = (word_t) xdb_nget((xdb_t) xd->xdict, key, len, NULL);
359 if (value != NULL) value->flag |= SCWS_WORD_MALLOCED;
360 }
361 xd = xd->next;
362
363 // check value2
364 if (value != NULL)
365 {
366 if (value2 == NULL)
367 {
368 if (_FLAG_BOTH(value))
369 return value;
370 value2 = value;
371 }
372 else
373 {
374 if (_FLAG_FULL(value2) && _FLAG_PART(value))
375 {
376 value2->flag |= SCWS_WORD_PART;
377 if (_FLAG_MALLOC(value))
378 free(value);
379 return value2;
380 }
381 if (_FLAG_FULL(value) && _FLAG_PART(value2))
382 {
383 value->flag |= SCWS_WORD_PART;
384 if (_FLAG_MALLOC(value2))
385 free(value2);
386 return value;
387 }
388 if (_FLAG_MALLOC(value))
389 free(value);
390 }
391 }
392 }
393 return value2;
394 }
395