1 /*************************************************************************************************
2 * The q-gram database API of Tokyo Dystopia
3 * Copyright (C) 2007-2010 FAL Labs
4 * This file is part of Tokyo Dystopia.
5 * Tokyo Dystopia is free software; you can redistribute it and/or modify it under the terms of
6 * the GNU Lesser General Public License as published by the Free Software Foundation; either
7 * version 2.1 of the License or any later version. Tokyo Dystopia is distributed in the hope
8 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10 * License for more details.
11 * You should have received a copy of the GNU Lesser General Public License along with Tokyo
12 * Dystopia; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13 * Boston, MA 02111-1307 USA.
14 *************************************************************************************************/
15
16
17 #include "tcqdb.h"
18 #include "myconf.h"
19
20 #define QDBMAGICDATA "[q-gram]" // magic data for identification
21 #define QDBIOBUFSIZ 65536 // size of an I/O buffer
22 #define QDBMAXWORDLEN 1024 // maximum length of each search word
23 #define QDBOCRUNIT 1024 // unit number of occurrence allocation
24 #define QDBCCBNUM 1048573 // bucket number of the token cache
25 #define QDBCCDEFICSIZ (1024LL*1024*128) // default capacity of the token cache
26 #define QDBZMMINSIZ 131072 // minimum memory size to use nullified region
27 #define QDBDIDSBNUM 262139 // bucket number of the deleted ID set
28 #define QDBDTKNBNUM 262139 // bucket number of the deleted token map
29 #define QDBBITMAPNUM 524287 // number of elements of search bitmap
30 #define QDBDEFFWMMAX 2048 // default maximum number forward matching expansion
31 #define QDBHJBNUMCO 4 // coefficient of the bucket number for hash join
32
33 #define QDBDEFETNUM 1000000 // default expected token number
34 #define QDBLMEMB 256 // number of members in each leaf of the index
35 #define QDBNMEMB 512 // number of members in each node of the index
36 #define QDBAPOW 9 // alignment power of the index
37 #define QDBFPOW 11 // free block pool power of the index
38 #define QDBLSMAX 8192 // maximum size of each leaf of the index
39 #define QDBLCNUMW 64 // number of cached leaf nodes for writer
40 #define QDBLCNUMR 1024 // number of cached leaf nodes for reader
41 #define QDBNCNUM 1024 // number of cached non-leaf nodes
42
43 typedef struct { // type of structure for a record occurrence
44 uint64_t id; // ID number
45 int32_t off; // offset from the first token
46 uint16_t seq; // sequence number
47 uint16_t hash; // hash value for counting sort
48 } QDBOCR;
49
50
51 /* private function prototypes */
52 static bool tcqdblockmethod(TCQDB *qdb, bool wr);
53 static bool tcqdbunlockmethod(TCQDB *qdb);
54 static bool tcqdbopenimpl(TCQDB *qdb, const char *path, int omode);
55 static bool tcqdbcloseimpl(TCQDB *qdb);
56 static bool tcqdbputimpl(TCQDB *qdb, int64_t id, const char *text);
57 static bool tcqdboutimpl(TCQDB *qdb, int64_t id, const char *text);
58 static uint64_t *tcqdbsearchimpl(TCQDB *qdb, const char *word, int smode, int *np);
59 static int tccmptokens(const uint16_t **a, const uint16_t **b);
60 static int tccmpocrs(QDBOCR *a, QDBOCR *b);
61 static int tccmpuint64(const uint64_t *a, const uint64_t *b);
62
63
64
65 /*************************************************************************************************
66 * API
67 *************************************************************************************************/
68
69
70 /* String containing the version information. */
71 const char *tdversion = _TD_VERSION;
72
73
74 /* Get the message string corresponding to an error code. */
tcqdberrmsg(int ecode)75 const char *tcqdberrmsg(int ecode){
76 return tcbdberrmsg(ecode);
77 }
78
79
80 /* Create a q-gram database object. */
tcqdbnew(void)81 TCQDB *tcqdbnew(void){
82 TCQDB *qdb = tcmalloc(sizeof(*qdb));
83 qdb->mmtx = tcmalloc(sizeof(pthread_rwlock_t));
84 if(pthread_rwlock_init(qdb->mmtx, NULL) != 0) tcmyfatal("pthread_rwlock_init failed");
85 qdb->idx = tcbdbnew();
86 if(!tcbdbsetmutex(qdb->idx)) tcmyfatal("tcbdbsetmutex failed");
87 qdb->open = false;
88 qdb->cc = NULL;
89 qdb->icsiz = QDBCCDEFICSIZ;
90 qdb->lcnum = 0;
91 qdb->dtokens = NULL;
92 qdb->dids = NULL;
93 qdb->etnum = QDBDEFETNUM;
94 qdb->opts = 0;
95 qdb->fwmmax = QDBDEFFWMMAX;
96 qdb->synccb = NULL;
97 qdb->syncopq = NULL;
98 return qdb;
99 }
100
101
102 /* Delete a q-gram database object. */
tcqdbdel(TCQDB * qdb)103 void tcqdbdel(TCQDB *qdb){
104 assert(qdb);
105 if(qdb->open) tcqdbclose(qdb);
106 tcbdbdel(qdb->idx);
107 pthread_rwlock_destroy(qdb->mmtx);
108 tcfree(qdb->mmtx);
109 tcfree(qdb);
110 }
111
112
113 /* Get the last happened error code of a q-gram database object. */
tcqdbecode(TCQDB * qdb)114 int tcqdbecode(TCQDB *qdb){
115 assert(qdb);
116 return tcbdbecode(qdb->idx);
117 }
118
119
120 /* Set the tuning parameters of a q-gram database object. */
tcqdbtune(TCQDB * qdb,int64_t etnum,uint8_t opts)121 bool tcqdbtune(TCQDB *qdb, int64_t etnum, uint8_t opts){
122 assert(qdb);
123 if(!tcqdblockmethod(qdb, true)) return false;
124 if(qdb->open){
125 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
126 tcqdbunlockmethod(qdb);
127 return false;
128 }
129 qdb->etnum = (etnum > 0) ? etnum : QDBDEFETNUM;
130 qdb->opts = opts;
131 tcqdbunlockmethod(qdb);
132 return true;
133 }
134
135
136 /* Set the caching parameters of a q-gram database object. */
tcqdbsetcache(TCQDB * qdb,int64_t icsiz,int32_t lcnum)137 bool tcqdbsetcache(TCQDB *qdb, int64_t icsiz, int32_t lcnum){
138 assert(qdb);
139 if(!tcqdblockmethod(qdb, true)) return false;
140 if(qdb->open){
141 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
142 tcqdbunlockmethod(qdb);
143 return false;
144 }
145 qdb->icsiz = (icsiz > 0) ? icsiz : QDBCCDEFICSIZ;
146 qdb->lcnum = (lcnum > 0) ? lcnum : 0;
147 tcqdbunlockmethod(qdb);
148 return true;
149 }
150
151
152 /* Set the maximum number of forward matching expansion of a q-gram database object. */
tcqdbsetfwmmax(TCQDB * qdb,uint32_t fwmmax)153 bool tcqdbsetfwmmax(TCQDB *qdb, uint32_t fwmmax){
154 assert(qdb);
155 if(!tcqdblockmethod(qdb, true)) return false;
156 if(qdb->open){
157 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
158 tcqdbunlockmethod(qdb);
159 return false;
160 }
161 qdb->fwmmax = fwmmax;
162 tcqdbunlockmethod(qdb);
163 return true;
164 }
165
166
167 /* Open a q-gram database object. */
tcqdbopen(TCQDB * qdb,const char * path,int omode)168 bool tcqdbopen(TCQDB *qdb, const char *path, int omode){
169 assert(qdb && path);
170 if(!tcqdblockmethod(qdb, true)) return false;
171 if(qdb->open){
172 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
173 tcqdbunlockmethod(qdb);
174 return false;
175 }
176 bool rv = tcqdbopenimpl(qdb, path, omode);
177 tcqdbunlockmethod(qdb);
178 return rv;
179 }
180
181
182 /* Close a q-gram database object. */
tcqdbclose(TCQDB * qdb)183 bool tcqdbclose(TCQDB *qdb){
184 assert(qdb);
185 if(!tcqdblockmethod(qdb, true)) return false;
186 if(!qdb->open){
187 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
188 tcqdbunlockmethod(qdb);
189 return false;
190 }
191 bool rv = tcqdbcloseimpl(qdb);
192 tcqdbunlockmethod(qdb);
193 return rv;
194 }
195
196
197 /* Store a record into a q-gram database object. */
tcqdbput(TCQDB * qdb,int64_t id,const char * text)198 bool tcqdbput(TCQDB *qdb, int64_t id, const char *text){
199 assert(qdb && id > 0 && text);
200 if(!tcqdblockmethod(qdb, true)) return false;
201 if(!qdb->open || !qdb->cc){
202 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
203 tcqdbunlockmethod(qdb);
204 return false;
205 }
206 if(tcidsetcheck(qdb->dids, id) && !tcqdbmemsync(qdb, 0)){
207 tcqdbunlockmethod(qdb);
208 return false;
209 }
210 bool rv = tcqdbputimpl(qdb, id, text);
211 tcqdbunlockmethod(qdb);
212 return rv;
213 }
214
215
216 /* Remove a record of a q-gram database object. */
tcqdbout(TCQDB * qdb,int64_t id,const char * text)217 bool tcqdbout(TCQDB *qdb, int64_t id, const char *text){
218 assert(qdb && id > 0 && text);
219 if(!tcqdblockmethod(qdb, true)) return false;
220 if(!qdb->open || !qdb->cc){
221 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
222 tcqdbunlockmethod(qdb);
223 return false;
224 }
225 if(tcidsetcheck(qdb->dids, id)){
226 tcqdbunlockmethod(qdb);
227 return true;
228 }
229 if(tcmaprnum(qdb->cc) > 0 && !tcqdbmemsync(qdb, 0)){
230 tcqdbunlockmethod(qdb);
231 return false;
232 }
233 bool rv = tcqdboutimpl(qdb, id, text);
234 tcqdbunlockmethod(qdb);
235 return rv;
236 }
237
238
239 /* Search a q-gram database. */
tcqdbsearch(TCQDB * qdb,const char * word,int smode,int * np)240 uint64_t *tcqdbsearch(TCQDB *qdb, const char *word, int smode, int *np){
241 assert(qdb && word && np);
242 if(!tcqdblockmethod(qdb, false)) return NULL;
243 if(!qdb->open){
244 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
245 tcqdbunlockmethod(qdb);
246 return NULL;
247 }
248 if(qdb->cc && (tcmaprnum(qdb->cc) > 0 || tcmaprnum(qdb->dtokens) > 0)){
249 tcqdbunlockmethod(qdb);
250 if(!tcqdblockmethod(qdb, true)) return NULL;
251 if(!tcqdbmemsync(qdb, 0)){
252 tcqdbunlockmethod(qdb);
253 return NULL;
254 }
255 tcqdbunlockmethod(qdb);
256 if(!tcqdblockmethod(qdb, false)) return NULL;
257 }
258 uint64_t *rv = tcqdbsearchimpl(qdb, word, smode, np);
259 tcqdbunlockmethod(qdb);
260 return rv;
261 }
262
263
264 /* Synchronize updated contents of a q-gram database object with the file and the device. */
tcqdbsync(TCQDB * qdb)265 bool tcqdbsync(TCQDB *qdb){
266 assert(qdb);
267 if(!tcqdblockmethod(qdb, true)) return false;
268 if(!qdb->open || !qdb->cc){
269 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
270 tcqdbunlockmethod(qdb);
271 return false;
272 }
273 bool err = false;
274 if(!tcqdbmemsync(qdb, 2)) err = true;
275 tcqdbunlockmethod(qdb);
276 return !err;
277 }
278
279
280 /* Optimize the file of a q-gram database object. */
tcqdboptimize(TCQDB * qdb)281 bool tcqdboptimize(TCQDB *qdb){
282 assert(qdb);
283 if(!tcqdblockmethod(qdb, true)) return false;
284 if(!qdb->open || !qdb->cc){
285 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
286 tcqdbunlockmethod(qdb);
287 return false;
288 }
289 bool err = false;
290 if(!tcqdbmemsync(qdb, 1)) err = true;
291 if(!tcbdboptimize(qdb->idx, 0, 0, 0, -1, -1, UINT8_MAX)) err = true;
292 tcqdbunlockmethod(qdb);
293 return !err;
294 }
295
296
297 /* Remove all records of a q-gram database object. */
tcqdbvanish(TCQDB * qdb)298 bool tcqdbvanish(TCQDB *qdb){
299 assert(qdb);
300 if(!tcqdblockmethod(qdb, true)) return false;
301 if(!qdb->open || !qdb->cc){
302 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
303 tcqdbunlockmethod(qdb);
304 return false;
305 }
306 bool err = false;
307 tcmapclear(qdb->cc);
308 tcmapclear(qdb->dtokens);
309 if(!tcqdbmemsync(qdb, 1)) err = true;
310 if(!tcbdbvanish(qdb->idx)) err = true;
311 tcqdbunlockmethod(qdb);
312 return !err;
313 }
314
315
316 /* Copy the database file of a q-gram database object. */
tcqdbcopy(TCQDB * qdb,const char * path)317 bool tcqdbcopy(TCQDB *qdb, const char *path){
318 assert(qdb && path);
319 if(!tcqdblockmethod(qdb, false)) return false;
320 if(!qdb->open || !qdb->cc){
321 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
322 tcqdbunlockmethod(qdb);
323 return false;
324 }
325 bool err = false;
326 if(!tcqdbmemsync(qdb, 1)) err = true;
327 if(!tcbdbcopy(qdb->idx, path)) err = true;
328 tcqdbunlockmethod(qdb);
329 return !err;
330 }
331
332
333 /* Get the file path of a q-gram database object. */
tcqdbpath(TCQDB * qdb)334 const char *tcqdbpath(TCQDB *qdb){
335 assert(qdb);
336 return tcbdbpath(qdb->idx);
337 }
338
339
340 /* Get the number of tokens of a q-gram database object. */
tcqdbtnum(TCQDB * qdb)341 uint64_t tcqdbtnum(TCQDB *qdb){
342 assert(qdb);
343 return tcbdbrnum(qdb->idx);
344 }
345
346
347 /* Get the size of the database file of a q-gram database object. */
tcqdbfsiz(TCQDB * qdb)348 uint64_t tcqdbfsiz(TCQDB *qdb){
349 assert(qdb);
350 return tcbdbfsiz(qdb->idx);
351 }
352
353
354
355 /*************************************************************************************************
356 * features for experts
357 *************************************************************************************************/
358
359
360 /* Set the file descriptor for debugging output. */
tcqdbsetdbgfd(TCQDB * qdb,int fd)361 void tcqdbsetdbgfd(TCQDB *qdb, int fd){
362 assert(qdb && fd >= 0);
363 tcbdbsetdbgfd(qdb->idx, fd);
364 }
365
366
367 /* Get the file descriptor for debugging output. */
tcqdbdbgfd(TCQDB * qdb)368 int tcqdbdbgfd(TCQDB *qdb){
369 assert(qdb);
370 return tcbdbdbgfd(qdb->idx);
371 }
372
373
374 /* Synchronize updating contents on memory of a q-gram database object. */
tcqdbmemsync(TCQDB * qdb,int level)375 bool tcqdbmemsync(TCQDB *qdb, int level){
376 assert(qdb);
377 if(!qdb->open || !qdb->cc){
378 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
379 return false;
380 }
381 bool err = false;
382 bool (*synccb)(int, int, const char *, void *) = qdb->synccb;
383 void *syncopq = qdb->syncopq;
384 TCBDB *idx = qdb->idx;
385 TCMAP *cc = qdb->cc;
386 if(synccb && !synccb(0, 0, "started", syncopq)){
387 tcbdbsetecode(qdb->idx, TCEMISC, __FILE__, __LINE__, __func__);
388 return false;
389 }
390 if(tcmaprnum(cc) > 0){
391 if(synccb && !synccb(0, 0, "getting tokens", syncopq)){
392 tcbdbsetecode(qdb->idx, TCEMISC, __FILE__, __LINE__, __func__);
393 return false;
394 }
395 int kn;
396 const char **keys = tcmapkeys2(cc, &kn);
397 if(synccb && !synccb(kn, 0, "sorting tokens", syncopq)){
398 tcbdbsetecode(qdb->idx, TCEMISC, __FILE__, __LINE__, __func__);
399 tcfree(keys);
400 return false;
401 }
402 qsort(keys, kn, sizeof(*keys), (int(*)(const void *, const void *))tccmptokens);
403 char token[TDNUMBUFSIZ*2];
404 for(int i = 0; i < kn; i++){
405 if(synccb && !synccb(kn, i + 1, "storing tokens", syncopq)){
406 tcbdbsetecode(qdb->idx, TCEMISC, __FILE__, __LINE__, __func__);
407 tcfree(keys);
408 return false;
409 }
410 const uint16_t *ary = (uint16_t *)keys[i];
411 tcstrucstoutf(ary, 2, token);
412 int tlen = strlen(token);
413 int vsiz;
414 const char *vbuf = tcmapget(cc, ary, 2 * sizeof(*ary), &vsiz);
415 if(!tcbdbputcat(idx, token, tlen, vbuf, vsiz)) err = true;
416 }
417 tcfree(keys);
418 tcmapclear(cc);
419 }
420 TCMAP *dtokens = qdb->dtokens;
421 TCIDSET *dids = qdb->dids;
422 if(tcmaprnum(dtokens) > 0){
423 if(synccb && !synccb(0, 0, "getting deleted tokens", syncopq)){
424 tcbdbsetecode(qdb->idx, TCEMISC, __FILE__, __LINE__, __func__);
425 return false;
426 }
427 int kn;
428 const char **keys = tcmapkeys2(dtokens, &kn);
429 if(synccb && !synccb(kn, 0, "sorting deleted tokens", syncopq)){
430 tcbdbsetecode(qdb->idx, TCEMISC, __FILE__, __LINE__, __func__);
431 tcfree(keys);
432 return false;
433 }
434 qsort(keys, kn, sizeof(*keys), (int(*)(const void *, const void *))tccmptokens);
435 char token[TDNUMBUFSIZ*2];
436 for(int i = 0; i < kn; i++){
437 if(synccb && !synccb(kn, i + 1, "storing deleted tokens", syncopq)){
438 tcbdbsetecode(qdb->idx, TCEMISC, __FILE__, __LINE__, __func__);
439 tcfree(keys);
440 return false;
441 }
442 const uint16_t *ary = (uint16_t *)keys[i];
443 tcstrucstoutf(ary, 2, token);
444 int tlen = strlen(token);
445 int vsiz;
446 const char *vbuf = tcbdbget3(idx, token, tlen, &vsiz);
447 if(!vbuf) continue;
448 char *nbuf = tcmalloc(vsiz + 1);
449 char *wp = nbuf;
450 const char *pv;
451 while(vsiz > 1){
452 pv = vbuf;
453 int step;
454 uint64_t id;
455 TDREADVNUMBUF64(vbuf, id, step);
456 vbuf += step;
457 vsiz -= step;
458 if(vsiz > 0){
459 uint32_t off;
460 TDREADVNUMBUF(vbuf, off, step);
461 vbuf += step;
462 vsiz -= step;
463 if(!tcidsetcheck(dids, id)){
464 int len = vbuf - pv;
465 memcpy(wp, pv, len);
466 wp += len;
467 }
468 }
469 }
470 int nsiz = wp - nbuf;
471 if(nsiz > 0){
472 if(!tcbdbput(idx, token, tlen, nbuf, nsiz)) err = true;
473 } else {
474 if(!tcbdbout(idx, token, tlen)) err = true;
475 }
476 tcfree(nbuf);
477 }
478 tcfree(keys);
479 tcmapclear(dtokens);
480 tcidsetclear(dids);
481 }
482 if(level > 0){
483 if(synccb && !synccb(0, 0, "synchronizing database", syncopq)){
484 tcbdbsetecode(qdb->idx, TCEMISC, __FILE__, __LINE__, __func__);
485 return false;
486 }
487 if(!tcbdbmemsync(idx, level > 1)) err = true;
488 }
489 if(synccb && !synccb(0, 0, "finished", syncopq)){
490 tcbdbsetecode(qdb->idx, TCEMISC, __FILE__, __LINE__, __func__);
491 return false;
492 }
493 return !err;
494 }
495
496
497 /* Clear the cache of a q-gram database object. */
tcqdbcacheclear(TCQDB * qdb)498 bool tcqdbcacheclear(TCQDB *qdb){
499 assert(qdb);
500 if(!qdb->open){
501 tcbdbsetecode(qdb->idx, TCEINVALID, __FILE__, __LINE__, __func__);
502 return false;
503 }
504 return tcbdbcacheclear(qdb->idx);
505 }
506
507
508 /* Get the inode number of the database file of a q-gram database object. */
tcqdbinode(TCQDB * qdb)509 uint64_t tcqdbinode(TCQDB *qdb){
510 assert(qdb);
511 return tcbdbinode(qdb->idx);
512 }
513
514
515 /* Get the modification time of the database file of a q-gram database object. */
tcqdbmtime(TCQDB * qdb)516 time_t tcqdbmtime(TCQDB *qdb){
517 assert(qdb);
518 return tcbdbmtime(qdb->idx);
519 }
520
521
522 /* Get the options of a q-gram database object. */
tcqdbopts(TCQDB * qdb)523 uint8_t tcqdbopts(TCQDB *qdb){
524 assert(qdb);
525 return tcbdbopts(qdb->idx);
526 }
527
528
529 /* Get the maximum number of forward matching expansion of a q-gram database object. */
tcqdbfwmmax(TCQDB * qdb)530 uint32_t tcqdbfwmmax(TCQDB *qdb){
531 assert(qdb);
532 return qdb->fwmmax;
533 }
534
535
536 /* Get the number of records in the cache of a q-gram database object. */
tcqdbcnum(TCQDB * qdb)537 uint32_t tcqdbcnum(TCQDB *qdb){
538 assert(qdb);
539 if(!qdb->cc) return 0;
540 return tcmaprnum(qdb->cc);
541 }
542
543
544 /* Set the callback function for sync progression of a q-gram database object. */
tcqdbsetsynccb(TCQDB * qdb,bool (* cb)(int,int,const char *,void *),void * opq)545 void tcqdbsetsynccb(TCQDB *qdb, bool (*cb)(int, int, const char *, void *), void *opq){
546 assert(qdb);
547 qdb->synccb = cb;
548 qdb->syncopq = opq;
549 }
550
551
552 /* Merge multiple result sets by union. */
tcqdbresunion(QDBRSET * rsets,int rsnum,int * np)553 uint64_t *tcqdbresunion(QDBRSET *rsets, int rsnum, int *np){
554 assert(rsets && rsnum >= 0 && np);
555 if(rsnum == 0){
556 *np = 0;
557 return tcmalloc(1);
558 }
559 if(rsnum == 1){
560 if(!rsets[0].ids){
561 *np = 0;
562 return tcmalloc(1);
563 }
564 *np = rsets[0].num;
565 return tcmemdup(rsets[0].ids, rsets[0].num * sizeof(rsets[0].ids[0]));
566 }
567 int sum = 0;
568 for(int i = 0; i < rsnum; i++){
569 if(!rsets[i].ids) continue;
570 sum += rsets[i].num;
571 }
572 uint64_t *res = tcmalloc(sum * sizeof(*res) + 1);
573 int rnum = 0;
574 for(int i = 0; i < rsnum; i++){
575 if(!rsets[i].ids) continue;
576 const uint64_t *ids = rsets[i].ids;
577 int num = rsets[i].num;
578 for(int j = 0; j < num; j++){
579 res[rnum++] = ids[j];
580 }
581 }
582 qsort(res, rnum, sizeof(*res), (int (*)(const void *, const void *))tccmpuint64);
583 int nnum = 0;
584 uint64_t lid = UINT64_MAX;
585 for(int i = 0; i < rnum; i++){
586 uint64_t id = res[i];
587 if(lid == id) continue;
588 res[nnum++] = id;
589 lid = id;
590 }
591 *np = nnum;
592 return res;
593 }
594
595
596 /* Merge multiple result sets by intersection. */
tcqdbresisect(QDBRSET * rsets,int rsnum,int * np)597 uint64_t *tcqdbresisect(QDBRSET *rsets, int rsnum, int *np){
598 assert(rsets && rsnum >= 0 && np);
599 for(int i = 0; i < rsnum; i++){
600 if(!rsets[i].ids){
601 *np = 0;
602 return tcmalloc(1);
603 }
604 }
605 if(rsnum == 0){
606 *np = 0;
607 return tcmalloc(1);
608 }
609 if(rsnum == 1){
610 *np = rsets[0].num;
611 return tcmemdup(rsets[0].ids, rsets[0].num * sizeof(rsets[0].ids[0]));
612 }
613 if(rsnum == 2){
614 uint64_t *small, *large;
615 int snum, lnum;
616 if(rsets[0].num < rsets[1].num){
617 small = rsets[0].ids;
618 snum = rsets[0].num;
619 large = rsets[1].ids;
620 lnum = rsets[1].num;
621 } else {
622 small = rsets[1].ids;
623 snum = rsets[1].num;
624 large = rsets[0].ids;
625 lnum = rsets[0].num;
626 }
627 uint64_t *res = tcmalloc(snum * sizeof(*res) + 1);
628 int rnum = 0;
629 TCIDSET *idset = tcidsetnew(snum * QDBHJBNUMCO + 1);
630 for(int i = 0; i < snum; i++){
631 tcidsetmark(idset, small[i]);
632 }
633 for(int i = 0; i < lnum; i++){
634 if(tcidsetcheck(idset, large[i])){
635 res[rnum++] = large[i];
636 if(rnum >= snum) break;
637 }
638 }
639 tcidsetdel(idset);
640 *np = rnum;
641 return res;
642 }
643 int sum = 0;
644 for(int i = 0; i < rsnum; i++){
645 sum += rsets[i].num;
646 }
647 uint64_t *res = tcmalloc(sum * sizeof(*res) + 1);
648 int rnum = 0;
649 for(int i = 0; i < rsnum; i++){
650 const uint64_t *ids = rsets[i].ids;
651 int num = rsets[i].num;
652 for(int j = 0; j < num; j++){
653 res[rnum++] = ids[j];
654 }
655 }
656 qsort(res, rnum, sizeof(*res), (int (*)(const void *, const void *))tccmpuint64);
657 int nnum = 0;
658 uint64_t lid = UINT64_MAX;
659 int dnum = 0;
660 for(int i = 0; i < rnum; i++){
661 uint64_t id = res[i];
662 if(lid == id){
663 dnum++;
664 if(dnum == rsnum) res[nnum++] = id;
665 } else {
666 dnum = 1;
667 }
668 lid = id;
669 }
670 *np = nnum;
671 return res;
672 }
673
674
675 /* Merge multiple result sets by difference. */
tcqdbresdiff(QDBRSET * rsets,int rsnum,int * np)676 uint64_t *tcqdbresdiff(QDBRSET *rsets, int rsnum, int *np){
677 assert(rsets && rsnum >= 0 && np);
678 if(rsnum == 0 || !rsets[0].ids){
679 *np = 0;
680 return tcmalloc(1);
681 }
682 if(rsnum == 1){
683 *np = rsets[0].num;
684 return tcmemdup(rsets[0].ids, rsets[0].num * sizeof(rsets[0].ids[0]));
685 }
686 int sum = 0;
687 for(int i = 1; i < rsnum; i++){
688 if(!rsets[i].ids) continue;
689 sum += rsets[i].num;
690 }
691 TCIDSET *idset = tcidsetnew(sum * QDBHJBNUMCO + 1);
692 for(int i = 1; i < rsnum; i++){
693 const uint64_t *ids = rsets[i].ids;
694 if(!ids) continue;
695 int num = rsets[i].num;
696 for(int j = 0; j < num; j++){
697 tcidsetmark(idset, ids[j]);
698 }
699 }
700 uint64_t *res = tcmalloc(rsets[0].num * sizeof(*res) + 1);
701 int rnum = 0;
702 const uint64_t *ids = rsets[0].ids;
703 int num = rsets[0].num;
704 for(int i = 0; i < num; i++){
705 if(!tcidsetcheck(idset, ids[i])) res[rnum++] = ids[i];
706 }
707 tcidsetdel(idset);
708 *np = rnum;
709 return res;
710 }
711
712
713 /* Normalize a text. */
tctextnormalize(char * text,int opts)714 void tctextnormalize(char *text, int opts){
715 assert(text);
716 bool lowmode = opts & TCTNLOWER;
717 bool nacmode = opts & TCTNNOACC;
718 bool spcmode = opts & TCTNSPACE;
719 int len = strlen(text);
720 uint16_t stack[QDBIOBUFSIZ];
721 uint16_t *ary = (len < QDBIOBUFSIZ) ? stack : tcmalloc(sizeof(*ary) * (len + 1));
722 int anum;
723 tcstrutftoucs(text, ary, &anum);
724 ary[anum] = 0x0000;
725 int nnum = 0;
726 for(int i = 0; i < anum; i++){
727 int c = ary[i];
728 int high = c >> 8;
729 if(high == 0x00){
730 if(c < 0x0020 || c == 0x007f){
731 // control characters
732 if(spcmode){
733 ary[nnum++] = 0x0020;
734 } else if(c == 0x0009 || c == 0x000a || c == 0x000d){
735 ary[nnum++] = c;
736 } else {
737 ary[nnum++] = 0x0020;
738 }
739 } else if(c == 0x00a0){
740 // no-break space
741 ary[nnum++] = 0x0020;
742 } else {
743 // otherwise
744 if(lowmode){
745 if(c < 0x007f){
746 if(c >= 0x0041 && c <= 0x005a) c += 0x20;
747 } else if(c >= 0x00c0 && c <= 0x00de && c != 0x00d7){
748 c += 0x20;
749 }
750 }
751 if(nacmode){
752 if(c >= 0x00c0 && c <= 0x00c5){
753 c = 'A';
754 } else if(c == 0x00c7){
755 c = 'C';
756 } if(c >= 0x00c7 && c <= 0x00cb){
757 c = 'E';
758 } if(c >= 0x00cc && c <= 0x00cf){
759 c = 'I';
760 } else if(c == 0x00d0){
761 c = 'D';
762 } else if(c == 0x00d1){
763 c = 'N';
764 } if((c >= 0x00d2 && c <= 0x00d6) || c == 0x00d8){
765 c = 'O';
766 } if(c >= 0x00d9 && c <= 0x00dc){
767 c = 'U';
768 } if(c == 0x00dd || c == 0x00de){
769 c = 'Y';
770 } else if(c == 0x00df){
771 c = 's';
772 } else if(c >= 0x00e0 && c <= 0x00e5){
773 c = 'a';
774 } else if(c == 0x00e7){
775 c = 'c';
776 } if(c >= 0x00e7 && c <= 0x00eb){
777 c = 'e';
778 } if(c >= 0x00ec && c <= 0x00ef){
779 c = 'i';
780 } else if(c == 0x00f0){
781 c = 'd';
782 } else if(c == 0x00f1){
783 c = 'n';
784 } if((c >= 0x00f2 && c <= 0x00f6) || c == 0x00f8){
785 c = 'o';
786 } if(c >= 0x00f9 && c <= 0x00fc){
787 c = 'u';
788 } if(c >= 0x00fd && c <= 0x00ff){
789 c = 'y';
790 }
791 }
792 ary[nnum++] = c;
793 }
794 } else if(high == 0x01){
795 // latin-1 extended
796 if(lowmode){
797 if(c <= 0x0137){
798 if((c & 1) == 0) c++;
799 } else if(c == 0x0138){
800 c += 0;
801 } else if(c <= 0x0148){
802 if((c & 1) == 1) c++;
803 } else if(c == 0x0149){
804 c += 0;
805 } else if(c <= 0x0177){
806 if((c & 1) == 0) c++;
807 } else if(c == 0x0178){
808 c = 0x00ff;
809 } else if(c <= 0x017e){
810 if((c & 1) == 1) c++;
811 } else if(c == 0x017f){
812 c += 0;
813 }
814 }
815 if(nacmode){
816 if(c == 0x00ff){
817 c = 'y';
818 } else if(c <= 0x0105){
819 c = ((c & 1) == 0) ? 'A' : 'a';
820 } else if(c <= 0x010d){
821 c = ((c & 1) == 0) ? 'C' : 'c';
822 } else if(c <= 0x0111){
823 c = ((c & 1) == 0) ? 'D' : 'd';
824 } else if(c <= 0x011b){
825 c = ((c & 1) == 0) ? 'E' : 'e';
826 } else if(c <= 0x0123){
827 c = ((c & 1) == 0) ? 'G' : 'g';
828 } else if(c <= 0x0127){
829 c = ((c & 1) == 0) ? 'H' : 'h';
830 } else if(c <= 0x0131){
831 c = ((c & 1) == 0) ? 'I' : 'i';
832 } else if(c == 0x0134){
833 c = 'J';
834 } else if(c == 0x0135){
835 c = 'j';
836 } else if(c == 0x0136){
837 c = 'K';
838 } else if(c == 0x0137){
839 c = 'k';
840 } else if(c == 0x0138){
841 c = 'k';
842 } else if(c >= 0x0139 && c <= 0x0142){
843 c = ((c & 1) == 1) ? 'L' : 'l';
844 } else if(c >= 0x0143 && c <= 0x0148){
845 c = ((c & 1) == 1) ? 'N' : 'n';
846 } else if(c >= 0x0149 && c <= 0x014b){
847 c = ((c & 1) == 0) ? 'N' : 'n';
848 } else if(c >= 0x014c && c <= 0x0151){
849 c = ((c & 1) == 0) ? 'O' : 'o';
850 } else if(c >= 0x0154 && c <= 0x0159){
851 c = ((c & 1) == 0) ? 'R' : 'r';
852 } else if(c >= 0x015a && c <= 0x0161){
853 c = ((c & 1) == 0) ? 'S' : 's';
854 } else if(c >= 0x0162 && c <= 0x0167){
855 c = ((c & 1) == 0) ? 'T' : 't';
856 } else if(c >= 0x0168 && c <= 0x0173){
857 c = ((c & 1) == 0) ? 'U' : 'u';
858 } else if(c == 0x0174){
859 c = 'W';
860 } else if(c == 0x0175){
861 c = 'w';
862 } else if(c == 0x0176){
863 c = 'Y';
864 } else if(c == 0x0177){
865 c = 'y';
866 } else if(c == 0x0178){
867 c = 'Y';
868 } else if(c >= 0x0179 && c <= 0x017e){
869 c = ((c & 1) == 1) ? 'Z' : 'z';
870 } else if(c == 0x017f){
871 c = 's';
872 }
873 }
874 ary[nnum++] = c;
875 } else if(high == 0x03){
876 // greek
877 if(lowmode){
878 if(c >= 0x0391 && c <= 0x03a9){
879 c += 0x20;
880 } else if(c >= 0x03d8 && c <= 0x03ef){
881 if((c & 1) == 0) c++;
882 } else if(c == 0x0374 || c == 0x03f7 || c == 0x03fa){
883 c++;
884 }
885 }
886 ary[nnum++] = c;
887 } else if(high == 0x04){
888 // cyrillic
889 if(lowmode){
890 if(c <= 0x040f){
891 c += 0x50;
892 } else if(c <= 0x042f){
893 c += 0x20;
894 } else if(c >= 0x0460 && c <= 0x0481){
895 if((c & 1) == 0) c++;
896 } else if(c >= 0x048a && c <= 0x04bf){
897 if((c & 1) == 0) c++;
898 } else if(c == 0x04c0){
899 c = 0x04cf;
900 } else if(c >= 0x04c1 && c <= 0x04ce){
901 if((c & 1) == 1) c++;
902 } else if(c >= 0x04d0){
903 if((c & 1) == 0) c++;
904 }
905 }
906 ary[nnum++] = c;
907 } else if(high == 0x20){
908 if(c == 0x2002){
909 // en space
910 ary[nnum++] = 0x0020;
911 } else if(c == 0x2003){
912 // em space
913 ary[nnum++] = 0x0020;
914 } else if(c == 0x2009){
915 // thin space
916 ary[nnum++] = 0x0020;
917 } else if(c == 0x2010){
918 // hyphen
919 ary[nnum++] = 0x002d;
920 } else if(c == 0x2015){
921 // fullwidth horizontal line
922 ary[nnum++] = 0x002d;
923 } else if(c == 0x2019){
924 // apostrophe
925 ary[nnum++] = 0x0027;
926 } else if(c == 0x2033){
927 // double quotes
928 ary[nnum++] = 0x0022;
929 } else {
930 // (otherwise)
931 ary[nnum++] = c;
932 }
933 } else if(high == 0x22){
934 if(c == 0x2212){
935 // minus sign
936 ary[nnum++] = 0x002d;
937 } else {
938 // (otherwise)
939 ary[nnum++] = c;
940 }
941 } else if(high == 0x30){
942 if(c == 0x3000){
943 // fullwidth space
944 if(spcmode){
945 ary[nnum++] = 0x0020;
946 } else {
947 ary[nnum++] = c;
948 }
949 } else {
950 // (otherwise)
951 ary[nnum++] = c;
952 }
953 } else if(high == 0xff){
954 if(c == 0xff01){
955 // fullwidth exclamation
956 ary[nnum++] = 0x0021;
957 } else if(c == 0xff03){
958 // fullwidth igeta
959 ary[nnum++] = 0x0023;
960 } else if(c == 0xff04){
961 // fullwidth dollar
962 ary[nnum++] = 0x0024;
963 } else if(c == 0xff05){
964 // fullwidth parcent
965 ary[nnum++] = 0x0025;
966 } else if(c == 0xff06){
967 // fullwidth ampersand
968 ary[nnum++] = 0x0026;
969 } else if(c == 0xff0a){
970 // fullwidth asterisk
971 ary[nnum++] = 0x002a;
972 } else if(c == 0xff0b){
973 // fullwidth plus
974 ary[nnum++] = 0x002b;
975 } else if(c == 0xff0c){
976 // fullwidth comma
977 ary[nnum++] = 0x002c;
978 } else if(c == 0xff0e){
979 // fullwidth period
980 ary[nnum++] = 0x002e;
981 } else if(c == 0xff0f){
982 // fullwidth slash
983 ary[nnum++] = 0x002f;
984 } else if(c == 0xff1a){
985 // fullwidth colon
986 ary[nnum++] = 0x003a;
987 } else if(c == 0xff1b){
988 // fullwidth semicolon
989 ary[nnum++] = 0x003b;
990 } else if(c == 0xff1d){
991 // fullwidth equal
992 ary[nnum++] = 0x003d;
993 } else if(c == 0xff1f){
994 // fullwidth question
995 ary[nnum++] = 0x003f;
996 } else if(c == 0xff20){
997 // fullwidth atmark
998 ary[nnum++] = 0x0040;
999 } else if(c == 0xff3c){
1000 // fullwidth backslash
1001 ary[nnum++] = 0x005c;
1002 } else if(c == 0xff3e){
1003 // fullwidth circumflex
1004 ary[nnum++] = 0x005e;
1005 } else if(c == 0xff3f){
1006 // fullwidth underscore
1007 ary[nnum++] = 0x005f;
1008 } else if(c == 0xff5c){
1009 // fullwidth vertical line
1010 ary[nnum++] = 0x007c;
1011 } else if(c >= 0xff21 && c <= 0xff3a){
1012 // fullwidth alphabets
1013 if(lowmode){
1014 c -= 0xfee0;
1015 if(c >= 0x0041 && c <= 0x005a) c += 0x20;
1016 ary[nnum++] = c;
1017 } else {
1018 ary[nnum++] = c - 0xfee0;
1019 }
1020 } else if(c >= 0xff41 && c <= 0xff5a){
1021 // fullwidth small alphabets
1022 ary[nnum++] = c - 0xfee0;
1023 } else if(c >= 0xff10 && c <= 0xff19){
1024 // fullwidth numbers
1025 ary[nnum++] = c - 0xfee0;
1026 } else if(c == 0xff61){
1027 // halfwidth full stop
1028 ary[nnum++] = 0x3002;
1029 } else if(c == 0xff62){
1030 // halfwidth left corner
1031 ary[nnum++] = 0x300c;
1032 } else if(c == 0xff63){
1033 // halfwidth right corner
1034 ary[nnum++] = 0x300d;
1035 } else if(c == 0xff64){
1036 // halfwidth comma
1037 ary[nnum++] = 0x3001;
1038 } else if(c == 0xff65){
1039 // halfwidth middle dot
1040 ary[nnum++] = 0x30fb;
1041 } else if(c == 0xff66){
1042 // halfwidth wo
1043 ary[nnum++] = 0x30f2;
1044 } else if(c >= 0xff67 && c <= 0xff6b){
1045 // halfwidth small a-o
1046 ary[nnum++] = (c - 0xff67) * 2 + 0x30a1;
1047 } else if(c >= 0xff6c && c <= 0xff6e){
1048 // halfwidth small ya-yo
1049 ary[nnum++] = (c - 0xff6c) * 2 + 0x30e3;
1050 } else if(c == 0xff6f){
1051 // halfwidth small tu
1052 ary[nnum++] = 0x30c3;
1053 } else if(c == 0xff70){
1054 // halfwidth prolonged mark
1055 ary[nnum++] = 0x30fc;
1056 } else if(c >= 0xff71 && c <= 0xff75){
1057 // halfwidth a-o
1058 ary[nnum] = (c - 0xff71) * 2 + 0x30a2;
1059 if(c == 0xff73 && ary[i+1] == 0xff9e){
1060 ary[nnum] = 0x30f4;
1061 i++;
1062 }
1063 nnum++;
1064 } else if(c >= 0xff76 && c <= 0xff7a){
1065 // halfwidth ka-ko
1066 ary[nnum] = (c - 0xff76) * 2 + 0x30ab;
1067 if(ary[i+1] == 0xff9e){
1068 ary[nnum]++;
1069 i++;
1070 }
1071 nnum++;
1072 } else if(c >= 0xff7b && c <= 0xff7f){
1073 // halfwidth sa-so
1074 ary[nnum] = (c - 0xff7b) * 2 + 0x30b5;
1075 if(ary[i+1] == 0xff9e){
1076 ary[nnum]++;
1077 i++;
1078 }
1079 nnum++;
1080 } else if(c >= 0xff80 && c <= 0xff84){
1081 // halfwidth ta-to
1082 ary[nnum] = (c - 0xff80) * 2 + 0x30bf + (c >= 0xff82 ? 1 : 0);
1083 if(ary[i+1] == 0xff9e){
1084 ary[nnum]++;
1085 i++;
1086 }
1087 nnum++;
1088 } else if(c >= 0xff85 && c <= 0xff89){
1089 // halfwidth na-no
1090 ary[nnum++] = c - 0xcebb;
1091 } else if(c >= 0xff8a && c <= 0xff8e){
1092 // halfwidth ha-ho
1093 ary[nnum] = (c - 0xff8a) * 3 + 0x30cf;
1094 if(ary[i+1] == 0xff9e){
1095 ary[nnum]++;
1096 i++;
1097 } else if(ary[i+1] == 0xff9f){
1098 ary[nnum] += 2;
1099 i++;
1100 }
1101 nnum++;
1102 } else if(c >= 0xff8f && c <= 0xff93){
1103 // halfwidth ma-mo
1104 ary[nnum++] = c - 0xceb1;
1105 } else if(c >= 0xff94 && c <= 0xff96){
1106 // halfwidth ya-yo
1107 ary[nnum++] = (c - 0xff94) * 2 + 0x30e4;
1108 } else if(c >= 0xff97 && c <= 0xff9b){
1109 // halfwidth ra-ro
1110 ary[nnum++] = c - 0xceae;
1111 } else if(c == 0xff9c){
1112 // halfwidth wa
1113 ary[nnum++] = 0x30ef;
1114 } else if(c == 0xff9d){
1115 // halfwidth nn
1116 ary[nnum++] = 0x30f3;
1117 } else {
1118 // otherwise
1119 ary[nnum++] = c;
1120 }
1121 } else {
1122 // otherwise
1123 ary[nnum++] = c;
1124 }
1125 }
1126 tcstrucstoutf(ary, nnum, text);
1127 if(ary != stack) tcfree(ary);
1128 if(spcmode) tcstrsqzspc(text);
1129 }
1130
1131
1132 /* Create an ID set object. */
tcidsetnew(uint32_t bnum)1133 TCIDSET *tcidsetnew(uint32_t bnum){
1134 if(bnum < 1) bnum = 1;
1135 TCIDSET *idset = tcmalloc(sizeof(*idset));
1136 uint64_t *buckets;
1137 if(bnum >= QDBZMMINSIZ / sizeof(*buckets)){
1138 buckets = tczeromap(bnum * sizeof(*buckets));
1139 } else {
1140 buckets = tccalloc(bnum, sizeof(*buckets));
1141 }
1142 idset->buckets = buckets;
1143 idset->bnum = bnum;
1144 idset->trails = tcmapnew2((bnum >> 2) + 1);
1145 return idset;
1146 }
1147
1148
1149 /* Delete an ID set object. */
tcidsetdel(TCIDSET * idset)1150 void tcidsetdel(TCIDSET *idset){
1151 assert(idset);
1152 tcmapdel(idset->trails);
1153 if(idset->bnum >= QDBZMMINSIZ / sizeof(idset->buckets[0])){
1154 tczerounmap(idset->buckets);
1155 } else {
1156 tcfree(idset->buckets);
1157 }
1158 tcfree(idset);
1159 }
1160
1161
1162 /* Mark an ID of an ID set object. */
tcidsetmark(TCIDSET * idset,int64_t id)1163 void tcidsetmark(TCIDSET *idset, int64_t id){
1164 assert(idset && id > 0);
1165 uint32_t bidx = id % idset->bnum;
1166 uint64_t rec = idset->buckets[bidx];
1167 if(rec == 0){
1168 idset->buckets[bidx] = id;
1169 } else {
1170 if((rec & INT64_MAX) == id) return;
1171 idset->buckets[bidx] = rec | INT64_MIN;
1172 tcmapputkeep(idset->trails, &id, sizeof(id), "", 0);
1173 }
1174 }
1175
1176
1177 /* Check an ID of an ID set object. */
tcidsetcheck(TCIDSET * idset,int64_t id)1178 bool tcidsetcheck(TCIDSET *idset, int64_t id){
1179 assert(idset && id >= 1);
1180 uint32_t bidx = id % idset->bnum;
1181 uint64_t rec = idset->buckets[bidx];
1182 if(rec == 0) return false;
1183 if((rec & INT64_MAX) == id) return true;
1184 if(rec <= INT64_MAX) return false;
1185 int vsiz;
1186 return tcmapget(idset->trails, &id, sizeof(id), &vsiz) != NULL;
1187 }
1188
1189
1190 /* Clear an ID set object. */
tcidsetclear(TCIDSET * idset)1191 void tcidsetclear(TCIDSET *idset){
1192 assert(idset);
1193 uint64_t *buckets = idset->buckets;
1194 uint32_t bnum = idset->bnum;
1195 for(int i = 0; i < bnum; i++){
1196 buckets[i] = 0;
1197 }
1198 tcmapclear(idset->trails);
1199 }
1200
1201
1202
1203 /*************************************************************************************************
1204 * private features
1205 *************************************************************************************************/
1206
1207
1208 /* Lock a method of the q-gram database object.
1209 `qdb' specifies the q-gram database object.
1210 `wr' specifies whether the lock is writer or not.
1211 If successful, the return value is true, else, it is false. */
tcqdblockmethod(TCQDB * qdb,bool wr)1212 static bool tcqdblockmethod(TCQDB *qdb, bool wr){
1213 assert(qdb);
1214 if(wr ? pthread_rwlock_wrlock(qdb->mmtx) != 0 : pthread_rwlock_rdlock(qdb->mmtx) != 0){
1215 tcbdbsetecode(qdb->idx, TCETHREAD, __FILE__, __LINE__, __func__);
1216 return false;
1217 }
1218 return true;
1219 }
1220
1221
1222 /* Unlock a method of the q-gram database object.
1223 `bdb' specifies the q-gram database object.
1224 If successful, the return value is true, else, it is false. */
tcqdbunlockmethod(TCQDB * qdb)1225 static bool tcqdbunlockmethod(TCQDB *qdb){
1226 assert(qdb);
1227 if(pthread_rwlock_unlock(qdb->mmtx) != 0){
1228 tcbdbsetecode(qdb->idx, TCETHREAD, __FILE__, __LINE__, __func__);
1229 return false;
1230 }
1231 return true;
1232 }
1233
1234
1235 /* Open a q-gram database object.
1236 `qdb' specifies the q-gram database object.
1237 `path' specifies the path of the database file.
1238 `omode' specifies the connection mode.
1239 If successful, the return value is true, else, it is false. */
tcqdbopenimpl(TCQDB * qdb,const char * path,int omode)1240 static bool tcqdbopenimpl(TCQDB *qdb, const char *path, int omode){
1241 assert(qdb && path);
1242 int bomode = BDBOREADER;
1243 if(omode & QDBOWRITER){
1244 bomode = BDBOWRITER;
1245 if(omode & QDBOCREAT) bomode |= BDBOCREAT;
1246 if(omode & QDBOTRUNC) bomode |= BDBOTRUNC;
1247 int64_t bnum = (qdb->etnum / QDBLMEMB) * 2 + 1;
1248 int bopts = 0;
1249 if(qdb->opts & QDBTLARGE) bopts |= BDBTLARGE;
1250 if(qdb->opts & QDBTDEFLATE) bopts |= BDBTDEFLATE;
1251 if(qdb->opts & QDBTBZIP) bopts |= BDBTBZIP;
1252 if(qdb->opts & QDBTTCBS) bopts |= BDBTTCBS;
1253 if(!tcbdbtune(qdb->idx, QDBLMEMB, QDBNMEMB, bnum, QDBAPOW, QDBFPOW, bopts)) return false;
1254 if(!tcbdbsetlsmax(qdb->idx, QDBLSMAX)) return false;
1255 }
1256 if(qdb->lcnum > 0){
1257 if(!tcbdbsetcache(qdb->idx, qdb->lcnum, qdb->lcnum / 4 + 1)) return false;
1258 } else {
1259 if(!tcbdbsetcache(qdb->idx, (omode & QDBOWRITER) ? QDBLCNUMW : QDBLCNUMR, QDBNCNUM))
1260 return false;
1261 }
1262 if(omode & QDBONOLCK) bomode |= BDBONOLCK;
1263 if(omode & QDBOLCKNB) bomode |= BDBOLCKNB;
1264 if(!tcbdbopen(qdb->idx, path, bomode)) return false;
1265 if((omode & QDBOWRITER) && tcbdbrnum(qdb->idx) < 1){
1266 memcpy(tcbdbopaque(qdb->idx), QDBMAGICDATA, strlen(QDBMAGICDATA));
1267 } else if(!(omode & QDBONOLCK) &&
1268 memcmp(tcbdbopaque(qdb->idx), QDBMAGICDATA, strlen(QDBMAGICDATA))){
1269 tcbdbclose(qdb->idx);
1270 tcbdbsetecode(qdb->idx, TCEMETA, __FILE__, __LINE__, __func__);
1271 return 0;
1272 }
1273 if(omode & QDBOWRITER){
1274 qdb->cc = tcmapnew2(QDBCCBNUM);
1275 qdb->dtokens = tcmapnew2(QDBDTKNBNUM);
1276 qdb->dids = tcidsetnew(QDBDIDSBNUM);
1277 }
1278 qdb->open = true;
1279 return true;
1280 }
1281
1282
1283 /* Close a q-gram database object.
1284 `qdb' specifies the q-gram database object.
1285 If successful, the return value is true, else, it is false. */
tcqdbcloseimpl(TCQDB * qdb)1286 static bool tcqdbcloseimpl(TCQDB *qdb){
1287 assert(qdb);
1288 bool err = false;
1289 if(qdb->cc){
1290 if((tcmaprnum(qdb->cc) > 0 || tcmaprnum(qdb->dtokens) > 0) && !tcqdbmemsync(qdb, 0))
1291 err = true;
1292 tcidsetdel(qdb->dids);
1293 tcmapdel(qdb->dtokens);
1294 tcmapdel(qdb->cc);
1295 qdb->cc = NULL;
1296 }
1297 if(!tcbdbclose(qdb->idx)) err = true;
1298 qdb->open = false;
1299 return !err;
1300 }
1301
1302
1303 /* Store a record into a q-gram database object.
1304 `qdb' specifies the q-gram database object.
1305 `id' specifies the ID number of the record.
1306 `text' specifies the string of the record.
1307 If successful, the return value is true, else, it is false. */
tcqdbputimpl(TCQDB * qdb,int64_t id,const char * text)1308 static bool tcqdbputimpl(TCQDB *qdb, int64_t id, const char *text){
1309 assert(qdb && id > 0 && text);
1310 int len = strlen(text);
1311 char idbuf[TDNUMBUFSIZ*2];
1312 int idsiz;
1313 TDSETVNUMBUF64(idsiz, idbuf, id);
1314 uint16_t stack[QDBIOBUFSIZ];
1315 uint16_t *ary = (len < QDBIOBUFSIZ) ? stack : tcmalloc(sizeof(*ary) * (len + 1));
1316 int anum;
1317 tcstrutftoucs(text, ary, &anum);
1318 ary[anum] = 0x0000;
1319 TCMAP *cc = qdb->cc;
1320 char *wp = idbuf + idsiz;
1321 for(int i = 0; i < anum; i++){
1322 int osiz;
1323 TDSETVNUMBUF(osiz, wp, i);
1324 tcmapputcat(cc, ary + i, 2 * sizeof(*ary), idbuf, idsiz + osiz);
1325 }
1326 if(ary != stack) tcfree(ary);
1327 bool err = false;
1328 if(tcmapmsiz(cc) >= qdb->icsiz && !tcqdbmemsync(qdb, 1)) err = true;
1329 return !err;
1330 }
1331
1332
1333 /* Remove a record of a q-gram database object.
1334 `qdb' specifies the q-gram database object.
1335 `id' specifies the ID number of the record.
1336 `text' specifies the string of the record.
1337 If successful, the return value is true, else, it is false. */
tcqdboutimpl(TCQDB * qdb,int64_t id,const char * text)1338 static bool tcqdboutimpl(TCQDB *qdb, int64_t id, const char *text){
1339 assert(qdb && id > 0 && text);
1340 int len = strlen(text);
1341 char idbuf[TDNUMBUFSIZ*2];
1342 int idsiz;
1343 TDSETVNUMBUF64(idsiz, idbuf, id);
1344 uint16_t stack[QDBIOBUFSIZ];
1345 uint16_t *ary = (len < QDBIOBUFSIZ) ? stack : tcmalloc(sizeof(*ary) * (len + 1));
1346 int anum;
1347 tcstrutftoucs(text, ary, &anum);
1348 ary[anum] = 0x0000;
1349 TCMAP *dtokens = qdb->dtokens;
1350 for(int i = 0; i < anum; i++){
1351 tcmapputkeep(dtokens, ary + i, 2 * sizeof(*ary), "", 0);
1352 }
1353 if(ary != stack) tcfree(ary);
1354 tcidsetmark(qdb->dids, id);
1355 bool err = false;
1356 if(tcmapmsiz(dtokens) >= qdb->icsiz && !tcqdbmemsync(qdb, 1)) err = true;
1357 return !err;
1358 }
1359
1360
1361 /* Search a q-gram database.
1362 `qdb' specifies the q-gram database object.
1363 `word' specifies the string of the word to be matched to.
1364 `smode' specifies the matching mode.
1365 `np' specifies the pointer to the variable into which the number of elements of the return
1366 value is assigned.
1367 If successful, the return value is the pointer to an array of ID numbers of the corresponding
1368 records. */
tcqdbsearchimpl(TCQDB * qdb,const char * word,int smode,int * np)1369 static uint64_t *tcqdbsearchimpl(TCQDB *qdb, const char *word, int smode, int *np){
1370 assert(qdb && word && np);
1371 TCBDB *idx = qdb->idx;
1372 uint64_t *res = NULL;
1373 int wsiz = strlen(word);
1374 uint16_t *ary = tcmalloc(sizeof(*ary) * (wsiz + 2));
1375 int anum;
1376 tcstrutftoucs(word, ary, &anum);
1377 for(int i = 0; i < 2; i++){
1378 ary[anum+i] = 0;
1379 }
1380 if(anum >= 2){
1381 QDBOCR *ocrs = tcmalloc(QDBOCRUNIT * sizeof(*ocrs));
1382 int onum = 0;
1383 int oanum = QDBOCRUNIT;
1384 int obase = 0;
1385 TCBITMAP *pkmap = TCBITMAPNEW(QDBBITMAPNUM);
1386 char token[2*3+1];
1387 uint16_t seq = 0;
1388 for(int i = 0; i < anum; i += 2){
1389 obase = onum;
1390 int diff = anum - i - 2;
1391 if(diff < 0){
1392 i += diff;
1393 diff = -diff;
1394 } else {
1395 diff = 0;
1396 }
1397 tcstrucstoutf(ary + i, 2, token);
1398 int tsiz = strlen(token);
1399 int csiz;
1400 const char *cbuf = tcbdbget3(idx, token, tsiz, &csiz);
1401 if(cbuf){
1402 while(csiz > 0){
1403 int64_t id = 0;
1404 int step;
1405 TDREADVNUMBUF64(cbuf, id, step);
1406 cbuf += step;
1407 csiz -= step;
1408 if(csiz > 0){
1409 int off;
1410 TDREADVNUMBUF(cbuf, off, step);
1411 cbuf += step;
1412 csiz -= step;
1413 off += diff;
1414 uint32_t hash = id % QDBBITMAPNUM;
1415 if(i == 0 || TCBITMAPCHECK(pkmap, hash)){
1416 if(onum >= oanum){
1417 oanum *= 2;
1418 ocrs = tcrealloc(ocrs, oanum * sizeof(*ocrs));
1419 }
1420 QDBOCR *ocr = ocrs + onum;
1421 ocr->id = id;
1422 ocr->off = off;
1423 ocr->seq = seq;
1424 ocr->hash = hash;
1425 onum++;
1426 if(i == 0) TCBITMAPON(pkmap, hash);
1427 }
1428 }
1429 }
1430 if(i == 0 && (smode == QDBSPREFIX || smode == QDBSFULL)){
1431 int nnum = 0;
1432 for(int j = 0; j < onum; j++){
1433 if(ocrs[j].off == 0) ocrs[nnum++] = ocrs[j];
1434 }
1435 onum = nnum;
1436 }
1437 }
1438 seq++;
1439 if(onum <= obase){
1440 onum = 0;
1441 break;
1442 }
1443 }
1444 if(smode == QDBSSUFFIX || smode == QDBSFULL){
1445 obase = onum;
1446 int diff = anum % 2 + 1;
1447 tcstrucstoutf(ary + (anum - 1), 2, token);
1448 int tsiz = strlen(token);
1449 int csiz;
1450 const char *cbuf = tcbdbget3(idx, token, tsiz, &csiz);
1451 if(cbuf){
1452 while(csiz > 0){
1453 int64_t id = 0;
1454 int step;
1455 TDREADVNUMBUF64(cbuf, id, step);
1456 cbuf += step;
1457 csiz -= step;
1458 if(csiz > 0){
1459 int off;
1460 TDREADVNUMBUF(cbuf, off, step);
1461 cbuf += step;
1462 csiz -= step;
1463 off += diff;
1464 uint32_t hash = id % QDBBITMAPNUM;
1465 if(TCBITMAPCHECK(pkmap, hash)){
1466 if(onum >= oanum){
1467 oanum *= 2;
1468 ocrs = tcrealloc(ocrs, oanum * sizeof(*ocrs));
1469 }
1470 QDBOCR *ocr = ocrs + onum;
1471 ocr->id = id;
1472 ocr->off = off;
1473 ocr->seq = seq;
1474 ocr->hash = hash;
1475 onum++;
1476 }
1477 }
1478 }
1479 }
1480 seq++;
1481 if(onum <= obase) onum = 0;
1482 }
1483 TCBITMAPDEL(pkmap);
1484 if(seq > 1){
1485 if(onum > UINT16_MAX){
1486 int flnum = onum * 16 + 1;
1487 TCBITMAP *flmap = TCBITMAPNEW(flnum);
1488 for(int i = obase; i < onum; i++){
1489 QDBOCR *ocr = ocrs + i;
1490 uint32_t hash = (((uint32_t)ocr->off << 16) | ocr->hash) % flnum;
1491 TCBITMAPON(flmap, hash);
1492 }
1493 int wi = 0;
1494 for(int i = 0; i < obase; i++){
1495 QDBOCR *ocr = ocrs + i;
1496 int rem = (seq - ocr->seq - 1) * 2;
1497 uint32_t hash = (((uint32_t)(ocr->off + rem) << 16) | ocr->hash) % flnum;
1498 if(TCBITMAPCHECK(flmap, hash)) ocrs[wi++] = *ocr;
1499 }
1500 for(int i = obase; i < onum; i++){
1501 ocrs[wi++] = ocrs[i];
1502 }
1503 onum = wi;
1504 TCBITMAPDEL(flmap);
1505 }
1506 if(onum > UINT16_MAX * 2){
1507 QDBOCR *rocrs = tcmalloc(sizeof(*rocrs) * onum);
1508 uint32_t counts[UINT16_MAX+1];
1509 memset(counts, 0, sizeof(counts));
1510 for(int i = 0; i < onum; i++){
1511 counts[ocrs[i].hash]++;
1512 }
1513 for(int i = 0; i < UINT16_MAX; i++){
1514 counts[i+1] += counts[i];
1515 }
1516 for(int i = onum - 1; i >= 0; i--){
1517 rocrs[--counts[ocrs[i].hash]] = ocrs[i];
1518 }
1519 for(int i = 0; i < UINT16_MAX; i++){
1520 int num = counts[i+1] - counts[i];
1521 if(num > 1) qsort(rocrs + counts[i], num, sizeof(*rocrs),
1522 (int (*)(const void *, const void *))tccmpocrs);
1523 }
1524 int num = onum - counts[UINT16_MAX];
1525 if(num > 1) qsort(rocrs + counts[UINT16_MAX], num, sizeof(*rocrs),
1526 (int (*)(const void *, const void *))tccmpocrs);
1527 tcfree(ocrs);
1528 ocrs = rocrs;
1529 } else if(onum > 1){
1530 qsort(ocrs, onum, sizeof(*ocrs),
1531 (int (*)(const void *, const void *))tccmpocrs);
1532 }
1533 }
1534 TCIDSET *idset = tcidsetnew(onum * QDBHJBNUMCO + 1);
1535 res = tcmalloc(sizeof(*res) * onum + 1);
1536 int rnum = 0;
1537 int rem = (seq - 1) * 2;
1538 int ri = 0;
1539 while(ri < onum){
1540 QDBOCR *ocr = ocrs + ri;
1541 ri++;
1542 if(ocr->seq > 0) continue;
1543 int64_t id = ocr->id;
1544 int32_t max = ocr->off;
1545 uint16_t seq = 1;
1546 for(int i = ri; i < onum; i++){
1547 QDBOCR *tocr = ocrs + i;
1548 if(tocr->id != id) break;
1549 if(tocr->seq == seq && tocr->off == max + 2){
1550 max = tocr->off;
1551 seq++;
1552 }
1553 }
1554 if(max == ocr->off + rem){
1555 if(!tcidsetcheck(idset, id)){
1556 res[rnum++] = id;
1557 tcidsetmark(idset, id);
1558 }
1559 while(ri < onum && ocrs[ri].id == id){
1560 ri++;
1561 }
1562 }
1563 }
1564 tcidsetdel(idset);
1565 tcfree(ocrs);
1566 *np = rnum;
1567 } else {
1568 TCIDSET *idset = tcidsetnew(QDBBITMAPNUM / 8 + 1);
1569 res = tcmalloc(sizeof(*res) * QDBOCRUNIT);
1570 int ranum = QDBOCRUNIT;
1571 int rnum = 0;
1572 BDBCUR *cur = tcbdbcurnew(idx);
1573 tcbdbcurjump(cur, word, wsiz);
1574 TCXSTR *key = tcxstrnew();
1575 TCXSTR *val = tcxstrnew();
1576 bool pchk = smode == QDBSPREFIX || smode == QDBSFULL;
1577 bool schk = smode == QDBSSUFFIX || smode == QDBSFULL;
1578 for(int i = 0; i < qdb->fwmmax && tcbdbcurrec(cur, key, val); i++){
1579 const char *kbuf = TCXSTRPTR(key);
1580 int ksiz = TCXSTRSIZE(key);
1581 if(ksiz < wsiz || memcmp(kbuf, word, wsiz)) break;
1582 if(schk && (ksiz != wsiz || memcmp(kbuf, word, wsiz))) break;
1583 const char *cbuf = TCXSTRPTR(val);
1584 int csiz = TCXSTRSIZE(val);
1585 while(csiz > 0){
1586 int64_t id = 0;
1587 int step;
1588 TDREADVNUMBUF64(cbuf, id, step);
1589 cbuf += step;
1590 csiz -= step;
1591 if(csiz > 0){
1592 int off;
1593 TDREADVNUMBUF(cbuf, off, step);
1594 cbuf += step;
1595 csiz -= step;
1596 if((!pchk || off == 0) && !tcidsetcheck(idset, id)){
1597 if(rnum >= ranum){
1598 ranum *= 2;
1599 res = tcrealloc(res, sizeof(*res) * ranum);
1600 }
1601 res[rnum++] = id;
1602 tcidsetmark(idset, id);
1603 }
1604 }
1605 }
1606 tcbdbcurnext(cur);
1607 }
1608 tcxstrdel(val);
1609 tcxstrdel(key);
1610 tcbdbcurdel(cur);
1611 tcidsetdel(idset);
1612 qsort(res, rnum, sizeof(*res), (int (*)(const void *, const void *))tccmpuint64);
1613 *np = rnum;
1614 }
1615 tcfree(ary);
1616 return res;
1617 }
1618
1619
1620 /* Compare two list elements in lexical order.
1621 `a' specifies the pointer to one element.
1622 `b' specifies the pointer to the other element.
1623 The return value is positive if the former is big, negative if the latter is big, 0 if both
1624 are equivalent. */
tccmptokens(const uint16_t ** a,const uint16_t ** b)1625 static int tccmptokens(const uint16_t **a, const uint16_t **b){
1626 assert(a && b);
1627 uint32_t anum = ((*a)[0] << 16UL) + (*a)[1];
1628 uint32_t bnum = ((*b)[0] << 16UL) + (*b)[1];
1629 return (anum < bnum) ? -1 : anum > bnum;
1630 }
1631
1632
1633 /* Compare two occurrences in identical order.
1634 `a' specifies the pointer to one occurrence.
1635 `b' specifies the pointer to the other occurrence.
1636 The return value is positive if the former is big, negative if the latter is big, 0 if both
1637 are equivalent. */
tccmpocrs(QDBOCR * a,QDBOCR * b)1638 static int tccmpocrs(QDBOCR *a, QDBOCR *b){
1639 assert(a && b);
1640 if(a->id > b->id) return 1;
1641 if(a->id < b->id) return -1;
1642 return a->off - b->off;
1643 }
1644
1645
1646 /* Compare two unsigned 64-bit integers.
1647 `a' specifies the pointer to one number.
1648 `b' specifies the pointer to the other number.
1649 The return value is positive if the former is big, negative if the latter is big, 0 if both
1650 are equivalent. */
tccmpuint64(const uint64_t * a,const uint64_t * b)1651 static int tccmpuint64(const uint64_t *a, const uint64_t *b){
1652 assert(a && b);
1653 return (*a < *b) ? -1 : *a > *b;
1654 }
1655
1656
1657
1658 // END OF FILE
1659