1 /*************************************************************************************************
2  * The q-gram database API of Tokyo Dystopia
3  *                                                               Copyright (C) 2007-2010 FAL Labs
4  * This file is part of Tokyo Dystopia.
5  * Tokyo Dystopia is free software; you can redistribute it and/or modify it under the terms of
6  * the GNU Lesser General Public License as published by the Free Software Foundation; either
7  * version 2.1 of the License or any later version.  Tokyo Dystopia is distributed in the hope
8  * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
10  * License for more details.
11  * You should have received a copy of the GNU Lesser General Public License along with Tokyo
12  * Dystopia; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13  * Boston, MA 02111-1307 USA.
14  *************************************************************************************************/
15 
16 
17 #ifndef _TCQDB_H                         /* duplication check */
18 #define _TCQDB_H
19 
20 
21 #if defined(__cplusplus)                 /* C++ compiler: give the declarations C linkage */
22 #define __TCQDB_CLINKAGEBEGIN extern "C" {
23 #define __TCQDB_CLINKAGEEND }
24 #else                                    /* C compiler: both markers expand to nothing */
25 #define __TCQDB_CLINKAGEBEGIN
26 #define __TCQDB_CLINKAGEEND
27 #endif
28 __TCQDB_CLINKAGEBEGIN                    /* opens the linkage block closed by __TCQDB_CLINKAGEEND */
29 
30 
31 #include <tcutil.h>
32 #include <tchdb.h>
33 #include <tcbdb.h>
34 
35 
36 
37 /*************************************************************************************************
38  * API
39  *************************************************************************************************/
40 
41 
42 typedef struct {                         /* q-gram database object (see tcqdbnew and tcqdbdel) */
43   void *mmtx;                            /* mutex for method, held as an untyped pointer */
44   TCBDB *idx;                            /* internal database object (TCBDB handle) */
45   bool open;                             /* whether the internal database is opened */
46   TCMAP *cc;                             /* cache of q-gram tokens */
47   uint64_t icsiz;                        /* capacity of the token cache (see tcqdbsetcache) */
48   uint32_t lcnum;                        /* max number of cached leaf nodes (see tcqdbsetcache) */
49   TCMAP *dtokens;                        /* deleted tokens */
50   struct _TCIDSET *dids;                 /* deleted ID numbers, kept as an ID set (TCIDSET) */
51   uint32_t etnum;                        /* expected number of tokens (see tcqdbtune) */
52   uint8_t opts;                          /* options: bitwise-or of QDBT* flags (see tcqdbtune) */
53   uint32_t fwmmax;                       /* max number of forward matching expansion (see tcqdbsetfwmmax) */
54   bool (*synccb)(int, int, const char *, void *);  /* sync progression callback (see tcqdbsetsynccb) */
55   void *syncopq;                         /* opaque pointer given to the sync callback function */
56 } TCQDB;
57 
58 enum {                                   /* tuning options: bit flags for `opts' of tcqdbtune */
59   QDBTLARGE = 1 << 0,                    /* use 64-bit bucket array (database can exceed 2GB) */
60   QDBTDEFLATE = 1 << 1,                  /* compress each page with Deflate */
61   QDBTBZIP = 1 << 2,                     /* compress each page with BZIP2 */
62   QDBTTCBS = 1 << 3                      /* compress each page with TCBS */
63 };
64 
65 enum {                                   /* open modes: bit flags for `omode' of tcqdbopen */
66   QDBOREADER = 1 << 0,                   /* open as a reader */
67   QDBOWRITER = 1 << 1,                   /* open as a writer */
68   QDBOCREAT = 1 << 2,                    /* with writer: create the database if it does not exist */
69   QDBOTRUNC = 1 << 3,                    /* with writer: create a new database even if one exists */
70   QDBONOLCK = 1 << 4,                    /* open without file locking */
71   QDBOLCKNB = 1 << 5                     /* perform locking without blocking */
72 };
73 
74 enum {                                   /* search modes for `smode' of tcqdbsearch; plain values, not bit flags */
75   QDBSSUBSTR,                            /* substring matching */
76   QDBSPREFIX,                            /* prefix matching */
77   QDBSSUFFIX,                            /* suffix matching */
78   QDBSFULL                               /* full matching */
79 };
80 
81 
82 /* String containing the version information. */
83 extern const char *tdversion;
84 
85 
86 /* Get the message string corresponding to an error code.
87    `ecode' specifies the error code.
88    The return value is the message string of the error code. */
89 const char *tcqdberrmsg(int ecode);
90 
91 
92 /* Create a q-gram database object.
93    The return value is the new q-gram database object. */
94 TCQDB *tcqdbnew(void);
95 
96 
97 /* Delete a q-gram database object.
98    `qdb' specifies the q-gram database object.
99    If the database is not closed, it is closed implicitly.  Note that the deleted object and its
100    derivatives can not be used anymore. */
101 void tcqdbdel(TCQDB *qdb);
102 
103 
104 /* Get the last happened error code of a q-gram database object.
105    `qdb' specifies the q-gram database object.
106    The return value is the last happened error code.
107    The following error codes are defined: `TCESUCCESS' for success, `TCETHREAD' for threading
108    error, `TCEINVALID' for invalid operation, `TCENOFILE' for file not found, `TCENOPERM' for no
109    permission, `TCEMETA' for invalid meta data, `TCERHEAD' for invalid record header, `TCEOPEN'
110    for open error, `TCECLOSE' for close error, `TCETRUNC' for trunc error, `TCESYNC' for sync
111    error, `TCESTAT' for stat error, `TCESEEK' for seek error, `TCEREAD' for read error,
112    `TCEWRITE' for write error, `TCEMMAP' for mmap error, `TCELOCK' for lock error, `TCEUNLINK'
113    for unlink error, `TCERENAME' for rename error, `TCEMKDIR' for mkdir error, `TCERMDIR' for
114    rmdir error, `TCEKEEP' for existing record, `TCENOREC' for no record found, and `TCEMISC' for
115    miscellaneous error. */
116 int tcqdbecode(TCQDB *qdb);
117 
118 
119 /* Set the tuning parameters of a q-gram database object.
120    `qdb' specifies the q-gram database object which is not opened.
121    `etnum' specifies the expected number of tokens to be stored.  If it is not more than 0, the
122    default value is specified.  The default value is 1000000.
123    `opts' specifies options by bitwise-or: `QDBTLARGE' specifies that the size of the database
124    can be larger than 2GB by using 64-bit bucket array, `QDBTDEFLATE' specifies that each page
125    is compressed with Deflate encoding, `QDBTBZIP' specifies that each page is compressed with
126    BZIP2 encoding, `QDBTTCBS' specifies that each page is compressed with TCBS encoding.
127    If successful, the return value is true, else, it is false.
128    Note that the tuning parameters should be set before the database is opened. */
129 bool tcqdbtune(TCQDB *qdb, int64_t etnum, uint8_t opts);
130 
131 
132 /* Set the caching parameters of a q-gram database object.
133    `qdb' specifies the q-gram database object which is not opened.
134    `icsiz' specifies the capacity size of the token cache.  If it is not more than 0, the default
135    value is specified.  The default value is 134217728.
136    `lcnum' specifies the maximum number of cached leaf nodes of B+ tree.  If it is not more than
137    0, the default value is specified.  The default value is 64 for writer or 1024 for reader.
138    If successful, the return value is true, else, it is false.
139    Note that the caching parameters should be set before the database is opened. */
140 bool tcqdbsetcache(TCQDB *qdb, int64_t icsiz, int32_t lcnum);
141 
142 
143 /* Set the maximum number of forward matching expansion of a q-gram database object.
144    `qdb' specifies the q-gram database object.
145    `fwmmax' specifies the maximum number of forward matching expansion.
146    If successful, the return value is true, else, it is false.
147    Note that the matching parameters should be set before the database is opened. */
148 bool tcqdbsetfwmmax(TCQDB *qdb, uint32_t fwmmax);
149 
150 
151 /* Open a q-gram database object.
152    `qdb' specifies the q-gram database object.
153    `path' specifies the path of the database file.
154    `omode' specifies the connection mode: `QDBOWRITER' as a writer, `QDBOREADER' as a reader.
155    If the mode is `QDBOWRITER', the following may be added by bitwise-or: `QDBOCREAT', which
156    means it creates a new database if it does not exist, and `QDBOTRUNC', which means it creates
157    a new database regardless of whether one exists.  Both `QDBOREADER' and `QDBOWRITER' can be
158    combined by bitwise-or with `QDBONOLCK', which means it opens the database file without file
159    locking, or `QDBOLCKNB', which means locking is performed without blocking.
160    If successful, the return value is true, else, it is false. */
161 bool tcqdbopen(TCQDB *qdb, const char *path, int omode);
162 
163 
164 /* Close a q-gram database object.
165    `qdb' specifies the q-gram database object.
166    If successful, the return value is true, else, it is false.
167    Update of a database is assured to be written when the database is closed.  If a writer opens
168    a database but does not close it appropriately, the database will be broken. */
169 bool tcqdbclose(TCQDB *qdb);
170 
171 
172 /* Store a record into a q-gram database object.
173    `qdb' specifies the q-gram database object connected as a writer.
174    `id' specifies the ID number of the record.  It should be positive.
175    `text' specifies the string of the record, whose encoding should be UTF-8.
176    If successful, the return value is true, else, it is false. */
177 bool tcqdbput(TCQDB *qdb, int64_t id, const char *text);
178 
179 
180 /* Remove a record of a q-gram database object.
181    `qdb' specifies the q-gram database object connected as a writer.
182    `id' specifies the ID number of the record.  It should be positive.
183    `text' specifies the string of the record, which should be same as the stored one.
184    If successful, the return value is true, else, it is false. */
185 bool tcqdbout(TCQDB *qdb, int64_t id, const char *text);
186 
187 
188 /* Search a q-gram database.
189    `qdb' specifies the q-gram database object.
190    `word' specifies the string of the word to be matched to.
191    `smode' specifies the matching mode: `QDBSSUBSTR' as substring matching, `QDBSPREFIX' as prefix
192    matching, `QDBSSUFFIX' as suffix matching, or `QDBSFULL' as full matching.
193    `np' specifies the pointer to the variable into which the number of elements of the return
194    value is assigned.
195    If successful, the return value is the pointer to an array of ID numbers of the corresponding
196    records.  `NULL' is returned on failure.
197    Because the region of the return value is allocated with the `malloc' call, it should be
198    released with the `free' call when it is no longer in use. */
199 uint64_t *tcqdbsearch(TCQDB *qdb, const char *word, int smode, int *np);
200 
201 
202 /* Synchronize updated contents of a q-gram database object with the file and the device.
203    `qdb' specifies the q-gram database object connected as a writer.
204    If successful, the return value is true, else, it is false.
205    This function is useful when another process connects the same database file. */
206 bool tcqdbsync(TCQDB *qdb);
207 
208 
209 /* Optimize the file of a q-gram database object.
210    `qdb' specifies the q-gram database object connected as a writer.
211    If successful, the return value is true, else, it is false.
212    This function is useful to reduce the size of the database file with data fragmentation by
213    successive updating. */
214 bool tcqdboptimize(TCQDB *qdb);
215 
216 
217 /* Remove all records of a q-gram database object.
218    `qdb' specifies the q-gram database object connected as a writer.
219    If successful, the return value is true, else, it is false. */
220 bool tcqdbvanish(TCQDB *qdb);
221 
222 
223 /* Copy the database file of a q-gram database object.
224    `qdb' specifies the q-gram database object.
225    `path' specifies the path of the destination file.  If it begins with `@', the trailing
226    substring is executed as a command line.
227    If successful, the return value is true, else, it is false.  False is returned if the executed
228    command returns non-zero code.
229    The database file is assured to be kept synchronized and not modified while the copying or
230    executing operation is in progress.  So, this function is useful to create a backup file of
231    the database file. */
232 bool tcqdbcopy(TCQDB *qdb, const char *path);
233 
234 
235 /* Get the file path of a q-gram database object.
236    `qdb' specifies the q-gram database object.
237    The return value is the path of the database file or `NULL' if the object does not connect to
238    any database file. */
239 const char *tcqdbpath(TCQDB *qdb);
240 
241 
242 /* Get the number of tokens of a q-gram database object.
243    `qdb' specifies the q-gram database object.
244    The return value is the number of tokens or 0 if the object does not connect to any database
245    file. */
246 uint64_t tcqdbtnum(TCQDB *qdb);
247 
248 
249 /* Get the size of the database file of a q-gram database object.
250    `qdb' specifies the q-gram database object.
251    The return value is the size of the database file or 0 if the object does not connect to any
252    database file. */
253 uint64_t tcqdbfsiz(TCQDB *qdb);
254 
255 
256 
257 /*************************************************************************************************
258  * features for experts
259  *************************************************************************************************/
260 
261 
262 #define _TD_VERSION    "0.9.15"          /* version string (presumably of the package; confirm) */
263 #define _TD_LIBVER     115               /* library version number */
264 #define _TD_FORMATVER  "0.9"             /* format version string (presumably of the database file; confirm) */
265 
266 #define QDBSYNCMSGF    "started"         /* first message of sync progression (see tcqdbsetsynccb) */
267 #define QDBSYNCMSGL    "finished"        /* last message of sync progression (see tcqdbsetsynccb) */
268 
269 typedef struct {                         /* result set, the input element of the tcqdbres* functions */
270   uint64_t *ids;                         /* array of ID numbers */
271   int num;                               /* number of elements of the `ids' array */
272 } QDBRSET;
273 
274 typedef struct _TCIDSET {                /* ID set, operated on by the tcidset* functions */
275   uint64_t *buckets;                     /* bucket array */
276   uint32_t bnum;                         /* number of buckets (see `bnum' of tcidsetnew) */
277   TCMAP *trails;                         /* map of trailing records */
278 } TCIDSET;
279 
280 enum {                                   /* text normalization options: bit flags for `opts' of tctextnormalize */
281   TCTNLOWER = 1 << 0,                    /* alphabetical characters into lower cases */
282   TCTNNOACC = 1 << 1,                    /* accented alphabetical characters into ones without accent marks */
283   TCTNSPACE = 1 << 2                     /* white space characters into the ASCII space, squeezed into one */
284 };
285 
286 
287 /* Set the file descriptor for debugging output.
288    `qdb' specifies the q-gram database object.
289    `fd' specifies the file descriptor for debugging output. */
290 void tcqdbsetdbgfd(TCQDB *qdb, int fd);
291 
292 
293 /* Get the file descriptor for debugging output.
294    `qdb' specifies the q-gram database object.
295    The return value is the file descriptor for debugging output. */
296 int tcqdbdbgfd(TCQDB *qdb);
297 
298 
299 /* Synchronize updating contents on memory of a q-gram database object.
300    `qdb' specifies the q-gram database object.
301    `level' specifies the synchronization level; 0 means cache synchronization, 1 means database
302    synchronization, and 2 means file synchronization.
303    If successful, the return value is true, else, it is false. */
304 bool tcqdbmemsync(TCQDB *qdb, int level);
305 
306 
307 /* Clear the cache of a q-gram database object.
308    `qdb' specifies the q-gram database object.
309    If successful, the return value is true, else, it is false. */
310 bool tcqdbcacheclear(TCQDB *qdb);
311 
312 
313 /* Get the inode number of the database file of a q-gram database object.
314    `qdb' specifies the q-gram database object.
315    The return value is the inode number of the database file or 0 if the object does not connect
316    to any database file. */
317 uint64_t tcqdbinode(TCQDB *qdb);
318 
319 
320 /* Get the modification time of the database file of a q-gram database object.
321    `qdb' specifies the q-gram database object.
322    The return value is the modification time of the database file or 0 if the object does not
323    connect to any database file. */
324 time_t tcqdbmtime(TCQDB *qdb);
325 
326 
327 /* Get the options of a q-gram database object.
328    `qdb' specifies the q-gram database object.
329    The return value is the options. */
330 uint8_t tcqdbopts(TCQDB *qdb);
331 
332 
333 /* Get the maximum number of forward matching expansion of a q-gram database object.
334    `qdb' specifies the q-gram database object.
335    The return value is the maximum number of forward matching expansion. */
336 uint32_t tcqdbfwmmax(TCQDB *qdb);
337 
338 
339 /* Get the number of records in the cache of a q-gram database object.
340    `qdb' specifies the q-gram database object.
341    The return value is the number of records in the cache. */
342 uint32_t tcqdbcnum(TCQDB *qdb);
343 
344 
345 /* Set the callback function for sync progression of a q-gram database object.
346    `qdb' specifies the q-gram database object.
347    `cb' specifies the pointer to the callback function for sync progression.  Its first argument
348    specifies the number of tokens to be synchronized.  Its second argument specifies the number
349    of processed tokens.  Its third argument specifies the message string.  The fourth argument
350    specifies an arbitrary pointer.  Its return value should be true usually, or false if the sync
351    operation should be terminated.
352    `opq' specifies the arbitrary pointer to be given to the callback function. */
353 void tcqdbsetsynccb(TCQDB *qdb, bool (*cb)(int, int, const char *, void *), void *opq);
354 
355 
356 /* Merge multiple result sets by union.
357    `rsets' specifies the pointer to the array of result sets.
358    `rsnum' specifies the number of the array.
359    `np' specifies the pointer to the variable into which the number of elements of the return
360    value is assigned.
361    If successful, the return value is the pointer to an array of ID numbers of the result.
362    Because the region of the return value is allocated with the `malloc' call, it should be
363    released with the `free' call when it is no longer in use. */
364 uint64_t *tcqdbresunion(QDBRSET *rsets, int rsnum, int *np);
365 
366 
367 /* Merge multiple result sets by intersection.
368    `rsets' specifies the pointer to the array of result sets.
369    `rsnum' specifies the number of the array.
370    `np' specifies the pointer to the variable into which the number of elements of the return
371    value is assigned.
372    If successful, the return value is the pointer to an array of ID numbers of the result.
373    Because the region of the return value is allocated with the `malloc' call, it should be
374    released with the `free' call when it is no longer in use. */
375 uint64_t *tcqdbresisect(QDBRSET *rsets, int rsnum, int *np);
376 
377 
378 /* Merge multiple result sets by difference.
379    `rsets' specifies the pointer to the array of result sets.
380    `rsnum' specifies the number of the array.
381    `np' specifies the pointer to the variable into which the number of elements of the return
382    value is assigned.
383    If successful, the return value is the pointer to an array of ID numbers of the result.
384    Because the region of the return value is allocated with the `malloc' call, it should be
385    released with the `free' call when it is no longer in use. */
386 uint64_t *tcqdbresdiff(QDBRSET *rsets, int rsnum, int *np);
387 
388 
389 /* Normalize a text.
390    `text' specifies the string of the record, whose encoding should be UTF-8.
391    `opts' specifies options by bitwise-or: `TCTNLOWER' specifies that alphabetical characters are
392    normalized into lower cases, `TCTNNOACC' specifies that alphabetical characters with accent
393    marks are normalized without accent marks, `TCTNSPACE' specifies that white space characters
394    are normalized into the ASCII space and they are squeezed into one. */
395 void tctextnormalize(char *text, int opts);
396 
397 
398 /* Create an ID set object.
399    `bnum' specifies the number of the buckets.
400    The return value is the new ID set object. */
401 TCIDSET *tcidsetnew(uint32_t bnum);
402 
403 
404 /* Delete an ID set object.
405    `idset' specifies the ID set object. */
406 void tcidsetdel(TCIDSET *idset);
407 
408 
409 /* Mark an ID number of an ID set object.
410    `idset' specifies the ID set object.
411    `id' specifies the ID number. */
412 void tcidsetmark(TCIDSET *idset, int64_t id);
413 
414 
415 /* Check an ID of an ID set object.
416    `idset' specifies the ID set object.
417    `id' specifies the ID number.
418    The return value is true if the ID number is marked, else, it is false. */
419 bool tcidsetcheck(TCIDSET *idset, int64_t id);
420 
421 
422 /* Clear an ID set object.
423    `idset' specifies the ID set object. */
424 void tcidsetclear(TCIDSET *idset);
425 
426 
427 
428 __TCQDB_CLINKAGEEND
429 #endif                                   /* duplication check */
430 
431 
432 /* END OF FILE */
433