1 /************************************************************************************************* 2 * The q-gram database API of Tokyo Dystopia 3 * Copyright (C) 2007-2010 FAL Labs 4 * This file is part of Tokyo Dystopia. 5 * Tokyo Dystopia is free software; you can redistribute it and/or modify it under the terms of 6 * the GNU Lesser General Public License as published by the Free Software Foundation; either 7 * version 2.1 of the License or any later version. Tokyo Dystopia is distributed in the hope 8 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 10 * License for more details. 11 * You should have received a copy of the GNU Lesser General Public License along with Tokyo 12 * Dystopia; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 13 * Boston, MA 02111-1307 USA. 14 *************************************************************************************************/ 15 16 17 #ifndef _TCQDB_H /* duplication check */ 18 #define _TCQDB_H 19 20 21 #if defined(__cplusplus) 22 #define __TCQDB_CLINKAGEBEGIN extern "C" { 23 #define __TCQDB_CLINKAGEEND } 24 #else 25 #define __TCQDB_CLINKAGEBEGIN 26 #define __TCQDB_CLINKAGEEND 27 #endif 28 __TCQDB_CLINKAGEBEGIN 29 30 31 #include <tcutil.h> 32 #include <tchdb.h> 33 #include <tcbdb.h> 34 35 36 37 /************************************************************************************************* 38 * API 39 *************************************************************************************************/ 40 41 42 typedef struct { /* type of structure for a q-gram database */ 43 void *mmtx; /* mutex for method */ 44 TCBDB *idx; /* internal database object */ 45 bool open; /* whether the internal database is opened */ 46 TCMAP *cc; /* cache of q-gram tokens */ 47 uint64_t icsiz; /* capacity of the cache */ 48 uint32_t lcnum; /* max number of cached leaves */ 49 TCMAP *dtokens; /* deleted tokens */ 50 struct _TCIDSET *dids; /* deleted ID numbers */ 51 uint32_t etnum; /* expected number of tokens */ 52 uint8_t opts; /* options */ 53 uint32_t fwmmax; /* maximum number of forward matching expansion */ 54 bool (*synccb)(int, int, const char *, void *); /* callback function for sync progression */ 55 void *syncopq; /* opaque for the sync callback function */ 56 } TCQDB; 57 58 enum { /* enumeration for tuning options */ 59 QDBTLARGE = 1 << 0, /* use 64-bit bucket array */ 60 QDBTDEFLATE = 1 << 1, /* compress each page with Deflate */ 61 QDBTBZIP = 1 << 2, /* compress each record with BZIP2 */ 62 QDBTTCBS = 1 << 3 /* compress each page with TCBS */ 63 }; 64 65 enum { /* enumeration for open modes */ 66 QDBOREADER = 1 << 0, /* open as a reader */ 67 QDBOWRITER = 1 << 1, /* open as a writer */ 68 QDBOCREAT = 1 << 2, /* writer creating */ 69 QDBOTRUNC = 1 << 3, /* writer truncating */ 70 QDBONOLCK = 1 << 4, /* open without locking */ 71 QDBOLCKNB = 1 << 5 /* lock without blocking */ 72 }; 73 74 enum { /* enumeration for get modes */ 75 QDBSSUBSTR, /* substring matching */ 76 QDBSPREFIX, /* prefix matching */ 77 QDBSSUFFIX, /* suffix matching */ 78 QDBSFULL /* full matching */ 79 }; 80 81 82 /* String containing the version information. */ 83 extern const char *tdversion; 84 85 86 /* Get the message string corresponding to an error code. 87 `ecode' specifies the error code. 88 The return value is the message string of the error code. */ 89 const char *tcqdberrmsg(int ecode); 90 91 92 /* Create a q-gram database object. 93 The return value is the new q-gram database object. */ 94 TCQDB *tcqdbnew(void); 95 96 97 /* Delete a q-gram database object. 98 `qdb' specifies the q-gram database object. 99 If the database is not closed, it is closed implicitly. Note that the deleted object and its 100 derivatives can not be used anymore. */ 101 void tcqdbdel(TCQDB *qdb); 102 103 104 /* Get the last happened error code of a q-gram database object. 105 `qdb' specifies the q-gram database object. 106 The return value is the last happened error code. 107 The following error code is defined: `TCESUCCESS' for success, `TCETHREAD' for threading 108 error, `TCEINVALID' for invalid operation, `TCENOFILE' for file not found, `TCENOPERM' for no 109 permission, `TCEMETA' for invalid meta data, `TCERHEAD' for invalid record header, `TCEOPEN' 110 for open error, `TCECLOSE' for close error, `TCETRUNC' for trunc error, `TCESYNC' for sync 111 error, `TCESTAT' for stat error, `TCESEEK' for seek error, `TCEREAD' for read error, 112 `TCEWRITE' for write error, `TCEMMAP' for mmap error, `TCELOCK' for lock error, `TCEUNLINK' 113 for unlink error, `TCERENAME' for rename error, `TCEMKDIR' for mkdir error, `TCERMDIR' for 114 rmdir error, `TCEKEEP' for existing record, `TCENOREC' for no record found, and `TCEMISC' for 115 miscellaneous error. */ 116 int tcqdbecode(TCQDB *qdb); 117 118 119 /* Set the tuning parameters of a q-gram database object. 120 `qdb' specifies the q-gram database object which is not opened. 121 `etnum' specifies the expected number of tokens to be stored. If it is not more than 0, the 122 default value is specified. The default value is 1000000. 123 `opts' specifies options by bitwise-or: `QDBTLARGE' specifies that the size of the database 124 can be larger than 2GB by using 64-bit bucket array, `QDBTDEFLATE' specifies that each page 125 is compressed with Deflate encoding, `QDBTBZIP' specifies that each page is compressed with 126 BZIP2 encoding, `QDBTTCBS' specifies that each page is compressed with TCBS encoding. 127 If successful, the return value is true, else, it is false. 128 Note that the tuning parameters should be set before the database is opened. */ 129 bool tcqdbtune(TCQDB *qdb, int64_t etnum, uint8_t opts); 130 131 132 /* Set the caching parameters of a q-gram database object. 133 `qdb' specifies the q-gram database object which is not opened. 134 `icsiz' specifies the capacity size of the token cache. If it is not more than 0, the default 135 value is specified. The default value is 134217728. 136 `lcnum' specifies the maximum number of cached leaf nodes of B+ tree. If it is not more than 137 0, the default value is specified. The default value is 64 for writer or 1024 for reader. 138 If successful, the return value is true, else, it is false. 139 Note that the caching parameters should be set before the database is opened. */ 140 bool tcqdbsetcache(TCQDB *qdb, int64_t icsiz, int32_t lcnum); 141 142 143 /* Set the maximum number of forward matching expansion of a q-gram database object. 144 `qdb' specifies the q-gram database object. 145 `fwmmax' specifies the maximum number of forward matching expansion. 146 If successful, the return value is true, else, it is false. 147 Note that the matching parameters should be set before the database is opened. */ 148 bool tcqdbsetfwmmax(TCQDB *qdb, uint32_t fwmmax); 149 150 151 /* Open a q-gram database object. 152 `qdb' specifies the q-gram database object. 153 `path' specifies the path of the database file. 154 `omode' specifies the connection mode: `QDBOWRITER' as a writer, `QDBOREADER' as a reader. 155 If the mode is `QDBOWRITER', the following may be added by bitwise-or: `QDBOCREAT', which 156 means it creates a new database if not exist, `QDBOTRUNC', which means it creates a new 157 database regardless if one exists. Both of `QDBOREADER' and `QDBOWRITER' can be added to by 158 bitwise-or: `QDBONOLCK', which means it opens the database file without file locking, or 159 `QDBOLCKNB', which means locking is performed without blocking. 160 If successful, the return value is true, else, it is false. */ 161 bool tcqdbopen(TCQDB *qdb, const char *path, int omode); 162 163 164 /* Close a q-gram database object. 165 `qdb' specifies the q-gram database object. 166 If successful, the return value is true, else, it is false. 167 Update of a database is assured to be written when the database is closed. If a writer opens 168 a database but does not close it appropriately, the database will be broken. */ 169 bool tcqdbclose(TCQDB *qdb); 170 171 172 /* Store a record into a q-gram database object. 173 `qdb' specifies the q-gram database object connected as a writer. 174 `id' specifies the ID number of the record. It should be positive. 175 `text' specifies the string of the record, whose encoding should be UTF-8. 176 If successful, the return value is true, else, it is false. */ 177 bool tcqdbput(TCQDB *qdb, int64_t id, const char *text); 178 179 180 /* Remove a record of a q-gram database object. 181 `qdb' specifies the q-gram database object connected as a writer. 182 `id' specifies the ID number of the record. It should be positive. 183 `text' specifies the string of the record, which should be same as the stored one. 184 If successful, the return value is true, else, it is false. */ 185 bool tcqdbout(TCQDB *qdb, int64_t id, const char *text); 186 187 188 /* Search a q-gram database. 189 `qdb' specifies the q-gram database object. 190 `word' specifies the string of the word to be matched to. 191 `smode' specifies the matching mode: `QDBSSUBSTR' as substring matching, `QDBSPREFIX' as prefix 192 matching, `QDBSSUFFIX' as suffix matching, or `QDBSFULL' as full matching. 193 `np' specifies the pointer to the variable into which the number of elements of the return 194 value is assigned. 195 If successful, the return value is the pointer to an array of ID numbers of the corresponding 196 records. `NULL' is returned on failure. 197 Because the region of the return value is allocated with the `malloc' call, it should be 198 released with the `free' call when it is no longer in use. */ 199 uint64_t *tcqdbsearch(TCQDB *qdb, const char *word, int smode, int *np); 200 201 202 /* Synchronize updated contents of a q-gram database object with the file and the device. 203 `qdb' specifies the q-gram database object connected as a writer. 204 If successful, the return value is true, else, it is false. 205 This function is useful when another process connects the same database file. */ 206 bool tcqdbsync(TCQDB *qdb); 207 208 209 /* Optimize the file of a q-gram database object. 210 `qdb' specifies the q-gram database object connected as a writer. 211 If successful, the return value is true, else, it is false. 212 This function is useful to reduce the size of the database file with data fragmentation by 213 successive updating. */ 214 bool tcqdboptimize(TCQDB *qdb); 215 216 217 /* Remove all records of a q-gram database object. 218 `qdb' specifies the q-gram database object connected as a writer. 219 If successful, the return value is true, else, it is false. */ 220 bool tcqdbvanish(TCQDB *qdb); 221 222 223 /* Copy the database file of a q-gram database object. 224 `qdb' specifies the q-gram database object. 225 `path' specifies the path of the destination file. If it begins with `@', the trailing 226 substring is executed as a command line. 227 If successful, the return value is true, else, it is false. False is returned if the executed 228 command returns non-zero code. 229 The database file is assured to be kept synchronized and not modified while the copying or 230 executing operation is in progress. So, this function is useful to create a backup file of 231 the database file. */ 232 bool tcqdbcopy(TCQDB *qdb, const char *path); 233 234 235 /* Get the file path of a q-gram database object. 236 `qdb' specifies the q-gram database object. 237 The return value is the path of the database file or `NULL' if the object does not connect to 238 any database file. */ 239 const char *tcqdbpath(TCQDB *qdb); 240 241 242 /* Get the number of tokens of a q-gram database object. 243 `qdb' specifies the q-gram database object. 244 The return value is the number of tokens or 0 if the object does not connect to any database 245 file. */ 246 uint64_t tcqdbtnum(TCQDB *qdb); 247 248 249 /* Get the size of the database file of a q-gram database object. 250 `qdb' specifies the q-gram database object. 251 The return value is the size of the database file or 0 if the object does not connect to any 252 database file. */ 253 uint64_t tcqdbfsiz(TCQDB *qdb); 254 255 256 257 /************************************************************************************************* 258 * features for experts 259 *************************************************************************************************/ 260 261 262 #define _TD_VERSION "0.9.15" 263 #define _TD_LIBVER 115 264 #define _TD_FORMATVER "0.9" 265 266 #define QDBSYNCMSGF "started" /* first message of sync progression */ 267 #define QDBSYNCMSGL "finished" /* last message of sync progression */ 268 269 typedef struct { /* type of structure for a result set */ 270 uint64_t *ids; /* array of ID numbers */ 271 int num; /* number of the array */ 272 } QDBRSET; 273 274 typedef struct _TCIDSET { /* type of structure for an ID set */ 275 uint64_t *buckets; /* bucket array */ 276 uint32_t bnum; /* number of buckets */ 277 TCMAP *trails; /* map of trailing records */ 278 } TCIDSET; 279 280 enum { /* enumeration for text normalization options */ 281 TCTNLOWER = 1 << 0, /* into lower cases */ 282 TCTNNOACC = 1 << 1, /* into ASCII alphabets */ 283 TCTNSPACE = 1 << 2 /* into ASCII space */ 284 }; 285 286 287 /* Set the file descriptor for debugging output. 288 `qdb' specifies the q-gram database object. 289 `fd' specifies the file descriptor for debugging output. */ 290 void tcqdbsetdbgfd(TCQDB *qdb, int fd); 291 292 293 /* Get the file descriptor for debugging output. 294 `qdb' specifies the q-gram database object. 295 The return value is the file descriptor for debugging output. */ 296 int tcqdbdbgfd(TCQDB *qdb); 297 298 299 /* Synchronize updating contents on memory of a q-gram database object. 300 `qdb' specifies the q-gram database object. 301 `level' specifies the synchronization lavel; 0 means cache synchronization, 1 means database 302 synchronization, and 2 means file synchronization. 303 If successful, the return value is true, else, it is false. */ 304 bool tcqdbmemsync(TCQDB *qdb, int level); 305 306 307 /* Clear the cache of a q-gram database object. 308 `qdb' specifies the q-gram database object. 309 If successful, the return value is true, else, it is false. */ 310 bool tcqdbcacheclear(TCQDB *qdb); 311 312 313 /* Get the inode number of the database file of a q-gram database object. 314 `qdb' specifies the q-gram database object. 315 The return value is the inode number of the database file or 0 the object does not connect to 316 any database file. */ 317 uint64_t tcqdbinode(TCQDB *qdb); 318 319 320 /* Get the modification time of the database file of a q-gram database object. 321 `qdb' specifies the q-gram database object. 322 The return value is the inode number of the database file or 0 the object does not connect to 323 any database file. */ 324 time_t tcqdbmtime(TCQDB *qdb); 325 326 327 /* Get the options of a q-gram database object. 328 `qdb' specifies the q-gram database object. 329 The return value is the options. */ 330 uint8_t tcqdbopts(TCQDB *qdb); 331 332 333 /* Get the maximum number of forward matching expansion of a q-gram database object. 334 `qdb' specifies the q-gram database object. 335 The return value is the maximum number of forward matching expansion. */ 336 uint32_t tcqdbfwmmax(TCQDB *qdb); 337 338 339 /* Get the number of records in the cache of a q-gram database object. 340 `wdb' specifies the word database object. 341 The return value is the number of records in the cache. */ 342 uint32_t tcqdbcnum(TCQDB *qdb); 343 344 345 /* Set the callback function for sync progression of a q-gram database object. 346 `qdb' specifies the q-gram database object. 347 `cb' specifies the pointer to the callback function for sync progression. Its first argument 348 specifies the number of tokens to be synchronized. Its second argument specifies the number 349 of processed tokens. Its third argument specifies the message string. The fourth argument 350 specifies an arbitrary pointer. Its return value should be true usually, or false if the sync 351 operation should be terminated. 352 `opq' specifies the arbitrary pointer to be given to the callback function. */ 353 void tcqdbsetsynccb(TCQDB *qdb, bool (*cb)(int, int, const char *, void *), void *opq); 354 355 356 /* Merge multiple result sets by union. 357 `rsets' specifies the pointer to the array of result sets. 358 `rsnum' specifies the number of the array. 359 `np' specifies the pointer to the variable into which the number of elements of the return 360 value is assigned. 361 If successful, the return value is the pointer to an array of ID numbers of the result. 362 Because the region of the return value is allocated with the `malloc' call, it should be 363 released with the `free' call when it is no longer in use. */ 364 uint64_t *tcqdbresunion(QDBRSET *rsets, int rsnum, int *np); 365 366 367 /* Merge multiple result sets by intersection. 368 `rsets' specifies the pointer to the array of result sets. 369 `rsnum' specifies the number of the array. 370 `np' specifies the pointer to the variable into which the number of elements of the return 371 value is assigned. 372 If successful, the return value is the pointer to an array of ID numbers of the result. 373 Because the region of the return value is allocated with the `malloc' call, it should be 374 released with the `free' call when it is no longer in use. */ 375 uint64_t *tcqdbresisect(QDBRSET *rsets, int rsnum, int *np); 376 377 378 /* Merge multiple result sets by difference. 379 `rsets' specifies the pointer to the array of result sets. 380 `rsnum' specifies the number of the array. 381 `np' specifies the pointer to the variable into which the number of elements of the return 382 value is assigned. 383 If successful, the return value is the pointer to an array of ID numbers of the result. 384 Because the region of the return value is allocated with the `malloc' call, it should be 385 released with the `free' call when it is no longer in use. */ 386 uint64_t *tcqdbresdiff(QDBRSET *rsets, int rsnum, int *np); 387 388 389 /* Normalize a text. 390 `text' specifies the string of the record, whose encoding should be UTF-8. 391 `opts' specifies options by bitwise-or: `TCTNLOWER' specifies that alphabetical characters are 392 normalized into lower cases, `TCTNNOACC' specifies that alphabetical characters with accent 393 marks are normalized without accent marks, `TCTNSPACE' specifies that white space characters 394 are normalized into the ASCII space and they are squeezed into one. */ 395 void tctextnormalize(char *text, int opts); 396 397 398 /* Create an ID set object. 399 `bnum' specifies the number of the buckets. 400 The return value is the new ID set object. */ 401 TCIDSET *tcidsetnew(uint32_t bnum); 402 403 404 /* Delete an ID set object. 405 `idset' specifies the ID set object. */ 406 void tcidsetdel(TCIDSET *idset); 407 408 409 /* Mark an ID number of an ID set object. 410 `idset' specifies the ID set object. 411 `id' specifies the ID number. */ 412 void tcidsetmark(TCIDSET *idset, int64_t id); 413 414 415 /* Check an ID of an ID set object. 416 `idset' specifies the ID set object. 417 `id' specifies the ID number. 418 The return value is true if the ID number is marked, else, it is false. */ 419 bool tcidsetcheck(TCIDSET *idset, int64_t id); 420 421 422 /* Clear an ID set object. 423 `idset' specifies the ID set object. */ 424 void tcidsetclear(TCIDSET *idset); 425 426 427 428 __TCQDB_CLINKAGEEND 429 #endif /* duplication check */ 430 431 432 /* END OF FILE */ 433