1 /************************************************************************************************* 2 * The core API of Hyper Estraier 3 * Copyright (C) 2004-2007 Mikio Hirabayashi 4 * This file is part of Hyper Estraier. 5 * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of 6 * the GNU Lesser General Public License as published by the Free Software Foundation; either 7 * version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope 8 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 10 * License for more details. 11 * You should have received a copy of the GNU Lesser General Public License along with Hyper 12 * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, 13 * Boston, MA 02111-1307 USA. 14 *************************************************************************************************/ 15 16 17 #ifndef _ESTRAIER_H /* include guard against duplicate inclusion */ 18 #define _ESTRAIER_H 19 20 #if defined(__cplusplus) /* use C linkage when compiled as C++ */ 21 extern "C" { 22 #endif 23 24 25 26 /************************************************************************************************* 27 * common settings 28 *************************************************************************************************/ 29 30 31 /* version string of Hyper Estraier */ 32 extern const char *est_version; 33 34 35 36 /************************************************************************************************* 37 * underlying headers 38 *************************************************************************************************/ 39 40 41 #include <depot.h> 42 #include <curia.h> 43 #include <cabin.h> 44 #include <villa.h> 45 #include <stdlib.h> 46 47 48 49 /************************************************************************************************* 50 * API for document 
51 *************************************************************************************************/ 52 53 54 #define ESTDATTRID "@id" /* name of the attribute of the ID number */ 55 #define ESTDATTRURI "@uri" /* name of the attribute of the URI */ 56 #define ESTDATTRDIGEST "@digest" /* name of the attribute of the message digest */ 57 #define ESTDATTRCDATE "@cdate" /* name of the attribute of the creation date */ 58 #define ESTDATTRMDATE "@mdate" /* name of the attribute of the modification date */ 59 #define ESTDATTRADATE "@adate" /* name of the attribute of the access date */ 60 #define ESTDATTRTITLE "@title" /* name of the attribute of the title */ 61 #define ESTDATTRAUTHOR "@author" /* name of the attribute of the author */ 62 #define ESTDATTRTYPE "@type" /* name of the attribute of the content type */ 63 #define ESTDATTRLANG "@lang" /* name of the attribute of the language */ 64 #define ESTDATTRGENRE "@genre" /* name of the attribute of the genre */ 65 #define ESTDATTRSIZE "@size" /* name of the attribute of the entity size */ 66 #define ESTDATTRWEIGHT "@weight" /* name of the attribute of the scoring weight */ 67 #define ESTDATTRMISC "@misc" /* name of the attribute of miscellaneous information */ 68 #define ESTDCNTLVECTOR "%VECTOR" /* name of the control code for the keyword vector */ 69 #define ESTDCNTLSCORE "%SCORE" /* name of the control code for the substitute score */ 70 #define ESTDCNTLSHADOW "%SHADOW" /* name of the control code for a shadow document */ 71 72 typedef struct { /* type of structure for a document */ 73 int id; /* identification number */ 74 CBMAP *attrs; /* map of attributes */ 75 CBLIST *dtexts; /* list of shown text */ 76 CBMAP *kwords; /* map of keywords */ 77 } ESTDOC; 78 79 80 /* Create a document object. 81 The return value is a new document object. */ 82 ESTDOC *est_doc_new(void); 83 84 85 /* Create a document object made from draft data. 86 `draft' specifies a string of draft data. 87 The return value is an object of a document. 
*/ 88 ESTDOC *est_doc_new_from_draft(const char *draft); 89 90 91 /* Destroy a document object. 92 `doc' specifies a document object. */ 93 void est_doc_delete(ESTDOC *doc); 94 95 96 /* Add an attribute to a document object. 97 `doc' specifies a document object. 98 `name' specifies the name of an attribute. 99 `value' specifies the value of the attribute. If it is `NULL', the attribute is removed. */ 100 void est_doc_add_attr(ESTDOC *doc, const char *name, const char *value); 101 102 103 /* Add a sentence of text to a document object. 104 `doc' specifies a document object. 105 `text' specifies a sentence of text. */ 106 void est_doc_add_text(ESTDOC *doc, const char *text); 107 108 109 /* Add a hidden sentence to a document object. 110 `doc' specifies a document object. 111 `text' specifies a hidden sentence. */ 112 void est_doc_add_hidden_text(ESTDOC *doc, const char *text); 113 114 115 /* Attach keywords to a document object. 116 `doc' specifies a document object. 117 `kwords' specifies a map object of keywords. Keys of the map should be keywords of the 118 document and values should be their scores as decimal strings. The map object is copied 119 internally. */ 120 void est_doc_set_keywords(ESTDOC *doc, CBMAP *kwords); 121 122 123 /* Set the substitute score of a document object. 124 `doc' specifies a document object. 125 `score' specifies the substitute score. If it is negative, the substitute score setting is 126 nullified. */ 127 void est_doc_set_score(ESTDOC *doc, int score); 128 129 130 /* Get the ID number of a document object. 131 `doc' specifies a document object. 132 The return value is the ID number of the document object. If the object has not been 133 registered, -1 is returned. */ 134 int est_doc_id(ESTDOC *doc); 135 136 137 /* Get a list of attribute names of a document object. 138 `doc' specifies a document object. 139 The return value is a new list object of attribute names of the document object. 
Because 140 the object of the return value is opened with the function `cblistopen', it should be closed 141 with the function `cblistclose' if it is no longer in use. */ 142 CBLIST *est_doc_attr_names(ESTDOC *doc); 143 144 145 /* Get the value of an attribute of a document object. 146 `doc' specifies a document object. 147 `name' specifies the name of an attribute. 148 The return value is the value of the attribute or `NULL' if it does not exist. The 149 lifetime of the returned string is tied to that of the document object. */ 150 const char *est_doc_attr(ESTDOC *doc, const char *name); 151 152 153 /* Get a list of sentences of the text of a document object. 154 `doc' specifies a document object. 155 The return value is a list object of sentences of the text of the document object. The 156 lifetime of the returned object is tied to that of the document object. */ 157 const CBLIST *est_doc_texts(ESTDOC *doc); 158 159 160 /* Concatenate sentences of the text of a document object. 161 `doc' specifies a document object. 162 The return value is the concatenated sentences of the document object. Because the region of the 163 return value is allocated with the `malloc' call, it should be released with the `free' call 164 if it is no longer in use. */ 165 char *est_doc_cat_texts(ESTDOC *doc); 166 167 168 /* Get attached keywords of a document object. 169 `doc' specifies a document object. 170 The return value is a map object of keywords and their scores as decimal strings. If no 171 keyword is attached, `NULL' is returned. The lifetime of the returned object is 172 tied to that of the document object. */ 173 CBMAP *est_doc_keywords(ESTDOC *doc); 174 175 176 /* Get the substitute score of a document object. 177 `doc' specifies a document object. 178 The return value is the substitute score or -1 if it is not set. */ 179 int est_doc_score(ESTDOC *doc); 180 181 182 /* Dump draft data of a document object. 
183 `doc' specifies a document object. 184 The return value is draft data of the document object. Because the region of the return value 185 is allocated with the `malloc' call, it should be released with the `free' call if it is no 186 longer in use. */ 187 char *est_doc_dump_draft(ESTDOC *doc); 188 189 190 /* Make a snippet of the body text of a document object. 191 `doc' specifies a document object. 192 `words' specifies a list object of words to be highlighted. 193 `wwidth' specifies the whole width of the result. 194 `hwidth' specifies the width of strings picked up from the beginning of the text. 195 `awidth' specifies the width of strings picked up around each highlighted word. 196 The return value is a snippet string of the body text of the document object. It consists of tab 197 separated values. Each line is a string to be shown. Though most lines have only one field, 198 some lines have two fields. If the second field exists, the first field is to be shown 199 highlighted, and the second field means its normalized form. Because the region of the 200 return value is allocated with the `malloc' call, it should be released with the `free' call 201 if it is no longer in use. 
*/ 202 char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth); 203 204 205 206 /************************************************************************************************* 207 * API for search conditions 208 *************************************************************************************************/ 209 210 211 #define ESTOPUVSET "[UVSET]" /* universal set */ 212 #define ESTOPID "[ID]" /* ID matching search */ 213 #define ESTOPURI "[URI]" /* URI matching search */ 214 #define ESTOPSIMILAR "[SIMILAR]" /* similarity search */ 215 #define ESTOPRANK "[RANK]" /* ranking search */ 216 217 #define ESTOPUNION "OR" /* union (disjunction) */ 218 #define ESTOPISECT "AND" /* intersection (conjunction) */ 219 #define ESTOPDIFF "ANDNOT" /* difference (intersection with negation) */ 220 #define ESTOPWCBW "[BW]" /* wild card for words beginning with a string */ 221 #define ESTOPWCEW "[EW]" /* wild card for words ending with a string */ 222 #define ESTOPWCRX "[RX]" /* wild card for words matching regular expressions */ 223 #define ESTOPWITH "WITH" /* delimiter for elements */ 224 225 #define ESTOPSTREQ "STREQ" /* string is equal */ 226 #define ESTOPSTRNE "STRNE" /* string is not equal */ 227 #define ESTOPSTRINC "STRINC" /* string is included in */ 228 #define ESTOPSTRBW "STRBW" /* string begins with */ 229 #define ESTOPSTREW "STREW" /* string ends with */ 230 #define ESTOPSTRAND "STRAND" /* string includes all tokens in */ 231 #define ESTOPSTROR "STROR" /* string includes at least one token in */ 232 #define ESTOPSTROREQ "STROREQ" /* string is equal to at least one token in */ 233 #define ESTOPSTRRX "STRRX" /* string matches regular expressions of */ 234 #define ESTOPNUMEQ "NUMEQ" /* number or date is equal */ 235 #define ESTOPNUMNE "NUMNE" /* number or date is not equal */ 236 #define ESTOPNUMGT "NUMGT" /* number or date is greater than */ 237 #define ESTOPNUMGE "NUMGE" /* number or date is greater than or equal to */ 238 #define 
ESTOPNUMLT "NUMLT" /* number or date is less than */ 239 #define ESTOPNUMLE "NUMLE" /* number or date is less than or equal to */ 240 #define ESTOPNUMBT "NUMBT" /* number or date is between two tokens of */ 241 242 #define ESTORDIDA "[IDA]" /* ID numbers in ascending order */ 243 #define ESTORDIDD "[IDD]" /* ID numbers in descending order */ 244 #define ESTORDSCA "[SCA]" /* scores in ascending order */ 245 #define ESTORDSCD "[SCD]" /* scores in descending order */ 246 #define ESTORDSTRA "STRA" /* strings in ascending order */ 247 #define ESTORDSTRD "STRD" /* strings in descending order */ 248 #define ESTORDNUMA "NUMA" /* numbers in ascending order */ 249 #define ESTORDNUMD "NUMD" /* numbers in descending order */ 250 251 #define ESTECLSIMURL 10.0 /* eclipse considering similarity and URL */ 252 #define ESTECLSERV 100.0 /* eclipse on server basis */ 253 #define ESTECLDIR 101.0 /* eclipse on directory basis */ 254 #define ESTECLFILE 102.0 /* eclipse on file basis */ 255 256 typedef struct { /* type of structure for search conditions */ 257 char *phrase; /* search phrase */ 258 int gstep; /* step of N-gram */ 259 int tfidf; /* whether with TF-IDF tuning */ 260 int pmode; /* mode of phrase form */ 261 void (*cbxpn)(const char *, CBLIST *); /* callback function for query expansion */ 262 CBLIST *attrs; /* conditions with attributes */ 263 char *order; /* sorting order */ 264 int max; /* maximum number of retrieval */ 265 int skip; /* number of documents to be skipped */ 266 int auxmin; /* minimum hits to adopt the auxiliary index */ 267 CBMAP *auxwords; /* words for which the auxiliary index has been used */ 268 int scfb; /* whether to feed back scores */ 269 int *scores; /* array of scores */ 270 int snum; /* number of elements of the score array */ 271 const int *nscores; /* array of narrowing scores */ 272 int nsnum; /* number of elements of the narrowing score array */ 273 int opts; /* options for preservation */ 274 double ecllim; /* lower limit of similarity eclipse */ 
275 CBMAP *shadows; /* map of eclipsed documents */ 276 char *distinct; /* distinct attribute */ 277 int mask; /* mask for meta search */ 278 } ESTCOND; 279 280 enum { /* enumeration for options */ 281 ESTCONDSURE = 1 << 0, /* check every N-gram key */ 282 ESTCONDUSUAL = 1 << 1, /* check N-gram keys skipping by one */ 283 ESTCONDFAST = 1 << 2, /* check N-gram keys skipping by two */ 284 ESTCONDAGITO = 1 << 3, /* check N-gram keys skipping by three */ 285 ESTCONDNOIDF = 1 << 4, /* without TF-IDF tuning */ 286 ESTCONDSIMPLE = 1 << 10, /* with the simplified phrase */ 287 ESTCONDROUGH = 1 << 11, /* with the rough phrase */ 288 ESTCONDUNION = 1 << 15, /* with the union phrase */ 289 ESTCONDISECT = 1 << 16, /* with the intersection phrase */ 290 ESTCONDSCFB = 1 << 30 /* feed back scores (for debugging) */ 291 }; 292 293 294 /* Create a condition object. 295 The return value is a new object of search conditions. */ 296 ESTCOND *est_cond_new(void); 297 298 299 /* Destroy a condition object. 300 `cond' specifies a condition object. */ 301 void est_cond_delete(ESTCOND *cond); 302 303 304 /* Set the search phrase of a condition object. 305 `cond' specifies a condition object. 306 `phrase' specifies a search phrase. */ 307 void est_cond_set_phrase(ESTCOND *cond, const char *phrase); 308 309 310 /* Add an expression for an attribute to a condition object. 311 `cond' specifies a condition object. 312 `expr' specifies an expression for an attribute. */ 313 void est_cond_add_attr(ESTCOND *cond, const char *expr); 314 315 316 /* Set the order of a condition object. 317 `cond' specifies a condition object. 318 `expr' specifies an expression for the order. By default, the order is by score descending. */ 319 void est_cond_set_order(ESTCOND *cond, const char *expr); 320 321 322 /* Set the maximum number of retrieval of a condition object. 323 `cond' specifies a condition object. 324 `max' specifies the maximum number of documents to retrieve. By default, the number is not 325 limited. 
*/ 326 void est_cond_set_max(ESTCOND *cond, int max); 327 328 329 /* Set the number of skipped documents of a condition object. 330 `cond' specifies a condition object. 331 `skip' specifies the number of documents to be skipped in the search result. */ 332 void est_cond_set_skip(ESTCOND *cond, int skip); 333 334 335 /* Set options of retrieval of a condition object. 336 `cond' specifies a condition object. 337 `options' specifies options: `ESTCONDSURE' specifies that it checks every N-gram key, 338 `ESTCONDUSUAL', which is the default, specifies that it checks N-gram keys with skipping one 339 key, `ESTCONDFAST' skips two keys, `ESTCONDAGITO' skips three keys, `ESTCONDNOIDF' specifies 340 not to perform TF-IDF tuning, `ESTCONDSIMPLE' specifies to use simplified phrase, 341 `ESTCONDROUGH' specifies to use rough phrase, `ESTCONDUNION' specifies to use union phrase, 342 `ESTCONDISECT' specifies to use intersection phrase, `ESTCONDSCFB' specifies to feed back 343 scores (only for debugging). Options can be combined by bitwise or. If 344 keys are skipped, search speed improves but the relevance ratio decreases. */ 345 void est_cond_set_options(ESTCOND *cond, int options); 346 347 348 /* Set permission to adopt result of the auxiliary index. 349 `cond' specifies a condition object. 350 `min' specifies the minimum hits to adopt result of the auxiliary index. If it is not more 351 than 0, the auxiliary index is not used. By default, it is 32. */ 352 void est_cond_set_auxiliary(ESTCOND *cond, int min); 353 354 355 /* Set the lower limit of similarity eclipse. 356 `cond' specifies a condition object. 357 `limit' specifies the lower limit of similarity for documents to be eclipsed. Similarity is 358 between 0.0 and 1.0. If `ESTECLSIMURL' is added to the limit, similarity is weighted by URL. 359 If the limit is `ESTECLSERV', similarity is ignored and documents in the same server are 360 eclipsed. 
If the limit is `ESTECLDIR', similarity is ignored and documents in the same 361 directory are eclipsed. If the limit is `ESTECLFILE', similarity is ignored and documents of 362 the same file are eclipsed. */ 363 void est_cond_set_eclipse(ESTCOND *cond, double limit); 364 365 366 /* Set the attribute distinction filter. 367 `cond' specifies a condition object. 368 `name' specifies the name of an attribute to be distinct. 369 If this filter is set, candidates which have the same value of the attribute are omitted. */ 370 void est_cond_set_distinct(ESTCOND *cond, const char *name); 371 372 373 /* Set the mask of targets of meta search. 374 `cond' specifies a condition object. 375 `mask' specifies a masking number. 1 means the first target, 2 means the second target, 4 376 means the third target, and power values of 2 and their summation compose the mask. */ 377 void est_cond_set_mask(ESTCOND *cond, int mask); 378 379 380 381 /************************************************************************************************* 382 * API for database 383 *************************************************************************************************/ 384 385 386 #define ESTIDXDMAX 256 /* max number of the inverted index (size of `dbs' in ESTIDX) */ 387 #define ESTIDXDSTD 16 /* standard number of the inverted index */ 388 #define ESTPDOCIDMIN 2000000001 /* minimum ID number of pseudo documents */ 389 390 typedef struct { /* type of structure for the inverted index */ 391 char *name; /* name of the database */ 392 int omode; /* open mode */ 393 VILLA *dbs[ESTIDXDMAX]; /* database handles */ 394 int dnum; /* number of divisions */ 395 VILLA *cdb; /* current database handle */ 396 } ESTIDX; 397 398 typedef struct { /* type of structure for a database object */ 399 char *name; /* name of the database */ 400 int inode; /* inode of the database */ 401 DEPOT *metadb; /* handle of the meta database */ 402 ESTIDX *idxdb; /* handles of the inverted indexes */ 403 VILLA *fwmdb; /* handle of the database for forward 
matching */ 404 VILLA *auxdb; /* handle of the auxiliary index */ 405 VILLA *xfmdb; /* handle of the database for aux forward matching */ 406 CURIA *attrdb; /* handle of the database for attributes */ 407 CURIA *textdb; /* handle of the database for texts */ 408 CURIA *kwddb; /* handle of the database for keywords */ 409 VILLA *listdb; /* handle of the database for document list */ 410 CBMAP *aidxs; /* map of attribute indexes */ 411 CBLIST *pdocs; /* list of pseudo documents */ 412 CBMAP *puris; /* map of URIs of pseudo documents */ 413 int ecode; /* last happened error code */ 414 int fatal; /* whether to have a fatal error */ 415 int dseq; /* sequence for document IDs */ 416 int dnum; /* number of the documents */ 417 int amode; /* mode of text analyzer */ 418 int zmode; /* mode of data compression */ 419 int smode; /* mode of score type */ 420 CBMAP *idxcc; /* cache for the inverted index */ 421 CBMAP *auxcc; /* cache for the auxiliary index */ 422 size_t icsiz; /* power of the cache */ 423 size_t icmax; /* max size of the cache */ 424 CBMAP *outcc; /* cache for deleted documents */ 425 CBMAP *keycc; /* cache for keys for TF-IDF */ 426 int kcmnum; /* max number of the key cache */ 427 CBMAP *attrcc; /* cache for attributes */ 428 int acmnum; /* max number of the attribute cache */ 429 CBMAP *textcc; /* cache for texts */ 430 int tcmnum; /* max number of the text cache */ 431 CBMAP *veccc; /* cache for keyword vectors */ 432 int vcmnum; /* max number of the vector cache */ 433 CBMAP *rescc; /* cache for results */ 434 int rcmnum; /* max number of the result cache */ 435 CBMAP *spacc; /* special cache for attributes */ 436 int scmnum; /* max number of the special cache */ 437 char *scname; /* name of the attribute for the special cache */ 438 void (*infocb)(const char *, void *); /* callback function to inform of events */ 439 void *infoop; /* opaque for the informing callback */ 440 DEPOT *dfdb; /* handle of the database for document frequency */ 441 int wildmax; 
/* maximum number of expansions of wild cards */ 442 CBMAP *metacc; /* cache for meta data */ 443 int flsflag; /* flag of flushing */ 444 int intflag; /* flag of thread interruption */ 445 } ESTDB; 446 447 enum { /* enumeration for error codes */ 448 ESTENOERR, /* no error */ 449 ESTEINVAL, /* invalid argument */ 450 ESTEACCES, /* access forbidden */ 451 ESTELOCK, /* lock failure */ 452 ESTEDB, /* database problem */ 453 ESTEIO, /* I/O problem */ 454 ESTENOITEM, /* no item */ 455 ESTEMISC = 9999 /* miscellaneous */ 456 }; 457 458 enum { /* enumeration for open modes */ 459 ESTDBREADER = 1 << 0, /* open as a reader */ 460 ESTDBWRITER = 1 << 1, /* open as a writer */ 461 ESTDBCREAT = 1 << 2, /* a writer creating */ 462 ESTDBTRUNC = 1 << 3, /* a writer truncating */ 463 ESTDBNOLCK = 1 << 4, /* open without locking */ 464 ESTDBLCKNB = 1 << 5, /* lock without blocking */ 465 ESTDBPERFNG = 1 << 10, /* use perfect N-gram analyzer */ 466 ESTDBCHRCAT = 1 << 11, /* use character category analyzer */ 467 ESTDBSMALL = 1 << 20, /* small tuning */ 468 ESTDBLARGE = 1 << 21, /* large tuning */ 469 ESTDBHUGE = 1 << 22, /* huge tuning */ 470 ESTDBHUGE2 = 1 << 23, /* huge tuning second */ 471 ESTDBHUGE3 = 1 << 24, /* huge tuning third */ 472 ESTDBSCVOID = 1 << 25, /* store scores as void */ 473 ESTDBSCINT = 1 << 26, /* store scores as integer */ 474 ESTDBSCASIS = 1 << 27 /* refrain from adjustment of scores */ 475 }; 476 477 enum { /* enumeration for data types of attribute index */ 478 ESTIDXATTRSEQ, /* for multipurpose sequential access method */ 479 ESTIDXATTRSTR, /* for narrowing with attributes as strings */ 480 ESTIDXATTRNUM /* for narrowing with attributes as numbers */ 481 }; 482 483 enum { /* enumeration for options of optimization */ 484 ESTOPTNOPURGE = 1 << 0, /* omit purging dispensable regions of deleted documents */ 485 ESTOPTNODBOPT = 1 << 1 /* omit optimization of the database files */ 486 }; 487 488 enum { /* enumeration for options of document merger */ 489 ESTMGCLEAN = 1 << 0 
/* clean up dispensable regions */ 490 }; 491 492 enum { /* enumeration for options of document registration */ 493 ESTPDCLEAN = 1 << 0, /* clean up dispensable regions */ 494 ESTPDWEIGHT = 1 << 1 /* weight scores statically when indexing */ 495 }; 496 497 enum { /* enumeration for options of document deletion */ 498 ESTODCLEAN = 1 << 0 /* clean up dispensable regions */ 499 }; 500 501 enum { /* enumeration for options of document retrieval */ 502 ESTGDNOATTR = 1 << 0, /* no attributes */ 503 ESTGDNOTEXT = 1 << 1, /* no text */ 504 ESTGDNOKWD = 1 << 2 /* no keywords */ 505 }; 506 507 508 /* Get the string of an error code. 509 `ecode' specifies an error code. 510 The return value is the string of the error code. */ 511 const char *est_err_msg(int ecode); 512 513 514 /* Open a database. 515 `name' specifies the name of a database directory. 516 `omode' specifies open modes: `ESTDBWRITER' as a writer, `ESTDBREADER' as a reader. If the 517 mode is `ESTDBWRITER', the following may be added by bitwise or: `ESTDBCREAT', which means it 518 creates a new database if none exists, `ESTDBTRUNC', which means it creates a new database 519 regardless of whether one exists. The following may be added to both `ESTDBREADER' and `ESTDBWRITER' by 520 bitwise or: `ESTDBNOLCK', which means it opens a database file without file locking, or 521 `ESTDBLCKNB', which means locking is performed without blocking. If `ESTDBNOLCK' is used, 522 the application is responsible for exclusion control. 
The following may be added to `ESTDBCREAT' by bitwise 523 or: `ESTDBPERFNG', which means N-gram analysis is performed against European text also, 524 `ESTDBCHRCAT', which means character category analysis is performed instead of N-gram analysis, 525 `ESTDBSMALL', which means the index is tuned to register less than 50000 documents, 526 `ESTDBLARGE', which means the index is tuned to register more than 300000 documents, 527 `ESTDBHUGE', which means the index is tuned to register more than 1000000 documents, 528 `ESTDBHUGE2', which means the index is tuned to register more than 5000000 documents, 529 `ESTDBHUGE3', which means the index is tuned to register more than 10000000 documents, 530 `ESTDBSCVOID', which means scores are stored as void, `ESTDBSCINT', which means scores are 531 stored as 32-bit integer, `ESTDBSCASIS', which means scores are stored as-is and marked not 532 to be tuned when searching. 533 `ecp' specifies the pointer to a variable to which the error code is assigned. 534 The return value is a database object of the database or `NULL' on failure. */ 535 ESTDB *est_db_open(const char *name, int omode, int *ecp); 536 537 538 /* Close a database. 539 `db' specifies a database object. 540 `ecp' specifies the pointer to a variable to which the error code is assigned. 541 The return value is true if success, else it is false. */ 542 int est_db_close(ESTDB *db, int *ecp); 543 544 545 /* Get the last happened error code of a database. 546 `db' specifies a database object. 547 The return value is the last happened error code of the database. */ 548 int est_db_error(ESTDB *db); 549 550 551 /* Check whether a database has a fatal error. 552 `db' specifies a database object. 553 The return value is true if the database has a fatal error, else it is false. */ 554 int est_db_fatal(ESTDB *db); 555 556 557 /* Add an index for narrowing or sorting with document attributes. 558 `db' specifies a database object connected as a writer. 559 `name' specifies the name of an attribute. 
560 `type' specifies the data type of attribute index; `ESTIDXATTRSEQ' for multipurpose sequential 561 access method, `ESTIDXATTRSTR' for narrowing with attributes as strings, `ESTIDXATTRNUM' for 562 narrowing with attributes as numbers. 563 The return value is true if success, else it is false. 564 Note that this function should be called before the first document is registered. */ 565 int est_db_add_attr_index(ESTDB *db, const char *name, int type); 566 567 568 /* Flush index words in the cache of a database. 569 `db' specifies a database object connected as a writer. 570 `max' specifies the maximum number of words to be flushed. If it is not more than zero, all 571 words are flushed. 572 The return value is true if success, else it is false. */ 573 int est_db_flush(ESTDB *db, int max); 574 575 576 /* Synchronize updating contents of a database. 577 `db' specifies a database object connected as a writer. 578 The return value is true if success, else it is false. */ 579 int est_db_sync(ESTDB *db); 580 581 582 /* Optimize a database. 583 `db' specifies a database object connected as a writer. 584 `options' specifies options: `ESTOPTNOPURGE' to omit purging dispensable regions of deleted 585 documents, `ESTOPTNODBOPT' to omit optimization of the database files. The two can be 586 specified at the same time by bitwise or. 587 The return value is true if success, else it is false. */ 588 int est_db_optimize(ESTDB *db, int options); 589 590 591 /* Merge another database. 592 `db' specifies a database object connected as a writer. 593 `name' specifies the name of another database directory. 594 `options' specifies options: `ESTMGCLEAN' to clean up dispensable regions of the deleted 595 document. 596 The return value is true if success, else it is false. 597 Creation options of the two databases should be entirely the same. ID numbers of imported 598 documents are changed within the sequence of the destination database. 
If URIs of imported 599 documents conflict with those of existing documents, the existing documents are removed. */ 600 int est_db_merge(ESTDB *db, const char *name, int options); 601 602 603 /* Add a document to a database. 604 `db' specifies a database object connected as a writer. 605 `doc' specifies a document object. The document object should have the URI attribute. 606 `options' specifies options: `ESTPDCLEAN' to clean up dispensable regions of the overwritten 607 document, `ESTPDWEIGHT' to weight scores statically with score weighting attribute. 608 The return value is true if success, else it is false. 609 If the URI attribute is the same as that of an existing document in the database, the existing one is 610 deleted. */ 611 int est_db_put_doc(ESTDB *db, ESTDOC *doc, int options); 612 613 614 /* Remove a document from a database. 615 `db' specifies a database object connected as a writer. 616 `id' specifies the ID number of a registered document. 617 `options' specifies options: `ESTODCLEAN' to clean up dispensable regions of the deleted 618 document. 619 The return value is true if success, else it is false. */ 620 int est_db_out_doc(ESTDB *db, int id, int options); 621 622 623 /* Edit attributes of a document in a database. 624 `db' specifies a database object connected as a writer. 625 `doc' specifies a document object. 626 The return value is true if success, else it is false. 627 The ID cannot be changed. If the URI is changed and it overlaps the URI of another 628 registered document, this function fails. */ 629 int est_db_edit_doc(ESTDB *db, ESTDOC *doc); 630 631 632 /* Retrieve a document in a database. 633 `db' specifies a database object. 634 `id' specifies the ID number of a registered document. 635 `options' specifies options: `ESTGDNOATTR' to ignore attributes, `ESTGDNOTEXT' to ignore 636 the body text, `ESTGDNOKWD' to ignore keywords. The three can be specified at the same time 637 by bitwise or. 638 The return value is a document object. 
It should be deleted with `est_doc_delete' if it is 639 no longer in use. On error, `NULL' is returned. */ 640 ESTDOC *est_db_get_doc(ESTDB *db, int id, int options); 641 642 643 /* Retrieve the value of an attribute of a document in a database. 644 `db' specifies a database object. 645 `id' specifies the ID number of a registered document. 646 `name' specifies the name of an attribute. 647 The return value is the value of the attribute or `NULL' if it does not exist. Because the 648 region of the return value is allocated with the `malloc' call, it should be released with 649 the `free' call if it is no longer in use. */ 650 char *est_db_get_doc_attr(ESTDB *db, int id, const char *name); 651 652 653 /* Get the ID of a document specified by URI. 654 `db' specifies a database object. 655 `uri' specifies the URI of a registered document. 656 The return value is the ID of the document. On error, -1 is returned. */ 657 int est_db_uri_to_id(ESTDB *db, const char *uri); 658 659 660 /* Get the name of a database. 661 `db' specifies a database object. 662 The return value is the name of the database. The lifetime of the returned string is 663 tied to that of the database object. */ 664 const char *est_db_name(ESTDB *db); 665 666 667 /* Get the number of documents in a database. 668 `db' specifies a database object. 669 The return value is the number of documents in the database. */ 670 int est_db_doc_num(ESTDB *db); 671 672 673 /* Get the number of unique words in a database. 674 `db' specifies a database object. 675 The return value is the number of unique words in the database. */ 676 int est_db_word_num(ESTDB *db); 677 678 679 /* Get the size of a database. 680 `db' specifies a database object. 681 The return value is the size of the database. */ 682 double est_db_size(ESTDB *db); 683 684 685 /* Search a database for documents corresponding to a condition. 686 `db' specifies a database object. 687 `cond' specifies a condition object. 
688 `nump' specifies the pointer to a variable to which the number of elements in the result is 689 assigned. 690 `hints' specifies a map object into which the number of documents corresponding to each word 691 is stored. If a word is in a negative condition, the number is negative. The element whose 692 key is an empty string specifies the number of whole result. If it is `NULL', it is not used. 693 The return value is an array whose elements are ID numbers of corresponding documents. 694 This function does never fail. Even if no document corresponds or an error occurs, an empty 695 array is returned. Because the region of the return value is allocated with the `malloc' 696 call, it should be released with the `free' call if it is no longer in use. */ 697 int *est_db_search(ESTDB *db, ESTCOND *cond, int *nump, CBMAP *hints); 698 699 700 /* Search plural databases for documents corresponding a condition. 701 `dbs' specifies an array whose elements are database objects. 702 `dbnum' specifies the number of elements of the array. 703 `cond' specifies a condition object. 704 `nump' specifies the pointer to a variable to which the number of elements in the result is 705 assigned. 706 `hints' specifies a map object into which the number of documents corresponding to each word 707 is stored. If a word is in a negative condition, the number is negative. The element whose 708 key is an empty string specifies the number of whole result. If it is `NULL', it is not used. 709 The return value is an array whose elements are indexes of container databases and ID numbers 710 of in each database alternately. 711 This function does never fail. Even if no document corresponds or an error occurs, an empty 712 array is returned. Because the region of the return value is allocated with the `malloc' 713 call, it should be released with the `free' call if it is no longer in use. 
*/ 714 int *est_db_search_meta(ESTDB **dbs, int dbnum, ESTCOND *cond, int *nump, CBMAP *hints); 715 716 717 /* Check whether a document object matches the phrase of a search condition object definitely. 718 `db' specifies a database object. 719 `doc' specifies a document object. 720 `cond' specifies a search condition object. 721 The return value is true if the document matches the phrase of the condition object 722 definitely, else it is false. */ 723 int est_db_scan_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond); 724 725 726 /* Set the maximum size of the cache memory of a database. 727 `db' specifies a database object. 728 `size' specifies the maximum size of the index cache. By default, it is 64MB. If it is 729 negative, the current size is not changed. 730 `anum' specifies the maximum number of cached records for document attributes. By default, it 731 is 8192. If it is negative, the current size is not changed. 732 `tnum' specifies the maximum number of cached records for document texts. By default, it is 733 1024. If it is negative, the current size is not changed. 734 `rnum' specifies the maximum number of cached records for occurrence results. By default, it 735 is 256. If it is negative, the current size is not changed. */ 736 void est_db_set_cache_size(ESTDB *db, size_t size, int anum, int tnum, int rnum); 737 738 739 /* Add a pseudo index directory to a database. 740 `db' specifies a database object. 741 `path' specifies the path of a pseudo index directory. 742 The return value is true if success, else it is false. 
*/ 743 int est_db_add_pseudo_index(ESTDB *db, const char *path); 744 745 746 747 /************************************************************************************************* 748 * features for experts 749 *************************************************************************************************/ 750 751 752 #define _EST_VERSION "1.4.13" 753 #define _EST_LIBVER 838 754 #define _EST_PROTVER "1.0" 755 756 #define _EST_PROJURL "http://hyperestraier.sourceforge.net/" 757 #define _EST_XNSEARCH "http://hyperestraier.sourceforge.net/xmlns/search" 758 #define _EST_XNNODE "http://hyperestraier.sourceforge.net/xmlns/node" 759 760 enum { /* enumeration for languages */ 761 ESTLANGEN, /* English */ 762 ESTLANGJA, /* Japanese */ 763 ESTLANGZH, /* Chinese */ 764 ESTLANGKO, /* Korean */ 765 ESTLANGMISC /* miscellaneous */ 766 }; 767 768 enum { /* enumeration for document parts */ 769 ESTMDATTR = 1 << 0, /* attributes */ 770 ESTMDTEXT = 1 << 1, /* texts */ 771 ESTMDKWD = 1 << 2 /* keywords */ 772 }; 773 774 enum { /* enumeration for database repair */ 775 ESTRPSTRICT = 1 << 0, /* perform strict consistency check */ 776 ESTRPSHODDY = 1 << 1 /* omit consistency check */ 777 }; 778 779 typedef struct { /* type of structure for an element of result map */ 780 const char *key; /* pointer to the key string */ 781 int score; /* total score */ 782 } ESTRESMAPELEM; 783 784 enum { /* enumeration for scoring for result map */ 785 ESTRMLOSUM, /* summation */ 786 ESTRMLOMAX, /* maximum */ 787 ESTRMLOMIN, /* minimum */ 788 ESTRMLOAVG /* average */ 789 }; 790 791 792 /* Break a sentence of text and extract words. 793 `text' specifies a sentence of text. 794 `list' specifies a list object to which extract words are added. 795 `norm' specifies whether to normalize the text. 796 `tail' specifies whether to pick up oddness N-gram at the end. 
*/ 797 void est_break_text(const char *text, CBLIST *list, int norm, int tail); 798 799 800 /* Break a sentence of text and extract words using perfect N-gram analyzer. 801 `text' specifies a sentence of text. 802 `list' specifies a list object to which extract words are added. 803 `norm' specifies whether to normalize the text. 804 `tail' specifies whether to pick up oddness N-gram at the end. */ 805 void est_break_text_perfng(const char *text, CBLIST *list, int norm, int tail); 806 807 808 /* Break a sentence of text and extract words, using character category analyzer. 809 `text' specifies a sentence of text. 810 `list' specifies a list object to which extract words are added. 811 `norm' specifies whether to normalize the text. */ 812 void est_break_text_chrcat(const char *text, CBLIST *list, int norm); 813 814 815 /* Make a snippet of an arbitrary string. 816 `word' specifies a list object of words to be highlight. 817 `wwidth' specifies whole width of the result. 818 `hwidth' specifies width of strings picked up from the beginning of the text. 819 `awidth' specifies width of strings picked up around each highlighted word. 820 The return value is a snippet string of the string. Because the region of the return value is 821 allocated with the `malloc' call, it should be released with the `free' call if it is no 822 longer in use. */ 823 char *est_str_make_snippet(const char *str, const CBLIST *words, 824 int wwidth, int hwidth, int awidth); 825 826 827 /* Convert the character encoding of a string. 828 `ptr' specifies the pointer to a region. 829 `size' specifies the size of the region. If it is negative, the size is assigned with 830 `strlen(ptr)'. 831 `icode' specifies the name of encoding of the input string. 832 `ocode' specifies the name of encoding of the output string. 833 `sp' specifies the pointer to a variable to which the size of the region of the return 834 value is assigned. If it is `NULL', it is not used. 
835 `mp' specifies the pointer to a variable to which the number of missing characters by failure 836 of conversion is assigned. If it is `NULL', it is not used. 837 If successful, the return value is the pointer to the result object, else, it is `NULL'. 838 Because an additional zero code is appended at the end of the region of the return value, 839 the return value can be treated as a character string. Because the region of the return 840 value is allocated with the `malloc' call, it should be released with the `free' call if it 841 is no longer in use. */ 842 char *est_iconv(const char *ptr, int size, const char *icode, const char *ocode, 843 int *sp, int *mp); 844 845 846 /* Detect the encoding of a string automatically. 847 `ptr' specifies the pointer to a region. 848 `size' specifies the size of the region. If it is negative, the size is assigned with 849 `strlen(ptr)'. 850 `plang' specifies a preferred language. As for now, `ESTLANGEN', `ESTLANGJA', `ESTLANGZH', 851 and `ESTLANGKO' are supported. 852 The return value is the string of the encoding name of the string. */ 853 const char *est_enc_name(const char *ptr, int size, int plang); 854 855 856 /* Convert a UTF-8 string into UTF-16BE. 857 `ptr' specifies the pointer to a region. 858 `size' specifies the size of the region. 859 `sp' specifies the pointer to a variable to which the size of the region of the return 860 value is assigned. 861 The return value is the pointer to the result object. Because an additional zero code is 862 appended at the end of the region of the return value, the return value can be treated as a 863 character string. Because the region of the return value is allocated with the `malloc' call, 864 it should be released with the `free' call if it is no longer in use. */ 865 char *est_uconv_in(const char *ptr, int size, int *sp); 866 867 868 /* Convert a UTF-16BE string into UTF-8. 869 `ptr' specifies the pointer to a region. 870 `size' specifies the size of the region. 
871 `sp' specifies the pointer to a variable to which the size of the region of the return 872 value is assigned. If it is `NULL', it is not used. 873 The return value is the pointer to the result object. Because an additional zero code is 874 appended at the end of the region of the return value, the return value can be treated as a 875 character string. Because the region of the return value is allocated with the `malloc' call, 876 it should be released with the `free' call if it is no longer in use. */ 877 char *est_uconv_out(const char *ptr, int size, int *sp); 878 879 880 /* Compress a serial object with ZLIB. 881 `ptr' specifies the pointer to a region. 882 `size' specifies the size of the region. If it is negative, the size is assigned with 883 `strlen(ptr)'. 884 `sp' specifies the pointer to a variable to which the size of the region of the return 885 value is assigned. 886 `mode' specifies detail behavior. 0 specifies using the standard deflate encoding, -1 887 specifies the raw deflate encoding, and 1 specifies the GZIP encoding. 888 If successful, the return value is the pointer to the result object, else, it is `NULL'. 889 Because the region of the return value is allocated with the `malloc' call, it should be 890 released with the `free' call if it is no longer in use. */ 891 char *est_deflate(const char *ptr, int size, int *sp, int mode); 892 893 894 /* Decompress a serial object compressed with ZLIB. 895 `ptr' specifies the pointer to a region. 896 `size' specifies the size of the region. 897 `sp' specifies the pointer to a variable to which the size of the region of the return 898 value is assigned. If it is `NULL', it is not used. 899 `mode' specifies detail behavior. 0 specifies using the standard deflate encoding, -1 900 specifies the raw deflate encoding, and 1 specifies the GZIP encoding. 901 If successful, the return value is the pointer to the result object, else, it is `NULL'. 
902 Because an additional zero code is appended at the end of the region of the return value, 903 the return value can be treated as a character string. Because the region of the return 904 value is allocated with the `malloc' call, it should be released with the `free' call if it 905 is no longer in use. */ 906 char *est_inflate(const char *ptr, int size, int *sp, int mode); 907 908 909 /* Compress a serial object with LZO. 910 `ptr' specifies the pointer to a region. 911 `size' specifies the size of the region. If it is negative, the size is assigned with 912 `strlen(ptr)'. 913 `sp' specifies the pointer to a variable to which the size of the region of the return 914 value is assigned. 915 If successful, the return value is the pointer to the result object, else, it is `NULL'. 916 Because the region of the return value is allocated with the `malloc' call, it should be 917 released with the `free' call if it is no longer in use. */ 918 char *est_lzoencode(const char *ptr, int size, int *sp); 919 920 921 /* Decompress a serial object compressed with LZO. 922 `ptr' specifies the pointer to a region. 923 `size' specifies the size of the region. 924 `sp' specifies the pointer to a variable to which the size of the region of the return 925 value is assigned. If it is `NULL', it is not used. 926 If successful, the return value is the pointer to the result object, else, it is `NULL'. 927 Because an additional zero code is appended at the end of the region of the return value, 928 the return value can be treated as a character string. Because the region of the return 929 value is allocated with the `malloc' call, it should be released with the `free' call if it 930 is no longer in use. */ 931 char *est_lzodecode(const char *ptr, int size, int *sp); 932 933 934 /* Compress a serial object with BZIP2. 935 `ptr' specifies the pointer to a region. 936 `size' specifies the size of the region. If it is negative, the size is assigned with 937 `strlen(ptr)'. 
938 `sp' specifies the pointer to a variable to which the size of the region of the return 939 value is assigned. 940 If successful, the return value is the pointer to the result object, else, it is `NULL'. 941 Because the region of the return value is allocated with the `malloc' call, it should be 942 released with the `free' call if it is no longer in use. */ 943 char *est_bzencode(const char *ptr, int size, int *sp); 944 945 946 /* Decompress a serial object compressed with BZIP2. 947 `ptr' specifies the pointer to a region. 948 `size' specifies the size of the region. 949 `sp' specifies the pointer to a variable to which the size of the region of the return 950 value is assigned. If it is `NULL', it is not used. 951 If successful, the return value is the pointer to the result object, else, it is `NULL'. 952 Because an additional zero code is appended at the end of the region of the return value, 953 the return value can be treated as a character string. Because the region of the return 954 value is allocated with the `malloc' call, it should be released with the `free' call if it 955 is no longer in use. */ 956 char *est_bzdecode(const char *ptr, int size, int *sp); 957 958 959 /* Get the border string for draft data of documents. 960 The return value is the border string for draft data of documents. */ 961 const char *est_border_str(void); 962 963 964 /* Get the real random number. 965 The return value is the real random number between 0.0 and 1.0. */ 966 double est_random(void); 967 968 969 /* Get the random number in normal distribution. 970 The return value is the random number in normal distribution between 0.0 and 1.0. */ 971 double est_random_nd(void); 972 973 974 /* Get an MD5 hash string of a key string. 975 `key' specifies a string to be encrypted. 976 The return value is an MD5 hash string of the key string. 
Because the region of the return 977 value is allocated with the `malloc' call, it should be released with the `free' call if it 978 is no longer in use. */ 979 char *est_make_crypt(const char *key); 980 981 982 /* Check whether a key matches an MD5 hash string. 983 `key' specifies a string to be checked. 984 `hash' specifies an MD5 hash string. 985 The return value is true if the key matches the hash string, else it is false. */ 986 int est_match_crypt(const char *key, const char *hash); 987 988 989 /* Create a regular expression object. 990 `str' specifies a string of regular expressions. 991 The return value is a regular expression object or `NULL' if failure. 992 If the expression is leaded by "*I:", the pattern is case insensitive. */ 993 void *est_regex_new(const char *str); 994 995 996 /* Delete a regular expression object. 997 `regex' specifies a regular expression object. */ 998 void est_regex_delete(void *regex); 999 1000 1001 /* Check whether a regular expression matches a string. 1002 `regex' specifies a regular expression object. 1003 `str' specifies a string. 1004 The return value is true if the regular expression object matchs the string. */ 1005 int est_regex_match(const void *regex, const char *str); 1006 1007 1008 /* Check whether a regular expression matches a string. 1009 `rstr' specifies a regular expression string. 1010 `tstr' specifies a target string. 1011 The return value is true if the regular expression string matchs the target string. */ 1012 int est_regex_match_str(const char *rstr, const char *tstr); 1013 1014 1015 /* Replace each substring matching a regular expression string. 1016 `str' specifies a target string. 1017 `bef' specifies a string of regular expressions for substrings. 1018 `aft' specifies a string with which each substrings are replaced. Each "&" in the string is 1019 replaced with the matched substring. Each "\" in the string escapes the following character. 
1020 Special escapes "\1" through "\9" referring to the corresponding matching sub-expressions in 1021 the regular expression string are supported. 1022 The return value is a new converted string. Even if the regular expression is invalid, a copy 1023 of the original string is returned. Because the region of the return value is allocated with 1024 the `malloc' call, it should be released with the `free' call if it is no longer in use. */ 1025 char *est_regex_replace(const char *str, const char *bef, const char *aft); 1026 1027 1028 /* Duplicate a document object. 1029 `doc' specifies a document object. 1030 The return value is a duplicated document object. */ 1031 ESTDOC *est_doc_dup(ESTDOC *doc); 1032 1033 1034 /* Set the ID number of a document object. 1035 `doc' specifies a document object. 1036 `id' specifies the ID number to set. */ 1037 void est_doc_set_id(ESTDOC *doc, int id); 1038 1039 1040 /* Get the hidden texts of a document object. 1041 `doc' specifies a document object. 1042 The return value is concatenated sentences of the hidden text of the document object. The 1043 life duration of the returned string is synchronous with the one of the document object. */ 1044 const char *est_doc_hidden_texts(ESTDOC *doc); 1045 1046 1047 /* Reduce the texts to fit to the specified size. 1048 `doc' specifies a document object. 1049 `len' specifies the total size of the texts. */ 1050 void est_doc_slim(ESTDOC *doc, int size); 1051 1052 1053 /* Check whether a docuemnt object is empty. 1054 `doc' specifies a document object. 1055 The return value is true the document is empty, else it is false. */ 1056 int est_doc_is_empty(ESTDOC *doc); 1057 1058 1059 /* Duplicate a condition object. 1060 `cond' specifies a condition object. 1061 The return value is a duplicated condition object. */ 1062 ESTCOND *est_cond_dup(ESTCOND *cond); 1063 1064 1065 /* Get the phrase of a condition object. 1066 `cond' specifies a condition object. 
1067 The return value is the phrase of the condition object or `NULL' if it is not specified. The 1068 life duration of the returned string is synchronous with the one of the condition object. */ 1069 const char *est_cond_phrase(ESTCOND *cond); 1070 1071 1072 /* Get a list object of attribute expressions of a condition object. 1073 `cond' specifies a condition object. 1074 The return value is a list object of attribute expressions of the condition object or `NULL' if 1075 it is not specified. The life duration of the returned object is synchronous with the one of 1076 the condition object. */ 1077 const CBLIST *est_cond_attrs(ESTCOND *cond); 1078 1079 1080 /* Get the order expression of a condition object. 1081 `cond' specifies a condition object. 1082 The return value is the order expression of the condition object or `NULL' if it is not 1083 specified. The life duration of the returned string is synchronous with the one of the 1084 condition object. */ 1085 const char *est_cond_order(ESTCOND *cond); 1086 1087 1088 /* Get the maximum number of retrieval of a condition object. 1089 `cond' specifies a condition object. 1090 The return value is the maximum number of retrieval of the condition object or -1 if it is not 1091 specified. */ 1092 int est_cond_max(ESTCOND *cond); 1093 1094 1095 /* Get the number of skipped documents of a condition object. 1096 `cond' specifies a condition object. 1097 The return value is the number of documents to be skipped in the search result. */ 1098 int est_cond_skip(ESTCOND *cond); 1099 1100 1101 /* Get the options of a condition object. 1102 `cond' specifies a condition object. 1103 The return value is the options of the condition object. */ 1104 int est_cond_options(ESTCOND *cond); 1105 1106 1107 /* Get permission to adopt result of the auxiliary index. 1108 `cond' specifies a condition object. 1109 The return value is permission to adopt result of the auxiliary index. 
*/ 1110 int est_cond_auxiliary(ESTCOND *cond); 1111 1112 1113 /* Get the attribute distinction filter. 1114 `cond' specifies a condition object. 1115 The return value is the name of the distinct attribute or `NULL' if it is not specified. The 1116 life duration of the returned string is synchronous with the one of the condition object. */ 1117 const char *est_cond_distinct(ESTCOND *cond); 1118 1119 1120 /* Get the mask of targets of meta search. 1121 `cond' specifies a condition object. 1122 The return value is the mask of targets of meta search. */ 1123 int est_cond_mask(ESTCOND *cond); 1124 1125 1126 /* Get the score of a document corresponding to a condition object. 1127 `cond' specifies a condition object. 1128 `index' specifies the index of an element of the result array of `est_db_search'. 1129 The return value is the score of the element or -1 if the index is out of bounds. */ 1130 int est_cond_score(ESTCOND *cond, int index); 1131 1132 1133 /* Get the score array of corresponding documents of a condition object. 1134 `cond' specifies a condition object. 1135 `nump' specifies the pointer to a variable to which the number of elements in the score array 1136 is assigned. 1137 The return value is the score array of corresponding documents. */ 1138 const int *est_cond_scores(ESTCOND *cond, int *nump); 1139 1140 1141 /* Set the narrowing scores of a condition object. 1142 `cond' specifies a condition object. 1143 `scores' specifies the pointer to an array of narrowing scores. The life duration of the 1144 array should be equal to or longer than the condition object itself. 1145 `num' specifies the number of the array. */ 1146 void est_cond_set_narrowing_scores(ESTCOND *cond, const int *scores, int num); 1147 1148 1149 /* Check whether a condition object has used the auxiliary index. 1150 `cond' specifies a condition object. 1151 `word' specifies a keyword to be checked. If it is an empty string, whether at least one 1152 keyword is used is checked. 
1153 The return value is true if the condition object has used the auxiliary index, else it is 1154 false */ 1155 int est_cond_auxiliary_word(ESTCOND *cond, const char *word); 1156 1157 1158 /* Get an array of ID numbers of eclipsed docuemnts of a document in a condition object. 1159 `cond' specifies a condition object. 1160 `id' specifies the ID number of a parent document. 1161 `np' specifies the pointer to a variable to which the number of elements of the return value 1162 is assigned. 1163 The return value is an array whose elements expresse the ID numbers and their scores 1164 alternately. */ 1165 const int *est_cond_shadows(ESTCOND *cond, int id, int *np); 1166 1167 1168 /* Set the callback function for query expansion. 1169 `cond' specifies a condition object. 1170 `func' specifies the pointer to a function. The first argument of the callback specifies a 1171 word to be expand. The second argument speciifes a list object into which renewed words to 1172 be stored. */ 1173 void est_cond_set_expander(ESTCOND *cond, void (*func)(const char *, CBLIST *)); 1174 1175 1176 /* Set the error code of a database. 1177 `db' specifies a database object. 1178 `ecode' specifies a error code to set. */ 1179 void est_db_set_ecode(ESTDB *db, int ecode); 1180 1181 1182 /* Check whether an option is set. 1183 `db' specifies a database object. 1184 `option' specifies an option used when opening the database. 1185 The return value is 1 if the option is set, 0 if the option is not set, or -1 if it is 1186 unknown. */ 1187 int est_db_check_option(ESTDB *db, int option); 1188 1189 1190 /* Get the inode number of a database. 1191 `db' specifies a database object. 1192 The return value is the inode number of the database. */ 1193 int est_db_inode(ESTDB *db); 1194 1195 1196 /* Set the entity data of a document in a database. 1197 `db' specifies a database object connected as a writer. 1198 `id' specifies the ID number of a registered document. 
1199 `ptr' specifies the pointer to a region of entity data. If it is `NULL', the entity data is 1200 removed. 1201 `size' specifies the size of the region. 1202 The return value is true if success, else it is false. */ 1203 int est_db_set_doc_entity(ESTDB *db, int id, const char *ptr, int size); 1204 1205 1206 /* Get the entity data of a document in a database. 1207 `db' specifies a database object. 1208 `id' specifies the ID number of a registered document. 1209 `sp' specifies the pointer to a variable to which the size of the region of the return value 1210 is assigned. 1211 The return value is the value of the entity data or `NULL' if it does not exist. Because the 1212 region of the return value is allocated with the `malloc' call, it should be released with 1213 the `free' call if it is no longer in use. */ 1214 char *est_db_get_doc_entity(ESTDB *db, int id, int *sp); 1215 1216 1217 /* Set the maximum number of expansion of wild cards. 1218 `db' specifies a database object. 1219 `num' specifies the maximum number of expansion of wild cards. */ 1220 void est_db_set_wildmax(ESTDB *db, int num); 1221 1222 1223 /* Add a piece of meta data to a database. 1224 `db' specifies a database object connected as a writer. 1225 `name' specifies the name of a piece of meta data. 1226 `value' specifies the value of the meta data. If it is `NULL', the meta data is removed. */ 1227 void est_db_add_meta(ESTDB *db, const char *name, const char *value); 1228 1229 1230 /* Get a list of names of meta data of a database. 1231 `db' specifies a database object. 1232 The return value is a new list object of meta data names of the document object. Because the 1233 object of the return value is opened with the function `cblistopen', it should be closed with 1234 the function `cblistclose' if it is no longer in use. */ 1235 CBLIST *est_db_meta_names(ESTDB *db); 1236 1237 1238 /* Get the value of a piece of meta data of a database. 1239 `db' specifies a database object. 
1240 `name' specifies the name of a piece of meta data. 1241 The return value is the value of the meta data or `NULL' if it does not exist. Because the 1242 region of the return value is allocated with the `malloc' call, it should be released with 1243 the `free' call if it is no longer in use. */ 1244 char *est_db_meta(ESTDB *db, const char *name); 1245 1246 1247 /* Extract keywords of a document object. 1248 `db' specifies a database object for TF-IDF tuning. If it is `NULL', it is not used. 1249 `doc' specifies a document object. 1250 `max' specifies the maximum number of keywords to be extracted. 1251 The return value is a new map object of keywords and their scores in decimal string. Because 1252 the object of the return value is opened with the function `cbmapopen', it should be closed 1253 with the function `cbmapclose' if it is no longer in use. */ 1254 CBMAP *est_db_etch_doc(ESTDB *db, ESTDOC *doc, int max); 1255 1256 1257 /* Store a map object of keywords. 1258 `db' specifies a database object connected as a writer. 1259 `id' specifies the ID number of a document. 1260 `kwords' specifies a map object of keywords of the document. 1261 `weight' specifies weighting bias of scores. 1262 The return value is true if success, else it is false. */ 1263 int est_db_put_keywords(ESTDB *db, int id, CBMAP *kwords, double weight); 1264 1265 1266 /* Remove keywords of a document. 1267 `db' specifies a database object connected as a writer. 1268 `id' specifies the ID number of a document. 1269 The return value is true if success, else it is false. */ 1270 int est_db_out_keywords(ESTDB *db, int id); 1271 1272 1273 /* Retrieve a map object of keywords. 1274 `db' specifies a database object. 1275 `id' specifies the ID number of a document. 1276 The return value is a new map object of keywords and their scores in decimal string. If 1277 keywords of the document is not stored, `NULL' is returned. 
Because the object of the return 1278 value is opened with the function `cbmapopen', it should be closed with the function 1279 `cbmapclose' if it is no longer in use. */ 1280 CBMAP *est_db_get_keywords(ESTDB *db, int id); 1281 1282 1283 /* Mesure the total size of each inner records of a stored document. 1284 `db' specifies a database object. 1285 `id' specifies the ID number of a document. 1286 `parts' specifies document parts: `ESTMDATTR' for attributes, `ESTMDTEXT' for texts, and 1287 `ESTMDKWD' for keywords. They can be specified at the same time by bitwise or. 1288 The return value is the total size of each inner records of a stored document. */ 1289 int est_db_measure_doc(ESTDB *db, int id, int parts); 1290 1291 1292 /* Initialize the document iterator of a database. 1293 `db' specifies a database object. 1294 `prev' specifies the URI of the previous element of iteration. If it is `NULL', it is not used. 1295 The return value is true if success, else it is false. */ 1296 int est_db_iter_init(ESTDB *db, const char *prev); 1297 1298 1299 /* Get the next ID of the document iterator of a database. 1300 `db' specifies a database object. 1301 The return value is the next ID. If there is no more document, 0 is returned. On error, 1302 -1 is returned. */ 1303 int est_db_iter_next(ESTDB *db); 1304 1305 1306 /* Initialize the word iterator of a database. 1307 `db' specifies a database object. 1308 The return value is true if success, else it is false. */ 1309 int est_db_word_iter_init(ESTDB *db); 1310 1311 1312 /* Get the next word of the word iterator of a database. 1313 `db' specifies a database object. 1314 The return value is the next word. If there is no more word, `NULL' is returned. Because 1315 the region of the return value is allocated with the `malloc' call, it should be released 1316 with the `free' call if it is no longer in use. */ 1317 char *est_db_word_iter_next(ESTDB *db); 1318 1319 1320 /* Get the size of the record of a word. 
1321 `db' specifies a database object. 1322 `word' specifies a word. 1323 The return value is the size of the record of the word. If there is no corresponding record, 1324 0 is returned. */ 1325 int est_db_word_rec_size(ESTDB *db, const char *word); 1326 1327 1328 /* Get the number of unique keywords in a database. 1329 `db' specifies a database object. 1330 The return value is the number of unique keywords in the database. */ 1331 int est_db_keyword_num(ESTDB *db); 1332 1333 1334 /* Initialize the keyword iterator of a database. 1335 `db' specifies a database object. 1336 The return value is true if success, else it is false. */ 1337 int est_db_keyword_iter_init(ESTDB *db); 1338 1339 1340 /* Get the next keyword of the word iterator of a database. 1341 `db' specifies a database object. 1342 The return value is the next word. If there is no more keyword, `NULL' is returned. Because 1343 the region of the return value is allocated with the `malloc' call, it should be released 1344 with the `free' call if it is no longer in use. */ 1345 char *est_db_keyword_iter_next(ESTDB *db); 1346 1347 1348 /* Get the size of the record of a keyword. 1349 `db' specifies a database object. 1350 `word' specifies a keyword. 1351 The return value is the size of the record of the keyword. If there is no corresponding 1352 record, 0 is returned. */ 1353 int est_db_keyword_rec_size(ESTDB *db, const char *word); 1354 1355 1356 /* Search documents corresponding a keyword for a database. 1357 `db' specifies a database object. 1358 `word' specifies a keyword. 1359 `nump' specifies the pointer to a variable to which the number of elements in the result is 1360 assigned. 1361 The return value is an array whose elements are ID numbers of corresponding documents. 1362 This function does never fail. Even if no document corresponds or an error occurs, an empty 1363 array is returned. 
Because the region of the return value is allocated with the `malloc' 1364 call, it should be released with the `free' call if it is no longer in use. */ 1365 int *est_db_keyword_search(ESTDB *db, const char *word, int *nump); 1366 1367 1368 /* Get the number of records in the cache memory of a database. 1369 `db' specifies a database object. 1370 The return value is the cache memory of a database. */ 1371 int est_db_cache_num(ESTDB *db); 1372 1373 1374 /* Get the size of used cache region. 1375 `db' specifies a database object. 1376 The return value is the size of used cache region. */ 1377 int est_db_used_cache_size(ESTDB *db); 1378 1379 1380 /* Set the special cache for narrowing and sorting with document attributes. 1381 `db' specifies a database object. 1382 `name' specifies the name of a document. 1383 `num' specifies the maximum number of cached records. */ 1384 void est_db_set_special_cache(ESTDB *db, const char *name, int num); 1385 1386 1387 /* Set the callback function to inform of database events. 1388 `db' specifies a database object. 1389 `func' specifies the pointer to a function. The first argument of the callback specifies a 1390 message of each event. The second argument specifies an arbitrary pointer of a opaque data. 1391 `opaque' specifies the pointer of the second argument of the callback. */ 1392 void est_db_set_informer(ESTDB *db, void (*func)(const char *, void *), void *opaque); 1393 1394 1395 /* Fill the cache for keys for TF-IDF. 1396 `db' specifies a database object. */ 1397 void est_db_fill_key_cache(ESTDB *db); 1398 1399 1400 /* Set the database of document frequency. 1401 `db' specifies a database object. 1402 `dfdb' specifies a database object of `DEPOT'. If it is `NULL', the setting is cleared. */ 1403 void est_db_set_dfdb(ESTDB *db, DEPOT *dfdb); 1404 1405 1406 /* Clear the result cache. 1407 `db' specifies a database object. */ 1408 void est_db_refresh_rescc(ESTDB *db); 1409 1410 1411 /* Charge the result cache. 
1412 `db' specifies a database object. 1413 `max' specifies the maximum number of words to be charged. If it not more than zero, all 1414 words are charged. */ 1415 void est_db_charge_rescc(ESTDB *db, int max); 1416 1417 1418 /* Get a list of words in the result cache. 1419 `db' specifies a database object. 1420 The return value is a new list object of words in the result cache. Because the object of the 1421 return value is opened with the function `cblistopen', it should be closed with the function 1422 `cblistclose' if it is no longer in use. */ 1423 CBLIST *est_db_list_rescc(ESTDB *db); 1424 1425 1426 /* Get the number of pseudo documents in a database. 1427 `db' specifies a database object. 1428 The return value is the number of pseudo documents in the database. */ 1429 int est_db_pseudo_doc_num(ESTDB *db); 1430 1431 1432 /* Get a list of expressions of attribute indexes of a database. 1433 `db' specifies a database object. 1434 The return value is a new list object of expressions of attribute indexes. Because the object 1435 of the return value is opened with the function `cblistopen', it should be closed with the 1436 function `cblistclose' if it is no longer in use. */ 1437 CBLIST *est_db_attr_index_exprs(ESTDB *db); 1438 1439 1440 /* Interrupt long time processing. 1441 `db' specifies a database object. */ 1442 void est_db_interrupt(ESTDB *db); 1443 1444 1445 /* Repair a broken database directory. 1446 `name' specifies the name of a database directory. 1447 `options' specifies options: `ESTRPSTRICT' to perform strict consistency check, `ESTRPSHODDY' 1448 to omit consistency check. 1449 `ecp' specifies the pointer to a variable to which the error code is assigned. 1450 The return value is true if success, else it is false. */ 1451 int est_db_repair(const char *name, int options, int *ecp); 1452 1453 1454 /* Extract words for snippet from hints of search. 1455 `hints' specifies a map object whose records were set by `est_db_search'. 
1456 The return value is a new list object of words to be highlighted. Because the object of the 1457 return value is opened with the function `cblistopen', it should be closed with the function 1458 `cblistclose' if it is no longer in use. */ 1459 CBLIST *est_hints_to_words(CBMAP *hints); 1460 1461 1462 /* Add a record into a result map for logical operation. 1463 `map' specifies a map object. 1464 `key' specifies the key of a record. 1465 `score' specifies the score of the record. 1466 `method' specifies a scoring method when logical operation. As for now, `ESTRMLOSUM', 1467 `ESTRMLOMAX', `ESTRMLOMIN', and `ESTRMLOAVG'. */ 1468 void est_resmap_add(CBMAP *map, const char *key, int score, int method); 1469 1470 1471 /* Dump a result list of a result map for logical operation. 1472 `map' specifies a map object. 1473 `min' specifies the minimum number of times for which each element of the result occurs. 1474 `nump' specifies the pointer to a variable to which the number of elements in the result is 1475 assigned. 1476 The return value is an array whose elements are structures of keys and scores. Because the 1477 region of the return value is allocated with the `malloc' call, it should be released with the 1478 `free' call if it is no longer in use. */ 1479 ESTRESMAPELEM *est_resmap_dump(CBMAP *map, int min, int *nump); 1480 1481 1482 /* Reset the environment of the process. 1483 This function sets the standard streams as binary mode and resets environment variables for 1484 locale. */ 1485 void est_proc_env_reset(void); 1486 1487 1488 /* Make a directory. 1489 `path' specifies the path of a new directory. 1490 The return value is true if success, else it is false. */ 1491 int est_mkdir(const char *path); 1492 1493 1494 /* Remove a directory and its contents recursively. 1495 `path' specifies the path of a directory. 1496 The return value is true if success, else it is false. 
*/ 1497 int est_rmdir_rec(const char *path); 1498 1499 1500 /* Get the canonicalized absolute pathname of a file. 1501 `path' specifies the path of a file. 1502 The return value is the canonicalized absolute pathname of a file. Because the region of the 1503 return value is allocated with the `malloc' call, it should be released with the `free' call 1504 if it is no longer in use. */ 1505 char *est_realpath(const char *path); 1506 1507 1508 /* Get the inode number of a file. 1509 `path' specifies the path of a file. 1510 The return value is the inode number of a file or -1 on error. */ 1511 int est_inode(const char *path); 1512 1513 1514 /* Change modification time of a file. 1515 `path' specifies the path of a file. 1516 `mtime' specifies modification time. If it is negative, the current time is set. 1517 The return value is true if success, else it is false. */ 1518 int est_utime(const char *path, time_t mtime); 1519 1520 1521 /* Get the time of day in milliseconds. 1522 The return value is the time of day in milliseconds. */ 1523 double est_gettimeofday(void); 1524 1525 1526 /* Suspend execution for microsecond intervals. 1527 `usec' specifies microseconds to sleep for. */ 1528 void est_usleep(unsigned long usec); 1529 1530 1531 /* Set a signal handler. 1532 `signum' specifies the number of a target signal. 1533 `sighandler' specifies the pointer to a function. The argument of the handler specifies the 1534 number of the catched signal. If it is `SIG_IGN', the signal is ignored. */ 1535 void est_signal(int signum, void (*sighandler)(int)); 1536 1537 1538 /* Send a signal to a process. 1539 `pid' specifies the PID of a target process. 1540 `sig' specifies a signal code. 1541 The return value is true if success, else it is false. */ 1542 int est_kill(int pid, int sig); 1543 1544 1545 /* Get the load ratio of the physical memory. 1546 The return value is the load ratio of the physical memory. 
1547 As for now, this function returns 0.0 on platforms except for Windows. */ 1548 double est_memory_usage(void); 1549 1550 1551 /* Get the media type of an extention. 1552 `ext' specifies the extension of a file path. 1553 The return value is the media time of the extension. */ 1554 const char *est_ext_type(const char *ext); 1555 1556 1557 /* Set a seed vector from a map object. 1558 `svmap' specifies a map object of a seed vector. 1559 `svec' specifies a vector object. 1560 `vnum' specifies the number of dimensions of the vector. */ 1561 void est_vector_set_seed(CBMAP *svmap, int *svec, int vnum); 1562 1563 1564 /* Set a target vector from a map object. 1565 `svmap' specifies a map object of a seed vector. 1566 `tvmap' specifies a map object of a target vector. 1567 `tvec' specifies a vector object. 1568 `vnum' specifies the number of dimensions of the vector. */ 1569 void est_vector_set_target(CBMAP *svmap, CBMAP *tvmap, int *tvec, int vnum); 1570 1571 1572 /* Get the cosine of the angle of two vectors. 1573 `avec' specifies a vector object. 1574 `bvec' specifies the other vector object. 1575 `vnum' specifies the number of dimensions of the vector. 1576 The return value is the cosine of the angle of two vectors. */ 1577 double est_vector_cosine(const int *avec, const int *bvec, int vnum); 1578 1579 1580 1581 #if defined(__cplusplus) /* export for C++ */ 1582 } 1583 #endif 1584 1585 #endif /* duplication check */ 1586 1587 1588 /* END OF FILE */ 1589