/* Copyright(C) 2004 Brazil

  This library is free software; you can redistribute it and/or
  modify it under the terms of the GNU Lesser General Public
  License as published by the Free Software Foundation; either
  version 2.1 of the License, or (at your option) any later version.

  This library is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public
  License along with this library; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/
/*
 * Declarations of the sen_* C API: flag constants, enums, handle types,
 * option structs and function prototypes.  Types whose struct body does
 * not appear in this header are only ever used through pointers here.
 */
#ifndef SENNA_H
#define SENNA_H

#ifdef __cplusplus
extern "C" {
#endif

/* available bit for sen_index_create flag */
#define SEN_INDEX_NORMALIZE             0x0001
#define SEN_INDEX_SPLIT_ALPHA           0x0002
#define SEN_INDEX_SPLIT_DIGIT           0x0004
#define SEN_INDEX_SPLIT_SYMBOL          0x0008
/* NOTE: the value is 0, so this one cannot be detected with a bitwise AND. */
#define SEN_INDEX_MORPH_ANALYSE         0x0000
#define SEN_INDEX_NGRAM                 0x0010
#define SEN_INDEX_DELIMITED             0x0020
#define SEN_INDEX_ENABLE_SUFFIX_SEARCH  0x0100
#define SEN_INDEX_DISABLE_SUFFIX_SEARCH 0x0200
#define SEN_INDEX_WITH_VGRAM            0x1000
#define SEN_INDEX_SHARED_LEXICON        0x2000
#define SEN_INDEX_WITH_VACUUM           0x8000

/* 16 tokenizers can be registered */
/* SEN_INDEX_MORPH_ANALYSE, SEN_INDEX_NGRAM and SEN_INDEX_DELIMITED all lie
 * inside this mask. */
#define SEN_INDEX_TOKENIZER_MASK        0x00f0

#define SEN_SYM_MAX_KEY_SIZE            8192

#define SEN_SYM_WITH_SIS                0x80000000

#define SEN_SNIP_NORMALIZE              0x0001
#define SEN_SNIP_COPY_TAG               0x0002
#define SEN_SNIP_SKIP_LEADING_SPACES    0x0004
/* Alias: same bit as SEN_SNIP_NORMALIZE. */
#define SEN_QUERY_SCAN_NORMALIZE        SEN_SNIP_NORMALIZE

#define SEN_LEX_NGRAM_UNIT_SIZE         2

/* May be overridden by defining SEN_STACK_SIZE before including this header. */
#ifndef SEN_STACK_SIZE
#define SEN_STACK_SIZE                  0x10000000
#endif /* SEN_STACK_SIZE */

/*
 * Single-character SEN_QUERY_* constants.  Each one is only defined when the
 * includer has not already defined it, so any of them can be overridden
 * before this header is included.
 */
#ifndef SEN_QUERY_AND
#define SEN_QUERY_AND '+'
#endif /* SEN_QUERY_AND */
#ifndef SEN_QUERY_BUT
#define SEN_QUERY_BUT '-'
#endif /* SEN_QUERY_BUT */
#ifndef SEN_QUERY_ADJ_INC
#define SEN_QUERY_ADJ_INC '>'
#endif /* SEN_QUERY_ADJ_INC */
#ifndef SEN_QUERY_ADJ_DEC
#define SEN_QUERY_ADJ_DEC '<'
#endif /* SEN_QUERY_ADJ_DEC */
#ifndef SEN_QUERY_ADJ_NEG
#define SEN_QUERY_ADJ_NEG '~'
#endif /* SEN_QUERY_ADJ_NEG */
#ifndef SEN_QUERY_PREFIX
#define SEN_QUERY_PREFIX '*'
#endif /* SEN_QUERY_PREFIX */
#ifndef SEN_QUERY_PARENL
#define SEN_QUERY_PARENL '('
#endif /* SEN_QUERY_PARENL */
#ifndef SEN_QUERY_PARENR
#define SEN_QUERY_PARENR ')'
#endif /* SEN_QUERY_PARENR */
#ifndef SEN_QUERY_QUOTEL
#define SEN_QUERY_QUOTEL '"'
#endif /* SEN_QUERY_QUOTEL */
#ifndef SEN_QUERY_QUOTER
#define SEN_QUERY_QUOTER '"'
#endif /* SEN_QUERY_QUOTER */
#ifndef SEN_QUERY_ESCAPE
#define SEN_QUERY_ESCAPE '\\'
#endif /* SEN_QUERY_ESCAPE */

/* Id value documented below (see sen_sym_at) as the "no match" result. */
#define SEN_SYM_NIL 0

/* Status code type used as the return type of most functions below;
 * sen_success is 0. */
typedef enum {
  sen_success = 0,
  sen_memory_exhausted,
  sen_invalid_format,
  sen_file_operation_error,
  sen_invalid_argument,
  sen_other_error,
  sen_external_error,
  sen_internal_error,
  sen_abnormal_error,
  sen_end_of_data
} sen_rc;

/* Character encoding selector passed to the create/open/normalize calls. */
typedef enum {
  sen_enc_default = 0,
  sen_enc_none,
  sen_enc_euc_jp,
  sen_enc_utf8,
  sen_enc_sjis,
  sen_enc_latin1,
  sen_enc_koi8r
} sen_encoding;

/* Record unit selector; see the parameters of sen_records_open(). */
typedef enum {
  sen_rec_document = 0,
  sen_rec_section,
  sen_rec_position,
  sen_rec_userdef,
  sen_rec_none
} sen_rec_unit;

/* Operator argument of sen_index_select(), sen_query_open() and
 * sen_query_exec(). */
typedef enum {
  sen_sel_or = 0,
  sen_sel_and,
  sen_sel_but,
  sen_sel_adjust
} sen_sel_operator;

/* Value of the mode member of sen_select_optarg. */
typedef enum {
  sen_sel_exact = 0,
  sen_sel_partial,
  sen_sel_unsplit,
  sen_sel_near,
  sen_sel_near2,
  sen_sel_similar,
  sen_sel_term_extract,
  sen_sel_prefix,
  sen_sel_suffix
} sen_sel_mode;

/* Value of the mode member of sen_sort_optarg and sen_group_optarg. */
typedef enum {
  sen_sort_descending = 0,
  sen_sort_ascending = 1
} sen_sort_mode;

/* Log levels; numerically larger enumerators are declared later in the list
 * (sen_log_none = 0 ... sen_log_dump = 9). */
typedef enum {
  sen_log_none = 0,
  sen_log_emerg,
  sen_log_alert,
  sen_log_crit,
  sen_log_error,
  sen_log_warning,
  sen_log_notice,
  sen_log_info,
  sen_log_debug,
  sen_log_dump
} sen_log_level;

/* Forward declarations of the handle/struct types used by the API. */
typedef struct _sen_db sen_db;
typedef struct _sen_ctx sen_ctx;
typedef struct _sen_ctx_info sen_ctx_info;
typedef struct _sen_set sen_set;
typedef struct _sen_sym sen_sym;
typedef struct _sen_inv sen_inv;
typedef struct _sen_index sen_index;
typedef struct _sen_set_cursor sen_set_cursor;
/* NOTE: sen_set_eh is itself a pointer type; the API passes sen_set_eh *. */
typedef struct _sen_set_element *sen_set_eh;
typedef struct _sen_value sen_value;
typedef struct _sen_values sen_values;
typedef struct _sen_select_optarg sen_select_optarg;
typedef struct _sen_group_optarg sen_group_optarg;
typedef struct _sen_sort_optarg sen_sort_optarg;
typedef struct _sen_snip sen_snip;
typedef struct _sen_query sen_query;
typedef struct _sen_logger_info sen_logger_info;
typedef struct _sen_snip_mapping sen_snip_mapping;
typedef struct _sen_records_heap sen_records_heap;
typedef struct _sen_vgram sen_vgram;
typedef struct _sen_vgram_buf sen_vgram_buf;
typedef struct _sen_sym_scan_hit sen_sym_scan_hit;
typedef struct _sen_sym_cursor sen_sym_cursor;

typedef unsigned sen_id;

#define SEN_ID_MAX 0x3fffffff

/* The records API is expressed in terms of the set types: these are plain
 * aliases, not distinct types. */
typedef sen_set_eh sen_recordh;
typedef sen_set sen_records;
typedef sen_sort_optarg sen_set_sort_optarg;

/* Filled in by sen_ctx_info_get(). */
struct _sen_ctx_info {
  int fd;
  unsigned int com_status;
  unsigned int com_info;
  struct _sen_rbuf *outbuf;
  unsigned char stat;
};

struct _sen_index {
  int foreign_flags;
  sen_sym *keys;
  sen_sym *lexicon;
  sen_inv *inv;
  sen_vgram *vgram;
};

/* One weighted string; presumably str_len is the length of str in bytes
 * -- TODO confirm against the implementation. */
struct _sen_value {
  const char *str;
  unsigned int str_len;
  unsigned int weight;
};

/* Array of sen_value; presumably n_values is the element count of values
 * -- TODO confirm. */
struct _sen_values {
  int n_values;
  sen_value *values;
};

/* Optional argument block of sen_index_select(). */
struct _sen_select_optarg {
  sen_sel_mode mode;
  int similarity_threshold;
  int max_interval;
  int *weight_vector;
  int vector_size;
  int (*func)(sen_records *, const void *, int, void *);
  void *func_arg;
  int max_size;
};

/* Optional argument block of sen_records_group(). */
struct _sen_group_optarg {
  sen_sort_mode mode;
  int (*func)(sen_records *, const sen_recordh *, void *, void *);
  void *func_arg;
  int key_size;
};

/* Optional argument block of sen_records_sort(), sen_set_sort() and
 * sen_records_heap_open(). */
struct _sen_sort_optarg {
  sen_sort_mode mode;
  int (*compar)(sen_records *, sen_recordh *, sen_records *, sen_recordh *, void *);
  void *compar_arg;
};

struct _sen_snip_mapping {
  void *dummy;
};

/* Bit values; presumably OR-ed into the flags member of sen_logger_info
 * -- TODO confirm. */
#define SEN_LOG_TIME      1
#define SEN_LOG_TITLE     2
#define SEN_LOG_MESSAGE   4
#define SEN_LOG_LOCATION  8

/* Logger configuration passed to sen_logger_info_set(). */
struct _sen_logger_info {
  sen_log_level max_level;
  int flags;
  void (*func)(int, const char *, const char *, const char *, const char *, void *);
  void *func_arg;
};

/* Element type of the sh array filled by sen_sym_scan(). */
struct _sen_sym_scan_hit {
  sen_id id;
  unsigned int offset;
  unsigned int length;
};

/* Callback type accepted by sen_query_term(). */
typedef int (*query_term_callback)(const char *, unsigned int, void *);


/******** query language API ********/

sen_db *sen_db_create(const char *path, int flags, sen_encoding encoding);
sen_db *sen_db_open(const char *path);
sen_rc sen_db_close(sen_db *s);

/* Bit values; presumably used with the flags of sen_ctx_send()/sen_ctx_recv()
 * -- TODO confirm. */
#define SEN_CTX_MORE  0x01
#define SEN_CTX_TAIL  0x02
#define SEN_CTX_HEAD  0x04
#define SEN_CTX_QUIET 0x08
#define SEN_CTX_QUIT  0x10

/* Presumably for the flags argument of sen_ctx_open()/sen_ctx_connect()
 * -- TODO confirm. */
#define SEN_CTX_USEQL     1
#define SEN_CTX_BATCHMODE 2

sen_ctx *sen_ctx_open(sen_db *db, int flags);
sen_ctx *sen_ctx_connect(const char *host, int port, int flags);
sen_rc sen_ctx_load(sen_ctx *c, const char *path);
sen_rc sen_ctx_send(sen_ctx *c, char *str, unsigned int str_len, int flags);
sen_rc sen_ctx_recv(sen_ctx *c, char **str, unsigned int *str_len, int *flags);
sen_rc sen_ctx_close(sen_ctx *c);
sen_rc sen_ctx_info_get(sen_ctx *c, sen_ctx_info *info);

/******** basic API ********/

sen_rc sen_init(void);
sen_rc sen_fin(void);
sen_rc sen_info(char **version,
                char **configure_options,
                char **config_path,
                sen_encoding *default_encoding,
                unsigned int *initial_n_segments,
                unsigned int *partial_match_threshold);

sen_index *sen_index_create(const char *path, int key_size, int flags,
                            int initial_n_segments, sen_encoding encoding);
sen_index *sen_index_open(const char *path);
sen_rc sen_index_close(sen_index *i);
sen_rc sen_index_remove(const char *path);
sen_rc sen_index_rename(const char *old_name, const char *new_name);
sen_rc sen_index_upd(sen_index *i, const void *key,
                     const char *oldvalue, unsigned int oldvalue_len,
                     const char *newvalue, unsigned int newvalue_len);
sen_records *sen_index_sel(sen_index *i,
                           const char *string, unsigned int string_len);
/* Only declared when the includer defines USE_QUERY_ABORT. */
#ifdef USE_QUERY_ABORT
void sen_index_set_abort_callback(sen_index *i, int (*cb)(void*), void *arg);
#endif /* USE_QUERY_ABORT */
int sen_records_next(sen_records *r, void *keybuf, int buf_size, int *score);
sen_rc sen_records_rewind(sen_records *r);
int sen_records_curr_score(sen_records *r);
int sen_records_curr_key(sen_records *r, void *keybuf, int buf_size);
int sen_records_nhits(sen_records *r);
int sen_records_find(sen_records *r, const void *key);
sen_rc sen_records_close(sen_records *r);

/******** advanced API ********/

sen_values *sen_values_open(void);
sen_rc sen_values_close(sen_values *v);
sen_rc sen_values_add(sen_values *v,
                      const char *str, unsigned int str_len,
                      unsigned int weight);

sen_records *sen_records_open(sen_rec_unit record_unit,
                              sen_rec_unit subrec_unit,
                              unsigned int max_n_subrecs);
sen_records *sen_records_union(sen_records *a, sen_records *b);
sen_records *sen_records_subtract(sen_records *a, sen_records *b);
sen_records *sen_records_intersect(sen_records *a, sen_records *b);
int sen_records_difference(sen_records *a, sen_records *b);
sen_rc sen_records_sort(sen_records *r, int limit, sen_sort_optarg *optarg);
sen_rc sen_records_group(sen_records *r, int limit, sen_group_optarg *optarg);
const sen_recordh *sen_records_curr_rec(sen_records *r);
const sen_recordh *sen_records_at(sen_records *r, const void *key,
                                  unsigned section, unsigned pos,
                                  int *score, int *n_subrecs);
sen_rc sen_record_info(sen_records *r, const sen_recordh *rh,
                       void *keybuf, int buf_size, int *key_size,
                       int *section, int *pos, int *score, int *n_subrecs);
sen_rc sen_record_subrec_info(sen_records *r, const sen_recordh *rh,
                              int index, void *keybuf, int buf_size,
                              int *key_size, int *section, int *pos, int *score);
sen_index *sen_index_create_with_keys(const char *path, sen_sym *keys, int flags,
                                      int initial_n_segments, sen_encoding encoding);
sen_index *sen_index_open_with_keys(const char *path, sen_sym *keys);
sen_index *sen_index_create_with_keys_lexicon(const char *path, sen_sym *keys,
                                              sen_sym *lexicon, int initial_n_segments);
sen_index *sen_index_open_with_keys_lexicon(const char *path, sen_sym *keys,
                                            sen_sym *lexicon);
sen_rc sen_index_update(sen_index *i, const void *key, unsigned int section,
                        sen_values *oldvalues, sen_values *newvalues);
sen_rc sen_index_select(sen_index *i,
                        const char *string, unsigned int string_len,
                        sen_records *r,
                        sen_sel_operator op, sen_select_optarg *optarg);
sen_rc sen_index_info(sen_index *i, int *key_size, int *flags,
                      int *initial_n_segments, sen_encoding *encoding,
                      unsigned *nrecords_keys, unsigned *file_size_keys,
                      unsigned *nrecords_lexicon, unsigned *file_size_lexicon,
                      unsigned long long *inv_seg_size,
                      unsigned long long *inv_chunk_size);
int sen_index_path(sen_index *i, char *pathbuf, int buf_size);

sen_query *sen_query_open(const char *str, unsigned int str_len,
                          sen_sel_operator default_op,
                          int max_exprs, sen_encoding encoding);
unsigned int sen_query_rest(sen_query *q, const char ** const rest);
sen_rc sen_query_close(sen_query *q);
sen_rc sen_query_exec(sen_index *i, sen_query *q, sen_records *r, sen_sel_operator op);
void sen_query_term(sen_query *q, query_term_callback func, void *func_arg);
sen_rc sen_query_scan(sen_query *q, const char **strs, unsigned int *str_lens,
                      unsigned int nstrs, int flags, int *found, int *score);
sen_snip *sen_query_snip(sen_query *query, int flags,
                         unsigned int width, unsigned int max_results,
                         unsigned int n_tags,
                         const char **opentags, unsigned int *opentag_lens,
                         const char **closetags, unsigned int *closetag_lens,
                         sen_snip_mapping *mapping);

sen_rc sen_index_del(sen_index *i, const void *key);

/******** low level API ********/

sen_set *sen_set_open(unsigned key_size, unsigned value_size, unsigned init_size);
sen_rc sen_set_close(sen_set *set);
sen_rc sen_set_info(sen_set *set, unsigned *key_size,
                    unsigned *value_size, unsigned *n_entries);
sen_set_eh *sen_set_get(sen_set *set, const void *key, void **value);
sen_set_eh *sen_set_at(sen_set *set, const void *key, void **value);
sen_rc sen_set_del(sen_set *set, sen_set_eh *e);
sen_set_cursor *sen_set_cursor_open(sen_set *set);
sen_set_eh *sen_set_cursor_next(sen_set_cursor *cursor, void **key, void **value);
sen_rc sen_set_cursor_close(sen_set_cursor *cursor);
sen_rc sen_set_element_info(sen_set *set, const sen_set_eh *e,
                            void **key, void **value);
sen_set *sen_set_union(sen_set *a, sen_set *b);
sen_set *sen_set_subtract(sen_set *a, sen_set *b);
sen_set *sen_set_intersect(sen_set *a, sen_set *b);
int sen_set_difference(sen_set *a, sen_set *b);
sen_set_eh *sen_set_sort(sen_set *set, int limit, sen_set_sort_optarg *optarg);

sen_sym *sen_sym_create(const char *path, unsigned key_size,
                        unsigned flags, sen_encoding encoding);
sen_sym *sen_sym_open(const char *path);
sen_rc sen_sym_info(sen_sym *sym, int *key_size, unsigned *flags,
                    sen_encoding *encoding, unsigned *nrecords, unsigned *file_size);
sen_rc sen_sym_close(sen_sym *sym);
sen_rc sen_sym_remove(const char *path);

/* Lookup the sym table and find the id of the corresponding entry.
 * If no matches are found, create a new entry, and return that ID
 */
sen_id sen_sym_get(sen_sym *sym, const void *key);

/* Lookup the sym table and find the id of the corresponding entry.
 * If no matches are found return SEN_SYM_NIL
 */
sen_id sen_sym_at(sen_sym *sym, const void *key);
sen_rc sen_sym_del(sen_sym *sym, const void *key);
unsigned int sen_sym_size(sen_sym *sym);
int sen_sym_key(sen_sym *sym, sen_id id, void *keybuf, int buf_size);
sen_set *sen_sym_prefix_search(sen_sym *sym, const void *key);
sen_set *sen_sym_suffix_search(sen_sym *sym, const void *key);
sen_id sen_sym_common_prefix_search(sen_sym *sym, const void *key);
int sen_sym_pocket_get(sen_sym *sym, sen_id id);
sen_rc sen_sym_pocket_set(sen_sym *sym, sen_id id, unsigned int value);
sen_id sen_sym_next(sen_sym *sym, sen_id id);
int sen_sym_scan(sen_sym *sym, const char *str, unsigned int str_len,
                 sen_sym_scan_hit *sh, unsigned int sh_size, const char **rest);

/* Presumably bits for the flags argument of sen_sym_cursor_open()
 * -- TODO confirm.  Note that DESCENDING, GE and LE are all 0. */
#define SEN_SYM_DESCENDING 0
#define SEN_SYM_ASCENDING  1
#define SEN_SYM_GE         0
#define SEN_SYM_GT         2
#define SEN_SYM_LE         0
#define SEN_SYM_LT         4

sen_sym_cursor *sen_sym_cursor_open(sen_sym *sym, sen_ctx *ctx,
                                    const void *min, const void *max, int flags);
sen_id sen_sym_cursor_next(sen_sym_cursor *c);
void sen_sym_cursor_close(sen_sym_cursor *c);

/******** utility API ********/
sen_snip *sen_snip_open(sen_encoding encoding, int flags, unsigned int width,
                        unsigned int max_results,
                        const char *defaultopentag, unsigned int defaultopentag_len,
                        const char *defaultclosetag, unsigned int defaultclosetag_len,
                        sen_snip_mapping *mapping);
sen_rc sen_snip_close(sen_snip *snip);
sen_rc sen_snip_add_cond(sen_snip *snip,
                         const char *keyword, unsigned int keyword_len,
                         const char *opentag, unsigned int opentag_len,
                         const char *closetag, unsigned int closetag_len);
sen_rc sen_snip_exec(sen_snip *snip,
                     const char *string, unsigned int string_len,
                     unsigned int *nresults, unsigned int *max_tagged_len);
sen_rc sen_snip_get_result(sen_snip *snip, const unsigned int index,
                           char *result, unsigned int *result_len);

sen_records_heap *sen_records_heap_open(int size, int limit, sen_sort_optarg *optarg);
sen_rc sen_records_heap_add(sen_records_heap *h, sen_records *r);
int sen_records_heap_next(sen_records_heap *h);
sen_records *sen_records_heap_head(sen_records_heap *h);
sen_rc sen_records_heap_close(sen_records_heap *h);

int sen_inv_entry_info(sen_inv *inv, sen_id key, unsigned *a, unsigned *pocket,
                       unsigned *chunk, unsigned *chunk_size, unsigned *buffer_free,
                       unsigned *nterms, unsigned *nterms_void, unsigned *tid,
                       unsigned *size_in_chunk, unsigned *pos_in_chunk,
                       unsigned *size_in_buffer, unsigned *pos_in_buffer);

/* flags for sen_str_normalize */
#define SEN_STR_REMOVEBLANK 1
#define SEN_STR_WITH_CTYPES 2
#define SEN_STR_WITH_CHECKS 4
int sen_str_normalize(const char *str, unsigned int str_len,
                      sen_encoding encoding, int flags,
                      char *nstrbuf, int buf_size);
unsigned int sen_str_charlen(const char *str, sen_encoding encoding);

/* misc */

sen_rc sen_logger_info_set(const sen_logger_info *info);

/* NOTE(review): fmt is declared as non-const char *; left unchanged so the
 * prototype keeps matching the definition, which is not visible here. */
void sen_logger_put(sen_log_level level,
                    const char *file, int line, const char *func, char *fmt, ...);

int sen_logger_pass(sen_log_level level);

/*
 * SEN_LOG(level, fmt, ...): calls sen_logger_put() with the current file,
 * line and function name when sen_logger_pass(level) is non-zero.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement: "if (c) SEN_LOG(...); else ..." now parses correctly
 * instead of tripping over (or capturing) the else.  Invocations must be
 * terminated with a semicolon.  Note that `level` is evaluated twice.
 */
#define SEN_LOG(level,...) \
do {\
  if (sen_logger_pass(level)) {\
    sen_logger_put((level), __FILE__, __LINE__, __FUNCTION__, __VA_ARGS__);\
  }\
} while (0)

/* Level used by sen_log(); may be overridden before including this header. */
#ifndef SEN_LOG_DEFAULT_LEVEL
#define SEN_LOG_DEFAULT_LEVEL sen_log_notice
#endif /* SEN_LOG_DEFAULT_LEVEL */

/*
 * sen_log(fmt, ...): same as SEN_LOG() with the level fixed to
 * SEN_LOG_DEFAULT_LEVEL.  Also wrapped in do { } while (0); terminate the
 * invocation with a semicolon.
 */
#define sen_log(...) \
do {\
  if (sen_logger_pass(SEN_LOG_DEFAULT_LEVEL)) {\
    sen_logger_put(SEN_LOG_DEFAULT_LEVEL, __FILE__, __LINE__, __FUNCTION__, __VA_ARGS__);\
  }\
} while (0)

sen_rc sen_lex_set_mecab_args(int argc, char **argv);

#ifdef __cplusplus
}
#endif

#endif /* SENNA_H */