1 /* Copyright(C) 2004 Brazil
2 
3   This library is free software; you can redistribute it and/or
4   modify it under the terms of the GNU Lesser General Public
5   License as published by the Free Software Foundation; either
6   version 2.1 of the License, or (at your option) any later version.
7 
8   This library is distributed in the hope that it will be useful,
9   but WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11   Lesser General Public License for more details.
12 
13   You should have received a copy of the GNU Lesser General Public
14   License along with this library; if not, write to the Free Software
15   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 #ifndef SENNA_H
18 #define SENNA_H
19 
20 #ifdef  __cplusplus
21 extern "C" {
22 #endif
23 
/*
 * Flag bits for the `flags` argument of sen_index_create().
 *
 * Bits 4-7 are covered by SEN_INDEX_TOKENIZER_MASK and hold a tokenizer
 * selector (room for 16 tokenizers) rather than independent switches.
 * SEN_INDEX_MORPH_ANALYSE is selector 0, so `flags & SEN_INDEX_MORPH_ANALYSE`
 * is always 0; compare `flags & SEN_INDEX_TOKENIZER_MASK` against the
 * selector constant instead.
 */
#define SEN_INDEX_NORMALIZE                     (1 << 0)
#define SEN_INDEX_SPLIT_ALPHA                   (1 << 1)
#define SEN_INDEX_SPLIT_DIGIT                   (1 << 2)
#define SEN_INDEX_SPLIT_SYMBOL                  (1 << 3)

/* tokenizer selectors, stored in bits 4-7 */
#define SEN_INDEX_MORPH_ANALYSE                 (0 << 4)
#define SEN_INDEX_NGRAM                         (1 << 4)
#define SEN_INDEX_DELIMITED                     (2 << 4)
/* 16 tokenizers can be registered */
#define SEN_INDEX_TOKENIZER_MASK                (15 << 4)

#define SEN_INDEX_ENABLE_SUFFIX_SEARCH          (1 << 8)
#define SEN_INDEX_DISABLE_SUFFIX_SEARCH         (1 << 9)
#define SEN_INDEX_WITH_VGRAM                    (1 << 12)
#define SEN_INDEX_SHARED_LEXICON                (1 << 13)
#define SEN_INDEX_WITH_VACUUM                   (1 << 15)
40 
/* Upper bound on a sen_sym key size (unit not stated here; presumably bytes). */
#define SEN_SYM_MAX_KEY_SIZE                    8192

/* Bit 31 of an unsigned flags word (presumably the one taken by sen_sym_create()). */
#define SEN_SYM_WITH_SIS                        0x80000000

/* Flag bits whose names tie them to the sen_snip API. */
#define SEN_SNIP_NORMALIZE                      (1 << 0)
#define SEN_SNIP_COPY_TAG                       (1 << 1)
#define SEN_SNIP_SKIP_LEADING_SPACES            (1 << 2)
/* Alias: the query-scan normalize flag shares the value of SEN_SNIP_NORMALIZE. */
#define SEN_QUERY_SCAN_NORMALIZE                SEN_SNIP_NORMALIZE

#define SEN_LEX_NGRAM_UNIT_SIZE                 2
51 
/*
 * Overridable defaults.  Each macro below is defined only when the includer
 * (or the build command line) has not already defined it.
 */
#if !defined(SEN_STACK_SIZE)
#  define SEN_STACK_SIZE                        0x10000000
#endif /* !defined(SEN_STACK_SIZE) */

/* Default single-character tokens named after the query syntax. */
#if !defined(SEN_QUERY_AND)
#  define SEN_QUERY_AND '+'
#endif /* !defined(SEN_QUERY_AND) */
#if !defined(SEN_QUERY_BUT)
#  define SEN_QUERY_BUT '-'
#endif /* !defined(SEN_QUERY_BUT) */
#if !defined(SEN_QUERY_ADJ_INC)
#  define SEN_QUERY_ADJ_INC '>'
#endif /* !defined(SEN_QUERY_ADJ_INC) */
#if !defined(SEN_QUERY_ADJ_DEC)
#  define SEN_QUERY_ADJ_DEC '<'
#endif /* !defined(SEN_QUERY_ADJ_DEC) */
#if !defined(SEN_QUERY_ADJ_NEG)
#  define SEN_QUERY_ADJ_NEG '~'
#endif /* !defined(SEN_QUERY_ADJ_NEG) */
#if !defined(SEN_QUERY_PREFIX)
#  define SEN_QUERY_PREFIX '*'
#endif /* !defined(SEN_QUERY_PREFIX) */
#if !defined(SEN_QUERY_PARENL)
#  define SEN_QUERY_PARENL '('
#endif /* !defined(SEN_QUERY_PARENL) */
#if !defined(SEN_QUERY_PARENR)
#  define SEN_QUERY_PARENR ')'
#endif /* !defined(SEN_QUERY_PARENR) */
#if !defined(SEN_QUERY_QUOTEL)
#  define SEN_QUERY_QUOTEL '"'
#endif /* !defined(SEN_QUERY_QUOTEL) */
#if !defined(SEN_QUERY_QUOTER)
#  define SEN_QUERY_QUOTER '"'
#endif /* !defined(SEN_QUERY_QUOTER) */
#if !defined(SEN_QUERY_ESCAPE)
#  define SEN_QUERY_ESCAPE '\\'
#endif /* !defined(SEN_QUERY_ESCAPE) */

/* The "no entry" id: sen_sym_at() is documented to return it when nothing matches. */
#define SEN_SYM_NIL 0
91 
/*
 * Result code returned by most API functions in this header.
 * sen_success is 0; every other enumerator is non-zero.
 */
typedef enum {
  sen_success              = 0,
  sen_memory_exhausted     = 1,
  sen_invalid_format       = 2,
  sen_file_operation_error = 3,
  sen_invalid_argument     = 4,
  sen_other_error          = 5,
  sen_external_error       = 6,
  sen_internal_error       = 7,
  sen_abnormal_error       = 8,
  sen_end_of_data          = 9
} sen_rc;
104 
/*
 * Character-encoding selector passed to the create/open style functions.
 * sen_enc_default is 0.
 */
typedef enum {
  sen_enc_default = 0,
  sen_enc_none    = 1,
  sen_enc_euc_jp  = 2,
  sen_enc_utf8    = 3,
  sen_enc_sjis    = 4,
  sen_enc_latin1  = 5,
  sen_enc_koi8r   = 6
} sen_encoding;
114 
/* Record/sub-record unit selector; see sen_records_open(). */
typedef enum {
  sen_rec_document = 0,
  sen_rec_section  = 1,
  sen_rec_position = 2,
  sen_rec_userdef  = 3,
  sen_rec_none     = 4
} sen_rec_unit;
122 
/* Operator argument of sen_index_select(), sen_query_open() and sen_query_exec(). */
typedef enum {
  sen_sel_or     = 0,
  sen_sel_and    = 1,
  sen_sel_but    = 2,
  sen_sel_adjust = 3
} sen_sel_operator;
129 
/* Matching mode; stored in the `mode` member of sen_select_optarg. */
typedef enum {
  sen_sel_exact        = 0,
  sen_sel_partial      = 1,
  sen_sel_unsplit      = 2,
  sen_sel_near         = 3,
  sen_sel_near2        = 4,
  sen_sel_similar      = 5,
  sen_sel_term_extract = 6,
  sen_sel_prefix       = 7,
  sen_sel_suffix       = 8
} sen_sel_mode;
141 
/* Ordering selector used by the sort and group option structs; descending is 0. */
typedef enum {
  sen_sort_descending,  /* 0 */
  sen_sort_ascending    /* 1 */
} sen_sort_mode;
146 
/*
 * Log levels, numbered upwards from sen_log_none (0) to sen_log_dump (9).
 * SEN_LOG_DEFAULT_LEVEL in this header defaults to sen_log_notice.
 */
typedef enum {
  sen_log_none    = 0,
  sen_log_emerg   = 1,
  sen_log_alert   = 2,
  sen_log_crit    = 3,
  sen_log_error   = 4,
  sen_log_warning = 5,
  sen_log_notice  = 6,
  sen_log_info    = 7,
  sen_log_debug   = 8,
  sen_log_dump    = 9
} sen_log_level;
159 
/*
 * Handle types.  Those marked "opaque" are only forward-declared here;
 * the rest have their struct layout defined further down in this header.
 */

/* opaque handles */
typedef struct _sen_db sen_db;
typedef struct _sen_ctx sen_ctx;
typedef struct _sen_set sen_set;
typedef struct _sen_sym sen_sym;
typedef struct _sen_inv sen_inv;
typedef struct _sen_set_cursor sen_set_cursor;
typedef struct _sen_snip sen_snip;
typedef struct _sen_query sen_query;
typedef struct _sen_records_heap sen_records_heap;
typedef struct _sen_vgram sen_vgram;
typedef struct _sen_vgram_buf sen_vgram_buf;
typedef struct _sen_sym_cursor sen_sym_cursor;

/* NOTE: sen_set_eh is itself a pointer type, so `sen_set_eh *` is pointer-to-pointer. */
typedef struct _sen_set_element *sen_set_eh;

/* structs whose members are visible to callers */
typedef struct _sen_ctx_info sen_ctx_info;
typedef struct _sen_index sen_index;
typedef struct _sen_value sen_value;
typedef struct _sen_values sen_values;
typedef struct _sen_select_optarg sen_select_optarg;
typedef struct _sen_group_optarg sen_group_optarg;
typedef struct _sen_sort_optarg sen_sort_optarg;
typedef struct _sen_logger_info sen_logger_info;
typedef struct _sen_snip_mapping sen_snip_mapping;
typedef struct _sen_sym_scan_hit sen_sym_scan_hit;

/* Numeric id type used by the sen_sym API. */
typedef unsigned int sen_id;

#define SEN_ID_MAX 0x3fffffff

/* The records API is a set of aliases over the set API types. */
typedef sen_set sen_records;
typedef sen_set_eh sen_recordh;
typedef sen_sort_optarg sen_set_sort_optarg;
191 
/*
 * Context information record; sen_ctx_info_get() takes a pointer to one.
 * Member meanings are defined by the implementation and are not visible here.
 */
struct _sen_ctx_info {
  int               fd;
  unsigned          com_status;
  unsigned          com_info;
  struct _sen_rbuf *outbuf;   /* struct _sen_rbuf is not defined in this header */
  unsigned char     stat;
};
199 
200 struct _sen_index {
201   int foreign_flags;
202   sen_sym *keys;
203   sen_sym *lexicon;
204   sen_inv *inv;
205   sen_vgram *vgram;
206 };
207 
/*
 * One string with an explicit length and a weight.  These are the same three
 * items that sen_values_add() takes as arguments.
 */
struct _sen_value {
  const char *str;
  unsigned    str_len;
  unsigned    weight;
};
213 
214 struct _sen_values {
215   int n_values;
216   sen_value *values;
217 };
218 
219 struct _sen_select_optarg {
220   sen_sel_mode mode;
221   int similarity_threshold;
222   int max_interval;
223   int *weight_vector;
224   int vector_size;
225   int (*func)(sen_records *, const void *, int, void *);
226   void *func_arg;
227   int max_size;
228 };
229 
230 struct _sen_group_optarg {
231   sen_sort_mode mode;
232   int (*func)(sen_records *, const sen_recordh *, void *, void *);
233   void *func_arg;
234   int key_size;
235 };
236 
237 struct _sen_sort_optarg {
238   sen_sort_mode mode;
239   int (*compar)(sen_records *, sen_recordh *, sen_records *, sen_recordh *, void *);
240   void *compar_arg;
241 };
242 
243 struct _sen_snip_mapping {
244   void *dummy;
245 };
246 
/* Bit flags, one per log field (presumably for sen_logger_info.flags). */
#define SEN_LOG_TIME      (1 << 0)
#define SEN_LOG_TITLE     (1 << 1)
#define SEN_LOG_MESSAGE   (1 << 2)
#define SEN_LOG_LOCATION  (1 << 3)
251 
252 struct _sen_logger_info {
253   sen_log_level max_level;
254   int flags;
255   void (*func)(int, const char *, const char *, const char *, const char *, void *);
256   void *func_arg;
257 };
258 
259 struct _sen_sym_scan_hit {
260   sen_id id;
261   unsigned int offset;
262   unsigned int length;
263 };
264 
/* Callback type accepted by sen_query_term(): (string, unsigned length, opaque pointer) -> int. */
typedef int (*query_term_callback)(const char *, unsigned, void *);
266 
267 
268 /******** query language API ********/
269 
270 sen_db *sen_db_create(const char *path, int flags, sen_encoding encoding);
271 sen_db *sen_db_open(const char *path);
272 sen_rc sen_db_close(sen_db *s);
273 
/*
 * Two independent flag families for the sen_ctx API.  Their numeric values
 * overlap, so they are not meant to be mixed in one flags word.
 * NOTE(review): which function takes which family is not visible here; confirm.
 */
#define SEN_CTX_MORE  (1 << 0)
#define SEN_CTX_TAIL  (1 << 1)
#define SEN_CTX_HEAD  (1 << 2)
#define SEN_CTX_QUIET (1 << 3)
#define SEN_CTX_QUIT  (1 << 4)

#define SEN_CTX_USEQL     (1 << 0)
#define SEN_CTX_BATCHMODE (1 << 1)
282 
283 sen_ctx *sen_ctx_open(sen_db *db, int flags);
284 sen_ctx *sen_ctx_connect(const char *host, int port, int flags);
285 sen_rc sen_ctx_load(sen_ctx *c, const char *path);
286 sen_rc sen_ctx_send(sen_ctx *c, char *str, unsigned int str_len, int flags);
287 sen_rc sen_ctx_recv(sen_ctx *c, char **str, unsigned int *str_len, int *flags);
288 sen_rc sen_ctx_close(sen_ctx *c);
289 sen_rc sen_ctx_info_get(sen_ctx *c, sen_ctx_info *info);
290 
291 /******** basic API ********/
292 
293 sen_rc sen_init(void);
294 sen_rc sen_fin(void);
295 sen_rc sen_info(char **version,
296                 char **configure_options,
297                 char **config_path,
298                 sen_encoding *default_encoding,
299                 unsigned int *initial_n_segments,
300                 unsigned int *partial_match_threshold);
301 
302 sen_index *sen_index_create(const char *path, int key_size, int flags,
303                             int initial_n_segments, sen_encoding encoding);
304 sen_index *sen_index_open(const char *path);
305 sen_rc sen_index_close(sen_index *i);
306 sen_rc sen_index_remove(const char *path);
307 sen_rc sen_index_rename(const char *old_name, const char *new_name);
308 sen_rc sen_index_upd(sen_index *i, const void *key,
309                      const char *oldvalue, unsigned int oldvalue_len,
310                      const char *newvalue, unsigned int newvalue_len);
311 sen_records *sen_index_sel(sen_index *i,
312                            const char *string, unsigned int string_len);
313 #ifdef USE_QUERY_ABORT
314 void sen_index_set_abort_callback(sen_index *i, int (*cb)(void*), void *arg);
315 #endif /* USE_QUERY_ABORT */
316 int sen_records_next(sen_records *r, void *keybuf, int buf_size, int *score);
317 sen_rc sen_records_rewind(sen_records *r);
318 int sen_records_curr_score(sen_records *r);
319 int sen_records_curr_key(sen_records *r, void *keybuf, int buf_size);
320 int sen_records_nhits(sen_records *r);
321 int sen_records_find(sen_records *r, const void *key);
322 sen_rc sen_records_close(sen_records *r);
323 
324 /******** advanced API ********/
325 
326 sen_values *sen_values_open(void);
327 sen_rc sen_values_close(sen_values *v);
328 sen_rc sen_values_add(sen_values *v,
329                       const char *str, unsigned int str_len,
330                       unsigned int weight);
331 
332 sen_records *sen_records_open(sen_rec_unit record_unit,
333                               sen_rec_unit subrec_unit,
334                               unsigned int max_n_subrecs);
335 sen_records *sen_records_union(sen_records *a, sen_records *b);
336 sen_records *sen_records_subtract(sen_records *a, sen_records *b);
337 sen_records *sen_records_intersect(sen_records *a, sen_records *b);
338 int sen_records_difference(sen_records *a, sen_records *b);
339 sen_rc sen_records_sort(sen_records *r, int limit, sen_sort_optarg *optarg);
340 sen_rc sen_records_group(sen_records *r, int limit, sen_group_optarg *optarg);
341 const sen_recordh *sen_records_curr_rec(sen_records *r);
342 const sen_recordh *sen_records_at(sen_records *r, const void *key,
343                                    unsigned section, unsigned pos,
344                                    int *score, int *n_subrecs);
345 sen_rc sen_record_info(sen_records *r, const sen_recordh *rh,
346                        void *keybuf, int buf_size, int *key_size,
347                        int *section, int *pos, int *score, int *n_subrecs);
348 sen_rc sen_record_subrec_info(sen_records *r, const sen_recordh *rh,
349                               int index, void *keybuf, int buf_size,
350                               int *key_size, int *section, int *pos, int *score);
351 sen_index *sen_index_create_with_keys(const char *path, sen_sym *keys, int flags,
352                                       int initial_n_segments, sen_encoding encoding);
353 sen_index *sen_index_open_with_keys(const char *path, sen_sym *keys);
354 sen_index *sen_index_create_with_keys_lexicon(const char *path, sen_sym *keys,
355                                               sen_sym *lexicon, int initial_n_segments);
356 sen_index *sen_index_open_with_keys_lexicon(const char *path, sen_sym *keys,
357                                             sen_sym *lexicon);
358 sen_rc sen_index_update(sen_index *i, const void *key, unsigned int section,
359                         sen_values *oldvalues, sen_values *newvalues);
360 sen_rc sen_index_select(sen_index *i,
361                         const char *string, unsigned int string_len,
362                         sen_records *r,
363                         sen_sel_operator op, sen_select_optarg *optarg);
364 sen_rc sen_index_info(sen_index *i, int *key_size, int *flags,
365                       int *initial_n_segments, sen_encoding *encoding,
366                       unsigned *nrecords_keys, unsigned *file_size_keys,
367                       unsigned *nrecords_lexicon, unsigned *file_size_lexicon,
368                       unsigned long long *inv_seg_size,
369                       unsigned long long *inv_chunk_size);
370 int sen_index_path(sen_index *i, char *pathbuf, int buf_size);
371 
372 sen_query *sen_query_open(const char *str, unsigned int str_len,
373                           sen_sel_operator default_op,
374                           int max_exprs, sen_encoding encoding);
375 unsigned int sen_query_rest(sen_query *q, const char ** const rest);
376 sen_rc sen_query_close(sen_query *q);
377 sen_rc sen_query_exec(sen_index *i, sen_query *q, sen_records *r, sen_sel_operator op);
378 void sen_query_term(sen_query *q, query_term_callback func, void *func_arg);
379 sen_rc sen_query_scan(sen_query *q, const char **strs, unsigned int *str_lens,
380                       unsigned int nstrs, int flags, int *found, int *score);
381 sen_snip *sen_query_snip(sen_query *query, int flags,
382                          unsigned int width, unsigned int max_results,
383                          unsigned int n_tags,
384                          const char **opentags, unsigned int *opentag_lens,
385                          const char **closetags, unsigned int *closetag_lens,
386                          sen_snip_mapping *mapping);
387 
388 sen_rc sen_index_del(sen_index *i, const void *key);
389 
390 /******** low level API ********/
391 
392 sen_set *sen_set_open(unsigned key_size, unsigned value_size, unsigned init_size);
393 sen_rc sen_set_close(sen_set *set);
394 sen_rc sen_set_info(sen_set *set, unsigned *key_size,
395                     unsigned *value_size, unsigned *n_entries);
396 sen_set_eh *sen_set_get(sen_set *set, const void *key, void **value);
397 sen_set_eh *sen_set_at(sen_set *set, const void *key, void **value);
398 sen_rc sen_set_del(sen_set *set, sen_set_eh *e);
399 sen_set_cursor *sen_set_cursor_open(sen_set *set);
400 sen_set_eh *sen_set_cursor_next(sen_set_cursor *cursor, void **key, void **value);
401 sen_rc sen_set_cursor_close(sen_set_cursor *cursor);
402 sen_rc sen_set_element_info(sen_set *set, const sen_set_eh *e,
403                             void **key, void **value);
404 sen_set *sen_set_union(sen_set *a, sen_set *b);
405 sen_set *sen_set_subtract(sen_set *a, sen_set *b);
406 sen_set *sen_set_intersect(sen_set *a, sen_set *b);
407 int sen_set_difference(sen_set *a, sen_set *b);
408 sen_set_eh *sen_set_sort(sen_set *set, int limit, sen_set_sort_optarg *optarg);
409 
410 sen_sym *sen_sym_create(const char *path, unsigned key_size,
411                          unsigned flags, sen_encoding encoding);
412 sen_sym *sen_sym_open(const char *path);
413 sen_rc sen_sym_info(sen_sym *sym, int *key_size, unsigned *flags,
414                     sen_encoding *encoding, unsigned *nrecords, unsigned *file_size);
415 sen_rc sen_sym_close(sen_sym *sym);
416 sen_rc sen_sym_remove(const char *path);
417 
418 /* Lookup the sym table and find the id of the corresponding entry.
419  * If no matches are found, create a new entry, and return that ID
420  */
421 sen_id sen_sym_get(sen_sym *sym, const void *key);
422 
423 /* Lookup the sym table and find the id of the corresponding entry.
424  * If no matches are found return SEN_SYM_NIL
425  */
426 sen_id sen_sym_at(sen_sym *sym, const void *key);
427 sen_rc sen_sym_del(sen_sym *sym, const void *key);
428 unsigned int sen_sym_size(sen_sym *sym);
429 int sen_sym_key(sen_sym *sym, sen_id id, void *keybuf, int buf_size);
430 sen_set *sen_sym_prefix_search(sen_sym *sym, const void *key);
431 sen_set *sen_sym_suffix_search(sen_sym *sym, const void *key);
432 sen_id sen_sym_common_prefix_search(sen_sym *sym, const void *key);
433 int sen_sym_pocket_get(sen_sym *sym, sen_id id);
434 sen_rc sen_sym_pocket_set(sen_sym *sym, sen_id id, unsigned int value);
435 sen_id sen_sym_next(sen_sym *sym, sen_id id);
436 int sen_sym_scan(sen_sym *sym, const char *str, unsigned int str_len,
437                  sen_sym_scan_hit *sh, unsigned int sh_size, const char **rest);
438 
/*
 * Flag values in three groups: bit 0 (direction), bit 1 (GE/GT) and
 * bit 2 (LE/LT); presumably for sen_sym_cursor_open().  In each group the
 * first name is 0, so it is what you get by leaving the bit clear and it
 * cannot be tested with `flags & NAME`.
 */
#define SEN_SYM_DESCENDING (0 << 0)
#define SEN_SYM_ASCENDING  (1 << 0)
#define SEN_SYM_GE         (0 << 1)
#define SEN_SYM_GT         (1 << 1)
#define SEN_SYM_LE         (0 << 2)
#define SEN_SYM_LT         (1 << 2)
445 
446 sen_sym_cursor *sen_sym_cursor_open(sen_sym *sym, sen_ctx *ctx,
447                                     const void *min, const void *max, int flags);
448 sen_id sen_sym_cursor_next(sen_sym_cursor *c);
449 void sen_sym_cursor_close(sen_sym_cursor *c);
450 
451 /******** utility API ********/
452 sen_snip *sen_snip_open(sen_encoding encoding, int flags, unsigned int width,
453                         unsigned int max_results,
454                         const char *defaultopentag, unsigned int defaultopentag_len,
455                         const char *defaultclosetag, unsigned int defaultclosetag_len,
456                         sen_snip_mapping *mapping);
457 sen_rc sen_snip_close(sen_snip *snip);
458 sen_rc sen_snip_add_cond(sen_snip *snip,
459                          const char *keyword, unsigned int keyword_len,
460                          const char *opentag, unsigned int opentag_len,
461                          const char *closetag, unsigned int closetag_len);
462 sen_rc sen_snip_exec(sen_snip *snip,
463                      const char *string, unsigned int string_len,
464                      unsigned int *nresults, unsigned int *max_tagged_len);
465 sen_rc sen_snip_get_result(sen_snip *snip, const unsigned int index,
466                            char *result, unsigned int *result_len);
467 
468 sen_records_heap *sen_records_heap_open(int size, int limit, sen_sort_optarg *optarg);
469 sen_rc sen_records_heap_add(sen_records_heap *h, sen_records *r);
470 int sen_records_heap_next(sen_records_heap *h);
471 sen_records *sen_records_heap_head(sen_records_heap *h);
472 sen_rc sen_records_heap_close(sen_records_heap *h);
473 
474 int sen_inv_entry_info(sen_inv *inv, sen_id key, unsigned *a, unsigned *pocket,
475                        unsigned *chunk, unsigned *chunk_size, unsigned *buffer_free,
476                        unsigned *nterms, unsigned *nterms_void, unsigned *tid,
477                        unsigned *size_in_chunk, unsigned *pos_in_chunk,
478                        unsigned *size_in_buffer, unsigned *pos_in_buffer);
479 
/* Flag bits for the `flags` argument of sen_str_normalize(). */
#define SEN_STR_REMOVEBLANK (1 << 0)
#define SEN_STR_WITH_CTYPES (1 << 1)
#define SEN_STR_WITH_CHECKS (1 << 2)
484 int sen_str_normalize(const char *str, unsigned int str_len,
485                       sen_encoding encoding, int flags,
486                       char *nstrbuf, int buf_size);
487 unsigned int sen_str_charlen(const char *str, sen_encoding encoding);
488 
489 /* misc */
490 
491 sen_rc sen_logger_info_set(const sen_logger_info *info);
492 
493 void sen_logger_put(sen_log_level level,
494                     const char *file, int line, const char *func, char *fmt, ...);
495 
496 int sen_logger_pass(sen_log_level level);
497 
/*
 * SEN_LOG(level, fmt, ...) - emit a log record at `level`.
 *
 * sen_logger_put() is called only when sen_logger_pass(level) is non-zero,
 * so the format arguments are not evaluated for filtered-out levels.
 * `level` itself is evaluated twice; pass an expression without side effects.
 * At least a format string must follow `level`.
 *
 * The body is wrapped in do { } while (0) so the macro acts as a single
 * statement: `if (x) SEN_LOG(...); else ...` compiles and the `else` binds
 * to the caller's `if`.  Invocations must end with a semicolon.
 */
#define SEN_LOG(level,...) \
do {\
  if (sen_logger_pass(level)) {\
    sen_logger_put((level), __FILE__, __LINE__, __FUNCTION__, __VA_ARGS__);\
  }\
} while (0)
502 
/* Level used by sen_log(); define it before including this header to override. */
#ifndef SEN_LOG_DEFAULT_LEVEL
#define SEN_LOG_DEFAULT_LEVEL sen_log_notice
#endif /* SEN_LOG_DEFAULT_LEVEL */

/*
 * sen_log(fmt, ...) - emit a log record at SEN_LOG_DEFAULT_LEVEL.
 *
 * sen_logger_put() is called only when sen_logger_pass() accepts the default
 * level.  At least a format string must be supplied.
 *
 * The body is wrapped in do { } while (0) so the macro acts as a single
 * statement: `if (x) sen_log(...); else ...` compiles and the `else` binds
 * to the caller's `if`.  Invocations must end with a semicolon.
 */
#define sen_log(...) \
do {\
  if (sen_logger_pass(SEN_LOG_DEFAULT_LEVEL)) {\
    sen_logger_put(SEN_LOG_DEFAULT_LEVEL, __FILE__, __LINE__, __FUNCTION__, __VA_ARGS__);\
  }\
} while (0)
511 
512 sen_rc sen_lex_set_mecab_args(int argc, char **argv);
513 
514 #ifdef __cplusplus
515 }
516 #endif
517 
518 #endif /* SENNA_H */
519