1 #ifndef FTS_TOKENIZER_H
2 #define FTS_TOKENIZER_H
3 
/*
 Settings are given in the form of a const char * const *settings =
 {"key", "value", "key2", "value2", NULL} array of string pairs. Some
 keys, like "no_parent" and "search", are a sort of boolean: the
 value does not matter, just mentioning the key enables the functionality.
 The array has to be NULL terminated.
*/
/* Email address header tokenizer that returns "user@domain.org" input as
   "user@domain.org" token as well as passing it through to the parent
   (generic) tokenizer, which also returns "user", "domain" and "org".
   This allows searching the mails with their individual components, but also
   allows doing an explicit "user@domain" search, which returns only mails
   matching that exact address (instead of e.g. a mail with both user@domain2
   and user2@domain words). */
/* Settings:
   "no_parent" Return only our tokens, no data for parent to process.
   Defaults to disabled. Should normally not be needed.

   "search" Remove addresses from parent data stream, so they are not processed
   further. Defaults to disabled. Enable by defining the keyword (and any
   value). */
extern const struct fts_tokenizer *fts_tokenizer_email_address;
26 
/* Generic email content tokenizer. Cuts text into tokens. */
/* Settings:
   "maxlen" Maximum length of token, before an arbitrary cut off is made.
   Defaults to FTS_DEFAULT_TOKEN_MAX_LENGTH.

   "algorithm" Accepted values are "simple" or "tr29". Defines the
   method for looking for word boundaries. "simple" is faster and will
   work for many texts, especially those using Latin alphabets, but
   leaves corner cases. "tr29" implements a version of the Unicode
   Technical Report 29 word boundary lookup. It might work better with
   e.g. texts containing Katakana or Hebrew characters, but it is not
   possible to use a single algorithm for all existing languages. It
   is also significantly slower than "simple". The algorithms also
   differ in some details, e.g. "simple" will cut "a.b" and "tr29" will
   not. The default is "simple". */
extern const struct fts_tokenizer *fts_tokenizer_generic;
43 
/*
 Tokenizing workflow, find --> create --> filter --> destroy.
 Do init before first use and deinit after all done.
 */

/* Register all built-in tokenizers. Call before using any of the functions
   below. */
void fts_tokenizers_init(void);
/* Unregister the built-in tokenizers. Call after all tokenizer use is
   finished. */
void fts_tokenizers_deinit(void);

/* Look up a registered tokenizer by its name.
   NOTE(review): presumably returns NULL when no tokenizer with the given
   name is registered — confirm against the implementation. */
const struct fts_tokenizer *fts_tokenizer_find(const char *name);
54 
/* Create a new tokenizer. The settings are described above.

   tok_class:   which tokenizer implementation to instantiate, e.g. one of
                the externs above or a result of fts_tokenizer_find().
   parent:      tokenizer that further processes this tokenizer's output
                (NOTE(review): presumably may be NULL for no parent —
                confirm against the implementation).
   settings:    NULL-terminated array of key/value string pairs, see the
                comment at the top of this file.
   tokenizer_r: the newly created tokenizer is returned here on success.
   error_r:     human-readable error message on failure.

   NOTE(review): the return convention appears to be 0 on success and -1
   with *error_r set on failure — confirm against the implementation. */
int fts_tokenizer_create(const struct fts_tokenizer *tok_class,
			 struct fts_tokenizer *parent,
			 const char *const *settings,
			 struct fts_tokenizer **tokenizer_r,
			 const char **error_r);
/* Increase the tokenizer's reference count. */
void fts_tokenizer_ref(struct fts_tokenizer *tok);
/* Decrease the tokenizer's reference count, destroying it when the count
   drops to zero. Takes a pointer-to-pointer (NOTE(review): presumably so
   *tok can be set to NULL — confirm against the implementation). */
void fts_tokenizer_unref(struct fts_tokenizer **tok);
63 
/* Reset FTS tokenizer state, so the tokenizer can be used to tokenize a
   new, unrelated input stream. */
void fts_tokenizer_reset(struct fts_tokenizer *tok);
66 
/*
   Feed input data to the tokenizer and attempt to get the next token.

   Returns 1 if *token_r was returned, 0 if more data is needed, -1 on error.

   This function should be called with the same data+size until it
   returns 0 (i.e. each call may emit one token; repeat until the input is
   fully consumed). After that fts_tokenizer_final() should be called until
   it returns 0 to flush out the final token(s).

   data must contain only valid complete UTF-8 sequences, but otherwise it
   may be broken into however small pieces. (Input to this function typically
   comes from message-decoder, which returns only complete UTF-8 sequences.) */
int fts_tokenizer_next(struct fts_tokenizer *tok,
		       const unsigned char *data, size_t size,
		       const char **token_r, const char **error_r);
/* Flush any pending token(s) after all input has been fed via
   fts_tokenizer_next(). Call repeatedly until it returns 0.
   Returns same as fts_tokenizer_next(). */
int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
			const char **error_r);

/* Returns the name of the given tokenizer. */
const char *fts_tokenizer_name(const struct fts_tokenizer *tok);
86 
87 #endif
88