1 /* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
2
3 #include "lib.h"
4 #include "array.h"
5 #include "istream.h"
6 #include "str.h"
7 #include "strfuncs.h"
8 #include "fts-tokenizer.h"
9 #include "fts-tokenizer-private.h"
10
/* Registry of tokenizer classes known to this file. It is created by the
   first fts_tokenizer_register() call and freed either by
   fts_tokenizers_deinit() or when fts_tokenizer_unregister() removes the
   last remaining class. */
static ARRAY(const struct fts_tokenizer *) fts_tokenizer_classes;
12
fts_tokenizers_init(void)13 void fts_tokenizers_init(void)
14 {
15 if (!array_is_created(&fts_tokenizer_classes)) {
16 fts_tokenizer_register(fts_tokenizer_generic);
17 fts_tokenizer_register(fts_tokenizer_email_address);
18 }
19 }
20
fts_tokenizers_deinit(void)21 void fts_tokenizers_deinit(void)
22 {
23 if (array_is_created(&fts_tokenizer_classes))
24 array_free(&fts_tokenizer_classes);
25 }
26
27 /* private */
fts_tokenizer_register(const struct fts_tokenizer * tok_class)28 void fts_tokenizer_register(const struct fts_tokenizer *tok_class)
29 {
30 if (!array_is_created(&fts_tokenizer_classes))
31 i_array_init(&fts_tokenizer_classes, FTS_TOKENIZER_CLASSES_NR);
32 array_push_back(&fts_tokenizer_classes, &tok_class);
33 }
34
35 /* private */
fts_tokenizer_unregister(const struct fts_tokenizer * tok_class)36 void fts_tokenizer_unregister(const struct fts_tokenizer *tok_class)
37 {
38 const struct fts_tokenizer *const *tp;
39 unsigned int idx;
40
41 array_foreach(&fts_tokenizer_classes, tp) {
42 if (strcmp((*tp)->name, tok_class->name) == 0) {
43 idx = array_foreach_idx(&fts_tokenizer_classes, tp);
44 array_delete(&fts_tokenizer_classes, idx, 1);
45 if (array_count(&fts_tokenizer_classes) == 0)
46 array_free(&fts_tokenizer_classes);
47 return;
48 }
49 }
50 i_unreached();
51 }
52
fts_tokenizer_find(const char * name)53 const struct fts_tokenizer *fts_tokenizer_find(const char *name)
54 {
55 const struct fts_tokenizer *tok;
56
57 array_foreach_elem(&fts_tokenizer_classes, tok) {
58 if (strcmp(tok->name, name) == 0)
59 return tok;
60 }
61 return NULL;
62 }
63
fts_tokenizer_name(const struct fts_tokenizer * tok)64 const char *fts_tokenizer_name(const struct fts_tokenizer *tok)
65 {
66 return tok->name;
67 }
68
fts_tokenizer_self_reset(struct fts_tokenizer * tok)69 static void fts_tokenizer_self_reset(struct fts_tokenizer *tok)
70 {
71 tok->prev_data = NULL;
72 tok->prev_size = 0;
73 tok->prev_skip = 0;
74 tok->prev_reply_finished = TRUE;
75 }
76
fts_tokenizer_create(const struct fts_tokenizer * tok_class,struct fts_tokenizer * parent,const char * const * settings,struct fts_tokenizer ** tokenizer_r,const char ** error_r)77 int fts_tokenizer_create(const struct fts_tokenizer *tok_class,
78 struct fts_tokenizer *parent,
79 const char *const *settings,
80 struct fts_tokenizer **tokenizer_r,
81 const char **error_r)
82 {
83 struct fts_tokenizer *tok;
84 const char *empty_settings = NULL;
85
86 i_assert(settings == NULL || str_array_length(settings) % 2 == 0);
87
88 if (settings == NULL)
89 settings = &empty_settings;
90
91 if (tok_class->v->create(settings, &tok, error_r) < 0) {
92 *tokenizer_r = NULL;
93 return -1;
94 }
95 tok->refcount = 1;
96 fts_tokenizer_self_reset(tok);
97 if (parent != NULL) {
98 fts_tokenizer_ref(parent);
99 tok->parent = parent;
100 tok->parent_input = buffer_create_dynamic(default_pool, 128);
101 }
102
103 *tokenizer_r = tok;
104 return 0;
105 }
106
fts_tokenizer_ref(struct fts_tokenizer * tok)107 void fts_tokenizer_ref(struct fts_tokenizer *tok)
108 {
109 i_assert(tok->refcount > 0);
110
111 tok->refcount++;
112 }
113
fts_tokenizer_unref(struct fts_tokenizer ** _tok)114 void fts_tokenizer_unref(struct fts_tokenizer **_tok)
115 {
116 struct fts_tokenizer *tok = *_tok;
117
118 i_assert(tok->refcount > 0);
119 *_tok = NULL;
120
121 if (--tok->refcount > 0)
122 return;
123
124 buffer_free(&tok->parent_input);
125 if (tok->parent != NULL)
126 fts_tokenizer_unref(&tok->parent);
127 tok->v->destroy(tok);
128 }
129
130 static int
fts_tokenizer_next_self(struct fts_tokenizer * tok,const unsigned char * data,size_t size,const char ** token_r,const char ** error_r)131 fts_tokenizer_next_self(struct fts_tokenizer *tok,
132 const unsigned char *data, size_t size,
133 const char **token_r, const char **error_r)
134 {
135 int ret = 0;
136 size_t skip = 0;
137
138 i_assert(tok->prev_reply_finished ||
139 (data == tok->prev_data && size == tok->prev_size));
140
141 if (tok->prev_reply_finished) {
142 /* whole new data */
143 ret = tok->v->next(tok, data, size, &skip, token_r, error_r);
144 } else {
145 /* continuing previous data */
146 i_assert(tok->prev_skip <= size);
147
148 const unsigned char *data_next;
149 if (data != NULL)
150 data_next = data + tok->prev_skip;
151 else {
152 i_assert(tok->prev_skip == 0 && size == 0);
153 data_next = NULL;
154 }
155 ret = tok->v->next(tok, data_next,
156 size - tok->prev_skip, &skip,
157 token_r, error_r);
158 }
159
160 if (ret > 0) {
161 i_assert(skip <= size - tok->prev_skip);
162 tok->prev_data = data;
163 tok->prev_size = size;
164 tok->prev_skip = tok->prev_skip + skip;
165 tok->prev_reply_finished = FALSE;
166 } else if (ret == 0) {
167 /* we need a new data block */
168 fts_tokenizer_self_reset(tok);
169 }
170 return ret;
171 }
172
fts_tokenizer_reset(struct fts_tokenizer * tok)173 void fts_tokenizer_reset(struct fts_tokenizer *tok)
174 {
175 tok->v->reset(tok);
176 fts_tokenizer_self_reset(tok);
177 }
178
fts_tokenizer_next(struct fts_tokenizer * tok,const unsigned char * data,size_t size,const char ** token_r,const char ** error_r)179 int fts_tokenizer_next(struct fts_tokenizer *tok,
180 const unsigned char *data, size_t size,
181 const char **token_r, const char **error_r)
182 {
183 int ret;
184
185 switch (tok->parent_state) {
186 case FTS_TOKENIZER_PARENT_STATE_ADD_DATA:
187 ret = fts_tokenizer_next_self(tok, data, size, token_r, error_r);
188 if (ret <= 0 || tok->parent == NULL || tok->skip_parents)
189 break;
190 buffer_set_used_size(tok->parent_input, 0);
191 buffer_append(tok->parent_input, *token_r, strlen(*token_r));
192 tok->parent_state++;
193 /* fall through */
194 case FTS_TOKENIZER_PARENT_STATE_NEXT_OUTPUT:
195 ret = fts_tokenizer_next(tok->parent, tok->parent_input->data,
196 tok->parent_input->used, token_r, error_r);
197 if (ret != 0)
198 break;
199 tok->parent_state++;
200 /* fall through */
201 case FTS_TOKENIZER_PARENT_STATE_FINALIZE:
202 ret = fts_tokenizer_next(tok->parent, NULL, 0, token_r, error_r);
203 if (ret != 0)
204 break;
205 /* we're finished sending this token to parent tokenizer.
206 see if our own tokenizer has more tokens available */
207 tok->parent_state = FTS_TOKENIZER_PARENT_STATE_ADD_DATA;
208 return fts_tokenizer_next(tok, data, size, token_r, error_r);
209 default:
210 i_unreached();
211 }
212 /* we must not be returning empty tokens */
213 i_assert(ret <= 0 || (*token_r)[0] != '\0');
214 return ret;
215 }
216
/* Convenience wrapper: call fts_tokenizer_next() with NULL data and a
   size of 0, and return its result unchanged. */
int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
			const char **error_r)
{
	int ret;

	ret = fts_tokenizer_next(tok, NULL, 0, token_r, error_r);
	return ret;
}
222