1 /* Copyright (c) 2014-2018 Dovecot authors, see the included COPYING file */
2 
3 #include "lib.h"
4 #include "array.h"
5 #include "istream.h"
6 #include "str.h"
7 #include "strfuncs.h"
8 #include "fts-tokenizer.h"
9 #include "fts-tokenizer-private.h"
10 
/* Registry of the known tokenizer classes. It is created by the first
   fts_tokenizer_register() call and freed either by fts_tokenizers_deinit()
   or when fts_tokenizer_unregister() removes the last class, so it may not
   exist at any given moment. */
static ARRAY(const struct fts_tokenizer *) fts_tokenizer_classes;
12 
fts_tokenizers_init(void)13 void fts_tokenizers_init(void)
14 {
15 	if (!array_is_created(&fts_tokenizer_classes)) {
16 		fts_tokenizer_register(fts_tokenizer_generic);
17 		fts_tokenizer_register(fts_tokenizer_email_address);
18 	}
19 }
20 
fts_tokenizers_deinit(void)21 void fts_tokenizers_deinit(void)
22 {
23 	if (array_is_created(&fts_tokenizer_classes))
24 		array_free(&fts_tokenizer_classes);
25 }
26 
27 /* private */
fts_tokenizer_register(const struct fts_tokenizer * tok_class)28 void fts_tokenizer_register(const struct fts_tokenizer *tok_class)
29 {
30 	if (!array_is_created(&fts_tokenizer_classes))
31 		i_array_init(&fts_tokenizer_classes, FTS_TOKENIZER_CLASSES_NR);
32 	array_push_back(&fts_tokenizer_classes, &tok_class);
33 }
34 
35 /* private */
fts_tokenizer_unregister(const struct fts_tokenizer * tok_class)36 void fts_tokenizer_unregister(const struct fts_tokenizer *tok_class)
37 {
38 	const struct fts_tokenizer *const *tp;
39 	unsigned int idx;
40 
41 	array_foreach(&fts_tokenizer_classes, tp) {
42 		if (strcmp((*tp)->name, tok_class->name) == 0) {
43 			idx = array_foreach_idx(&fts_tokenizer_classes, tp);
44 			array_delete(&fts_tokenizer_classes, idx, 1);
45 			if (array_count(&fts_tokenizer_classes) == 0)
46 				array_free(&fts_tokenizer_classes);
47 			return;
48 		}
49 	}
50 	i_unreached();
51 }
52 
fts_tokenizer_find(const char * name)53 const struct fts_tokenizer *fts_tokenizer_find(const char *name)
54 {
55 	const struct fts_tokenizer *tok;
56 
57 	array_foreach_elem(&fts_tokenizer_classes, tok) {
58 		if (strcmp(tok->name, name) == 0)
59 			return tok;
60 	}
61 	return NULL;
62 }
63 
fts_tokenizer_name(const struct fts_tokenizer * tok)64 const char *fts_tokenizer_name(const struct fts_tokenizer *tok)
65 {
66 	return tok->name;
67 }
68 
fts_tokenizer_self_reset(struct fts_tokenizer * tok)69 static void fts_tokenizer_self_reset(struct fts_tokenizer *tok)
70 {
71 	tok->prev_data = NULL;
72 	tok->prev_size = 0;
73 	tok->prev_skip = 0;
74 	tok->prev_reply_finished = TRUE;
75 }
76 
fts_tokenizer_create(const struct fts_tokenizer * tok_class,struct fts_tokenizer * parent,const char * const * settings,struct fts_tokenizer ** tokenizer_r,const char ** error_r)77 int fts_tokenizer_create(const struct fts_tokenizer *tok_class,
78 			 struct fts_tokenizer *parent,
79 			 const char *const *settings,
80 			 struct fts_tokenizer **tokenizer_r,
81 			 const char **error_r)
82 {
83 	struct fts_tokenizer *tok;
84 	const char *empty_settings = NULL;
85 
86 	i_assert(settings == NULL || str_array_length(settings) % 2 == 0);
87 
88 	if (settings == NULL)
89 		settings = &empty_settings;
90 
91 	if (tok_class->v->create(settings, &tok, error_r) < 0) {
92 		*tokenizer_r = NULL;
93 		return -1;
94 	}
95 	tok->refcount = 1;
96 	fts_tokenizer_self_reset(tok);
97 	if (parent != NULL) {
98 		fts_tokenizer_ref(parent);
99 		tok->parent = parent;
100 		tok->parent_input = buffer_create_dynamic(default_pool, 128);
101 	}
102 
103 	*tokenizer_r = tok;
104 	return 0;
105 }
106 
fts_tokenizer_ref(struct fts_tokenizer * tok)107 void fts_tokenizer_ref(struct fts_tokenizer *tok)
108 {
109 	i_assert(tok->refcount > 0);
110 
111 	tok->refcount++;
112 }
113 
fts_tokenizer_unref(struct fts_tokenizer ** _tok)114 void fts_tokenizer_unref(struct fts_tokenizer **_tok)
115 {
116 	struct fts_tokenizer *tok = *_tok;
117 
118 	i_assert(tok->refcount > 0);
119 	*_tok = NULL;
120 
121 	if (--tok->refcount > 0)
122 		return;
123 
124 	buffer_free(&tok->parent_input);
125 	if (tok->parent != NULL)
126 		fts_tokenizer_unref(&tok->parent);
127 	tok->v->destroy(tok);
128 }
129 
130 static int
fts_tokenizer_next_self(struct fts_tokenizer * tok,const unsigned char * data,size_t size,const char ** token_r,const char ** error_r)131 fts_tokenizer_next_self(struct fts_tokenizer *tok,
132                         const unsigned char *data, size_t size,
133                         const char **token_r, const char **error_r)
134 {
135 	int ret = 0;
136 	size_t skip = 0;
137 
138 	i_assert(tok->prev_reply_finished ||
139 		 (data == tok->prev_data && size == tok->prev_size));
140 
141 	if (tok->prev_reply_finished) {
142 		/* whole new data */
143 		ret = tok->v->next(tok, data, size, &skip, token_r, error_r);
144 	} else {
145 		/* continuing previous data */
146 		i_assert(tok->prev_skip <= size);
147 
148 		const unsigned char *data_next;
149 		if (data != NULL)
150 			data_next = data + tok->prev_skip;
151 		else {
152 			i_assert(tok->prev_skip == 0 && size == 0);
153 			data_next = NULL;
154 		}
155 		ret = tok->v->next(tok, data_next,
156 				   size - tok->prev_skip, &skip,
157 				   token_r, error_r);
158 	}
159 
160 	if (ret > 0) {
161 		i_assert(skip <= size - tok->prev_skip);
162 		tok->prev_data = data;
163 		tok->prev_size = size;
164 		tok->prev_skip = tok->prev_skip + skip;
165 		tok->prev_reply_finished = FALSE;
166 	} else if (ret == 0) {
167 		/* we need a new data block */
168 		fts_tokenizer_self_reset(tok);
169 	}
170 	return ret;
171 }
172 
fts_tokenizer_reset(struct fts_tokenizer * tok)173 void fts_tokenizer_reset(struct fts_tokenizer *tok)
174 {
175 	tok->v->reset(tok);
176 	fts_tokenizer_self_reset(tok);
177 }
178 
fts_tokenizer_next(struct fts_tokenizer * tok,const unsigned char * data,size_t size,const char ** token_r,const char ** error_r)179 int fts_tokenizer_next(struct fts_tokenizer *tok,
180 		       const unsigned char *data, size_t size,
181 		       const char **token_r, const char **error_r)
182 {
183 	int ret;
184 
185 	switch (tok->parent_state) {
186 	case FTS_TOKENIZER_PARENT_STATE_ADD_DATA:
187 		ret = fts_tokenizer_next_self(tok, data, size, token_r, error_r);
188 		if (ret <= 0 || tok->parent == NULL || tok->skip_parents)
189 			break;
190 		buffer_set_used_size(tok->parent_input, 0);
191 		buffer_append(tok->parent_input, *token_r, strlen(*token_r));
192 		tok->parent_state++;
193 		/* fall through */
194 	case FTS_TOKENIZER_PARENT_STATE_NEXT_OUTPUT:
195 		ret = fts_tokenizer_next(tok->parent, tok->parent_input->data,
196 		                         tok->parent_input->used, token_r, error_r);
197 		if (ret != 0)
198 			break;
199 		tok->parent_state++;
200 		/* fall through */
201 	case FTS_TOKENIZER_PARENT_STATE_FINALIZE:
202 		ret = fts_tokenizer_next(tok->parent, NULL, 0, token_r, error_r);
203 		if (ret != 0)
204 			break;
205 		/* we're finished sending this token to parent tokenizer.
206 		   see if our own tokenizer has more tokens available */
207 		tok->parent_state = FTS_TOKENIZER_PARENT_STATE_ADD_DATA;
208 		return fts_tokenizer_next(tok, data, size, token_r, error_r);
209 	default:
210 		i_unreached();
211 	}
212 	/* we must not be returning empty tokens */
213 	i_assert(ret <= 0 || (*token_r)[0] != '\0');
214 	return ret;
215 }
216 
/* Get the next of the remaining tokens once all input has been given.
   This is fts_tokenizer_next() called with no input (NULL data, size 0);
   the return value is that call's return value. */
int fts_tokenizer_final(struct fts_tokenizer *tok, const char **token_r,
			const char **error_r)
{
	const unsigned char *no_data = NULL;

	return fts_tokenizer_next(tok, no_data, 0, token_r, error_r);
}
222