1 #include "pgroonga.h"
2 #include "pgrn-compatible.h"
3 #include "pgrn-global.h"
4 #include "pgrn-groonga.h"
5 #include "pgrn-tokenize.h"
6
7 #include <catalog/pg_type.h>
8 #include <utils/array.h>
9 #include <utils/builtins.h>
10 #include <utils/json.h>
11
12 static grn_ctx *ctx = &PGrnContext;
13 static grn_obj *lexicon;
14 static grn_obj tokenizerValue;
15 static grn_obj normalizerValue;
16 static grn_obj tokenFiltersValue;
17 static grn_obj tokens;
18 static grn_obj tokenMetadataName;
19 static grn_obj tokenMetadataValue;
20 static grn_obj tokenJSON;
21
22 PGDLLEXPORT PG_FUNCTION_INFO_V1(pgroonga_tokenize);
23
/*
 * One tokenized term captured from a grn_token_cursor.
 * Instances live inside the module-level `tokens` bulk; `value` and
 * `metadata` own Groonga buffers and must be finalized before the
 * bulk is rewound (see PGrnTokensReinit()).
 */
typedef struct {
	grn_id id;                           /* lexicon record ID for the token */
	grn_obj value;                       /* token text (GRN_BULK of text) */
	int32_t position;                    /* ordinal position in the source */
	grn_bool forcePrefixSearch;          /* whether prefix search is forced */
	uint64_t sourceOffset;               /* byte offset in the source text */
	uint32_t sourceLength;               /* byte length in the source text */
	uint32_t sourceFirstCharacterLength; /* byte length of the first char */
	grn_obj metadata;                    /* name/value pairs as a vector */
} PGrnToken;
34
/*
 * Initialize the module-level `tokens` bulk that backs the PGrnToken
 * array. Elements are appended raw via grn_bulk_space() in
 * PGrnTokensAppend(), so the declared element type is nominal.
 */
static void
PGrnTokensInit(void)
{
	GRN_VALUE_FIX_SIZE_INIT(&tokens, GRN_OBJ_VECTOR, GRN_ID_NIL);
}
40
41 static size_t
PGrnTokensSize(void)42 PGrnTokensSize(void)
43 {
44 return GRN_BULK_VSIZE(&tokens) / sizeof(PGrnToken);
45 }
46
47 static PGrnToken *
PGrnTokensAt(size_t i)48 PGrnTokensAt(size_t i)
49 {
50 PGrnToken *rawTokens;
51 rawTokens = (PGrnToken *) GRN_BULK_HEAD(&tokens);
52 return rawTokens + i;
53 }
54
55 static void
PGrnTokensReinit(void)56 PGrnTokensReinit(void)
57 {
58 size_t i;
59 size_t nTokens;
60
61 nTokens = PGrnTokensSize();
62 for (i = 0; i < nTokens; i++)
63 {
64 PGrnToken *token;
65 token = PGrnTokensAt(i);
66 GRN_OBJ_FIN(ctx, &(token->value));
67 GRN_OBJ_FIN(ctx, &(token->metadata));
68 }
69 GRN_BULK_REWIND(&tokens);
70 }
71
72 static void
PGrnTokensAppend(grn_id id,grn_token_cursor * tokenCursor)73 PGrnTokensAppend(grn_id id, grn_token_cursor *tokenCursor)
74 {
75 PGrnToken *token;
76 grn_token *grnToken;
77
78 grn_bulk_space(ctx, &tokens, sizeof(PGrnToken));
79 token = ((PGrnToken *) (GRN_BULK_CURR(&tokens))) - 1;
80 GRN_TEXT_INIT(&(token->value), 0);
81 GRN_TEXT_INIT(&(token->metadata), GRN_OBJ_VECTOR);
82
83 token->id = id;
84
85 grnToken = grn_token_cursor_get_token(ctx, tokenCursor);
86 {
87 grn_obj *data = grn_token_get_data(ctx, grnToken);
88 GRN_TEXT_SET(ctx,
89 &(token->value),
90 GRN_TEXT_VALUE(data),
91 GRN_TEXT_LEN(data));
92 }
93 token->position = grn_token_get_position(ctx, grnToken);
94 token->forcePrefixSearch = grn_token_get_position(ctx, grnToken);
95 token->sourceOffset = grn_token_get_source_offset(ctx, grnToken);
96 token->sourceLength = grn_token_get_source_length(ctx, grnToken);
97 token->sourceFirstCharacterLength =
98 grn_token_get_source_first_character_length(ctx, grnToken);
99 {
100 grn_obj *metadata;
101 size_t nMetadata;
102 size_t i;
103
104 metadata = grn_token_get_metadata(ctx, grnToken);
105 nMetadata = grn_token_metadata_get_size(ctx, metadata);
106 for (i = 0; i < nMetadata; i++) {
107 GRN_BULK_REWIND(&tokenMetadataName);
108 GRN_BULK_REWIND(&tokenMetadataValue);
109 grn_token_metadata_at(ctx,
110 metadata,
111 i,
112 &tokenMetadataName,
113 &tokenMetadataValue);
114 if (GRN_TEXT_LEN(&tokenMetadataName) == 0) {
115 continue;
116 }
117 grn_vector_add_element(ctx,
118 &(token->metadata),
119 GRN_BULK_HEAD(&tokenMetadataName),
120 GRN_BULK_VSIZE(&tokenMetadataName),
121 0,
122 tokenMetadataName.header.domain);
123 grn_vector_add_element(ctx,
124 &(token->metadata),
125 GRN_BULK_HEAD(&tokenMetadataValue),
126 GRN_BULK_VSIZE(&tokenMetadataValue),
127 0,
128 tokenMetadataValue.header.domain);
129 }
130 }
131 }
132
/*
 * Release all per-token buffers and then the backing bulk itself.
 * Called once from PGrnFinalizeTokenize().
 */
static void
PGrnTokensFin(void)
{
	PGrnTokensReinit();
	GRN_OBJ_FIN(ctx, &tokens);
}
139
/*
 * Module initialization: create the scratch patricia-trie lexicon used
 * by pgroonga_tokenize() and initialize all module-level work buffers.
 * Must be paired with PGrnFinalizeTokenize(), which tears these down in
 * reverse order.
 */
void
PGrnInitializeTokenize(void)
{
	/* ShortText-keyed PAT table; tokenizer/normalizer/token filters are
	 * (re)configured per call via PGrnTokenizeSetModule(). */
	lexicon = grn_table_create(ctx, NULL, 0, NULL,
							   GRN_OBJ_TABLE_PAT_KEY,
							   grn_ctx_at(ctx, GRN_DB_SHORT_TEXT),
							   NULL);
	GRN_TEXT_INIT(&tokenizerValue, 0);
	GRN_TEXT_INIT(&normalizerValue, 0);
	GRN_TEXT_INIT(&tokenFiltersValue, 0);
	PGrnTokensInit();
	GRN_TEXT_INIT(&tokenMetadataName, 0);
	/* Value domain varies per metadata entry; it is grn_obj_reinit()-ed
	 * to the right domain when rendered in PGrnTokenizeCreateArray(). */
	GRN_VOID_INIT(&tokenMetadataValue);
	GRN_TEXT_INIT(&tokenJSON, 0);
}
155
/*
 * Module finalization: release everything created by
 * PGrnInitializeTokenize(), in reverse initialization order.
 */
void
PGrnFinalizeTokenize(void)
{
	GRN_OBJ_FIN(ctx, &tokenJSON);
	GRN_OBJ_FIN(ctx, &tokenMetadataValue);
	GRN_OBJ_FIN(ctx, &tokenMetadataName);
	PGrnTokensFin();
	GRN_OBJ_FIN(ctx, &tokenFiltersValue);
	GRN_OBJ_FIN(ctx, &normalizerValue);
	GRN_OBJ_FIN(ctx, &tokenizerValue);
	grn_obj_close(ctx, lexicon);
}
168
169 static void
PGrnTokenizeSetModule(const char * moduleName,grn_info_type type,text * newValue)170 PGrnTokenizeSetModule(const char *moduleName,
171 grn_info_type type,
172 text *newValue)
173 {
174 grn_obj *value;
175
176 switch (type)
177 {
178 case GRN_INFO_DEFAULT_TOKENIZER:
179 value = &tokenizerValue;
180 break;
181 case GRN_INFO_NORMALIZER:
182 value = &normalizerValue;
183 break;
184 case GRN_INFO_TOKEN_FILTERS:
185 value = &tokenFiltersValue;
186 break;
187 default:
188 PGrnCheck("tokenize: invalid %s type: <%d>", moduleName, type);
189 return;
190 }
191
192 if (newValue)
193 {
194 if (VARSIZE_ANY_EXHDR(newValue) == GRN_TEXT_LEN(value) &&
195 memcmp(VARDATA_ANY(newValue),
196 GRN_TEXT_VALUE(value),
197 GRN_TEXT_LEN(value)) == 0)
198 {
199 return;
200 }
201
202 GRN_TEXT_SET(ctx,
203 value,
204 VARDATA_ANY(newValue),
205 VARSIZE_ANY_EXHDR(newValue));
206 grn_obj_set_info(ctx, lexicon, type, value);
207 PGrnCheck("tokenize: failed to set %s", moduleName);
208 }
209 else
210 {
211 if (GRN_TEXT_LEN(value) == 0)
212 return;
213
214 GRN_BULK_REWIND(value);
215 grn_obj_set_info(ctx, lexicon, type, value);
216 PGrnCheck("tokenize: failed to set %s", moduleName);
217 }
218 }
219
/*
 * Render the collected tokens as a PostgreSQL json[] array.
 *
 * Each token becomes one JSON object with at least "value", "position"
 * and "force_prefix_search"; "source_offset"/"source_length"/
 * "source_first_character_length" are added when source location is
 * available, and a nested "metadata" object when the token carried
 * metadata. Returns an empty json[] when there are no tokens.
 *
 * Uses the module-level `tokenJSON` and `tokenMetadataValue` buffers
 * as scratch space; not reentrant.
 */
static ArrayType *
PGrnTokenizeCreateArray(void)
{
	size_t i;
	size_t nTokens;
	Datum *tokenData;
	int dims[1];
	int lbs[1];

	nTokens = PGrnTokensSize();
	if (nTokens == 0)
	{
		return construct_empty_array(JSONOID);
	}

	tokenData = palloc(sizeof(Datum) * nTokens);
	for (i = 0; i < nTokens; i++)
	{
		grn_content_type type = GRN_CONTENT_JSON;
		PGrnToken *token = PGrnTokensAt(i);
		/* "value", "position", "force_prefix_search" are always present. */
		int nElements = 3;
		bool haveSourceLocation = false;
		bool haveMetadata = false;
		text *json;

		GRN_BULK_REWIND(&tokenJSON);
		if (token->sourceOffset > 0 || token->sourceLength > 0)
		{
			haveSourceLocation = true;
			nElements += 3;
		}
		if (grn_vector_size(ctx, &(token->metadata)) > 0)
		{
			haveMetadata = true;
			nElements++;
		}
		/* grn_output_map_open() needs the exact element count up front. */
		grn_output_map_open(ctx, &tokenJSON, type, "token", nElements);
		grn_output_cstr(ctx, &tokenJSON, type, "value");
		grn_output_str(ctx, &tokenJSON, type,
					   GRN_TEXT_VALUE(&(token->value)),
					   GRN_TEXT_LEN(&(token->value)));
		grn_output_cstr(ctx, &tokenJSON, type, "position");
		grn_output_uint32(ctx, &tokenJSON, type, token->position);
		grn_output_cstr(ctx, &tokenJSON, type, "force_prefix_search");
		grn_output_bool(ctx, &tokenJSON, type, token->forcePrefixSearch);
		if (haveSourceLocation)
		{
			grn_output_cstr(ctx, &tokenJSON, type, "source_offset");
			grn_output_uint64(ctx, &tokenJSON, type, token->sourceOffset);
			grn_output_cstr(ctx, &tokenJSON, type, "source_length");
			grn_output_uint32(ctx, &tokenJSON, type, token->sourceLength);
			grn_output_cstr(ctx, &tokenJSON, type,
							"source_first_character_length");
			grn_output_uint32(ctx, &tokenJSON, type,
							  token->sourceFirstCharacterLength);
		}
		if (haveMetadata)
		{
			size_t j;
			size_t nMetadata;

			/* token->metadata is a flat name/value vector: pair count
			 * is half the element count. */
			nMetadata = grn_vector_size(ctx, &(token->metadata)) / 2;
			grn_output_cstr(ctx, &tokenJSON, type, "metadata");
			grn_output_map_open(ctx, &tokenJSON, type, "metadata", nMetadata);
			for (j = 0; j < nMetadata; j++)
			{
				const char *rawName;
				unsigned int rawNameLength;
				const char *rawValue;
				unsigned int rawValueLength;
				grn_id valueDomain;

				rawNameLength = grn_vector_get_element(ctx,
													   &(token->metadata),
													   j * 2,
													   &rawName,
													   NULL,
													   NULL);
				grn_output_str(ctx, &tokenJSON, type, rawName, rawNameLength);

				rawValueLength = grn_vector_get_element(ctx,
														&(token->metadata),
														j * 2 + 1,
														&rawValue,
														NULL,
														&valueDomain);
				/* Rebuild the value in its original domain so
				 * grn_output_obj() renders the right JSON type. */
				grn_obj_reinit(ctx, &tokenMetadataValue, valueDomain, 0);
				grn_bulk_write(ctx, &tokenMetadataValue,
							   rawValue, rawValueLength);
				grn_output_obj(ctx, &tokenJSON, type, &tokenMetadataValue, NULL);
			}
			grn_output_map_close(ctx, &tokenJSON, type);
		}
		grn_output_map_close(ctx, &tokenJSON, type);

		json = cstring_to_text_with_len(GRN_TEXT_VALUE(&tokenJSON),
										GRN_TEXT_LEN(&tokenJSON));
		tokenData[i] = PointerGetDatum(json);
	}
	dims[0] = nTokens;
	lbs[0] = 1;
	return construct_md_array(tokenData,
							  NULL,
							  1,
							  dims,
							  lbs,
							  JSONOID,
							  -1,   /* json is varlena: typlen -1 */
							  false,
							  'i');
}
331
332 static ArrayType *
PGrnTokenize(text * target)333 PGrnTokenize(text *target)
334 {
335 grn_token_cursor *tokenCursor;
336
337 tokenCursor = grn_token_cursor_open(ctx,
338 lexicon,
339 VARDATA_ANY(target),
340 VARSIZE_ANY_EXHDR(target),
341 GRN_TOKEN_ADD,
342 0);
343 PGrnCheck("tokenize: failed to create token cursor");
344
345 PGrnTokensReinit();
346 while (grn_token_cursor_get_status(ctx, tokenCursor) ==
347 GRN_TOKEN_CURSOR_DOING)
348 {
349 grn_id id = grn_token_cursor_next(ctx, tokenCursor);
350
351 if (id == GRN_ID_NIL)
352 continue;
353
354 PGrnTokensAppend(id, tokenCursor);
355 }
356 grn_token_cursor_close(ctx, tokenCursor);
357
358 return PGrnTokenizeCreateArray();
359 }
360
361 /**
362 * pgroonga_tokenize(target text, options text[]) : json[]
363 *
364 * options:
365 * "tokenizer", tokenizer text,
366 * "normalizer", normalizer text,
367 * "token_filters", token_filters text,
368 * ...
369 */
370 Datum
pgroonga_tokenize(PG_FUNCTION_ARGS)371 pgroonga_tokenize(PG_FUNCTION_ARGS)
372 {
373 const char *tag = "[tokenize]";
374 text *target;
375 ArrayType *options;
376 text *tokenizerName = NULL;
377 text *normalizerName = NULL;
378 text *tokenFiltersName = NULL;
379 ArrayType *pgTokens;
380
381 target = PG_GETARG_TEXT_PP(0);
382 options = PG_GETARG_ARRAYTYPE_P(1);
383
384 if (ARR_NDIM(options) > 0)
385 {
386 ArrayIterator iterator;
387 Datum nameDatum;
388 bool isNULL;
389
390 iterator = pgrn_array_create_iterator(options, 0);
391 while (array_iterate(iterator, &nameDatum, &isNULL))
392 {
393 text *name = DatumGetTextPP(nameDatum);
394 Datum valueDatum;
395 text *value;
396
397 if (!array_iterate(iterator, &valueDatum, &isNULL))
398 {
399 PGrnCheckRC(GRN_INVALID_ARGUMENT,
400 "%s parameter value is missing: <%.*s>",
401 tag,
402 (int) VARSIZE_ANY_EXHDR(name),
403 VARDATA_ANY(name));
404 }
405
406 value = DatumGetTextPP(valueDatum);
407
408 #define NAME_EQUAL(n) \
409 (VARSIZE_ANY_EXHDR(name) == strlen(n) && \
410 strcmp(VARDATA_ANY(name), n) == 0)
411
412 if (NAME_EQUAL("tokenizer"))
413 {
414 tokenizerName = value;
415 }
416 else if (NAME_EQUAL("normalizer"))
417 {
418 normalizerName = value;
419 }
420 else if (NAME_EQUAL("token_filters"))
421 {
422 tokenFiltersName = value;
423 }
424 else
425 {
426 PGrnCheckRC(GRN_INVALID_ARGUMENT,
427 "%s unknown parameter name: <%.*s>",
428 tag,
429 (int) VARSIZE_ANY_EXHDR(name),
430 VARDATA_ANY(name));
431 }
432 #undef NAME_EQUAL
433 }
434
435 array_free_iterator(iterator);
436 }
437
438 PGrnTokenizeSetModule("tokenizer",
439 GRN_INFO_DEFAULT_TOKENIZER,
440 tokenizerName);
441 PGrnTokenizeSetModule("normalizer",
442 GRN_INFO_NORMALIZER,
443 normalizerName);
444 PGrnTokenizeSetModule("token filters",
445 GRN_INFO_TOKEN_FILTERS,
446 tokenFiltersName);
447
448 pgTokens = PGrnTokenize(target);
449
450 PG_RETURN_POINTER(pgTokens);
451 }
452