1 /* -*- c-basic-offset: 2 -*- */
2 /* Copyright(C) 2012-2015 Brazil
3
4 This library is free software; you can redistribute it and/or
5 modify it under the terms of the GNU Lesser General Public
6 License version 2.1 as published by the Free Software Foundation.
7
8 This library is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 Lesser General Public License for more details.
12
13 You should have received a copy of the GNU Lesser General Public
14 License along with this library; if not, write to the Free Software
15 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
16 */
17
18 #ifdef GRN_EMBEDDED
19 # define GRN_PLUGIN_FUNCTION_TAG query_expanders_tsv
20 #endif
21
22 #ifdef HAVE_CONFIG_H
23 # include <config.h>
24 #endif /* HAVE_CONFIG_H */
25
26 #include <groonga/plugin.h>
27
28 #include <stdlib.h>
29 #include <string.h>
30
31 #ifdef WIN32
32 # include <windows.h>
33 # include <share.h>
34 #endif /* WIN32 */
35
36 #define MAX_SYNONYM_BYTES 4096
37
38 static grn_hash *synonyms = NULL;
39
40 #ifdef WIN32
41 static char win32_synonyms_file[MAX_PATH] = "";
42 const char *
get_system_synonyms_file(void)43 get_system_synonyms_file(void)
44 {
45 if (win32_synonyms_file[0] == '\0') {
46 const char *base_dir;
47 const char *relative_path = GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE;
48 size_t base_dir_length;
49
50 base_dir = grn_plugin_windows_base_dir();
51 base_dir_length = strlen(base_dir);
52 grn_strcpy(win32_synonyms_file, MAX_PATH, base_dir);
53 grn_strcat(win32_synonyms_file, MAX_PATH, "/");
54 grn_strcat(win32_synonyms_file, MAX_PATH, relative_path);
55 }
56 return win32_synonyms_file;
57 }
58
59 #else /* WIN32 */
60 const char *
get_system_synonyms_file(void)61 get_system_synonyms_file(void)
62 {
63 return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE;
64 }
65 #endif /* WIN32 */
66
67 static grn_bool
is_comment_mark(char character)68 is_comment_mark(char character)
69 {
70 return character == '#';
71 }
72
73 static grn_encoding
detect_coding_part(grn_ctx * ctx,const char * line,size_t line_length)74 detect_coding_part(grn_ctx *ctx, const char *line, size_t line_length)
75 {
76 grn_encoding encoding = GRN_ENC_NONE;
77 grn_obj null_terminated_line_buffer;
78 const char *c_line;
79 const char *coding_part_keyword = "coding: ";
80 const char *coding_part;
81 const char *encoding_name;
82
83 GRN_TEXT_INIT(&null_terminated_line_buffer, 0);
84 GRN_TEXT_PUT(ctx, &null_terminated_line_buffer, line, line_length);
85 GRN_TEXT_PUTC(ctx, &null_terminated_line_buffer, '\0');
86
87 c_line = GRN_TEXT_VALUE(&null_terminated_line_buffer);
88 coding_part = strstr(c_line, coding_part_keyword);
89 if (coding_part) {
90 encoding_name = coding_part + strlen(coding_part_keyword);
91 if (grn_strncasecmp(encoding_name, "utf-8", strlen("utf-8")) == 0 ||
92 grn_strncasecmp(encoding_name, "utf8", strlen("utf8")) == 0) {
93 encoding = GRN_ENC_UTF8;
94 } else if (grn_strncasecmp(encoding_name, "sjis", strlen("sjis")) == 0 ||
95 grn_strncasecmp(encoding_name, "Shift_JIS", strlen("Shift_JIS")) == 0) {
96 encoding = GRN_ENC_SJIS;
97 } else if (grn_strncasecmp(encoding_name, "EUC-JP", strlen("EUC-JP")) == 0 ||
98 grn_strncasecmp(encoding_name, "euc_jp", strlen("euc_jp")) == 0) {
99 encoding = GRN_ENC_EUC_JP;
100 } else if (grn_strncasecmp(encoding_name, "latin1", strlen("latin1")) == 0) {
101 encoding = GRN_ENC_LATIN1;
102 } else if (grn_strncasecmp(encoding_name, "KOI8-R", strlen("KOI8-R")) == 0 ||
103 grn_strncasecmp(encoding_name, "koi8r", strlen("koi8r")) == 0) {
104 encoding = GRN_ENC_KOI8R;
105 }
106 } else {
107 encoding = ctx->encoding;
108 }
109 GRN_OBJ_FIN(ctx, &null_terminated_line_buffer);
110
111 return encoding;
112 }
113
114 static grn_encoding
guess_encoding(grn_ctx * ctx,const char ** line,size_t * line_length)115 guess_encoding(grn_ctx *ctx, const char **line, size_t *line_length)
116 {
117 const char bom[] = {0xef, 0xbb, 0xbf};
118 size_t bom_length = sizeof(bom);
119
120 if (*line_length >= bom_length && memcmp(*line, bom, bom_length) == 0) {
121 *line += bom_length;
122 *line_length -= bom_length;
123 return GRN_ENC_UTF8;
124 }
125
126 if (!is_comment_mark((*line)[0])) {
127 return ctx->encoding;
128 }
129
130 return detect_coding_part(ctx, (*line) + 1, (*line_length) - 1);
131 }
132
133 static void
parse_synonyms_file_line(grn_ctx * ctx,const char * line,size_t line_length,grn_obj * key,grn_obj * value)134 parse_synonyms_file_line(grn_ctx *ctx, const char *line, size_t line_length,
135 grn_obj *key, grn_obj *value)
136 {
137 size_t i = 0;
138
139 if (is_comment_mark(line[i])) {
140 return;
141 }
142
143 while (i < line_length) {
144 char character = line[i];
145 i++;
146 if (character == '\t') {
147 break;
148 }
149 GRN_TEXT_PUTC(ctx, key, character);
150 }
151
152 if (i == line_length) {
153 return;
154 }
155
156 GRN_TEXT_PUTS(ctx, value, "((");
157 while (i < line_length) {
158 char character = line[i];
159 i++;
160 if (character == '\t') {
161 GRN_TEXT_PUTS(ctx, value, ") OR (");
162 } else {
163 GRN_TEXT_PUTC(ctx, value, character);
164 }
165 }
166 GRN_TEXT_PUTS(ctx, value, "))");
167
168 {
169 grn_id id;
170 void *value_location = NULL;
171
172 id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key),
173 &value_location, NULL);
174 if (id == GRN_ID_NIL) {
175 GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING,
176 "[plugin][query-expander][tsv] "
177 "failed to register key: <%.*s>",
178 (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key));
179 return;
180 }
181
182 if (GRN_TEXT_LEN(value) <= MAX_SYNONYM_BYTES - 1) {
183 GRN_TEXT_PUTC(ctx, value, '\0');
184 } else {
185 grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1);
186 GRN_TEXT_PUTC(ctx, value, '\0');
187 }
188 grn_memcpy(value_location, GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value));
189 }
190 }
191
192 static void
load_synonyms(grn_ctx * ctx)193 load_synonyms(grn_ctx *ctx)
194 {
195 static char path_env[GRN_ENV_BUFFER_SIZE];
196 const char *path;
197 grn_file_reader *file_reader;
198 int number_of_lines;
199 grn_encoding encoding;
200 grn_obj line, key, value;
201
202 grn_getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE",
203 path_env,
204 GRN_ENV_BUFFER_SIZE);
205 if (path_env[0]) {
206 path = path_env;
207 } else {
208 path = get_system_synonyms_file();
209 }
210 file_reader = grn_file_reader_open(ctx, path);
211 if (!file_reader) {
212 GRN_LOG(ctx, GRN_LOG_WARNING,
213 "[plugin][query-expander][tsv] "
214 "synonyms file doesn't exist: <%s>",
215 path);
216 return;
217 }
218
219 GRN_TEXT_INIT(&line, 0);
220 GRN_TEXT_INIT(&key, 0);
221 GRN_TEXT_INIT(&value, 0);
222 grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES);
223 number_of_lines = 0;
224 while (grn_file_reader_read_line(ctx, file_reader, &line) == GRN_SUCCESS) {
225 const char *line_value = GRN_TEXT_VALUE(&line);
226 size_t line_length = GRN_TEXT_LEN(&line);
227
228 if (line_length > 0 && line_value[line_length - 1] == '\n') {
229 if (line_length > 1 && line_value[line_length - 2] == '\r') {
230 line_length -= 2;
231 } else {
232 line_length -= 1;
233 }
234 }
235 number_of_lines++;
236 if (number_of_lines == 1) {
237 encoding = guess_encoding(ctx, &line_value, &line_length);
238 }
239 GRN_BULK_REWIND(&key);
240 GRN_BULK_REWIND(&value);
241 parse_synonyms_file_line(ctx, line_value, line_length, &key, &value);
242 GRN_BULK_REWIND(&line);
243 }
244 GRN_OBJ_FIN(ctx, &line);
245 GRN_OBJ_FIN(ctx, &key);
246 GRN_OBJ_FIN(ctx, &value);
247
248 grn_file_reader_close(ctx, file_reader);
249 }
250
251 static grn_obj *
func_query_expander_tsv(grn_ctx * ctx,int nargs,grn_obj ** args,grn_user_data * user_data)252 func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args,
253 grn_user_data *user_data)
254 {
255 grn_rc rc = GRN_END_OF_DATA;
256 grn_id id;
257 grn_obj *term, *expanded_term;
258 void *value;
259 grn_obj *rc_object;
260
261 term = args[0];
262 expanded_term = args[1];
263 id = grn_hash_get(ctx, synonyms,
264 GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term),
265 &value);
266 if (id != GRN_ID_NIL) {
267 const char *query = value;
268 GRN_TEXT_PUTS(ctx, expanded_term, query);
269 rc = GRN_SUCCESS;
270 }
271
272 rc_object = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_INT32, 0);
273 if (rc_object) {
274 GRN_INT32_SET(ctx, rc_object, rc);
275 }
276
277 return rc_object;
278 }
279
280 grn_rc
GRN_PLUGIN_INIT(grn_ctx * ctx)281 GRN_PLUGIN_INIT(grn_ctx *ctx)
282 {
283 if (!synonyms) {
284 synonyms = grn_hash_create(ctx, NULL,
285 GRN_TABLE_MAX_KEY_SIZE,
286 MAX_SYNONYM_BYTES,
287 GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE);
288 if (!synonyms) {
289 return ctx->rc;
290 }
291 load_synonyms(ctx);
292 }
293 return ctx->rc;
294 }
295
296 grn_rc
GRN_PLUGIN_REGISTER(grn_ctx * ctx)297 GRN_PLUGIN_REGISTER(grn_ctx *ctx)
298 {
299 grn_proc_create(ctx, "QueryExpanderTSV", strlen("QueryExpanderTSV"),
300 GRN_PROC_FUNCTION,
301 func_query_expander_tsv, NULL, NULL,
302 0, NULL);
303 return GRN_SUCCESS;
304 }
305
306 grn_rc
GRN_PLUGIN_FIN(grn_ctx * ctx)307 GRN_PLUGIN_FIN(grn_ctx *ctx)
308 {
309 if (synonyms) {
310 grn_hash_close(ctx, synonyms);
311 synonyms = NULL;
312 }
313 return GRN_SUCCESS;
314 }
315