1 /* -*- c-basic-offset: 2 -*- */
2 /* Copyright(C) 2012-2015 Brazil
3 
4   This library is free software; you can redistribute it and/or
5   modify it under the terms of the GNU Lesser General Public
6   License version 2.1 as published by the Free Software Foundation.
7 
8   This library is distributed in the hope that it will be useful,
9   but WITHOUT ANY WARRANTY; without even the implied warranty of
10   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11   Lesser General Public License for more details.
12 
13   You should have received a copy of the GNU Lesser General Public
14   License along with this library; if not, write to the Free Software
  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
16 */
17 
18 #ifdef GRN_EMBEDDED
19 #  define GRN_PLUGIN_FUNCTION_TAG query_expanders_tsv
20 #endif
21 
22 #ifdef HAVE_CONFIG_H
23 # include <config.h>
24 #endif /* HAVE_CONFIG_H */
25 
26 #include <groonga/plugin.h>
27 
28 #include <stdlib.h>
29 #include <string.h>
30 
31 #ifdef WIN32
32 # include <windows.h>
33 # include <share.h>
34 #endif /* WIN32 */
35 
36 #define MAX_SYNONYM_BYTES 4096
37 
38 static grn_hash *synonyms = NULL;
39 
40 #ifdef WIN32
41 static char win32_synonyms_file[MAX_PATH] = "";
42 const char *
get_system_synonyms_file(void)43 get_system_synonyms_file(void)
44 {
45   if (win32_synonyms_file[0] == '\0') {
46     const char *base_dir;
47     const char *relative_path = GRN_QUERY_EXPANDER_TSV_RELATIVE_SYNONYMS_FILE;
48     size_t base_dir_length;
49 
50     base_dir = grn_plugin_windows_base_dir();
51     base_dir_length = strlen(base_dir);
52     grn_strcpy(win32_synonyms_file, MAX_PATH, base_dir);
53     grn_strcat(win32_synonyms_file, MAX_PATH, "/");
54     grn_strcat(win32_synonyms_file, MAX_PATH, relative_path);
55   }
56   return win32_synonyms_file;
57 }
58 
59 #else /* WIN32 */
60 const char *
get_system_synonyms_file(void)61 get_system_synonyms_file(void)
62 {
63   return GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE;
64 }
65 #endif /* WIN32 */
66 
67 static grn_bool
is_comment_mark(char character)68 is_comment_mark(char character)
69 {
70   return character == '#';
71 }
72 
73 static grn_encoding
detect_coding_part(grn_ctx * ctx,const char * line,size_t line_length)74 detect_coding_part(grn_ctx *ctx, const char *line, size_t line_length)
75 {
76   grn_encoding encoding = GRN_ENC_NONE;
77   grn_obj null_terminated_line_buffer;
78   const char *c_line;
79   const char *coding_part_keyword = "coding: ";
80   const char *coding_part;
81   const char *encoding_name;
82 
83   GRN_TEXT_INIT(&null_terminated_line_buffer, 0);
84   GRN_TEXT_PUT(ctx, &null_terminated_line_buffer, line, line_length);
85   GRN_TEXT_PUTC(ctx, &null_terminated_line_buffer, '\0');
86 
87   c_line = GRN_TEXT_VALUE(&null_terminated_line_buffer);
88   coding_part = strstr(c_line, coding_part_keyword);
89   if (coding_part) {
90     encoding_name = coding_part + strlen(coding_part_keyword);
91     if (grn_strncasecmp(encoding_name, "utf-8", strlen("utf-8")) == 0 ||
92         grn_strncasecmp(encoding_name, "utf8", strlen("utf8")) == 0) {
93       encoding = GRN_ENC_UTF8;
94     } else if (grn_strncasecmp(encoding_name, "sjis", strlen("sjis")) == 0 ||
95                grn_strncasecmp(encoding_name, "Shift_JIS", strlen("Shift_JIS")) == 0) {
96       encoding = GRN_ENC_SJIS;
97     } else if (grn_strncasecmp(encoding_name, "EUC-JP", strlen("EUC-JP")) == 0 ||
98                grn_strncasecmp(encoding_name, "euc_jp", strlen("euc_jp")) == 0) {
99       encoding = GRN_ENC_EUC_JP;
100     } else if (grn_strncasecmp(encoding_name, "latin1", strlen("latin1")) == 0) {
101       encoding = GRN_ENC_LATIN1;
102     } else if (grn_strncasecmp(encoding_name, "KOI8-R", strlen("KOI8-R")) == 0 ||
103                grn_strncasecmp(encoding_name, "koi8r", strlen("koi8r")) == 0) {
104       encoding = GRN_ENC_KOI8R;
105     }
106   } else {
107     encoding = ctx->encoding;
108   }
109   GRN_OBJ_FIN(ctx, &null_terminated_line_buffer);
110 
111   return encoding;
112 }
113 
114 static grn_encoding
guess_encoding(grn_ctx * ctx,const char ** line,size_t * line_length)115 guess_encoding(grn_ctx *ctx, const char **line, size_t *line_length)
116 {
117   const char bom[] = {0xef, 0xbb, 0xbf};
118   size_t bom_length = sizeof(bom);
119 
120   if (*line_length >= bom_length && memcmp(*line, bom, bom_length) == 0) {
121     *line += bom_length;
122     *line_length -= bom_length;
123     return GRN_ENC_UTF8;
124   }
125 
126   if (!is_comment_mark((*line)[0])) {
127     return ctx->encoding;
128   }
129 
130   return detect_coding_part(ctx, (*line) + 1, (*line_length) - 1);
131 }
132 
133 static void
parse_synonyms_file_line(grn_ctx * ctx,const char * line,size_t line_length,grn_obj * key,grn_obj * value)134 parse_synonyms_file_line(grn_ctx *ctx, const char *line, size_t line_length,
135                          grn_obj *key, grn_obj *value)
136 {
137   size_t i = 0;
138 
139   if (is_comment_mark(line[i])) {
140     return;
141   }
142 
143   while (i < line_length) {
144     char character = line[i];
145     i++;
146     if (character == '\t') {
147       break;
148     }
149     GRN_TEXT_PUTC(ctx, key, character);
150   }
151 
152   if (i == line_length) {
153     return;
154   }
155 
156   GRN_TEXT_PUTS(ctx, value, "((");
157   while (i < line_length) {
158     char character = line[i];
159     i++;
160     if (character == '\t') {
161       GRN_TEXT_PUTS(ctx, value, ") OR (");
162     } else {
163       GRN_TEXT_PUTC(ctx, value, character);
164     }
165   }
166   GRN_TEXT_PUTS(ctx, value, "))");
167 
168   {
169     grn_id id;
170     void *value_location = NULL;
171 
172     id = grn_hash_add(ctx, synonyms, GRN_TEXT_VALUE(key), GRN_TEXT_LEN(key),
173                       &value_location, NULL);
174     if (id == GRN_ID_NIL) {
175       GRN_PLUGIN_LOG(ctx, GRN_LOG_WARNING,
176                      "[plugin][query-expander][tsv] "
177                      "failed to register key: <%.*s>",
178                      (int)GRN_TEXT_LEN(key), GRN_TEXT_VALUE(key));
179       return;
180     }
181 
182     if (GRN_TEXT_LEN(value) <= MAX_SYNONYM_BYTES - 1) {
183       GRN_TEXT_PUTC(ctx, value, '\0');
184     } else {
185       grn_bulk_truncate(ctx, value, MAX_SYNONYM_BYTES - 1);
186       GRN_TEXT_PUTC(ctx, value, '\0');
187     }
188     grn_memcpy(value_location, GRN_TEXT_VALUE(value), GRN_TEXT_LEN(value));
189   }
190 }
191 
192 static void
load_synonyms(grn_ctx * ctx)193 load_synonyms(grn_ctx *ctx)
194 {
195   static char path_env[GRN_ENV_BUFFER_SIZE];
196   const char *path;
197   grn_file_reader *file_reader;
198   int number_of_lines;
199   grn_encoding encoding;
200   grn_obj line, key, value;
201 
202   grn_getenv("GRN_QUERY_EXPANDER_TSV_SYNONYMS_FILE",
203              path_env,
204              GRN_ENV_BUFFER_SIZE);
205   if (path_env[0]) {
206     path = path_env;
207   } else {
208     path = get_system_synonyms_file();
209   }
210   file_reader = grn_file_reader_open(ctx, path);
211   if (!file_reader) {
212     GRN_LOG(ctx, GRN_LOG_WARNING,
213             "[plugin][query-expander][tsv] "
214             "synonyms file doesn't exist: <%s>",
215             path);
216     return;
217   }
218 
219   GRN_TEXT_INIT(&line, 0);
220   GRN_TEXT_INIT(&key, 0);
221   GRN_TEXT_INIT(&value, 0);
222   grn_bulk_reserve(ctx, &value, MAX_SYNONYM_BYTES);
223   number_of_lines = 0;
224   while (grn_file_reader_read_line(ctx, file_reader, &line) == GRN_SUCCESS) {
225     const char *line_value = GRN_TEXT_VALUE(&line);
226     size_t line_length = GRN_TEXT_LEN(&line);
227 
228     if (line_length > 0 && line_value[line_length - 1] == '\n') {
229       if (line_length > 1 && line_value[line_length - 2] == '\r') {
230         line_length -= 2;
231       } else {
232         line_length -= 1;
233       }
234     }
235     number_of_lines++;
236     if (number_of_lines == 1) {
237       encoding = guess_encoding(ctx, &line_value, &line_length);
238     }
239     GRN_BULK_REWIND(&key);
240     GRN_BULK_REWIND(&value);
241     parse_synonyms_file_line(ctx, line_value, line_length, &key, &value);
242     GRN_BULK_REWIND(&line);
243   }
244   GRN_OBJ_FIN(ctx, &line);
245   GRN_OBJ_FIN(ctx, &key);
246   GRN_OBJ_FIN(ctx, &value);
247 
248   grn_file_reader_close(ctx, file_reader);
249 }
250 
251 static grn_obj *
func_query_expander_tsv(grn_ctx * ctx,int nargs,grn_obj ** args,grn_user_data * user_data)252 func_query_expander_tsv(grn_ctx *ctx, int nargs, grn_obj **args,
253                         grn_user_data *user_data)
254 {
255   grn_rc rc = GRN_END_OF_DATA;
256   grn_id id;
257   grn_obj *term, *expanded_term;
258   void *value;
259   grn_obj *rc_object;
260 
261   term = args[0];
262   expanded_term = args[1];
263   id = grn_hash_get(ctx, synonyms,
264                     GRN_TEXT_VALUE(term), GRN_TEXT_LEN(term),
265                     &value);
266   if (id != GRN_ID_NIL) {
267     const char *query = value;
268     GRN_TEXT_PUTS(ctx, expanded_term, query);
269     rc = GRN_SUCCESS;
270   }
271 
272   rc_object = grn_plugin_proc_alloc(ctx, user_data, GRN_DB_INT32, 0);
273   if (rc_object) {
274     GRN_INT32_SET(ctx, rc_object, rc);
275   }
276 
277   return rc_object;
278 }
279 
280 grn_rc
GRN_PLUGIN_INIT(grn_ctx * ctx)281 GRN_PLUGIN_INIT(grn_ctx *ctx)
282 {
283   if (!synonyms) {
284     synonyms = grn_hash_create(ctx, NULL,
285                                GRN_TABLE_MAX_KEY_SIZE,
286                                MAX_SYNONYM_BYTES,
287                                GRN_OBJ_TABLE_HASH_KEY | GRN_OBJ_KEY_VAR_SIZE);
288     if (!synonyms) {
289       return ctx->rc;
290     }
291     load_synonyms(ctx);
292   }
293   return ctx->rc;
294 }
295 
296 grn_rc
GRN_PLUGIN_REGISTER(grn_ctx * ctx)297 GRN_PLUGIN_REGISTER(grn_ctx *ctx)
298 {
299   grn_proc_create(ctx, "QueryExpanderTSV", strlen("QueryExpanderTSV"),
300                   GRN_PROC_FUNCTION,
301                   func_query_expander_tsv, NULL, NULL,
302                   0, NULL);
303   return GRN_SUCCESS;
304 }
305 
306 grn_rc
GRN_PLUGIN_FIN(grn_ctx * ctx)307 GRN_PLUGIN_FIN(grn_ctx *ctx)
308 {
309   if (synonyms) {
310     grn_hash_close(ctx, synonyms);
311     synonyms = NULL;
312   }
313   return GRN_SUCCESS;
314 }
315