1 /*
2 *
3 * re.c -
4 *
5 * $Id: re.c,v 1.32.8.15 2008-03-06 15:38:09 opengl2772 Exp $
6 *
7 * Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
8 * Copyright (C) 2000-2008 Namazu Project All rights reserved.
9 * This is free software with ABSOLUTELY NO WARRANTY.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24 * 02111-1307, USA
25 *
26 * This file must be encoded in EUC-JP encoding
27 *
28 */
29
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 #ifdef HAVE_SUPPORT_H
34 # include "support.h"
35 #endif
36
37 #include <stdio.h>
38 #ifdef HAVE_STDLIB_H
39 # include <stdlib.h>
40 #endif
41
42 #ifdef HAVE_ERRNO_H
43 # include <errno.h>
44 #endif
45
46 #ifdef HAVE_STRING_H
47 # include <string.h>
48 #else
49 # include <strings.h>
50 #endif
51
52 #include "libnamazu.h"
53 #include "regex.h"
54 #include "util.h"
55 #include "hlist.h"
56 #include "replace.h"
57 #include "re.h"
58 #include "l10n-ja.h"
59 #include "var.h"
60
61 #define STEP 256
62
63 static NmzResult nmz_regex_grep_standard ( struct re_pattern_buffer *rp, FILE *fp );
64 static NmzResult nmz_regex_grep_field ( struct re_pattern_buffer *rp, FILE *fp, const char * field );
65
66 /*
67 *
68 * Public functions
69 *
70 */
71
72 /*
73 * FIXME: Dirty coding...
74 */
75 NmzResult
nmz_regex_grep(const char * expr,FILE * fp,const char * field,int field_mode)76 nmz_regex_grep(const char *expr, FILE *fp, const char *field, int field_mode)
77 {
78 char tmpexpr[BUFSIZE] = "";
79 struct re_pattern_buffer *rp;
80 NmzResult val;
81
82 val.num = 0;
83 val.data = NULL;
84 val.stat = SUCCESS;
85
86 if (nmz_is_lang_ja()) {
87 /* japanese only */
88 nmz_re_mbcinit(MBCTYPE_EUC);
89 } else {
90 nmz_re_mbcinit(MBCTYPE_ASCII);
91 }
92
93 rp = ALLOC(struct re_pattern_buffer);
94 MEMZERO((char *)rp, struct re_pattern_buffer, 1);
95 rp->buffer = 0;
96 rp->allocated = 0;
97
98 strncpy(tmpexpr, expr, BUFSIZE - 1); /* save orig_expr */
99 nmz_debug_printf("REGEX: '%s'\n", tmpexpr);
100
101 nmz_re_compile_pattern(tmpexpr, strlen(tmpexpr), rp);
102
103 if (!field_mode) {
104 val = nmz_regex_grep_standard(rp, fp);
105 } else {
106 val = nmz_regex_grep_field(rp, fp, field);
107 }
108
109 nmz_re_free_pattern(rp);
110
111 return val;
112 }
113
114 static NmzResult
nmz_regex_grep_standard(struct re_pattern_buffer * rp,FILE * fp)115 nmz_regex_grep_standard(struct re_pattern_buffer *rp, FILE *fp)
116 {
117 char buf[BUFSIZE] = "";
118 int i, n, maxmatch, maxhit;
119 NmzResult val, tmp;
120
121 val.num = 0;
122 val.data = NULL;
123 val.stat = SUCCESS;
124 tmp.num = 0;
125 tmp.data = NULL;
126 tmp.stat = SUCCESS;
127
128 maxmatch = nmz_get_maxmatch();
129 maxhit = nmz_get_maxhit();
130
131 for (i = n = 0; fgets(buf, BUFSIZE - 1, fp); i++) {
132 if (buf[strlen(buf) - 1] != '\n') { /* too long */
133 i--;
134 continue;
135 }
136 buf[strlen(buf) - 1] = '\0'; /* LF to NULL */
137 if (strlen(buf) == 0) {
138 continue;
139 }
140 nmz_strlower(buf);
141 if (nmz_re_search(rp, buf, strlen(buf), 0, strlen(buf), 0) != -1) {
142 /* Matched */
143 tmp = nmz_get_hlist(i);
144 if (tmp.stat == ERR_FATAL) {
145 nmz_free_hlist(val);
146 return tmp;
147 }
148 if (tmp.num > maxhit) {
149 nmz_free_hlist(tmp);
150 nmz_free_hlist(val);
151 val.data = NULL;
152 val.stat = ERR_TOO_MUCH_HIT;
153 break;
154 }
155
156 if (tmp.num > 0) {
157 n++;
158 if (n > maxmatch) {
159 nmz_free_hlist(tmp);
160 nmz_free_hlist(val);
161 val.data = NULL;
162 val.stat = ERR_TOO_MUCH_MATCH;
163 return val;
164 }
165
166 val = nmz_ormerge(val, tmp);
167 if (val.stat == ERR_FATAL) {
168 return val;
169 }
170 if (val.num > maxhit) {
171 nmz_free_hlist(val);
172 val.data = NULL;
173 val.stat = ERR_TOO_MUCH_HIT;
174 break;
175 }
176 }
177
178 if (nmz_is_debugmode()) {
179 char buf2[BUFSIZE];
180
181 fseek(Nmz.w, nmz_getidxptr(Nmz.wi, i), 0);
182 fgets(buf2, BUFSIZE, Nmz.w);
183 nmz_chomp(buf2);
184 nmz_debug_printf("re: %s, (%d:%s), %d, %d\n",
185 buf2, i, buf, tmp.num, val.num);
186 }
187 }
188 }
189
190 return val;
191 }
192
193 static NmzResult
nmz_regex_grep_field(struct re_pattern_buffer * rp,FILE * fp,const char * field)194 nmz_regex_grep_field(struct re_pattern_buffer *rp, FILE *fp, const char *field)
195 {
196 char buf[BUFSIZE] = "";
197 int i, n, size = 0, maxhit, uri_mode = 0;
198 NmzResult val;
199 FILE *date_index;
200
201 val.num = 0;
202 val.data = NULL;
203 val.stat = SUCCESS;
204
205 date_index = fopen(NMZ.t, "rb");
206 if (date_index == NULL) {
207 nmz_set_dyingmsg(nmz_msg("%s: %s", NMZ.t, strerror(errno)));
208 val.stat = ERR_FATAL;
209 return val; /* error */
210 }
211
212 {
213 nmz_malloc_hlist(&val, size += STEP);
214 if (val.stat == ERR_FATAL) {
215 fclose(date_index);
216 return val;
217 }
218 val.num = 0; /* set 0 for no matching case */
219 if (strcmp(field, "uri") == 0) {
220 uri_mode = 1;
221 }
222 }
223
224 maxhit = nmz_get_maxhit();
225
226 for (i = n = 0; fgets(buf, BUFSIZE - 1, fp); i++) {
227 if (buf[strlen(buf) - 1] != '\n') { /* too long */
228 i--;
229 continue;
230 }
231 buf[strlen(buf) - 1] = '\0'; /* LF to NULL */
232 if (strlen(buf) == 0) {
233 continue;
234 }
235 if (uri_mode) { /* consider the REPLACE directive in namazurc */
236 nmz_replace_uri(buf);
237 }
238 nmz_strlower(buf);
239 if (nmz_re_search(rp, buf, strlen(buf), 0, strlen(buf), 0) != -1) {
240 /* Matched */
241 struct nmz_data data;
242
243 if (fseek(date_index, i * sizeof(data.date), 0) != 0) {
244 nmz_set_dyingmsg(nmz_msg("%s: %s", NMZ.t, strerror(errno)));
245 fclose(date_index);
246 nmz_free_hlist(val);
247 val.data = NULL;
248 val.stat = ERR_FATAL;
249 return val; /* error */
250 }
251 nmz_fread(&data.date, sizeof(data.date), 1, date_index);
252
253 if (data.date == -1) {
254 continue;
255 }
256
257 n++;
258 if (n > maxhit) {
259 fclose(date_index);
260 nmz_free_hlist(val);
261 val.data = NULL;
262 val.stat = ERR_TOO_MUCH_HIT;
263 return val;
264 }
265 {
266 if (n > size) {
267 nmz_realloc_hlist(&val, size += STEP);
268 if (val.stat == ERR_FATAL) {
269 fclose(date_index);
270 return val;
271 }
272 }
273 val.data[n-1].docid = i;
274 val.data[n-1].score = 1; /* score = 1 */
275 val.num = n;
276 }
277
278 if (nmz_is_debugmode()) {
279 nmz_debug_printf("field: [%d]<%s> id: %d\n",
280 val.num, buf, i);
281 }
282 }
283 }
284
285 fclose(date_index);
286
287 return val;
288 }
289