1 /* This file is part of the Zebra server.
2 Copyright (C) 2004-2013 Index Data
3
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
17
18 */
19
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <assert.h>
26 #include <string.h>
27 #include <ctype.h>
28
29 #include <yaz/tpath.h>
30 #include <idzebra/util.h>
31 #include <dfa.h>
32 #include <idzebra/recgrs.h>
33
34 #if HAVE_TCL_H
35 #include <tcl.h>
36
37 #if MAJOR_VERSION >= 8
38 #define HAVE_TCL_OBJECTS
39 #endif
40 #endif
41
/* Set to 1 to compile in the yaz_log tracing guarded by "#if REGX_DEBUG". */
#define REGX_DEBUG 0

/* Position value that f_win_advance stores once no more input can be read. */
#define F_WIN_EOF 2000000000
#define F_WIN_READ 1

/* Token codes produced by readParseToken (which returns 0 both at end of
   input and for an unrecognised token).  REGX_PATTERN, REGX_CODE and
   REGX_END are also stored in lexRuleAction.which by actionListMk. */
#define REGX_EOF 0
#define REGX_PATTERN 1
#define REGX_BODY 2
#define REGX_BEGIN 3
#define REGX_END 4
#define REGX_CODE 5
#define REGX_CONTEXT 6
#define REGX_INIT 7
55
/* A piece of action code taken from between braces in a spec line. */
struct regxCode {
    char *str;          /* NUL-terminated private copy of the code text */
#if HAVE_TCL_OBJECTS
    Tcl_Obj *tcl_obj;   /* the same text as a Tcl object holding one reference */
#endif
};
62
/* One element of an action list; `which` tells which union member is used. */
struct lexRuleAction {
    int which;                   /* REGX_PATTERN, REGX_CODE or REGX_END */
    union {
        struct {
            struct DFA *dfa;     /* REGX_PATTERN */
            int body;            /* 1 when the pattern was preceded by BODY */
        } pattern;
        struct regxCode *code;   /* REGX_CODE */
    } u;
    struct lexRuleAction *next;  /* following action; NULL ends the list */
};
74
/* Payload of a rule: its number and the actions to run for it. */
struct lexRuleInfo {
    int no;                            /* rule number; index into fastRule */
    struct lexRuleAction *actionList;  /* actions built by actionListMk */
};
79
/* Linked-list node that holds one rule of a context. */
struct lexRule {
    struct lexRuleInfo info;
    struct lexRule *next;   /* next rule in the context's list */
};
84
/* A named set of rules together with its begin/end/init action lists. */
struct lexContext {
    char *name;                   /* context name ("main" when none declared) */
    struct DFA *dfa;              /* DFA into which every rule pattern is parsed */
    struct lexRule *rules;        /* rules, most recently added first */
    struct lexRuleInfo **fastRule;/* rule lookup by number; built by readFileSpec */
    int ruleNo;                   /* next rule number to hand out (starts at 1) */
    int initFlag;                 /* cleared at creation */

    struct lexRuleAction *beginActionList;
    struct lexRuleAction *endActionList;
    struct lexRuleAction *initActionList;
    struct lexContext *next;      /* next context in the spec's list */
};
98
/* Growable buffer in which execData accumulates the text of one level. */
struct lexConcatBuf {
    int max;     /* allocated size of buf in bytes (0 when buf is unset) */
    char *buf;   /* accumulated bytes; not NUL-terminated */
};
103
/* Complete state of one filter: the parsed specification, the window onto
   the input stream and the data1 tree being built. */
struct lexSpec {
    char *name;                  /* base name used to locate the spec file */
    struct lexContext *context;  /* contexts, most recently declared first */

    struct lexContext **context_stack; /* contexts entered via "begin context" */
    int context_stack_size;      /* number of slots in context_stack */
    int context_stack_top;       /* index of the current top entry */

    int lineNo;                  /* line counter maintained by readFileSpec */
    NMEM m;                      /* memory handle used for node allocations */
    data1_handle dh;
#if HAVE_TCL_H
    Tcl_Interp *tcl_interp;      /* Tcl interpreter, or 0 when none is attached */
#endif
    struct ZebraRecStream *stream; /* stream passed to the f_win_* callbacks */
    off_t (*f_win_ef)(struct ZebraRecStream *s, off_t *);

    int f_win_start;      /* first byte of buffer is this file offset */
    int f_win_end;        /* last byte of buffer is this offset - 1 */
    int f_win_size;       /* size of buffer */
    char *f_win_buf;      /* buffer itself */
    int (*f_win_rf)(struct ZebraRecStream *, char *, size_t); /* read callback */
    off_t (*f_win_sf)(struct ZebraRecStream *, off_t);        /* seek callback */

    struct lexConcatBuf *concatBuf; /* maxLevel text buffers, one per level */
    int maxLevel;                /* number of entries in concatBuf and d1_stack */
    data1_node **d1_stack;       /* data1 nodes indexed by nesting level */
    int d1_level;                /* current nesting level (0: no record begun) */
    int stop_flag;               /* set to 1 when the record has been ended */

    int *arg_start;              /* start offset of each $n argument */
    int *arg_end;                /* end offset of each $n argument */
    int arg_no;                  /* number of valid arg_start/arg_end entries */
    int ptr;                     /* input position; cmd_tcl_unread rewrites it */
};
139
/* Wrapper pairing a filter specification with a type string. */
struct lexSpecs {
    struct lexSpec *spec;
    char type[256];   /* NOTE(review): presumably the filter type name - confirm */
};
144
f_win_get(struct lexSpec * spec,off_t start_pos,off_t end_pos,int * size)145 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
146 int *size)
147 {
148 int i, r, off = start_pos - spec->f_win_start;
149
150 if (off >= 0 && end_pos <= spec->f_win_end)
151 {
152 *size = end_pos - start_pos;
153 return spec->f_win_buf + off;
154 }
155 if (off < 0 || start_pos >= spec->f_win_end)
156 {
157 (*spec->f_win_sf)(spec->stream, start_pos);
158 spec->f_win_start = start_pos;
159
160 if (!spec->f_win_buf)
161 spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
162 *size = (*spec->f_win_rf)(spec->stream, spec->f_win_buf,
163 spec->f_win_size);
164 spec->f_win_end = spec->f_win_start + *size;
165
166 if (*size > end_pos - start_pos)
167 *size = end_pos - start_pos;
168 return spec->f_win_buf;
169 }
170 for (i = 0; i<spec->f_win_end - start_pos; i++)
171 spec->f_win_buf[i] = spec->f_win_buf[i + off];
172 r = (*spec->f_win_rf)(spec->stream,
173 spec->f_win_buf + i,
174 spec->f_win_size - i);
175 spec->f_win_start = start_pos;
176 spec->f_win_end += r;
177 *size = i + r;
178 if (*size > end_pos - start_pos)
179 *size = end_pos - start_pos;
180 return spec->f_win_buf;
181 }
182
f_win_advance(struct lexSpec * spec,int * pos)183 static int f_win_advance (struct lexSpec *spec, int *pos)
184 {
185 int size;
186 char *buf;
187
188 if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
189 return spec->f_win_buf[(*pos)++ - spec->f_win_start];
190 if (*pos == F_WIN_EOF)
191 return 0;
192 buf = f_win_get (spec, *pos, *pos+1, &size);
193 if (size == 1)
194 {
195 (*pos)++;
196 return *buf;
197 }
198 *pos = F_WIN_EOF;
199 return 0;
200 }
201
regxCodeDel(struct regxCode ** pp)202 static void regxCodeDel (struct regxCode **pp)
203 {
204 struct regxCode *p = *pp;
205 if (p)
206 {
207 #if HAVE_TCL_OBJECTS
208 if (p->tcl_obj)
209 Tcl_DecrRefCount (p->tcl_obj);
210 #endif
211 xfree (p->str);
212 xfree (p);
213 *pp = NULL;
214 }
215 }
216
regxCodeMk(struct regxCode ** pp,const char * buf,int len)217 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
218 {
219 struct regxCode *p;
220
221 p = (struct regxCode *) xmalloc (sizeof(*p));
222 p->str = (char *) xmalloc (len+1);
223 memcpy (p->str, buf, len);
224 p->str[len] = '\0';
225 #if HAVE_TCL_OBJECTS
226 p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
227 if (p->tcl_obj)
228 Tcl_IncrRefCount (p->tcl_obj);
229 #endif
230 *pp = p;
231 }
232
/*
 * Create a DFA handle configured for spec-file patterns: the parser's
 * character map entries for space and tab are removed and an entry for
 * '/' with value 0 is added.
 * NOTE(review): presumably this makes '/' end a pattern, since callers
 * test for '/' right after dfa_parse - confirm against the dfa module.
 */
static struct DFA *lexSpecDFA (void)
{
    struct DFA *d = dfa_init ();

    dfa_parse_cmap_del (d, ' ');
    dfa_parse_cmap_del (d, '\t');
    dfa_parse_cmap_add (d, '/', 0);
    return d;
}
243
actionListDel(struct lexRuleAction ** rap)244 static void actionListDel (struct lexRuleAction **rap)
245 {
246 struct lexRuleAction *ra1, *ra;
247
248 for (ra = *rap; ra; ra = ra1)
249 {
250 ra1 = ra->next;
251 switch (ra->which)
252 {
253 case REGX_PATTERN:
254 dfa_delete (&ra->u.pattern.dfa);
255 break;
256 case REGX_CODE:
257 regxCodeDel (&ra->u.code);
258 break;
259 }
260 xfree (ra);
261 }
262 *rap = NULL;
263 }
264
lexContextCreate(const char * name)265 static struct lexContext *lexContextCreate (const char *name)
266 {
267 struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
268
269 p->name = xstrdup (name);
270 p->ruleNo = 1;
271 p->initFlag = 0;
272 p->dfa = lexSpecDFA ();
273 p->rules = NULL;
274 p->fastRule = NULL;
275 p->beginActionList = NULL;
276 p->endActionList = NULL;
277 p->initActionList = NULL;
278 p->next = NULL;
279 return p;
280 }
281
lexContextDestroy(struct lexContext * p)282 static void lexContextDestroy (struct lexContext *p)
283 {
284 struct lexRule *rp, *rp1;
285
286 dfa_delete (&p->dfa);
287 xfree (p->fastRule);
288 for (rp = p->rules; rp; rp = rp1)
289 {
290 rp1 = rp->next;
291 actionListDel (&rp->info.actionList);
292 xfree (rp);
293 }
294 actionListDel (&p->beginActionList);
295 actionListDel (&p->endActionList);
296 actionListDel (&p->initActionList);
297 xfree (p->name);
298 xfree (p);
299 }
300
/*
 * Allocate a filter specification object called `name` (copied) that
 * uses the data1 handle dh.
 *
 * Set up here: an empty context list, a context stack with 100 slots,
 * no window buffer, and 128 levels of concat buffers / node stack with
 * d1_level at 0.  The remaining members are left for the caller to fill.
 */
static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
{
    struct lexSpec *spec = (struct lexSpec *) xmalloc (sizeof(*spec));
    int level;

    spec->name = xstrdup (name);

#if HAVE_TCL_H
    spec->tcl_interp = 0;
#endif
    spec->dh = dh;
    spec->context = NULL;
    spec->context_stack_size = 100;
    spec->context_stack = (struct lexContext **)
        xmalloc (sizeof(*spec->context_stack) * spec->context_stack_size);
    spec->f_win_buf = NULL;

    spec->maxLevel = 128;
    spec->concatBuf = (struct lexConcatBuf *)
        xmalloc (sizeof(*spec->concatBuf) * spec->maxLevel);
    level = 0;
    while (level < spec->maxLevel)
    {
        struct lexConcatBuf *cb = &spec->concatBuf[level++];

        cb->max = 0;
        cb->buf = 0;
    }
    spec->d1_stack = (data1_node **)
        xmalloc (sizeof(*spec->d1_stack) * spec->maxLevel);
    spec->d1_level = 0;
    return spec;
}
332
lexSpecDestroy(struct lexSpec ** pp)333 static void lexSpecDestroy (struct lexSpec **pp)
334 {
335 struct lexSpec *p;
336 struct lexContext *lt;
337 int i;
338
339 assert (pp);
340 p = *pp;
341 if (!p)
342 return ;
343
344 for (i = 0; i < p->maxLevel; i++)
345 xfree (p->concatBuf[i].buf);
346 xfree (p->concatBuf);
347
348 lt = p->context;
349 while (lt)
350 {
351 struct lexContext *lt_next = lt->next;
352 lexContextDestroy (lt);
353 lt = lt_next;
354 }
355 #if HAVE_TCL_OBJECTS
356 if (p->tcl_interp)
357 Tcl_DeleteInterp (p->tcl_interp);
358 #endif
359 xfree (p->name);
360 xfree (p->f_win_buf);
361 xfree (p->context_stack);
362 xfree (p->d1_stack);
363 xfree (p);
364 *pp = NULL;
365 }
366
readParseToken(const char ** cpp,int * len)367 static int readParseToken (const char **cpp, int *len)
368 {
369 const char *cp = *cpp;
370 char cmd[32];
371 int i, level;
372
373 while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
374 cp++;
375 switch (*cp)
376 {
377 case '\0':
378 return 0;
379 case '/':
380 *cpp = cp+1;
381 return REGX_PATTERN;
382 case '{':
383 *cpp = cp+1;
384 level = 1;
385 while (*++cp)
386 {
387 if (*cp == '{')
388 level++;
389 else if (*cp == '}')
390 {
391 level--;
392 if (level == 0)
393 break;
394 }
395 }
396 *len = cp - *cpp;
397 return REGX_CODE;
398 default:
399 i = 0;
400 while (1)
401 {
402 if (*cp >= 'a' && *cp <= 'z')
403 cmd[i] = *cp;
404 else if (*cp >= 'A' && *cp <= 'Z')
405 cmd[i] = *cp + 'a' - 'A';
406 else
407 break;
408 if (i < (int) sizeof(cmd)-2)
409 i++;
410 cp++;
411 }
412 cmd[i] = '\0';
413 if (i == 0)
414 {
415 yaz_log (YLOG_WARN, "bad character %d %c", *cp, *cp);
416 cp++;
417 while (*cp && *cp != ' ' && *cp != '\t' &&
418 *cp != '\n' && *cp != '\r')
419 cp++;
420 *cpp = cp;
421 return 0;
422 }
423 *cpp = cp;
424 if (!strcmp (cmd, "begin"))
425 return REGX_BEGIN;
426 else if (!strcmp (cmd, "end"))
427 return REGX_END;
428 else if (!strcmp (cmd, "body"))
429 return REGX_BODY;
430 else if (!strcmp (cmd, "context"))
431 return REGX_CONTEXT;
432 else if (!strcmp (cmd, "init"))
433 return REGX_INIT;
434 else
435 {
436 yaz_log (YLOG_WARN, "bad command %s", cmd);
437 return 0;
438 }
439 }
440 }
441
actionListMk(struct lexSpec * spec,const char * s,struct lexRuleAction ** ap)442 static int actionListMk (struct lexSpec *spec, const char *s,
443 struct lexRuleAction **ap)
444 {
445 int r, tok, len;
446 int bodyMark = 0;
447 const char *s0;
448
449 while ((tok = readParseToken (&s, &len)))
450 {
451 switch (tok)
452 {
453 case REGX_BODY:
454 bodyMark = 1;
455 continue;
456 case REGX_CODE:
457 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
458 (*ap)->which = tok;
459 regxCodeMk (&(*ap)->u.code, s, len);
460 s += len+1;
461 break;
462 case REGX_PATTERN:
463 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
464 (*ap)->which = tok;
465 (*ap)->u.pattern.body = bodyMark;
466 bodyMark = 0;
467 (*ap)->u.pattern.dfa = lexSpecDFA ();
468 s0 = s;
469 r = dfa_parse ((*ap)->u.pattern.dfa, &s);
470 if (r || *s != '/')
471 {
472 int pos = s - s0;
473 xfree (*ap);
474 *ap = NULL;
475 yaz_log(YLOG_WARN, "regular expression error '%.*s'", pos, s0);
476 return -1;
477 }
478 else
479 {
480 int pos = s - s0;
481 if (debug_dfa_tran)
482 printf("pattern: %.*s\n", pos, s0);
483 dfa_mkstate((*ap)->u.pattern.dfa);
484 s++;
485 }
486 break;
487 case REGX_BEGIN:
488 yaz_log (YLOG_WARN, "cannot use BEGIN here");
489 continue;
490 case REGX_INIT:
491 yaz_log (YLOG_WARN, "cannot use INIT here");
492 continue;
493 case REGX_END:
494 *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
495 (*ap)->which = tok;
496 break;
497 }
498 ap = &(*ap)->next;
499 }
500 *ap = NULL;
501 return 0;
502 }
503
readOneSpec(struct lexSpec * spec,const char * s)504 int readOneSpec (struct lexSpec *spec, const char *s)
505 {
506 int len, r, tok;
507 struct lexRule *rp;
508 struct lexContext *lc;
509
510 tok = readParseToken (&s, &len);
511 if (tok == REGX_CONTEXT)
512 {
513 char context_name[32];
514 tok = readParseToken (&s, &len);
515 if (tok != REGX_CODE)
516 {
517 yaz_log (YLOG_WARN, "missing name after CONTEXT keyword");
518 return 0;
519 }
520 if (len > 31)
521 len = 31;
522 memcpy (context_name, s, len);
523 context_name[len] = '\0';
524 lc = lexContextCreate (context_name);
525 lc->next = spec->context;
526 spec->context = lc;
527 return 0;
528 }
529 if (!spec->context)
530 spec->context = lexContextCreate ("main");
531
532 switch (tok)
533 {
534 case REGX_BEGIN:
535 actionListDel (&spec->context->beginActionList);
536 actionListMk (spec, s, &spec->context->beginActionList);
537 break;
538 case REGX_END:
539 actionListDel (&spec->context->endActionList);
540 actionListMk (spec, s, &spec->context->endActionList);
541 break;
542 case REGX_INIT:
543 actionListDel (&spec->context->initActionList);
544 actionListMk (spec, s, &spec->context->initActionList);
545 break;
546 case REGX_PATTERN:
547 #if REGX_DEBUG
548 yaz_log (YLOG_LOG, "rule %d %s", spec->context->ruleNo, s);
549 #endif
550 r = dfa_parse (spec->context->dfa, &s);
551 if (r)
552 {
553 yaz_log (YLOG_WARN, "regular expression error. r=%d", r);
554 return -1;
555 }
556 if (*s != '/')
557 {
558 yaz_log (YLOG_WARN, "expects / at end of pattern. got %c", *s);
559 return -1;
560 }
561 s++;
562 rp = (struct lexRule *) xmalloc (sizeof(*rp));
563 rp->info.no = spec->context->ruleNo++;
564 rp->next = spec->context->rules;
565 spec->context->rules = rp;
566 actionListMk (spec, s, &rp->info.actionList);
567 }
568 return 0;
569 }
570
readFileSpec(struct lexSpec * spec)571 int readFileSpec (struct lexSpec *spec)
572 {
573 struct lexContext *lc;
574 int c, i, errors = 0;
575 FILE *spec_inf = 0;
576 WRBUF lineBuf;
577 char fname[256];
578
579 #if HAVE_TCL_H
580 if (spec->tcl_interp)
581 {
582 sprintf (fname, "%s.tflt", spec->name);
583 spec_inf = data1_path_fopen (spec->dh, fname, "r");
584 }
585 #endif
586 if (!spec_inf)
587 {
588 sprintf (fname, "%s.flt", spec->name);
589 spec_inf = data1_path_fopen (spec->dh, fname, "r");
590 }
591 if (!spec_inf)
592 {
593 yaz_log (YLOG_ERRNO|YLOG_WARN, "cannot read spec file %s", spec->name);
594 return -1;
595 }
596 yaz_log (YLOG_LOG, "reading regx filter %s", fname);
597 #if HAVE_TCL_H
598 if (spec->tcl_interp)
599 yaz_log (YLOG_LOG, "Tcl enabled");
600 #endif
601
602 #if 0
603 debug_dfa_trav = 0;
604 debug_dfa_tran = 1;
605 debug_dfa_followpos = 0;
606 dfa_verbose = 1;
607 #endif
608
609 lineBuf = wrbuf_alloc();
610 spec->lineNo = 0;
611 c = getc (spec_inf);
612 while (c != EOF)
613 {
614 wrbuf_rewind (lineBuf);
615 if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
616 {
617 while (c != '\n' && c != EOF)
618 c = getc (spec_inf);
619 spec->lineNo++;
620 if (c == '\n')
621 c = getc (spec_inf);
622 }
623 else
624 {
625 int addLine = 0;
626
627 while (1)
628 {
629 int c1 = c;
630 wrbuf_putc(lineBuf, c);
631 c = getc (spec_inf);
632 while (c == '\r')
633 c = getc (spec_inf);
634 if (c == EOF)
635 break;
636 if (c1 == '\n')
637 {
638 if (c != ' ' && c != '\t')
639 break;
640 addLine++;
641 }
642 }
643 wrbuf_putc(lineBuf, '\0');
644 readOneSpec (spec, wrbuf_buf(lineBuf));
645 spec->lineNo += addLine;
646 }
647 }
648 fclose (spec_inf);
649 wrbuf_destroy(lineBuf);
650
651 for (lc = spec->context; lc; lc = lc->next)
652 {
653 struct lexRule *rp;
654 lc->fastRule = (struct lexRuleInfo **)
655 xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
656 for (i = 0; i < lc->ruleNo; i++)
657 lc->fastRule[i] = NULL;
658 for (rp = lc->rules; rp; rp = rp->next)
659 lc->fastRule[rp->info.no] = &rp->info;
660 dfa_mkstate (lc->dfa);
661 }
662 if (errors)
663 return -1;
664
665 return 0;
666 }
667
668 #if 0
669 static struct lexSpec *curLexSpec = NULL;
670 #endif
671
execData(struct lexSpec * spec,const char * ebuf,int elen,int formatted_text,const char * attribute_str,int attribute_len)672 static void execData (struct lexSpec *spec,
673 const char *ebuf, int elen, int formatted_text,
674 const char *attribute_str, int attribute_len)
675 {
676 struct data1_node *res, *parent;
677 int org_len;
678
679 if (elen == 0) /* shouldn't happen, but it does! */
680 return ;
681 #if REGX_DEBUG
682 if (elen > 80)
683 yaz_log (YLOG_LOG, "data(%d bytes) %.40s ... %.*s", elen,
684 ebuf, 40, ebuf + elen-40);
685 else if (elen == 1 && ebuf[0] == '\n')
686 {
687 yaz_log (YLOG_LOG, "data(new line)");
688 }
689 else if (elen > 0)
690 yaz_log (YLOG_LOG, "data(%d bytes) %.*s", elen, elen, ebuf);
691 else
692 yaz_log (YLOG_LOG, "data(%d bytes)", elen);
693 #endif
694
695 if (spec->d1_level <= 1)
696 return;
697
698 parent = spec->d1_stack[spec->d1_level -1];
699 assert (parent);
700
701 if (attribute_str)
702 {
703 data1_xattr **ap;
704 res = parent;
705 if (res->which != DATA1N_tag)
706 return;
707 /* sweep through exising attributes.. */
708 for (ap = &res->u.tag.attributes; *ap; ap = &(*ap)->next)
709 if (strlen((*ap)->name) == attribute_len &&
710 !memcmp((*ap)->name, attribute_str, attribute_len))
711 break;
712 if (!*ap)
713 {
714 /* new attribute. Create it with name + value */
715 *ap = nmem_malloc(spec->m, sizeof(**ap));
716
717 (*ap)->name = nmem_malloc(spec->m, attribute_len+1);
718 memcpy((*ap)->name, attribute_str, attribute_len);
719 (*ap)->name[attribute_len] = '\0';
720
721 (*ap)->value = nmem_malloc(spec->m, elen+1);
722 memcpy((*ap)->value, ebuf, elen);
723 (*ap)->value[elen] = '\0';
724 (*ap)->next = 0;
725 }
726 else
727 {
728 /* append to value if attribute already exists */
729 char *nv = nmem_malloc(spec->m, elen + 1 + strlen((*ap)->value));
730 strcpy(nv, (*ap)->value);
731 memcpy (nv + strlen(nv), ebuf, elen);
732 nv[strlen(nv)+elen] = '\0';
733 (*ap)->value = nv;
734 }
735 }
736 else
737 {
738 if ((res = spec->d1_stack[spec->d1_level]) &&
739 res->which == DATA1N_data)
740 org_len = res->u.data.len;
741 else
742 {
743 org_len = 0;
744
745 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent);
746 res->u.data.what = DATA1I_text;
747 res->u.data.len = 0;
748 res->u.data.formatted_text = formatted_text;
749 res->u.data.data = 0;
750
751 if (spec->d1_stack[spec->d1_level])
752 spec->d1_stack[spec->d1_level]->next = res;
753 spec->d1_stack[spec->d1_level] = res;
754 }
755 if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
756 {
757 char *old_buf, *new_buf;
758
759 spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
760 new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
761 if ((old_buf = spec->concatBuf[spec->d1_level].buf))
762 {
763 memcpy (new_buf, old_buf, org_len);
764 xfree (old_buf);
765 }
766 spec->concatBuf[spec->d1_level].buf = new_buf;
767 }
768 memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
769 res->u.data.len += elen;
770 }
771 }
772
/*
 * Add plain data to the current element: same as execData without an
 * attribute name.
 */
static void execDataP (struct lexSpec *spec,
                       const char *ebuf, int elen, int formatted_text)
{
    const char *no_attribute = 0;

    execData (spec, ebuf, elen, formatted_text, no_attribute, 0);
}
778
tagDataRelease(struct lexSpec * spec)779 static void tagDataRelease (struct lexSpec *spec)
780 {
781 data1_node *res;
782
783 if ((res = spec->d1_stack[spec->d1_level]) &&
784 res->which == DATA1N_data &&
785 res->u.data.what == DATA1I_text)
786 {
787 assert (!res->u.data.data);
788 assert (res->u.data.len > 0);
789 if (res->u.data.len > DATA1_LOCALDATA)
790 res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
791 else
792 res->u.data.data = res->lbuf;
793 memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
794 res->u.data.len);
795 }
796 }
797
variantBegin(struct lexSpec * spec,const char * class_str,int class_len,const char * type_str,int type_len,const char * value_str,int value_len)798 static void variantBegin (struct lexSpec *spec,
799 const char *class_str, int class_len,
800 const char *type_str, int type_len,
801 const char *value_str, int value_len)
802 {
803 struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
804 char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
805 data1_vartype *tp;
806 int i;
807 data1_node *res;
808
809 if (spec->d1_level == 0)
810 {
811 yaz_log (YLOG_WARN, "in variant begin. No record type defined");
812 return ;
813 }
814 if (class_len >= DATA1_MAX_SYMBOL)
815 class_len = DATA1_MAX_SYMBOL-1;
816 memcpy (tclass, class_str, class_len);
817 tclass[class_len] = '\0';
818
819 if (type_len >= DATA1_MAX_SYMBOL)
820 type_len = DATA1_MAX_SYMBOL-1;
821 memcpy (ttype, type_str, type_len);
822 ttype[type_len] = '\0';
823
824 #if REGX_DEBUG
825 yaz_log (YLOG_LOG, "variant begin(%s,%s,%d)", tclass, ttype,
826 spec->d1_level);
827 #endif
828
829 if (!(tp =
830 data1_getvartypeby_absyn(spec->dh, parent->root->u.root.absyn,
831 tclass, ttype)))
832 return;
833
834 if (parent->which != DATA1N_variant)
835 {
836 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
837 if (spec->d1_stack[spec->d1_level])
838 tagDataRelease (spec);
839 spec->d1_stack[spec->d1_level] = res;
840 spec->d1_stack[++(spec->d1_level)] = NULL;
841 }
842 for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
843 if (spec->d1_stack[i]->u.variant.type == tp)
844 {
845 spec->d1_level = i;
846 break;
847 }
848
849 #if REGX_DEBUG
850 yaz_log (YLOG_LOG, "variant node(%d)", spec->d1_level);
851 #endif
852 parent = spec->d1_stack[spec->d1_level-1];
853 res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
854 res->u.variant.type = tp;
855
856 if (value_len >= DATA1_LOCALDATA)
857 value_len =DATA1_LOCALDATA-1;
858 memcpy (res->lbuf, value_str, value_len);
859 res->lbuf[value_len] = '\0';
860
861 res->u.variant.value = res->lbuf;
862
863 if (spec->d1_stack[spec->d1_level])
864 tagDataRelease (spec);
865 spec->d1_stack[spec->d1_level] = res;
866 spec->d1_stack[++(spec->d1_level)] = NULL;
867 }
868
/*
 * Trim leading and trailing white space from the string *tag of length
 * *len (the string need not be NUL-terminated).  *tag is advanced past
 * the leading white space and *len is reduced accordingly; the text
 * itself is not modified.
 *
 * A NULL *tag or a non-positive *len is left untouched.
 */
static void tagStrip (const char **tag, int *len)
{
    const char *s = *tag;
    int n = *len;
    int skip = 0;

    if (!s || n <= 0)
        return;

    /* isspace requires an unsigned char value: bytes above 0x7f would
       otherwise be passed as negative numbers */
    while (n > 0 && isspace((unsigned char) s[n-1]))
        n--;
    while (skip < n && isspace((unsigned char) s[skip]))
        skip++;

    *tag = s + skip;
    *len = n - skip;
}
881
tagBegin(struct lexSpec * spec,const char * tag,int len)882 static void tagBegin (struct lexSpec *spec,
883 const char *tag, int len)
884 {
885 if (spec->d1_level == 0)
886 {
887 yaz_log (YLOG_WARN, "in element begin. No record type defined");
888 return ;
889 }
890 tagStrip (&tag, &len);
891 if (spec->d1_stack[spec->d1_level])
892 tagDataRelease (spec);
893
894 #if REGX_DEBUG
895 yaz_log (YLOG_LOG, "begin tag(%.*s, %d)", len, tag, spec->d1_level);
896 #endif
897
898 spec->d1_stack[spec->d1_level] = data1_mk_tag_n (
899 spec->dh, spec->m, tag, len, 0, spec->d1_stack[spec->d1_level -1]);
900 spec->d1_stack[++(spec->d1_level)] = NULL;
901 }
902
tagEnd(struct lexSpec * spec,int min_level,const char * tag,int len)903 static void tagEnd (struct lexSpec *spec, int min_level,
904 const char *tag, int len)
905 {
906 tagStrip (&tag, &len);
907 while (spec->d1_level > min_level)
908 {
909 tagDataRelease (spec);
910 (spec->d1_level)--;
911 if (spec->d1_level == 0)
912 break;
913 if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
914 (!tag ||
915 (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
916 (size_t) len &&
917 !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
918 break;
919 }
920 #if REGX_DEBUG
921 yaz_log (YLOG_LOG, "end tag(%d)", spec->d1_level);
922 #endif
923 }
924
925
/*
 * Search the input for text accepted by dfa.
 *
 * spec    supplies the input (read through f_win_advance).
 * pptr    in: position at which the search starts;
 *         out (on success only): position just past the end of the match.
 * mptr    out (on success only): position of the first matched character.
 * dfa     automaton to run; dfa->states[0] is the start state.
 * greedy  not referenced by this function.
 *
 * Returns 1 when a match was found and 0 when the end of input
 * (F_WIN_EOF) was reached without one.
 */
static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
                     struct DFA *dfa, int greedy)
{
    struct DFA_state *state = dfa->states[0];
    struct DFA_tran *t;
    unsigned char c = 0;
    unsigned char c_prev = 0;
    int ptr = *pptr;          /* current pointer */
    int start_ptr = *pptr;    /* first char of match */
    int last_ptr = 0;         /* last char of match */
    int last_rule = 0;        /* rule number of current match */
    int restore_ptr = 0;
    int i;

    /* fetch the character just before the start position so that c_prev
       is known for the first attempt */
    if (ptr)
    {
        --ptr;
        c = f_win_advance (spec, &ptr);
    }
    while (1)
    {
        /* At the start state a new attempt begins: remember the preceding
           character and the position to come back to if it fails. */
        if (dfa->states[0] == state)
        {
            c_prev = c;
            restore_ptr = ptr;
        }
        c = f_win_advance (spec, &ptr);

        if (ptr == F_WIN_EOF)
        {
            /* input exhausted: report the match recorded so far, if any */
            if (last_rule)
            {
                *mptr = start_ptr;
                *pptr = last_ptr;
                return 1;
            }
            break;
        }

        /* look for a transition of the current state that covers c */
        t = state->trans;
        i = state->tran_no;
        while (1)
            if (--i < 0)    /* no transition for character c */
            {
                if (last_rule)
                {
                    *mptr = start_ptr;     /* match starts here */
                    *pptr = last_ptr;      /* match end here (+1) */
                    return 1;
                }
                /* attempt failed: restart one character after the point
                   where this attempt began */
                state = dfa->states[0];

                ptr = restore_ptr;
                c = f_win_advance (spec, &ptr);

                start_ptr = ptr;

                break;
            }
            else if (c >= t->ch[0] && c <= t->ch[1])
            {
                state = dfa->states[t->to];
                /* rule_no is honoured only when the character before the
                   attempt was a newline; otherwise rule_nno is used */
                if (state->rule_no && c_prev == '\n')
                {
                    last_rule = state->rule_no;
                    last_ptr = ptr;
                }
                else if (state->rule_nno)
                {
                    last_rule = state->rule_nno;
                    last_ptr = ptr;
                }
                break;
            }
            else
                t++;
    }
    return 0;
}
1005
execTok(struct lexSpec * spec,const char ** src,const char ** tokBuf,int * tokLen)1006 static int execTok (struct lexSpec *spec, const char **src,
1007 const char **tokBuf, int *tokLen)
1008 {
1009 const char *s = *src;
1010
1011 while (*s == ' ' || *s == '\t')
1012 s++;
1013 if (!*s)
1014 return 0;
1015 if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1016 {
1017 int n = 0;
1018 s++;
1019 while (*s >= '0' && *s <= '9')
1020 n = n*10 + (*s++ -'0');
1021 if (spec->arg_no == 0)
1022 {
1023 *tokBuf = "";
1024 *tokLen = 0;
1025 }
1026 else
1027 {
1028 if (n >= spec->arg_no)
1029 n = spec->arg_no-1;
1030 *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1031 tokLen);
1032 }
1033 }
1034 else if (*s == '\"')
1035 {
1036 *tokBuf = ++s;
1037 while (*s && *s != '\"')
1038 s++;
1039 *tokLen = s - *tokBuf;
1040 if (*s)
1041 s++;
1042 *src = s;
1043 }
1044 else if (*s == '\n' || *s == ';')
1045 {
1046 *src = s+1;
1047 return 1;
1048 }
1049 else if (*s == '-')
1050 {
1051 *tokBuf = s++;
1052 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1053 *s != ';')
1054 s++;
1055 *tokLen = s - *tokBuf;
1056 *src = s;
1057 return 3;
1058 }
1059 else
1060 {
1061 *tokBuf = s++;
1062 while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1063 *s != ';')
1064 s++;
1065 *tokLen = s - *tokBuf;
1066 }
1067 *src = s;
1068 return 2;
1069 }
1070
/*
 * Copy at most 63 bytes from src to str and NUL-terminate the result;
 * returns str.  str must have room for 64 bytes.  A negative len is
 * treated as 0 so that it can never reach memcpy as a huge size.
 */
static char *regxStrz (const char *src, int len, char *str)
{
    if (len > 63)
        len = 63;
    else if (len < 0)
        len = 0;
    memcpy (str, src, len);
    str[len] = '\0';
    return str;
}
1079
1080 #if HAVE_TCL_H
/*
 * Tcl command handler for "begin".  clientData is the struct lexSpec.
 *
 *   begin record <absyn>     start a new record: the node stack is reset
 *                            to a root node plus a tag named <absyn>
 *   begin element <tag>      open an element
 *   begin variant <class> <type> <value>
 *                            open a variant
 *   begin context <name>     enter the named context; an unknown name or
 *                            a full context stack is reported and ignored
 *
 * Returns TCL_OK, or TCL_ERROR for any other argument combination.
 */
static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
                          int argc, const char **argv)
{
    struct lexSpec *spec = (struct lexSpec *) clientData;
    if (argc < 2)
        return TCL_ERROR;
    if (!strcmp(argv[1], "record") && argc == 3)
    {
        const char *absynName = argv[2];
        data1_node *res;

#if REGX_DEBUG
        yaz_log (YLOG_LOG, "begin record %s", absynName);
#endif
        res = data1_mk_root (spec->dh, spec->m, absynName);

        spec->d1_level = 0;

        spec->d1_stack[spec->d1_level++] = res;

        res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);

        spec->d1_stack[spec->d1_level++] = res;

        spec->d1_stack[spec->d1_level] = NULL;
    }
    else if (!strcmp(argv[1], "element") && argc == 3)
    {
        tagBegin (spec, argv[2], strlen(argv[2]));
    }
    else if (!strcmp (argv[1], "variant") && argc == 5)
    {
        variantBegin (spec, argv[2], strlen(argv[2]),
                      argv[3], strlen(argv[3]),
                      argv[4], strlen(argv[4]));
    }
    else if (!strcmp (argv[1], "context") && argc == 3)
    {
        struct lexContext *lc = spec->context;
#if REGX_DEBUG
        yaz_log (YLOG_LOG, "begin context %s",argv[2]);
#endif
        while (lc && strcmp (argv[2], lc->name))
            lc = lc->next;
        if (!lc)
            yaz_log (YLOG_WARN, "unknown context %s", argv[2]);
        else if (spec->context_stack_top + 1 >= spec->context_stack_size)
            /* context_stack has context_stack_size slots only */
            yaz_log (YLOG_WARN, "context stack full; cannot begin context %s",
                     argv[2]);
        else
            spec->context_stack[++(spec->context_stack_top)] = lc;
    }
    else
        return TCL_ERROR;
    return TCL_OK;
}
1136
/*
 * Tcl command handler for "end".  clientData is the struct lexSpec.
 *
 *   end record                     finish all open levels and stop
 *   end element [-record] [<tag>]  close elements (down to level 2, or to
 *                                  level 0 with -record), optionally only
 *                                  up to the named tag; stops the record
 *                                  when the level drops to 1 or below
 *   end context                    leave the current context, unless the
 *                                  context stack is at its bottom
 *
 * Returns TCL_OK, or TCL_ERROR for a missing or unknown sub-command.
 */
static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
                        int argc, const char **argv)
{
    struct lexSpec *spec = (struct lexSpec *) clientData;
    const char *what;

    if (argc < 2)
        return TCL_ERROR;
    what = argv[1];

    if (strcmp (what, "record") == 0)
    {
        for (; spec->d1_level; (spec->d1_level)--)
            tagDataRelease (spec);
#if REGX_DEBUG
        yaz_log (YLOG_LOG, "end record");
#endif
        spec->stop_flag = 1;
        return TCL_OK;
    }
    if (strcmp (what, "element") == 0)
    {
        const char *element = 0;
        int min_level = 2;
        int with_record = (argc >= 3 && strcmp(argv[2], "-record") == 0);

        if (with_record)
        {
            min_level = 0;
            if (argc == 4)
                element = argv[3];
        }
        else if (argc == 3)
            element = argv[2];

        tagEnd (spec, min_level, element, (element ? strlen(element) : 0));
        if (spec->d1_level <= 1)
        {
#if REGX_DEBUG
            yaz_log (YLOG_LOG, "end element end records");
#endif
            spec->stop_flag = 1;
        }
        return TCL_OK;
    }
    if (strcmp (what, "context") == 0)
    {
#if REGX_DEBUG
        yaz_log (YLOG_LOG, "end context");
#endif
        if (spec->context_stack_top)
            (spec->context_stack_top)--;
        return TCL_OK;
    }
    return TCL_ERROR;
}
1190
/*
 * Tcl command handler for "data".  clientData is the struct lexSpec.
 *
 *   data [-text] [-element <tag>] [-attribute <name>] <string>...
 *
 * Each <string> is added to the record through execData, as attribute
 * value when -attribute is given.  With -element the strings are wrapped
 * in an element of that name.  On Tcl versions after 8.0 the strings are
 * converted with Tcl_UtfToExternalDString first.
 *
 * Always returns TCL_OK.
 */
static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
                         int argc, const char **argv)
{
    struct lexSpec *spec = (struct lexSpec *) clientData;
    const char *element = 0;
    const char *attribute = 0;
    int textFlag = 0;
    int attribute_len;
    int argi;

    /* leading options */
    for (argi = 1; argi < argc; )
    {
        const char *opt = argv[argi];

        if (strcmp (opt, "-text") == 0)
        {
            textFlag = 1;
            argi++;
        }
        else if (strcmp (opt, "-element") == 0)
        {
            if (++argi < argc)
                element = argv[argi++];
        }
        else if (strcmp (opt, "-attribute") == 0)
        {
            if (++argi < argc)
                attribute = argv[argi++];
        }
        else
            break;
    }
    if (element)
        tagBegin (spec, element, strlen(element));

    attribute_len = attribute ? strlen(attribute) : 0;

    /* remaining arguments are the data strings */
    for (; argi < argc; argi++)
    {
#if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
        Tcl_DString ds;
        char *native = Tcl_UtfToExternalDString(0, argv[argi], -1, &ds);
        execData (spec, native, strlen(native), textFlag, attribute,
                  attribute_len);
        Tcl_DStringFree (&ds);
#else
        execData (spec, argv[argi], strlen(argv[argi]), textFlag, attribute,
                  attribute_len);
#endif
    }
    if (element)
        tagEnd (spec, 2, NULL, 0);
    return TCL_OK;
}
1243
/*
 * Tcl command handler for "unread".  clientData is the struct lexSpec.
 *
 *   unread [-offset <n>] <argno>
 *
 * Moves the input position back to the start of match argument <argno>,
 * plus the optional offset.  <argno> is clamped to the range of existing
 * arguments.
 *
 * Returns TCL_OK, or TCL_ERROR when the argument list is malformed or
 * when there are no match arguments to refer to.
 */
static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
                           int argc, const char **argv)
{
    struct lexSpec *spec = (struct lexSpec *) clientData;
    int argi = 1;
    int offset = 0;
    int no;

    while (argi < argc)
    {
        if (!strcmp("-offset", argv[argi]))
        {
            argi++;
            if (argi < argc)
            {
                offset = atoi(argv[argi]);
                argi++;
            }
        }
        else
            break;
    }
    if (argi != argc-1)
        return TCL_ERROR;
    /* without arguments there is no valid index into arg_start */
    if (spec->arg_no <= 0)
        return TCL_ERROR;
    no = atoi(argv[argi]);
    if (no >= spec->arg_no)
        no = spec->arg_no - 1;
    if (no < 0)
        no = 0;
    spec->ptr = spec->arg_start[no] + offset;
    return TCL_OK;
}
1274
execTcl(struct lexSpec * spec,struct regxCode * code)1275 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1276 {
1277 int i;
1278 int ret;
1279 for (i = 0; i < spec->arg_no; i++)
1280 {
1281 char var_name[10], *var_buf;
1282 int var_len, ch;
1283
1284 sprintf (var_name, "%d", i);
1285 var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
1286 &var_len);
1287 if (var_buf)
1288 {
1289 ch = var_buf[var_len];
1290 var_buf[var_len] = '\0';
1291 Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1292 var_buf[var_len] = ch;
1293 }
1294 }
1295 #if HAVE_TCL_OBJECTS
1296 ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1297 #else
1298 ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1299 #endif
1300 if (ret != TCL_OK)
1301 {
1302 const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1303 yaz_log(YLOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1304 #if TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION < 5
1305 spec->tcl_interp->errorLine,
1306 #else
1307 Tcl_GetErrorLine(spec->tcl_interp),
1308 #endif
1309 Tcl_GetStringResult(spec->tcl_interp),
1310 err ? err : "[NO ERRORINFO]");
1311 }
1312 }
1313 /* HAVE_TCL_H */
1314 #endif
1315
execCode(struct lexSpec * spec,struct regxCode * code)1316 static void execCode (struct lexSpec *spec, struct regxCode *code)
1317 {
1318 const char *s = code->str;
1319 int cmd_len, r;
1320 const char *cmd_str;
1321
1322 r = execTok (spec, &s, &cmd_str, &cmd_len);
1323 while (r)
1324 {
1325 char *p, ptmp[64];
1326
1327 if (r == 1)
1328 {
1329 r = execTok (spec, &s, &cmd_str, &cmd_len);
1330 continue;
1331 }
1332 p = regxStrz (cmd_str, cmd_len, ptmp);
1333 if (!strcmp (p, "begin"))
1334 {
1335 r = execTok (spec, &s, &cmd_str, &cmd_len);
1336 if (r < 2)
1337 {
1338 yaz_log (YLOG_WARN, "missing keyword after 'begin'");
1339 continue;
1340 }
1341 p = regxStrz (cmd_str, cmd_len, ptmp);
1342 if (!strcmp (p, "record"))
1343 {
1344 r = execTok (spec, &s, &cmd_str, &cmd_len);
1345 if (r < 2)
1346 continue;
1347 if (spec->d1_level <= 1)
1348 {
1349 static char absynName[64];
1350 data1_node *res;
1351
1352 if (cmd_len > 63)
1353 cmd_len = 63;
1354 memcpy (absynName, cmd_str, cmd_len);
1355 absynName[cmd_len] = '\0';
1356 #if REGX_DEBUG
1357 yaz_log (YLOG_LOG, "begin record %s", absynName);
1358 #endif
1359 res = data1_mk_root (spec->dh, spec->m, absynName);
1360
1361 spec->d1_level = 0;
1362
1363 spec->d1_stack[spec->d1_level++] = res;
1364
1365 res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1366
1367 spec->d1_stack[spec->d1_level++] = res;
1368
1369 spec->d1_stack[spec->d1_level] = NULL;
1370 }
1371 r = execTok (spec, &s, &cmd_str, &cmd_len);
1372 }
1373 else if (!strcmp (p, "element"))
1374 {
1375 r = execTok (spec, &s, &cmd_str, &cmd_len);
1376 if (r < 2)
1377 continue;
1378 tagBegin (spec, cmd_str, cmd_len);
1379 r = execTok (spec, &s, &cmd_str, &cmd_len);
1380 }
1381 else if (!strcmp (p, "variant"))
1382 {
1383 int class_len;
1384 const char *class_str = NULL;
1385 int type_len;
1386 const char *type_str = NULL;
1387 int value_len;
1388 const char *value_str = NULL;
1389 r = execTok (spec, &s, &cmd_str, &cmd_len);
1390 if (r < 2)
1391 continue;
1392 class_str = cmd_str;
1393 class_len = cmd_len;
1394 r = execTok (spec, &s, &cmd_str, &cmd_len);
1395 if (r < 2)
1396 continue;
1397 type_str = cmd_str;
1398 type_len = cmd_len;
1399
1400 r = execTok (spec, &s, &cmd_str, &cmd_len);
1401 if (r < 2)
1402 continue;
1403 value_str = cmd_str;
1404 value_len = cmd_len;
1405
1406 variantBegin (spec, class_str, class_len,
1407 type_str, type_len, value_str, value_len);
1408
1409
1410 r = execTok (spec, &s, &cmd_str, &cmd_len);
1411 }
1412 else if (!strcmp (p, "context"))
1413 {
1414 if (r > 1)
1415 {
1416 struct lexContext *lc = spec->context;
1417 r = execTok (spec, &s, &cmd_str, &cmd_len);
1418 p = regxStrz (cmd_str, cmd_len, ptmp);
1419 #if REGX_DEBUG
1420 yaz_log (YLOG_LOG, "begin context %s", p);
1421 #endif
1422 while (lc && strcmp (p, lc->name))
1423 lc = lc->next;
1424 if (lc)
1425 spec->context_stack[++(spec->context_stack_top)] = lc;
1426 else
1427 yaz_log (YLOG_WARN, "unknown context %s", p);
1428
1429 }
1430 r = execTok (spec, &s, &cmd_str, &cmd_len);
1431 }
1432 else
1433 {
1434 yaz_log (YLOG_WARN, "bad keyword '%s' after begin", p);
1435 }
1436 }
1437 else if (!strcmp (p, "end"))
1438 {
1439 r = execTok (spec, &s, &cmd_str, &cmd_len);
1440 if (r < 2)
1441 {
1442 yaz_log (YLOG_WARN, "missing keyword after 'end'");
1443 continue;
1444 }
1445 p = regxStrz (cmd_str, cmd_len, ptmp);
1446 if (!strcmp (p, "record"))
1447 {
1448 while (spec->d1_level)
1449 {
1450 tagDataRelease (spec);
1451 (spec->d1_level)--;
1452 }
1453 r = execTok (spec, &s, &cmd_str, &cmd_len);
1454 #if REGX_DEBUG
1455 yaz_log (YLOG_LOG, "end record");
1456 #endif
1457 spec->stop_flag = 1;
1458 }
1459 else if (!strcmp (p, "element"))
1460 {
1461 int min_level = 2;
1462 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1463 {
1464 if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1465 min_level = 0;
1466 }
1467 if (r > 2)
1468 {
1469 tagEnd (spec, min_level, cmd_str, cmd_len);
1470 r = execTok (spec, &s, &cmd_str, &cmd_len);
1471 }
1472 else
1473 tagEnd (spec, min_level, NULL, 0);
1474 if (spec->d1_level <= 1)
1475 {
1476 #if REGX_DEBUG
1477 yaz_log (YLOG_LOG, "end element end records");
1478 #endif
1479 spec->stop_flag = 1;
1480 }
1481
1482 }
1483 else if (!strcmp (p, "context"))
1484 {
1485 #if REGX_DEBUG
1486 yaz_log (YLOG_LOG, "end context");
1487 #endif
1488 if (spec->context_stack_top)
1489 (spec->context_stack_top)--;
1490 r = execTok (spec, &s, &cmd_str, &cmd_len);
1491 }
1492 else
1493 yaz_log (YLOG_WARN, "bad keyword '%s' after end", p);
1494 }
1495 else if (!strcmp (p, "data"))
1496 {
1497 int textFlag = 0;
1498 int element_len;
1499 const char *element_str = NULL;
1500 int attribute_len;
1501 const char *attribute_str = NULL;
1502
1503 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1504 {
1505 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1506 textFlag = 1;
1507 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1508 {
1509 r = execTok (spec, &s, &element_str, &element_len);
1510 if (r < 2)
1511 break;
1512 }
1513 else if (cmd_len==10 && !memcmp ("-attribute", cmd_str,
1514 cmd_len))
1515 {
1516 r = execTok (spec, &s, &attribute_str, &attribute_len);
1517 if (r < 2)
1518 break;
1519 }
1520 else
1521 yaz_log (YLOG_WARN, "bad data option: %.*s",
1522 cmd_len, cmd_str);
1523 }
1524 if (r != 2)
1525 {
1526 yaz_log (YLOG_WARN, "missing data item after data");
1527 continue;
1528 }
1529 if (element_str)
1530 tagBegin (spec, element_str, element_len);
1531 do
1532 {
1533 execData (spec, cmd_str, cmd_len, textFlag,
1534 attribute_str, attribute_len);
1535 r = execTok (spec, &s, &cmd_str, &cmd_len);
1536 } while (r > 1);
1537 if (element_str)
1538 tagEnd (spec, 2, NULL, 0);
1539 }
1540 else if (!strcmp (p, "unread"))
1541 {
1542 int no, offset;
1543 r = execTok (spec, &s, &cmd_str, &cmd_len);
1544 if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1545 {
1546 r = execTok (spec, &s, &cmd_str, &cmd_len);
1547 if (r < 2)
1548 {
1549 yaz_log (YLOG_WARN, "missing number after -offset");
1550 continue;
1551 }
1552 p = regxStrz (cmd_str, cmd_len, ptmp);
1553 offset = atoi (p);
1554 r = execTok (spec, &s, &cmd_str, &cmd_len);
1555 }
1556 else
1557 offset = 0;
1558 if (r < 2)
1559 {
1560 yaz_log (YLOG_WARN, "missing index after unread command");
1561 continue;
1562 }
1563 if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1564 {
1565 yaz_log (YLOG_WARN, "bad index after unread command");
1566 continue;
1567 }
1568 else
1569 {
1570 no = *cmd_str - '0';
1571 if (no >= spec->arg_no)
1572 no = spec->arg_no - 1;
1573 spec->ptr = spec->arg_start[no] + offset;
1574 }
1575 r = execTok (spec, &s, &cmd_str, &cmd_len);
1576 }
1577 else if (!strcmp (p, "context"))
1578 {
1579 if (r > 1)
1580 {
1581 struct lexContext *lc = spec->context;
1582 r = execTok (spec, &s, &cmd_str, &cmd_len);
1583 p = regxStrz (cmd_str, cmd_len, ptmp);
1584
1585 while (lc && strcmp (p, lc->name))
1586 lc = lc->next;
1587 if (lc)
1588 spec->context_stack[spec->context_stack_top] = lc;
1589 else
1590 yaz_log (YLOG_WARN, "unknown context %s", p);
1591
1592 }
1593 r = execTok (spec, &s, &cmd_str, &cmd_len);
1594 }
1595 else
1596 {
1597 yaz_log (YLOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1598 r = execTok (spec, &s, &cmd_str, &cmd_len);
1599 continue;
1600 }
1601 if (r > 1)
1602 {
1603 yaz_log (YLOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1604 do {
1605 r = execTok (spec, &s, &cmd_str, &cmd_len);
1606 } while (r > 1);
1607 }
1608 }
1609 }
1610
1611
execAction(struct lexSpec * spec,struct lexRuleAction * ap,int start_ptr,int * pptr)1612 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1613 int start_ptr, int *pptr)
1614 {
1615 int sptr;
1616 int arg_start[20];
1617 int arg_end[20];
1618 int arg_no = 1;
1619
1620 if (!ap)
1621 return 1;
1622 arg_start[0] = start_ptr;
1623 arg_end[0] = *pptr;
1624 spec->arg_start = arg_start;
1625 spec->arg_end = arg_end;
1626
1627 while (ap)
1628 {
1629 switch (ap->which)
1630 {
1631 case REGX_PATTERN:
1632 if (ap->u.pattern.body)
1633 {
1634 arg_start[arg_no] = *pptr;
1635 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 0))
1636 {
1637 arg_end[arg_no] = F_WIN_EOF;
1638 arg_no++;
1639 arg_start[arg_no] = F_WIN_EOF;
1640 arg_end[arg_no] = F_WIN_EOF;
1641 yaz_log(YLOG_DEBUG, "Pattern match rest of record");
1642 *pptr = F_WIN_EOF;
1643 }
1644 else
1645 {
1646 arg_end[arg_no] = sptr;
1647 arg_no++;
1648 arg_start[arg_no] = sptr;
1649 arg_end[arg_no] = *pptr;
1650 }
1651 }
1652 else
1653 {
1654 arg_start[arg_no] = *pptr;
1655 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 1))
1656 return 1;
1657 if (sptr != arg_start[arg_no])
1658 return 1;
1659 arg_end[arg_no] = *pptr;
1660 }
1661 arg_no++;
1662 break;
1663 case REGX_CODE:
1664 spec->arg_no = arg_no;
1665 spec->ptr = *pptr;
1666 #if HAVE_TCL_H
1667 if (spec->tcl_interp)
1668 execTcl(spec, ap->u.code);
1669 else
1670 execCode (spec, ap->u.code);
1671 #else
1672 execCode (spec, ap->u.code);
1673 #endif
1674 *pptr = spec->ptr;
1675 if (spec->stop_flag)
1676 return 0;
1677 break;
1678 case REGX_END:
1679 arg_start[arg_no] = *pptr;
1680 arg_end[arg_no] = F_WIN_EOF;
1681 arg_no++;
1682 *pptr = F_WIN_EOF;
1683 }
1684 ap = ap->next;
1685 }
1686 return 1;
1687 }
1688
execRule(struct lexSpec * spec,struct lexContext * context,int ruleNo,int start_ptr,int * pptr)1689 static int execRule (struct lexSpec *spec, struct lexContext *context,
1690 int ruleNo, int start_ptr, int *pptr)
1691 {
1692 #if REGX_DEBUG
1693 yaz_log (YLOG_LOG, "exec rule %d", ruleNo);
1694 #endif
1695 return execAction (spec, context->fastRule[ruleNo]->actionList,
1696 start_ptr, pptr);
1697 }
1698
/*
 * lexNode: scan the input from *ptr using the DFA of the context on top
 * of the context stack, executing a rule whenever a match ends.
 *
 * The longest match seen so far is remembered in last_rule/last_ptr.
 * When no further transition is possible (or end of input is reached)
 * the text between skip_ptr and the start of the match is handed to
 * execDataP, the read position is reset to the end of the match and the
 * rule is executed. If no rule matched, scanning restarts one character
 * after start_ptr.
 *
 * c_prev holds the character before the current match start; a state's
 * rule_no is used when it is '\n' (it starts out as '\n'), otherwise
 * rule_nno is used if non-zero.
 *
 * Returns 1 if a state with a rule was reached at least once, else 0.
 * The function returns when a rule's execution reports stop (execRule
 * returns 0) or when end of input is reached.
 */
int lexNode (struct lexSpec *spec, int *ptr)
{
    struct lexContext *context = spec->context_stack[spec->context_stack_top];
    struct DFA_state *state = context->dfa->states[0];
    struct DFA_tran *t;
    unsigned char c;
    unsigned char c_prev = '\n';
    int i;
    int last_rule = 0;        /* rule number of current match */
    int last_ptr = *ptr;      /* last char of match */
    int start_ptr = *ptr;     /* first char of match */
    int skip_ptr = *ptr;      /* first char of run */
    int more = 0;             /* set once any rule state has been reached */

    while (1)
    {
        c = f_win_advance (spec, ptr);
        if (*ptr == F_WIN_EOF)
        {
            /* end of file met */
            if (last_rule)
            {
                /* there was a match */
                if (skip_ptr < start_ptr)
                {
                    /* deal with chars that didn't match */
                    int size;
                    char *buf;
                    buf = f_win_get (spec, skip_ptr, start_ptr, &size);
                    execDataP (spec, buf, size, 0);
                }
                /* restore pointer */
                *ptr = last_ptr;
                /* execute rule */
                if (!execRule (spec, context, last_rule, start_ptr, ptr))
                    return more;
                /* restore skip pointer */
                skip_ptr = *ptr;
                last_rule = 0;
                /* NOTE(review): unlike the no-transition branch below,
                   context is not re-read from the context stack here
                   after the rule has run - confirm this is intended */
            }
            else if (skip_ptr < *ptr)
            {
                /* deal with chars that didn't match */
                int size;
                char *buf;
                buf = f_win_get (spec, skip_ptr, *ptr, &size);
                execDataP (spec, buf, size, 0);
            }
            state = context->dfa->states[0];
            /* still at end of input after any rule ran: finished */
            if (*ptr == F_WIN_EOF)
                return more;
        }
        /* search the transitions of the current state for character c */
        t = state->trans;
        i = state->tran_no;
        while (1)
            if (--i < 0)
            {   /* no transition for character c ... */
                if (last_rule)
                {
                    if (skip_ptr < start_ptr)
                    {
                        /* deal with chars that didn't match */
                        int size;
                        char *buf;
                        buf = f_win_get (spec, skip_ptr, start_ptr, &size);
                        execDataP (spec, buf, size, 0);
                    }
                    /* restore pointer */
                    *ptr = last_ptr;
                    if (!execRule (spec, context, last_rule, start_ptr, ptr))
                    {
                        /* stopping: report the end position through the
                           end-of-record callback, if one is set */
                        if (spec->f_win_ef && *ptr != F_WIN_EOF)
                        {
                            off_t end_offset = *ptr;
#if REGX_DEBUG
                            yaz_log (YLOG_LOG, "regx: endf ptr=%d", *ptr);
#endif
                            (*spec->f_win_ef)(spec->stream, &end_offset);
                        }
                        return more;
                    }
                    /* the rule may have switched context */
                    context = spec->context_stack[spec->context_stack_top];
                    skip_ptr = *ptr;
                    last_rule = 0;
                    last_ptr = start_ptr = *ptr;
                    /* re-read the character before the new start so
                       that c_prev is correct for the next match */
                    if (start_ptr > 0)
                    {
                        --start_ptr;
                        c_prev = f_win_advance (spec, &start_ptr);
                    }
                }
                else
                {
                    /* no match at all: retry one character further on */
                    c_prev = f_win_advance (spec, &start_ptr);
                    *ptr = start_ptr;
                }
                state = context->dfa->states[0];
                break;
            }
            else if (c >= t->ch[0] && c <= t->ch[1])
            {   /* transition ... */
                state = context->dfa->states[t->to];
                if (state->rule_no)
                {
                    /* remember this match; a longer one may follow */
                    if (c_prev == '\n')
                    {
                        last_rule = state->rule_no;
                        last_ptr = *ptr;
                    }
                    else if (state->rule_nno)
                    {
                        last_rule = state->rule_nno;
                        last_ptr = *ptr;
                    }
                    more = 1;
                }
                break;
            }
            else
                t++;
    }
    return more;
}
1822
lexRoot(struct lexSpec * spec,off_t offset,const char * context_name)1823 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1824 const char *context_name)
1825 {
1826 struct lexContext *lt = spec->context;
1827 int ptr = offset;
1828 int ret;
1829
1830 spec->stop_flag = 0;
1831 spec->d1_level = 0;
1832 spec->context_stack_top = 0;
1833 while (lt)
1834 {
1835 if (!strcmp (lt->name, context_name))
1836 break;
1837 lt = lt->next;
1838 }
1839 if (!lt)
1840 {
1841 yaz_log (YLOG_WARN, "cannot find context %s", context_name);
1842 return NULL;
1843 }
1844 spec->context_stack[spec->context_stack_top] = lt;
1845 spec->d1_stack[spec->d1_level] = NULL;
1846 #if 1
1847 if (!lt->initFlag)
1848 {
1849 lt->initFlag = 1;
1850 execAction (spec, lt->initActionList, ptr, &ptr);
1851 }
1852 #endif
1853 execAction (spec, lt->beginActionList, ptr, &ptr);
1854
1855 ret = lexNode (spec, &ptr);
1856 while (spec->d1_level)
1857 {
1858 tagDataRelease (spec);
1859 (spec->d1_level)--;
1860 }
1861 if (!ret)
1862 return 0;
1863 execAction (spec, lt->endActionList, ptr, &ptr);
1864 return spec->d1_stack[0];
1865 }
1866
grs_destroy(void * clientData)1867 void grs_destroy(void *clientData)
1868 {
1869 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1870 if (specs->spec)
1871 {
1872 lexSpecDestroy(&specs->spec);
1873 }
1874 xfree (specs);
1875 }
1876
/*
 * grs_init: allocate a filter handle with no lexer specification
 * loaded and an empty type name. res and recType are not used.
 */
void *grs_init(Res res, RecType recType)
{
    struct lexSpecs *handle = (struct lexSpecs *) xmalloc (sizeof(*handle));

    handle->type[0] = '\0';
    handle->spec = NULL;
    return handle;
}
1884
1885
grs_config(void * clientData,Res res,const char * args)1886 ZEBRA_RES grs_config(void *clientData, Res res, const char *args)
1887 {
1888 struct lexSpecs *specs = (struct lexSpecs *) clientData;
1889 if (strlen(args) < sizeof(specs->type))
1890 strcpy(specs->type, args);
1891 return ZEBRA_OK;
1892 }
1893
grs_read_regx(struct grs_read_info * p)1894 data1_node *grs_read_regx (struct grs_read_info *p)
1895 {
1896 int res;
1897 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1898 struct lexSpec **curLexSpec = &specs->spec;
1899 off_t start_offset;
1900
1901 #if REGX_DEBUG
1902 yaz_log (YLOG_LOG, "grs_read_regx");
1903 #endif
1904 if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type))
1905 {
1906 if (*curLexSpec)
1907 lexSpecDestroy (curLexSpec);
1908 *curLexSpec = lexSpecCreate (specs->type, p->dh);
1909 res = readFileSpec (*curLexSpec);
1910 if (res)
1911 {
1912 lexSpecDestroy (curLexSpec);
1913 return NULL;
1914 }
1915 }
1916 (*curLexSpec)->dh = p->dh;
1917 start_offset = p->stream->tellf(p->stream);
1918 if (start_offset == 0)
1919 {
1920 (*curLexSpec)->f_win_start = 0;
1921 (*curLexSpec)->f_win_end = 0;
1922 (*curLexSpec)->f_win_rf = p->stream->readf;
1923 (*curLexSpec)->f_win_sf = p->stream->seekf;
1924 (*curLexSpec)->stream = p->stream;
1925 (*curLexSpec)->f_win_ef = p->stream->endf;
1926 (*curLexSpec)->f_win_size = 500000;
1927 }
1928 (*curLexSpec)->m = p->mem;
1929 return lexRoot (*curLexSpec, start_offset, "main");
1930 }
1931
extract_regx(void * clientData,struct recExtractCtrl * ctrl)1932 static int extract_regx(void *clientData, struct recExtractCtrl *ctrl)
1933 {
1934 return zebra_grs_extract(clientData, ctrl, grs_read_regx);
1935 }
1936
retrieve_regx(void * clientData,struct recRetrieveCtrl * ctrl)1937 static int retrieve_regx(void *clientData, struct recRetrieveCtrl *ctrl)
1938 {
1939 return zebra_grs_retrieve(clientData, ctrl, grs_read_regx);
1940 }
1941
/* Record type descriptor for the "grs.regx" filter. The initializer is
   positional, so the order below must match struct recType. */
static struct recType regx_type = {
    0,              /* NOTE(review): meaning of first member not visible here */
    "grs.regx",     /* filter name */
    grs_init,       /* create handle */
    grs_config,     /* set filter argument */
    grs_destroy,    /* free handle */
    extract_regx,   /* extract handler */
    retrieve_regx,  /* retrieve handler */
};
1951
1952
1953 #if HAVE_TCL_H
grs_read_tcl(struct grs_read_info * p)1954 data1_node *grs_read_tcl (struct grs_read_info *p)
1955 {
1956 int res;
1957 struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1958 struct lexSpec **curLexSpec = &specs->spec;
1959 off_t start_offset;
1960
1961 #if REGX_DEBUG
1962 yaz_log (YLOG_LOG, "grs_read_tcl");
1963 #endif
1964 if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type))
1965 {
1966 Tcl_Interp *tcl_interp;
1967 if (*curLexSpec)
1968 lexSpecDestroy (curLexSpec);
1969 *curLexSpec = lexSpecCreate (specs->type, p->dh);
1970 Tcl_FindExecutable("");
1971 tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
1972 Tcl_Init(tcl_interp);
1973 Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
1974 Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
1975 Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
1976 Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
1977 *curLexSpec, 0);
1978 res = readFileSpec (*curLexSpec);
1979 if (res)
1980 {
1981 lexSpecDestroy (curLexSpec);
1982 return NULL;
1983 }
1984 }
1985 (*curLexSpec)->dh = p->dh;
1986 start_offset = p->stream->tellf(p->stream);
1987 if (start_offset == 0)
1988 {
1989 (*curLexSpec)->f_win_start = 0;
1990 (*curLexSpec)->f_win_end = 0;
1991 (*curLexSpec)->f_win_rf = p->stream->readf;
1992 (*curLexSpec)->f_win_sf = p->stream->seekf;
1993 (*curLexSpec)->stream = p->stream;
1994 (*curLexSpec)->f_win_ef = p->stream->endf;
1995 (*curLexSpec)->f_win_size = 500000;
1996 }
1997 (*curLexSpec)->m = p->mem;
1998 return lexRoot (*curLexSpec, start_offset, "main");
1999 }
2000
extract_tcl(void * clientData,struct recExtractCtrl * ctrl)2001 static int extract_tcl(void *clientData, struct recExtractCtrl *ctrl)
2002 {
2003 return zebra_grs_extract(clientData, ctrl, grs_read_tcl);
2004 }
2005
retrieve_tcl(void * clientData,struct recRetrieveCtrl * ctrl)2006 static int retrieve_tcl(void *clientData, struct recRetrieveCtrl *ctrl)
2007 {
2008 return zebra_grs_retrieve(clientData, ctrl, grs_read_tcl);
2009 }
2010
/* Record type descriptor for the "grs.tcl" filter. The initializer is
   positional, so the order below must match struct recType. */
static struct recType tcl_type = {
    0,              /* NOTE(review): meaning of first member not visible here */
    "grs.tcl",      /* filter name */
    grs_init,       /* create handle */
    grs_config,     /* set filter argument */
    grs_destroy,    /* free handle */
    extract_tcl,    /* extract handler */
    retrieve_tcl,   /* retrieve handler */
};
2020
2021 #endif
2022
2023 RecType
2024 #if IDZEBRA_STATIC_GRS_REGX
2025 idzebra_filter_grs_regx
2026 #else
2027 idzebra_filter
2028 #endif
2029
2030 [] = {
2031 ®x_type,
2032 #if HAVE_TCL_H
2033 &tcl_type,
2034 #endif
2035 0,
2036 };
2037 /*
2038 * Local variables:
2039 * c-basic-offset: 4
2040 * c-file-style: "Stroustrup"
2041 * indent-tabs-mode: nil
2042 * End:
2043 * vim: shiftwidth=4 tabstop=8 expandtab
2044 */
2045
2046