1 /* This file is part of the Zebra server.
2    Copyright (C) 2004-2013 Index Data
3 
4 Zebra is free software; you can redistribute it and/or modify it under
5 the terms of the GNU General Public License as published by the Free
6 Software Foundation; either version 2, or (at your option) any later
7 version.
8 
9 Zebra is distributed in the hope that it will be useful, but WITHOUT ANY
10 WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
12 for more details.
13 
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 
18 */
19 
20 #if HAVE_CONFIG_H
21 #include <config.h>
22 #endif
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <assert.h>
26 #include <string.h>
27 #include <ctype.h>
28 
29 #include <yaz/tpath.h>
30 #include <idzebra/util.h>
31 #include <dfa.h>
32 #include <idzebra/recgrs.h>
33 
34 #if HAVE_TCL_H
35 #include <tcl.h>
36 
37 #if MAJOR_VERSION >= 8
38 #define HAVE_TCL_OBJECTS
39 #endif
40 #endif
41 
42 #define REGX_DEBUG 0
43 
44 #define F_WIN_EOF 2000000000
45 #define F_WIN_READ 1
46 
47 #define REGX_EOF     0
48 #define REGX_PATTERN 1
49 #define REGX_BODY    2
50 #define REGX_BEGIN   3
51 #define REGX_END     4
52 #define REGX_CODE    5
53 #define REGX_CONTEXT 6
54 #define REGX_INIT    7
55 
/* A piece of action code taken from a filter specification. */
struct regxCode {
    char *str;              /* NUL-terminated private copy of the code text */
#if HAVE_TCL_OBJECTS
    Tcl_Obj *tcl_obj;       /* Tcl string object built from the same text;
                               regxCodeMk takes a reference on it and
                               regxCodeDel drops that reference */
#endif
};
62 
/*
 * One element of a singly linked action list.  'which' holds one of the
 * REGX_* codes and tells which member of the union 'u' (if any) is valid.
 */
struct lexRuleAction {
    int which;                  /* REGX_PATTERN, REGX_CODE or REGX_END */
    union {
        struct {
            struct DFA *dfa;    /* REGX_PATTERN */
            int body;           /* 1 if the pattern was preceded by the
                                   "body" keyword, otherwise 0 */
        } pattern;
        struct regxCode *code;  /* REGX_CODE */
    } u;
    struct lexRuleAction *next; /* next action; NULL ends the list */
};
74 
/* The payload of a rule: its number and the actions to run for it. */
struct lexRuleInfo {
    int no;                             /* rule number, assigned from the
                                           owning context's ruleNo counter */
    struct lexRuleAction *actionList;   /* actions built by actionListMk */
};
79 
/* List node wrapping a lexRuleInfo; rules are chained per context. */
struct lexRule {
    struct lexRuleInfo info;
    struct lexRule *next;       /* next rule; new rules are pushed at the head */
};
84 
/*
 * A named set of rules.  Contexts are kept in a singly linked list hanging
 * off lexSpec.context; a context named "main" is created on demand when a
 * rule is read before any CONTEXT declaration.
 */
struct lexContext {
    char *name;                 /* heap copy of the context name */
    struct DFA *dfa;            /* DFA into which every rule pattern of this
                                   context is parsed */
    struct lexRule *rules;      /* list of rules, most recently read first */
    struct lexRuleInfo **fastRule; /* array of ruleNo entries indexed by rule
                                      number; filled in by readFileSpec */
    int ruleNo;                 /* next rule number to hand out; starts at 1 */
    int initFlag;               /* cleared on creation; its use is not visible
                                   in this part of the file */

    struct lexRuleAction *beginActionList;  /* actions from a BEGIN line */
    struct lexRuleAction *endActionList;    /* actions from an END line */
    struct lexRuleAction *initActionList;   /* actions from an INIT line */
    struct lexContext *next;    /* next context in the list */
};
98 
/* Growable scratch buffer in which text data for one nesting level is
   accumulated (one entry per level, see lexSpec.concatBuf). */
struct lexConcatBuf {
    int max;        /* number of bytes allocated for buf; 0 when buf is unset */
    char *buf;      /* heap buffer, or NULL before first use */
};
103 
/*
 * Complete state for one filter specification: the parsed contexts, the
 * sliding window over the input stream, and the stack of data1 nodes that
 * is being built.
 */
struct lexSpec {
    char *name;                 /* heap copy of the spec name; readFileSpec
                                   appends ".flt" / ".tflt" to locate the file */
    struct lexContext *context; /* head of the list of contexts */

    struct lexContext **context_stack; /* array of context_stack_size entries */
    int context_stack_size;
    int context_stack_top;      /* index of the current top entry */

    int lineNo;                 /* line counter maintained by readFileSpec */
    NMEM m;                     /* memory handle used for data1 allocations */
    data1_handle dh;
#if HAVE_TCL_H
    Tcl_Interp *tcl_interp;     /* NULL unless a Tcl interpreter is attached */
#endif
    struct ZebraRecStream *stream; /* passed to the f_win_* callbacks */
    off_t (*f_win_ef)(struct ZebraRecStream *s, off_t *);
                                /* NOTE(review): purpose not visible in this
                                   part of the file */

    int f_win_start;      /* first byte of buffer is this file offset */
    int f_win_end;        /* last byte of buffer is this offset - 1 */
    int f_win_size;       /* size of buffer */
    char *f_win_buf;      /* buffer itself */
    int (*f_win_rf)(struct ZebraRecStream *, char *, size_t);
                          /* read callback: fills the buffer, returns count */
    off_t (*f_win_sf)(struct ZebraRecStream *, off_t);
                          /* seek callback: called with the wanted offset */

    struct lexConcatBuf *concatBuf; /* maxLevel entries, indexed by d1_level */
    int maxLevel;               /* capacity of concatBuf and d1_stack */
    data1_node **d1_stack;      /* maxLevel entries; d1_stack[d1_level] is the
                                   most recent node at the current level or
                                   NULL */
    int d1_level;               /* current nesting level; 0 = no record yet */
    int stop_flag;

    int *arg_start;             /* start offsets of $0, $1, ... (see execTok) */
    int *arg_end;               /* matching end offsets */
    int arg_no;                 /* number of valid arg_start/arg_end entries */
    int ptr;
};
139 
/* Holder for a lexSpec together with a type name.
   NOTE(review): how 'type' is filled in is not visible in this part of the
   file. */
struct lexSpecs {
    struct lexSpec *spec;
    char type[256];
};
144 
f_win_get(struct lexSpec * spec,off_t start_pos,off_t end_pos,int * size)145 static char *f_win_get (struct lexSpec *spec, off_t start_pos, off_t end_pos,
146                         int *size)
147 {
148     int i, r, off = start_pos - spec->f_win_start;
149 
150     if (off >= 0 && end_pos <= spec->f_win_end)
151     {
152         *size = end_pos - start_pos;
153         return spec->f_win_buf + off;
154     }
155     if (off < 0 || start_pos >= spec->f_win_end)
156     {
157         (*spec->f_win_sf)(spec->stream, start_pos);
158         spec->f_win_start = start_pos;
159 
160         if (!spec->f_win_buf)
161             spec->f_win_buf = (char *) xmalloc (spec->f_win_size);
162         *size = (*spec->f_win_rf)(spec->stream, spec->f_win_buf,
163                                   spec->f_win_size);
164         spec->f_win_end = spec->f_win_start + *size;
165 
166         if (*size > end_pos - start_pos)
167             *size = end_pos - start_pos;
168         return spec->f_win_buf;
169     }
170     for (i = 0; i<spec->f_win_end - start_pos; i++)
171         spec->f_win_buf[i] = spec->f_win_buf[i + off];
172     r = (*spec->f_win_rf)(spec->stream,
173                           spec->f_win_buf + i,
174                           spec->f_win_size - i);
175     spec->f_win_start = start_pos;
176     spec->f_win_end += r;
177     *size = i + r;
178     if (*size > end_pos - start_pos)
179         *size = end_pos - start_pos;
180     return spec->f_win_buf;
181 }
182 
f_win_advance(struct lexSpec * spec,int * pos)183 static int f_win_advance (struct lexSpec *spec, int *pos)
184 {
185     int size;
186     char *buf;
187 
188     if (*pos >= spec->f_win_start && *pos < spec->f_win_end)
189         return spec->f_win_buf[(*pos)++ - spec->f_win_start];
190     if (*pos == F_WIN_EOF)
191         return 0;
192     buf = f_win_get (spec, *pos, *pos+1, &size);
193     if (size == 1)
194     {
195         (*pos)++;
196         return *buf;
197     }
198     *pos = F_WIN_EOF;
199     return 0;
200 }
201 
regxCodeDel(struct regxCode ** pp)202 static void regxCodeDel (struct regxCode **pp)
203 {
204     struct regxCode *p = *pp;
205     if (p)
206     {
207 #if HAVE_TCL_OBJECTS
208 	if (p->tcl_obj)
209 	    Tcl_DecrRefCount (p->tcl_obj);
210 #endif
211         xfree (p->str);
212         xfree (p);
213         *pp = NULL;
214     }
215 }
216 
regxCodeMk(struct regxCode ** pp,const char * buf,int len)217 static void regxCodeMk (struct regxCode **pp, const char *buf, int len)
218 {
219     struct regxCode *p;
220 
221     p = (struct regxCode *) xmalloc (sizeof(*p));
222     p->str = (char *) xmalloc (len+1);
223     memcpy (p->str, buf, len);
224     p->str[len] = '\0';
225 #if HAVE_TCL_OBJECTS
226     p->tcl_obj = Tcl_NewStringObj ((char *) buf, len);
227     if (p->tcl_obj)
228 	Tcl_IncrRefCount (p->tcl_obj);
229 #endif
230     *pp = p;
231 }
232 
/*
 * Create a DFA handle configured for filter patterns: the character-map
 * entries for blank and tab are removed and an entry for '/' mapping to 0
 * is added (patterns are delimited by '/').
 */
static struct DFA *lexSpecDFA (void)
{
    static const char unmapped[] = { ' ', '\t' };
    struct DFA *res = dfa_init ();
    size_t k;

    for (k = 0; k < sizeof(unmapped); k++)
        dfa_parse_cmap_del (res, unmapped[k]);
    dfa_parse_cmap_add (res, '/', 0);
    return res;
}
243 
actionListDel(struct lexRuleAction ** rap)244 static void actionListDel (struct lexRuleAction **rap)
245 {
246     struct lexRuleAction *ra1, *ra;
247 
248     for (ra = *rap; ra; ra = ra1)
249     {
250         ra1 = ra->next;
251         switch (ra->which)
252         {
253         case REGX_PATTERN:
254             dfa_delete (&ra->u.pattern.dfa);
255             break;
256         case REGX_CODE:
257             regxCodeDel (&ra->u.code);
258             break;
259         }
260         xfree (ra);
261     }
262     *rap = NULL;
263 }
264 
lexContextCreate(const char * name)265 static struct lexContext *lexContextCreate (const char *name)
266 {
267     struct lexContext *p = (struct lexContext *) xmalloc (sizeof(*p));
268 
269     p->name = xstrdup (name);
270     p->ruleNo = 1;
271     p->initFlag = 0;
272     p->dfa = lexSpecDFA ();
273     p->rules = NULL;
274     p->fastRule = NULL;
275     p->beginActionList = NULL;
276     p->endActionList = NULL;
277     p->initActionList = NULL;
278     p->next = NULL;
279     return p;
280 }
281 
lexContextDestroy(struct lexContext * p)282 static void lexContextDestroy (struct lexContext *p)
283 {
284     struct lexRule *rp, *rp1;
285 
286     dfa_delete (&p->dfa);
287     xfree (p->fastRule);
288     for (rp = p->rules; rp; rp = rp1)
289     {
290 	rp1 = rp->next;
291         actionListDel (&rp->info.actionList);
292         xfree (rp);
293     }
294     actionListDel (&p->beginActionList);
295     actionListDel (&p->endActionList);
296     actionListDel (&p->initActionList);
297     xfree (p->name);
298     xfree (p);
299 }
300 
/*
 * Allocate a filter specification called 'name' bound to data1 handle 'dh'.
 *
 * Set up here: the name copy, an empty context list, a context stack of
 * 100 entries, an unallocated window buffer, and - for 128 nesting levels -
 * the concat buffers (all empty) and the data1 node stack, with the current
 * level at 0.  The remaining members are not touched by this function.
 * The result is released with lexSpecDestroy.
 */
static struct lexSpec *lexSpecCreate (const char *name, data1_handle dh)
{
    struct lexSpec *spec = (struct lexSpec *) xmalloc (sizeof(*spec));
    size_t name_bytes = strlen(name) + 1;
    struct lexConcatBuf *slot;

    spec->name = (char *) xmalloc (name_bytes);
    memcpy (spec->name, name, name_bytes);

#if HAVE_TCL_H
    spec->tcl_interp = 0;
#endif
    spec->dh = dh;
    spec->context = NULL;
    spec->f_win_buf = NULL;

    spec->context_stack_size = 100;
    spec->context_stack = (struct lexContext **)
        xmalloc (sizeof(*spec->context_stack) * spec->context_stack_size);

    spec->maxLevel = 128;
    spec->concatBuf = (struct lexConcatBuf *)
        xmalloc (sizeof(*spec->concatBuf) * spec->maxLevel);
    for (slot = spec->concatBuf; slot < spec->concatBuf + spec->maxLevel;
         slot++)
    {
        slot->max = 0;
        slot->buf = 0;
    }
    spec->d1_stack = (data1_node **)
        xmalloc (sizeof(*spec->d1_stack) * spec->maxLevel);
    spec->d1_level = 0;
    return spec;
}
332 
lexSpecDestroy(struct lexSpec ** pp)333 static void lexSpecDestroy (struct lexSpec **pp)
334 {
335     struct lexSpec *p;
336     struct lexContext *lt;
337     int i;
338 
339     assert (pp);
340     p = *pp;
341     if (!p)
342         return ;
343 
344     for (i = 0; i < p->maxLevel; i++)
345 	xfree (p->concatBuf[i].buf);
346     xfree (p->concatBuf);
347 
348     lt = p->context;
349     while (lt)
350     {
351 	struct lexContext *lt_next = lt->next;
352 	lexContextDestroy (lt);
353 	lt = lt_next;
354     }
355 #if HAVE_TCL_OBJECTS
356     if (p->tcl_interp)
357 	Tcl_DeleteInterp (p->tcl_interp);
358 #endif
359     xfree (p->name);
360     xfree (p->f_win_buf);
361     xfree (p->context_stack);
362     xfree (p->d1_stack);
363     xfree (p);
364     *pp = NULL;
365 }
366 
readParseToken(const char ** cpp,int * len)367 static int readParseToken (const char **cpp, int *len)
368 {
369     const char *cp = *cpp;
370     char cmd[32];
371     int i, level;
372 
373     while (*cp == ' ' || *cp == '\t' || *cp == '\n' || *cp == '\r')
374         cp++;
375     switch (*cp)
376     {
377     case '\0':
378         return 0;
379     case '/':
380         *cpp = cp+1;
381         return REGX_PATTERN;
382     case '{':
383         *cpp = cp+1;
384         level = 1;
385         while (*++cp)
386         {
387             if (*cp == '{')
388                 level++;
389             else if (*cp == '}')
390             {
391                 level--;
392                 if (level == 0)
393                     break;
394             }
395         }
396         *len = cp - *cpp;
397         return REGX_CODE;
398     default:
399         i = 0;
400         while (1)
401         {
402             if (*cp >= 'a' && *cp <= 'z')
403                 cmd[i] = *cp;
404             else if (*cp >= 'A' && *cp <= 'Z')
405                 cmd[i] = *cp + 'a' - 'A';
406             else
407                 break;
408             if (i < (int) sizeof(cmd)-2)
409 		i++;
410             cp++;
411         }
412         cmd[i] = '\0';
413         if (i == 0)
414         {
415             yaz_log (YLOG_WARN, "bad character %d %c", *cp, *cp);
416             cp++;
417             while (*cp && *cp != ' ' && *cp != '\t' &&
418                    *cp != '\n' && *cp != '\r')
419                 cp++;
420             *cpp = cp;
421             return 0;
422         }
423         *cpp = cp;
424         if (!strcmp (cmd, "begin"))
425             return REGX_BEGIN;
426         else if (!strcmp (cmd, "end"))
427             return REGX_END;
428         else if (!strcmp (cmd, "body"))
429             return REGX_BODY;
430 	else if (!strcmp (cmd, "context"))
431 	    return REGX_CONTEXT;
432 	else if (!strcmp (cmd, "init"))
433 	    return REGX_INIT;
434         else
435         {
436             yaz_log (YLOG_WARN, "bad command %s", cmd);
437             return 0;
438         }
439     }
440 }
441 
actionListMk(struct lexSpec * spec,const char * s,struct lexRuleAction ** ap)442 static int actionListMk (struct lexSpec *spec, const char *s,
443                          struct lexRuleAction **ap)
444 {
445     int r, tok, len;
446     int bodyMark = 0;
447     const char *s0;
448 
449     while ((tok = readParseToken (&s, &len)))
450     {
451         switch (tok)
452         {
453         case REGX_BODY:
454             bodyMark = 1;
455             continue;
456         case REGX_CODE:
457             *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
458             (*ap)->which = tok;
459             regxCodeMk (&(*ap)->u.code, s, len);
460             s += len+1;
461             break;
462         case REGX_PATTERN:
463             *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
464             (*ap)->which = tok;
465             (*ap)->u.pattern.body = bodyMark;
466             bodyMark = 0;
467             (*ap)->u.pattern.dfa = lexSpecDFA ();
468 	    s0 = s;
469             r = dfa_parse ((*ap)->u.pattern.dfa, &s);
470             if (r || *s != '/')
471             {
472                 int pos = s - s0;
473                 xfree (*ap);
474                 *ap = NULL;
475                 yaz_log(YLOG_WARN, "regular expression error '%.*s'", pos, s0);
476                 return -1;
477             }
478             else
479             {
480                 int pos = s - s0;
481                 if (debug_dfa_tran)
482                     printf("pattern: %.*s\n", pos, s0);
483                 dfa_mkstate((*ap)->u.pattern.dfa);
484                 s++;
485             }
486             break;
487         case REGX_BEGIN:
488             yaz_log (YLOG_WARN, "cannot use BEGIN here");
489             continue;
490         case REGX_INIT:
491             yaz_log (YLOG_WARN, "cannot use INIT here");
492             continue;
493         case REGX_END:
494             *ap = (struct lexRuleAction *) xmalloc (sizeof(**ap));
495             (*ap)->which = tok;
496             break;
497         }
498         ap = &(*ap)->next;
499     }
500     *ap = NULL;
501     return 0;
502 }
503 
readOneSpec(struct lexSpec * spec,const char * s)504 int readOneSpec (struct lexSpec *spec, const char *s)
505 {
506     int len, r, tok;
507     struct lexRule *rp;
508     struct lexContext *lc;
509 
510     tok = readParseToken (&s, &len);
511     if (tok == REGX_CONTEXT)
512     {
513 	char context_name[32];
514 	tok = readParseToken (&s, &len);
515 	if (tok != REGX_CODE)
516 	{
517 	    yaz_log (YLOG_WARN, "missing name after CONTEXT keyword");
518 	    return 0;
519 	}
520 	if (len > 31)
521 	    len = 31;
522 	memcpy (context_name, s, len);
523 	context_name[len] = '\0';
524 	lc = lexContextCreate (context_name);
525 	lc->next = spec->context;
526 	spec->context = lc;
527 	return 0;
528     }
529     if (!spec->context)
530 	spec->context = lexContextCreate ("main");
531 
532     switch (tok)
533     {
534     case REGX_BEGIN:
535         actionListDel (&spec->context->beginActionList);
536         actionListMk (spec, s, &spec->context->beginActionList);
537 	break;
538     case REGX_END:
539         actionListDel (&spec->context->endActionList);
540         actionListMk (spec, s, &spec->context->endActionList);
541 	break;
542     case REGX_INIT:
543         actionListDel (&spec->context->initActionList);
544         actionListMk (spec, s, &spec->context->initActionList);
545 	break;
546     case REGX_PATTERN:
547 #if REGX_DEBUG
548 	yaz_log (YLOG_LOG, "rule %d %s", spec->context->ruleNo, s);
549 #endif
550         r = dfa_parse (spec->context->dfa, &s);
551         if (r)
552         {
553             yaz_log (YLOG_WARN, "regular expression error. r=%d", r);
554             return -1;
555         }
556         if (*s != '/')
557         {
558             yaz_log (YLOG_WARN, "expects / at end of pattern. got %c", *s);
559             return -1;
560         }
561         s++;
562         rp = (struct lexRule *) xmalloc (sizeof(*rp));
563         rp->info.no = spec->context->ruleNo++;
564         rp->next = spec->context->rules;
565         spec->context->rules = rp;
566         actionListMk (spec, s, &rp->info.actionList);
567     }
568     return 0;
569 }
570 
readFileSpec(struct lexSpec * spec)571 int readFileSpec (struct lexSpec *spec)
572 {
573     struct lexContext *lc;
574     int c, i, errors = 0;
575     FILE *spec_inf = 0;
576     WRBUF lineBuf;
577     char fname[256];
578 
579 #if HAVE_TCL_H
580     if (spec->tcl_interp)
581     {
582 	sprintf (fname, "%s.tflt", spec->name);
583 	spec_inf = data1_path_fopen (spec->dh, fname, "r");
584     }
585 #endif
586     if (!spec_inf)
587     {
588 	sprintf (fname, "%s.flt", spec->name);
589 	spec_inf = data1_path_fopen (spec->dh, fname, "r");
590     }
591     if (!spec_inf)
592     {
593         yaz_log (YLOG_ERRNO|YLOG_WARN, "cannot read spec file %s", spec->name);
594         return -1;
595     }
596     yaz_log (YLOG_LOG, "reading regx filter %s", fname);
597 #if HAVE_TCL_H
598     if (spec->tcl_interp)
599 	yaz_log (YLOG_LOG, "Tcl enabled");
600 #endif
601 
602 #if 0
603     debug_dfa_trav = 0;
604     debug_dfa_tran = 1;
605     debug_dfa_followpos = 0;
606     dfa_verbose = 1;
607 #endif
608 
609     lineBuf = wrbuf_alloc();
610     spec->lineNo = 0;
611     c = getc (spec_inf);
612     while (c != EOF)
613     {
614 	wrbuf_rewind (lineBuf);
615         if (c == '#' || c == '\n' || c == ' ' || c == '\t' || c == '\r')
616         {
617             while (c != '\n' && c != EOF)
618                 c = getc (spec_inf);
619             spec->lineNo++;
620             if (c == '\n')
621                 c = getc (spec_inf);
622         }
623         else
624         {
625             int addLine = 0;
626 
627             while (1)
628             {
629                 int c1 = c;
630 		wrbuf_putc(lineBuf, c);
631                 c = getc (spec_inf);
632 		while (c == '\r')
633 		    c = getc (spec_inf);
634                 if (c == EOF)
635                     break;
636                 if (c1 == '\n')
637                 {
638                     if (c != ' ' && c != '\t')
639                         break;
640                     addLine++;
641                 }
642             }
643 	    wrbuf_putc(lineBuf, '\0');
644             readOneSpec (spec, wrbuf_buf(lineBuf));
645             spec->lineNo += addLine;
646         }
647     }
648     fclose (spec_inf);
649     wrbuf_destroy(lineBuf);
650 
651     for (lc = spec->context; lc; lc = lc->next)
652     {
653 	struct lexRule *rp;
654 	lc->fastRule = (struct lexRuleInfo **)
655 	    xmalloc (sizeof(*lc->fastRule) * lc->ruleNo);
656 	for (i = 0; i < lc->ruleNo; i++)
657 	    lc->fastRule[i] = NULL;
658 	for (rp = lc->rules; rp; rp = rp->next)
659 	    lc->fastRule[rp->info.no] = &rp->info;
660 	dfa_mkstate (lc->dfa);
661     }
662     if (errors)
663         return -1;
664 
665     return 0;
666 }
667 
668 #if 0
669 static struct lexSpec *curLexSpec = NULL;
670 #endif
671 
execData(struct lexSpec * spec,const char * ebuf,int elen,int formatted_text,const char * attribute_str,int attribute_len)672 static void execData (struct lexSpec *spec,
673                       const char *ebuf, int elen, int formatted_text,
674 		      const char *attribute_str, int attribute_len)
675 {
676     struct data1_node *res, *parent;
677     int org_len;
678 
679     if (elen == 0) /* shouldn't happen, but it does! */
680 	return ;
681 #if REGX_DEBUG
682     if (elen > 80)
683         yaz_log (YLOG_LOG, "data(%d bytes) %.40s ... %.*s", elen,
684 	      ebuf, 40, ebuf + elen-40);
685     else if (elen == 1 && ebuf[0] == '\n')
686     {
687         yaz_log (YLOG_LOG, "data(new line)");
688     }
689     else if (elen > 0)
690         yaz_log (YLOG_LOG, "data(%d bytes) %.*s", elen, elen, ebuf);
691     else
692         yaz_log (YLOG_LOG, "data(%d bytes)", elen);
693 #endif
694 
695     if (spec->d1_level <= 1)
696         return;
697 
698     parent = spec->d1_stack[spec->d1_level -1];
699     assert (parent);
700 
701     if (attribute_str)
702     {
703 	data1_xattr **ap;
704 	res = parent;
705 	if (res->which != DATA1N_tag)
706 	    return;
707 	/* sweep through exising attributes.. */
708 	for (ap = &res->u.tag.attributes; *ap; ap = &(*ap)->next)
709 	    if (strlen((*ap)->name) == attribute_len &&
710 		!memcmp((*ap)->name, attribute_str, attribute_len))
711 		break;
712 	if (!*ap)
713 	{
714 	    /* new attribute. Create it with name + value */
715 	    *ap = nmem_malloc(spec->m, sizeof(**ap));
716 
717 	    (*ap)->name = nmem_malloc(spec->m, attribute_len+1);
718 	    memcpy((*ap)->name, attribute_str, attribute_len);
719 	    (*ap)->name[attribute_len] = '\0';
720 
721 	    (*ap)->value = nmem_malloc(spec->m, elen+1);
722 	    memcpy((*ap)->value, ebuf, elen);
723 	    (*ap)->value[elen] = '\0';
724 	    (*ap)->next = 0;
725 	}
726 	else
727 	{
728 	    /* append to value if attribute already exists */
729 	    char *nv = nmem_malloc(spec->m, elen + 1 + strlen((*ap)->value));
730 	    strcpy(nv, (*ap)->value);
731 	    memcpy (nv + strlen(nv), ebuf, elen);
732 	    nv[strlen(nv)+elen] = '\0';
733 	    (*ap)->value = nv;
734 	}
735     }
736     else
737     {
738 	if ((res = spec->d1_stack[spec->d1_level]) &&
739 	    res->which == DATA1N_data)
740 	    org_len = res->u.data.len;
741 	else
742 	{
743 	    org_len = 0;
744 
745 	    res = data1_mk_node2 (spec->dh, spec->m, DATA1N_data, parent);
746 	    res->u.data.what = DATA1I_text;
747 	    res->u.data.len = 0;
748 	    res->u.data.formatted_text = formatted_text;
749 	    res->u.data.data = 0;
750 
751 	    if (spec->d1_stack[spec->d1_level])
752 		spec->d1_stack[spec->d1_level]->next = res;
753 	    spec->d1_stack[spec->d1_level] = res;
754 	}
755 	if (org_len + elen >= spec->concatBuf[spec->d1_level].max)
756 	{
757 	    char *old_buf, *new_buf;
758 
759 	    spec->concatBuf[spec->d1_level].max = org_len + elen + 256;
760 	    new_buf = (char *) xmalloc (spec->concatBuf[spec->d1_level].max);
761 	    if ((old_buf = spec->concatBuf[spec->d1_level].buf))
762 	    {
763 		memcpy (new_buf, old_buf, org_len);
764 		xfree (old_buf);
765 	    }
766 	    spec->concatBuf[spec->d1_level].buf = new_buf;
767 	}
768 	memcpy (spec->concatBuf[spec->d1_level].buf + org_len, ebuf, elen);
769 	res->u.data.len += elen;
770     }
771 }
772 
/* Add plain text data: execData without an attribute name. */
static void execDataP (struct lexSpec *spec,
                       const char *ebuf, int elen, int formatted_text)
{
    const char *no_attribute = 0;

    execData (spec, ebuf, elen, formatted_text, no_attribute, 0);
}
778 
tagDataRelease(struct lexSpec * spec)779 static void tagDataRelease (struct lexSpec *spec)
780 {
781     data1_node *res;
782 
783     if ((res = spec->d1_stack[spec->d1_level]) &&
784 	res->which == DATA1N_data &&
785 	res->u.data.what == DATA1I_text)
786     {
787 	assert (!res->u.data.data);
788 	assert (res->u.data.len > 0);
789 	if (res->u.data.len > DATA1_LOCALDATA)
790 	    res->u.data.data = (char *) nmem_malloc (spec->m, res->u.data.len);
791 	else
792 	    res->u.data.data = res->lbuf;
793 	memcpy (res->u.data.data, spec->concatBuf[spec->d1_level].buf,
794 		res->u.data.len);
795     }
796 }
797 
variantBegin(struct lexSpec * spec,const char * class_str,int class_len,const char * type_str,int type_len,const char * value_str,int value_len)798 static void variantBegin (struct lexSpec *spec,
799 			  const char *class_str, int class_len,
800 			  const char *type_str, int type_len,
801 			  const char *value_str, int value_len)
802 {
803     struct data1_node *parent = spec->d1_stack[spec->d1_level -1];
804     char tclass[DATA1_MAX_SYMBOL], ttype[DATA1_MAX_SYMBOL];
805     data1_vartype *tp;
806     int i;
807     data1_node *res;
808 
809     if (spec->d1_level == 0)
810     {
811         yaz_log (YLOG_WARN, "in variant begin. No record type defined");
812         return ;
813     }
814     if (class_len >= DATA1_MAX_SYMBOL)
815 	class_len = DATA1_MAX_SYMBOL-1;
816     memcpy (tclass, class_str, class_len);
817     tclass[class_len] = '\0';
818 
819     if (type_len >= DATA1_MAX_SYMBOL)
820 	type_len = DATA1_MAX_SYMBOL-1;
821     memcpy (ttype, type_str, type_len);
822     ttype[type_len] = '\0';
823 
824 #if REGX_DEBUG
825     yaz_log (YLOG_LOG, "variant begin(%s,%s,%d)", tclass, ttype,
826 	  spec->d1_level);
827 #endif
828 
829     if (!(tp =
830 	  data1_getvartypeby_absyn(spec->dh, parent->root->u.root.absyn,
831 				   tclass, ttype)))
832 	return;
833 
834     if (parent->which != DATA1N_variant)
835     {
836 	res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
837 	if (spec->d1_stack[spec->d1_level])
838 	    tagDataRelease (spec);
839 	spec->d1_stack[spec->d1_level] = res;
840 	spec->d1_stack[++(spec->d1_level)] = NULL;
841     }
842     for (i = spec->d1_level-1; spec->d1_stack[i]->which == DATA1N_variant; i--)
843 	if (spec->d1_stack[i]->u.variant.type == tp)
844 	{
845 	    spec->d1_level = i;
846 	    break;
847 	}
848 
849 #if REGX_DEBUG
850     yaz_log (YLOG_LOG, "variant node(%d)", spec->d1_level);
851 #endif
852     parent = spec->d1_stack[spec->d1_level-1];
853     res = data1_mk_node2 (spec->dh, spec->m, DATA1N_variant, parent);
854     res->u.variant.type = tp;
855 
856     if (value_len >= DATA1_LOCALDATA)
857 	value_len =DATA1_LOCALDATA-1;
858     memcpy (res->lbuf, value_str, value_len);
859     res->lbuf[value_len] = '\0';
860 
861     res->u.variant.value = res->lbuf;
862 
863     if (spec->d1_stack[spec->d1_level])
864 	tagDataRelease (spec);
865     spec->d1_stack[spec->d1_level] = res;
866     spec->d1_stack[++(spec->d1_level)] = NULL;
867 }
868 
/*
 * Trim white space from both ends of the string *tag of length *len.
 * The string is not modified: *tag is moved forward past leading white
 * space and *len is reduced to the length of what remains.  A NULL *tag
 * is left untouched.
 */
static void tagStrip (const char **tag, int *len)
{
    int i;

    if (!*tag)
        return;
    /* The casts keep bytes above 0x7f from reaching isspace as negative
       values, which the ctype functions do not accept. */
    for (i = *len; i > 0 && isspace((unsigned char) (*tag)[i-1]); --i)
        ;
    *len = i;
    for (i = 0; i < *len && isspace((unsigned char) (*tag)[i]); i++)
        ;
    *tag += i;
    *len -= i;
}
881 
tagBegin(struct lexSpec * spec,const char * tag,int len)882 static void tagBegin (struct lexSpec *spec,
883                       const char *tag, int len)
884 {
885     if (spec->d1_level == 0)
886     {
887         yaz_log (YLOG_WARN, "in element begin. No record type defined");
888         return ;
889     }
890     tagStrip (&tag, &len);
891     if (spec->d1_stack[spec->d1_level])
892 	tagDataRelease (spec);
893 
894 #if REGX_DEBUG
895     yaz_log (YLOG_LOG, "begin tag(%.*s, %d)", len, tag, spec->d1_level);
896 #endif
897 
898     spec->d1_stack[spec->d1_level] = data1_mk_tag_n (
899         spec->dh, spec->m, tag, len, 0, spec->d1_stack[spec->d1_level -1]);
900     spec->d1_stack[++(spec->d1_level)] = NULL;
901 }
902 
tagEnd(struct lexSpec * spec,int min_level,const char * tag,int len)903 static void tagEnd (struct lexSpec *spec, int min_level,
904                     const char *tag, int len)
905 {
906     tagStrip (&tag, &len);
907     while (spec->d1_level > min_level)
908     {
909 	tagDataRelease (spec);
910         (spec->d1_level)--;
911         if (spec->d1_level == 0)
912 	    break;
913         if ((spec->d1_stack[spec->d1_level]->which == DATA1N_tag) &&
914 	    (!tag ||
915 	     (strlen(spec->d1_stack[spec->d1_level]->u.tag.tag) ==
916 	      (size_t) len &&
917 	      !memcmp (spec->d1_stack[spec->d1_level]->u.tag.tag, tag, len))))
918             break;
919     }
920 #if REGX_DEBUG
921     yaz_log (YLOG_LOG, "end tag(%d)", spec->d1_level);
922 #endif
923 }
924 
925 
/*
 * Scan the input stream from offset *pptr for the first match of dfa.
 *
 * Returns 1 when a match is found, with *mptr set to the offset where the
 * match starts and *pptr to the offset just after the last character of
 * the longest match found from that start.  Returns 0 when the end of the
 * input is reached without a match; *pptr and *mptr are then unchanged.
 *
 * An accepting state can carry two rule numbers: rule_no is only honoured
 * when the character preceding the match attempt is a newline, otherwise
 * rule_nno is used if it is set.
 *
 * The greedy argument is not used by this function.
 */
static int tryMatch (struct lexSpec *spec, int *pptr, int *mptr,
                     struct DFA *dfa, int greedy)
{
    struct DFA_state *state = dfa->states[0];
    struct DFA_tran *t;
    unsigned char c = 0;
    unsigned char c_prev = 0;
    int ptr = *pptr;          /* current pointer */
    int start_ptr = *pptr;    /* first char of match */
    int last_ptr = 0;         /* last char of match */
    int last_rule = 0;        /* rule number of current match */
    int restore_ptr = 0;
    int i;

    /* unless at the very beginning, re-read the character just before the
       start position so that c holds the preceding character */
    if (ptr)
    {
	--ptr;
        c = f_win_advance (spec, &ptr);
    }
    while (1)
    {
        /* at the start state a new match attempt begins: remember the
           preceding character and where to resume if the attempt fails */
	if (dfa->states[0] == state)
	{
	    c_prev = c;
	    restore_ptr = ptr;
	}
        c = f_win_advance (spec, &ptr);

        /* end of input: report the match recorded so far, if any */
        if (ptr == F_WIN_EOF)
        {
            if (last_rule)
            {
                *mptr = start_ptr;
                *pptr = last_ptr;
                return 1;
            }
            break;
        }

        /* look through the transitions of the current state for one whose
           character range contains c */
        t = state->trans;
        i = state->tran_no;
        while (1)
            if (--i < 0)    /* no transition for character c */
            {
                if (last_rule)
                {
                    *mptr = start_ptr;     /* match starts here */
                    *pptr = last_ptr;      /* match end here (+1) */
                    return 1;
                }
                /* failed attempt: go back to the start state and retry
                   one character after where this attempt began */
                state = dfa->states[0];

		ptr = restore_ptr;
		c = f_win_advance (spec, &ptr);

                start_ptr = ptr;

                break;
            }
            else if (c >= t->ch[0] && c <= t->ch[1])
            {
                state = dfa->states[t->to];
                /* accepting state: record rule and end position, but keep
                   scanning for a longer match */
                if (state->rule_no && c_prev == '\n')
		{
		    last_rule = state->rule_no;
		    last_ptr = ptr;
		}
		else if (state->rule_nno)
		{
		    last_rule = state->rule_nno;
		    last_ptr = ptr;
		}
		break;
            }
            else
                t++;
    }
    return 0;
}
1005 
execTok(struct lexSpec * spec,const char ** src,const char ** tokBuf,int * tokLen)1006 static int execTok (struct lexSpec *spec, const char **src,
1007                     const char **tokBuf, int *tokLen)
1008 {
1009     const char *s = *src;
1010 
1011     while (*s == ' ' || *s == '\t')
1012         s++;
1013     if (!*s)
1014         return 0;
1015     if (*s == '$' && s[1] >= '0' && s[1] <= '9')
1016     {
1017         int n = 0;
1018         s++;
1019         while (*s >= '0' && *s <= '9')
1020             n = n*10 + (*s++ -'0');
1021         if (spec->arg_no == 0)
1022         {
1023             *tokBuf = "";
1024             *tokLen = 0;
1025         }
1026         else
1027         {
1028             if (n >= spec->arg_no)
1029                 n = spec->arg_no-1;
1030             *tokBuf = f_win_get (spec, spec->arg_start[n], spec->arg_end[n],
1031 				 tokLen);
1032         }
1033     }
1034     else if (*s == '\"')
1035     {
1036         *tokBuf = ++s;
1037         while (*s && *s != '\"')
1038             s++;
1039         *tokLen = s - *tokBuf;
1040         if (*s)
1041             s++;
1042         *src = s;
1043     }
1044     else if (*s == '\n' || *s == ';')
1045     {
1046         *src = s+1;
1047         return 1;
1048     }
1049     else if (*s == '-')
1050     {
1051         *tokBuf = s++;
1052         while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1053                *s != ';')
1054             s++;
1055         *tokLen = s - *tokBuf;
1056         *src = s;
1057         return 3;
1058     }
1059     else
1060     {
1061         *tokBuf = s++;
1062         while (*s && *s != ' ' && *s != '\t' && *s != '\n' && *s != '\r' &&
1063                *s != ';')
1064             s++;
1065         *tokLen = s - *tokBuf;
1066     }
1067     *src = s;
1068     return 2;
1069 }
1070 
/*
 * Copy at most 63 of the len bytes at src into str and NUL-terminate.
 * A negative len is treated as 0.  Returns str.
 * str needs room for 64 bytes.
 */
static char *regxStrz (const char *src, int len, char *str)
{
    if (len > 63)
        len = 63;
    else if (len < 0)
        len = 0;            /* a negative count must never reach memcpy */
    if (len > 0)
        memcpy (str, src, len);
    str[len] = '\0';
    return str;
}
1079 
1080 #if HAVE_TCL_H
/*
 * cmd_tcl_begin: Tcl command "begin".
 *
 * Accepted forms (anything else yields TCL_ERROR):
 *   begin record  <absyn>
 *   begin element <tag>
 *   begin variant <class> <type> <value>
 *   begin context <name>
 *
 * clientData is the struct lexSpec registered with the command.
 * An unknown context name is only logged; the command still succeeds.
 */
static int cmd_tcl_begin (ClientData clientData, Tcl_Interp *interp,
                          int argc, const char **argv)
{
    struct lexSpec *spec = (struct lexSpec *) clientData;
    const char *kind;

    if (argc < 2)
        return TCL_ERROR;
    kind = argv[1];

    if (argc == 3 && strcmp (kind, "record") == 0)
    {
        const char *absynName = argv[2];
        data1_node *node;

#if REGX_DEBUG
        yaz_log (YLOG_LOG, "begin record %s", absynName);
#endif
        /* restart the node stack with the root and its top-level tag */
        node = data1_mk_root (spec->dh, spec->m, absynName);
        spec->d1_level = 0;
        spec->d1_stack[spec->d1_level] = node;
        spec->d1_level++;

        node = data1_mk_tag (spec->dh, spec->m, absynName, 0, node);
        spec->d1_stack[spec->d1_level] = node;
        spec->d1_level++;

        spec->d1_stack[spec->d1_level] = NULL;
        return TCL_OK;
    }
    if (argc == 3 && strcmp (kind, "element") == 0)
    {
        tagBegin (spec, argv[2], strlen(argv[2]));
        return TCL_OK;
    }
    if (argc == 5 && strcmp (kind, "variant") == 0)
    {
        variantBegin (spec, argv[2], strlen(argv[2]),
                      argv[3], strlen(argv[3]),
                      argv[4], strlen(argv[4]));
        return TCL_OK;
    }
    if (argc == 3 && strcmp (kind, "context") == 0)
    {
        struct lexContext *lc;

#if REGX_DEBUG
        yaz_log (YLOG_LOG, "begin context %s",argv[2]);
#endif
        for (lc = spec->context; lc; lc = lc->next)
            if (strcmp (argv[2], lc->name) == 0)
                break;
        if (!lc)
            yaz_log (YLOG_WARN, "unknown context %s", argv[2]);
        else
            spec->context_stack[++(spec->context_stack_top)] = lc;
        return TCL_OK;
    }
    return TCL_ERROR;
}
1136 
/*
 * cmd_tcl_end: Tcl command "end".
 *
 * Accepted forms (anything else yields TCL_ERROR):
 *   end record
 *   end element [-record] [<tag>]
 *   end context
 *
 * "end record" unwinds the whole node stack and raises stop_flag.
 * "end element" also raises stop_flag once the stack depth is <= 1.
 * "end context" pops the context stack unless it is already at 0.
 */
static int cmd_tcl_end (ClientData clientData, Tcl_Interp *interp,
                        int argc, const char **argv)
{
    struct lexSpec *spec = (struct lexSpec *) clientData;
    const char *kind;

    if (argc < 2)
        return TCL_ERROR;
    kind = argv[1];

    if (strcmp (kind, "record") == 0)
    {
        for (; spec->d1_level; (spec->d1_level)--)
            tagDataRelease (spec);
#if REGX_DEBUG
        yaz_log (YLOG_LOG, "end record");
#endif
        spec->stop_flag = 1;
        return TCL_OK;
    }
    if (strcmp (kind, "element") == 0)
    {
        int with_record = (argc >= 3 && strcmp (argv[2], "-record") == 0);
        const char *tag = 0;

        /* the tag name is only honoured when it is the last argument */
        if (with_record)
        {
            if (argc == 4)
                tag = argv[3];
        }
        else if (argc == 3)
            tag = argv[2];

        tagEnd (spec, with_record ? 0 : 2, tag, (tag ? strlen(tag) : 0));
        if (spec->d1_level <= 1)
        {
#if REGX_DEBUG
            yaz_log (YLOG_LOG, "end element end records");
#endif
            spec->stop_flag = 1;
        }
        return TCL_OK;
    }
    if (strcmp (kind, "context") == 0)
    {
#if REGX_DEBUG
        yaz_log (YLOG_LOG, "end context");
#endif
        if (spec->context_stack_top)
            (spec->context_stack_top)--;
        return TCL_OK;
    }
    return TCL_ERROR;
}
1190 
/*
 * cmd_tcl_data: Tcl command "data".
 *
 *   data [-text] [-element <tag>] [-attribute <name>] <item>...
 *
 * Leading options are parsed until the first argument that is not a
 * known option; every remaining argument is passed to execData. When
 * -element is given, the items are wrapped in tagBegin/tagEnd for that
 * tag. With Tcl newer than 8.0 each item is converted with
 * Tcl_UtfToExternalDString before it is handed to execData.
 * Always returns TCL_OK.
 */
static int cmd_tcl_data (ClientData clientData, Tcl_Interp *interp,
                         int argc, const char **argv)
{
    struct lexSpec *spec = (struct lexSpec *) clientData;
    const char *element = 0;
    const char *attribute = 0;
    int textFlag = 0;
    int i = 1;
    int parsing_options = 1;

    while (parsing_options && i < argc)
    {
        const char *opt = argv[i];

        if (strcmp ("-text", opt) == 0)
        {
            textFlag = 1;
            i++;
        }
        else if (strcmp ("-element", opt) == 0)
        {
            /* an option without a value is silently accepted */
            if (++i < argc)
                element = argv[i++];
        }
        else if (strcmp ("-attribute", opt) == 0)
        {
            if (++i < argc)
                attribute = argv[i++];
        }
        else
            parsing_options = 0;
    }

    if (element)
        tagBegin (spec, element, strlen(element));

    for (; i < argc; i++)
    {
#if TCL_MAJOR_VERSION > 8 || (TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION > 0)
        Tcl_DString ds;
        char *native = Tcl_UtfToExternalDString(0, argv[i], -1, &ds);
        execData (spec, native, strlen(native), textFlag, attribute,
                  attribute ? strlen(attribute) : 0);
        Tcl_DStringFree (&ds);
#else
        execData (spec, argv[i], strlen(argv[i]), textFlag, attribute,
                  attribute ? strlen(attribute) : 0);
#endif
    }

    if (element)
        tagEnd (spec, 2, NULL, 0);
    return TCL_OK;
}
1243 
/*
 * cmd_tcl_unread: Tcl command "unread".
 *
 *   unread [-offset <n>] <index>
 *
 * Moves the read pointer (spec->ptr) back to the start of sub-match
 * <index>, plus the optional offset. An index beyond the last sub-match
 * is clamped to the last one.
 *
 * Returns TCL_ERROR when the index argument is missing, when extra
 * arguments follow it, or when the resulting index is negative (a
 * negative argument, or no sub-matches available); TCL_OK otherwise.
 */
static int cmd_tcl_unread (ClientData clientData, Tcl_Interp *interp,
                           int argc, const char **argv)
{
    struct lexSpec *spec = (struct lexSpec *) clientData;
    int argi = 1;
    int offset = 0;
    int no;

    while (argi < argc)
    {
        if (!strcmp("-offset", argv[argi]))
        {
            argi++;
            if (argi < argc)
            {
                offset = atoi(argv[argi]);
                argi++;
            }
        }
        else
            break;
    }
    /* exactly one argument, the index, must remain */
    if (argi != argc-1)
        return TCL_ERROR;
    no = atoi(argv[argi]);
    if (no >= spec->arg_no)
        no = spec->arg_no - 1;
    /* never index arg_start[] with a negative value */
    if (no < 0)
        return TCL_ERROR;
    spec->ptr = spec->arg_start[no] + offset;
    return TCL_OK;
}
1274 
execTcl(struct lexSpec * spec,struct regxCode * code)1275 static void execTcl (struct lexSpec *spec, struct regxCode *code)
1276 {
1277     int i;
1278     int ret;
1279     for (i = 0; i < spec->arg_no; i++)
1280     {
1281 	char var_name[10], *var_buf;
1282 	int var_len, ch;
1283 
1284 	sprintf (var_name, "%d", i);
1285 	var_buf = f_win_get (spec, spec->arg_start[i], spec->arg_end[i],
1286 			     &var_len);
1287 	if (var_buf)
1288 	{
1289 	    ch = var_buf[var_len];
1290 	    var_buf[var_len] = '\0';
1291 	    Tcl_SetVar (spec->tcl_interp, var_name, var_buf, 0);
1292 	    var_buf[var_len] = ch;
1293 	}
1294     }
1295 #if HAVE_TCL_OBJECTS
1296     ret = Tcl_GlobalEvalObj(spec->tcl_interp, code->tcl_obj);
1297 #else
1298     ret = Tcl_GlobalEval (spec->tcl_interp, code->str);
1299 #endif
1300     if (ret != TCL_OK)
1301     {
1302     	const char *err = Tcl_GetVar(spec->tcl_interp, "errorInfo", 0);
1303 	yaz_log(YLOG_FATAL, "Tcl error, line=%d, \"%s\"\n%s",
1304 #if TCL_MAJOR_VERSION == 8 && TCL_MINOR_VERSION < 5
1305 	    spec->tcl_interp->errorLine,
1306 #else
1307 	    Tcl_GetErrorLine(spec->tcl_interp),
1308 #endif
1309 	    Tcl_GetStringResult(spec->tcl_interp),
1310 	    err ? err : "[NO ERRORINFO]");
1311     }
1312 }
1313 /* HAVE_TCL_H */
1314 #endif
1315 
execCode(struct lexSpec * spec,struct regxCode * code)1316 static void execCode (struct lexSpec *spec, struct regxCode *code)
1317 {
1318     const char *s = code->str;
1319     int cmd_len, r;
1320     const char *cmd_str;
1321 
1322     r = execTok (spec, &s, &cmd_str, &cmd_len);
1323     while (r)
1324     {
1325         char *p, ptmp[64];
1326 
1327         if (r == 1)
1328         {
1329             r = execTok (spec, &s, &cmd_str, &cmd_len);
1330             continue;
1331         }
1332         p = regxStrz (cmd_str, cmd_len, ptmp);
1333         if (!strcmp (p, "begin"))
1334         {
1335             r = execTok (spec, &s, &cmd_str, &cmd_len);
1336             if (r < 2)
1337 	    {
1338 		yaz_log (YLOG_WARN, "missing keyword after 'begin'");
1339                 continue;
1340 	    }
1341             p = regxStrz (cmd_str, cmd_len, ptmp);
1342             if (!strcmp (p, "record"))
1343             {
1344                 r = execTok (spec, &s, &cmd_str, &cmd_len);
1345                 if (r < 2)
1346                     continue;
1347                 if (spec->d1_level <= 1)
1348                 {
1349                     static char absynName[64];
1350                     data1_node *res;
1351 
1352                     if (cmd_len > 63)
1353                         cmd_len = 63;
1354                     memcpy (absynName, cmd_str, cmd_len);
1355                     absynName[cmd_len] = '\0';
1356 #if REGX_DEBUG
1357                     yaz_log (YLOG_LOG, "begin record %s", absynName);
1358 #endif
1359                     res = data1_mk_root (spec->dh, spec->m, absynName);
1360 
1361 		    spec->d1_level = 0;
1362 
1363                     spec->d1_stack[spec->d1_level++] = res;
1364 
1365                     res = data1_mk_tag (spec->dh, spec->m, absynName, 0, res);
1366 
1367                     spec->d1_stack[spec->d1_level++] = res;
1368 
1369                     spec->d1_stack[spec->d1_level] = NULL;
1370                 }
1371                 r = execTok (spec, &s, &cmd_str, &cmd_len);
1372             }
1373             else if (!strcmp (p, "element"))
1374             {
1375                 r = execTok (spec, &s, &cmd_str, &cmd_len);
1376                 if (r < 2)
1377                     continue;
1378                 tagBegin (spec, cmd_str, cmd_len);
1379                 r = execTok (spec, &s, &cmd_str, &cmd_len);
1380             }
1381 	    else if (!strcmp (p, "variant"))
1382 	    {
1383 		int class_len;
1384 		const char *class_str = NULL;
1385 		int type_len;
1386 		const char *type_str = NULL;
1387 		int value_len;
1388 		const char *value_str = NULL;
1389 		r = execTok (spec, &s, &cmd_str, &cmd_len);
1390 		if (r < 2)
1391 		    continue;
1392 		class_str = cmd_str;
1393 		class_len = cmd_len;
1394 		r = execTok (spec, &s, &cmd_str, &cmd_len);
1395 		if (r < 2)
1396 		    continue;
1397 		type_str = cmd_str;
1398 		type_len = cmd_len;
1399 
1400 		r = execTok (spec, &s, &cmd_str, &cmd_len);
1401 		if (r < 2)
1402 		    continue;
1403 		value_str = cmd_str;
1404 		value_len = cmd_len;
1405 
1406                 variantBegin (spec, class_str, class_len,
1407 			      type_str, type_len, value_str, value_len);
1408 
1409 
1410 		r = execTok (spec, &s, &cmd_str, &cmd_len);
1411 	    }
1412 	    else if (!strcmp (p, "context"))
1413 	    {
1414 		if (r > 1)
1415 		{
1416 		    struct lexContext *lc = spec->context;
1417 		    r = execTok (spec, &s, &cmd_str, &cmd_len);
1418 		    p = regxStrz (cmd_str, cmd_len, ptmp);
1419 #if REGX_DEBUG
1420 		    yaz_log (YLOG_LOG, "begin context %s", p);
1421 #endif
1422 		    while (lc && strcmp (p, lc->name))
1423 			lc = lc->next;
1424 		    if (lc)
1425 			spec->context_stack[++(spec->context_stack_top)] = lc;
1426 		    else
1427 			yaz_log (YLOG_WARN, "unknown context %s", p);
1428 
1429 		}
1430 		r = execTok (spec, &s, &cmd_str, &cmd_len);
1431 	    }
1432 	    else
1433 	    {
1434 		yaz_log (YLOG_WARN, "bad keyword '%s' after begin", p);
1435 	    }
1436         }
1437         else if (!strcmp (p, "end"))
1438         {
1439             r = execTok (spec, &s, &cmd_str, &cmd_len);
1440             if (r < 2)
1441 	    {
1442 		yaz_log (YLOG_WARN, "missing keyword after 'end'");
1443 		continue;
1444 	    }
1445 	    p = regxStrz (cmd_str, cmd_len, ptmp);
1446 	    if (!strcmp (p, "record"))
1447 	    {
1448 		while (spec->d1_level)
1449 		{
1450 		    tagDataRelease (spec);
1451 		    (spec->d1_level)--;
1452 		}
1453 		r = execTok (spec, &s, &cmd_str, &cmd_len);
1454 #if REGX_DEBUG
1455 		yaz_log (YLOG_LOG, "end record");
1456 #endif
1457 		spec->stop_flag = 1;
1458 	    }
1459 	    else if (!strcmp (p, "element"))
1460 	    {
1461                 int min_level = 2;
1462                 while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1463                 {
1464                     if (cmd_len==7 && !memcmp ("-record", cmd_str, cmd_len))
1465                         min_level = 0;
1466                 }
1467 		if (r > 2)
1468 		{
1469 		    tagEnd (spec, min_level, cmd_str, cmd_len);
1470 		    r = execTok (spec, &s, &cmd_str, &cmd_len);
1471 		}
1472 		else
1473 		    tagEnd (spec, min_level, NULL, 0);
1474                 if (spec->d1_level <= 1)
1475                 {
1476 #if REGX_DEBUG
1477 		    yaz_log (YLOG_LOG, "end element end records");
1478 #endif
1479 		    spec->stop_flag = 1;
1480                 }
1481 
1482 	    }
1483 	    else if (!strcmp (p, "context"))
1484 	    {
1485 #if REGX_DEBUG
1486 		yaz_log (YLOG_LOG, "end context");
1487 #endif
1488 		if (spec->context_stack_top)
1489 		    (spec->context_stack_top)--;
1490 		r = execTok (spec, &s, &cmd_str, &cmd_len);
1491 	    }
1492 	    else
1493 		yaz_log (YLOG_WARN, "bad keyword '%s' after end", p);
1494 	}
1495         else if (!strcmp (p, "data"))
1496         {
1497             int textFlag = 0;
1498             int element_len;
1499             const char *element_str = NULL;
1500 	    int attribute_len;
1501 	    const char *attribute_str = NULL;
1502 
1503             while ((r = execTok (spec, &s, &cmd_str, &cmd_len)) == 3)
1504             {
1505                 if (cmd_len==5 && !memcmp ("-text", cmd_str, cmd_len))
1506                     textFlag = 1;
1507                 else if (cmd_len==8 && !memcmp ("-element", cmd_str, cmd_len))
1508                 {
1509                     r = execTok (spec, &s, &element_str, &element_len);
1510                     if (r < 2)
1511                         break;
1512                 }
1513                 else if (cmd_len==10 && !memcmp ("-attribute", cmd_str,
1514 						 cmd_len))
1515                 {
1516                     r = execTok (spec, &s, &attribute_str, &attribute_len);
1517                     if (r < 2)
1518                         break;
1519                 }
1520                 else
1521                     yaz_log (YLOG_WARN, "bad data option: %.*s",
1522                           cmd_len, cmd_str);
1523             }
1524             if (r != 2)
1525             {
1526                 yaz_log (YLOG_WARN, "missing data item after data");
1527                 continue;
1528             }
1529             if (element_str)
1530                 tagBegin (spec, element_str, element_len);
1531             do
1532             {
1533                 execData (spec, cmd_str, cmd_len, textFlag,
1534 			  attribute_str, attribute_len);
1535                 r = execTok (spec, &s, &cmd_str, &cmd_len);
1536             } while (r > 1);
1537             if (element_str)
1538                 tagEnd (spec, 2, NULL, 0);
1539         }
1540         else if (!strcmp (p, "unread"))
1541         {
1542             int no, offset;
1543             r = execTok (spec, &s, &cmd_str, &cmd_len);
1544             if (r==3 && cmd_len == 7 && !memcmp ("-offset", cmd_str, cmd_len))
1545             {
1546                 r = execTok (spec, &s, &cmd_str, &cmd_len);
1547                 if (r < 2)
1548                 {
1549                     yaz_log (YLOG_WARN, "missing number after -offset");
1550                     continue;
1551                 }
1552                 p = regxStrz (cmd_str, cmd_len, ptmp);
1553                 offset = atoi (p);
1554                 r = execTok (spec, &s, &cmd_str, &cmd_len);
1555             }
1556             else
1557                 offset = 0;
1558             if (r < 2)
1559             {
1560                 yaz_log (YLOG_WARN, "missing index after unread command");
1561                 continue;
1562             }
1563             if (cmd_len != 1 || *cmd_str < '0' || *cmd_str > '9')
1564             {
1565                 yaz_log (YLOG_WARN, "bad index after unread command");
1566                 continue;
1567             }
1568             else
1569             {
1570                 no = *cmd_str - '0';
1571                 if (no >= spec->arg_no)
1572                     no = spec->arg_no - 1;
1573                 spec->ptr = spec->arg_start[no] + offset;
1574             }
1575             r = execTok (spec, &s, &cmd_str, &cmd_len);
1576         }
1577 	else if (!strcmp (p, "context"))
1578 	{
1579             if (r > 1)
1580 	    {
1581 		struct lexContext *lc = spec->context;
1582 		r = execTok (spec, &s, &cmd_str, &cmd_len);
1583 		p = regxStrz (cmd_str, cmd_len, ptmp);
1584 
1585 		while (lc && strcmp (p, lc->name))
1586 		    lc = lc->next;
1587 		if (lc)
1588 		    spec->context_stack[spec->context_stack_top] = lc;
1589 		else
1590 		    yaz_log (YLOG_WARN, "unknown context %s", p);
1591 
1592 	    }
1593 	    r = execTok (spec, &s, &cmd_str, &cmd_len);
1594 	}
1595         else
1596         {
1597             yaz_log (YLOG_WARN, "unknown code command '%.*s'", cmd_len, cmd_str);
1598             r = execTok (spec, &s, &cmd_str, &cmd_len);
1599             continue;
1600         }
1601         if (r > 1)
1602         {
1603             yaz_log (YLOG_WARN, "ignoring token %.*s", cmd_len, cmd_str);
1604             do {
1605                 r = execTok (spec, &s, &cmd_str, &cmd_len);
1606             } while (r > 1);
1607         }
1608     }
1609 }
1610 
1611 
execAction(struct lexSpec * spec,struct lexRuleAction * ap,int start_ptr,int * pptr)1612 static int execAction (struct lexSpec *spec, struct lexRuleAction *ap,
1613                        int start_ptr, int *pptr)
1614 {
1615     int sptr;
1616     int arg_start[20];
1617     int arg_end[20];
1618     int arg_no = 1;
1619 
1620     if (!ap)
1621 	return 1;
1622     arg_start[0] = start_ptr;
1623     arg_end[0] = *pptr;
1624     spec->arg_start = arg_start;
1625     spec->arg_end = arg_end;
1626 
1627     while (ap)
1628     {
1629         switch (ap->which)
1630         {
1631         case REGX_PATTERN:
1632             if (ap->u.pattern.body)
1633             {
1634                 arg_start[arg_no] = *pptr;
1635                 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 0))
1636                 {
1637                     arg_end[arg_no] = F_WIN_EOF;
1638                     arg_no++;
1639                     arg_start[arg_no] = F_WIN_EOF;
1640                     arg_end[arg_no] = F_WIN_EOF;
1641 		    yaz_log(YLOG_DEBUG, "Pattern match rest of record");
1642 		    *pptr = F_WIN_EOF;
1643                 }
1644                 else
1645                 {
1646                     arg_end[arg_no] = sptr;
1647                     arg_no++;
1648                     arg_start[arg_no] = sptr;
1649                     arg_end[arg_no] = *pptr;
1650                 }
1651             }
1652             else
1653             {
1654                 arg_start[arg_no] = *pptr;
1655                 if (!tryMatch (spec, pptr, &sptr, ap->u.pattern.dfa, 1))
1656                     return 1;
1657                 if (sptr != arg_start[arg_no])
1658                     return 1;
1659                 arg_end[arg_no] = *pptr;
1660             }
1661             arg_no++;
1662             break;
1663         case REGX_CODE:
1664 	    spec->arg_no = arg_no;
1665 	    spec->ptr = *pptr;
1666 #if HAVE_TCL_H
1667 	    if (spec->tcl_interp)
1668 		execTcl(spec, ap->u.code);
1669 	    else
1670 		execCode (spec, ap->u.code);
1671 #else
1672 	    execCode (spec, ap->u.code);
1673 #endif
1674 	    *pptr = spec->ptr;
1675 	    if (spec->stop_flag)
1676 		return 0;
1677             break;
1678         case REGX_END:
1679             arg_start[arg_no] = *pptr;
1680             arg_end[arg_no] = F_WIN_EOF;
1681             arg_no++;
1682             *pptr = F_WIN_EOF;
1683         }
1684         ap = ap->next;
1685     }
1686     return 1;
1687 }
1688 
execRule(struct lexSpec * spec,struct lexContext * context,int ruleNo,int start_ptr,int * pptr)1689 static int execRule (struct lexSpec *spec, struct lexContext *context,
1690                      int ruleNo, int start_ptr, int *pptr)
1691 {
1692 #if REGX_DEBUG
1693     yaz_log (YLOG_LOG, "exec rule %d", ruleNo);
1694 #endif
1695     return execAction (spec, context->fastRule[ruleNo]->actionList,
1696                        start_ptr, pptr);
1697 }
1698 
/*
 * lexNode: drive the DFA of the current context over the input.
 *
 * Starting at *ptr, characters are fetched with f_win_advance and fed to
 * the DFA of the context on top of the context stack. The longest match
 * seen so far is remembered in last_rule/last_ptr. When the DFA has no
 * transition for a character (or the input ends), the text that was
 * skipped before the match is handed to execDataP and the matched rule
 * is run through execRule.
 *
 * spec  lexer state.
 * ptr   in/out read position; F_WIN_EOF once the input is exhausted.
 *
 * Returns non-zero if the DFA reached an accepting state (a state with a
 * rule number) at least once, zero otherwise.
 */
int lexNode (struct lexSpec *spec, int *ptr)
{
    struct lexContext *context = spec->context_stack[spec->context_stack_top];
    struct DFA_state *state = context->dfa->states[0];
    struct DFA_tran *t;
    unsigned char c;
    unsigned char c_prev = '\n';  /* char before the match start; initially
                                     as if a newline preceded the input */
    int i;
    int last_rule = 0;        /* rule number of current match */
    int last_ptr = *ptr;      /* last char of match */
    int start_ptr = *ptr;     /* first char of match */
    int skip_ptr = *ptr;      /* first char of run */
    int more = 0;             /* set once any accepting state is reached */

    while (1)
    {
        c = f_win_advance (spec, ptr);
        if (*ptr == F_WIN_EOF)
        {
	    /* end of file met */
            if (last_rule)
            {
		/* there was a match */
                if (skip_ptr < start_ptr)
                {
		    /* deal with chars that didn't match */
                    int size;
                    char *buf;
                    buf = f_win_get (spec, skip_ptr, start_ptr, &size);
                    execDataP (spec, buf, size, 0);
                }
		/* restore pointer */
                *ptr = last_ptr;
		/* execute rule */
                if (!execRule (spec, context, last_rule, start_ptr, ptr))
                    return more;
		/* restore skip pointer */
                skip_ptr = *ptr;
                last_rule = 0;
            }
            else if (skip_ptr < *ptr)
            {
		/* deal with chars that didn't match */
                int size;
                char *buf;
                buf = f_win_get (spec, skip_ptr, *ptr, &size);
                execDataP (spec, buf, size, 0);
            }
            state = context->dfa->states[0];
            /* still at EOF after the rule ran (or no rule): we are done;
               otherwise the rule moved the pointer back and scanning goes on */
            if (*ptr == F_WIN_EOF)
                return more;
        }
        /* look for a transition of the current state that covers c;
           i counts the transitions that remain to be examined */
        t = state->trans;
        i = state->tran_no;
        while (1)
            if (--i < 0)
            {   /* no transition for character c ... */
                if (last_rule)
                {
                    if (skip_ptr < start_ptr)
                    {
			/* deal with chars that didn't match */
                        int size;
                        char *buf;
                        buf = f_win_get (spec, skip_ptr, start_ptr, &size);
                        execDataP (spec, buf, size, 0);
                    }
		    /* restore pointer */
                    *ptr = last_ptr;
                    if (!execRule (spec, context, last_rule, start_ptr, ptr))
                    {
                        /* rule asked to stop: report the end position to the
                           stream's end callback, if one is installed */
                        if (spec->f_win_ef && *ptr != F_WIN_EOF)
			{
                            off_t end_offset = *ptr;
#if REGX_DEBUG
			    yaz_log (YLOG_LOG, "regx: endf ptr=%d", *ptr);
#endif
                            (*spec->f_win_ef)(spec->stream, &end_offset);
			}
                        return more;
                    }
                    /* the rule may have pushed/popped/replaced the context */
		    context = spec->context_stack[spec->context_stack_top];
                    skip_ptr = *ptr;
                    last_rule = 0;
                    last_ptr = start_ptr = *ptr;
                    /* re-read the character just before the new start so
                       that c_prev is correct for the next match */
                    if (start_ptr > 0)
                    {
                        --start_ptr;
                        c_prev = f_win_advance (spec, &start_ptr);
                    }
                }
                else
                {
                    /* nothing matched from start_ptr: move the start one
                       character forward and rescan from there; the skipped
                       character stays in the [skip_ptr, start_ptr) run */
                    c_prev = f_win_advance (spec, &start_ptr);
                    *ptr = start_ptr;
                }
                state = context->dfa->states[0];
                break;
            }
            else if (c >= t->ch[0] && c <= t->ch[1])
            {   /* transition ... */
                state = context->dfa->states[t->to];
                if (state->rule_no)
                {
                    /* accepting state: rule_no applies when the match
                       started right after a newline, rule_nno (if set)
                       otherwise */
                    if (c_prev == '\n')
                    {
                        last_rule = state->rule_no;
                        last_ptr = *ptr;
                    }
                    else if (state->rule_nno)
                    {
                        last_rule = state->rule_nno;
                        last_ptr = *ptr;
                    }
                    more = 1;
                }
                break;
            }
            else
                t++;
    }
    return more;
}
1822 
lexRoot(struct lexSpec * spec,off_t offset,const char * context_name)1823 static data1_node *lexRoot (struct lexSpec *spec, off_t offset,
1824 			    const char *context_name)
1825 {
1826     struct lexContext *lt = spec->context;
1827     int ptr = offset;
1828     int ret;
1829 
1830     spec->stop_flag = 0;
1831     spec->d1_level = 0;
1832     spec->context_stack_top = 0;
1833     while (lt)
1834     {
1835 	if (!strcmp (lt->name, context_name))
1836 	    break;
1837 	lt = lt->next;
1838     }
1839     if (!lt)
1840     {
1841 	yaz_log (YLOG_WARN, "cannot find context %s", context_name);
1842 	return NULL;
1843     }
1844     spec->context_stack[spec->context_stack_top] = lt;
1845     spec->d1_stack[spec->d1_level] = NULL;
1846 #if 1
1847     if (!lt->initFlag)
1848     {
1849 	lt->initFlag = 1;
1850 	execAction (spec, lt->initActionList, ptr, &ptr);
1851     }
1852 #endif
1853     execAction (spec, lt->beginActionList, ptr, &ptr);
1854 
1855     ret = lexNode (spec, &ptr);
1856     while (spec->d1_level)
1857     {
1858 	tagDataRelease (spec);
1859 	(spec->d1_level)--;
1860     }
1861     if (!ret)
1862         return 0;
1863     execAction (spec, lt->endActionList, ptr, &ptr);
1864     return spec->d1_stack[0];
1865 }
1866 
grs_destroy(void * clientData)1867 void grs_destroy(void *clientData)
1868 {
1869     struct lexSpecs *specs = (struct lexSpecs *) clientData;
1870     if (specs->spec)
1871     {
1872 	lexSpecDestroy(&specs->spec);
1873     }
1874     xfree (specs);
1875 }
1876 
/*
 * grs_init: allocate a filter handle with no lexer specification loaded
 * and an empty type name. The arguments are not used.
 */
void *grs_init(Res res, RecType recType)
{
    struct lexSpecs *handle = (struct lexSpecs *) xmalloc (sizeof(*handle));

    handle->spec = 0;
    handle->type[0] = '\0';
    return handle;
}
1884 
1885 
grs_config(void * clientData,Res res,const char * args)1886 ZEBRA_RES grs_config(void *clientData, Res res, const char *args)
1887 {
1888     struct lexSpecs *specs = (struct lexSpecs *) clientData;
1889     if (strlen(args) < sizeof(specs->type))
1890 	strcpy(specs->type, args);
1891     return ZEBRA_OK;
1892 }
1893 
grs_read_regx(struct grs_read_info * p)1894 data1_node *grs_read_regx (struct grs_read_info *p)
1895 {
1896     int res;
1897     struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1898     struct lexSpec **curLexSpec = &specs->spec;
1899     off_t start_offset;
1900 
1901 #if REGX_DEBUG
1902     yaz_log (YLOG_LOG, "grs_read_regx");
1903 #endif
1904     if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type))
1905     {
1906         if (*curLexSpec)
1907             lexSpecDestroy (curLexSpec);
1908         *curLexSpec = lexSpecCreate (specs->type, p->dh);
1909         res = readFileSpec (*curLexSpec);
1910         if (res)
1911         {
1912             lexSpecDestroy (curLexSpec);
1913             return NULL;
1914         }
1915     }
1916     (*curLexSpec)->dh = p->dh;
1917     start_offset = p->stream->tellf(p->stream);
1918     if (start_offset == 0)
1919     {
1920         (*curLexSpec)->f_win_start = 0;
1921         (*curLexSpec)->f_win_end = 0;
1922         (*curLexSpec)->f_win_rf = p->stream->readf;
1923         (*curLexSpec)->f_win_sf = p->stream->seekf;
1924         (*curLexSpec)->stream = p->stream;
1925         (*curLexSpec)->f_win_ef = p->stream->endf;
1926         (*curLexSpec)->f_win_size = 500000;
1927     }
1928     (*curLexSpec)->m = p->mem;
1929     return lexRoot (*curLexSpec, start_offset, "main");
1930 }
1931 
extract_regx(void * clientData,struct recExtractCtrl * ctrl)1932 static int extract_regx(void *clientData, struct recExtractCtrl *ctrl)
1933 {
1934     return zebra_grs_extract(clientData, ctrl, grs_read_regx);
1935 }
1936 
retrieve_regx(void * clientData,struct recRetrieveCtrl * ctrl)1937 static int retrieve_regx(void *clientData, struct recRetrieveCtrl *ctrl)
1938 {
1939     return zebra_grs_retrieve(clientData, ctrl, grs_read_regx);
1940 }
1941 
/* Record type descriptor for the "grs.regx" filter. The initializer is
   positional; struct recType is declared elsewhere, so the first field's
   meaning is not visible here (NOTE(review): confirm against the
   recType declaration). */
static struct recType regx_type = {
    0,
    "grs.regx",         /* filter name */
    grs_init,           /* create handle */
    grs_config,         /* store filter arguments */
    grs_destroy,        /* release handle */
    extract_regx,       /* extract via grs_read_regx */
    retrieve_regx,      /* retrieve via grs_read_regx */
};
1951 
1952 
1953 #if HAVE_TCL_H
grs_read_tcl(struct grs_read_info * p)1954 data1_node *grs_read_tcl (struct grs_read_info *p)
1955 {
1956     int res;
1957     struct lexSpecs *specs = (struct lexSpecs *) p->clientData;
1958     struct lexSpec **curLexSpec = &specs->spec;
1959     off_t start_offset;
1960 
1961 #if REGX_DEBUG
1962     yaz_log (YLOG_LOG, "grs_read_tcl");
1963 #endif
1964     if (!*curLexSpec || strcmp ((*curLexSpec)->name, specs->type))
1965     {
1966 	Tcl_Interp *tcl_interp;
1967         if (*curLexSpec)
1968             lexSpecDestroy (curLexSpec);
1969         *curLexSpec = lexSpecCreate (specs->type, p->dh);
1970 	Tcl_FindExecutable("");
1971 	tcl_interp = (*curLexSpec)->tcl_interp = Tcl_CreateInterp();
1972 	Tcl_Init(tcl_interp);
1973 	Tcl_CreateCommand (tcl_interp, "begin", cmd_tcl_begin, *curLexSpec, 0);
1974 	Tcl_CreateCommand (tcl_interp, "end", cmd_tcl_end, *curLexSpec, 0);
1975 	Tcl_CreateCommand (tcl_interp, "data", cmd_tcl_data, *curLexSpec, 0);
1976 	Tcl_CreateCommand (tcl_interp, "unread", cmd_tcl_unread,
1977 			   *curLexSpec, 0);
1978         res = readFileSpec (*curLexSpec);
1979         if (res)
1980         {
1981             lexSpecDestroy (curLexSpec);
1982             return NULL;
1983         }
1984     }
1985     (*curLexSpec)->dh = p->dh;
1986     start_offset = p->stream->tellf(p->stream);
1987     if (start_offset == 0)
1988     {
1989         (*curLexSpec)->f_win_start = 0;
1990         (*curLexSpec)->f_win_end = 0;
1991         (*curLexSpec)->f_win_rf = p->stream->readf;
1992         (*curLexSpec)->f_win_sf = p->stream->seekf;
1993         (*curLexSpec)->stream = p->stream;
1994         (*curLexSpec)->f_win_ef = p->stream->endf;
1995         (*curLexSpec)->f_win_size = 500000;
1996     }
1997     (*curLexSpec)->m = p->mem;
1998     return lexRoot (*curLexSpec, start_offset, "main");
1999 }
2000 
extract_tcl(void * clientData,struct recExtractCtrl * ctrl)2001 static int extract_tcl(void *clientData, struct recExtractCtrl *ctrl)
2002 {
2003     return zebra_grs_extract(clientData, ctrl, grs_read_tcl);
2004 }
2005 
retrieve_tcl(void * clientData,struct recRetrieveCtrl * ctrl)2006 static int retrieve_tcl(void *clientData, struct recRetrieveCtrl *ctrl)
2007 {
2008     return zebra_grs_retrieve(clientData, ctrl, grs_read_tcl);
2009 }
2010 
/* Record type descriptor for the "grs.tcl" filter (only built when
   HAVE_TCL_H is set). It shares the handle management functions with
   grs.regx and differs only in the extract/retrieve callbacks. The first
   field's meaning is not visible here (NOTE(review): confirm against the
   recType declaration). */
static struct recType tcl_type = {
    0,
    "grs.tcl",          /* filter name */
    grs_init,           /* create handle */
    grs_config,         /* store filter arguments */
    grs_destroy,        /* release handle */
    extract_tcl,        /* extract via grs_read_tcl */
    retrieve_tcl,       /* retrieve via grs_read_tcl */
};
2020 
2021 #endif
2022 
/* Zero-terminated table of the record types provided by this file.
   The table is named idzebra_filter_grs_regx when
   IDZEBRA_STATIC_GRS_REGX is set, and idzebra_filter otherwise
   (presumably the symbol looked up when the filter is loaded as a
   module -- confirm against the loader). */
RecType
#if IDZEBRA_STATIC_GRS_REGX
idzebra_filter_grs_regx
#else
idzebra_filter
#endif

[] = {
    &regx_type,
#if HAVE_TCL_H
    &tcl_type,          /* only when Tcl support is compiled in */
#endif
    0,                  /* end of table */
};
2037 /*
2038  * Local variables:
2039  * c-basic-offset: 4
2040  * c-file-style: "Stroustrup"
2041  * indent-tabs-mode: nil
2042  * End:
2043  * vim: shiftwidth=4 tabstop=8 expandtab
2044  */
2045 
2046