1 /*
2 System: Structured text retrieval tool sgrep.
3 Module: parser.c
4 Author: Pekka Kilpel�inen & Jani Jaakkola
5 Description: Parses expression string ( which has been gathered in
6 main.c and preprocessed in preprocess.c )
7 Returns operator tree.
8 Used through fuction parse_tree()
9 Version history: Original version February 1995 by JJ & PK
10 Copyright: University of Helsinki, Dept. of Computer Science
11 Distributed under GNU General Public Lisence
12 See file COPYING for details
13 */
14 #include <stdio.h>
15 #include <ctype.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include "defines.h"
19
20 /* Maximum length of one phrase. */
21 #define MAX_PHRASE_LENGTH 512
22 /* Maximum length of a reserved word (operation). Could also be smaller */
23 #define MAX_R_WORD 20
24
25 /*
26 * This points to first phrase in the phrase list
27 */
28 struct PHRASE_NODE *first_phrase=NULL;
29
30 void parse_error(char *);
31 char *give_str();
32 int get_next_char();
33 int get_next(string **);
34 struct TREE_NODE *reg_expr(int *, string **);
35 struct TREE_NODE *basic_expr(int *, string **);
36 struct TREE_NODE *oper_expr(int *, string **,struct TREE_NODE *);
37
38 #ifdef DEBUG
39 void print_tree(struct TREE_NODE *);
40 #endif
41
42 /*
43 * When looking for reserved words they are matched to these strings
44 */
45 char *r_word_str[]=
46 { "in","not","containing","or","..","_.","._","__","extracting",
47 "quote","_quote","quote_","_quote_",
48 "equal", /* PK Febr 12 '96 */
49 "outer","inner","concat","start","end","chars","join" };
50
51 /*
52 * The command string and a index to it
53 */
54 char *expr_str=NULL;
55 int expr_ind=0;
56
57 /*
58 * for telling where errors occurred
59 */
60 #define ELLENGTH 79
61 #define SHOWR 5
62 int line,column;
63 int errcol,errind;
64
65 /*
66 * Gets next character from input expression. Counts also lines and columns
67 * return character must be unsigned to keep sgrep is 8-bit clean
68 */
get_next_char()69 int get_next_char()
70 {
71 switch (expr_str[expr_ind]) {
72 case '\n':
73 column=-1;
74 line++;
75 break;
76 case '\t':
77 expr_str[expr_ind]=' ';
78 break;
79 }
80 column++;
81 return ( (unsigned char *)expr_str)[expr_ind++];
82 }
83
84 /*
85 * Scans next 'word' from input string. If word was a phrase sets **phrase
86 */
get_next(string ** phrase)87 int get_next(string **phrase)
88 {
89 static int ch=-1;
90 char str[MAX_PHRASE_LENGTH];
91 int i=0,j=0;
92 int start;
93 static int state=0;
94 int onum=0;
95
96 /* if we already have next character in ch then ch!=-1. Otherwise we have to
97 get one */
98 if (ch==-1) ch=get_next_char();
99
100 /* These show the place where possible error started */
101 errcol=column;
102 errind=expr_ind;
103
104 /* Finite automaton for recognizing next word */
105 do {
106 switch (state) {
107
108 /* This is the start state */
109 case 0:
110 /* Lets skip the spaces */
111 while (isspace(ch)) ch=get_next_char();
112
113 /* These show the place where possible error started */
114 errcol=column;
115 errind=expr_ind;
116
117 switch(ch) {
118 case 0:
119 return W_END;
120 case '(':
121 ch=-1;
122 return W_LPAREN;
123 case ')':
124 ch=-1;
125 return W_RPAREN;
126 case '[':
127 ch=-1;
128 return W_LBRACK;
129 case ']':
130 ch=-1;
131 return W_RBRACK;
132 case ',':
133 ch=-1;
134 return W_COMMA;
135 case '\"':
136 i=0;
137 state=1;
138 break;
139 case '#':
140 state=4;
141 break;
142 }
143 if ( isalpha(ch) || ch=='.' || ch=='_' )
144 {
145 i=1;
146 str[0]=tolower(ch);
147 state=2;
148 start=column;
149 }
150 if (ch>='0' && ch<='9')
151 {
152 i=1;
153 str[0]=ch;
154 start=column;
155 state=5;
156 }
157 if ( state==0 )
158 {
159 #ifdef DEBUG
160 fprintf(stderr,"Character %c ascii(%d)\n",ch,ch);
161 #endif
162 parse_error("Invalid character");
163 }
164 break;
165
166 /* This state reads a phrase string */
167 case 1:
168 if (ch==0 || ch=='\n')
169 parse_error("Unterminated phrase string");
170 if ( ch<' ' )
171 parse_error("Unprintable character in phrase");
172 if (ch=='\"')
173 {
174 if ( i==0 )
175 parse_error("Empty phrase");
176 *phrase=init_string(i,str);
177 state=0;
178 ch=-1;
179 return W_PHRASE;
180 }
181 if ( i==MAX_PHRASE_LENGTH )
182 {
183 fprintf(stderr,"%s ( > %d ) %s %d\n%s\n%s\n",
184 "Phrase length exceeds",MAX_PHRASE_LENGTH,"on line",line,
185 "Either you have forgotten the quote at the end of phrase or",
186 "you have to recompile with greater MAX_PHRASE_LENGTH.");
187 exit(2);
188 }
189 if (ch=='\\' ) state=3;
190 if ( state==1 ) str[i++]=ch;
191 break;
192
193 /* This state reads a reserved word (operator) */
194 case 2:
195 if ( ch==0 || ( !isalpha(ch) && ch!='.' && ch!='_' ) )
196 {
197 state=0;
198 str[i]=0;
199 for(j=0;j<R_WORDS;j++)
200 if (strcmp(r_word_str[j],str)==0) return j;
201 parse_error("Unknown word");
202 }
203 if (i==MAX_R_WORD)
204 parse_error("Unknown word");
205 str[i++]=tolower(ch);
206 break;
207
208 /* This state handles escape sequences */
209 case 3:
210 if ( ch==0 ) parse_error("Unterminated phrase");
211 switch (ch) {
212 case 't': ch='\t';break;
213 case 'n': ch='\n';break;
214 case 'r': ch='\r';break;
215 case 'f': ch='\f';break;
216 case 'b': ch='\b';break;
217 case '\\': ch='\\';break;
218 case '\"': ch='\"';break;
219 case '0':
220 case '1':
221 case '2':
222 case '3':
223 /* Octal number requires new state */
224 state=6;
225 onum=64*(ch-'0');
226 break;
227 default:
228 errcol=column;
229 errind=expr_ind;
230 parse_error("Unknown escape sequence");
231 break;
232 }
233 if (state==3) /* always true except for octal number */
234 {
235 str[i++]=ch;
236 state=1;
237 }
238 break;
239 /* This state skips comments */
240 case 4:
241 if (ch==0)
242 {
243 state=0;
244 return W_END;
245 }
246 if (ch=='\n') state=0;
247 break;
248 /* We read a number */
249 case 5:
250 if ( ch<'0'|| ch>'9' )
251 {
252 state=0;
253 str[i]=0;
254 *phrase=init_string(i,str);
255 return W_NUMBER;
256 }
257 if (i==9)
258 parse_error("Too big number");
259 str[i++]=ch;
260 break;
261 /* Read octal number #2 */
262 case 6:
263 if ( ch<'0' || ch>'7')
264 {
265 errcol=column;
266 errind=expr_ind;
267 parse_error("Octal number expected");
268 }
269 onum+=8*(ch-'0');
270 state=7;
271 break;
272 /* Read octal number #3 */
273 case 7:
274 if ( ch<'0' || ch>'7')
275 {
276 errcol=column;
277 errind=expr_ind;
278 parse_error("Octal number expected");
279 }
280 onum+=ch-'0';
281 if (onum==0)
282 {
283 errcol=column-3;
284 errind=expr_ind-3;
285 parse_error("\\000 in phrase does'nt work (yet)\n");
286 }
287 str[i++]=onum;
288 state=1;
289 break;
290 }
291
292 ch=get_next_char();
293 } while (TRUE);
294
295 }
296
297 /*
298 * Shows a given error message & where it occurred
299 */
parse_error(char * error)300 void parse_error(char *error)
301 {
302 char erlin[ELLENGTH+1];
303 int i;
304
305 if (errcol-ELLENGTH+SHOWR>0)
306 errind-=ELLENGTH-SHOWR;
307 else errind-=errcol;
308 for(i=0;i<ELLENGTH && expr_str[i+errind] && expr_str[i+errind]!='\n';i++)
309 erlin[i]=expr_str[i+errind];
310 erlin[i]=0;
311 fprintf(stderr,"Parse error on line %d column %d :\n\t%s\n%s\n",
312 line,errcol,error,erlin);
313 if ( errcol>ELLENGTH-SHOWR ) errcol=ELLENGTH-SHOWR;
314 for (i=0;i<errcol-1;i++) fprintf(stderr," ");
315 fprintf(stderr,"^\n");
316 exit(2);
317 }
318
319 /*
320 * Creates and initializez an operator node to parse tree
321 */
create_tree_node(int oper)322 struct TREE_NODE *create_tree_node(int oper)
323 {
324 struct TREE_NODE *n;
325
326 n=(struct TREE_NODE *)e_malloc(sizeof(struct TREE_NODE));
327 n->left=NULL;
328 n->right=NULL;
329 n->parent=NULL;
330 n->oper=oper;
331 n->number=-1;
332 n->leaf=NULL;
333 n->label_left=LABEL_NOTKNOWN;
334 n->label_right=LABEL_NOTKNOWN;
335 n->refcount=0;
336 n->GC_list=NULL;
337 return n;
338 }
339
340 /*
341 * Creates and initializes a leaf node to parse tree
342 */
create_leaf_node(int oper,string * phrase,int phrase_label)343 struct TREE_NODE * create_leaf_node(int oper, string *phrase, int phrase_label)
344 {
345 struct TREE_NODE *n;
346
347 /* Let's create a leaf node */
348 n=create_tree_node(oper);
349 n->label_left=phrase_label;
350 /* Let's create a phrase node for the phrase */
351 n->leaf=(struct PHRASE_NODE *)
352 e_malloc(sizeof (struct PHRASE_NODE));
353 n->leaf->phrase=phrase;
354 return n;
355 }
356
357 /*
358 * Here starts recursive parser
359 */
360
361 /* production basic_expr->constant_list */
cons_list(int * next,string ** phrase,struct GC_LIST * c_list)362 void cons_list(int *next, string **phrase, struct GC_LIST *c_list)
363 {
364 int s,e,ps,pe;
365 char *cons_err="invalid constant region list";
366
367 stats.constant_lists++;
368
369 ps=-1;pe=-1;
370 while (*next!=W_RBRACK) /* We can fall out here immediately,
371 which means we have empty list */
372 {
373 if (*next!=W_LPAREN)
374 parse_error(cons_err);
375 *next=get_next(phrase);
376 if (*next!=W_NUMBER)
377 parse_error(cons_err);
378 s=atoi((char *)(**phrase).s);
379 *next=get_next(phrase);
380 if (*next!=W_COMMA)
381 parse_error(cons_err);
382 *next=get_next(phrase);
383 if (*next!=W_NUMBER)
384 parse_error(cons_err);
385 e=atoi((char *)(**phrase).s);
386 *next=get_next(phrase);
387 if (*next!=W_RPAREN)
388 parse_error(cons_err);
389 if (e<s)
390 parse_error("region end point must be greater than start point");
391 *next=get_next(phrase);
392 if (s<ps || (s==ps && e<=pe) )
393 parse_error("constant gc list must be sorted");
394 if (e<=pe || s==ps)
395 {
396 /* nesting detected */
397 c_list->nested=TRUE;
398 }
399 add_region(c_list,s,e);
400 ps=s;pe=e;
401 }
402 }
403
404 /* Which kind of basic expressions */
basic_expr(int * next,string ** phrase)405 struct TREE_NODE *basic_expr(int *next, string **phrase)
406 {
407 struct TREE_NODE *n;
408
409 switch (*next) {
410 case W_PHRASE:
411 /* A phrase was found */
412 n=create_leaf_node(PHRASE,*phrase,LABEL_PHRASE);
413 /* Let's put the new phrase node to phrase list */
414 n->leaf->next=first_phrase;
415 first_phrase=n->leaf;
416 *next=get_next(phrase);
417 return n;
418 case W_START:
419 /* reserved word 'start' was found */
420 n=create_leaf_node(PHRASE,NULL,LABEL_START);
421 /* we use already created list (see create_constant_lists() )*/
422 n->leaf->GC_list=start_list;
423 *next=get_next(phrase);
424 return n;
425 case W_LAST:
426 /* reserved word 'end' was found */
427 n=create_leaf_node(PHRASE,NULL,LABEL_END);
428 /* we use already created list (see create_constant_lists() )*/
429 n->leaf->GC_list=end_list;
430 *next=get_next(phrase);
431 return n;
432 case W_LBRACK:
433 /* We got a constant region list */
434 n=create_leaf_node(PHRASE,NULL,LABEL_CONS);
435 n->leaf->GC_list=new_gclist();
436 *next=get_next(phrase);
437 cons_list(next,phrase,n->leaf->GC_list);
438 *next=get_next(phrase);
439 return n;
440 case W_CHARS:
441 /* reserved wors 'chars' was found */
442 n=create_leaf_node(PHRASE,NULL,LABEL_CHARS);
443 /* we use already created list (see create_constant_lists() )*/
444 n->leaf->GC_list=chars_list;
445 *next=get_next(phrase);
446 return n;
447 case W_LPAREN:
448 *next=get_next(phrase);
449 n=reg_expr(next,phrase);
450 if (*next!=W_RPAREN)
451 parse_error(") expected");
452 *next=get_next(phrase);
453 return n;
454 case W_OUTER:
455 case W_INNER:
456 case W_CONCAT:
457 /* So we have function. Let's create a tree node for it */
458 n=create_tree_node( (*next==W_OUTER) ? OUTER:
459 ((*next==W_INNER) ? INNER:CONCAT) );
460 /* Let's parse the parameter */
461 *next=get_next(phrase);
462 if (*next!=W_LPAREN)
463 parse_error("( expected");
464 *next=get_next(phrase);
465 n->left=reg_expr(next,phrase);
466 n->right=NULL; /* Function has only one parameter */
467 if (*next!=W_RPAREN)
468 parse_error(") expected");
469 *next=get_next(phrase);
470 return n;
471 case W_JOIN:
472 /* So we have join function. Let's create a tree node for it */
473 n=create_tree_node(JOIN);
474 /* Let's parse the parameters */
475 *next=get_next(phrase);
476 if (*next!=W_LPAREN)
477 parse_error("( expected");
478 *next=get_next(phrase);
479 if (*next!=W_NUMBER)
480 parse_error("join needs a number: join(NUMBER,expression)");
481 n->number=atoi((char *)(**phrase).s);
482 if (n->number<1)
483 parse_error("join number must be >= 1");
484 *next=get_next(phrase);
485 if (*next!=W_COMMA)
486 parse_error("join expected comma: join(NUMBER,expression)");
487 *next=get_next(phrase);
488 n->left=reg_expr(next,phrase);
489 n->right=NULL;
490 if (*next!=W_RPAREN)
491 parse_error(") expected");
492 *next=get_next(phrase);
493 return n;
494 default:
495 parse_error("Basic expression expected\n");
496 }
497 /* Dummy return for keeping c++ compilers quiet */
498 return NULL;
499 }
500
501 /* Which kind region_expression */
reg_expr(int * next,string ** phrase)502 struct TREE_NODE *reg_expr(int *next, string **phrase)
503 {
504 struct TREE_NODE *left;
505 if (*next==W_END)
506 parse_error("Unexpected end of expression");
507
508 left=basic_expr(next,phrase);
509 if ( *next==W_END || *next==W_RPAREN ) return left;
510 return oper_expr(next,phrase,left);
511 }
512
513 /* Which kind of operator expression */
oper_expr(int * next,string ** phrase,struct TREE_NODE * left)514 struct TREE_NODE *oper_expr(int *next, string **phrase,struct TREE_NODE *left)
515 {
516 struct TREE_NODE *o=NULL;
517
518 switch (*next)
519 {
520 case W_IN:
521 o=create_tree_node(IN);
522 break;
523 case W_CONTAINING:
524 o=create_tree_node(CONTAINING);
525 break;
526 /* PK Febr 95 */
527 case W_EQUAL:
528 o=create_tree_node(EQUAL);
529 break;
530 /* PK Febr 95 */
531 case W_OR:
532 o=create_tree_node(OR);
533 break;
534 case W_ORDERED:
535 o=create_tree_node(ORDERED);
536 break;
537 case W_L_ORDERED:
538 o=create_tree_node(L_ORDERED);
539 break;
540 case W_R_ORDERED:
541 o=create_tree_node(R_ORDERED);
542 break;
543 case W_LR_ORDERED:
544 o=create_tree_node(LR_ORDERED);
545 break;
546 case W_EXTRACTING:
547 o=create_tree_node(EXTRACTING);
548 break;
549 case W_QUOTE:
550 o=create_tree_node(QUOTE);
551 break;
552 case W_R_QUOTE:
553 o=create_tree_node(R_QUOTE);
554 break;
555 case W_L_QUOTE:
556 o=create_tree_node(L_QUOTE);
557 break;
558 case W_LR_QUOTE:
559 o=create_tree_node(LR_QUOTE);
560 break;
561 case W_NOT:
562 *next=get_next(phrase);
563 if (*next==W_CONTAINING) o=create_tree_node(NOT_CONTAINING);
564 else if (*next==W_IN) o=create_tree_node(NOT_IN);
565 else if (*next==W_EQUAL) o=create_tree_node(NOT_EQUAL);
566 else parse_error("'not' must be followed by 'in', 'containing' or 'equal'");
567 break;
568 default:
569 parse_error("Operator expected");
570 }
571 *next=get_next(phrase);
572 o->right=basic_expr(next,phrase);
573 o->left=left;
574 if (*next==W_END || *next==W_RPAREN) return o;
575 return oper_expr(next,phrase,o);
576 }
577
578 /* End of recursive parser */
579
580 /*
581 * Parses the given command string. Returns root node of the parse tree.
582 * phrase_list returns pointer to phrase list
583 */
parse_string(const char * str,struct PHRASE_NODE ** phrase_list)584 struct TREE_NODE *parse_string(const char *str,
585 struct PHRASE_NODE **phrase_list)
586 {
587 string *phrase;
588 int next;
589 struct TREE_NODE *root;
590
591 line=1;
592 column=0;
593
594 expr_str=(char *) str;
595
596 next=get_next(&phrase);
597 root=reg_expr(&next,&phrase);
598
599 if (next==W_RPAREN)
600 {
601 parse_error("Too many )'s");
602 }
603
604 if (next!=W_END)
605 {
606 fprintf(stderr,"Premature end of parsing. ( This shouldn't happen!! )\n");
607 exit(3);
608 }
609
610 #ifdef DEBUG
611 print_tree(root);
612 fprintf(stderr,"\n");
613 #endif
614 *phrase_list=first_phrase;
615 return root;
616 }
617
618 #ifdef DEBUG
print_tree(struct TREE_NODE * root)619 void print_tree(struct TREE_NODE *root)
620 {
621 if ( root==NULL )
622 {
623 fprintf(stderr,"\nprint_tree: got NULL node\n");
624 exit(3);
625 }
626 if ( root->oper==PHRASE )
627 {
628 fprintf(stderr," \"%s\"",root->leaf->phrase->s);
629 return;
630 }
631 if (root->oper<0 || root->oper>R_WORDS)
632 {
633 printf("\nprint tree: got invalid oper (%d)\n",root->oper);
634 exit(3);
635 }
636
637 if (root->right!=NULL)
638 {
639 fprintf(stderr," (");
640 print_tree(root->left);
641
642 fprintf(stderr," %s",r_word_str[root->oper]);
643
644 print_tree(root->right);
645 fprintf(stderr," )");
646 } else
647 {
648 fprintf(stderr," %s(",r_word_str[root->oper]);
649 print_tree(root->left);
650 fprintf(stderr," )");
651 }
652 }
653 #endif
654