1 /*
2 	System: Structured text retrieval tool sgrep.
3 	Module: parser.c
4 	Author: Pekka Kilpel�inen & Jani Jaakkola
5 	Description: Parses expression string ( which has been gathered in
6 	             main.c and preprocessed in preprocess.c )
7 	             Returns operator tree.
8 	             Used through fuction parse_tree()
9 	Version history: Original version February 1995 by JJ & PK
10 	Copyright: University of Helsinki, Dept. of Computer Science
11 		   Distributed under GNU General Public Lisence
12 		   See file COPYING for details
13 */
14 #include <stdio.h>
15 #include <ctype.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include "defines.h"
19 
20 /* Maximum length of one phrase. */
21 #define MAX_PHRASE_LENGTH 512
22 /* Maximum length of a reserved word (operation). Could also be smaller */
23 #define MAX_R_WORD 20
24 
25 /*
26  * This points to first phrase in the phrase list
27  */
28 struct PHRASE_NODE *first_phrase=NULL;
29 
30 void parse_error(char *);
31 char *give_str();
32 int get_next_char();
33 int get_next(string **);
34 struct TREE_NODE *reg_expr(int *, string **);
35 struct TREE_NODE *basic_expr(int *, string **);
36 struct TREE_NODE *oper_expr(int *, string **,struct TREE_NODE *);
37 
38 #ifdef DEBUG
39 void print_tree(struct TREE_NODE *);
40 #endif
41 
42 /*
43  * When looking for reserved words they are matched to these strings
44  */
45 char *r_word_str[]=
46 	{ "in","not","containing","or","..","_.","._","__","extracting",
47 	"quote","_quote","quote_","_quote_",
48 	"equal",		/* PK Febr 12 '96 */
49 	"outer","inner","concat","start","end","chars","join" };
50 
51 /*
52  * The command string and a index to it
53  */
54 char *expr_str=NULL;
55 int expr_ind=0;
56 
57 /*
58  * for telling where errors occurred
59  */
60 #define ELLENGTH 79
61 #define SHOWR 5
62 int line,column;
63 int errcol,errind;
64 
65 /*
66  * Gets next character from input expression. Counts also lines and columns
67  * return character must be unsigned to keep sgrep is 8-bit clean
68  */
get_next_char()69 int get_next_char()
70 {
71 	switch (expr_str[expr_ind]) {
72 		case '\n':
73 			column=-1;
74 			line++;
75 			break;
76 		case '\t':
77 			expr_str[expr_ind]=' ';
78 			break;
79 	}
80 	column++;
81 	return ( (unsigned char *)expr_str)[expr_ind++];
82 }
83 
84 /*
85  * Scans next 'word' from input string. If word was a phrase sets **phrase
86  */
get_next(string ** phrase)87 int get_next(string **phrase)
88 {
89 	static int ch=-1;
90 	char str[MAX_PHRASE_LENGTH];
91 	int i=0,j=0;
92 	int start;
93 	static int state=0;
94 	int onum=0;
95 
96 /* if we already have next character in ch then ch!=-1. Otherwise we have to
97    get one */
98 	if (ch==-1) ch=get_next_char();
99 
100 /* These show the place where possible error started */
101 	errcol=column;
102 	errind=expr_ind;
103 
104 /* Finite automaton for recognizing next word */
105 	do {
106 	switch (state) {
107 
108 	/* This is the start state */
109 	case 0:
110 		/* Lets skip the spaces */
111 		while (isspace(ch)) ch=get_next_char();
112 
113 		/* These show the place where possible error started */
114 		errcol=column;
115 		errind=expr_ind;
116 
117 		switch(ch) {
118 		case 0:
119 			return W_END;
120 		case '(':
121 			ch=-1;
122 			return W_LPAREN;
123 		case ')':
124 			ch=-1;
125 			return W_RPAREN;
126 		case '[':
127 			ch=-1;
128 			return W_LBRACK;
129 		case ']':
130 			ch=-1;
131 			return W_RBRACK;
132 		case ',':
133 			ch=-1;
134 			return W_COMMA;
135 		case '\"':
136 			i=0;
137 			state=1;
138 			break;
139 		case '#':
140 			state=4;
141 			break;
142 		}
143 		if ( isalpha(ch) || ch=='.' || ch=='_' )
144 		{
145 			i=1;
146 			str[0]=tolower(ch);
147 			state=2;
148 			start=column;
149 		}
150 		if (ch>='0' && ch<='9')
151 		{
152 			i=1;
153 			str[0]=ch;
154 			start=column;
155 			state=5;
156 		}
157 		if ( state==0 )
158 		{
159 #ifdef DEBUG
160 			fprintf(stderr,"Character %c ascii(%d)\n",ch,ch);
161 #endif
162 			parse_error("Invalid character");
163 		}
164 		break;
165 
166 	/* This state reads a phrase string */
167 	case 1:
168 		if (ch==0 || ch=='\n')
169 			parse_error("Unterminated phrase string");
170 		if ( ch<' ' )
171 			parse_error("Unprintable character in phrase");
172 		if (ch=='\"')
173 		{
174 			if ( i==0 )
175 				parse_error("Empty phrase");
176 			*phrase=init_string(i,str);
177 			state=0;
178 			ch=-1;
179 			return W_PHRASE;
180 		}
181 		if ( i==MAX_PHRASE_LENGTH )
182 		{
183 			fprintf(stderr,"%s ( > %d ) %s %d\n%s\n%s\n",
184 	"Phrase length exceeds",MAX_PHRASE_LENGTH,"on line",line,
185 	"Either you have forgotten the quote  at the end of phrase or",
186 	"you have to recompile with greater MAX_PHRASE_LENGTH.");
187 			exit(2);
188 		}
189 		if (ch=='\\' ) state=3;
190 		if ( state==1 ) str[i++]=ch;
191 		break;
192 
193 	/* This state reads a reserved word (operator) */
194 	case 2:
195 		if ( ch==0 || ( !isalpha(ch) && ch!='.' && ch!='_' ) )
196 		{
197 			state=0;
198 			str[i]=0;
199 			for(j=0;j<R_WORDS;j++)
200 				if (strcmp(r_word_str[j],str)==0) return j;
201 			parse_error("Unknown word");
202 		}
203 		if (i==MAX_R_WORD)
204 			parse_error("Unknown word");
205 		str[i++]=tolower(ch);
206 		break;
207 
208 	/* This state handles escape sequences */
209 	case 3:
210 		if ( ch==0 ) parse_error("Unterminated phrase");
211 		switch (ch) {
212 		case 't': ch='\t';break;
213 		case 'n': ch='\n';break;
214 		case 'r': ch='\r';break;
215 		case 'f': ch='\f';break;
216 		case 'b': ch='\b';break;
217 		case '\\': ch='\\';break;
218 		case '\"': ch='\"';break;
219 		case '0':
220 		case '1':
221 		case '2':
222 		case '3':
223 			/* Octal number requires new state */
224 			state=6;
225 			onum=64*(ch-'0');
226 			break;
227 		default:
228 			errcol=column;
229 			errind=expr_ind;
230 			parse_error("Unknown escape sequence");
231 			break;
232 		}
233 		if (state==3) /* always true except for octal number */
234 		{
235 			str[i++]=ch;
236 			state=1;
237 		}
238 		break;
239 	/* This state skips comments */
240 	case 4:
241 		if (ch==0)
242 		{
243 			state=0;
244 			return W_END;
245 		}
246 		if (ch=='\n') state=0;
247 		break;
248 	/* We read a number */
249 	case 5:
250 		if ( ch<'0'|| ch>'9' )
251 		{
252 			state=0;
253 			str[i]=0;
254 			*phrase=init_string(i,str);
255 			return W_NUMBER;
256 		}
257 		if (i==9)
258 			parse_error("Too big number");
259 		str[i++]=ch;
260 		break;
261 	/* Read octal number #2 */
262 	case 6:
263 		if ( ch<'0' || ch>'7')
264 		{
265 			errcol=column;
266 			errind=expr_ind;
267 			parse_error("Octal number expected");
268 		}
269 		onum+=8*(ch-'0');
270 		state=7;
271 		break;
272 	/* Read octal number #3 */
273 	case 7:
274 		if ( ch<'0' || ch>'7')
275 		{
276 			errcol=column;
277 			errind=expr_ind;
278 			parse_error("Octal number expected");
279 		}
280 		onum+=ch-'0';
281 		if (onum==0)
282 		{
283                         errcol=column-3;
284                         errind=expr_ind-3;
285 			parse_error("\\000 in phrase does'nt work (yet)\n");
286 		}
287 		str[i++]=onum;
288 		state=1;
289 		break;
290 	}
291 
292 	ch=get_next_char();
293 	} while (TRUE);
294 
295 }
296 
297 /*
298  * Shows a given error message & where it occurred
299  */
parse_error(char * error)300 void parse_error(char *error)
301 {
302 	char erlin[ELLENGTH+1];
303 	int i;
304 
305 	if (errcol-ELLENGTH+SHOWR>0)
306 		errind-=ELLENGTH-SHOWR;
307 	else errind-=errcol;
308 	for(i=0;i<ELLENGTH && expr_str[i+errind] && expr_str[i+errind]!='\n';i++)
309 		erlin[i]=expr_str[i+errind];
310 	erlin[i]=0;
311 	fprintf(stderr,"Parse error on line %d column %d :\n\t%s\n%s\n",
312 		line,errcol,error,erlin);
313 	if ( errcol>ELLENGTH-SHOWR ) errcol=ELLENGTH-SHOWR;
314         for (i=0;i<errcol-1;i++) fprintf(stderr," ");
315         fprintf(stderr,"^\n");
316         exit(2);
317 }
318 
319 /*
320  * Creates and initializez an operator node to parse tree
321  */
create_tree_node(int oper)322 struct TREE_NODE *create_tree_node(int oper)
323 {
324 	struct TREE_NODE *n;
325 
326 	n=(struct TREE_NODE *)e_malloc(sizeof(struct TREE_NODE));
327 	n->left=NULL;
328 	n->right=NULL;
329 	n->parent=NULL;
330 	n->oper=oper;
331 	n->number=-1;
332 	n->leaf=NULL;
333 	n->label_left=LABEL_NOTKNOWN;
334 	n->label_right=LABEL_NOTKNOWN;
335 	n->refcount=0;
336 	n->GC_list=NULL;
337 	return n;
338 }
339 
340 /*
341  * Creates and initializes a leaf node to parse tree
342  */
create_leaf_node(int oper,string * phrase,int phrase_label)343 struct TREE_NODE * create_leaf_node(int oper, string *phrase, int phrase_label)
344 {
345 	struct TREE_NODE *n;
346 
347 	/* Let's create a leaf node */
348 	n=create_tree_node(oper);
349 	n->label_left=phrase_label;
350 	/* Let's create a phrase node for the phrase */
351 	n->leaf=(struct PHRASE_NODE *)
352 		e_malloc(sizeof (struct PHRASE_NODE));
353 	n->leaf->phrase=phrase;
354 	return n;
355 }
356 
357 /*
358  * Here starts recursive parser
359  */
360 
361 /* production basic_expr->constant_list */
cons_list(int * next,string ** phrase,struct GC_LIST * c_list)362 void cons_list(int *next, string **phrase, struct GC_LIST *c_list)
363 {
364 	int s,e,ps,pe;
365 	char *cons_err="invalid constant region list";
366 
367 	stats.constant_lists++;
368 
369 	ps=-1;pe=-1;
370 	while (*next!=W_RBRACK) /* We can fall out here immediately,
371 				   which means we have empty list */
372 	{
373 		if (*next!=W_LPAREN)
374 			parse_error(cons_err);
375 		*next=get_next(phrase);
376 		if (*next!=W_NUMBER)
377 			parse_error(cons_err);
378 		s=atoi((char *)(**phrase).s);
379 		*next=get_next(phrase);
380 		if (*next!=W_COMMA)
381 			parse_error(cons_err);
382 		*next=get_next(phrase);
383 		if (*next!=W_NUMBER)
384 			parse_error(cons_err);
385 		e=atoi((char *)(**phrase).s);
386 		*next=get_next(phrase);
387 		if (*next!=W_RPAREN)
388 			parse_error(cons_err);
389 		if (e<s)
390 			parse_error("region end point must be greater than start point");
391 		*next=get_next(phrase);
392 		if (s<ps || (s==ps && e<=pe) )
393 			parse_error("constant gc list must be sorted");
394 		if (e<=pe || s==ps)
395 		{
396 			/* nesting detected */
397 			c_list->nested=TRUE;
398 		}
399 		add_region(c_list,s,e);
400 		ps=s;pe=e;
401 	}
402 }
403 
404 /* Which kind of basic expressions */
basic_expr(int * next,string ** phrase)405 struct TREE_NODE *basic_expr(int *next, string **phrase)
406 {
407 	struct TREE_NODE *n;
408 
409 	switch (*next) {
410 	case W_PHRASE:
411 		/* A phrase was found */
412 		n=create_leaf_node(PHRASE,*phrase,LABEL_PHRASE);
413 		/* Let's put the new phrase node to phrase list */
414 		n->leaf->next=first_phrase;
415 		first_phrase=n->leaf;
416 		*next=get_next(phrase);
417 		return n;
418 	case W_START:
419 		/* reserved word 'start' was found */
420 		n=create_leaf_node(PHRASE,NULL,LABEL_START);
421 		/* we use already created list (see create_constant_lists() )*/
422 		n->leaf->GC_list=start_list;
423 		*next=get_next(phrase);
424 		return n;
425 	case W_LAST:
426 		/* reserved word 'end' was found */
427 		n=create_leaf_node(PHRASE,NULL,LABEL_END);
428 		/* we use already created list (see create_constant_lists() )*/
429 		n->leaf->GC_list=end_list;
430 		*next=get_next(phrase);
431 		return n;
432 	case W_LBRACK:
433 		/* We got a constant region list */
434 		n=create_leaf_node(PHRASE,NULL,LABEL_CONS);
435 		n->leaf->GC_list=new_gclist();
436 		*next=get_next(phrase);
437 		cons_list(next,phrase,n->leaf->GC_list);
438 		*next=get_next(phrase);
439 		return n;
440 	case W_CHARS:
441 		/* reserved wors 'chars' was found */
442 		n=create_leaf_node(PHRASE,NULL,LABEL_CHARS);
443 		/* we use already created list (see create_constant_lists() )*/
444 		n->leaf->GC_list=chars_list;
445 		*next=get_next(phrase);
446 		return n;
447 	case W_LPAREN:
448 		*next=get_next(phrase);
449 		n=reg_expr(next,phrase);
450 		if (*next!=W_RPAREN)
451 			parse_error(") expected");
452 		*next=get_next(phrase);
453 		return n;
454 	case W_OUTER:
455 	case W_INNER:
456 	case W_CONCAT:
457 		/* So we have function. Let's create a tree node for it */
458 		n=create_tree_node( (*next==W_OUTER) ? OUTER:
459 			((*next==W_INNER) ? INNER:CONCAT) );
460 		/* Let's parse the parameter */
461 		*next=get_next(phrase);
462 		if (*next!=W_LPAREN)
463 			parse_error("( expected");
464 		*next=get_next(phrase);
465 		n->left=reg_expr(next,phrase);
466 		n->right=NULL; /* Function has only one parameter */
467 		if (*next!=W_RPAREN)
468 			parse_error(") expected");
469 		*next=get_next(phrase);
470 		return n;
471 	case W_JOIN:
472 		/* So we have join function. Let's create a tree node for it */
473 		n=create_tree_node(JOIN);
474 		/* Let's parse the parameters */
475 		*next=get_next(phrase);
476 		if (*next!=W_LPAREN)
477 			parse_error("( expected");
478 		*next=get_next(phrase);
479 		if (*next!=W_NUMBER)
480 			parse_error("join needs a number: join(NUMBER,expression)");
481 		n->number=atoi((char *)(**phrase).s);
482 		if (n->number<1)
483 			parse_error("join number must be >= 1");
484 		*next=get_next(phrase);
485 		if (*next!=W_COMMA)
486 			parse_error("join expected comma: join(NUMBER,expression)");
487 		*next=get_next(phrase);
488 		n->left=reg_expr(next,phrase);
489 		n->right=NULL;
490 		if (*next!=W_RPAREN)
491 			parse_error(") expected");
492 		*next=get_next(phrase);
493 		return n;
494 	default:
495 		parse_error("Basic expression expected\n");
496 	}
497 	/* Dummy return for keeping c++ compilers quiet */
498 	return NULL;
499 }
500 
501 /* Which kind region_expression */
reg_expr(int * next,string ** phrase)502 struct TREE_NODE *reg_expr(int *next, string **phrase)
503 {
504 	struct TREE_NODE *left;
505 	if (*next==W_END)
506 		parse_error("Unexpected end of expression");
507 
508 	left=basic_expr(next,phrase);
509 	if ( *next==W_END || *next==W_RPAREN ) return left;
510 	return oper_expr(next,phrase,left);
511 }
512 
513 /* Which kind of operator expression */
oper_expr(int * next,string ** phrase,struct TREE_NODE * left)514 struct TREE_NODE *oper_expr(int *next, string **phrase,struct TREE_NODE *left)
515 {
516 	struct TREE_NODE *o=NULL;
517 
518 	switch (*next)
519 	{
520 	case W_IN:
521 		o=create_tree_node(IN);
522 		break;
523 	case W_CONTAINING:
524 		o=create_tree_node(CONTAINING);
525 		break;
526 /* PK Febr 95 */
527 	case W_EQUAL:
528 		o=create_tree_node(EQUAL);
529 		break;
530 /* PK Febr 95 */
531 	case W_OR:
532 		o=create_tree_node(OR);
533 		break;
534 	case W_ORDERED:
535 		o=create_tree_node(ORDERED);
536 		break;
537 	case W_L_ORDERED:
538 		o=create_tree_node(L_ORDERED);
539 		break;
540 	case W_R_ORDERED:
541 		o=create_tree_node(R_ORDERED);
542 		break;
543 	case W_LR_ORDERED:
544 		o=create_tree_node(LR_ORDERED);
545 		break;
546 	case W_EXTRACTING:
547 		o=create_tree_node(EXTRACTING);
548 		break;
549 	case W_QUOTE:
550 		o=create_tree_node(QUOTE);
551 		break;
552 	case W_R_QUOTE:
553 		o=create_tree_node(R_QUOTE);
554 		break;
555 	case W_L_QUOTE:
556 		o=create_tree_node(L_QUOTE);
557 		break;
558 	case W_LR_QUOTE:
559 		o=create_tree_node(LR_QUOTE);
560 		break;
561 	case W_NOT:
562 		*next=get_next(phrase);
563 		if (*next==W_CONTAINING) o=create_tree_node(NOT_CONTAINING);
564 		else if (*next==W_IN) o=create_tree_node(NOT_IN);
565 		else if (*next==W_EQUAL) o=create_tree_node(NOT_EQUAL);
566 		else parse_error("'not' must be followed by 'in', 'containing' or 'equal'");
567 		break;
568 	default:
569 		parse_error("Operator expected");
570 	}
571 	*next=get_next(phrase);
572 	o->right=basic_expr(next,phrase);
573 	o->left=left;
574 	if (*next==W_END || *next==W_RPAREN) return o;
575 	return oper_expr(next,phrase,o);
576 }
577 
578 /* End of recursive parser */
579 
580 /*
581  * Parses the given command string. Returns root node of the parse tree.
582  * phrase_list returns pointer to phrase list
583  */
parse_string(const char * str,struct PHRASE_NODE ** phrase_list)584 struct TREE_NODE *parse_string(const char *str,
585 	struct PHRASE_NODE **phrase_list)
586 {
587 	string *phrase;
588 	int next;
589 	struct TREE_NODE *root;
590 
591 	line=1;
592 	column=0;
593 
594 	expr_str=(char *) str;
595 
596 	next=get_next(&phrase);
597 	root=reg_expr(&next,&phrase);
598 
599 	if (next==W_RPAREN)
600 	{
601 		parse_error("Too many )'s");
602 	}
603 
604 	if (next!=W_END)
605 	{
606 		fprintf(stderr,"Premature end of parsing. ( This shouldn't happen!! )\n");
607 		exit(3);
608 	}
609 
610 #ifdef DEBUG
611 	print_tree(root);
612 	fprintf(stderr,"\n");
613 #endif
614 	*phrase_list=first_phrase;
615 	return root;
616 }
617 
618 #ifdef DEBUG
print_tree(struct TREE_NODE * root)619 void print_tree(struct TREE_NODE *root)
620 {
621 	if ( root==NULL )
622 	{
623 		fprintf(stderr,"\nprint_tree: got NULL node\n");
624 		exit(3);
625 	}
626 	if ( root->oper==PHRASE )
627 	{
628 		fprintf(stderr," \"%s\"",root->leaf->phrase->s);
629 		return;
630 	}
631 	if (root->oper<0 || root->oper>R_WORDS)
632 	{
633 		printf("\nprint tree: got invalid oper (%d)\n",root->oper);
634 		exit(3);
635 	}
636 
637 	if (root->right!=NULL)
638 	{
639 		fprintf(stderr," (");
640 		print_tree(root->left);
641 
642 		fprintf(stderr," %s",r_word_str[root->oper]);
643 
644 		print_tree(root->right);
645 		fprintf(stderr," )");
646 	} else
647 	{
648 		fprintf(stderr," %s(",r_word_str[root->oper]);
649 		print_tree(root->left);
650 		fprintf(stderr," )");
651 	}
652 }
653 #endif
654