1 //	crm_preprocessor.c  - statement preprocessor utilities
2 
3 // Copyright 2001-2009 William S. Yerazunis.
4 // This file is under GPLv3, as described in COPYING.
5 
6 //  include some standard files
7 #include "crm114_sysincludes.h"
8 
9 //  include any local crm114 configuration file
10 #include "crm114_config.h"
11 
12 //  include the crm114 data structures file
13 #include "crm114_structs.h"
14 
15 //  and include the routine declarations file
16 #include "crm114.h"
17 
18 //
19 //       the actual textual representations of the flags, with their values
20 //     DON'T FORGET TO ALSO MODIFY THIS IN crm114_structs.h !!
21 
//   Table pairing each flag keyword with the value that crm_flagparse()
//   ORs into its result when that keyword is matched.  Some keywords
//   are synonyms: they carry the same value as the entry next to them.
FLAG_DEF crm_flags[46] =
  {
    {"fromstart", CRM_FROMSTART},
    {"fromnext", CRM_FROMNEXT},
    {"fromend", CRM_FROMEND},
    {"newend", CRM_NEWEND},
    {"fromcurrent", CRM_FROMCURRENT},
    {"nocase", CRM_NOCASE},
    {"absent", CRM_ABSENT},
    {"basic", CRM_BASIC},
    {"backwards", CRM_BACKWARDS},
    {"literal", CRM_LITERAL},
    //   "nomultiline" and "byline" share one value
    {"nomultiline", CRM_BYLINE},
    {"byline", CRM_BYLINE},
    //   "bychar" and "string" share one value
    {"bychar", CRM_BYCHAR},
    {"string", CRM_BYCHAR},
    {"bychunk", CRM_BYCHUNK},
    {"byeof", CRM_BYEOF},
    {"eofaccepts", CRM_EOFACCEPTS},
    {"eofretry", CRM_EOFRETRY},
    {"append", CRM_APPEND},
    {"keep", CRM_KEEP},
    {"async", CRM_ASYNC},
    {"refute", CRM_REFUTE},
    {"microgroom", CRM_MICROGROOM},
    //   "markovian" and "markov" share one value
    {"markovian", CRM_MARKOVIAN},
    {"markov", CRM_MARKOVIAN},
    {"osb", CRM_OSB_BAYES},
    {"correlate", CRM_CORRELATE},
    {"winnow", CRM_OSB_WINNOW},
    {"unique", CRM_UNIQUE},
    {"chi2", CRM_CHI2},
    //   "entropy" and "entropic" share one value
    {"entropy", CRM_ENTROPY},
    {"entropic", CRM_ENTROPY},
    {"osbf", CRM_OSBF },
    {"hyperspace", CRM_HYPERSPACE},
    {"unigram", CRM_UNIGRAM},
    {"crosslink", CRM_CROSSLINK},
    {"default", CRM_DEFAULT},
    {"lineedit", CRM_READLINE},
    {"sks", CRM_SKS},
    {"svm", CRM_SVM},
    {"fscm", CRM_FSCM},
    {"neural", CRM_NEURAL_NET},
    {"erase", CRM_ERASE},
    {"pca", CRM_PCA},
    //   two empty filler entries (indices 44 and 45); they lie beyond
    //   CRM_MAXFLAGS, so crm_flagparse()'s scan never reaches them
    {"", 0},
    {"", 0}
  };

//   Index of the last real entry in crm_flags[] ("pca").  crm_flagparse()
//   scans indices 0 through CRM_MAXFLAGS inclusive, so this must be
//   raised whenever an entry is added ahead of the fillers.
#define CRM_MAXFLAGS 43
73 
74 
75 
76 
77 //    The magic flag parser.  Given a string of input, return the
78 //    codes that were found as the (long int) return value.  If an
79 //    unrecognized code is found, squalk an error (whether it is fatal
80 //    or not is another issue)
81 //
82 //    Note that since flags (like variables) are always ASCII, we don't
83 //    need to worry about 8-bit-safety.
84 //
crm_flagparse(char * input,long inlen)85 unsigned long long crm_flagparse (char *input, long inlen)  //  the user input
86 {
87   char flagtext [MAX_PATTERN];
88   char *remtext;
89   long remlen;
90   char *wtext;
91   long flagsearch_start_here;
92   long wstart;
93   long wlen;
94   unsigned long long outcode;
95 
96   int done;
97   int i;
98   int j;
99   int k;
100   int recog_flag;
101 
102   outcode = 0;
103 
104   memmove (flagtext, input, inlen);
105   flagtext[inlen] = '\000';
106 
107   if (internal_trace)
108     fprintf (stderr, "Flag string: %s\n", flagtext);
109 
110   //  now loop on thru the nextwords,
111   remtext = flagtext;
112   done = 0;
113   remlen = inlen;
114   wstart = 0;
115   wlen = 0;
116   flagsearch_start_here = 0;
117   while (!done && remlen > 0)
118     {
119       i=crm_nextword (remtext, remlen, flagsearch_start_here, &wstart, &wlen);
120       flagsearch_start_here = wstart + wlen + 1;
121       if (wlen > 0)
122 	{
123 	  //    We got a word, so aim wtext at it
124 	  wtext = &(remtext[wstart]);
125 	  if (internal_trace)
126 	    {
127 	      fprintf (stderr, "found flag, len %ld: ", wlen) ;
128 	      for (j = 0; j < wlen; j++) fprintf (stderr, "%c", wtext[j]);
129 	      fprintf (stderr, "\n");
130 	    };
131 
132 	  //    find sch in our table, squalk a nonfatal/fatal if necessary.
133 	  recog_flag = 0;
134 	  for (j = 0; j <= CRM_MAXFLAGS; j++)
135 	    {
136 	      // fprintf (stderr, " Trying %s (%ld) \n", crm_flags[j].string, crm_flags[j].value );
137 	      k = strlen (crm_flags[j].string);
138 	      if (k == wlen
139 		  && 0 == strncasecmp (wtext, crm_flags[j].string, k))
140 		{
141 		  //    mark this flag as valid so we don't squalk an error
142 		  recog_flag = 1;
143 		  //     and OR this into our outcode
144 		  outcode = outcode | crm_flags[j].value;
145 		  if (user_trace)
146 		    {
147 		      fprintf (stderr, "Mode #%d, '%s' turned on. \n",
148 			       j,
149 			       crm_flags[j].string);
150 		    };
151 		};
152 	    };
153 
154 	  //   check to see if we need to squalk an error condition
155 	  if (recog_flag == 0)
156 	    {
157 	      long q;
158 	      char foo[1024];
159 	      strncpy (foo, wtext, 128);
160 	      foo[wlen] = '\000';
161 	      q = nonfatalerror5 ("Darn...  unrecognized flag :",
162 				  foo, CRM_ENGINE_HERE);
163 	    };
164 
165 
166 	  //  and finally,  move sch up to point at the remaining string
167 	  if (remlen <= 0) done = 1;
168 	}
169       else
170 	done = 1;
171     };
172 
173   if (internal_trace )
174     fprintf (stderr, "Flag code is : %llx\n", outcode);
175 
176   return (outcode);
177 }
178 
//     Get the next word in a counted string.  A "word" is a maximal
//     run of chars that compare greater than 0x20 (the ASCII space);
//     every char comparing less than or equal to 0x20 is a separator.
//
//       input     - the text to search; it need not be NUL-terminated
//       inlen     - length of the text; nothing at or beyond
//                   input[inlen] is examined
//       starthere - index at which the search begins
//       start     - out: index of the first char of the word
//       len       - out: length of the word, 0 when there is none
//
//     The returned value is 1 if a word was found and 0 if not.  When
//     no word is found *start is left where the search stopped.  To
//     step through an arglist, pass the returned *start plus the
//     returned *len as the next call's starthere.
//
//     NOTE: the comparisons are made on plain char, so where char is a
//     signed type a byte with its high bit set compares below 0x20 and
//     acts as a separator.
//
long crm_nextword ( char *input,
		    long inlen,
		    long starthere,
		    long *start,
		    long *len)
{
  long first;     //  index of the first char of the candidate word
  long span;      //  number of chars counted into the word so far

  *len = 0;

  //   step over any leading separators
  first = starthere;
  while (first < inlen)
    {
      if (input [first] > 0x20)
	break;
      first++;
    }
  *start = first;

  //   nothing but separators all the way to the end?  Then no word.
  if (first == inlen)
    return (0);

  //   count word chars up to the next separator or the end of the text
  for (span = 0; first + span < inlen; span++)
    if (input [first + span] <= 0x20)
      break;
  *len = span;

  return (span > 0);
}
211 
212 
213 
214 //
215 //    experimental code for a statement-type-sensitive parser.
216 //   Not in use yet... but someday... goal is to provide better error
217 //   detection.
218 
crm_profiled_statement_parse(char * in,long slen,ARGPARSE_BLOCK * apb,long amin,long amax,long pmin,long pmax,long bmin,long bmax,long smin,long smax)219 int crm_profiled_statement_parse ( char *in,
220 				   long slen,
221 				   ARGPARSE_BLOCK *apb,
222 				   long amin, long amax,
223 				   long pmin, long pmax,
224 				   long bmin, long bmax,
225 				   long smin, long smax)
226 {
227   return (0);
228 }
229 
230 //      parse a CRM114 statement; this is mostly a setup routine for
231 //     the generic parser.
232 
crm_statement_parse(char * in,long slen,ARGPARSE_BLOCK * apb)233 int crm_statement_parse ( char *in,
234 			  long slen,
235 			  ARGPARSE_BLOCK *apb)
236 {
237 #define CRM_STATEMENT_PARSE_MAXARG 10
238   int i,  k;
239 
240   long ftype[CRM_STATEMENT_PARSE_MAXARG];
241   long fstart[CRM_STATEMENT_PARSE_MAXARG];
242   long flen [CRM_STATEMENT_PARSE_MAXARG];
243 
244   //     we call the generic parser with the right args to slice and
245   //     dice the incoming statement into declension-delimited parts
246   k = crm_generic_parse_line ( in,
247 			       slen,
248 			       "<([/",
249 			       ">)]/",
250 			       "\\\\\\\\",      // this is four backslashes
251 			       CRM_STATEMENT_PARSE_MAXARG,
252 			       ftype,
253 			       fstart,
254 			       flen);
255 
256   //   now we have all these nice chunks... we split them up into the
257   //   various allowed categories.
258 
259 
260   //   start out with empties on each possible chunk
261   apb->a1start = NULL; apb->a1len = 0;
262   apb->p1start = NULL; apb->p1len = 0;
263   apb->p2start = NULL; apb->p2len = 0;
264   apb->p3start = NULL; apb->p3len = 0;
265   apb->b1start = NULL; apb->b1len = 0;
266   apb->s1start = NULL; apb->s1len = 0;
267   apb->s2start = NULL; apb->s2len = 0;
268 
269   //   Scan through the incoming chunks
270   for (i = 0; i < k; i++)
271     {
272       switch (ftype[i])
273        	{
274 	case CRM_ANGLES:
275 	  {
276 	    //  Grab the angles, if we don't have one already
277 	    if (apb->a1start == NULL)
278 	      {
279 		apb->a1start = &in[fstart[i]];
280 		apb->a1len = flen [i];
281 	      }
282 	    else nonfatalerror5
283 		   ("There are multiple flag sets on this line.",
284 		    " ignoring all but the first", CRM_ENGINE_HERE);
285 	  }
286 	  break;
287 	case CRM_PARENS:
288 	  {
289 	    //  grab a set of parens, cascading till we find an one
290 	    if (apb->p1start == NULL)
291 	      {
292 		apb->p1start = &in[fstart[i]];
293 		apb->p1len = flen [i];
294 	      }
295 	    else
296 	      if (apb->p2start == NULL)
297 		{
298 		  apb->p2start = &in[fstart[i]];
299 		  apb->p2len = flen [i];
300 		}
301 	      else
302 		if (apb->p3start == NULL)
303 		  {
304 		    apb->p3start = &in[fstart[i]];
305 		    apb->p3len = flen [i];
306 		  }
307 		else
308 		  nonfatalerror5
309 		    ("Too many parenthesized varlists.",
310 		     "ignoring the excess varlists.", CRM_ENGINE_HERE);
311 	  }
312 	  break;
313 	case CRM_BOXES:
314 	  {
315 	    //  Grab the angles, if we don't have one already
316 	    if (apb->b1start == NULL)
317 	      {
318 		apb->b1start = &in[fstart[i]];
319 		apb->b1len = flen [i];
320 	      }
321 	    else nonfatalerror5
322 		   ("There are multiple domain limits on this line.",
323 		    " ignoring all but the first", CRM_ENGINE_HERE);
324 	  }
325 	  break;
326 	case CRM_SLASHES:
327 	  {
328 	    //  grab a set of parens, cascading till we find an one
329 	    if (apb->s1start == NULL)
330 	      {
331 		apb->s1start = &in[fstart[i]];
332 		apb->s1len = flen [i];
333 	      }
334 	    else
335 	      if (apb->s2start == NULL)
336 		{
337 		  apb->s2start = &in[fstart[i]];
338 		  apb->s2len = flen [i];
339 		}
340 	      else
341 		nonfatalerror5 (
342 		       "There are too many regex sets in this statement,",
343 		       " ignoring all but the first.", CRM_ENGINE_HERE);
344 	  }
345 	  break;
346 	default:
347 	  fatalerror5( "Declensional parser returned an undefined typecode!",
348 		       "What the HECK did you do to cause this?",
349 		       CRM_ENGINE_HERE);
350 	};
351     }
352   return (k);    // return value is how many declensional arguments we found.
353 };
354 
355 
356 //     The new and improved line core parser routine.  Instead of
357 //     being totally ad hoc, this new parser actually retains context
358 //     durng the parse.
359 //
360 //     this hopefully will keep the parser from getting confused by [] in
361 //     the slash matching and other such abominations.
362 //
363 //     (one way to view this style of parsing is that each arg in a
364 //     CRM114 statement is "declined" by it's delimiters to determine
365 //     what role this variable is to play in the statement.  Kinda like
366 //     Latin - to a major extent, you can mix the parts around and it
367 //     won't make any difference.
368 
crm_generic_parse_line(char * txt,long len,char * schars,char * fchars,char * echars,long maxargs,long * ftype,long * fstart,long * flen)369 int crm_generic_parse_line (
370 		    char *txt,       //   the start of the program line
371 		    long len,        //   how long is the line
372 		    char *schars,    //   characters that can "start" an arg
373 		    char *fchars,    //   characters that "finish" an arg
374 		    char *echars,    //   characters that escape in an arg
375 		    long maxargs,    //   howm many things to search for (max)
376 		    long *ftype,     //   type of thing found (index by schars)
377 		    long *fstart,    //   starting location of found arg
378 		    long *flen       //   length of found arg
379 		    )
380 {
381   //    the general algorithm here is to move along the input line,
382   //    looking for one of the characters in schars.  When we find it,
383   //    we lock onto that and commit to finding an arg of that type.
384   //    We then start scanning ahead keeping count of schars minus echars.
385   //    when the count hits zero, it's end for that arg and we move onward
386   //    to the next arg, with the same procedure.
387   //
388   //    note that when we are scanning for a new arg, we are open to args
389   //    of any type (as defined by the members of schars, while in an arg
390   //    we are looking only for the unescaped outstanding echar and are blind
391   //    to everything else.
392   //
393   //    when not in an arg, we do not have any escape character active.
394   //
395   //     We return the number of args found
396 
397   long chidx;
398   char curchar;
399   long argc;
400   long i;
401   long itype;
402   long depth;
403 
404   //    zeroize the outputs to start...
405   for (i = 0; i < maxargs; i++)
406     {
407       ftype[i] = -1;
408       fstart[i] = 0;
409       flen[i] = 0;
410     };
411 
412 
413   //    scan forward, looking for any member of schars
414 
415   depth = 0;
416   chidx = -1;
417   argc = 0;
418   itype = -1;
419 
420   if (internal_trace)
421     {
422       fprintf (stderr, " declensional parsing for %ld chars on: ", len);
423       for (i = 0; i < len; i++)
424 	fprintf (stderr, "%c", txt[i]);
425       fprintf (stderr, "\n");
426     }
427 
428   while (chidx < len  &&  argc <= maxargs)
429     {
430       chidx++;
431       curchar = txt[chidx];
432       if (itype == -1)     // are we looking for an argstart char?
433 	{
434 	  //    is curchar one of the start chars?  (this is 8-bit-safe,
435 	  //     because schars is always normal ASCII)
436 	  for (i = 0; i < strlen (schars); i++)
437 	    if (curchar == schars[i])
438 	      {
439 		if (internal_trace)
440 		  fprintf (stderr, "   found opener %c at %ld,",curchar,chidx);
441 		itype = i;
442 		fstart[argc] = chidx + 1;
443 		ftype [argc] = itype;
444 		depth = 1;
445 	      };
446 	  //  if it wasn't a start-character for an arg, we are done.
447 	}
448       else    // nope, we're in an arg, so we check for unescaped schar
449 	     // and fchar characers
450 	{
451 	  //  if (curchar == fchars [itype] && txt[chidx-1] != echars[itype])
452           if (curchar == fchars [itype]
453 	      && (txt[chidx-1] != echars[itype]
454 		  || txt[chidx-1] == txt[chidx-2]))
455 	    {
456 	      depth--;
457 	      if (depth == 0)
458 		{
459 		  //   we've found the end of the text arg.  Close it off and
460 		  //   note it into the output vectors
461 		  flen [argc] = chidx - fstart[argc] ;
462 		  if (internal_trace)
463 		    {
464 		      int q;
465 		      fprintf (stderr, " close %c at %ld --", curchar, chidx);
466 		      for (q = fstart[argc]; q < fstart[argc]+flen[argc]; q++)
467 			fprintf (stderr, "%c", txt[q]);
468 		      fprintf (stderr, "-- len %ld\n", flen[argc]);
469 		    };
470 		  itype = -1;
471 		  argc++;
472 		};
473 	    }
474 	  else
475 	    //if (curchar == schars [itype] && txt[chidx-1] != echars[itype])
476 	    if (curchar == schars [itype]
477 		&& (txt[chidx-1] != echars[itype]
478 		    || txt[chidx-1] == txt[chidx-2]))
479 	      {
480 		depth++;
481 	      };
482 	};
483       //    if we weren't a schar or an unexcaped echar, we're done!
484     };
485   if (depth != 0)
486     {
487       char errstmt[MAX_PATTERN];
488       flen[argc] = chidx - fstart[argc];
489       //
490       //   GROT GROT GROT Somehow, sometimes we get flen[argc] < 0.   It's
491       //   always with buggy userprograms, but we shouldn't need this anyway.
492       //   So, until we find out what _we_ are doing wrong, leave the check
493       //   for flen[argc] < 0 in here.
494       //
495       if (flen[argc] < 0) flen[argc] = 0;
496       strncpy ( errstmt, &txt[fstart[argc]],
497 		flen[argc] );
498       nonfatalerror5 (" This operand doesn't seem to end.  Bug?  \n -->  ",
499 		      errstmt, CRM_ENGINE_HERE);
500       argc++;
501     };
502   return (argc);
503 }
504 
//    and to avoid all the mumbo-jumbo, an easy way to get a copy of
//    an arg found by the declensional parser, as a NUL-terminated
//    string in a buffer supplied by the caller.
//
//      to      - destination buffer; if NULL, nothing is done
//      tolen   - size of the destination in chars, terminator
//                included; if it is <= 0 there is no room for even
//                the terminator and nothing is written
//      from    - start of the arg text, or NULL for "no such arg",
//                which yields an empty string
//      fromlen - length of the arg text; a negative length is
//                treated as zero
//
//    Text that does not fit in tolen-1 chars is silently cut short.
void crm_get_pgm_arg (char *to, long tolen, char *from, long fromlen)
{
  long len;

  if (to == NULL || tolen <= 0)
    return;

  len = 0;
  if (from != NULL && fromlen > 0)
    {
      //   copy as much as fits while leaving room for the terminator
      len = tolen - 1;
      if (len > fromlen ) len = fromlen ;
      memmove (to, from, len);
    }
  to[len] = '\000';
}
526