1 // crm_preprocessor.c - statement preprocessor utilities
2
3 // Copyright 2001-2009 William S. Yerazunis.
4 // This file is under GPLv3, as described in COPYING.
5
6 // include some standard files
7 #include "crm114_sysincludes.h"
8
9 // include any local crm114 configuration file
10 #include "crm114_config.h"
11
12 // include the crm114 data structures file
13 #include "crm114_structs.h"
14
15 // and include the routine declarations file
16 #include "crm114.h"
17
18 //
19 // the actual textual representations of the flags, with their values
20 // DON'T FORGET TO ALSO MODIFY THIS IN crm114_structs.h !!
21
22 FLAG_DEF crm_flags[46] =
23 {
24 {"fromstart", CRM_FROMSTART},
25 {"fromnext", CRM_FROMNEXT},
26 {"fromend", CRM_FROMEND},
27 {"newend", CRM_NEWEND},
28 {"fromcurrent", CRM_FROMCURRENT},
29 {"nocase", CRM_NOCASE},
30 {"absent", CRM_ABSENT},
31 {"basic", CRM_BASIC},
32 {"backwards", CRM_BACKWARDS},
33 {"literal", CRM_LITERAL},
34 {"nomultiline", CRM_BYLINE},
35 {"byline", CRM_BYLINE},
36 {"bychar", CRM_BYCHAR},
37 {"string", CRM_BYCHAR},
38 {"bychunk", CRM_BYCHUNK},
39 {"byeof", CRM_BYEOF},
40 {"eofaccepts", CRM_EOFACCEPTS},
41 {"eofretry", CRM_EOFRETRY},
42 {"append", CRM_APPEND},
43 {"keep", CRM_KEEP},
44 {"async", CRM_ASYNC},
45 {"refute", CRM_REFUTE},
46 {"microgroom", CRM_MICROGROOM},
47 {"markovian", CRM_MARKOVIAN},
48 {"markov", CRM_MARKOVIAN},
49 {"osb", CRM_OSB_BAYES},
50 {"correlate", CRM_CORRELATE},
51 {"winnow", CRM_OSB_WINNOW},
52 {"unique", CRM_UNIQUE},
53 {"chi2", CRM_CHI2},
54 {"entropy", CRM_ENTROPY},
55 {"entropic", CRM_ENTROPY},
56 {"osbf", CRM_OSBF },
57 {"hyperspace", CRM_HYPERSPACE},
58 {"unigram", CRM_UNIGRAM},
59 {"crosslink", CRM_CROSSLINK},
60 {"default", CRM_DEFAULT},
61 {"lineedit", CRM_READLINE},
62 {"sks", CRM_SKS},
63 {"svm", CRM_SVM},
64 {"fscm", CRM_FSCM},
65 {"neural", CRM_NEURAL_NET},
66 {"erase", CRM_ERASE},
67 {"pca", CRM_PCA},
68 {"", 0},
69 {"", 0}
70 };
71
72 #define CRM_MAXFLAGS 43
73
74
75
76
77 // The magic flag parser. Given a string of input, return the
78 // codes that were found as the (long int) return value. If an
79 // unrecognized code is found, squalk an error (whether it is fatal
80 // or not is another issue)
81 //
82 // Note that since flags (like variables) are always ASCII, we don't
83 // need to worry about 8-bit-safety.
84 //
crm_flagparse(char * input,long inlen)85 unsigned long long crm_flagparse (char *input, long inlen) // the user input
86 {
87 char flagtext [MAX_PATTERN];
88 char *remtext;
89 long remlen;
90 char *wtext;
91 long flagsearch_start_here;
92 long wstart;
93 long wlen;
94 unsigned long long outcode;
95
96 int done;
97 int i;
98 int j;
99 int k;
100 int recog_flag;
101
102 outcode = 0;
103
104 memmove (flagtext, input, inlen);
105 flagtext[inlen] = '\000';
106
107 if (internal_trace)
108 fprintf (stderr, "Flag string: %s\n", flagtext);
109
110 // now loop on thru the nextwords,
111 remtext = flagtext;
112 done = 0;
113 remlen = inlen;
114 wstart = 0;
115 wlen = 0;
116 flagsearch_start_here = 0;
117 while (!done && remlen > 0)
118 {
119 i=crm_nextword (remtext, remlen, flagsearch_start_here, &wstart, &wlen);
120 flagsearch_start_here = wstart + wlen + 1;
121 if (wlen > 0)
122 {
123 // We got a word, so aim wtext at it
124 wtext = &(remtext[wstart]);
125 if (internal_trace)
126 {
127 fprintf (stderr, "found flag, len %ld: ", wlen) ;
128 for (j = 0; j < wlen; j++) fprintf (stderr, "%c", wtext[j]);
129 fprintf (stderr, "\n");
130 };
131
132 // find sch in our table, squalk a nonfatal/fatal if necessary.
133 recog_flag = 0;
134 for (j = 0; j <= CRM_MAXFLAGS; j++)
135 {
136 // fprintf (stderr, " Trying %s (%ld) \n", crm_flags[j].string, crm_flags[j].value );
137 k = strlen (crm_flags[j].string);
138 if (k == wlen
139 && 0 == strncasecmp (wtext, crm_flags[j].string, k))
140 {
141 // mark this flag as valid so we don't squalk an error
142 recog_flag = 1;
143 // and OR this into our outcode
144 outcode = outcode | crm_flags[j].value;
145 if (user_trace)
146 {
147 fprintf (stderr, "Mode #%d, '%s' turned on. \n",
148 j,
149 crm_flags[j].string);
150 };
151 };
152 };
153
154 // check to see if we need to squalk an error condition
155 if (recog_flag == 0)
156 {
157 long q;
158 char foo[1024];
159 strncpy (foo, wtext, 128);
160 foo[wlen] = '\000';
161 q = nonfatalerror5 ("Darn... unrecognized flag :",
162 foo, CRM_ENGINE_HERE);
163 };
164
165
166 // and finally, move sch up to point at the remaining string
167 if (remlen <= 0) done = 1;
168 }
169 else
170 done = 1;
171 };
172
173 if (internal_trace )
174 fprintf (stderr, "Flag code is : %llx\n", outcode);
175
176 return (outcode);
177 }
178
//     Get the next word in a string.  A "word" is a continuous span of
//     characters whose (char) value is greater than 0x20, i.e. anything
//     above ASCII space; characters at or below 0x20 separate words.
//
//       input     - the text to search
//       inlen     - number of characters of input that may be examined
//       starthere - index at which the search begins
//       start     - receives the index of the first character of the word
//       len       - receives the length of the word (0 if none)
//
//     To step through an arglist, begin the next search past the word
//     just returned, i.e. at (*start + *len) or beyond.
//
//     The returned value is 0/1 as to whether we found a valid word.
//     When no word is found, *len is 0 and *start is wherever the
//     delimiter skipping stopped.
//
long crm_nextword (char *input,
                   long inlen,
                   long starthere,
                   long *start,
                   long *len)
{
  long pos = starthere;
  long span = 0;

  //   skip over any leading delimiters
  for (; pos < inlen && input[pos] <= 0x20; pos++)
    ;

  *start = pos;
  *len = 0;

  //   ran exactly off the end without finding anything?  No word.
  if (pos == inlen)
    return (0);

  //   measure the run of word characters beginning at pos
  while (pos + span < inlen && input[pos + span] > 0x20)
    span++;

  *len = span;
  return (span > 0);
}
211
212
213
214 //
215 // experimental code for a statement-type-sensitive parser.
216 // Not in use yet... but someday... goal is to provide better error
217 // detection.
218
crm_profiled_statement_parse(char * in,long slen,ARGPARSE_BLOCK * apb,long amin,long amax,long pmin,long pmax,long bmin,long bmax,long smin,long smax)219 int crm_profiled_statement_parse ( char *in,
220 long slen,
221 ARGPARSE_BLOCK *apb,
222 long amin, long amax,
223 long pmin, long pmax,
224 long bmin, long bmax,
225 long smin, long smax)
226 {
227 return (0);
228 }
229
230 // parse a CRM114 statement; this is mostly a setup routine for
231 // the generic parser.
232
crm_statement_parse(char * in,long slen,ARGPARSE_BLOCK * apb)233 int crm_statement_parse ( char *in,
234 long slen,
235 ARGPARSE_BLOCK *apb)
236 {
237 #define CRM_STATEMENT_PARSE_MAXARG 10
238 int i, k;
239
240 long ftype[CRM_STATEMENT_PARSE_MAXARG];
241 long fstart[CRM_STATEMENT_PARSE_MAXARG];
242 long flen [CRM_STATEMENT_PARSE_MAXARG];
243
244 // we call the generic parser with the right args to slice and
245 // dice the incoming statement into declension-delimited parts
246 k = crm_generic_parse_line ( in,
247 slen,
248 "<([/",
249 ">)]/",
250 "\\\\\\\\", // this is four backslashes
251 CRM_STATEMENT_PARSE_MAXARG,
252 ftype,
253 fstart,
254 flen);
255
256 // now we have all these nice chunks... we split them up into the
257 // various allowed categories.
258
259
260 // start out with empties on each possible chunk
261 apb->a1start = NULL; apb->a1len = 0;
262 apb->p1start = NULL; apb->p1len = 0;
263 apb->p2start = NULL; apb->p2len = 0;
264 apb->p3start = NULL; apb->p3len = 0;
265 apb->b1start = NULL; apb->b1len = 0;
266 apb->s1start = NULL; apb->s1len = 0;
267 apb->s2start = NULL; apb->s2len = 0;
268
269 // Scan through the incoming chunks
270 for (i = 0; i < k; i++)
271 {
272 switch (ftype[i])
273 {
274 case CRM_ANGLES:
275 {
276 // Grab the angles, if we don't have one already
277 if (apb->a1start == NULL)
278 {
279 apb->a1start = &in[fstart[i]];
280 apb->a1len = flen [i];
281 }
282 else nonfatalerror5
283 ("There are multiple flag sets on this line.",
284 " ignoring all but the first", CRM_ENGINE_HERE);
285 }
286 break;
287 case CRM_PARENS:
288 {
289 // grab a set of parens, cascading till we find an one
290 if (apb->p1start == NULL)
291 {
292 apb->p1start = &in[fstart[i]];
293 apb->p1len = flen [i];
294 }
295 else
296 if (apb->p2start == NULL)
297 {
298 apb->p2start = &in[fstart[i]];
299 apb->p2len = flen [i];
300 }
301 else
302 if (apb->p3start == NULL)
303 {
304 apb->p3start = &in[fstart[i]];
305 apb->p3len = flen [i];
306 }
307 else
308 nonfatalerror5
309 ("Too many parenthesized varlists.",
310 "ignoring the excess varlists.", CRM_ENGINE_HERE);
311 }
312 break;
313 case CRM_BOXES:
314 {
315 // Grab the angles, if we don't have one already
316 if (apb->b1start == NULL)
317 {
318 apb->b1start = &in[fstart[i]];
319 apb->b1len = flen [i];
320 }
321 else nonfatalerror5
322 ("There are multiple domain limits on this line.",
323 " ignoring all but the first", CRM_ENGINE_HERE);
324 }
325 break;
326 case CRM_SLASHES:
327 {
328 // grab a set of parens, cascading till we find an one
329 if (apb->s1start == NULL)
330 {
331 apb->s1start = &in[fstart[i]];
332 apb->s1len = flen [i];
333 }
334 else
335 if (apb->s2start == NULL)
336 {
337 apb->s2start = &in[fstart[i]];
338 apb->s2len = flen [i];
339 }
340 else
341 nonfatalerror5 (
342 "There are too many regex sets in this statement,",
343 " ignoring all but the first.", CRM_ENGINE_HERE);
344 }
345 break;
346 default:
347 fatalerror5( "Declensional parser returned an undefined typecode!",
348 "What the HECK did you do to cause this?",
349 CRM_ENGINE_HERE);
350 };
351 }
352 return (k); // return value is how many declensional arguments we found.
353 };
354
355
356 // The new and improved line core parser routine. Instead of
357 // being totally ad hoc, this new parser actually retains context
358 // durng the parse.
359 //
360 // this hopefully will keep the parser from getting confused by [] in
361 // the slash matching and other such abominations.
362 //
363 // (one way to view this style of parsing is that each arg in a
364 // CRM114 statement is "declined" by it's delimiters to determine
365 // what role this variable is to play in the statement. Kinda like
366 // Latin - to a major extent, you can mix the parts around and it
367 // won't make any difference.
368
crm_generic_parse_line(char * txt,long len,char * schars,char * fchars,char * echars,long maxargs,long * ftype,long * fstart,long * flen)369 int crm_generic_parse_line (
370 char *txt, // the start of the program line
371 long len, // how long is the line
372 char *schars, // characters that can "start" an arg
373 char *fchars, // characters that "finish" an arg
374 char *echars, // characters that escape in an arg
375 long maxargs, // howm many things to search for (max)
376 long *ftype, // type of thing found (index by schars)
377 long *fstart, // starting location of found arg
378 long *flen // length of found arg
379 )
380 {
381 // the general algorithm here is to move along the input line,
382 // looking for one of the characters in schars. When we find it,
383 // we lock onto that and commit to finding an arg of that type.
384 // We then start scanning ahead keeping count of schars minus echars.
385 // when the count hits zero, it's end for that arg and we move onward
386 // to the next arg, with the same procedure.
387 //
388 // note that when we are scanning for a new arg, we are open to args
389 // of any type (as defined by the members of schars, while in an arg
390 // we are looking only for the unescaped outstanding echar and are blind
391 // to everything else.
392 //
393 // when not in an arg, we do not have any escape character active.
394 //
395 // We return the number of args found
396
397 long chidx;
398 char curchar;
399 long argc;
400 long i;
401 long itype;
402 long depth;
403
404 // zeroize the outputs to start...
405 for (i = 0; i < maxargs; i++)
406 {
407 ftype[i] = -1;
408 fstart[i] = 0;
409 flen[i] = 0;
410 };
411
412
413 // scan forward, looking for any member of schars
414
415 depth = 0;
416 chidx = -1;
417 argc = 0;
418 itype = -1;
419
420 if (internal_trace)
421 {
422 fprintf (stderr, " declensional parsing for %ld chars on: ", len);
423 for (i = 0; i < len; i++)
424 fprintf (stderr, "%c", txt[i]);
425 fprintf (stderr, "\n");
426 }
427
428 while (chidx < len && argc <= maxargs)
429 {
430 chidx++;
431 curchar = txt[chidx];
432 if (itype == -1) // are we looking for an argstart char?
433 {
434 // is curchar one of the start chars? (this is 8-bit-safe,
435 // because schars is always normal ASCII)
436 for (i = 0; i < strlen (schars); i++)
437 if (curchar == schars[i])
438 {
439 if (internal_trace)
440 fprintf (stderr, " found opener %c at %ld,",curchar,chidx);
441 itype = i;
442 fstart[argc] = chidx + 1;
443 ftype [argc] = itype;
444 depth = 1;
445 };
446 // if it wasn't a start-character for an arg, we are done.
447 }
448 else // nope, we're in an arg, so we check for unescaped schar
449 // and fchar characers
450 {
451 // if (curchar == fchars [itype] && txt[chidx-1] != echars[itype])
452 if (curchar == fchars [itype]
453 && (txt[chidx-1] != echars[itype]
454 || txt[chidx-1] == txt[chidx-2]))
455 {
456 depth--;
457 if (depth == 0)
458 {
459 // we've found the end of the text arg. Close it off and
460 // note it into the output vectors
461 flen [argc] = chidx - fstart[argc] ;
462 if (internal_trace)
463 {
464 int q;
465 fprintf (stderr, " close %c at %ld --", curchar, chidx);
466 for (q = fstart[argc]; q < fstart[argc]+flen[argc]; q++)
467 fprintf (stderr, "%c", txt[q]);
468 fprintf (stderr, "-- len %ld\n", flen[argc]);
469 };
470 itype = -1;
471 argc++;
472 };
473 }
474 else
475 //if (curchar == schars [itype] && txt[chidx-1] != echars[itype])
476 if (curchar == schars [itype]
477 && (txt[chidx-1] != echars[itype]
478 || txt[chidx-1] == txt[chidx-2]))
479 {
480 depth++;
481 };
482 };
483 // if we weren't a schar or an unexcaped echar, we're done!
484 };
485 if (depth != 0)
486 {
487 char errstmt[MAX_PATTERN];
488 flen[argc] = chidx - fstart[argc];
489 //
490 // GROT GROT GROT Somehow, sometimes we get flen[argc] < 0. It's
491 // always with buggy userprograms, but we shouldn't need this anyway.
492 // So, until we find out what _we_ are doing wrong, leave the check
493 // for flen[argc] < 0 in here.
494 //
495 if (flen[argc] < 0) flen[argc] = 0;
496 strncpy ( errstmt, &txt[fstart[argc]],
497 flen[argc] );
498 nonfatalerror5 (" This operand doesn't seem to end. Bug? \n --> ",
499 errstmt, CRM_ENGINE_HERE);
500 argc++;
501 };
502 return (argc);
503 }
504
//     And to avoid all the mumbo-jumbo, an easy way to get a
//     NUL-terminated copy of an arg found by the declensional parser.
//
//       to      - destination buffer; nothing happens if it is NULL
//       tolen   - total size of the destination buffer, including the
//                 room needed for the terminating NUL; nothing is
//                 written if it is not at least 1
//       from    - start of the arg text; NULL yields an empty string
//       fromlen - number of characters of the arg; a negative value is
//                 treated as 0
//
//     At most tolen-1 characters are copied, so an arg that is too long
//     is silently truncated.
void crm_get_pgm_arg (char *to, long tolen, char *from, long fromlen)
{
  long len;

  //   no buffer, or no room for even the terminator: nothing to do
  if (to == NULL || tolen <= 0)
    return;

  if (from == NULL)
    {
      to[0] = '\000';
      return;
    }

  len = tolen - 1;
  if (len > fromlen) len = fromlen;
  //   a negative count would turn into a huge unsigned copy size
  if (len < 0) len = 0;

  memmove (to, from, len);
  to[len] = '\000';
}
526