1 #ifdef RCSID
2 static char RCSid[] =
3 "$Header$";
4 #endif
5 
6 /*
7  *   Copyright (c) 2000, 2002 Michael J. Roberts.  All Rights Reserved.
8  *
9  *   Please see the accompanying license file, LICENSE.TXT, for information
10  *   on using and copying this software.
11  */
12 /*
13 Name
14   vmgram.cpp - T3 Grammar Production metaclass
15 Function
16 
17 Notes
18 
19 Modified
20   02/15/00 MJRoberts  - Creation
21 */
22 
23 #include <stdlib.h>
24 #include <string.h>
25 
26 #include "t3std.h"
27 #include "vmtype.h"
28 #include "vmstack.h"
29 #include "vmlst.h"
30 #include "vmgram.h"
31 #include "vmerr.h"
32 #include "vmerrnum.h"
33 #include "vmmeta.h"
34 #include "vmdict.h"
35 #include "vmtobj.h"
36 #include "vmpredef.h"
37 
38 
39 /* ------------------------------------------------------------------------ */
40 /*
41  *   Grammar production object - memory pool.  This is a simple
42  *   suballocator that obtains large blocks from the system allocator,
43  *   then hands out pieces of those blocks.  Allocating is very cheap
44  *   (just a pointer increment in most cases), and we don't track
45  *   individual allocations and deletions but just throw away all
46  *   suballocated memory at once.
47  */
48 
49 /* size of each memory block */
50 const size_t VMGRAMPROD_MEM_BLOCK_SIZE = 16*1024;
51 
52 /*
53  *   memory pool block
54  */
55 struct CVmGramProdMemBlock
56 {
57     /* next block in chain */
58     CVmGramProdMemBlock *nxt_;
59 
60     /* bytes of the block */
61     char buf_[VMGRAMPROD_MEM_BLOCK_SIZE];
62 };
63 
64 /*
65  *   memory pool object
66  */
67 class CVmGramProdMem
68 {
69 public:
CVmGramProdMem()70     CVmGramProdMem()
71     {
72         /* no memory blocks yet */
73         block_head_ = 0;
74         block_cur_ = 0;
75 
76         /* we don't yet have a block */
77         cur_rem_ = 0;
78         cur_free_ = 0;
79     }
80 
~CVmGramProdMem()81     ~CVmGramProdMem()
82     {
83         /* delete each of our blocks */
84         while (block_head_ != 0)
85         {
86             CVmGramProdMemBlock *nxt;
87 
88             /* remember the next block */
89             nxt = block_head_->nxt_;
90 
91             /* delete this block */
92             t3free(block_head_);
93 
94             /* move on to the next */
95             block_head_ = nxt;
96         }
97     }
98 
99     /* reset - delete all previously sub-allocated memory */
reset()100     void reset()
101     {
102         /* start over with the first block */
103         block_cur_ = block_head_;
104 
105         /* initialize the free pointer for the new block, if we have one */
106         if (block_cur_ != 0)
107         {
108             /* start at the beginning of the new block */
109             cur_free_ = block_cur_->buf_;
110 
111             /* this entire block is available */
112             cur_rem_ = VMGRAMPROD_MEM_BLOCK_SIZE;
113         }
114         else
115         {
116             /* there are no blocks, so there's no memory available yet */
117             cur_rem_ = 0;
118             cur_free_ = 0;
119         }
120     }
121 
122     /* allocate memory */
alloc(size_t siz)123     void *alloc(size_t siz)
124     {
125         void *ret;
126 
127         /* round the size to the local hardware boundary */
128         siz = osrndsz(siz);
129 
130         /* if it exceeds the block size, fail */
131         if (siz > VMGRAMPROD_MEM_BLOCK_SIZE)
132             return 0;
133 
134         /* if we don't have enough space in this block, go to the next */
135         if (siz > cur_rem_)
136         {
137             /* allocate a new block if necessary */
138             if (block_cur_ == 0)
139             {
140                 /* there's nothing in the list yet - set up the first block */
141                 block_head_ = (CVmGramProdMemBlock *)
142                               t3malloc(sizeof(CVmGramProdMemBlock));
143 
144                 /* activate the new block */
145                 block_cur_ = block_head_;
146 
147                 /* there's nothing after this block */
148                 block_cur_->nxt_ = 0;
149             }
150             else if (block_cur_->nxt_ == 0)
151             {
152                 /* we're at the end of the list - add a block */
153                 block_cur_->nxt_ = (CVmGramProdMemBlock *)
154                                    t3malloc(sizeof(CVmGramProdMemBlock));
155 
156                 /* advance to the new block */
157                 block_cur_ = block_cur_->nxt_;
158 
159                 /* the new block is the last in the chain */
160                 block_cur_->nxt_ = 0;
161             }
162             else
163             {
164                 /* another block follows - advance to it */
165                 block_cur_ = block_cur_->nxt_;
166             }
167 
168             /* start at the beginning of the new block */
169             cur_free_ = block_cur_->buf_;
170 
171             /* this entire block is available */
172             cur_rem_ = VMGRAMPROD_MEM_BLOCK_SIZE;
173         }
174 
175         /* the return value will be the current free pointer */
176         ret = cur_free_;
177 
178         /* consume the space */
179         cur_free_ += siz;
180         cur_rem_ -= siz;
181 
182         /* return the block location */
183         return ret;
184     }
185 
186 private:
187     /* head of block list */
188     CVmGramProdMemBlock *block_head_;
189 
190     /* block we're currently suballocating out of */
191     CVmGramProdMemBlock *block_cur_;
192 
193     /* current free pointer in current block */
194     char *cur_free_;
195 
196     /* amount of space remaining in current block */
197     size_t cur_rem_;
198 };
199 
200 /* ------------------------------------------------------------------------ */
201 /*
202  *   Input token array entry.  We make a private copy of the input token list
203  *   (i.e., the token list we're trying to match to our grammar rules) during
204  *   parsing using one of these structures for each token.
205  */
206 struct vmgramprod_tok
207 {
208     /* original token value */
209     vm_val_t val_;
210 
211     /* token type */
212     ulong typ_;
213 
214     /* token text */
215     const char *txt_;
216 
217     /* length of the token text */
218     size_t len_;
219 
220     /* hash value of the token */
221     unsigned int hash_;
222 
223     /* number of vocabulary matches associated with the word */
224     size_t match_cnt_;
225 
226     /* pointer to list of matches for the word */
227     vmgram_match_info *matches_;
228 };
229 
230 
231 /* ------------------------------------------------------------------------ */
232 /*
233  *   Parsing state match object.  A match can be terminal, in which case
234  *   it refers to a single token of input, or it can be a production, in
235  *   which case it refers to a subtree of match objects.
236  */
237 struct CVmGramProdMatch
238 {
239     /* allocate */
allocCVmGramProdMatch240     static CVmGramProdMatch *alloc(CVmGramProdMem *mem, size_t tok_pos,
241                                    const vm_val_t *tok_match_result,
242                                    int matched_star,
243                                    vm_prop_id_t target_prop,
244                                    vm_obj_id_t proc_obj,
245                                    const CVmGramProdMatch **sub_match_list,
246                                    size_t sub_match_cnt)
247     {
248         CVmGramProdMatch *match;
249 
250         /* allocate space */
251         match = (CVmGramProdMatch *)mem->alloc(sizeof(CVmGramProdMatch));
252 
253         /* initialize */
254         match->tok_pos_ = tok_pos;
255         match->matched_star_ = matched_star;
256         match->target_prop_ = target_prop;
257         match->proc_obj_ = proc_obj;
258         match->sub_match_list_ = sub_match_list;
259         match->sub_match_cnt_ = sub_match_cnt;
260 
261         /* save the token match value, if one was given */
262         if (tok_match_result != 0)
263             match->tok_match_result_ = *tok_match_result;
264         else
265             match->tok_match_result_.set_nil();
266 
267         /* return the new match */
268         return match;
269     }
270 
271     /* property ID with which this match is associated */
272     vm_prop_id_t target_prop_;
273 
274     /*
275      *   token position of the matching token; ignored if the production
276      *   subtree information is valid
277      */
278     size_t tok_pos_;
279 
280     /*
281      *   The token match value.  This is the result code from the Dictionary
282      *   comparator's matchValues() function, which typically encodes
283      *   information about how the value matched (truncation, case folding,
284      *   accent approximation, etc) in bitwise flags.
285      */
286     vm_val_t tok_match_result_;
287 
288     /*
289      *   flag: this is a '*' token in the rule; these don't really match any
290      *   tokens at all, since they merely serve to stop parsing regardless
291      *   of whether or not more input tokens are present
292      */
293     int matched_star_;
294 
295     /*
296      *   object ID of match processor - if this is valid, this match
297      *   refers to a production; otherwise, the match refers to a single
298      *   token
299      */
300     vm_obj_id_t proc_obj_;
301 
302     /* pointer to array of sub-matches */
303     const CVmGramProdMatch **sub_match_list_;
304 
305     /* number of sub-match entries */
306     size_t sub_match_cnt_;
307 };
308 
309 /*
310  *   Top-level match list holder
311  */
312 struct CVmGramProdMatchEntry
313 {
314     /* the match tree */
315     CVmGramProdMatch *match_;
316 
317     /* token position at the end of the match */
318     size_t tok_pos_;
319 
320     /* next entry in the list */
321     CVmGramProdMatchEntry *nxt_;
322 };
323 
324 /*
325  *   Parsing state object.  At any given time, we will have one or more of
326  *   these state objects in our processing queue.  A state object tracks a
327  *   parsing position - the token position, the alternative position, the
328  *   match list so far for the alternative (i.e., the match for each item
329  *   in the alternative up to but not including the current position), and
330  *   the enclosing parsing state object (i.e., the parsing state that
331  *   "recursed" into this alternative).
332  */
333 struct CVmGramProdState
334 {
335     /* allocate */
allocCVmGramProdState336     static CVmGramProdState *alloc(CVmGramProdMem *mem,
337                                    int tok_pos, int alt_pos,
338                                    CVmGramProdState *enclosing,
339                                    const vmgram_alt_info *altp,
340                                    vm_obj_id_t prod_obj,
341                                    int circular_alt)
342     {
343         size_t match_cnt;
344         CVmGramProdState *state;
345 
346         /* get the number of match list slots we'll need */
347         match_cnt = altp->tok_cnt;
348 
349         /* allocate space, including space for the match list */
350         state = (CVmGramProdState *)
351                 mem->alloc(sizeof(CVmGramProdState)
352                            + (match_cnt - 1)*sizeof(CVmGramProdMatch *));
353 
354         /* initialize */
355         state->init(tok_pos, alt_pos, enclosing, altp, prod_obj,
356                     circular_alt);
357 
358         /* return the new item */
359         return state;
360     }
361 
362     /* initialize */
initCVmGramProdState363     void init(int tok_pos, int alt_pos, CVmGramProdState *enclosing,
364               const vmgram_alt_info *altp, vm_obj_id_t prod_obj,
365               int circular_alt)
366     {
367         /* remember the position parameters */
368         tok_pos_ = tok_pos;
369         alt_pos_ = alt_pos;
370 
371         /* we haven't matched a '*' token yet */
372         matched_star_ = FALSE;
373 
374         /* remember our stack position */
375         enclosing_ = enclosing;
376 
377         /* remember our alternative image data pointer */
378         altp_ = altp;
379 
380         /* remember the production object */
381         prod_obj_ = prod_obj;
382 
383         /* note if we had circular alternatives in the same production */
384         circular_alt_ = circular_alt;
385 
386         /* no target property for sub-production yet */
387         sub_target_prop_ = VM_INVALID_PROP;
388 
389         /* we're not in the queue yet */
390         nxt_ = 0;
391     }
392 
393     /* clone the state */
cloneCVmGramProdState394     CVmGramProdState *clone(CVmGramProdMem *mem)
395     {
396         CVmGramProdState *new_state;
397         CVmGramProdState *new_enclosing;
398 
399         /* clone our enclosing state */
400         new_enclosing = (enclosing_ != 0 ? enclosing_->clone(mem) : 0);
401 
402         /* create a new state object */
403         new_state = alloc(mem, tok_pos_, alt_pos_, new_enclosing,
404                           altp_, prod_obj_, circular_alt_);
405 
406         /* copy the target sub-production property */
407         new_state->sub_target_prop_ = sub_target_prop_;
408 
409         /* copy the valid portion of my match list */
410         if (alt_pos_ != 0)
411             memcpy(new_state->match_list_, match_list_,
412                    alt_pos_ * sizeof(match_list_[0]));
413 
414         /* we're not yet in any queue */
415         new_state->nxt_ = 0;
416 
417         /* set the '*' flag in the cloned state */
418         new_state->matched_star_ = matched_star_;
419 
420         /* return the clone */
421         return new_state;
422     }
423 
424     /* current token position, as an index into the token list */
425     size_t tok_pos_;
426 
427     /*
428      *   flag: we've matched a '*' token in a production or subproduction,
429      *   so we should consider all remaining tokens to have been consumed
430      *   (we don't advance tok_pos_ past all of the tokens, because we
431      *   separately want to keep track of the number of tokens we matched
432      *   for everything up to the '*')
433      */
434     int matched_star_;
435 
436     /*
437      *   current alternative position, as an index into the alternative's
438      *   list of items
439      */
440     size_t alt_pos_;
441 
442     /*
443      *   the enclosing state object - this is the state object that
444      *   "recursed" to our position
445      */
446     CVmGramProdState *enclosing_;
447 
448     /* pointer to alternative definition */
449     const vmgram_alt_info *altp_;
450 
451     /* object ID of the production object */
452     vm_obj_id_t prod_obj_;
453 
454     /* the target property for the pending sub-production */
455     vm_prop_id_t sub_target_prop_;
456 
457     /* next production state in the processing queue */
458     CVmGramProdState *nxt_;
459 
460     /* the production contains circular alternatives */
461     int circular_alt_;
462 
463     /*
464      *   Match list.  This list is allocated with enough space for one
465      *   match for each of our items (the number of items is given by
466      *   vmgram_alt_tokcnt(altp_)).  At any given time, these will be
467      *   valid up to but not including the current alt_pos_ index.
468      */
469     const struct CVmGramProdMatch *match_list_[1];
470 };
471 
472 /*
473  *   Queues.  We maintain three queues: the primary work queue, which
474  *   contains pending parsing states that we're still attempting to match;
475  *   the success queue, which contains a list of match trees for
476  *   successfully completed matches; and the badness queue, which contains
477  *   a list of parsing states that we can try as fallbacks if we fail to
478  *   find a match in any of the work queue items.
479  */
480 struct CVmGramProdQueue
481 {
482     /* clear the queues */
clearCVmGramProdQueue483     void clear()
484     {
485         work_queue_ = 0;
486         badness_queue_ = 0;
487         success_list_ = 0;
488     }
489 
490     /* head of work queue */
491     CVmGramProdState *work_queue_;
492 
493     /* head of badness queue */
494     CVmGramProdState *badness_queue_;
495 
496     /* head of success list */
497     CVmGramProdMatchEntry *success_list_;
498 };
499 
500 /* ------------------------------------------------------------------------ */
501 /*
502  *   Grammar-Production object statics
503  */
504 
505 /* metaclass registration object */
506 static CVmMetaclassGramProd metaclass_reg_obj;
507 CVmMetaclass *CVmObjGramProd::metaclass_reg_ = &metaclass_reg_obj;
508 
509 /* function table */
510 int (CVmObjGramProd::
511      *CVmObjGramProd::func_table_[])(VMG_ vm_obj_id_t self,
512                                      vm_val_t *retval, uint *argc) =
513 {
514     &CVmObjGramProd::getp_undef,
515     &CVmObjGramProd::getp_parse
516 };
517 
518 /* ------------------------------------------------------------------------ */
519 /*
520  *   Grammar-Production metaclass implementation
521  */
522 
523 /*
524  *   create dynamically using stack arguments
525  */
create_from_stack(VMG_ const uchar ** pc_ptr,uint argc)526 vm_obj_id_t CVmObjGramProd::create_from_stack(VMG_ const uchar **pc_ptr,
527                                               uint argc)
528 {
529     /* this type of object cannot be dynamically created */
530     err_throw(VMERR_BAD_DYNAMIC_NEW);
531 
532     /* we won't reach this point, but the compiler wouldn't know */
533     return VM_INVALID_OBJ;
534 }
535 
536 /*
537  *   constructor
538  */
CVmObjGramProd(VMG0_)539 CVmObjGramProd::CVmObjGramProd(VMG0_)
540 {
541     /* allocate our extension structure from the variable heap */
542     ext_ = (char *)G_mem->get_var_heap()
543            ->alloc_mem(sizeof(vm_gram_ext), this);
544 
545     /* we have no image data yet */
546     get_ext()->image_data_ = 0;
547     get_ext()->image_data_size_ = 0;
548 
549     /* we haven't set up our alternatives yet */
550     get_ext()->alts_ = 0;
551     get_ext()->alt_cnt_ = 0;
552 
553     /* we haven't cached any hash values yet */
554     get_ext()->hashes_cached_ = FALSE;
555     get_ext()->comparator_ = VM_INVALID_OBJ;
556 
557     /* presume we have no circular rules */
558     get_ext()->has_circular_alt = FALSE;
559 
560     /* create our memory pool */
561     get_ext()->mem_ = new CVmGramProdMem();
562 
563     /*
564      *   allocate initial property enumeration space (we'll expand this as
565      *   needed, so the initial size is just a guess)
566      */
567     get_ext()->prop_enum_max_ = 64;
568     get_ext()->prop_enum_arr_ = (vmgram_match_info *)
569                                 t3malloc(get_ext()->prop_enum_max_
570                                          * sizeof(vmgram_match_info));
571 }
572 
573 /*
574  *   notify of deletion
575  */
notify_delete(VMG_ int in_root_set)576 void CVmObjGramProd::notify_delete(VMG_ int in_root_set)
577 {
578     /* free our additional data */
579     if (ext_ != 0)
580     {
581         /*
582          *   If we've allocated our alternatives, delete the memory.  We
583          *   allocate the entire complex structure in one block, which we use
584          *   for the alts_ array, so we merely need to delete that one block.
585          */
586         if (get_ext()->alts_ != 0)
587             t3free(get_ext()->alts_);
588 
589         /* delete our memory pool */
590         delete get_ext()->mem_;
591 
592         /* delete our property enumeration space */
593         t3free(get_ext()->prop_enum_arr_);
594 
595         /* free the extension */
596         G_mem->get_var_heap()->free_mem(ext_);
597     }
598 }
599 
600 /*
601  *   get a property
602  */
get_prop(VMG_ vm_prop_id_t prop,vm_val_t * retval,vm_obj_id_t self,vm_obj_id_t * source_obj,uint * argc)603 int CVmObjGramProd::get_prop(VMG_ vm_prop_id_t prop, vm_val_t *retval,
604                              vm_obj_id_t self, vm_obj_id_t *source_obj,
605                              uint *argc)
606 {
607     ushort func_idx;
608 
609     /* translate the property index to an index into our function table */
610     func_idx = G_meta_table
611                ->prop_to_vector_idx(metaclass_reg_->get_reg_idx(), prop);
612 
613     /* call the appropriate function */
614     if ((this->*func_table_[func_idx])(vmg_ self, retval, argc))
615     {
616         *source_obj = metaclass_reg_->get_class_obj(vmg0_);
617         return TRUE;
618     }
619 
620     /* inherit default handling */
621     return CVmObject::get_prop(vmg_ prop, retval, self, source_obj, argc);
622 }
623 
624 /*
625  *   set a property
626  */
set_prop(VMG_ class CVmUndo * undo,vm_obj_id_t self,vm_prop_id_t prop,const vm_val_t * val)627 void CVmObjGramProd::set_prop(VMG_ class CVmUndo *undo,
628                               vm_obj_id_t self, vm_prop_id_t prop,
629                               const vm_val_t *val)
630 {
631     /* no settable properties - throw an error */
632     err_throw(VMERR_INVALID_SETPROP);
633 }
634 
635 /*
636  *   load from an image file
637  */
load_from_image(VMG_ vm_obj_id_t self,const char * ptr,size_t siz)638 void CVmObjGramProd::load_from_image(VMG_ vm_obj_id_t self,
639                                      const char *ptr, size_t siz)
640 {
641     const char *p;
642     size_t alt_cnt;
643     size_t tok_cnt;
644     size_t i;
645     size_t alo_siz;
646     vmgram_alt_info *next_alt;
647     vmgram_tok_info *next_tok;
648     char *next_byte;
649 
650     /* remember where our image data comes from */
651     get_ext()->image_data_ = ptr;
652     get_ext()->image_data_size_ = siz;
653 
654     /*
655      *   For our alternatives structure, start with the base amount of
656      *   memory, which is the amount we'll need for the array of alternative
657      *   entries themselves.  The first UINT2 gives the number of
658      *   alternatives in our grammar rule.
659      */
660     alt_cnt = osrp2(ptr);
661     alo_siz = alt_cnt * sizeof(vmgram_alt_info);
662 
663     /*
664      *   Scan the alternatives.  Check for circular (left-recursive)
665      *   alternatives, and add up how much memory we'll need for our decoded
666      *   altneratives structure.
667      */
668     for (i = alt_cnt, p = ptr + 2, tok_cnt = 0 ; i != 0 ; --i)
669     {
670         const char *tokp;
671         size_t alt_tok_cnt;
672         size_t j;
673 
674         /* get the token count and first token for this alternative */
675         alt_tok_cnt = vmgram_alt_tokcnt(p);
676         tokp = vmgram_alt_tokptr(p);
677 
678         /*
679          *   For our allocation size, add the amount of memory we need for
680          *   this alternative entry's array of tokens.
681          */
682         alo_siz += alt_tok_cnt * sizeof(vmgram_tok_info);
683 
684         /* include these tokens in the cumulative token count */
685         tok_cnt += alt_tok_cnt;
686 
687         /* if the alternative is circular, note it */
688         if (alt_tok_cnt != 0
689             && vmgram_tok_type(tokp) == VMGRAM_MATCH_PROD
690             && vmgram_tok_prod_obj(tokp) == self)
691         {
692             /* note the existence of a circular reference */
693             get_ext()->has_circular_alt = TRUE;
694         }
695 
696         /* scan the tokens to see how much memory we'll need to store them */
697         for (j = 0 ; j < alt_tok_cnt ; ++j, tokp = get_next_alt_tok(tokp))
698         {
699             /* check what sort of extra memory we'll need for this token */
700             switch(vmgram_tok_type(tokp))
701             {
702             case VMGRAM_MATCH_NSPEECH:
703                 /* we need the array of vm_prop_id_t's */
704                 alo_siz += osrndsz(vmgram_tok_vocn_cnt(tokp)
705                                    * sizeof(vm_prop_id_t));
706                 break;
707 
708             case VMGRAM_MATCH_LITERAL:
709                 /* we need space for the string */
710                 alo_siz += osrndsz(vmgram_tok_lit_len(tokp));
711                 break;
712             }
713         }
714 
715         /* the next alternative starts after the last token */
716         p = tokp;
717     }
718 
719     /*
720      *   Allocate our block of memory to store the decoded alternatives.  Put
721      *   the whole thing in one block, and put the alternative array at the
722      *   start of the block.
723      */
724     get_ext()->alt_cnt_ = alt_cnt;
725     get_ext()->alts_ = next_alt = (vmgram_alt_info *)t3malloc(alo_siz);
726 
727     /* allocate the tokens right after the alternative array */
728     next_tok = (vmgram_tok_info *)&get_ext()->alts_[alt_cnt];
729 
730     /* allocate the other miscellaneous data after the token array */
731     next_byte = (char *)&next_tok[tok_cnt];
732 
733     /* run through the image data again and decode it */
734     for (i = alt_cnt, p = ptr + 2, tok_cnt = 0 ; i != 0 ; --i)
735     {
736         const char *tokp;
737         size_t alt_tok_cnt;
738         size_t j;
739 
740         /* get the token count and first token for this alternative */
741         alt_tok_cnt = vmgram_alt_tokcnt(p);
742         tokp = vmgram_alt_tokptr(p);
743 
744         /* set up this alternative structure */
745         next_alt->score = vmgram_alt_score(p);
746         next_alt->badness = vmgram_alt_badness(p);
747         next_alt->proc_obj = vmgram_alt_procobj(p);
748         next_alt->tok_cnt = vmgram_alt_tokcnt(p);
749         next_alt->toks = next_tok;
750 
751         /* consume this alternative entry */
752         ++next_alt;
753 
754         /* scan and decode the tokens */
755         for (j = 0 ; j < alt_tok_cnt ; ++j, tokp = get_next_alt_tok(tokp))
756         {
757             size_t k;
758 
759             /* set up this token's base information */
760             next_tok->prop = vmgram_tok_prop(tokp);
761             next_tok->typ = vmgram_tok_type(tokp);
762 
763             /* decode the type-specific data */
764             switch(vmgram_tok_type(tokp))
765             {
766             case VMGRAM_MATCH_PROD:
767                 next_tok->typinfo.prod_obj = vmgram_tok_prod_obj(tokp);
768                 break;
769 
770             case VMGRAM_MATCH_SPEECH:
771                 next_tok->typinfo.speech_prop = vmgram_tok_voc_prop(tokp);
772                 break;
773 
774             case VMGRAM_MATCH_NSPEECH:
775                 /* set up our array, using the miscellaneous space pool */
776                 next_tok->typinfo.nspeech.cnt = vmgram_tok_vocn_cnt(tokp);
777                 next_tok->typinfo.nspeech.props = (vm_prop_id_t *)next_byte;
778 
779                 /* copy the properties */
780                 for (k = 0 ; k < next_tok->typinfo.nspeech.cnt ; ++k)
781                     next_tok->typinfo.nspeech.props[k] =
782                         vmgram_tok_vocn_prop(tokp, k);
783 
784                 /* consume the space from the pool */
785                 next_byte += osrndsz(next_tok->typinfo.nspeech.cnt
786                                      * sizeof(vm_prop_id_t));
787                 break;
788 
789             case VMGRAM_MATCH_LITERAL:
790                 /* take the space out of the miscellaneous space pool */
791                 next_tok->typinfo.lit.len = vmgram_tok_lit_len(tokp);
792                 next_tok->typinfo.lit.str = next_byte;
793 
794                 /* copy the data */
795                 memcpy(next_tok->typinfo.lit.str, vmgram_tok_lit_txt(tokp),
796                        next_tok->typinfo.lit.len);
797 
798                 /* consume the space from the pool */
799                 next_byte += osrndsz(next_tok->typinfo.lit.len);
800                 break;
801 
802             case VMGRAM_MATCH_TOKTYPE:
803                 next_tok->typinfo.toktyp_enum = vmgram_tok_tok_enum(tokp);
804                 break;
805             }
806 
807             /* consume the token slot */
808             ++next_tok;
809         }
810 
811         /* the next alternative starts after the last token */
812         p = tokp;
813     }
814 }
815 
816 /*
817  *   Remove stale weak references.
818  */
remove_stale_weak_refs(VMG0_)819 void CVmObjGramProd::remove_stale_weak_refs(VMG0_)
820 {
821     /*
822      *   Our reference to the dictionary comparator object is weak (we're
823      *   only caching it - we don't want to prevent the object from being
824      *   collected if no one else wants it).  So, forget it if the comparator
825      *   is being deleted.
826      */
827     if (get_ext()->comparator_ != VM_INVALID_OBJ
828         && G_obj_table->is_obj_deletable(get_ext()->comparator_))
829     {
830         /* forget the comparator */
831         get_ext()->comparator_ = VM_INVALID_OBJ;
832 
833         /*
834          *   our cached hash values depend on the comparator, so they're now
835          *   invalid - forget about them
836          */
837         get_ext()->hashes_cached_ = FALSE;
838     }
839 }
840 
841 
842 /*
843  *   Context for enum_props_cb
844  */
845 struct enum_props_ctx
846 {
847     /* object extension */
848     vm_gram_ext *ext;
849 
850     /* number of properties in our list so far */
851     size_t cnt;
852 };
853 
854 /*
855  *   Callback for enumerating word properties
856  */
enum_props_cb(VMG_ void * ctx0,vm_prop_id_t prop,const vm_val_t *)857 void CVmObjGramProd::enum_props_cb(VMG_ void *ctx0, vm_prop_id_t prop,
858                                    const vm_val_t * /*match_val*/)
859 {
860     enum_props_ctx *ctx = (enum_props_ctx *)ctx0;
861     vmgram_match_info *ep;
862     size_t i;
863 
864     /* if this property is already in our list, don't add it again */
865     for (i = 0, ep = ctx->ext->prop_enum_arr_ ; i < ctx->cnt ; ++i, ++ep)
866     {
867         /* if we already have this one, we can ignore it */
868         if (ep->prop == prop)
869             return;
870     }
871 
872     /* we need to add it - if the array is full, expand it */
873     if (ctx->cnt == ctx->ext->prop_enum_max_)
874     {
875         /* reallocate the array with more space */
876         ctx->ext->prop_enum_max_ += 64;
877         ctx->ext->prop_enum_arr_ = (vmgram_match_info *)
878                                    t3realloc(ctx->ext->prop_enum_arr_,
879                                              ctx->ext->prop_enum_max_
880                                              * sizeof(vmgram_match_info));
881     }
882 
883     /* add the property to the array */
884     ep = &ctx->ext->prop_enum_arr_[ctx->cnt];
885     ep->prop = prop;
886 
887     /* count the new entry */
888     ctx->cnt++;
889 }
890 
891 /*
892  *   Execute the parseToken method
893  */
getp_parse(VMG_ vm_obj_id_t self,vm_val_t * retval,uint * argc)894 int CVmObjGramProd::getp_parse(VMG_ vm_obj_id_t self,
895                                vm_val_t *retval, uint *argc)
896 {
897     vm_val_t *tokval;
898     vm_val_t dictval;
899     CVmObjDict *dict;
900     size_t tok_cnt;
901     size_t i;
902     vmgramprod_tok *tok;
903     const char *toklistp;
904     int orig_argc = (argc != 0 ? *argc : 0);
905     CVmGramProdQueue queues;
906     CVmObjList *lst;
907     size_t succ_cnt;
908     CVmGramProdMatchEntry *match;
909     static CVmNativeCodeDesc desc(2);
910 
911     /* check arguments */
912     if (get_prop_check_argc(retval, argc, &desc))
913         return TRUE;
914 
915     /* reset our memory pool */
916     get_ext()->mem_->reset();
917 
918     /* clear the work queues */
919     queues.clear();
920 
921     /*
922      *   get the tokenList argument and make sure it's a list; leave it on
923      *   the stack for now so it remains reachable for the garbage
924      *   collector
925      */
926     tokval = G_stk->get(0);
927     if ((toklistp = tokval->get_as_list(vmg0_)) == 0)
928         err_throw(VMERR_LIST_VAL_REQD);
929 
930     /* get the length of the token list */
931     tok_cnt = vmb_get_len(tokval->get_as_list(vmg0_));
932 
933     /* check for a dictionary argument */
934     if (orig_argc >= 2)
935     {
936         /* get the dictionary argument */
937         dictval = *G_stk->get(1);
938 
939         /* make sure it's an object or nil */
940         if (dictval.typ != VM_OBJ && dictval.typ != VM_NIL)
941             err_throw(VMERR_OBJ_VAL_REQD);
942     }
943     else
944     {
945         /* use a nil dictionary by default */
946         dictval.set_nil();
947     }
948 
949     /* get the dictionary object and check that it's really a dictionary */
950     if (dictval.typ != VM_NIL)
951     {
952         /* if it's not a CVmObjDict object, it's the wrong type */
953         if (!CVmObjDict::is_dictionary_obj(vmg_ dictval.val.obj))
954         {
955             /* wrong type - throw an error */
956             err_throw(VMERR_INVAL_OBJ_TYPE);
957         }
958 
959         /* get the VM object pointer */
960         dict = (CVmObjDict *)vm_objp(vmg_ dictval.val.obj);
961     }
962     else
963     {
964         /* there's no dictionary object */
965         dict = 0;
966     }
967 
968     /*
969      *   For quick and easy access, make our own private copy of the token
970      *   and type lists.  First, allocate an array of token structures.
971      */
972     tok = (vmgramprod_tok *)get_ext()->mem_
973           ->alloc(tok_cnt * sizeof(vmgramprod_tok));
974 
975     /* copy the token string references and types into our list */
976     for (i = 0 ; i < tok_cnt ; ++i)
977     {
978         vm_val_t ele_val;
979         vm_val_t tok_typ_val;
980         vm_val_t tok_str_val;
981         const char *subp;
982         const char *tokstrp;
983 
984         /* presume we won't have any vocabulary properties for the word */
985         tok[i].match_cnt_ = 0;
986         tok[i].matches_ = 0;
987 
988         /* get this element from the list, and treat it as a list */
989         CVmObjList::index_list(vmg_ &ele_val, toklistp, i+1);
990         subp = ele_val.get_as_list(vmg0_);
991 
992         /* if this element isn't a list, it's an error */
993         if (subp == 0)
994             err_throw(VMERR_BAD_TYPE_BIF);
995 
996         /*
997          *   parse the token sublist: the first element is the token value,
998          *   and the second element is the token type
999          */
1000         CVmObjList::index_list(vmg_ &tok_str_val, subp, 1);
1001         CVmObjList::index_list(vmg_ &tok_typ_val, subp, 2);
1002 
1003         /* use the value if it's an enumerator; if not, use zero */
1004         if (tok_typ_val.typ == VM_ENUM)
1005             tok[i].typ_ = tok_typ_val.val.enumval;
1006         else
1007             tok[i].typ_ = 0;
1008 
1009         /* get the token value as a string */
1010         tokstrp = tok_str_val.get_as_string(vmg0_);
1011 
1012         /* if it's a string, copy it; otherwise, ignore the value */
1013         if (tokstrp != 0)
1014         {
1015             /* remember the string value */
1016             tok[i].val_ = tok_str_val;
1017 
1018             /* point the token to the string */
1019             tok[i].txt_ = tokstrp + VMB_LEN;
1020             tok[i].len_ = vmb_get_len(tokstrp);
1021 
1022             /* calculate and store the token's hash value */
1023             tok[i].hash_ = calc_hash(vmg_ dict, &tok_str_val,
1024                                      tokstrp + VMB_LEN, vmb_get_len(tokstrp));
1025 
1026             /*
1027              *   if we have a dictionary, enumerate the properties
1028              *   associated with the word
1029              */
1030             if (dict != 0)
1031             {
1032                 enum_props_ctx ctx;
1033 
1034                 /* set up our callback context */
1035                 ctx.ext = get_ext();
1036                 ctx.cnt = 0;
1037 
1038                 /* enumerate the properties */
1039                 dict->enum_word_props(vmg_ &enum_props_cb, &ctx,
1040                                       &tok_str_val, tok[i].txt_, tok[i].len_);
1041 
1042                 /*
1043                  *   if we found any properties for the word, make a copy
1044                  *   for the token list
1045                  */
1046                 if (ctx.cnt != 0)
1047                 {
1048                     /* allocate space for the list */
1049                     tok[i].match_cnt_ = ctx.cnt;
1050                     tok[i].matches_ =
1051                         (vmgram_match_info *)get_ext()->mem_
1052                         ->alloc(ctx.cnt * sizeof(vmgram_match_info));
1053 
1054                     /* copy the list */
1055                     memcpy(tok[i].matches_, get_ext()->prop_enum_arr_,
1056                            ctx.cnt * sizeof(tok[i].matches_[0]));
1057                 }
1058             }
1059         }
1060         else
1061         {
1062             /* it's not a string - use a null string value for this token */
1063             tok[i].txt_ = 0;
1064             tok[i].len_ = 0;
1065         }
1066     }
1067 
1068     /* enqueue a match state item for each of our alternatives */
1069     enqueue_alts(vmg_ get_ext()->mem_, tok, tok_cnt, 0, 0,
1070                  &queues, self, FALSE, 0, dict);
1071 
1072     /* process the work queue */
1073     process_work_queue(vmg_ get_ext()->mem_, tok, tok_cnt,
1074                        &queues, dict);
1075 
1076     /* count the entries in the success list */
1077     for (succ_cnt = 0, match = queues.success_list_ ; match != 0 ;
1078          ++succ_cnt, match = match->nxt_) ;
1079 
1080     /* allocate the return list */
1081     retval->set_obj(CVmObjList::create(vmg_ FALSE, succ_cnt));
1082     lst = (CVmObjList *)vm_objp(vmg_ retval->val.obj);
1083 
1084     /* initially clear the elements of the return list to nil */
1085     for (i = 0 ; i < succ_cnt ; ++i)
1086     {
1087         vm_val_t nil_val;
1088 
1089         /* set this element to nil */
1090         nil_val.set_nil();
1091         lst->cons_set_element(i, &nil_val);
1092     }
1093 
1094     /* push the list momentarily to protect it from garbage collection */
1095     G_stk->push(retval);
1096 
1097     /* set the elements */
1098     for (succ_cnt = 0, match = queues.success_list_ ; match != 0 ;
1099          ++succ_cnt, match = match->nxt_)
1100     {
1101         vm_val_t tree_val;
1102         vm_val_t tok_match_val;
1103         size_t first_tok;
1104         size_t last_tok;
1105 
1106         /*
1107          *   Create a list to hold the token match list - this is a list of
1108          *   the dictionary's comparator's matchValue() results for the
1109          *   tokens that matched literals in the grammar rule.  We need a
1110          *   list with one element per token that we matched.
1111          *
1112          *   We can't actually populate this list yet, but all we need for
1113          *   the moment are references to it.  So, create the list; we'll
1114          *   fill it in as we traverse the match to build the result tree.
1115          */
1116         tok_match_val.set_obj(
1117             CVmObjList::create(vmg_ FALSE, match->tok_pos_));
1118 
1119         /* push the token match value list for gc protection */
1120         G_stk->push(&tok_match_val);
1121 
1122         /* build the object tree for this success object */
1123         build_match_tree(vmg_ match->match_, tokval, &tok_match_val,
1124                          &tree_val, &first_tok, &last_tok);
1125 
1126         /* discard our token match list's gc protection stack entry */
1127         G_stk->discard();
1128 
1129         /*
1130          *   add the match tree's root object to the main return list (note
1131          *   that this protects it from garbage collection by virtue of the
1132          *   main return list being protected from garbage collection)
1133          */
1134         lst->cons_set_element(succ_cnt, &tree_val);
1135     }
1136 
1137     /* discard the gc protection */
1138     G_stk->discard();
1139 
1140     /* discard arguments */
1141     G_stk->discard(orig_argc);
1142 
1143     /* evaluation successful */
1144     return TRUE;
1145 }
1146 
1147 /*
1148  *   Build an object tree for a match
1149  */
build_match_tree(VMG_ const CVmGramProdMatch * match,const vm_val_t * toklist,const vm_val_t * tokmatchlist,vm_val_t * retval,size_t * first_tok,size_t * last_tok)1150 void CVmObjGramProd::build_match_tree(VMG_ const CVmGramProdMatch *match,
1151                                       const vm_val_t *toklist,
1152                                       const vm_val_t *tokmatchlist,
1153                                       vm_val_t *retval,
1154                                       size_t *first_tok, size_t *last_tok)
1155 {
1156     /* check to see what kind of match we have */
1157     if (match->proc_obj_ != VM_INVALID_OBJ)
1158     {
1159         vm_obj_id_t obj_id;
1160         CVmObjTads *objp;
1161         size_t i;
1162 
1163         /*
1164          *   Create the object to hold the current tree level.  The only
1165          *   constructor argument is the superclass, which is the match's
1166          *   processor object.
1167          */
1168         G_stk->push()->set_obj(match->proc_obj_);
1169         obj_id = CVmObjTads::create_from_stack(vmg_ 0, 1);
1170 
1171         /* get a pointer to the new object */
1172         objp = (CVmObjTads *)vm_objp(vmg_ obj_id);
1173 
1174         /* push this object to protect it from gc for a moment */
1175         G_stk->push()->set_obj(obj_id);
1176 
1177         /*
1178          *   Initialize the caller's first/last token indices to an
1179          *   arbitrary invalid range, in case we have no subproductions.  A
1180          *   production with no tokens and no subproductions matches no
1181          *   tokens at all; we indicate this by using an invalid range, with
1182          *   the first token index higher than the last token index.
1183          */
1184         *first_tok = 1;
1185         *last_tok = 0;
1186 
1187         /* build the sub-match entries */
1188         for (i = 0 ; i < match->sub_match_cnt_ ; ++i)
1189         {
1190             size_t first_sub_tok;
1191             size_t last_sub_tok;
1192             vm_val_t val;
1193 
1194             /* build this subtree */
1195             build_match_tree(vmg_ match->sub_match_list_[i],
1196                              toklist, tokmatchlist,
1197                              &val, &first_sub_tok, &last_sub_tok);
1198 
1199             /*
1200              *   If this is the first subtree, use it as the tentative
1201              *   limits for our overall match so far; otherwise, expand our
1202              *   limits if they are outside our range so far.
1203              *
1204              *   If the submatch doesn't include any tokens, it obviously
1205              *   has no effect on our range.  The submatch range will
1206              *   indicate that the first token index is greater than the
1207              *   last token index if the submatch includes no tokens.
1208              */
1209             if (i == 0)
1210             {
1211                 /* it's the first subtree - it's all we know as yet */
1212                 *first_tok = first_sub_tok;
1213                 *last_tok = last_sub_tok;
1214             }
1215             else if (first_sub_tok <= last_sub_tok)
1216             {
1217                 /* check to see if it expands our current range */
1218                 if (first_sub_tok < *first_tok)
1219                     *first_tok = first_sub_tok;
1220                 if (last_sub_tok > *last_tok)
1221                     *last_tok = last_sub_tok;
1222             }
1223 
1224             /*
1225              *   save the subtree with the current match object if there's a
1226              *   property in which to save it
1227              */
1228             if (match->sub_match_list_[i]->target_prop_ != VM_INVALID_PROP)
1229             {
1230                 /*
1231                  *   Set the processor object property for the value.  Note
1232                  *   that we don't have to keep undo for this change, since
1233                  *   we just created the tree object ourselves, and we don't
1234                  *   create any undo savepoints - an object created after
1235                  *   the most recent undo savepoint doesn't need to keep
1236                  *   undo information, since the entire object will be
1237                  *   deleted if we undo to the savepoint.
1238                  */
1239                 objp->set_prop(vmg_ 0, obj_id,
1240                                match->sub_match_list_[i]->target_prop_,
1241                                &val);
1242             }
1243         }
1244 
1245         /*
1246          *   if we have exported properties for recording the token index
1247          *   range, save them with the object
1248          */
1249         if (G_predef->gramprod_first_tok != VM_INVALID_PROP
1250             && G_predef->gramprod_last_tok != VM_INVALID_PROP)
1251         {
1252             vm_val_t val;
1253 
1254             /* save the first token index */
1255             val.set_int((int)*first_tok + 1);
1256             objp->set_prop(vmg_ 0, obj_id,
1257                            G_predef->gramprod_first_tok, &val);
1258 
1259             /* save the last token index */
1260             val.set_int((int)*last_tok + 1);
1261             objp->set_prop(vmg_ 0, obj_id,
1262                            G_predef->gramprod_last_tok, &val);
1263         }
1264 
1265         /*
1266          *   if we have the token list property exported, set it to the
1267          *   original token list reference
1268          */
1269         if (G_predef->gramprod_token_list != VM_INVALID_PROP)
1270         {
1271             /* save the token list reference */
1272             objp->set_prop(vmg_ 0, obj_id,
1273                            G_predef->gramprod_token_list, toklist);
1274         }
1275 
1276         /* if we have the token match list property exported, set it */
1277         if (G_predef->gramprod_token_match_list != VM_INVALID_PROP)
1278         {
1279             /* save the token match list reference */
1280             objp->set_prop(vmg_ 0, obj_id,
1281                            G_predef->gramprod_token_match_list, tokmatchlist);
1282         }
1283 
1284         /* discard our gc protection */
1285         G_stk->discard();
1286 
1287         /* the return value is the object */
1288         retval->set_obj(obj_id);
1289     }
1290     else
1291     {
1292         const char *lstp;
1293 
1294         /* get the token list */
1295         lstp = toklist->get_as_list(vmg0_);
1296 
1297         /* make sure the index is in range */
1298         if (match->tok_pos_ < vmb_get_len(lstp))
1299         {
1300             vm_val_t ele_val;
1301             CVmObjList *match_lst;
1302 
1303             /* get the token from the list */
1304             CVmObjList::index_list(vmg_ &ele_val, lstp, match->tok_pos_ + 1);
1305 
1306             /*
1307              *   the token is itself a list, whose first element is the
1308              *   token's value - retrieve the value
1309              */
1310             lstp = ele_val.get_as_list(vmg0_);
1311             CVmObjList::index_list(vmg_ retval, lstp, 1);
1312 
1313             /*
1314              *   Store the token match result in the result list.  If this is
1315              *   a '*' match, it doesn't have a result list contribution,
1316              *   because '*' doesn't actually match any tokens (it merely
1317              *   stops parsing).
1318              */
1319             if (!match->matched_star_)
1320             {
1321                 /* get the match list */
1322                 match_lst = (CVmObjList *)vm_objp(vmg_ tokmatchlist->val.obj);
1323 
1324                 /*
1325                  *   set the element at this token position in the match list
1326                  *   to the token match result
1327                  */
1328                 match_lst->cons_set_element(
1329                     match->tok_pos_, &match->tok_match_result_);
1330             }
1331         }
1332         else
1333         {
1334             /*
1335              *   the index is past the end of the list - this must be a
1336              *   '*' token that matched nothing (i.e., the token list was
1337              *   fully consumed before we reached the '*'), so just return
1338              *   nil for the match value
1339              */
1340             retval->set_nil();
1341         }
1342 
1343         /*
1344          *   We match one token, so it's the first and last index.  Note
1345          *   that if this is a '*' match, we don't match any tokens.
1346          */
1347         if (!match->matched_star_)
1348             *first_tok = *last_tok = match->tok_pos_;
1349     }
1350 }
1351 
1352 /*
1353  *   Process the work queue
1354  */
process_work_queue(VMG_ CVmGramProdMem * mem,const vmgramprod_tok * tok,size_t tok_cnt,CVmGramProdQueue * queues,CVmObjDict * dict)1355 void CVmObjGramProd::process_work_queue(VMG_ CVmGramProdMem *mem,
1356                                         const vmgramprod_tok *tok,
1357                                         size_t tok_cnt,
1358                                         CVmGramProdQueue *queues,
1359                                         CVmObjDict *dict)
1360 {
1361     /* keep going until the work queue and badness queue are empty */
1362     for (;;)
1363     {
1364         /*
1365          *   If the work queue is empty, fall back on the badness queue.
1366          *   Ignore the badness queue if we have any successful matches,
1367          *   since non-badness matches always take precedence over badness
1368          *   items.
1369          */
1370         if (queues->work_queue_ == 0 && queues->success_list_ == 0)
1371         {
1372             int first;
1373             int min_badness;
1374             CVmGramProdState *cur;
1375             CVmGramProdState *prv;
1376             CVmGramProdState *nxt;
1377 
1378             /* find the lowest badness rating in the queue */
1379             for (first = TRUE, cur = queues->badness_queue_ ; cur != 0 ;
1380                  cur = cur->nxt_, first = FALSE)
1381             {
1382                 /*
1383                  *   if we're on the first item, note its badness as the
1384                  *   tentative minimum; otherwise, note the current item's
1385                  *   badness if it's lower than the lowest we've seen so
1386                  *   far
1387                  */
1388                 if (first || cur->altp_->badness < min_badness)
1389                     min_badness = cur->altp_->badness;
1390             }
1391 
1392             /*
1393              *   move each of the items whose badness matches the minimum
1394              *   badness out of the badness queue and into the work queue
1395              */
1396             for (cur = queues->badness_queue_, prv = 0 ; cur != 0 ;
1397                  cur = nxt)
1398             {
1399                 /* remember the next item, in case we remove this one */
1400                 nxt = cur->nxt_;
1401 
1402                 /* if this item has minimum badness, move it */
1403                 if (cur->altp_->badness == min_badness)
1404                 {
1405                     /* unlink it from the badness queue */
1406                     if (prv == 0)
1407                         queues->badness_queue_ = cur->nxt_;
1408                     else
1409                         prv->nxt_ = cur->nxt_;
1410 
1411                     /* link it into the work queue */
1412                     cur->nxt_ = queues->work_queue_;
1413                     queues->work_queue_ = cur;
1414                 }
1415                 else
1416                 {
1417                     /*
1418                      *   this item is staying in the list - note it as the
1419                      *   item preceding the next item
1420                      */
1421                     prv = cur;
1422                 }
1423             }
1424         }
1425 
1426         /* if the work queue is still empty, we're out of work to do */
1427         if (queues->work_queue_ == 0)
1428             break;
1429 
1430         /* process the head of the work queue */
1431         process_work_queue_head(vmg_ mem, tok, tok_cnt, queues, dict);
1432     }
1433 }
1434 
1435 /*
1436  *   Process a work queue entry
1437  */
process_work_queue_head(VMG_ CVmGramProdMem * mem,const vmgramprod_tok * tok,size_t tok_cnt,CVmGramProdQueue * queues,CVmObjDict * dict)1438 void CVmObjGramProd::process_work_queue_head(VMG_ CVmGramProdMem *mem,
1439                                              const vmgramprod_tok *tok,
1440                                              size_t tok_cnt,
1441                                              CVmGramProdQueue *queues,
1442                                              CVmObjDict *dict)
1443 {
1444     CVmGramProdState *state;
1445     CVmGramProdMatch *match;
1446     const vmgram_tok_info *tokp;
1447     int tok_matched_star;
1448     vm_prop_id_t enclosing_target_prop;
1449 
1450     /* get the first entry from the queue */
1451     state = queues->work_queue_;
1452 
1453     /* unlink this entry from the queue */
1454     queues->work_queue_ = state->nxt_;
1455     state->nxt_ = 0;
1456 
1457     /* get the token pointer for the next active entry in the alternative */
1458     tokp = state->altp_->toks + state->alt_pos_;
1459 
1460     /* presume we won't match a '*' token */
1461     tok_matched_star = FALSE;
1462 
1463     /* process the remaining items in the entry */
1464     while (state->alt_pos_ < state->altp_->tok_cnt)
1465     {
1466         int match;
1467         vm_val_t match_result;
1468 
1469         /* presume we won't find a match */
1470         match_result.set_nil();
1471 
1472         /* see what we have for this token */
1473         switch(tokp->typ)
1474         {
1475         case VMGRAM_MATCH_PROD:
1476             {
1477                 vm_obj_id_t sub_obj_id;
1478                 CVmObjGramProd *sub_objp;
1479 
1480                 /*
1481                  *   This is a sub-production node.  Get the
1482                  *   sub-production object.
1483                  */
1484                 sub_obj_id = tokp->typinfo.prod_obj;
1485 
1486                 /* make sure it's of the correct metaclass */
1487                 if (!CVmObjGramProd::is_gramprod_obj(vmg_ sub_obj_id))
1488                 {
1489                     /* wrong type - throw an error */
1490                     err_throw(VMERR_INVAL_OBJ_TYPE);
1491                 }
1492 
1493                 /* get the sub-production object */
1494                 sub_objp = (CVmObjGramProd *)vm_objp(vmg_ sub_obj_id);
1495 
1496                 /*
1497                  *   set my subproduction target property, so that the
1498                  *   sub-production can set the target property correctly
1499                  */
1500                 state->sub_target_prop_ = tokp->prop;
1501 
1502                 /* enqueue the alternatives for the sub-production */
1503                 sub_objp->enqueue_alts(vmg_ mem, tok, tok_cnt,
1504                                        state->tok_pos_, state, queues,
1505                                        sub_obj_id, FALSE, 0, dict);
1506             }
1507 
1508             /*
1509              *   Do not process the current state any further for now -
1510              *   we'll get back to it when (and if) we finish processing
1511              *   the sub-production.  Note that we don't even put the
1512              *   current state back in the queue - it's stacked behind the
1513              *   sub-production, and will be re-enqueued when we
1514              *   successfully finish with the sub-production.
1515              */
1516             return;
1517 
1518         case VMGRAM_MATCH_SPEECH:
1519             /* part of speech - check to see if the current token matches */
1520             match = (state->tok_pos_ < tok_cnt
1521                      && find_prop_in_tok(&tok[state->tok_pos_],
1522                                          tokp->typinfo.speech_prop));
1523             break;
1524 
1525         case VMGRAM_MATCH_NSPEECH:
1526             /*
1527              *   multiple parts of speech - check to see if the current token
1528              *   matches any of the parts of the speech
1529              */
1530             if (state->tok_pos_ < tok_cnt)
1531             {
1532                 size_t rem;
1533                 const vm_prop_id_t *prop;
1534 
1535                 /* presume we won't find a match */
1536                 match = FALSE;
1537 
1538                 /* check each item in the list */
1539                 for (prop = tokp->typinfo.nspeech.props,
1540                      rem = tokp->typinfo.nspeech.cnt ;
1541                      rem != 0 ; --rem, ++prop)
1542                 {
1543                     /* if this one matches, we have a match */
1544                     if (find_prop_in_tok(&tok[state->tok_pos_], *prop))
1545                     {
1546                         /* note the match */
1547                         match = TRUE;
1548 
1549                         /* no need to look any further at this token */
1550                         break;
1551                     }
1552                 }
1553             }
1554             else
1555             {
1556                 /*
1557                  *   we're out of tokens, so we definitely don't have a
1558                  *   match, since we must match at least one of the possible
1559                  *   parts of speech of this item
1560                  */
1561                 match = FALSE;
1562             }
1563             break;
1564 
1565         case VMGRAM_MATCH_LITERAL:
1566             /*
1567              *   Literal - check for a match to the string.  Test the hash
1568              *   values first, as this is much faster than comparing the
1569              *   strings, and at least tells us if the strings fail to match
1570              *   (and failing to match being by far the most common case,
1571              *   this saves us doing a full comparison most of the time).
1572              */
1573             match = (state->tok_pos_ < tok_cnt
1574                      && tokp->typinfo.lit.hash == tok[state->tok_pos_].hash_
1575                      && tok_equals_lit(vmg_ &tok[state->tok_pos_],
1576                                        tokp->typinfo.lit.str,
1577                                        tokp->typinfo.lit.len,
1578                                        dict, &match_result));
1579             break;
1580 
1581         case VMGRAM_MATCH_TOKTYPE:
1582             /* token type */
1583             match = (state->tok_pos_ < tok_cnt
1584                      && (tok[state->tok_pos_].typ_
1585                          == tokp->typinfo.toktyp_enum));
1586             break;
1587 
1588         case VMGRAM_MATCH_STAR:
1589             /*
1590              *   this matches anything remaining (it also matches if
1591              *   there's nothing remaining)
1592              */
1593             match = TRUE;
1594 
1595             /* note that we matched a star in the state */
1596             state->matched_star_ = TRUE;
1597 
1598             /* note that we matched a star for this particular token */
1599             tok_matched_star = TRUE;
1600             break;
1601         }
1602 
1603         /* check for a match */
1604         if (match)
1605         {
1606             /*
1607              *   we matched this token - add a token match to our state's
1608              *   match list for the current position
1609              */
1610             state->match_list_[state->alt_pos_] =
1611                 CVmGramProdMatch::alloc(mem, state->tok_pos_, &match_result,
1612                                         tok_matched_star, tokp->prop,
1613                                         VM_INVALID_OBJ, 0, 0);
1614 
1615             /* we're done with this alternative item - move on */
1616             state->alt_pos_++;
1617 
1618             /*
1619              *   If we matched a real token, consume the matched input
1620              *   token.  For a '*', we don't want to consume anything, since
1621              *   a '*' merely stops parsing and doesn't actually match
1622              *   anything.
1623              */
1624             if (!state->matched_star_)
1625                 state->tok_pos_++;
1626 
1627             /* move on to the next alternative token item */
1628             ++tokp;
1629         }
1630         else
1631         {
1632             /*
1633              *   This is not a match - reject this alternative.  We do not
1634              *   need to process this item further, so we can stop now,
1635              *   without returning this state to the work queue.  Simply
1636              *   abandon this work queue item.
1637              */
1638             return;
1639         }
1640     }
1641 
1642     /*
1643      *   If we make it here, we've reached the end of this alternative and
1644      *   have matched everything in it.  This means that the alternative
1645      *   was successful, so we can perform a 'reduce' operation to replace
1646      *   the matched tokens with this production.
1647      */
1648 
1649     /* if we have an enclosing state, get its target property */
1650     enclosing_target_prop = (state->enclosing_ != 0
1651                              ? state->enclosing_->sub_target_prop_
1652                              : VM_INVALID_PROP);
1653 
1654     /* create a match for the entire alternative (i.e., the subproduction) */
1655     match = CVmGramProdMatch::alloc(mem, 0, 0, FALSE, enclosing_target_prop,
1656                                     state->altp_->proc_obj,
1657                                     &state->match_list_[0],
1658                                     state->altp_->tok_cnt);
1659 
1660     /*
1661      *   If the state we just matched was marked on creation as coming
1662      *   from an alternative with circular alternatives, we've just come
1663      *   up with a match for each circular alternative.  To avoid infinite
1664      *   recursion, we never enqueue the circular alternatives; however,
1665      *   now that we know we have a match for the first element of each
1666      *   circular alternative, we can check each of them to see if we can
1667      *   enqueue them.
1668      */
1669     if (state->circular_alt_)
1670     {
1671         vm_obj_id_t prod_obj_id;
1672         CVmObjGramProd *prod_objp;
1673 
1674         /*
1675          *   Get the production from which this alternative came (there
1676          *   should be no need to check the metaclass, since we could only
1677          *   have created the state object from a valid production in the
1678          *   first place).
1679          *
1680          *   First, make sure it's of the correct class.  (It almost
1681          *   certainly is, since the compiler should have generated this
1682          *   reference automatically, so there should be no possibility of a
1683          *   user error.  However, if it's not a production object, we'll
1684          *   probably crash by blindly casting it to one; so we'll check
1685          *   here, just to be sure that the compiler didn't do something
1686          *   wrong and the image file didn't become corrupted.)
1687          */
1688         if (!CVmObjGramProd::is_gramprod_obj(vmg_ state->prod_obj_))
1689         {
1690             /* wrong type - throw an error */
1691             err_throw(VMERR_INVAL_OBJ_TYPE);
1692         }
1693 
1694         /* get the object, properly cast */
1695         prod_obj_id = state->prod_obj_;
1696         prod_objp = (CVmObjGramProd *)vm_objp(vmg_ prod_obj_id);
1697 
1698         /*
1699          *   Enqueue the matching circular alternatives from the original
1700          *   production.  Our match becomes the first token match in the
1701          *   circular alternative (i.e., it matches the leading circular
1702          *   reference element).
1703          */
1704         prod_objp->enqueue_alts(vmg_ mem, tok, tok_cnt, state->tok_pos_,
1705                                 state->enclosing_, queues, prod_obj_id,
1706                                 TRUE, match, dict);
1707     }
1708 
1709     /* check for an enclosing state to pop */
1710     if (state->enclosing_ != 0)
1711     {
1712         /* add the match to the enclosing state's match list */
1713         state->enclosing_->match_list_[state->enclosing_->alt_pos_] = match;
1714         state->enclosing_->alt_pos_++;
1715 
1716         /*
1717          *   Move the enclosing state's token position to our token position
1718          *   - since it now encompasses our match, it has consumed all of
1719          *   the tokens therein.  Likewise, set the '*' flag in the parent
1720          *   to our own setting.
1721          */
1722         state->enclosing_->tok_pos_ = state->tok_pos_;
1723         state->enclosing_->matched_star_ = state->matched_star_;
1724 
1725         /* enqueue the enclosing state so we can continue parsing it */
1726         enqueue_state(state->enclosing_, queues);
1727     }
1728     else
1729     {
1730         CVmGramProdMatchEntry *entry;
1731 
1732         /*
1733          *   This is a top-level state, so we've completed parsing.  If
1734          *   we've consumed all input, or we matched a '*' token, we were
1735          *   successful.  If there's more input remaining, this is not a
1736          *   match.
1737          */
1738         if (state->tok_pos_ < tok_cnt && !state->matched_star_)
1739         {
1740             /*
1741              *   There's more input remaining, so this isn't a match.
1742              *   Reject this alternative.  We do not need to process this
1743              *   item further, so stop now without returning this state
1744              *   object to the work queue.
1745              */
1746 
1747             /* abandon this work queue item */
1748             return;
1749         }
1750 
1751         /* create a new entry for the match list */
1752         entry = (CVmGramProdMatchEntry *)mem->alloc(sizeof(*entry));
1753         entry->match_ = match;
1754         entry->tok_pos_ = state->tok_pos_;
1755 
1756         /* link the entry into the list */
1757         entry->nxt_ = queues->success_list_;
1758         queues->success_list_ = entry;
1759     }
1760 }
1761 
1762 /*
1763  *   Visit my alternatives and enqueue each one that is a possible match
1764  *   for a given token list.
1765  */
enqueue_alts(VMG_ CVmGramProdMem * mem,const vmgramprod_tok * tok,size_t tok_cnt,size_t start_tok_pos,CVmGramProdState * enclosing_state,CVmGramProdQueue * queues,vm_obj_id_t self,int circ_only,CVmGramProdMatch * circ_match,CVmObjDict * dict)1766 void CVmObjGramProd::enqueue_alts(VMG_ CVmGramProdMem *mem,
1767                                   const vmgramprod_tok *tok,
1768                                   size_t tok_cnt, size_t start_tok_pos,
1769                                   CVmGramProdState *enclosing_state,
1770                                   CVmGramProdQueue *queues, vm_obj_id_t self,
1771                                   int circ_only,
1772                                   CVmGramProdMatch *circ_match,
1773                                   CVmObjDict *dict)
1774 {
1775     const vmgram_alt_info *altp;
1776     size_t i;
1777     int need_to_clone;
1778     int has_circ;
1779     vm_obj_id_t comparator;
1780 
1781     /*
1782      *   We don't need to clone the enclosing state until we've set up one
1783      *   new state referring back to it.  However, if we're doing circular
1784      *   alternatives, the enclosing state could also be enqueued
1785      *   separately as its own state, so do clone all required copies in
1786      *   this case.
1787      */
1788     need_to_clone = circ_only;
1789 
1790     /* note whether or not we have any circular alternatives */
1791     has_circ = get_ext()->has_circular_alt;
1792 
1793     /*
1794      *   Cache hash values for all of our literals.  We need to do this if we
1795      *   haven't done it before, or if the comparator associated with the
1796      *   dictionary is different than it was last time we cached the values.
1797      */
1798     comparator = (dict != 0 ? dict->get_comparator() : VM_INVALID_OBJ);
1799     if (!get_ext()->hashes_cached_ || get_ext()->comparator_ != comparator)
1800         cache_hashes(vmg_ dict);
1801 
1802     /*
1803      *   run through our alternatives and enqueue each one that looks
1804      *   plausible
1805      */
1806     for (altp = get_ext()->alts_, i = get_ext()->alt_cnt_ ; i != 0 ;
1807          --i, ++altp)
1808     {
1809         vmgram_tok_info *first_tokp;
1810         vmgram_tok_info *tokp;
1811         size_t cur_alt_tok;
1812         size_t alt_tok_cnt;
1813         size_t tok_idx;
1814         int found_mismatch;
1815         int prod_before;
1816         int is_circ;
1817 
1818         /* start at the first token remaining in the string */
1819         tok_idx = start_tok_pos;
1820 
1821         /* get the first token for this alternative */
1822         tokp = first_tokp = altp->toks;
1823 
1824         /* get the number of tokens in the alternative */
1825         alt_tok_cnt = altp->tok_cnt;
1826 
1827         /* presume we will find no mismatches in this check */
1828         found_mismatch = FALSE;
1829 
1830         /*
1831          *   note whether this alternative is a direct circular reference
1832          *   or not (it is directly circular if the first token points
1833          *   back to this same production)
1834          */
1835         is_circ = (alt_tok_cnt != 0
1836                    && tokp->typ == VMGRAM_MATCH_PROD
1837                    && tokp->typinfo.prod_obj == self);
1838 
1839         /*
1840          *   we don't have a production before the current item (this is
1841          *   important because it determines whether we must consider the
1842          *   current token or any following token when trying to match a
1843          *   literal, part of speech, or token type)
1844          */
1845         prod_before = FALSE;
1846 
1847         /* scan the tokens in the alternative */
1848         for (cur_alt_tok = 0 ; cur_alt_tok < alt_tok_cnt ;
1849              ++cur_alt_tok, ++tokp)
1850         {
1851             int match;
1852             int match_star;
1853             int ignore_item;
1854             vm_val_t match_result;
1855 
1856             /* presume we won't find a match */
1857             match = FALSE;
1858             match_star = FALSE;
1859 
1860             /* presume we won't be able to ignore the item */
1861             ignore_item = FALSE;
1862 
1863             /*
1864              *   Try to find a match to the current alternative item.  If
1865              *   we've already found a mismatch, don't bother, since we need
1866              *   no further evidence to skip this alternative; in this case
1867              *   we're still looping over the alternative items simply to
1868              *   find the beginning of the next alternative.
1869              */
1870             for ( ; !found_mismatch ; ++tok_idx)
1871             {
1872                 /* see what kind of item we have */
1873                 switch(tokp->typ)
1874                 {
1875                 case VMGRAM_MATCH_PROD:
1876                     /*
1877                      *   We can't rule out a production without checking
1878                      *   it fully, which we're not doing on this pass;
1879                      *   however, note that we found a production, because
1880                      *   it means that we must from now on try skipping
1881                      *   any number of tokens before ruling out a match on
1882                      *   other grounds, since the production could
1883                      *   potentially match any number of tokens.
1884                      *
1885                      *   If we're doing circular matches only, and this is
1886                      *   a circular production, and we're on the first
1887                      *   token in our list, we already know it matches, so
1888                      *   don't even count it as a production before the
1889                      *   item.
1890                      */
1891                     if (is_circ && circ_only && cur_alt_tok == 0)
1892                     {
1893                         /*
1894                          *   ignore the fact that it's a production, since
1895                          *   we already know it matches - we don't want to
1896                          *   be flexible about tokens following this item
1897                          *   since we know the exact number that match it
1898                          */
1899                     }
1900                     else
1901                     {
1902                         /* note the production item */
1903                         prod_before = TRUE;
1904                     }
1905 
1906                     /*
1907                      *   we can ignore this item for the purposes of
1908                      *   detecting a mismatch, because we can't tell
1909                      *   during this superficial scan whether it matches
1910                      */
1911                     ignore_item = TRUE;
1912                     break;
1913 
1914                 case VMGRAM_MATCH_SPEECH:
1915                     /* we must match a part of speech */
1916                     if (tok_idx < tok_cnt
1917                         && find_prop_in_tok(&tok[tok_idx],
1918                                             tokp->typinfo.speech_prop))
1919                     {
1920                         /* it's a match */
1921                         match = TRUE;
1922                     }
1923                     break;
1924 
1925                 case VMGRAM_MATCH_NSPEECH:
1926                     /* multiple parts of speech */
1927                     if (tok_idx < tok_cnt)
1928                     {
1929                         vm_prop_id_t *prop;
1930                         size_t rem;
1931 
1932                         /* presume we won't find a match */
1933                         match = FALSE;
1934 
1935                         /* check each item in the list */
1936                         for (prop = tokp->typinfo.nspeech.props,
1937                              rem = tokp->typinfo.nspeech.cnt ; rem != 0 ;
1938                              --rem, ++prop)
1939                         {
1940                             /* if this one matches, we have a match */
1941                             if (find_prop_in_tok(&tok[tok_idx], *prop))
1942                             {
1943                                 /* note the match */
1944                                 match = TRUE;
1945 
1946                                 /* no need to look any further */
1947                                 break;
1948                             }
1949                         }
1950                     }
1951                     else
1952                     {
1953                         /* we're out of tokens, so we have no match */
1954                         match = FALSE;
1955                     }
1956                     break;
1957 
1958                 case VMGRAM_MATCH_LITERAL:
1959                     /*
1960                      *   Match a literal.  Compare the hash values first,
1961                      *   since a negative match on the hash values will tell
1962                      *   us for sure that the text won't match; since not
1963                      *   matching is by far the most common case, this saves
1964                      *   us the work of doing full string comparisons most of
1965                      *   the time.
1966                      */
1967                     if (tok_idx < tok_cnt
1968                         && tok[tok_idx].txt_ != 0
1969                         && tokp->typinfo.lit.hash == tok[tok_idx].hash_
1970                         && tok_equals_lit(vmg_ &tok[tok_idx],
1971                                           tokp->typinfo.lit.str,
1972                                           tokp->typinfo.lit.len,
1973                                           dict, &match_result))
1974                     {
1975                         /* it's a match */
1976                         match = TRUE;
1977                     }
1978                     break;
1979 
1980                 case VMGRAM_MATCH_TOKTYPE:
1981                     /* we must match a token type */
1982                     if (tok_idx < tok_cnt
1983                         && tok[tok_idx].typ_ == tokp->typinfo.toktyp_enum)
1984                         match = TRUE;
1985                     break;
1986 
1987                 case VMGRAM_MATCH_STAR:
1988                     /* consume everything remaining on the line */
1989                     tok_idx = tok_cnt;
1990 
1991                     /* this is a match */
1992                     match = TRUE;
1993                     match_star = TRUE;
1994                     break;
1995                 }
1996 
1997                 /* if we found a match, we can stop scanning now */
1998                 if (match)
1999                     break;
2000 
2001                 /*
2002                  *   we didn't match this token - if we have a production
2003                  *   before this point, we must keep scanning token, since
2004                  *   the production could end up matching any number of
2005                  *   tokens; if we don't have a production item, though,
2006                  *   we don't need to look any further, and we can rule
2007                  *   out this alternative immediately
2008                  */
2009                 if (!prod_before)
2010                     break;
2011 
2012                 /*
2013                  *   if we're ignoring this item (because we can't decide
2014                  *   now whether it's a match or not, and hence can't rule
2015                  *   it out), don't scan any tokens for the item
2016                  */
2017                 if (ignore_item)
2018                     break;
2019 
2020                 /*
2021                  *   if we didn't find a match, and we're out of tokens,
2022                  *   stop looking - we have no more tokens to look at to
2023                  *   find a match
2024                  */
2025                 if (!match && tok_idx >= tok_cnt)
2026                     break;
2027             }
2028 
2029             /* check to see if we found a match */
2030             if (match)
2031             {
2032                 /*
2033                  *   we found a match - skip the current token (unless we
2034                  *   matched a '*' item, in which case we've already
2035                  *   consumed all remaining tokens)
2036                  */
2037                 if (!match_star && tok_idx < tok_cnt)
2038                     ++tok_idx;
2039             }
2040             else if (!ignore_item)
2041             {
2042                 /*
2043                  *   we didn't match, and we can't ignore this item - note
2044                  *   that the alternative does not match
2045                  */
2046                 found_mismatch = TRUE;
2047 
2048                 /* finish up with this alternative and proceed to the next */
2049                 break;
2050             }
2051         }
2052 
2053         /*
2054          *   If we didn't find a reason to rule out this alternative,
2055          *   enqueue the alternative.  Never enqueue circular references,
2056          *   unless we're ONLY enqueuing circular references.
2057          */
2058         if (!found_mismatch
2059             && ((!circ_only && !is_circ)
2060                 || (circ_only && is_circ)))
2061         {
2062             CVmGramProdState *state;
2063 
2064             /* create and enqueue the new state */
2065             state = enqueue_new_state(mem, start_tok_pos, enclosing_state,
2066                                       altp, self, &need_to_clone, queues,
2067                                       has_circ);
2068 
2069             /*
2070              *   if we are enqueuing circular references, we already have
2071              *   a match for the first (circular) token, so fill it in
2072              */
2073             if (circ_only)
2074             {
2075                 CVmGramProdMatch *match;
2076 
2077                 /* create a match for the alternative */
2078                 match = CVmGramProdMatch::
2079                         alloc(mem, 0, 0, FALSE, first_tokp->prop,
2080                               circ_match->proc_obj_,
2081                               circ_match->sub_match_list_,
2082                               circ_match->sub_match_cnt_);
2083 
2084                 /* set the first match in the token */
2085                 state->match_list_[0] = match;
2086 
2087                 /* set up at the second token */
2088                 state->alt_pos_ = 1;
2089             }
2090         }
2091     }
2092 }
2093 
2094 /*
2095  *   Cache hash values for all of our literal tokens
2096  */
cache_hashes(VMG_ CVmObjDict * dict)2097 void CVmObjGramProd::cache_hashes(VMG_ CVmObjDict *dict)
2098 {
2099     vm_obj_id_t comparator;
2100     vmgram_alt_info *alt;
2101     size_t i;
2102 
2103     /* get the comparator */
2104     comparator = (dict != 0 ? dict->get_comparator() : VM_INVALID_OBJ);
2105 
2106     /* run through our alternatives */
2107     for (i = get_ext()->alt_cnt_, alt = get_ext()->alts_ ;
2108          i != 0 ; --i, ++alt)
2109     {
2110         vmgram_tok_info *tok;
2111         size_t j;
2112 
2113         /* run through our tokens */
2114         for (j = alt->tok_cnt, tok = alt->toks ; j != 0 ; --j, ++tok)
2115         {
2116             /* if this is a literal token, calculate its hash value */
2117             if (tok->typ == VMGRAM_MATCH_LITERAL)
2118             {
2119                 /* calculate and store our hash value */
2120                 tok->typinfo.lit.hash = calc_hash(
2121                     vmg_ dict, 0, tok->typinfo.lit.str, tok->typinfo.lit.len);
2122             }
2123         }
2124     }
2125 
2126     /* note that we've cached hash values, and note the comparator we used */
2127     get_ext()->hashes_cached_ = TRUE;
2128     get_ext()->comparator_ = comparator;
2129 }
2130 
2131 /*
2132  *   Create and enqueue a new state
2133  */
2134 CVmGramProdState *CVmObjGramProd::
enqueue_new_state(CVmGramProdMem * mem,size_t start_tok_pos,CVmGramProdState * enclosing_state,const vmgram_alt_info * altp,vm_obj_id_t self,int * need_to_clone,CVmGramProdQueue * queues,int circular_alt)2135    enqueue_new_state(CVmGramProdMem *mem,
2136                      size_t start_tok_pos,
2137                      CVmGramProdState *enclosing_state,
2138                      const vmgram_alt_info *altp, vm_obj_id_t self,
2139                      int *need_to_clone, CVmGramProdQueue *queues,
2140                      int circular_alt)
2141 {
2142     CVmGramProdState *state;
2143 
2144     /* create the new state */
2145     state = create_new_state(mem, start_tok_pos, enclosing_state,
2146                              altp, self, need_to_clone, circular_alt);
2147 
2148     /*
2149      *   Add the item to the appropriate queue.  If the item has an
2150      *   associated badness, add it to the badness queue.  Otherwise, add
2151      *   it to the work queue.
2152      */
2153     if (altp->badness != 0)
2154     {
2155         /*
2156          *   we have a badness rating - add it to the badness queue, since
2157          *   we don't want to process it until we entirely exhaust better
2158          *   possibilities
2159          */
2160         state->nxt_ = queues->badness_queue_;
2161         queues->badness_queue_ = state;
2162     }
2163     else
2164     {
2165         /* enqueue the state */
2166         enqueue_state(state, queues);
2167     }
2168 
2169     /* return the new state */
2170     return state;
2171 }
2172 
2173 /*
2174  *   Create a new state
2175  */
2176 CVmGramProdState *CVmObjGramProd::
create_new_state(CVmGramProdMem * mem,size_t start_tok_pos,CVmGramProdState * enclosing_state,const vmgram_alt_info * altp,vm_obj_id_t self,int * need_to_clone,int circular_alt)2177    create_new_state(CVmGramProdMem *mem, size_t start_tok_pos,
2178                     CVmGramProdState *enclosing_state,
2179                     const vmgram_alt_info *altp, vm_obj_id_t self,
2180                     int *need_to_clone, int circular_alt)
2181 {
2182     /*
2183      *   if necessary, clone the enclosing state; we need to do this if we
2184      *   enqueue more than one nested alternative, since each nested
2185      *   alternative could parse the rest of the token list differently
2186      *   and thus provide a different result to the enclosing state
2187      */
2188     if (*need_to_clone && enclosing_state != 0)
2189         enclosing_state = enclosing_state->clone(mem);
2190 
2191     /*
2192      *   we will need to clone the enclosing state if we create another
2193      *   alternative after this one
2194      */
2195     *need_to_clone = TRUE;
2196 
2197     /* create a new state object for the alternative */
2198     return CVmGramProdState::alloc(mem, start_tok_pos, 0,
2199                                    enclosing_state, altp, self,
2200                                    circular_alt);
2201 }
2202 
2203 /*
2204  *   Enqueue a state in the work queue
2205  */
enqueue_state(CVmGramProdState * state,CVmGramProdQueue * queues)2206 void CVmObjGramProd::enqueue_state(CVmGramProdState *state,
2207                                    CVmGramProdQueue *queues)
2208 {
2209     /* add it to the work queue */
2210     state->nxt_ = queues->work_queue_;
2211     queues->work_queue_ = state;
2212 }
2213 
2214 /*
2215  *   Determine if a token from the input matches a literal token, allowing
2216  *   the input token to be a truncated version of the literal token if the
2217  *   given truncation length is non-zero.
2218  */
tok_equals_lit(VMG_ const struct vmgramprod_tok * tok,const char * lit,size_t lit_len,CVmObjDict * dict,vm_val_t * result_val)2219 int CVmObjGramProd::tok_equals_lit(VMG_ const struct vmgramprod_tok *tok,
2220                                    const char *lit, size_t lit_len,
2221                                    CVmObjDict *dict, vm_val_t *result_val)
2222 {
2223     /*
2224      *   if there's a dictionary, ask it to do the comparison; otherwise,
2225      *   use an exact literal match
2226      */
2227     if (dict != 0)
2228     {
2229         /* ask the dictionary for the comparison */
2230         return dict->match_strings(vmg_ &tok->val_, tok->txt_, tok->len_,
2231                                    lit, lit_len, result_val);
2232     }
2233     else
2234     {
2235         /* perform an exact comparison */
2236         result_val->set_int(tok->len_ == lit_len
2237                             && memcmp(tok->txt_, lit, lit_len) == 0);
2238         return result_val->val.intval;
2239     }
2240 }
2241 
2242 /*
2243  *   Calculate the hash value for a literal token, using our dictionary's
2244  *   comparator.
2245  */
calc_hash(VMG_ CVmObjDict * dict,const vm_val_t * strval,const char * str,size_t len)2246 unsigned int CVmObjGramProd::calc_hash(VMG_ CVmObjDict *dict,
2247                                        const vm_val_t *strval,
2248                                        const char *str, size_t len)
2249 {
2250     /*
2251      *   if we have a dictionary, ask it to ask it for the hash value, as the
2252      *   dictionary will ask its comparator to do the work; otherwise,
2253      *   calculate our own hash value
2254      */
2255     if (dict != 0)
2256     {
2257         /*
2258          *   We're doing comparisons through the dictionary's comparator, so
2259          *   we must calculate hash values using the comparator as well to
2260          *   keep the hash and match operations consistent.
2261          */
2262         return dict->calc_hash(vmg_ strval, str, len);
2263     }
2264     else
2265     {
2266         uint hash;
2267 
2268         /*
2269          *   We don't have a dictionary, so we're doing our own string
2270          *   comparisons, which we do as exact byte-for-byte matches.  We can
2271          *   thus calculate an arbitrary hash value of our own devising.
2272          */
2273         for (hash = 0 ; len != 0 ; ++str, --len)
2274             hash += (unsigned char)*str;
2275 
2276         /* return the hash value we calculated */
2277         return hash;
2278     }
2279 }
2280 
2281 
2282 /*
2283  *   Find a property in a token's list.  Returns true if the property is
2284  *   there, false if not.
2285  */
find_prop_in_tok(const vmgramprod_tok * tok,vm_prop_id_t prop)2286 int CVmObjGramProd::find_prop_in_tok(const vmgramprod_tok *tok,
2287                                      vm_prop_id_t prop)
2288 {
2289     size_t i;
2290     const vmgram_match_info *p;
2291 
2292     /* search for the property */
2293     for (i = tok->match_cnt_, p = tok->matches_ ; i != 0 ; --i, ++p)
2294     {
2295         /* if this is the one we're looking for, return success */
2296         if (p->prop == prop)
2297             return TRUE;
2298     }
2299 
2300     /* we didn't find it */
2301     return FALSE;
2302 }
2303 
2304 /*
2305  *   Get the next token item after the given item in an alternative
2306  */
get_next_alt_tok(const char * tokp)2307 const char *CVmObjGramProd::get_next_alt_tok(const char *tokp)
2308 {
2309     switch(vmgram_tok_type(tokp))
2310     {
2311     case VMGRAM_MATCH_PROD:
2312         return tokp + VMGRAM_TOK_PROD_SIZE;
2313 
2314     case VMGRAM_MATCH_SPEECH:
2315         return tokp + VMGRAM_TOK_SPEECH_SIZE;
2316 
2317     case VMGRAM_MATCH_NSPEECH:
2318         return tokp + VMGRAM_TOK_NSPEECH_SIZE(tokp);
2319 
2320     case VMGRAM_MATCH_LITERAL:
2321         return tokp + VMGRAM_TOK_LIT_SIZE(tokp);
2322 
2323     case VMGRAM_MATCH_TOKTYPE:
2324         return tokp + VMGRAM_TOK_TYPE_SIZE;
2325 
2326     case VMGRAM_MATCH_STAR:
2327         return tokp + VMGRAM_TOK_STAR_SIZE;
2328 
2329     default:
2330         err_throw(VMERR_INVAL_METACLASS_DATA);
2331         return 0;
2332     }
2333 }
2334 
2335