1 #ifdef RCSID
2 static char RCSid[] =
3 "$Header$";
4 #endif
5
6 /*
7 * Copyright (c) 2000, 2002 Michael J. Roberts. All Rights Reserved.
8 *
9 * Please see the accompanying license file, LICENSE.TXT, for information
10 * on using and copying this software.
11 */
12 /*
13 Name
14 vmgram.cpp - T3 Grammar Production metaclass
15 Function
16
17 Notes
18
19 Modified
20 02/15/00 MJRoberts - Creation
21 */
22
23 #include <stdlib.h>
24 #include <string.h>
25
26 #include "t3std.h"
27 #include "vmtype.h"
28 #include "vmstack.h"
29 #include "vmlst.h"
30 #include "vmgram.h"
31 #include "vmerr.h"
32 #include "vmerrnum.h"
33 #include "vmmeta.h"
34 #include "vmdict.h"
35 #include "vmtobj.h"
36 #include "vmpredef.h"
37
38
39 /* ------------------------------------------------------------------------ */
40 /*
41 * Grammar production object - memory pool. This is a simple
42 * suballocator that obtains large blocks from the system allocator,
43 * then hands out pieces of those blocks. Allocating is very cheap
44 * (just a pointer increment in most cases), and we don't track
45 * individual allocations and deletions but just throw away all
46 * suballocated memory at once.
47 */
48
49 /* size of each memory block */
50 const size_t VMGRAMPROD_MEM_BLOCK_SIZE = 16*1024;
51
52 /*
53 * memory pool block
54 */
55 struct CVmGramProdMemBlock
56 {
57 /* next block in chain */
58 CVmGramProdMemBlock *nxt_;
59
60 /* bytes of the block */
61 char buf_[VMGRAMPROD_MEM_BLOCK_SIZE];
62 };
63
64 /*
65 * memory pool object
66 */
67 class CVmGramProdMem
68 {
69 public:
CVmGramProdMem()70 CVmGramProdMem()
71 {
72 /* no memory blocks yet */
73 block_head_ = 0;
74 block_cur_ = 0;
75
76 /* we don't yet have a block */
77 cur_rem_ = 0;
78 cur_free_ = 0;
79 }
80
~CVmGramProdMem()81 ~CVmGramProdMem()
82 {
83 /* delete each of our blocks */
84 while (block_head_ != 0)
85 {
86 CVmGramProdMemBlock *nxt;
87
88 /* remember the next block */
89 nxt = block_head_->nxt_;
90
91 /* delete this block */
92 t3free(block_head_);
93
94 /* move on to the next */
95 block_head_ = nxt;
96 }
97 }
98
99 /* reset - delete all previously sub-allocated memory */
reset()100 void reset()
101 {
102 /* start over with the first block */
103 block_cur_ = block_head_;
104
105 /* initialize the free pointer for the new block, if we have one */
106 if (block_cur_ != 0)
107 {
108 /* start at the beginning of the new block */
109 cur_free_ = block_cur_->buf_;
110
111 /* this entire block is available */
112 cur_rem_ = VMGRAMPROD_MEM_BLOCK_SIZE;
113 }
114 else
115 {
116 /* there are no blocks, so there's no memory available yet */
117 cur_rem_ = 0;
118 cur_free_ = 0;
119 }
120 }
121
122 /* allocate memory */
alloc(size_t siz)123 void *alloc(size_t siz)
124 {
125 void *ret;
126
127 /* round the size to the local hardware boundary */
128 siz = osrndsz(siz);
129
130 /* if it exceeds the block size, fail */
131 if (siz > VMGRAMPROD_MEM_BLOCK_SIZE)
132 return 0;
133
134 /* if we don't have enough space in this block, go to the next */
135 if (siz > cur_rem_)
136 {
137 /* allocate a new block if necessary */
138 if (block_cur_ == 0)
139 {
140 /* there's nothing in the list yet - set up the first block */
141 block_head_ = (CVmGramProdMemBlock *)
142 t3malloc(sizeof(CVmGramProdMemBlock));
143
144 /* activate the new block */
145 block_cur_ = block_head_;
146
147 /* there's nothing after this block */
148 block_cur_->nxt_ = 0;
149 }
150 else if (block_cur_->nxt_ == 0)
151 {
152 /* we're at the end of the list - add a block */
153 block_cur_->nxt_ = (CVmGramProdMemBlock *)
154 t3malloc(sizeof(CVmGramProdMemBlock));
155
156 /* advance to the new block */
157 block_cur_ = block_cur_->nxt_;
158
159 /* the new block is the last in the chain */
160 block_cur_->nxt_ = 0;
161 }
162 else
163 {
164 /* another block follows - advance to it */
165 block_cur_ = block_cur_->nxt_;
166 }
167
168 /* start at the beginning of the new block */
169 cur_free_ = block_cur_->buf_;
170
171 /* this entire block is available */
172 cur_rem_ = VMGRAMPROD_MEM_BLOCK_SIZE;
173 }
174
175 /* the return value will be the current free pointer */
176 ret = cur_free_;
177
178 /* consume the space */
179 cur_free_ += siz;
180 cur_rem_ -= siz;
181
182 /* return the block location */
183 return ret;
184 }
185
186 private:
187 /* head of block list */
188 CVmGramProdMemBlock *block_head_;
189
190 /* block we're currently suballocating out of */
191 CVmGramProdMemBlock *block_cur_;
192
193 /* current free pointer in current block */
194 char *cur_free_;
195
196 /* amount of space remaining in current block */
197 size_t cur_rem_;
198 };
199
200 /* ------------------------------------------------------------------------ */
201 /*
202 * Input token array entry. We make a private copy of the input token list
203 * (i.e., the token list we're trying to match to our grammar rules) during
204 * parsing using one of these structures for each token.
205 */
206 struct vmgramprod_tok
207 {
208 /* original token value */
209 vm_val_t val_;
210
211 /* token type */
212 ulong typ_;
213
214 /* token text */
215 const char *txt_;
216
217 /* length of the token text */
218 size_t len_;
219
220 /* hash value of the token */
221 unsigned int hash_;
222
223 /* number of vocabulary matches associated with the word */
224 size_t match_cnt_;
225
226 /* pointer to list of matches for the word */
227 vmgram_match_info *matches_;
228 };
229
230
231 /* ------------------------------------------------------------------------ */
232 /*
233 * Parsing state match object. A match can be terminal, in which case
234 * it refers to a single token of input, or it can be a production, in
235 * which case it refers to a subtree of match objects.
236 */
237 struct CVmGramProdMatch
238 {
239 /* allocate */
allocCVmGramProdMatch240 static CVmGramProdMatch *alloc(CVmGramProdMem *mem, size_t tok_pos,
241 const vm_val_t *tok_match_result,
242 int matched_star,
243 vm_prop_id_t target_prop,
244 vm_obj_id_t proc_obj,
245 const CVmGramProdMatch **sub_match_list,
246 size_t sub_match_cnt)
247 {
248 CVmGramProdMatch *match;
249
250 /* allocate space */
251 match = (CVmGramProdMatch *)mem->alloc(sizeof(CVmGramProdMatch));
252
253 /* initialize */
254 match->tok_pos_ = tok_pos;
255 match->matched_star_ = matched_star;
256 match->target_prop_ = target_prop;
257 match->proc_obj_ = proc_obj;
258 match->sub_match_list_ = sub_match_list;
259 match->sub_match_cnt_ = sub_match_cnt;
260
261 /* save the token match value, if one was given */
262 if (tok_match_result != 0)
263 match->tok_match_result_ = *tok_match_result;
264 else
265 match->tok_match_result_.set_nil();
266
267 /* return the new match */
268 return match;
269 }
270
271 /* property ID with which this match is associated */
272 vm_prop_id_t target_prop_;
273
274 /*
275 * token position of the matching token; ignored if the production
276 * subtree information is valid
277 */
278 size_t tok_pos_;
279
280 /*
281 * The token match value. This is the result code from the Dictionary
282 * comparator's matchValues() function, which typically encodes
283 * information about how the value matched (truncation, case folding,
284 * accent approximation, etc) in bitwise flags.
285 */
286 vm_val_t tok_match_result_;
287
288 /*
289 * flag: this is a '*' token in the rule; these don't really match any
290 * tokens at all, since they merely serve to stop parsing regardless
291 * of whether or not more input tokens are present
292 */
293 int matched_star_;
294
295 /*
296 * object ID of match processor - if this is valid, this match
297 * refers to a production; otherwise, the match refers to a single
298 * token
299 */
300 vm_obj_id_t proc_obj_;
301
302 /* pointer to array of sub-matches */
303 const CVmGramProdMatch **sub_match_list_;
304
305 /* number of sub-match entries */
306 size_t sub_match_cnt_;
307 };
308
309 /*
310 * Top-level match list holder
311 */
312 struct CVmGramProdMatchEntry
313 {
314 /* the match tree */
315 CVmGramProdMatch *match_;
316
317 /* token position at the end of the match */
318 size_t tok_pos_;
319
320 /* next entry in the list */
321 CVmGramProdMatchEntry *nxt_;
322 };
323
324 /*
325 * Parsing state object. At any given time, we will have one or more of
326 * these state objects in our processing queue. A state object tracks a
327 * parsing position - the token position, the alternative position, the
328 * match list so far for the alternative (i.e., the match for each item
329 * in the alternative up to but not including the current position), and
330 * the enclosing parsing state object (i.e., the parsing state that
331 * "recursed" into this alternative).
332 */
333 struct CVmGramProdState
334 {
335 /* allocate */
allocCVmGramProdState336 static CVmGramProdState *alloc(CVmGramProdMem *mem,
337 int tok_pos, int alt_pos,
338 CVmGramProdState *enclosing,
339 const vmgram_alt_info *altp,
340 vm_obj_id_t prod_obj,
341 int circular_alt)
342 {
343 size_t match_cnt;
344 CVmGramProdState *state;
345
346 /* get the number of match list slots we'll need */
347 match_cnt = altp->tok_cnt;
348
349 /* allocate space, including space for the match list */
350 state = (CVmGramProdState *)
351 mem->alloc(sizeof(CVmGramProdState)
352 + (match_cnt - 1)*sizeof(CVmGramProdMatch *));
353
354 /* initialize */
355 state->init(tok_pos, alt_pos, enclosing, altp, prod_obj,
356 circular_alt);
357
358 /* return the new item */
359 return state;
360 }
361
362 /* initialize */
initCVmGramProdState363 void init(int tok_pos, int alt_pos, CVmGramProdState *enclosing,
364 const vmgram_alt_info *altp, vm_obj_id_t prod_obj,
365 int circular_alt)
366 {
367 /* remember the position parameters */
368 tok_pos_ = tok_pos;
369 alt_pos_ = alt_pos;
370
371 /* we haven't matched a '*' token yet */
372 matched_star_ = FALSE;
373
374 /* remember our stack position */
375 enclosing_ = enclosing;
376
377 /* remember our alternative image data pointer */
378 altp_ = altp;
379
380 /* remember the production object */
381 prod_obj_ = prod_obj;
382
383 /* note if we had circular alternatives in the same production */
384 circular_alt_ = circular_alt;
385
386 /* no target property for sub-production yet */
387 sub_target_prop_ = VM_INVALID_PROP;
388
389 /* we're not in the queue yet */
390 nxt_ = 0;
391 }
392
393 /* clone the state */
cloneCVmGramProdState394 CVmGramProdState *clone(CVmGramProdMem *mem)
395 {
396 CVmGramProdState *new_state;
397 CVmGramProdState *new_enclosing;
398
399 /* clone our enclosing state */
400 new_enclosing = (enclosing_ != 0 ? enclosing_->clone(mem) : 0);
401
402 /* create a new state object */
403 new_state = alloc(mem, tok_pos_, alt_pos_, new_enclosing,
404 altp_, prod_obj_, circular_alt_);
405
406 /* copy the target sub-production property */
407 new_state->sub_target_prop_ = sub_target_prop_;
408
409 /* copy the valid portion of my match list */
410 if (alt_pos_ != 0)
411 memcpy(new_state->match_list_, match_list_,
412 alt_pos_ * sizeof(match_list_[0]));
413
414 /* we're not yet in any queue */
415 new_state->nxt_ = 0;
416
417 /* set the '*' flag in the cloned state */
418 new_state->matched_star_ = matched_star_;
419
420 /* return the clone */
421 return new_state;
422 }
423
424 /* current token position, as an index into the token list */
425 size_t tok_pos_;
426
427 /*
428 * flag: we've matched a '*' token in a production or subproduction,
429 * so we should consider all remaining tokens to have been consumed
430 * (we don't advance tok_pos_ past all of the tokens, because we
431 * separately want to keep track of the number of tokens we matched
432 * for everything up to the '*')
433 */
434 int matched_star_;
435
436 /*
437 * current alternative position, as an index into the alternative's
438 * list of items
439 */
440 size_t alt_pos_;
441
442 /*
443 * the enclosing state object - this is the state object that
444 * "recursed" to our position
445 */
446 CVmGramProdState *enclosing_;
447
448 /* pointer to alternative definition */
449 const vmgram_alt_info *altp_;
450
451 /* object ID of the production object */
452 vm_obj_id_t prod_obj_;
453
454 /* the target property for the pending sub-production */
455 vm_prop_id_t sub_target_prop_;
456
457 /* next production state in the processing queue */
458 CVmGramProdState *nxt_;
459
460 /* the production contains circular alternatives */
461 int circular_alt_;
462
463 /*
464 * Match list. This list is allocated with enough space for one
465 * match for each of our items (the number of items is given by
466 * vmgram_alt_tokcnt(altp_)). At any given time, these will be
467 * valid up to but not including the current alt_pos_ index.
468 */
469 const struct CVmGramProdMatch *match_list_[1];
470 };
471
472 /*
473 * Queues. We maintain three queues: the primary work queue, which
474 * contains pending parsing states that we're still attempting to match;
475 * the success queue, which contains a list of match trees for
476 * successfully completed matches; and the badness queue, which contains
477 * a list of parsing states that we can try as fallbacks if we fail to
478 * find a match in any of the work queue items.
479 */
480 struct CVmGramProdQueue
481 {
482 /* clear the queues */
clearCVmGramProdQueue483 void clear()
484 {
485 work_queue_ = 0;
486 badness_queue_ = 0;
487 success_list_ = 0;
488 }
489
490 /* head of work queue */
491 CVmGramProdState *work_queue_;
492
493 /* head of badness queue */
494 CVmGramProdState *badness_queue_;
495
496 /* head of success list */
497 CVmGramProdMatchEntry *success_list_;
498 };
499
500 /* ------------------------------------------------------------------------ */
501 /*
502 * Grammar-Production object statics
503 */
504
505 /* metaclass registration object */
506 static CVmMetaclassGramProd metaclass_reg_obj;
507 CVmMetaclass *CVmObjGramProd::metaclass_reg_ = &metaclass_reg_obj;
508
509 /* function table */
510 int (CVmObjGramProd::
511 *CVmObjGramProd::func_table_[])(VMG_ vm_obj_id_t self,
512 vm_val_t *retval, uint *argc) =
513 {
514 &CVmObjGramProd::getp_undef,
515 &CVmObjGramProd::getp_parse
516 };
517
518 /* ------------------------------------------------------------------------ */
519 /*
520 * Grammar-Production metaclass implementation
521 */
522
523 /*
524 * create dynamically using stack arguments
525 */
create_from_stack(VMG_ const uchar ** pc_ptr,uint argc)526 vm_obj_id_t CVmObjGramProd::create_from_stack(VMG_ const uchar **pc_ptr,
527 uint argc)
528 {
529 /* this type of object cannot be dynamically created */
530 err_throw(VMERR_BAD_DYNAMIC_NEW);
531
532 /* we won't reach this point, but the compiler wouldn't know */
533 return VM_INVALID_OBJ;
534 }
535
536 /*
537 * constructor
538 */
CVmObjGramProd(VMG0_)539 CVmObjGramProd::CVmObjGramProd(VMG0_)
540 {
541 /* allocate our extension structure from the variable heap */
542 ext_ = (char *)G_mem->get_var_heap()
543 ->alloc_mem(sizeof(vm_gram_ext), this);
544
545 /* we have no image data yet */
546 get_ext()->image_data_ = 0;
547 get_ext()->image_data_size_ = 0;
548
549 /* we haven't set up our alternatives yet */
550 get_ext()->alts_ = 0;
551 get_ext()->alt_cnt_ = 0;
552
553 /* we haven't cached any hash values yet */
554 get_ext()->hashes_cached_ = FALSE;
555 get_ext()->comparator_ = VM_INVALID_OBJ;
556
557 /* presume we have no circular rules */
558 get_ext()->has_circular_alt = FALSE;
559
560 /* create our memory pool */
561 get_ext()->mem_ = new CVmGramProdMem();
562
563 /*
564 * allocate initial property enumeration space (we'll expand this as
565 * needed, so the initial size is just a guess)
566 */
567 get_ext()->prop_enum_max_ = 64;
568 get_ext()->prop_enum_arr_ = (vmgram_match_info *)
569 t3malloc(get_ext()->prop_enum_max_
570 * sizeof(vmgram_match_info));
571 }
572
573 /*
574 * notify of deletion
575 */
notify_delete(VMG_ int in_root_set)576 void CVmObjGramProd::notify_delete(VMG_ int in_root_set)
577 {
578 /* free our additional data */
579 if (ext_ != 0)
580 {
581 /*
582 * If we've allocated our alternatives, delete the memory. We
583 * allocate the entire complex structure in one block, which we use
584 * for the alts_ array, so we merely need to delete that one block.
585 */
586 if (get_ext()->alts_ != 0)
587 t3free(get_ext()->alts_);
588
589 /* delete our memory pool */
590 delete get_ext()->mem_;
591
592 /* delete our property enumeration space */
593 t3free(get_ext()->prop_enum_arr_);
594
595 /* free the extension */
596 G_mem->get_var_heap()->free_mem(ext_);
597 }
598 }
599
600 /*
601 * get a property
602 */
get_prop(VMG_ vm_prop_id_t prop,vm_val_t * retval,vm_obj_id_t self,vm_obj_id_t * source_obj,uint * argc)603 int CVmObjGramProd::get_prop(VMG_ vm_prop_id_t prop, vm_val_t *retval,
604 vm_obj_id_t self, vm_obj_id_t *source_obj,
605 uint *argc)
606 {
607 ushort func_idx;
608
609 /* translate the property index to an index into our function table */
610 func_idx = G_meta_table
611 ->prop_to_vector_idx(metaclass_reg_->get_reg_idx(), prop);
612
613 /* call the appropriate function */
614 if ((this->*func_table_[func_idx])(vmg_ self, retval, argc))
615 {
616 *source_obj = metaclass_reg_->get_class_obj(vmg0_);
617 return TRUE;
618 }
619
620 /* inherit default handling */
621 return CVmObject::get_prop(vmg_ prop, retval, self, source_obj, argc);
622 }
623
624 /*
625 * set a property
626 */
set_prop(VMG_ class CVmUndo * undo,vm_obj_id_t self,vm_prop_id_t prop,const vm_val_t * val)627 void CVmObjGramProd::set_prop(VMG_ class CVmUndo *undo,
628 vm_obj_id_t self, vm_prop_id_t prop,
629 const vm_val_t *val)
630 {
631 /* no settable properties - throw an error */
632 err_throw(VMERR_INVALID_SETPROP);
633 }
634
635 /*
636 * load from an image file
637 */
load_from_image(VMG_ vm_obj_id_t self,const char * ptr,size_t siz)638 void CVmObjGramProd::load_from_image(VMG_ vm_obj_id_t self,
639 const char *ptr, size_t siz)
640 {
641 const char *p;
642 size_t alt_cnt;
643 size_t tok_cnt;
644 size_t i;
645 size_t alo_siz;
646 vmgram_alt_info *next_alt;
647 vmgram_tok_info *next_tok;
648 char *next_byte;
649
650 /* remember where our image data comes from */
651 get_ext()->image_data_ = ptr;
652 get_ext()->image_data_size_ = siz;
653
654 /*
655 * For our alternatives structure, start with the base amount of
656 * memory, which is the amount we'll need for the array of alternative
657 * entries themselves. The first UINT2 gives the number of
658 * alternatives in our grammar rule.
659 */
660 alt_cnt = osrp2(ptr);
661 alo_siz = alt_cnt * sizeof(vmgram_alt_info);
662
663 /*
664 * Scan the alternatives. Check for circular (left-recursive)
665 * alternatives, and add up how much memory we'll need for our decoded
666 * altneratives structure.
667 */
668 for (i = alt_cnt, p = ptr + 2, tok_cnt = 0 ; i != 0 ; --i)
669 {
670 const char *tokp;
671 size_t alt_tok_cnt;
672 size_t j;
673
674 /* get the token count and first token for this alternative */
675 alt_tok_cnt = vmgram_alt_tokcnt(p);
676 tokp = vmgram_alt_tokptr(p);
677
678 /*
679 * For our allocation size, add the amount of memory we need for
680 * this alternative entry's array of tokens.
681 */
682 alo_siz += alt_tok_cnt * sizeof(vmgram_tok_info);
683
684 /* include these tokens in the cumulative token count */
685 tok_cnt += alt_tok_cnt;
686
687 /* if the alternative is circular, note it */
688 if (alt_tok_cnt != 0
689 && vmgram_tok_type(tokp) == VMGRAM_MATCH_PROD
690 && vmgram_tok_prod_obj(tokp) == self)
691 {
692 /* note the existence of a circular reference */
693 get_ext()->has_circular_alt = TRUE;
694 }
695
696 /* scan the tokens to see how much memory we'll need to store them */
697 for (j = 0 ; j < alt_tok_cnt ; ++j, tokp = get_next_alt_tok(tokp))
698 {
699 /* check what sort of extra memory we'll need for this token */
700 switch(vmgram_tok_type(tokp))
701 {
702 case VMGRAM_MATCH_NSPEECH:
703 /* we need the array of vm_prop_id_t's */
704 alo_siz += osrndsz(vmgram_tok_vocn_cnt(tokp)
705 * sizeof(vm_prop_id_t));
706 break;
707
708 case VMGRAM_MATCH_LITERAL:
709 /* we need space for the string */
710 alo_siz += osrndsz(vmgram_tok_lit_len(tokp));
711 break;
712 }
713 }
714
715 /* the next alternative starts after the last token */
716 p = tokp;
717 }
718
719 /*
720 * Allocate our block of memory to store the decoded alternatives. Put
721 * the whole thing in one block, and put the alternative array at the
722 * start of the block.
723 */
724 get_ext()->alt_cnt_ = alt_cnt;
725 get_ext()->alts_ = next_alt = (vmgram_alt_info *)t3malloc(alo_siz);
726
727 /* allocate the tokens right after the alternative array */
728 next_tok = (vmgram_tok_info *)&get_ext()->alts_[alt_cnt];
729
730 /* allocate the other miscellaneous data after the token array */
731 next_byte = (char *)&next_tok[tok_cnt];
732
733 /* run through the image data again and decode it */
734 for (i = alt_cnt, p = ptr + 2, tok_cnt = 0 ; i != 0 ; --i)
735 {
736 const char *tokp;
737 size_t alt_tok_cnt;
738 size_t j;
739
740 /* get the token count and first token for this alternative */
741 alt_tok_cnt = vmgram_alt_tokcnt(p);
742 tokp = vmgram_alt_tokptr(p);
743
744 /* set up this alternative structure */
745 next_alt->score = vmgram_alt_score(p);
746 next_alt->badness = vmgram_alt_badness(p);
747 next_alt->proc_obj = vmgram_alt_procobj(p);
748 next_alt->tok_cnt = vmgram_alt_tokcnt(p);
749 next_alt->toks = next_tok;
750
751 /* consume this alternative entry */
752 ++next_alt;
753
754 /* scan and decode the tokens */
755 for (j = 0 ; j < alt_tok_cnt ; ++j, tokp = get_next_alt_tok(tokp))
756 {
757 size_t k;
758
759 /* set up this token's base information */
760 next_tok->prop = vmgram_tok_prop(tokp);
761 next_tok->typ = vmgram_tok_type(tokp);
762
763 /* decode the type-specific data */
764 switch(vmgram_tok_type(tokp))
765 {
766 case VMGRAM_MATCH_PROD:
767 next_tok->typinfo.prod_obj = vmgram_tok_prod_obj(tokp);
768 break;
769
770 case VMGRAM_MATCH_SPEECH:
771 next_tok->typinfo.speech_prop = vmgram_tok_voc_prop(tokp);
772 break;
773
774 case VMGRAM_MATCH_NSPEECH:
775 /* set up our array, using the miscellaneous space pool */
776 next_tok->typinfo.nspeech.cnt = vmgram_tok_vocn_cnt(tokp);
777 next_tok->typinfo.nspeech.props = (vm_prop_id_t *)next_byte;
778
779 /* copy the properties */
780 for (k = 0 ; k < next_tok->typinfo.nspeech.cnt ; ++k)
781 next_tok->typinfo.nspeech.props[k] =
782 vmgram_tok_vocn_prop(tokp, k);
783
784 /* consume the space from the pool */
785 next_byte += osrndsz(next_tok->typinfo.nspeech.cnt
786 * sizeof(vm_prop_id_t));
787 break;
788
789 case VMGRAM_MATCH_LITERAL:
790 /* take the space out of the miscellaneous space pool */
791 next_tok->typinfo.lit.len = vmgram_tok_lit_len(tokp);
792 next_tok->typinfo.lit.str = next_byte;
793
794 /* copy the data */
795 memcpy(next_tok->typinfo.lit.str, vmgram_tok_lit_txt(tokp),
796 next_tok->typinfo.lit.len);
797
798 /* consume the space from the pool */
799 next_byte += osrndsz(next_tok->typinfo.lit.len);
800 break;
801
802 case VMGRAM_MATCH_TOKTYPE:
803 next_tok->typinfo.toktyp_enum = vmgram_tok_tok_enum(tokp);
804 break;
805 }
806
807 /* consume the token slot */
808 ++next_tok;
809 }
810
811 /* the next alternative starts after the last token */
812 p = tokp;
813 }
814 }
815
816 /*
817 * Remove stale weak references.
818 */
remove_stale_weak_refs(VMG0_)819 void CVmObjGramProd::remove_stale_weak_refs(VMG0_)
820 {
821 /*
822 * Our reference to the dictionary comparator object is weak (we're
823 * only caching it - we don't want to prevent the object from being
824 * collected if no one else wants it). So, forget it if the comparator
825 * is being deleted.
826 */
827 if (get_ext()->comparator_ != VM_INVALID_OBJ
828 && G_obj_table->is_obj_deletable(get_ext()->comparator_))
829 {
830 /* forget the comparator */
831 get_ext()->comparator_ = VM_INVALID_OBJ;
832
833 /*
834 * our cached hash values depend on the comparator, so they're now
835 * invalid - forget about them
836 */
837 get_ext()->hashes_cached_ = FALSE;
838 }
839 }
840
841
842 /*
843 * Context for enum_props_cb
844 */
845 struct enum_props_ctx
846 {
847 /* object extension */
848 vm_gram_ext *ext;
849
850 /* number of properties in our list so far */
851 size_t cnt;
852 };
853
854 /*
855 * Callback for enumerating word properties
856 */
enum_props_cb(VMG_ void * ctx0,vm_prop_id_t prop,const vm_val_t *)857 void CVmObjGramProd::enum_props_cb(VMG_ void *ctx0, vm_prop_id_t prop,
858 const vm_val_t * /*match_val*/)
859 {
860 enum_props_ctx *ctx = (enum_props_ctx *)ctx0;
861 vmgram_match_info *ep;
862 size_t i;
863
864 /* if this property is already in our list, don't add it again */
865 for (i = 0, ep = ctx->ext->prop_enum_arr_ ; i < ctx->cnt ; ++i, ++ep)
866 {
867 /* if we already have this one, we can ignore it */
868 if (ep->prop == prop)
869 return;
870 }
871
872 /* we need to add it - if the array is full, expand it */
873 if (ctx->cnt == ctx->ext->prop_enum_max_)
874 {
875 /* reallocate the array with more space */
876 ctx->ext->prop_enum_max_ += 64;
877 ctx->ext->prop_enum_arr_ = (vmgram_match_info *)
878 t3realloc(ctx->ext->prop_enum_arr_,
879 ctx->ext->prop_enum_max_
880 * sizeof(vmgram_match_info));
881 }
882
883 /* add the property to the array */
884 ep = &ctx->ext->prop_enum_arr_[ctx->cnt];
885 ep->prop = prop;
886
887 /* count the new entry */
888 ctx->cnt++;
889 }
890
891 /*
892 * Execute the parseToken method
893 */
getp_parse(VMG_ vm_obj_id_t self,vm_val_t * retval,uint * argc)894 int CVmObjGramProd::getp_parse(VMG_ vm_obj_id_t self,
895 vm_val_t *retval, uint *argc)
896 {
897 vm_val_t *tokval;
898 vm_val_t dictval;
899 CVmObjDict *dict;
900 size_t tok_cnt;
901 size_t i;
902 vmgramprod_tok *tok;
903 const char *toklistp;
904 int orig_argc = (argc != 0 ? *argc : 0);
905 CVmGramProdQueue queues;
906 CVmObjList *lst;
907 size_t succ_cnt;
908 CVmGramProdMatchEntry *match;
909 static CVmNativeCodeDesc desc(2);
910
911 /* check arguments */
912 if (get_prop_check_argc(retval, argc, &desc))
913 return TRUE;
914
915 /* reset our memory pool */
916 get_ext()->mem_->reset();
917
918 /* clear the work queues */
919 queues.clear();
920
921 /*
922 * get the tokenList argument and make sure it's a list; leave it on
923 * the stack for now so it remains reachable for the garbage
924 * collector
925 */
926 tokval = G_stk->get(0);
927 if ((toklistp = tokval->get_as_list(vmg0_)) == 0)
928 err_throw(VMERR_LIST_VAL_REQD);
929
930 /* get the length of the token list */
931 tok_cnt = vmb_get_len(tokval->get_as_list(vmg0_));
932
933 /* check for a dictionary argument */
934 if (orig_argc >= 2)
935 {
936 /* get the dictionary argument */
937 dictval = *G_stk->get(1);
938
939 /* make sure it's an object or nil */
940 if (dictval.typ != VM_OBJ && dictval.typ != VM_NIL)
941 err_throw(VMERR_OBJ_VAL_REQD);
942 }
943 else
944 {
945 /* use a nil dictionary by default */
946 dictval.set_nil();
947 }
948
949 /* get the dictionary object and check that it's really a dictionary */
950 if (dictval.typ != VM_NIL)
951 {
952 /* if it's not a CVmObjDict object, it's the wrong type */
953 if (!CVmObjDict::is_dictionary_obj(vmg_ dictval.val.obj))
954 {
955 /* wrong type - throw an error */
956 err_throw(VMERR_INVAL_OBJ_TYPE);
957 }
958
959 /* get the VM object pointer */
960 dict = (CVmObjDict *)vm_objp(vmg_ dictval.val.obj);
961 }
962 else
963 {
964 /* there's no dictionary object */
965 dict = 0;
966 }
967
968 /*
969 * For quick and easy access, make our own private copy of the token
970 * and type lists. First, allocate an array of token structures.
971 */
972 tok = (vmgramprod_tok *)get_ext()->mem_
973 ->alloc(tok_cnt * sizeof(vmgramprod_tok));
974
975 /* copy the token string references and types into our list */
976 for (i = 0 ; i < tok_cnt ; ++i)
977 {
978 vm_val_t ele_val;
979 vm_val_t tok_typ_val;
980 vm_val_t tok_str_val;
981 const char *subp;
982 const char *tokstrp;
983
984 /* presume we won't have any vocabulary properties for the word */
985 tok[i].match_cnt_ = 0;
986 tok[i].matches_ = 0;
987
988 /* get this element from the list, and treat it as a list */
989 CVmObjList::index_list(vmg_ &ele_val, toklistp, i+1);
990 subp = ele_val.get_as_list(vmg0_);
991
992 /* if this element isn't a list, it's an error */
993 if (subp == 0)
994 err_throw(VMERR_BAD_TYPE_BIF);
995
996 /*
997 * parse the token sublist: the first element is the token value,
998 * and the second element is the token type
999 */
1000 CVmObjList::index_list(vmg_ &tok_str_val, subp, 1);
1001 CVmObjList::index_list(vmg_ &tok_typ_val, subp, 2);
1002
1003 /* use the value if it's an enumerator; if not, use zero */
1004 if (tok_typ_val.typ == VM_ENUM)
1005 tok[i].typ_ = tok_typ_val.val.enumval;
1006 else
1007 tok[i].typ_ = 0;
1008
1009 /* get the token value as a string */
1010 tokstrp = tok_str_val.get_as_string(vmg0_);
1011
1012 /* if it's a string, copy it; otherwise, ignore the value */
1013 if (tokstrp != 0)
1014 {
1015 /* remember the string value */
1016 tok[i].val_ = tok_str_val;
1017
1018 /* point the token to the string */
1019 tok[i].txt_ = tokstrp + VMB_LEN;
1020 tok[i].len_ = vmb_get_len(tokstrp);
1021
1022 /* calculate and store the token's hash value */
1023 tok[i].hash_ = calc_hash(vmg_ dict, &tok_str_val,
1024 tokstrp + VMB_LEN, vmb_get_len(tokstrp));
1025
1026 /*
1027 * if we have a dictionary, enumerate the properties
1028 * associated with the word
1029 */
1030 if (dict != 0)
1031 {
1032 enum_props_ctx ctx;
1033
1034 /* set up our callback context */
1035 ctx.ext = get_ext();
1036 ctx.cnt = 0;
1037
1038 /* enumerate the properties */
1039 dict->enum_word_props(vmg_ &enum_props_cb, &ctx,
1040 &tok_str_val, tok[i].txt_, tok[i].len_);
1041
1042 /*
1043 * if we found any properties for the word, make a copy
1044 * for the token list
1045 */
1046 if (ctx.cnt != 0)
1047 {
1048 /* allocate space for the list */
1049 tok[i].match_cnt_ = ctx.cnt;
1050 tok[i].matches_ =
1051 (vmgram_match_info *)get_ext()->mem_
1052 ->alloc(ctx.cnt * sizeof(vmgram_match_info));
1053
1054 /* copy the list */
1055 memcpy(tok[i].matches_, get_ext()->prop_enum_arr_,
1056 ctx.cnt * sizeof(tok[i].matches_[0]));
1057 }
1058 }
1059 }
1060 else
1061 {
1062 /* it's not a string - use a null string value for this token */
1063 tok[i].txt_ = 0;
1064 tok[i].len_ = 0;
1065 }
1066 }
1067
1068 /* enqueue a match state item for each of our alternatives */
1069 enqueue_alts(vmg_ get_ext()->mem_, tok, tok_cnt, 0, 0,
1070 &queues, self, FALSE, 0, dict);
1071
1072 /* process the work queue */
1073 process_work_queue(vmg_ get_ext()->mem_, tok, tok_cnt,
1074 &queues, dict);
1075
1076 /* count the entries in the success list */
1077 for (succ_cnt = 0, match = queues.success_list_ ; match != 0 ;
1078 ++succ_cnt, match = match->nxt_) ;
1079
1080 /* allocate the return list */
1081 retval->set_obj(CVmObjList::create(vmg_ FALSE, succ_cnt));
1082 lst = (CVmObjList *)vm_objp(vmg_ retval->val.obj);
1083
1084 /* initially clear the elements of the return list to nil */
1085 for (i = 0 ; i < succ_cnt ; ++i)
1086 {
1087 vm_val_t nil_val;
1088
1089 /* set this element to nil */
1090 nil_val.set_nil();
1091 lst->cons_set_element(i, &nil_val);
1092 }
1093
1094 /* push the list momentarily to protect it from garbage collection */
1095 G_stk->push(retval);
1096
1097 /* set the elements */
1098 for (succ_cnt = 0, match = queues.success_list_ ; match != 0 ;
1099 ++succ_cnt, match = match->nxt_)
1100 {
1101 vm_val_t tree_val;
1102 vm_val_t tok_match_val;
1103 size_t first_tok;
1104 size_t last_tok;
1105
1106 /*
1107 * Create a list to hold the token match list - this is a list of
1108 * the dictionary's comparator's matchValue() results for the
1109 * tokens that matched literals in the grammar rule. We need a
1110 * list with one element per token that we matched.
1111 *
1112 * We can't actually populate this list yet, but all we need for
1113 * the moment are references to it. So, create the list; we'll
1114 * fill it in as we traverse the match to build the result tree.
1115 */
1116 tok_match_val.set_obj(
1117 CVmObjList::create(vmg_ FALSE, match->tok_pos_));
1118
1119 /* push the token match value list for gc protection */
1120 G_stk->push(&tok_match_val);
1121
1122 /* build the object tree for this success object */
1123 build_match_tree(vmg_ match->match_, tokval, &tok_match_val,
1124 &tree_val, &first_tok, &last_tok);
1125
1126 /* discard our token match list's gc protection stack entry */
1127 G_stk->discard();
1128
1129 /*
1130 * add the match tree's root object to the main return list (note
1131 * that this protects it from garbage collection by virtue of the
1132 * main return list being protected from garbage collection)
1133 */
1134 lst->cons_set_element(succ_cnt, &tree_val);
1135 }
1136
1137 /* discard the gc protection */
1138 G_stk->discard();
1139
1140 /* discard arguments */
1141 G_stk->discard(orig_argc);
1142
1143 /* evaluation successful */
1144 return TRUE;
1145 }
1146
1147 /*
1148 * Build an object tree for a match
1149 */
build_match_tree(VMG_ const CVmGramProdMatch * match,const vm_val_t * toklist,const vm_val_t * tokmatchlist,vm_val_t * retval,size_t * first_tok,size_t * last_tok)1150 void CVmObjGramProd::build_match_tree(VMG_ const CVmGramProdMatch *match,
1151 const vm_val_t *toklist,
1152 const vm_val_t *tokmatchlist,
1153 vm_val_t *retval,
1154 size_t *first_tok, size_t *last_tok)
1155 {
1156 /* check to see what kind of match we have */
1157 if (match->proc_obj_ != VM_INVALID_OBJ)
1158 {
1159 vm_obj_id_t obj_id;
1160 CVmObjTads *objp;
1161 size_t i;
1162
1163 /*
1164 * Create the object to hold the current tree level. The only
1165 * constructor argument is the superclass, which is the match's
1166 * processor object.
1167 */
1168 G_stk->push()->set_obj(match->proc_obj_);
1169 obj_id = CVmObjTads::create_from_stack(vmg_ 0, 1);
1170
1171 /* get a pointer to the new object */
1172 objp = (CVmObjTads *)vm_objp(vmg_ obj_id);
1173
1174 /* push this object to protect it from gc for a moment */
1175 G_stk->push()->set_obj(obj_id);
1176
1177 /*
1178 * Initialize the caller's first/last token indices to an
1179 * arbitrary invalid range, in case we have no subproductions. A
1180 * production with no tokens and no subproductions matches no
1181 * tokens at all; we indicate this by using an invalid range, with
1182 * the first token index higher than the last token index.
1183 */
1184 *first_tok = 1;
1185 *last_tok = 0;
1186
1187 /* build the sub-match entries */
1188 for (i = 0 ; i < match->sub_match_cnt_ ; ++i)
1189 {
1190 size_t first_sub_tok;
1191 size_t last_sub_tok;
1192 vm_val_t val;
1193
1194 /* build this subtree */
1195 build_match_tree(vmg_ match->sub_match_list_[i],
1196 toklist, tokmatchlist,
1197 &val, &first_sub_tok, &last_sub_tok);
1198
1199 /*
1200 * If this is the first subtree, use it as the tentative
1201 * limits for our overall match so far; otherwise, expand our
1202 * limits if they are outside our range so far.
1203 *
1204 * If the submatch doesn't include any tokens, it obviously
1205 * has no effect on our range. The submatch range will
1206 * indicate that the first token index is greater than the
1207 * last token index if the submatch includes no tokens.
1208 */
1209 if (i == 0)
1210 {
1211 /* it's the first subtree - it's all we know as yet */
1212 *first_tok = first_sub_tok;
1213 *last_tok = last_sub_tok;
1214 }
1215 else if (first_sub_tok <= last_sub_tok)
1216 {
1217 /* check to see if it expands our current range */
1218 if (first_sub_tok < *first_tok)
1219 *first_tok = first_sub_tok;
1220 if (last_sub_tok > *last_tok)
1221 *last_tok = last_sub_tok;
1222 }
1223
1224 /*
1225 * save the subtree with the current match object if there's a
1226 * property in which to save it
1227 */
1228 if (match->sub_match_list_[i]->target_prop_ != VM_INVALID_PROP)
1229 {
1230 /*
1231 * Set the processor object property for the value. Note
1232 * that we don't have to keep undo for this change, since
1233 * we just created the tree object ourselves, and we don't
1234 * create any undo savepoints - an object created after
1235 * the most recent undo savepoint doesn't need to keep
1236 * undo information, since the entire object will be
1237 * deleted if we undo to the savepoint.
1238 */
1239 objp->set_prop(vmg_ 0, obj_id,
1240 match->sub_match_list_[i]->target_prop_,
1241 &val);
1242 }
1243 }
1244
1245 /*
1246 * if we have exported properties for recording the token index
1247 * range, save them with the object
1248 */
1249 if (G_predef->gramprod_first_tok != VM_INVALID_PROP
1250 && G_predef->gramprod_last_tok != VM_INVALID_PROP)
1251 {
1252 vm_val_t val;
1253
1254 /* save the first token index */
1255 val.set_int((int)*first_tok + 1);
1256 objp->set_prop(vmg_ 0, obj_id,
1257 G_predef->gramprod_first_tok, &val);
1258
1259 /* save the last token index */
1260 val.set_int((int)*last_tok + 1);
1261 objp->set_prop(vmg_ 0, obj_id,
1262 G_predef->gramprod_last_tok, &val);
1263 }
1264
1265 /*
1266 * if we have the token list property exported, set it to the
1267 * original token list reference
1268 */
1269 if (G_predef->gramprod_token_list != VM_INVALID_PROP)
1270 {
1271 /* save the token list reference */
1272 objp->set_prop(vmg_ 0, obj_id,
1273 G_predef->gramprod_token_list, toklist);
1274 }
1275
1276 /* if we have the token match list property exported, set it */
1277 if (G_predef->gramprod_token_match_list != VM_INVALID_PROP)
1278 {
1279 /* save the token match list reference */
1280 objp->set_prop(vmg_ 0, obj_id,
1281 G_predef->gramprod_token_match_list, tokmatchlist);
1282 }
1283
1284 /* discard our gc protection */
1285 G_stk->discard();
1286
1287 /* the return value is the object */
1288 retval->set_obj(obj_id);
1289 }
1290 else
1291 {
1292 const char *lstp;
1293
1294 /* get the token list */
1295 lstp = toklist->get_as_list(vmg0_);
1296
1297 /* make sure the index is in range */
1298 if (match->tok_pos_ < vmb_get_len(lstp))
1299 {
1300 vm_val_t ele_val;
1301 CVmObjList *match_lst;
1302
1303 /* get the token from the list */
1304 CVmObjList::index_list(vmg_ &ele_val, lstp, match->tok_pos_ + 1);
1305
1306 /*
1307 * the token is itself a list, whose first element is the
1308 * token's value - retrieve the value
1309 */
1310 lstp = ele_val.get_as_list(vmg0_);
1311 CVmObjList::index_list(vmg_ retval, lstp, 1);
1312
1313 /*
1314 * Store the token match result in the result list. If this is
1315 * a '*' match, it doesn't have a result list contribution,
1316 * because '*' doesn't actually match any tokens (it merely
1317 * stops parsing).
1318 */
1319 if (!match->matched_star_)
1320 {
1321 /* get the match list */
1322 match_lst = (CVmObjList *)vm_objp(vmg_ tokmatchlist->val.obj);
1323
1324 /*
1325 * set the element at this token position in the match list
1326 * to the token match result
1327 */
1328 match_lst->cons_set_element(
1329 match->tok_pos_, &match->tok_match_result_);
1330 }
1331 }
1332 else
1333 {
1334 /*
1335 * the index is past the end of the list - this must be a
1336 * '*' token that matched nothing (i.e., the token list was
1337 * fully consumed before we reached the '*'), so just return
1338 * nil for the match value
1339 */
1340 retval->set_nil();
1341 }
1342
1343 /*
1344 * We match one token, so it's the first and last index. Note
1345 * that if this is a '*' match, we don't match any tokens.
1346 */
1347 if (!match->matched_star_)
1348 *first_tok = *last_tok = match->tok_pos_;
1349 }
1350 }
1351
1352 /*
1353 * Process the work queue
1354 */
process_work_queue(VMG_ CVmGramProdMem * mem,const vmgramprod_tok * tok,size_t tok_cnt,CVmGramProdQueue * queues,CVmObjDict * dict)1355 void CVmObjGramProd::process_work_queue(VMG_ CVmGramProdMem *mem,
1356 const vmgramprod_tok *tok,
1357 size_t tok_cnt,
1358 CVmGramProdQueue *queues,
1359 CVmObjDict *dict)
1360 {
1361 /* keep going until the work queue and badness queue are empty */
1362 for (;;)
1363 {
1364 /*
1365 * If the work queue is empty, fall back on the badness queue.
1366 * Ignore the badness queue if we have any successful matches,
1367 * since non-badness matches always take precedence over badness
1368 * items.
1369 */
1370 if (queues->work_queue_ == 0 && queues->success_list_ == 0)
1371 {
1372 int first;
1373 int min_badness;
1374 CVmGramProdState *cur;
1375 CVmGramProdState *prv;
1376 CVmGramProdState *nxt;
1377
1378 /* find the lowest badness rating in the queue */
1379 for (first = TRUE, cur = queues->badness_queue_ ; cur != 0 ;
1380 cur = cur->nxt_, first = FALSE)
1381 {
1382 /*
1383 * if we're on the first item, note its badness as the
1384 * tentative minimum; otherwise, note the current item's
1385 * badness if it's lower than the lowest we've seen so
1386 * far
1387 */
1388 if (first || cur->altp_->badness < min_badness)
1389 min_badness = cur->altp_->badness;
1390 }
1391
1392 /*
1393 * move each of the items whose badness matches the minimum
1394 * badness out of the badness queue and into the work queue
1395 */
1396 for (cur = queues->badness_queue_, prv = 0 ; cur != 0 ;
1397 cur = nxt)
1398 {
1399 /* remember the next item, in case we remove this one */
1400 nxt = cur->nxt_;
1401
1402 /* if this item has minimum badness, move it */
1403 if (cur->altp_->badness == min_badness)
1404 {
1405 /* unlink it from the badness queue */
1406 if (prv == 0)
1407 queues->badness_queue_ = cur->nxt_;
1408 else
1409 prv->nxt_ = cur->nxt_;
1410
1411 /* link it into the work queue */
1412 cur->nxt_ = queues->work_queue_;
1413 queues->work_queue_ = cur;
1414 }
1415 else
1416 {
1417 /*
1418 * this item is staying in the list - note it as the
1419 * item preceding the next item
1420 */
1421 prv = cur;
1422 }
1423 }
1424 }
1425
1426 /* if the work queue is still empty, we're out of work to do */
1427 if (queues->work_queue_ == 0)
1428 break;
1429
1430 /* process the head of the work queue */
1431 process_work_queue_head(vmg_ mem, tok, tok_cnt, queues, dict);
1432 }
1433 }
1434
1435 /*
1436 * Process a work queue entry
1437 */
process_work_queue_head(VMG_ CVmGramProdMem * mem,const vmgramprod_tok * tok,size_t tok_cnt,CVmGramProdQueue * queues,CVmObjDict * dict)1438 void CVmObjGramProd::process_work_queue_head(VMG_ CVmGramProdMem *mem,
1439 const vmgramprod_tok *tok,
1440 size_t tok_cnt,
1441 CVmGramProdQueue *queues,
1442 CVmObjDict *dict)
1443 {
1444 CVmGramProdState *state;
1445 CVmGramProdMatch *match;
1446 const vmgram_tok_info *tokp;
1447 int tok_matched_star;
1448 vm_prop_id_t enclosing_target_prop;
1449
1450 /* get the first entry from the queue */
1451 state = queues->work_queue_;
1452
1453 /* unlink this entry from the queue */
1454 queues->work_queue_ = state->nxt_;
1455 state->nxt_ = 0;
1456
1457 /* get the token pointer for the next active entry in the alternative */
1458 tokp = state->altp_->toks + state->alt_pos_;
1459
1460 /* presume we won't match a '*' token */
1461 tok_matched_star = FALSE;
1462
1463 /* process the remaining items in the entry */
1464 while (state->alt_pos_ < state->altp_->tok_cnt)
1465 {
1466 int match;
1467 vm_val_t match_result;
1468
1469 /* presume we won't find a match */
1470 match_result.set_nil();
1471
1472 /* see what we have for this token */
1473 switch(tokp->typ)
1474 {
1475 case VMGRAM_MATCH_PROD:
1476 {
1477 vm_obj_id_t sub_obj_id;
1478 CVmObjGramProd *sub_objp;
1479
1480 /*
1481 * This is a sub-production node. Get the
1482 * sub-production object.
1483 */
1484 sub_obj_id = tokp->typinfo.prod_obj;
1485
1486 /* make sure it's of the correct metaclass */
1487 if (!CVmObjGramProd::is_gramprod_obj(vmg_ sub_obj_id))
1488 {
1489 /* wrong type - throw an error */
1490 err_throw(VMERR_INVAL_OBJ_TYPE);
1491 }
1492
1493 /* get the sub-production object */
1494 sub_objp = (CVmObjGramProd *)vm_objp(vmg_ sub_obj_id);
1495
1496 /*
1497 * set my subproduction target property, so that the
1498 * sub-production can set the target property correctly
1499 */
1500 state->sub_target_prop_ = tokp->prop;
1501
1502 /* enqueue the alternatives for the sub-production */
1503 sub_objp->enqueue_alts(vmg_ mem, tok, tok_cnt,
1504 state->tok_pos_, state, queues,
1505 sub_obj_id, FALSE, 0, dict);
1506 }
1507
1508 /*
1509 * Do not process the current state any further for now -
1510 * we'll get back to it when (and if) we finish processing
1511 * the sub-production. Note that we don't even put the
1512 * current state back in the queue - it's stacked behind the
1513 * sub-production, and will be re-enqueued when we
1514 * successfully finish with the sub-production.
1515 */
1516 return;
1517
1518 case VMGRAM_MATCH_SPEECH:
1519 /* part of speech - check to see if the current token matches */
1520 match = (state->tok_pos_ < tok_cnt
1521 && find_prop_in_tok(&tok[state->tok_pos_],
1522 tokp->typinfo.speech_prop));
1523 break;
1524
1525 case VMGRAM_MATCH_NSPEECH:
1526 /*
1527 * multiple parts of speech - check to see if the current token
1528 * matches any of the parts of the speech
1529 */
1530 if (state->tok_pos_ < tok_cnt)
1531 {
1532 size_t rem;
1533 const vm_prop_id_t *prop;
1534
1535 /* presume we won't find a match */
1536 match = FALSE;
1537
1538 /* check each item in the list */
1539 for (prop = tokp->typinfo.nspeech.props,
1540 rem = tokp->typinfo.nspeech.cnt ;
1541 rem != 0 ; --rem, ++prop)
1542 {
1543 /* if this one matches, we have a match */
1544 if (find_prop_in_tok(&tok[state->tok_pos_], *prop))
1545 {
1546 /* note the match */
1547 match = TRUE;
1548
1549 /* no need to look any further at this token */
1550 break;
1551 }
1552 }
1553 }
1554 else
1555 {
1556 /*
1557 * we're out of tokens, so we definitely don't have a
1558 * match, since we must match at least one of the possible
1559 * parts of speech of this item
1560 */
1561 match = FALSE;
1562 }
1563 break;
1564
1565 case VMGRAM_MATCH_LITERAL:
1566 /*
1567 * Literal - check for a match to the string. Test the hash
1568 * values first, as this is much faster than comparing the
1569 * strings, and at least tells us if the strings fail to match
1570 * (and failing to match being by far the most common case,
1571 * this saves us doing a full comparison most of the time).
1572 */
1573 match = (state->tok_pos_ < tok_cnt
1574 && tokp->typinfo.lit.hash == tok[state->tok_pos_].hash_
1575 && tok_equals_lit(vmg_ &tok[state->tok_pos_],
1576 tokp->typinfo.lit.str,
1577 tokp->typinfo.lit.len,
1578 dict, &match_result));
1579 break;
1580
1581 case VMGRAM_MATCH_TOKTYPE:
1582 /* token type */
1583 match = (state->tok_pos_ < tok_cnt
1584 && (tok[state->tok_pos_].typ_
1585 == tokp->typinfo.toktyp_enum));
1586 break;
1587
1588 case VMGRAM_MATCH_STAR:
1589 /*
1590 * this matches anything remaining (it also matches if
1591 * there's nothing remaining)
1592 */
1593 match = TRUE;
1594
1595 /* note that we matched a star in the state */
1596 state->matched_star_ = TRUE;
1597
1598 /* note that we matched a star for this particular token */
1599 tok_matched_star = TRUE;
1600 break;
1601 }
1602
1603 /* check for a match */
1604 if (match)
1605 {
1606 /*
1607 * we matched this token - add a token match to our state's
1608 * match list for the current position
1609 */
1610 state->match_list_[state->alt_pos_] =
1611 CVmGramProdMatch::alloc(mem, state->tok_pos_, &match_result,
1612 tok_matched_star, tokp->prop,
1613 VM_INVALID_OBJ, 0, 0);
1614
1615 /* we're done with this alternative item - move on */
1616 state->alt_pos_++;
1617
1618 /*
1619 * If we matched a real token, consume the matched input
1620 * token. For a '*', we don't want to consume anything, since
1621 * a '*' merely stops parsing and doesn't actually match
1622 * anything.
1623 */
1624 if (!state->matched_star_)
1625 state->tok_pos_++;
1626
1627 /* move on to the next alternative token item */
1628 ++tokp;
1629 }
1630 else
1631 {
1632 /*
1633 * This is not a match - reject this alternative. We do not
1634 * need to process this item further, so we can stop now,
1635 * without returning this state to the work queue. Simply
1636 * abandon this work queue item.
1637 */
1638 return;
1639 }
1640 }
1641
1642 /*
1643 * If we make it here, we've reached the end of this alternative and
1644 * have matched everything in it. This means that the alternative
1645 * was successful, so we can perform a 'reduce' operation to replace
1646 * the matched tokens with this production.
1647 */
1648
1649 /* if we have an enclosing state, get its target property */
1650 enclosing_target_prop = (state->enclosing_ != 0
1651 ? state->enclosing_->sub_target_prop_
1652 : VM_INVALID_PROP);
1653
1654 /* create a match for the entire alternative (i.e., the subproduction) */
1655 match = CVmGramProdMatch::alloc(mem, 0, 0, FALSE, enclosing_target_prop,
1656 state->altp_->proc_obj,
1657 &state->match_list_[0],
1658 state->altp_->tok_cnt);
1659
1660 /*
1661 * If the state we just matched was marked on creation as coming
1662 * from an alternative with circular alternatives, we've just come
1663 * up with a match for each circular alternative. To avoid infinite
1664 * recursion, we never enqueue the circular alternatives; however,
1665 * now that we know we have a match for the first element of each
1666 * circular alternative, we can check each of them to see if we can
1667 * enqueue them.
1668 */
1669 if (state->circular_alt_)
1670 {
1671 vm_obj_id_t prod_obj_id;
1672 CVmObjGramProd *prod_objp;
1673
1674 /*
1675 * Get the production from which this alternative came (there
1676 * should be no need to check the metaclass, since we could only
1677 * have created the state object from a valid production in the
1678 * first place).
1679 *
1680 * First, make sure it's of the correct class. (It almost
1681 * certainly is, since the compiler should have generated this
1682 * reference automatically, so there should be no possibility of a
1683 * user error. However, if it's not a production object, we'll
1684 * probably crash by blindly casting it to one; so we'll check
1685 * here, just to be sure that the compiler didn't do something
1686 * wrong and the image file didn't become corrupted.)
1687 */
1688 if (!CVmObjGramProd::is_gramprod_obj(vmg_ state->prod_obj_))
1689 {
1690 /* wrong type - throw an error */
1691 err_throw(VMERR_INVAL_OBJ_TYPE);
1692 }
1693
1694 /* get the object, properly cast */
1695 prod_obj_id = state->prod_obj_;
1696 prod_objp = (CVmObjGramProd *)vm_objp(vmg_ prod_obj_id);
1697
1698 /*
1699 * Enqueue the matching circular alternatives from the original
1700 * production. Our match becomes the first token match in the
1701 * circular alternative (i.e., it matches the leading circular
1702 * reference element).
1703 */
1704 prod_objp->enqueue_alts(vmg_ mem, tok, tok_cnt, state->tok_pos_,
1705 state->enclosing_, queues, prod_obj_id,
1706 TRUE, match, dict);
1707 }
1708
1709 /* check for an enclosing state to pop */
1710 if (state->enclosing_ != 0)
1711 {
1712 /* add the match to the enclosing state's match list */
1713 state->enclosing_->match_list_[state->enclosing_->alt_pos_] = match;
1714 state->enclosing_->alt_pos_++;
1715
1716 /*
1717 * Move the enclosing state's token position to our token position
1718 * - since it now encompasses our match, it has consumed all of
1719 * the tokens therein. Likewise, set the '*' flag in the parent
1720 * to our own setting.
1721 */
1722 state->enclosing_->tok_pos_ = state->tok_pos_;
1723 state->enclosing_->matched_star_ = state->matched_star_;
1724
1725 /* enqueue the enclosing state so we can continue parsing it */
1726 enqueue_state(state->enclosing_, queues);
1727 }
1728 else
1729 {
1730 CVmGramProdMatchEntry *entry;
1731
1732 /*
1733 * This is a top-level state, so we've completed parsing. If
1734 * we've consumed all input, or we matched a '*' token, we were
1735 * successful. If there's more input remaining, this is not a
1736 * match.
1737 */
1738 if (state->tok_pos_ < tok_cnt && !state->matched_star_)
1739 {
1740 /*
1741 * There's more input remaining, so this isn't a match.
1742 * Reject this alternative. We do not need to process this
1743 * item further, so stop now without returning this state
1744 * object to the work queue.
1745 */
1746
1747 /* abandon this work queue item */
1748 return;
1749 }
1750
1751 /* create a new entry for the match list */
1752 entry = (CVmGramProdMatchEntry *)mem->alloc(sizeof(*entry));
1753 entry->match_ = match;
1754 entry->tok_pos_ = state->tok_pos_;
1755
1756 /* link the entry into the list */
1757 entry->nxt_ = queues->success_list_;
1758 queues->success_list_ = entry;
1759 }
1760 }
1761
1762 /*
1763 * Visit my alternatives and enqueue each one that is a possible match
1764 * for a given token list.
1765 */
enqueue_alts(VMG_ CVmGramProdMem * mem,const vmgramprod_tok * tok,size_t tok_cnt,size_t start_tok_pos,CVmGramProdState * enclosing_state,CVmGramProdQueue * queues,vm_obj_id_t self,int circ_only,CVmGramProdMatch * circ_match,CVmObjDict * dict)1766 void CVmObjGramProd::enqueue_alts(VMG_ CVmGramProdMem *mem,
1767 const vmgramprod_tok *tok,
1768 size_t tok_cnt, size_t start_tok_pos,
1769 CVmGramProdState *enclosing_state,
1770 CVmGramProdQueue *queues, vm_obj_id_t self,
1771 int circ_only,
1772 CVmGramProdMatch *circ_match,
1773 CVmObjDict *dict)
1774 {
1775 const vmgram_alt_info *altp;
1776 size_t i;
1777 int need_to_clone;
1778 int has_circ;
1779 vm_obj_id_t comparator;
1780
1781 /*
1782 * We don't need to clone the enclosing state until we've set up one
1783 * new state referring back to it. However, if we're doing circular
1784 * alternatives, the enclosing state could also be enqueued
1785 * separately as its own state, so do clone all required copies in
1786 * this case.
1787 */
1788 need_to_clone = circ_only;
1789
1790 /* note whether or not we have any circular alternatives */
1791 has_circ = get_ext()->has_circular_alt;
1792
1793 /*
1794 * Cache hash values for all of our literals. We need to do this if we
1795 * haven't done it before, or if the comparator associated with the
1796 * dictionary is different than it was last time we cached the values.
1797 */
1798 comparator = (dict != 0 ? dict->get_comparator() : VM_INVALID_OBJ);
1799 if (!get_ext()->hashes_cached_ || get_ext()->comparator_ != comparator)
1800 cache_hashes(vmg_ dict);
1801
1802 /*
1803 * run through our alternatives and enqueue each one that looks
1804 * plausible
1805 */
1806 for (altp = get_ext()->alts_, i = get_ext()->alt_cnt_ ; i != 0 ;
1807 --i, ++altp)
1808 {
1809 vmgram_tok_info *first_tokp;
1810 vmgram_tok_info *tokp;
1811 size_t cur_alt_tok;
1812 size_t alt_tok_cnt;
1813 size_t tok_idx;
1814 int found_mismatch;
1815 int prod_before;
1816 int is_circ;
1817
1818 /* start at the first token remaining in the string */
1819 tok_idx = start_tok_pos;
1820
1821 /* get the first token for this alternative */
1822 tokp = first_tokp = altp->toks;
1823
1824 /* get the number of tokens in the alternative */
1825 alt_tok_cnt = altp->tok_cnt;
1826
1827 /* presume we will find no mismatches in this check */
1828 found_mismatch = FALSE;
1829
1830 /*
1831 * note whether this alternative is a direct circular reference
1832 * or not (it is directly circular if the first token points
1833 * back to this same production)
1834 */
1835 is_circ = (alt_tok_cnt != 0
1836 && tokp->typ == VMGRAM_MATCH_PROD
1837 && tokp->typinfo.prod_obj == self);
1838
1839 /*
1840 * we don't have a production before the current item (this is
1841 * important because it determines whether we must consider the
1842 * current token or any following token when trying to match a
1843 * literal, part of speech, or token type)
1844 */
1845 prod_before = FALSE;
1846
1847 /* scan the tokens in the alternative */
1848 for (cur_alt_tok = 0 ; cur_alt_tok < alt_tok_cnt ;
1849 ++cur_alt_tok, ++tokp)
1850 {
1851 int match;
1852 int match_star;
1853 int ignore_item;
1854 vm_val_t match_result;
1855
1856 /* presume we won't find a match */
1857 match = FALSE;
1858 match_star = FALSE;
1859
1860 /* presume we won't be able to ignore the item */
1861 ignore_item = FALSE;
1862
1863 /*
1864 * Try to find a match to the current alternative item. If
1865 * we've already found a mismatch, don't bother, since we need
1866 * no further evidence to skip this alternative; in this case
1867 * we're still looping over the alternative items simply to
1868 * find the beginning of the next alternative.
1869 */
1870 for ( ; !found_mismatch ; ++tok_idx)
1871 {
1872 /* see what kind of item we have */
1873 switch(tokp->typ)
1874 {
1875 case VMGRAM_MATCH_PROD:
1876 /*
1877 * We can't rule out a production without checking
1878 * it fully, which we're not doing on this pass;
1879 * however, note that we found a production, because
1880 * it means that we must from now on try skipping
1881 * any number of tokens before ruling out a match on
1882 * other grounds, since the production could
1883 * potentially match any number of tokens.
1884 *
1885 * If we're doing circular matches only, and this is
1886 * a circular production, and we're on the first
1887 * token in our list, we already know it matches, so
1888 * don't even count it as a production before the
1889 * item.
1890 */
1891 if (is_circ && circ_only && cur_alt_tok == 0)
1892 {
1893 /*
1894 * ignore the fact that it's a production, since
1895 * we already know it matches - we don't want to
1896 * be flexible about tokens following this item
1897 * since we know the exact number that match it
1898 */
1899 }
1900 else
1901 {
1902 /* note the production item */
1903 prod_before = TRUE;
1904 }
1905
1906 /*
1907 * we can ignore this item for the purposes of
1908 * detecting a mismatch, because we can't tell
1909 * during this superficial scan whether it matches
1910 */
1911 ignore_item = TRUE;
1912 break;
1913
1914 case VMGRAM_MATCH_SPEECH:
1915 /* we must match a part of speech */
1916 if (tok_idx < tok_cnt
1917 && find_prop_in_tok(&tok[tok_idx],
1918 tokp->typinfo.speech_prop))
1919 {
1920 /* it's a match */
1921 match = TRUE;
1922 }
1923 break;
1924
1925 case VMGRAM_MATCH_NSPEECH:
1926 /* multiple parts of speech */
1927 if (tok_idx < tok_cnt)
1928 {
1929 vm_prop_id_t *prop;
1930 size_t rem;
1931
1932 /* presume we won't find a match */
1933 match = FALSE;
1934
1935 /* check each item in the list */
1936 for (prop = tokp->typinfo.nspeech.props,
1937 rem = tokp->typinfo.nspeech.cnt ; rem != 0 ;
1938 --rem, ++prop)
1939 {
1940 /* if this one matches, we have a match */
1941 if (find_prop_in_tok(&tok[tok_idx], *prop))
1942 {
1943 /* note the match */
1944 match = TRUE;
1945
1946 /* no need to look any further */
1947 break;
1948 }
1949 }
1950 }
1951 else
1952 {
1953 /* we're out of tokens, so we have no match */
1954 match = FALSE;
1955 }
1956 break;
1957
1958 case VMGRAM_MATCH_LITERAL:
1959 /*
1960 * Match a literal. Compare the hash values first,
1961 * since a negative match on the hash values will tell
1962 * us for sure that the text won't match; since not
1963 * matching is by far the most common case, this saves
1964 * us the work of doing full string comparisons most of
1965 * the time.
1966 */
1967 if (tok_idx < tok_cnt
1968 && tok[tok_idx].txt_ != 0
1969 && tokp->typinfo.lit.hash == tok[tok_idx].hash_
1970 && tok_equals_lit(vmg_ &tok[tok_idx],
1971 tokp->typinfo.lit.str,
1972 tokp->typinfo.lit.len,
1973 dict, &match_result))
1974 {
1975 /* it's a match */
1976 match = TRUE;
1977 }
1978 break;
1979
1980 case VMGRAM_MATCH_TOKTYPE:
1981 /* we must match a token type */
1982 if (tok_idx < tok_cnt
1983 && tok[tok_idx].typ_ == tokp->typinfo.toktyp_enum)
1984 match = TRUE;
1985 break;
1986
1987 case VMGRAM_MATCH_STAR:
1988 /* consume everything remaining on the line */
1989 tok_idx = tok_cnt;
1990
1991 /* this is a match */
1992 match = TRUE;
1993 match_star = TRUE;
1994 break;
1995 }
1996
1997 /* if we found a match, we can stop scanning now */
1998 if (match)
1999 break;
2000
2001 /*
2002 * we didn't match this token - if we have a production
2003 * before this point, we must keep scanning token, since
2004 * the production could end up matching any number of
2005 * tokens; if we don't have a production item, though,
2006 * we don't need to look any further, and we can rule
2007 * out this alternative immediately
2008 */
2009 if (!prod_before)
2010 break;
2011
2012 /*
2013 * if we're ignoring this item (because we can't decide
2014 * now whether it's a match or not, and hence can't rule
2015 * it out), don't scan any tokens for the item
2016 */
2017 if (ignore_item)
2018 break;
2019
2020 /*
2021 * if we didn't find a match, and we're out of tokens,
2022 * stop looking - we have no more tokens to look at to
2023 * find a match
2024 */
2025 if (!match && tok_idx >= tok_cnt)
2026 break;
2027 }
2028
2029 /* check to see if we found a match */
2030 if (match)
2031 {
2032 /*
2033 * we found a match - skip the current token (unless we
2034 * matched a '*' item, in which case we've already
2035 * consumed all remaining tokens)
2036 */
2037 if (!match_star && tok_idx < tok_cnt)
2038 ++tok_idx;
2039 }
2040 else if (!ignore_item)
2041 {
2042 /*
2043 * we didn't match, and we can't ignore this item - note
2044 * that the alternative does not match
2045 */
2046 found_mismatch = TRUE;
2047
2048 /* finish up with this alternative and proceed to the next */
2049 break;
2050 }
2051 }
2052
2053 /*
2054 * If we didn't find a reason to rule out this alternative,
2055 * enqueue the alternative. Never enqueue circular references,
2056 * unless we're ONLY enqueuing circular references.
2057 */
2058 if (!found_mismatch
2059 && ((!circ_only && !is_circ)
2060 || (circ_only && is_circ)))
2061 {
2062 CVmGramProdState *state;
2063
2064 /* create and enqueue the new state */
2065 state = enqueue_new_state(mem, start_tok_pos, enclosing_state,
2066 altp, self, &need_to_clone, queues,
2067 has_circ);
2068
2069 /*
2070 * if we are enqueuing circular references, we already have
2071 * a match for the first (circular) token, so fill it in
2072 */
2073 if (circ_only)
2074 {
2075 CVmGramProdMatch *match;
2076
2077 /* create a match for the alternative */
2078 match = CVmGramProdMatch::
2079 alloc(mem, 0, 0, FALSE, first_tokp->prop,
2080 circ_match->proc_obj_,
2081 circ_match->sub_match_list_,
2082 circ_match->sub_match_cnt_);
2083
2084 /* set the first match in the token */
2085 state->match_list_[0] = match;
2086
2087 /* set up at the second token */
2088 state->alt_pos_ = 1;
2089 }
2090 }
2091 }
2092 }
2093
2094 /*
2095 * Cache hash values for all of our literal tokens
2096 */
cache_hashes(VMG_ CVmObjDict * dict)2097 void CVmObjGramProd::cache_hashes(VMG_ CVmObjDict *dict)
2098 {
2099 vm_obj_id_t comparator;
2100 vmgram_alt_info *alt;
2101 size_t i;
2102
2103 /* get the comparator */
2104 comparator = (dict != 0 ? dict->get_comparator() : VM_INVALID_OBJ);
2105
2106 /* run through our alternatives */
2107 for (i = get_ext()->alt_cnt_, alt = get_ext()->alts_ ;
2108 i != 0 ; --i, ++alt)
2109 {
2110 vmgram_tok_info *tok;
2111 size_t j;
2112
2113 /* run through our tokens */
2114 for (j = alt->tok_cnt, tok = alt->toks ; j != 0 ; --j, ++tok)
2115 {
2116 /* if this is a literal token, calculate its hash value */
2117 if (tok->typ == VMGRAM_MATCH_LITERAL)
2118 {
2119 /* calculate and store our hash value */
2120 tok->typinfo.lit.hash = calc_hash(
2121 vmg_ dict, 0, tok->typinfo.lit.str, tok->typinfo.lit.len);
2122 }
2123 }
2124 }
2125
2126 /* note that we've cached hash values, and note the comparator we used */
2127 get_ext()->hashes_cached_ = TRUE;
2128 get_ext()->comparator_ = comparator;
2129 }
2130
2131 /*
2132 * Create and enqueue a new state
2133 */
2134 CVmGramProdState *CVmObjGramProd::
enqueue_new_state(CVmGramProdMem * mem,size_t start_tok_pos,CVmGramProdState * enclosing_state,const vmgram_alt_info * altp,vm_obj_id_t self,int * need_to_clone,CVmGramProdQueue * queues,int circular_alt)2135 enqueue_new_state(CVmGramProdMem *mem,
2136 size_t start_tok_pos,
2137 CVmGramProdState *enclosing_state,
2138 const vmgram_alt_info *altp, vm_obj_id_t self,
2139 int *need_to_clone, CVmGramProdQueue *queues,
2140 int circular_alt)
2141 {
2142 CVmGramProdState *state;
2143
2144 /* create the new state */
2145 state = create_new_state(mem, start_tok_pos, enclosing_state,
2146 altp, self, need_to_clone, circular_alt);
2147
2148 /*
2149 * Add the item to the appropriate queue. If the item has an
2150 * associated badness, add it to the badness queue. Otherwise, add
2151 * it to the work queue.
2152 */
2153 if (altp->badness != 0)
2154 {
2155 /*
2156 * we have a badness rating - add it to the badness queue, since
2157 * we don't want to process it until we entirely exhaust better
2158 * possibilities
2159 */
2160 state->nxt_ = queues->badness_queue_;
2161 queues->badness_queue_ = state;
2162 }
2163 else
2164 {
2165 /* enqueue the state */
2166 enqueue_state(state, queues);
2167 }
2168
2169 /* return the new state */
2170 return state;
2171 }
2172
2173 /*
2174 * Create a new state
2175 */
2176 CVmGramProdState *CVmObjGramProd::
create_new_state(CVmGramProdMem * mem,size_t start_tok_pos,CVmGramProdState * enclosing_state,const vmgram_alt_info * altp,vm_obj_id_t self,int * need_to_clone,int circular_alt)2177 create_new_state(CVmGramProdMem *mem, size_t start_tok_pos,
2178 CVmGramProdState *enclosing_state,
2179 const vmgram_alt_info *altp, vm_obj_id_t self,
2180 int *need_to_clone, int circular_alt)
2181 {
2182 /*
2183 * if necessary, clone the enclosing state; we need to do this if we
2184 * enqueue more than one nested alternative, since each nested
2185 * alternative could parse the rest of the token list differently
2186 * and thus provide a different result to the enclosing state
2187 */
2188 if (*need_to_clone && enclosing_state != 0)
2189 enclosing_state = enclosing_state->clone(mem);
2190
2191 /*
2192 * we will need to clone the enclosing state if we create another
2193 * alternative after this one
2194 */
2195 *need_to_clone = TRUE;
2196
2197 /* create a new state object for the alternative */
2198 return CVmGramProdState::alloc(mem, start_tok_pos, 0,
2199 enclosing_state, altp, self,
2200 circular_alt);
2201 }
2202
2203 /*
2204 * Enqueue a state in the work queue
2205 */
enqueue_state(CVmGramProdState * state,CVmGramProdQueue * queues)2206 void CVmObjGramProd::enqueue_state(CVmGramProdState *state,
2207 CVmGramProdQueue *queues)
2208 {
2209 /* add it to the work queue */
2210 state->nxt_ = queues->work_queue_;
2211 queues->work_queue_ = state;
2212 }
2213
2214 /*
2215 * Determine if a token from the input matches a literal token, allowing
2216 * the input token to be a truncated version of the literal token if the
2217 * given truncation length is non-zero.
2218 */
tok_equals_lit(VMG_ const struct vmgramprod_tok * tok,const char * lit,size_t lit_len,CVmObjDict * dict,vm_val_t * result_val)2219 int CVmObjGramProd::tok_equals_lit(VMG_ const struct vmgramprod_tok *tok,
2220 const char *lit, size_t lit_len,
2221 CVmObjDict *dict, vm_val_t *result_val)
2222 {
2223 /*
2224 * if there's a dictionary, ask it to do the comparison; otherwise,
2225 * use an exact literal match
2226 */
2227 if (dict != 0)
2228 {
2229 /* ask the dictionary for the comparison */
2230 return dict->match_strings(vmg_ &tok->val_, tok->txt_, tok->len_,
2231 lit, lit_len, result_val);
2232 }
2233 else
2234 {
2235 /* perform an exact comparison */
2236 result_val->set_int(tok->len_ == lit_len
2237 && memcmp(tok->txt_, lit, lit_len) == 0);
2238 return result_val->val.intval;
2239 }
2240 }
2241
2242 /*
2243 * Calculate the hash value for a literal token, using our dictionary's
2244 * comparator.
2245 */
calc_hash(VMG_ CVmObjDict * dict,const vm_val_t * strval,const char * str,size_t len)2246 unsigned int CVmObjGramProd::calc_hash(VMG_ CVmObjDict *dict,
2247 const vm_val_t *strval,
2248 const char *str, size_t len)
2249 {
2250 /*
2251 * if we have a dictionary, ask it to ask it for the hash value, as the
2252 * dictionary will ask its comparator to do the work; otherwise,
2253 * calculate our own hash value
2254 */
2255 if (dict != 0)
2256 {
2257 /*
2258 * We're doing comparisons through the dictionary's comparator, so
2259 * we must calculate hash values using the comparator as well to
2260 * keep the hash and match operations consistent.
2261 */
2262 return dict->calc_hash(vmg_ strval, str, len);
2263 }
2264 else
2265 {
2266 uint hash;
2267
2268 /*
2269 * We don't have a dictionary, so we're doing our own string
2270 * comparisons, which we do as exact byte-for-byte matches. We can
2271 * thus calculate an arbitrary hash value of our own devising.
2272 */
2273 for (hash = 0 ; len != 0 ; ++str, --len)
2274 hash += (unsigned char)*str;
2275
2276 /* return the hash value we calculated */
2277 return hash;
2278 }
2279 }
2280
2281
2282 /*
2283 * Find a property in a token's list. Returns true if the property is
2284 * there, false if not.
2285 */
find_prop_in_tok(const vmgramprod_tok * tok,vm_prop_id_t prop)2286 int CVmObjGramProd::find_prop_in_tok(const vmgramprod_tok *tok,
2287 vm_prop_id_t prop)
2288 {
2289 size_t i;
2290 const vmgram_match_info *p;
2291
2292 /* search for the property */
2293 for (i = tok->match_cnt_, p = tok->matches_ ; i != 0 ; --i, ++p)
2294 {
2295 /* if this is the one we're looking for, return success */
2296 if (p->prop == prop)
2297 return TRUE;
2298 }
2299
2300 /* we didn't find it */
2301 return FALSE;
2302 }
2303
2304 /*
2305 * Get the next token item after the given item in an alternative
2306 */
get_next_alt_tok(const char * tokp)2307 const char *CVmObjGramProd::get_next_alt_tok(const char *tokp)
2308 {
2309 switch(vmgram_tok_type(tokp))
2310 {
2311 case VMGRAM_MATCH_PROD:
2312 return tokp + VMGRAM_TOK_PROD_SIZE;
2313
2314 case VMGRAM_MATCH_SPEECH:
2315 return tokp + VMGRAM_TOK_SPEECH_SIZE;
2316
2317 case VMGRAM_MATCH_NSPEECH:
2318 return tokp + VMGRAM_TOK_NSPEECH_SIZE(tokp);
2319
2320 case VMGRAM_MATCH_LITERAL:
2321 return tokp + VMGRAM_TOK_LIT_SIZE(tokp);
2322
2323 case VMGRAM_MATCH_TOKTYPE:
2324 return tokp + VMGRAM_TOK_TYPE_SIZE;
2325
2326 case VMGRAM_MATCH_STAR:
2327 return tokp + VMGRAM_TOK_STAR_SIZE;
2328
2329 default:
2330 err_throw(VMERR_INVAL_METACLASS_DATA);
2331 return 0;
2332 }
2333 }
2334
2335