/* $Header$ */

/*
 *   Copyright (c) 2000, 2002 Michael J. Roberts.  All Rights Reserved.
 *
 *   Please see the accompanying license file, LICENSE.TXT, for information
 *   on using and copying this software.
 */
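/*
Name
  vmgram.h - T3 grammar-production metaclass
Function
  Declares the grammar-production metaclass: the accessors for the
  image-file form of a production, the in-memory rule structures, the
  CVmObjGramProd object class, and its metaclass registration object.
Notes

Modified
  02/15/00 MJRoberts  - Creation
*/
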
20 #ifndef VMGRAM_H
21 #define VMGRAM_H
22 
23 #include <stdlib.h>
24 #include <string.h>
25 
26 #include "os.h"
27 #include "t3std.h"
28 #include "vmtype.h"
29 #include "vmglob.h"
30 #include "vmobj.h"
31 
/* ------------------------------------------------------------------------ */
/*
 *   Intrinsic function vector indices.  Each enumerator names one slot in
 *   the grammar-production metaclass's intrinsic function vector; slot 0
 *   is reserved for the "undefined function" entry.
 */
enum vmobjgram_meta_fnset
{
    /* undefined function */
    VMOBJGRAM_UNDEF = 0,

    /* parseTokens(tokenList, dict) */
    VMOBJGRAM_PARSE = 1
};

/* ------------------------------------------------------------------------ */
/*
 *   Match types.  These are the values stored in the one-byte
 *   token_match_type field of a rule token; the value determines what the
 *   token matches and which extra data follow the token header.
 */
enum vmgram_match_type
{
    /* production - matches a sub-production */
    VMGRAM_MATCH_PROD = 1,

    /*
     *   part of speech - matches a word that appears in the dictionary
     *   under a particular part of speech
     */
    VMGRAM_MATCH_SPEECH = 2,

    /* literal - matches a literal string */
    VMGRAM_MATCH_LITERAL = 3,

    /* token type - matches any token of a given type */
    VMGRAM_MATCH_TOKTYPE = 4,

    /* star - matches all remaining input tokens */
    VMGRAM_MATCH_STAR = 5,

    /*
     *   N parts of speech - matches a word that appears in the dictionary
     *   under any of a set of N parts of speech
     */
    VMGRAM_MATCH_NSPEECH = 6
};

/* ------------------------------------------------------------------------ */
/*
 *   Grammar production object - image file format
 *
 *   UINT2 alt_count
 *.  alternative 1
 *.  alternative 2
 *.  etc
 *
 *   Each alternative has the following structure:
 *
 *.  INT2 score
 *.  INT2 badness
 *.  UINT4 processor_object_id
 *.  UINT2 token_count
 *.  token 1
 *.  token 2
 *.  etc
 *
 *   Each token has this structure:
 *
 *   UINT2 property_association
 *.  BYTE token_match_type (see below)
 *.  extra data depending on token_match_type (see below)
 *
 *   The extra data for the token varies by match type:
 *
 *   VMGRAM_MATCH_PROD - a UINT4 giving the production object ID
 *
 *   VMGRAM_MATCH_SPEECH - a UINT2 giving the vocabulary property
 *
 *   VMGRAM_MATCH_NSPEECH - a UINT2 giving a count, then that many
 *   additional UINT2's giving a list of vocabulary properties
 *
 *   VMGRAM_MATCH_LITERAL - a UINT2 byte-length prefix followed by the
 *   UTF8-encoded bytes of the literal string
 *
 *   VMGRAM_MATCH_TOKTYPE - a UINT4 giving the token enum's ID
 *
 *   VMGRAM_MATCH_STAR - no additional data
 */

/*
 *   Pull the various parts out of an alternative byte stream.  'p' points
 *   to the first byte of an alternative; the offsets used here are score
 *   at 0, badness at 2, processor object ID at 4, and token count at 8,
 *   with the first token starting at offset 10.  The 2- and 4-byte fields
 *   are read through osrp2()/osrp4(), which are defined outside this file.
 */
#define vmgram_alt_score(p)      osrp2(p)
#define vmgram_alt_badness(p)    osrp2((p) + 2)
#define vmgram_alt_procobj(p)    ((vm_obj_id_t)osrp4((p) + 4))
#define vmgram_alt_tokcnt(p)     osrp2((p) + 8)
#define vmgram_alt_tokptr(p)     ((p) + 10)

/*
 *   Pull the header parts out of a token in an alternative.  'p' points to
 *   the first byte of the token: a 2-byte property association followed
 *   by a 1-byte match type.  Type-specific data start at offset 3.
 */
#define vmgram_tok_prop(p)       ((vm_prop_id_t)osrp2(p))
#define vmgram_tok_type(p)       (*((p) + 2))

/* pull the production object from a VMGRAM_MATCH_PROD token */
#define vmgram_tok_prod_obj(p)   ((vm_obj_id_t)osrp4((p) + 3))

/* pull the part-of-speech property from a VMGRAM_MATCH_SPEECH token */
#define vmgram_tok_voc_prop(p)   ((vm_prop_id_t)osrp2((p) + 3))

/*
 *   pull the literal length/text from a VMGRAM_MATCH_LITERAL token - the
 *   text starts immediately after the 2-byte length prefix
 */
#define vmgram_tok_lit_len(p)    osrp2((p) + 3)
#define vmgram_tok_lit_txt(p)    ((p) + 5)

/* pull the enum from a VMGRAM_MATCH_TOKTYPE token */
#define vmgram_tok_tok_enum(p)   ((ulong)osrp4((p) + 3))

/*
 *   pull the count/nth property from a VMGRAM_MATCH_NSPEECH token - 'n' is
 *   a zero-based index into the 2-byte entries that follow the count
 */
#define vmgram_tok_vocn_cnt(p)      osrp2((p) + 3)
#define vmgram_tok_vocn_prop(p, n)  osrp2((p) + 5 + (n)*2)

/*
 *   Get the size in bytes of a token of the given type.  Every size is the
 *   3-byte token header (2-byte property plus 1-byte match type) plus the
 *   type-specific extra data.  The literal and N-speech sizes vary per
 *   token, so those macros take a pointer 'p' to the start of the token
 *   and read its length/count field.
 */
#define VMGRAM_TOK_PROD_SIZE     (3 + 4)
#define VMGRAM_TOK_SPEECH_SIZE   (3 + 2)
#define VMGRAM_TOK_LIT_SIZE(p)   (3 + 2 + vmgram_tok_lit_len(p))
#define VMGRAM_TOK_TYPE_SIZE     (3 + 4)
#define VMGRAM_TOK_STAR_SIZE     (3 + 0)
#define VMGRAM_TOK_NSPEECH_SIZE(p) (3 + 2 + vmgram_tok_vocn_cnt(p)*2)

154 /* property/match result enumeration entry */
155 struct vmgram_match_info
156 {
157     vm_prop_id_t prop;
158 };
159 
160 /*
161  *   Grammar production object extension
162  */
163 struct vm_gram_ext
164 {
165     /* pointer to load image data, if any */
166     const char *image_data_;
167     size_t image_data_size_;
168 
169     /*
170      *   The last comparator object we used to calculate hash values for
171      *   literals.  Each time we need literal hash values, we'll check to see
172      *   if we are using the same comparator we were last time; if so, we'll
173      *   use the cached hash values, otherwise we'll recalculate them.  We
174      *   reference this object weakly.
175      */
176     vm_obj_id_t comparator_;
177 
178     /* flag: we've cached hash values for our literals */
179     uint hashes_cached_ : 1;
180 
181     /*
182      *   flag: there's at least one circular rule among my rules (i.e.,
183      *   there's a rule whose first element is a self-reference
184      *   subproduction)
185      */
186     uint has_circular_alt : 1;
187 
188     /* private memory pool - we use this to make allocation cheaper */
189     class CVmGramProdMem *mem_;
190 
191     /*
192      *   Property list enumeration space.  We use this to build a list of
193      *   properties for which a dictionary word is defined.  We'll expand
194      *   this list as needed when we find we need more space.
195      */
196     vmgram_match_info *prop_enum_arr_;
197     size_t prop_enum_max_;
198 
199     /* array of rule alternatives */
200     struct vmgram_alt_info *alts_;
201     size_t alt_cnt_;
202 };
203 
204 /*
205  *   Alternative object.  Each of these objects represents one of our rule
206  *   alternatives.
207  */
208 struct vmgram_alt_info
209 {
210     /* the alternative's score and badness values */
211     int score;
212     int badness;
213 
214     /*
215      *   the "processor object" for this alternative - this is the class we
216      *   instantiate to represent a match to the alternative
217      */
218     vm_obj_id_t proc_obj;
219 
220     /* array of token elements in the alternative */
221     struct vmgram_tok_info *toks;
222     size_t tok_cnt;
223 };
224 
225 /*
226  *   Grammar rule token entry.  This represents a token in a grammar rule.
227  */
228 struct vmgram_tok_info
229 {
230     /*
231      *   property association - this is the property of the processor object
232      *   that we'll set to point to the match object or input token if we
233      *   match this rule token
234      */
235     vm_prop_id_t prop;
236 
237     /* token type - this is a VMGRAM_MATCH_xxx value */
238     uchar typ;
239 
240     /* extra data, depending on 'typ' */
241     union
242     {
243         /* VMGRAM_MATCH_PROD - the sub-production object */
244         vm_obj_id_t prod_obj;
245 
246         /* VMGRAM_MATCH_SPEECH - the part-of-speech property */
247         vm_prop_id_t speech_prop;
248 
249         /* VMGRAM_MATCH_NSPEECH - an array of part-of-speech proeprties */
250         struct
251         {
252             size_t cnt;
253             vm_prop_id_t *props;
254         } nspeech;
255 
256         /* VMGRAM_MATCH_LITERAL - the literal string to match */
257         struct
258         {
259             /* the literal text and its length */
260             char *str;
261             size_t len;
262 
263             /* the cached hash value for the literal */
264             uint hash;
265         } lit;
266 
267         /* VMGRAM_MATCH_TOKTYPE - token type enum */
268         uint32 toktyp_enum;
269 
270     } typinfo;
271 };
272 
273 /* ------------------------------------------------------------------------ */
274 /*
275  *   Grammar-Production object interface
276  */
277 class CVmObjGramProd: public CVmObject
278 {
279     friend class CVmMetaclassGramProd;
280 
281 public:
282     /* metaclass registration object */
283     static class CVmMetaclass *metaclass_reg_;
get_metaclass_reg()284     class CVmMetaclass *get_metaclass_reg() const { return metaclass_reg_; }
285 
286     /* am I of the given metaclass? */
is_of_metaclass(class CVmMetaclass * meta)287     virtual int is_of_metaclass(class CVmMetaclass *meta) const
288     {
289         /* try my own metaclass and my base class */
290         return (meta == metaclass_reg_
291                 || CVmObject::is_of_metaclass(meta));
292     }
293 
294     /* create dynamically using stack arguments */
295     static vm_obj_id_t create_from_stack(VMG_ const uchar **pc_ptr,
296                                          uint argc);
297 
298     /*
299      *   call a static property - we don't have any of our own, so simply
300      *   "inherit" the base class handling
301      */
call_stat_prop(VMG_ vm_val_t * result,const uchar ** pc_ptr,uint * argc,vm_prop_id_t prop)302     static int call_stat_prop(VMG_ vm_val_t *result,
303                               const uchar **pc_ptr, uint *argc,
304                               vm_prop_id_t prop)
305         { return CVmObject::call_stat_prop(vmg_ result, pc_ptr, argc, prop); }
306 
307     /* determine if an object is a GrammarProduction object */
is_gramprod_obj(VMG_ vm_obj_id_t obj)308     static int is_gramprod_obj(VMG_ vm_obj_id_t obj)
309         { return vm_objp(vmg_ obj)->is_of_metaclass(metaclass_reg_); }
310 
311     /* notify of deletion */
312     void notify_delete(VMG_ int in_root_set);
313 
314     /* get a property */
315     int get_prop(VMG_ vm_prop_id_t prop, vm_val_t *val,
316                  vm_obj_id_t self, vm_obj_id_t *source_obj, uint *argc);
317 
318     /* set a property */
319     void set_prop(VMG_ class CVmUndo *undo,
320                   vm_obj_id_t self, vm_prop_id_t prop, const vm_val_t *val);
321 
322     /* receive notification of a new undo savepoint */
notify_new_savept()323     void notify_new_savept() { }
324 
325     /* apply undo */
apply_undo(VMG_ struct CVmUndoRecord *)326     void apply_undo(VMG_ struct CVmUndoRecord *) { }
327 
328     /* discard additional information associated with an undo record */
discard_undo(VMG_ struct CVmUndoRecord *)329     void discard_undo(VMG_ struct CVmUndoRecord *) { }
330 
331     /* mark a reference in an undo record */
mark_undo_ref(VMG_ struct CVmUndoRecord *)332     void mark_undo_ref(VMG_ struct CVmUndoRecord *) { }
333 
334     /* remove stale weak references from an undo record */
remove_stale_undo_weak_ref(VMG_ struct CVmUndoRecord *)335     void remove_stale_undo_weak_ref(VMG_ struct CVmUndoRecord *) { }
336 
337     /*
338      *   mark references - we can only reference root-set objects (since
339      *   we cannot be modified during execution), hence we don't need to
340      *   mark anything here
341      */
mark_refs(VMG_ uint)342     void mark_refs(VMG_ uint) { }
343 
344     /* remove weak references */
345     void remove_stale_weak_refs(VMG0_);
346 
347     /* load from an image file */
348     void load_from_image(VMG_ vm_obj_id_t self, const char *ptr, size_t siz);
349 
350     /*
351      *   restore to image file state/save/restore - we can't change at
352      *   run-time, so there's nothing to save or load
353      */
reset_to_image(VMG_ vm_obj_id_t)354     void reset_to_image(VMG_ vm_obj_id_t /*self*/) { }
save_to_file(VMG_ class CVmFile *)355     void save_to_file(VMG_ class CVmFile *) { }
restore_from_file(VMG_ vm_obj_id_t self,class CVmFile *,class CVmObjFixup *)356     void restore_from_file(VMG_ vm_obj_id_t self,
357                            class CVmFile *, class CVmObjFixup *) { }
358 
359     /* determine if the object has been changed since it was loaded */
is_changed_since_load()360     int is_changed_since_load() const { return FALSE; }
361 
362     /*
363      *   rebuild for image file - we can't change during execution, so our
364      *   image file data never change
365      */
366     virtual ulong rebuild_image(VMG_ char *buf, ulong buflen);
367 
368     /* convert to constant data */
convert_to_const_data(VMG_ class CVmConstMapper *,vm_obj_id_t)369     virtual void convert_to_const_data(VMG_ class CVmConstMapper *,
370                                        vm_obj_id_t) { }
371 
372 protected:
373     /* private constructor */
374     CVmObjGramProd(VMG0_);
375 
376     /* property evaluation - undefined property */
getp_undef(VMG_ vm_obj_id_t,vm_val_t *,uint *)377     int getp_undef(VMG_ vm_obj_id_t, vm_val_t *, uint *) { return FALSE; }
378 
379     /* property evaluation - parseTokens */
380     int getp_parse(VMG_ vm_obj_id_t self, vm_val_t *val, uint *argc);
381 
382     /* get my extension, properly cast */
get_ext()383     vm_gram_ext *get_ext() const { return (vm_gram_ext *)ext_; }
384 
385     /* callback for dictionary word property enumeration */
386     static void enum_props_cb(VMG_ void *ctx, vm_prop_id_t prop,
387                               const vm_val_t *match_val);
388 
389     /* search a token for a match to the given vocabulary property */
390     static int find_prop_in_tok(const struct vmgramprod_tok *tok,
391                                 vm_prop_id_t prop);
392 
393     /* get the next token in an alternative */
394     static const char *get_next_alt_tok(const char *tokp);
395 
396     /* enqueue our alternatives */
397     void enqueue_alts(VMG_ class CVmGramProdMem *mem,
398                       const struct vmgramprod_tok *tok,
399                       size_t tok_cnt, size_t start_tok_pos,
400                       struct CVmGramProdState *state,
401                       struct CVmGramProdQueue *queues,
402                       vm_obj_id_t self, int circ_only,
403                       struct CVmGramProdMatch *circ_match,
404                       class CVmObjDict *dict);
405 
406     /* create and enqueue a new state */
407     static struct CVmGramProdState *
408         enqueue_new_state(class CVmGramProdMem *mem,
409                           size_t start_tok_pos,
410                           struct CVmGramProdState *enclosing_state,
411                           const vmgram_alt_info *altp, vm_obj_id_t self,
412                           int *need_to_clone,
413                           struct CVmGramProdQueue *queues,
414                           int circular_alt);
415 
416     /* create a new state */
417     static struct CVmGramProdState *
418         create_new_state(class CVmGramProdMem *mem,
419                          size_t start_tok_pos,
420                          struct CVmGramProdState *enclosing_state,
421                          const vmgram_alt_info *altp, vm_obj_id_t self,
422                          int *need_to_clone, int circular_alt);
423 
424     /* enqueue a state */
425     static void enqueue_state(struct CVmGramProdState *state,
426                               struct CVmGramProdQueue *queues);
427 
428     /* process the work queue */
429     static void process_work_queue(VMG_ CVmGramProdMem *mem,
430                                    const struct vmgramprod_tok *tok,
431                                    size_t tok_cnt,
432                                    struct CVmGramProdQueue *queues,
433                                    class CVmObjDict *dict);
434 
435 
436     /* process the first work queue entry */
437     static void process_work_queue_head(VMG_ CVmGramProdMem *mem,
438                                         const struct vmgramprod_tok *tok,
439                                         size_t tok_cnt,
440                                         struct CVmGramProdQueue *queues,
441                                         class CVmObjDict *dict);
442 
443     /* build a match tree */
444     static void build_match_tree(VMG_ const struct CVmGramProdMatch *match,
445                                  const vm_val_t *tok_list,
446                                  const vm_val_t *tok_match_list,
447                                  vm_val_t *retval,
448                                  size_t *first_tok, size_t *last_tok);
449 
450     /* cache the hash values for the literal tokens in our alternatives */
451     void cache_hashes(VMG_ CVmObjDict *dict);
452 
453     /* calculate the hash value for a literal string */
454     static unsigned int calc_hash(VMG_ class CVmObjDict *dict,
455                                   const vm_val_t *strval,
456                                   const char *str, size_t len);
457 
458     /* check to see if a token matches a literal */
459     static int tok_equals_lit(VMG_ const struct vmgramprod_tok *tok,
460                               const char *lit, size_t lit_len,
461                               class CVmObjDict *dict,
462                               vm_val_t *match_result);
463 
464     /* property evaluation function table */
465     static int (CVmObjGramProd::*func_table_[])(VMG_ vm_obj_id_t self,
466                                                 vm_val_t *retval, uint *argc);
467 };
468 
469 
470 /* ------------------------------------------------------------------------ */
471 /*
472  *   Registration table object
473  */
474 class CVmMetaclassGramProd: public CVmMetaclass
475 {
476 public:
477     /* get the global name */
get_meta_name()478     const char *get_meta_name() const { return "grammar-production/030000"; }
479 
480     /* create from image file */
create_for_image_load(VMG_ vm_obj_id_t id)481     void create_for_image_load(VMG_ vm_obj_id_t id)
482     {
483         new (vmg_ id) CVmObjGramProd(vmg0_);
484         G_obj_table->set_obj_gc_characteristics(id, FALSE, TRUE);
485     }
486 
487     /* create from restoring from saved state */
create_for_restore(VMG_ vm_obj_id_t id)488     void create_for_restore(VMG_ vm_obj_id_t id)
489     {
490         new (vmg_ id) CVmObjGramProd(vmg0_);
491         G_obj_table->set_obj_gc_characteristics(id, FALSE, TRUE);
492     }
493 
494     /* create dynamically using stack arguments */
create_from_stack(VMG_ const uchar ** pc_ptr,uint argc)495     vm_obj_id_t create_from_stack(VMG_ const uchar **pc_ptr, uint argc)
496         { return CVmObjGramProd::create_from_stack(vmg_ pc_ptr, argc); }
497 
498     /* call a static property */
call_stat_prop(VMG_ vm_val_t * result,const uchar ** pc_ptr,uint * argc,vm_prop_id_t prop)499     int call_stat_prop(VMG_ vm_val_t *result,
500                        const uchar **pc_ptr, uint *argc,
501                        vm_prop_id_t prop)
502     {
503         return CVmObjGramProd::
504             call_stat_prop(vmg_ result, pc_ptr, argc, prop);
505     }
506 };
507 
508 
509 #endif /* VMGRAM_H */
510 
511 /*
512  *   Register the class
513  */
514 VM_REGISTER_METACLASS(CVmObjGramProd)
515 
516