1 /* $Header$ */ 2 3 /* 4 * Copyright (c) 2000, 2002 Michael J. Roberts. All Rights Reserved. 5 * 6 * Please see the accompanying license file, LICENSE.TXT, for information 7 * on using and copying this software. 8 */ 9 /* 10 Name 11 vmgram.h - T3 grammar-production metaclass 12 Function 13 14 Notes 15 16 Modified 17 02/15/00 MJRoberts - Creation 18 */ 19 20 #ifndef VMGRAM_H 21 #define VMGRAM_H 22 23 #include <stdlib.h> 24 #include <string.h> 25 26 #include "os.h" 27 #include "t3std.h" 28 #include "vmtype.h" 29 #include "vmglob.h" 30 #include "vmobj.h" 31 32 /* ------------------------------------------------------------------------ */ 33 /* 34 * intrinsic function vector indices 35 */ 36 enum vmobjgram_meta_fnset 37 { 38 /* undefined function */ 39 VMOBJGRAM_UNDEF = 0, 40 41 /* parseTokens(tokenList, dict) */ 42 VMOBJGRAM_PARSE = 1 43 }; 44 45 /* ------------------------------------------------------------------------ */ 46 /* 47 * Match types 48 */ 49 enum vmgram_match_type 50 { 51 /* production - matches a sub-production */ 52 VMGRAM_MATCH_PROD = 1, 53 54 /* 55 * part of speech - matches a word that appears in the dictionary 56 * under a particular part of speech 57 */ 58 VMGRAM_MATCH_SPEECH = 2, 59 60 /* literal - matches a literal string */ 61 VMGRAM_MATCH_LITERAL = 3, 62 63 /* token type - matches any token of a given type */ 64 VMGRAM_MATCH_TOKTYPE = 4, 65 66 /* star - matches all remaining input tokens */ 67 VMGRAM_MATCH_STAR = 5, 68 69 /* 70 * N parts of speech - matches a word that appears in the dictionary 71 * under any of a set of N parts of speech 72 */ 73 VMGRAM_MATCH_NSPEECH = 6 74 }; 75 76 /* ------------------------------------------------------------------------ */ 77 /* 78 * Grammar production object - image file format 79 * 80 * UINT2 alt_count 81 *. alternative 1 82 *. alternative 2 83 *. etc 84 * 85 * Each alternative has the following structure: 86 * 87 *. INT2 score 88 *. INT2 badness 89 *. UINT4 processor_object_id 90 *. UINT2 token_count 91 *. token 1 92 *. token 2 93 *. etc 94 * 95 * Each token has this structure: 96 * 97 * UINT2 property_association 98 *. BYTE token_match_type (see below) 99 *. extra data depending on token_match_type (see below) 100 * 101 * The extra data for the token varies by match type: 102 * 103 * VMGRAM_MATCH_PROD - a UINT4 giving the production object ID 104 * 105 * VMGRAM_MATCH_SPEECH - a UINT2 giving the vocabulary property 106 * 107 * VMGRAM_MATCH_NSPEECH - a UINT2 giving a count, then that many 108 * additional UINT2's giving a list of vocabulary properties 109 * 110 * VMGRAM_MATCH_LITERAL - a UINT2 byte-length prefix followed by the 111 * UTF8-encoded bytes of the literal string 112 * 113 * VMGRAM_MATCH_TOKTYPE - a UINT4 giving the token enum's ID 114 * 115 * VMGRAM_MATCH_STAR - no additional data 116 */ 117 118 /* pull the various parts out of an alternative byte stream */ 119 #define vmgram_alt_score(p) osrp2(p) 120 #define vmgram_alt_badness(p) osrp2((p) + 2) 121 #define vmgram_alt_procobj(p) ((vm_obj_id_t)osrp4((p) + 4)) 122 #define vmgram_alt_tokcnt(p) osrp2((p) + 8) 123 #define vmgram_alt_tokptr(p) ((p) + 10) 124 125 /* pull the header parts out of a token in an alternative */ 126 #define vmgram_tok_prop(p) ((vm_prop_id_t)osrp2(p)) 127 #define vmgram_tok_type(p) (*((p) + 2)) 128 129 /* pull the production object from a VMGRAM_MATCH_PROD token */ 130 #define vmgram_tok_prod_obj(p) ((vm_obj_id_t)osrp4((p) + 3)) 131 132 /* pull the part-of-speech property from a VMGRAM_MATCH_SPEECH token */ 133 #define vmgram_tok_voc_prop(p) ((vm_prop_id_t)osrp2((p) + 3)) 134 135 /* pull the literal length/text from a VMGRAM_MATCH_LITERAL token */ 136 #define vmgram_tok_lit_len(p) osrp2((p) + 3) 137 #define vmgram_tok_lit_txt(p) ((p) + 5) 138 139 /* pull the enum from a VMGRAM_MATCH_TOKTYPE token */ 140 #define vmgram_tok_tok_enum(p) ((ulong)osrp4((p) + 3)) 141 142 /* pull the count/nth property from a VMGRAM_MATCH_NSPEECH token */ 143 #define vmgram_tok_vocn_cnt(p) osrp2((p) + 3) 144 #define vmgram_tok_vocn_prop(p, n) osrp2((p) + 5 + (n)*2) 145 146 /* get the size of a token of the given type */ 147 #define VMGRAM_TOK_PROD_SIZE (3 + 4) 148 #define VMGRAM_TOK_SPEECH_SIZE (3 + 2) 149 #define VMGRAM_TOK_LIT_SIZE(p) (3 + 2 + vmgram_tok_lit_len(p)) 150 #define VMGRAM_TOK_TYPE_SIZE (3 + 4) 151 #define VMGRAM_TOK_STAR_SIZE (3 + 0) 152 #define VMGRAM_TOK_NSPEECH_SIZE(p) (3 + 2 + vmgram_tok_vocn_cnt(p)*2) 153 154 /* property/match result enumeration entry */ 155 struct vmgram_match_info 156 { 157 vm_prop_id_t prop; 158 }; 159 160 /* 161 * Grammar production object extension 162 */ 163 struct vm_gram_ext 164 { 165 /* pointer to load image data, if any */ 166 const char *image_data_; 167 size_t image_data_size_; 168 169 /* 170 * The last comparator object we used to calculate hash values for 171 * literals. Each time we need literal hash values, we'll check to see 172 * if we are using the same comparator we were last time; if so, we'll 173 * use the cached hash values, otherwise we'll recalculate them. We 174 * reference this object weakly. 175 */ 176 vm_obj_id_t comparator_; 177 178 /* flag: we've cached hash values for our literals */ 179 uint hashes_cached_ : 1; 180 181 /* 182 * flag: there's at least one circular rule among my rules (i.e., 183 * there's a rule whose first element is a self-reference 184 * subproduction) 185 */ 186 uint has_circular_alt : 1; 187 188 /* private memory pool - we use this to make allocation cheaper */ 189 class CVmGramProdMem *mem_; 190 191 /* 192 * Property list enumeration space. We use this to build a list of 193 * properties for which a dictionary word is defined. We'll expand 194 * this list as needed when we find we need more space. 195 */ 196 vmgram_match_info *prop_enum_arr_; 197 size_t prop_enum_max_; 198 199 /* array of rule alternatives */ 200 struct vmgram_alt_info *alts_; 201 size_t alt_cnt_; 202 }; 203 204 /* 205 * Alternative object. Each of these objects represents one of our rule 206 * alternatives. 207 */ 208 struct vmgram_alt_info 209 { 210 /* the alternative's score and badness values */ 211 int score; 212 int badness; 213 214 /* 215 * the "processor object" for this alternative - this is the class we 216 * instantiate to represent a match to the alternative 217 */ 218 vm_obj_id_t proc_obj; 219 220 /* array of token elements in the alternative */ 221 struct vmgram_tok_info *toks; 222 size_t tok_cnt; 223 }; 224 225 /* 226 * Grammar rule token entry. This represents a token in a grammar rule. 227 */ 228 struct vmgram_tok_info 229 { 230 /* 231 * property association - this is the property of the processor object 232 * that we'll set to point to the match object or input token if we 233 * match this rule token 234 */ 235 vm_prop_id_t prop; 236 237 /* token type - this is a VMGRAM_MATCH_xxx value */ 238 uchar typ; 239 240 /* extra data, depending on 'typ' */ 241 union 242 { 243 /* VMGRAM_MATCH_PROD - the sub-production object */ 244 vm_obj_id_t prod_obj; 245 246 /* VMGRAM_MATCH_SPEECH - the part-of-speech property */ 247 vm_prop_id_t speech_prop; 248 249 /* VMGRAM_MATCH_NSPEECH - an array of part-of-speech proeprties */ 250 struct 251 { 252 size_t cnt; 253 vm_prop_id_t *props; 254 } nspeech; 255 256 /* VMGRAM_MATCH_LITERAL - the literal string to match */ 257 struct 258 { 259 /* the literal text and its length */ 260 char *str; 261 size_t len; 262 263 /* the cached hash value for the literal */ 264 uint hash; 265 } lit; 266 267 /* VMGRAM_MATCH_TOKTYPE - token type enum */ 268 uint32 toktyp_enum; 269 270 } typinfo; 271 }; 272 273 /* ------------------------------------------------------------------------ */ 274 /* 275 * Grammar-Production object interface 276 */ 277 class CVmObjGramProd: public CVmObject 278 { 279 friend class CVmMetaclassGramProd; 280 281 public: 282 /* metaclass registration object */ 283 static class CVmMetaclass *metaclass_reg_; get_metaclass_reg()284 class CVmMetaclass *get_metaclass_reg() const { return metaclass_reg_; } 285 286 /* am I of the given metaclass? */ is_of_metaclass(class CVmMetaclass * meta)287 virtual int is_of_metaclass(class CVmMetaclass *meta) const 288 { 289 /* try my own metaclass and my base class */ 290 return (meta == metaclass_reg_ 291 || CVmObject::is_of_metaclass(meta)); 292 } 293 294 /* create dynamically using stack arguments */ 295 static vm_obj_id_t create_from_stack(VMG_ const uchar **pc_ptr, 296 uint argc); 297 298 /* 299 * call a static property - we don't have any of our own, so simply 300 * "inherit" the base class handling 301 */ call_stat_prop(VMG_ vm_val_t * result,const uchar ** pc_ptr,uint * argc,vm_prop_id_t prop)302 static int call_stat_prop(VMG_ vm_val_t *result, 303 const uchar **pc_ptr, uint *argc, 304 vm_prop_id_t prop) 305 { return CVmObject::call_stat_prop(vmg_ result, pc_ptr, argc, prop); } 306 307 /* determine if an object is a GrammarProduction object */ is_gramprod_obj(VMG_ vm_obj_id_t obj)308 static int is_gramprod_obj(VMG_ vm_obj_id_t obj) 309 { return vm_objp(vmg_ obj)->is_of_metaclass(metaclass_reg_); } 310 311 /* notify of deletion */ 312 void notify_delete(VMG_ int in_root_set); 313 314 /* get a property */ 315 int get_prop(VMG_ vm_prop_id_t prop, vm_val_t *val, 316 vm_obj_id_t self, vm_obj_id_t *source_obj, uint *argc); 317 318 /* set a property */ 319 void set_prop(VMG_ class CVmUndo *undo, 320 vm_obj_id_t self, vm_prop_id_t prop, const vm_val_t *val); 321 322 /* receive notification of a new undo savepoint */ notify_new_savept()323 void notify_new_savept() { } 324 325 /* apply undo */ apply_undo(VMG_ struct CVmUndoRecord *)326 void apply_undo(VMG_ struct CVmUndoRecord *) { } 327 328 /* discard additional information associated with an undo record */ discard_undo(VMG_ struct CVmUndoRecord *)329 void discard_undo(VMG_ struct CVmUndoRecord *) { } 330 331 /* mark a reference in an undo record */ mark_undo_ref(VMG_ struct CVmUndoRecord *)332 void mark_undo_ref(VMG_ struct CVmUndoRecord *) { } 333 334 /* remove stale weak references from an undo record */ remove_stale_undo_weak_ref(VMG_ struct CVmUndoRecord *)335 void remove_stale_undo_weak_ref(VMG_ struct CVmUndoRecord *) { } 336 337 /* 338 * mark references - we can only reference root-set objects (since 339 * we cannot be modified during execution), hence we don't need to 340 * mark anything here 341 */ mark_refs(VMG_ uint)342 void mark_refs(VMG_ uint) { } 343 344 /* remove weak references */ 345 void remove_stale_weak_refs(VMG0_); 346 347 /* load from an image file */ 348 void load_from_image(VMG_ vm_obj_id_t self, const char *ptr, size_t siz); 349 350 /* 351 * restore to image file state/save/restore - we can't change at 352 * run-time, so there's nothing to save or load 353 */ reset_to_image(VMG_ vm_obj_id_t)354 void reset_to_image(VMG_ vm_obj_id_t /*self*/) { } save_to_file(VMG_ class CVmFile *)355 void save_to_file(VMG_ class CVmFile *) { } restore_from_file(VMG_ vm_obj_id_t self,class CVmFile *,class CVmObjFixup *)356 void restore_from_file(VMG_ vm_obj_id_t self, 357 class CVmFile *, class CVmObjFixup *) { } 358 359 /* determine if the object has been changed since it was loaded */ is_changed_since_load()360 int is_changed_since_load() const { return FALSE; } 361 362 /* 363 * rebuild for image file - we can't change during execution, so our 364 * image file data never change 365 */ 366 virtual ulong rebuild_image(VMG_ char *buf, ulong buflen); 367 368 /* convert to constant data */ convert_to_const_data(VMG_ class CVmConstMapper *,vm_obj_id_t)369 virtual void convert_to_const_data(VMG_ class CVmConstMapper *, 370 vm_obj_id_t) { } 371 372 protected: 373 /* private constructor */ 374 CVmObjGramProd(VMG0_); 375 376 /* property evaluation - undefined property */ getp_undef(VMG_ vm_obj_id_t,vm_val_t *,uint *)377 int getp_undef(VMG_ vm_obj_id_t, vm_val_t *, uint *) { return FALSE; } 378 379 /* property evaluation - parseTokens */ 380 int getp_parse(VMG_ vm_obj_id_t self, vm_val_t *val, uint *argc); 381 382 /* get my extension, properly cast */ get_ext()383 vm_gram_ext *get_ext() const { return (vm_gram_ext *)ext_; } 384 385 /* callback for dictionary word property enumeration */ 386 static void enum_props_cb(VMG_ void *ctx, vm_prop_id_t prop, 387 const vm_val_t *match_val); 388 389 /* search a token for a match to the given vocabulary property */ 390 static int find_prop_in_tok(const struct vmgramprod_tok *tok, 391 vm_prop_id_t prop); 392 393 /* get the next token in an alternative */ 394 static const char *get_next_alt_tok(const char *tokp); 395 396 /* enqueue our alternatives */ 397 void enqueue_alts(VMG_ class CVmGramProdMem *mem, 398 const struct vmgramprod_tok *tok, 399 size_t tok_cnt, size_t start_tok_pos, 400 struct CVmGramProdState *state, 401 struct CVmGramProdQueue *queues, 402 vm_obj_id_t self, int circ_only, 403 struct CVmGramProdMatch *circ_match, 404 class CVmObjDict *dict); 405 406 /* create and enqueue a new state */ 407 static struct CVmGramProdState * 408 enqueue_new_state(class CVmGramProdMem *mem, 409 size_t start_tok_pos, 410 struct CVmGramProdState *enclosing_state, 411 const vmgram_alt_info *altp, vm_obj_id_t self, 412 int *need_to_clone, 413 struct CVmGramProdQueue *queues, 414 int circular_alt); 415 416 /* create a new state */ 417 static struct CVmGramProdState * 418 create_new_state(class CVmGramProdMem *mem, 419 size_t start_tok_pos, 420 struct CVmGramProdState *enclosing_state, 421 const vmgram_alt_info *altp, vm_obj_id_t self, 422 int *need_to_clone, int circular_alt); 423 424 /* enqueue a state */ 425 static void enqueue_state(struct CVmGramProdState *state, 426 struct CVmGramProdQueue *queues); 427 428 /* process the work queue */ 429 static void process_work_queue(VMG_ CVmGramProdMem *mem, 430 const struct vmgramprod_tok *tok, 431 size_t tok_cnt, 432 struct CVmGramProdQueue *queues, 433 class CVmObjDict *dict); 434 435 436 /* process the first work queue entry */ 437 static void process_work_queue_head(VMG_ CVmGramProdMem *mem, 438 const struct vmgramprod_tok *tok, 439 size_t tok_cnt, 440 struct CVmGramProdQueue *queues, 441 class CVmObjDict *dict); 442 443 /* build a match tree */ 444 static void build_match_tree(VMG_ const struct CVmGramProdMatch *match, 445 const vm_val_t *tok_list, 446 const vm_val_t *tok_match_list, 447 vm_val_t *retval, 448 size_t *first_tok, size_t *last_tok); 449 450 /* cache the hash values for the literal tokens in our alternatives */ 451 void cache_hashes(VMG_ CVmObjDict *dict); 452 453 /* calculate the hash value for a literal string */ 454 static unsigned int calc_hash(VMG_ class CVmObjDict *dict, 455 const vm_val_t *strval, 456 const char *str, size_t len); 457 458 /* check to see if a token matches a literal */ 459 static int tok_equals_lit(VMG_ const struct vmgramprod_tok *tok, 460 const char *lit, size_t lit_len, 461 class CVmObjDict *dict, 462 vm_val_t *match_result); 463 464 /* property evaluation function table */ 465 static int (CVmObjGramProd::*func_table_[])(VMG_ vm_obj_id_t self, 466 vm_val_t *retval, uint *argc); 467 }; 468 469 470 /* ------------------------------------------------------------------------ */ 471 /* 472 * Registration table object 473 */ 474 class CVmMetaclassGramProd: public CVmMetaclass 475 { 476 public: 477 /* get the global name */ get_meta_name()478 const char *get_meta_name() const { return "grammar-production/030000"; } 479 480 /* create from image file */ create_for_image_load(VMG_ vm_obj_id_t id)481 void create_for_image_load(VMG_ vm_obj_id_t id) 482 { 483 new (vmg_ id) CVmObjGramProd(vmg0_); 484 G_obj_table->set_obj_gc_characteristics(id, FALSE, TRUE); 485 } 486 487 /* create from restoring from saved state */ create_for_restore(VMG_ vm_obj_id_t id)488 void create_for_restore(VMG_ vm_obj_id_t id) 489 { 490 new (vmg_ id) CVmObjGramProd(vmg0_); 491 G_obj_table->set_obj_gc_characteristics(id, FALSE, TRUE); 492 } 493 494 /* create dynamically using stack arguments */ create_from_stack(VMG_ const uchar ** pc_ptr,uint argc)495 vm_obj_id_t create_from_stack(VMG_ const uchar **pc_ptr, uint argc) 496 { return CVmObjGramProd::create_from_stack(vmg_ pc_ptr, argc); } 497 498 /* call a static property */ call_stat_prop(VMG_ vm_val_t * result,const uchar ** pc_ptr,uint * argc,vm_prop_id_t prop)499 int call_stat_prop(VMG_ vm_val_t *result, 500 const uchar **pc_ptr, uint *argc, 501 vm_prop_id_t prop) 502 { 503 return CVmObjGramProd:: 504 call_stat_prop(vmg_ result, pc_ptr, argc, prop); 505 } 506 }; 507 508 509 #endif /* VMGRAM_H */ 510 511 /* 512 * Register the class 513 */ 514 VM_REGISTER_METACLASS(CVmObjGramProd) 515 516