/*
 *   Copyright (c) 2002 by Michael J. Roberts.  All Rights Reserved.
 *
 *   Please see the accompanying license file, LICENSE.TXT, for information
 *   on using and copying this software.
 */
/*
Name
  vmstrcmp.h - T3 String Comparator intrinsic class
Function
  Defines the String Comparator intrinsic class, which provides native
  code that performs complex, parameterized string comparisons.  We offer
  the following customizable options for our comparisons:

  - We can match exactly on case, or without regard to case.

  - We can optionally match a value to a truncated reference value
  (which allows user input to use abbreviated forms of dictionary words, for
  example).  The minimum truncation length is a settable option.

  - We can use equivalence mappings that allow a given character in a
  reference string to match different characters in value strings.  For
  example, we could specify that an "a" with an acute accent in a reference
  string matches an unaccented "a" in a value string.  Each such mapping can
  specify result flag bits, so a caller can determine if particular
  equivalence mappings were used in making a match.

  This class implements the generic "comparator" interface, by providing
  a hash value calculator method and a value comparison method, so a String
  Comparator instance can be used as a Dictionary's comparator object.

  StringComparator objects are immutable; all of our parameters are set
  in the constructor.  This is desirable because it allows the object to be
  installed in a Dictionary (or any other hash table-based structure)
  without any danger that the hash table will need to be rebuilt as long as
  the same comparator is installed.
Notes

Modified
  09/01/02 MJRoberts  - Creation
*/

#ifndef VMSTRCMP_H
#define VMSTRCMP_H

#include <stdlib.h>
#include <os.h>
#include "vmtype.h"
#include "vmobj.h"
#include "vmglob.h"


/* ------------------------------------------------------------------------ */
/*
 *   Our serialized data stream, in both the image file and a saved file,
 *   consists of:
 *
 *   UINT2 truncation_length
 *.  UINT2 flags
 *.  UINT2 equivalence_mapping_count
 *.  UINT2 equivalence_total_value_chars
 *.  equivalence_mappings
 *
 *   The 'flags' value consists of the following combination of bit fields:
 *
 *   0x0001 - the match is case-sensitive
 *
 *   The 'equivalence_total_value_chars' gives the sum total of the value
 *   string characters in ALL of the equivalence mappings.  This value is
 *   stored simply to make it easier to calculate the memory allocation
 *   needs when loading this object.
 *
 *   Each 'equivalence_mapping' entry is arranged like this:
 *
 *   UINT2 reference_char
 *.  UBYTE value_char_count
 *.  UINT4 uc_result_flags
 *.  UINT4 lc_result_flags
 *.  UINT2 value_char[value_char_count]
 *
 *   Each character is given as a 16-bit Unicode value.  These values map
 *   directly to the corresponding vmobj_strcmp_equiv structure entries.
 */

/* ------------------------------------------------------------------------ */
/*
 *   Our in-memory extension.  This carries the three settings of a
 *   comparator: the truncation length, the case sensitivity flag, and the
 *   two-tiered equivalence mapping table.
 */
struct vmobj_strcmp_ext
{
    /*
     *   The truncation length for reference strings, or zero if no
     *   truncation is allowed.  This is the minimum length that we must
     *   match when the value string is shorter than the reference string.
     */
    size_t trunc_len;

    /*
     *   Case sensitivity.  If this is true (non-zero), then our matches
     *   are sensitive to case, which means that we must match each
     *   character exactly on case.  If this is false (zero), then our
     *   matches are insensitive to case, so we can match an upper-case
     *   letter to the corresponding lower-case letter.
     */
    int case_sensitive;

    /*
     *   Equivalence mapping table, giving the mapping for each "reference"
     *   string character.  This is a two-tiered array: the first tier
     *   (the 256 entries declared here) is indexed by the high-order 8
     *   bits of a reference character, and gives a pointer to the second
     *   tier array, or a null pointer if there is no mapping for any
     *   character with the given high-order 8 bits.  The second tier is
     *   indexed by the low-order 8 bits, and gives a pointer to the
     *   equivalence mapping structure for the character, or a null
     *   pointer if there is no mapping for the character.
     *
     *   So, for a reference character 'ch', the mapping (if any) is
     *   equiv[(ch >> 8) & 0xFF][ch & 0xFF], provided that the first-tier
     *   entry is non-null.
     */
    struct vmobj_strcmp_equiv **equiv[256];
};

/*
 *   Equivalence mapping entry.  Note that we don't store the reference
 *   character in a mapping structure, because we can only reach these
 *   mapping structures by indexing the mapping array with the reference
 *   character, and thus must always already know the reference character
 *   before we can even reach one of these.
 */
struct vmobj_strcmp_equiv
{
    /*
     *   String of value characters matching this reference character:
     *   'val_ch' points to 'val_ch_cnt' characters.
     */
    size_t val_ch_cnt;
    wchar_t *val_ch;

    /*
     *   Additive result flags for upper-case input matches: this value is
     *   bitwise-OR'd into the result code when this equivalence mapping is
     *   used to match the value to an upper-case input letter.
     */
    unsigned long uc_result_flags;

    /* additive result flags for lower-case input matches */
    unsigned long lc_result_flags;
};

145 /* ------------------------------------------------------------------------ */
146 /*
147  *   String Comparator intrinsic class
148  */
149 class CVmObjStrComp: public CVmObject
150 {
151     friend class CVmMetaclassStrComp;
152 
153 public:
154     /* metaclass registration object */
155     static class CVmMetaclass *metaclass_reg_;
get_metaclass_reg()156     class CVmMetaclass *get_metaclass_reg() const { return metaclass_reg_; }
157 
158     /* am I of the given metaclass? */
is_of_metaclass(class CVmMetaclass * meta)159     virtual int is_of_metaclass(class CVmMetaclass *meta) const
160     {
161         /* try my own metaclass and my base class */
162         return (meta == metaclass_reg_
163                 || CVmObject::is_of_metaclass(meta));
164     }
165 
166     /* am I a StringComparator object? */
is_strcmp_obj(VMG_ vm_obj_id_t obj)167     static int is_strcmp_obj(VMG_ vm_obj_id_t obj)
168         { return vm_objp(vmg_ obj)->is_of_metaclass(metaclass_reg_); }
169 
170     /* create dynamically using stack arguments */
171     static vm_obj_id_t create_from_stack(VMG_ const uchar **pc_ptr,
172                                          uint argc);
173 
174     /*
175      *   call a static property - we don't have any of our own, so simply
176      *   "inherit" the base class handling
177      */
call_stat_prop(VMG_ vm_val_t * result,const uchar ** pc_ptr,uint * argc,vm_prop_id_t prop)178     static int call_stat_prop(VMG_ vm_val_t *result,
179                               const uchar **pc_ptr, uint *argc,
180                               vm_prop_id_t prop)
181     {
182         /* defer to our base class */
183         return CVmObject::call_stat_prop(vmg_ result, pc_ptr, argc, prop);
184     }
185 
186     /* notify of deletion */
187     void notify_delete(VMG_ int in_root_set);
188 
189     /* set a property */
190     void set_prop(VMG_ class CVmUndo *undo,
191                   vm_obj_id_t self, vm_prop_id_t prop, const vm_val_t *val);
192 
193     /* get a property */
194     int get_prop(VMG_ vm_prop_id_t prop, vm_val_t *val,
195                  vm_obj_id_t self, vm_obj_id_t *source_obj, uint *argc);
196 
197     /* undo operations - we are immutable and hence keep no undo */
notify_new_savept()198     void notify_new_savept() { }
apply_undo(VMG_ struct CVmUndoRecord *)199     void apply_undo(VMG_ struct CVmUndoRecord *) { }
mark_undo_ref(VMG_ struct CVmUndoRecord *)200     void mark_undo_ref(VMG_ struct CVmUndoRecord *) { }
remove_stale_undo_weak_ref(VMG_ struct CVmUndoRecord *)201     void remove_stale_undo_weak_ref(VMG_ struct CVmUndoRecord *) { }
202 
203     /* we reference no other objects */
mark_refs(VMG_ uint)204     void mark_refs(VMG_ uint) { }
remove_stale_weak_refs(VMG0_)205     void remove_stale_weak_refs(VMG0_) { }
206 
207     /* load from an image file */
208     void load_from_image(VMG_ vm_obj_id_t, const char *ptr, size_t);
209 
210     /* rebuild for image file */
211     virtual ulong rebuild_image(VMG_ char *buf, ulong buflen);
212 
213     /* save to a file */
214     void save_to_file(VMG_ class CVmFile *fp);
215 
216     /* restore from a file */
217     void restore_from_file(VMG_ vm_obj_id_t self,
218                            class CVmFile *fp, class CVmObjFixup *fixup);
219 
220     /*
221      *   Direct Interface.  These functions correspond to methods we expose
222      *   through the get_prop() interface, but can be called directly from
223      *   the C++ code of other intrinsic classes (such as Dictionary) to
224      *   avoid the overhead of going through the get_prop() mechanism.
225      *   These are virtual to allow derived intrinsic classes to override
226      *   the implementation of the public VM-visible interface.
227      */
228 
229     /* calculate a hash value for a constant string */
230     virtual unsigned int calc_hash(const char *str, size_t len);
231 
232     /* match two strings */
233     virtual unsigned long match_strings(const char *valstr, size_t vallen,
234                                         const char *refstr, size_t reflen);
235 
236 protected:
237     /* create with no extension */
CVmObjStrComp()238     CVmObjStrComp() { ext_ = 0; }
239 
240     /* delete my extension */
241     void delete_ext(VMG0_);
242 
243     /* get my extension data */
get_ext()244     vmobj_strcmp_ext *get_ext() const { return (vmobj_strcmp_ext *)ext_; }
245 
246     /* load from an abstact stream object */
247     void load_from_stream(VMG_ class CVmStream *str);
248 
249     /*
250      *   Write to an abstract stream object.  Returns the number of bytes
251      *   actually needed to store the object.
252      *
253      *   If 'bytes_avail' is non-null, it indicates the maximum number of
254      *   bytes available for writing; if we need more than this amount, we
255      *   won't write anything at all, but will simply return the number of
256      *   bytes we actually need.
257      */
258     ulong write_to_stream(VMG_ class CVmStream *str, ulong *bytes_avail);
259 
260     /* allocate and initialize our extension */
261     void alloc_ext(VMG_ size_t trunc_len, int case_sensitive,
262                    size_t equiv_cnt, size_t total_chars,
263                    class CVmObjStrCompMapReader *reader);
264 
265     /* count of equivalence mappings */
266     void count_equiv_mappings(size_t *equiv_cnt, size_t *total_ch_cnt);
267 
268     /* property evaluator - undefined property */
getp_undef(VMG_ vm_obj_id_t,vm_val_t *,uint *)269     int getp_undef(VMG_ vm_obj_id_t, vm_val_t *, uint *) { return FALSE; }
270 
271     /* property evaluator - calculate a hash value */
272     int getp_calc_hash(VMG_ vm_obj_id_t, vm_val_t *val, uint *argc);
273 
274     /* property evaluator - match two values */
275     int getp_match_values(VMG_ vm_obj_id_t, vm_val_t *val, uint *argc);
276 
277     /* property evaluation function table */
278     static int (CVmObjStrComp::*func_table_[])(VMG_ vm_obj_id_t self,
279                                                vm_val_t *retval, uint *argc);
280 };
281 
282 /* ------------------------------------------------------------------------ */
283 /*
284  *   Registration table object
285  */
286 class CVmMetaclassStrComp: public CVmMetaclass
287 {
288 public:
289     /* get the global name */
get_meta_name()290     const char *get_meta_name() const { return "string-comparator/030000"; }
291 
292     /* create from image file */
create_for_image_load(VMG_ vm_obj_id_t id)293     void create_for_image_load(VMG_ vm_obj_id_t id)
294     {
295         new (vmg_ id) CVmObjStrComp();
296         G_obj_table->set_obj_gc_characteristics(id, FALSE, FALSE);
297     }
298 
299     /* create from restoring from saved state */
create_for_restore(VMG_ vm_obj_id_t id)300     void create_for_restore(VMG_ vm_obj_id_t id)
301     {
302         new (vmg_ id) CVmObjStrComp();
303         G_obj_table->set_obj_gc_characteristics(id, FALSE, FALSE);
304     }
305 
306     /* create dynamically using stack arguments */
create_from_stack(VMG_ const uchar ** pc_ptr,uint argc)307     vm_obj_id_t create_from_stack(VMG_ const uchar **pc_ptr, uint argc)
308         { return CVmObjStrComp::create_from_stack(vmg_ pc_ptr, argc); }
309 
310     /* call a static property */
call_stat_prop(VMG_ vm_val_t * result,const uchar ** pc_ptr,uint * argc,vm_prop_id_t prop)311     int call_stat_prop(VMG_ vm_val_t *result,
312                        const uchar **pc_ptr, uint *argc,
313                        vm_prop_id_t prop)
314     {
315         return CVmObjStrComp::call_stat_prop(vmg_ result, pc_ptr, argc, prop);
316     }
317 };
318 
319 #endif /* VMSTRCMP_H */
320 
321 /*
322  *   Register the class
323  */
324 VM_REGISTER_METACLASS(CVmObjStrComp)
325 
326