1 /*
2  *  Heap string representation.
3  *
4  *  Strings are byte sequences ordinarily stored in extended UTF-8 format,
5  *  allowing values larger than the official UTF-8 range (used internally)
6  *  and also allowing UTF-8 encoding of surrogate pairs (CESU-8 format).
7  *  Strings may also be invalid UTF-8 altogether which is the case e.g. with
8  *  strings used as internal property names and raw buffers converted to
9  *  strings.  In such cases the 'clen' field contains an inaccurate value.
10  *
11  *  ECMAScript requires support for 32-bit long strings.  However, since each
12  *  16-bit codepoint can take 3 bytes in CESU-8, this representation can only
13  *  support about 1.4G codepoint long strings in extreme cases.  This is not
14  *  really a practical issue.
15  */
16 
17 #if !defined(DUK_HSTRING_H_INCLUDED)
18 #define DUK_HSTRING_H_INCLUDED
19 
20 /* Impose a maximum string length for now.  Restricted artificially to
21  * ensure adding a heap header length won't overflow size_t.  The limit
22  * should be synchronized with DUK_HBUFFER_MAX_BYTELEN.
23  *
24  * E5.1 makes provisions to support strings longer than 4G characters.
25  * This limit should be eliminated on 64-bit platforms (and increased
26  * closer to maximum support on 32-bit platforms).
27  */
28 
/* Upper bound for string byte length: the full 16-bit range when
 * DUK_USE_STRLEN16 is enabled, otherwise 0x7fffffff.
 */
#if !defined(DUK_USE_STRLEN16)
#define DUK_HSTRING_MAX_BYTELEN                     (0x7fffffffUL)
#else
#define DUK_HSTRING_MAX_BYTELEN                     (0x0000ffffUL)
#endif
34 
35 /* XXX: could add flags for "is valid CESU-8" (ECMAScript compatible strings),
36  * "is valid UTF-8", "is valid extended UTF-8" (internal strings are not,
37  * regexp bytecode is), and "contains non-BMP characters".  These are not
38  * needed right now.
39  */
40 
/* String specific header flags, taken from the duk_heaphdr user flag range.
 *
 * With lowmem builds the high 16 bits of duk_heaphdr are used for other
 * purposes, so this leaves 7 duk_heaphdr flags and 9 duk_hstring flags.
 */

/* String is ASCII, clen == blen. */
#define DUK_HSTRING_FLAG_ASCII \
	DUK_HEAPHDR_USER_FLAG(0)

/* String is a valid array index. */
#define DUK_HSTRING_FLAG_ARRIDX \
	DUK_HEAPHDR_USER_FLAG(1)

/* String is a symbol (invalid utf-8). */
#define DUK_HSTRING_FLAG_SYMBOL \
	DUK_HEAPHDR_USER_FLAG(2)

/* String is a hidden symbol (implies symbol, Duktape 1.x internal string). */
#define DUK_HSTRING_FLAG_HIDDEN \
	DUK_HEAPHDR_USER_FLAG(3)

/* String is a reserved word (non-strict). */
#define DUK_HSTRING_FLAG_RESERVED_WORD \
	DUK_HEAPHDR_USER_FLAG(4)

/* String is a reserved word (strict). */
#define DUK_HSTRING_FLAG_STRICT_RESERVED_WORD \
	DUK_HEAPHDR_USER_FLAG(5)

/* String is 'eval' or 'arguments'. */
#define DUK_HSTRING_FLAG_EVAL_OR_ARGUMENTS \
	DUK_HEAPHDR_USER_FLAG(6)

/* String data is external (duk_hstring_external). */
#define DUK_HSTRING_FLAG_EXTDATA \
	DUK_HEAPHDR_USER_FLAG(7)

/* String is a literal, and pinned. */
#define DUK_HSTRING_FLAG_PINNED_LITERAL \
	DUK_HEAPHDR_USER_FLAG(8)
53 
/* Flag accessors, grouped per flag.  Each HAS/SET/CLEAR macro forwards the
 * string's heap header and the matching DUK_HSTRING_FLAG_xxx value to the
 * DUK_HEAPHDR_{CHECK,SET,CLEAR}_FLAG_BITS() helper.
 */

/* ASCII */
#define DUK_HSTRING_HAS_ASCII(x)                    DUK_HEAPHDR_CHECK_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_ASCII)
#define DUK_HSTRING_SET_ASCII(x)                    DUK_HEAPHDR_SET_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_ASCII)
#define DUK_HSTRING_CLEAR_ASCII(x)                  DUK_HEAPHDR_CLEAR_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_ASCII)

/* ARRIDX */
#define DUK_HSTRING_HAS_ARRIDX(x)                   DUK_HEAPHDR_CHECK_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_ARRIDX)
#define DUK_HSTRING_SET_ARRIDX(x)                   DUK_HEAPHDR_SET_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_ARRIDX)
#define DUK_HSTRING_CLEAR_ARRIDX(x)                 DUK_HEAPHDR_CLEAR_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_ARRIDX)

/* SYMBOL */
#define DUK_HSTRING_HAS_SYMBOL(x)                   DUK_HEAPHDR_CHECK_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_SYMBOL)
#define DUK_HSTRING_SET_SYMBOL(x)                   DUK_HEAPHDR_SET_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_SYMBOL)
#define DUK_HSTRING_CLEAR_SYMBOL(x)                 DUK_HEAPHDR_CLEAR_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_SYMBOL)

/* HIDDEN */
#define DUK_HSTRING_HAS_HIDDEN(x)                   DUK_HEAPHDR_CHECK_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_HIDDEN)
#define DUK_HSTRING_SET_HIDDEN(x)                   DUK_HEAPHDR_SET_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_HIDDEN)
#define DUK_HSTRING_CLEAR_HIDDEN(x)                 DUK_HEAPHDR_CLEAR_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_HIDDEN)

/* RESERVED_WORD */
#define DUK_HSTRING_HAS_RESERVED_WORD(x)            DUK_HEAPHDR_CHECK_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_RESERVED_WORD)
#define DUK_HSTRING_SET_RESERVED_WORD(x)            DUK_HEAPHDR_SET_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_RESERVED_WORD)
#define DUK_HSTRING_CLEAR_RESERVED_WORD(x)          DUK_HEAPHDR_CLEAR_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_RESERVED_WORD)

/* STRICT_RESERVED_WORD */
#define DUK_HSTRING_HAS_STRICT_RESERVED_WORD(x)     DUK_HEAPHDR_CHECK_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_STRICT_RESERVED_WORD)
#define DUK_HSTRING_SET_STRICT_RESERVED_WORD(x)     DUK_HEAPHDR_SET_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_STRICT_RESERVED_WORD)
#define DUK_HSTRING_CLEAR_STRICT_RESERVED_WORD(x)   DUK_HEAPHDR_CLEAR_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_STRICT_RESERVED_WORD)

/* EVAL_OR_ARGUMENTS */
#define DUK_HSTRING_HAS_EVAL_OR_ARGUMENTS(x)        DUK_HEAPHDR_CHECK_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_EVAL_OR_ARGUMENTS)
#define DUK_HSTRING_SET_EVAL_OR_ARGUMENTS(x)        DUK_HEAPHDR_SET_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_EVAL_OR_ARGUMENTS)
#define DUK_HSTRING_CLEAR_EVAL_OR_ARGUMENTS(x)      DUK_HEAPHDR_CLEAR_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_EVAL_OR_ARGUMENTS)

/* EXTDATA */
#define DUK_HSTRING_HAS_EXTDATA(x)                  DUK_HEAPHDR_CHECK_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_EXTDATA)
#define DUK_HSTRING_SET_EXTDATA(x)                  DUK_HEAPHDR_SET_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_EXTDATA)
#define DUK_HSTRING_CLEAR_EXTDATA(x)                DUK_HEAPHDR_CLEAR_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_EXTDATA)

/* PINNED_LITERAL */
#define DUK_HSTRING_HAS_PINNED_LITERAL(x)           DUK_HEAPHDR_CHECK_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_PINNED_LITERAL)
#define DUK_HSTRING_SET_PINNED_LITERAL(x)           DUK_HEAPHDR_SET_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_PINNED_LITERAL)
#define DUK_HSTRING_CLEAR_PINNED_LITERAL(x)         DUK_HEAPHDR_CLEAR_FLAG_BITS(&(x)->hdr, DUK_HSTRING_FLAG_PINNED_LITERAL)
83 
/* ASCII check reads the explicit ASCII flag, which is lazily set!
 *
 * Comparing DUK_HSTRING_GET_BYTELEN(x) against DUK_HSTRING_GET_CHARLEN(x)
 * instead would give slightly smaller code without an explicit flag, but
 * the explicit flag is very useful when 'clen' is dropped.
 */
#define DUK_HSTRING_IS_ASCII(x)                     DUK_HSTRING_HAS_ASCII(x)

/* Non-zero when the string's byte length is zero. */
#define DUK_HSTRING_IS_EMPTY(x)                     (0 == DUK_HSTRING_GET_BYTELEN(x))
91 
/* String hash accessors.
 *
 * With DUK_USE_STRHASH16 the hash occupies the upper 16 bits of the heap
 * header flags field; the lower 16 bits are preserved on update.  Otherwise
 * the hash is stored in a dedicated 'hash' field.
 */
#if defined(DUK_USE_STRHASH16)
#define DUK_HSTRING_GET_HASH(x)                     ((x)->hdr.h_flags >> 16)
/* The value is widened to an unsigned 32-bit type before shifting: if 'v'
 * were a 16-bit or 'int' typed expression, integer promotion would make
 * '(v) << 16' a signed shift which overflows (undefined behavior) whenever
 * bit 15 of the hash is set.
 */
#define DUK_HSTRING_SET_HASH(x,v) do { \
		(x)->hdr.h_flags = ((x)->hdr.h_flags & 0x0000ffffUL) | (((duk_uint32_t) (v)) << 16); \
	} while (0)
#else
#define DUK_HSTRING_GET_HASH(x)                     ((x)->hash)
#define DUK_HSTRING_SET_HASH(x,v) do { \
		(x)->hash = (v); \
	} while (0)
#endif
103 
/* Character length is read through duk_hstring_get_charlen() in every
 * configuration, so the getter is shared by all variants below.
 */
#define DUK_HSTRING_GET_CHARLEN(x)                  duk_hstring_get_charlen((x))

#if !defined(DUK_USE_STRLEN16)

/* Default layout: byte and character lengths have dedicated fields. */
#define DUK_HSTRING_GET_BYTELEN(x)                  ((x)->blen)
#define DUK_HSTRING_SET_BYTELEN(x,v) do { \
		(x)->blen = (v); \
	} while (0)
#define DUK_HSTRING_SET_CHARLEN(x,v) do { \
		(x)->clen = (v); \
	} while (0)

#else  /* DUK_USE_STRLEN16 */

/* 16-bit lengths: byte length is kept in the heap header's h_strextra16. */
#define DUK_HSTRING_GET_BYTELEN(x)                  ((x)->hdr.h_strextra16)
#define DUK_HSTRING_SET_BYTELEN(x,v) do { \
		(x)->hdr.h_strextra16 = (v); \
	} while (0)

#if !defined(DUK_USE_HSTRING_CLEN)
/* No stored character length, so there is nothing to set. */
#define DUK_HSTRING_SET_CHARLEN(x,v) do { \
		DUK_ASSERT(0);  /* should never be called */ \
	} while (0)
#else
#define DUK_HSTRING_SET_CHARLEN(x,v) do { \
		(x)->clen16 = (v); \
	} while (0)
#endif

#endif  /* DUK_USE_STRLEN16 */
130 
/* String data accessors.
 *
 * DUK_HSTRING_GET_DATA() yields a pointer to the first data byte: for an
 * inline string the bytes directly following the duk_hstring header, or
 * (when external string support is enabled and the EXTDATA flag is set)
 * the 'extdata' pointer of the duk_hstring_external variant.
 */
#if defined(DUK_USE_HSTRING_EXTDATA)
#define DUK_HSTRING_GET_EXTDATA(x) \
	((x)->extdata)
#define DUK_HSTRING_GET_DATA(x) \
	(DUK_HSTRING_HAS_EXTDATA((x)) ? \
		DUK_HSTRING_GET_EXTDATA((const duk_hstring_external *) (x)) : ((const duk_uint8_t *) ((x) + 1)))
#else
#define DUK_HSTRING_GET_DATA(x) \
	((const duk_uint8_t *) ((x) + 1))
#endif

/* Pointer one past the last data byte.  The byte length is read through
 * DUK_HSTRING_GET_BYTELEN() rather than the 'blen' field directly, because
 * 'blen' does not exist when DUK_USE_STRLEN16 is enabled (the length is then
 * kept in the heap header).
 */
#define DUK_HSTRING_GET_DATA_END(x) \
	(DUK_HSTRING_GET_DATA((x)) + DUK_HSTRING_GET_BYTELEN((x)))
144 
/* Sentinel meaning "string is not an array index".  The value 2^32-1 is free
 * for this purpose because in E5 the highest valid array index is 2^32-2.
 */
#define DUK_HSTRING_NO_ARRAY_INDEX  (0xffffffffUL)

#if !defined(DUK_USE_HSTRING_ARRIDX)

/* Fast variant: yields the array index for the string, or
 * DUK_HSTRING_NO_ARRAY_INDEX.  The ARRIDX flag is checked first so that the
 * helper call is avoided when the string has no array index value.
 */
#define DUK_HSTRING_GET_ARRIDX_FAST(h)  \
	(!(DUK_HSTRING_HAS_ARRIDX((h))) ? DUK_HSTRING_NO_ARRAY_INDEX : duk_js_to_arrayindex_hstring_fast_known((h)))

/* Slower but more compact variant: always goes through the helper. */
#define DUK_HSTRING_GET_ARRIDX_SLOW(h)  \
	(duk_js_to_arrayindex_hstring_fast((h)))

#else  /* DUK_USE_HSTRING_ARRIDX */

/* Array index is stored in the string itself; both variants just read it. */
#define DUK_HSTRING_GET_ARRIDX_FAST(h)  ((h)->arridx)
#define DUK_HSTRING_GET_ARRIDX_SLOW(h)  ((h)->arridx)

#endif  /* DUK_USE_HSTRING_ARRIDX */
164 
/* Symbol type constants.
 *
 * XXX: these actually fit into duk_hstring
 */
#define DUK_SYMBOL_TYPE_HIDDEN     0
#define DUK_SYMBOL_TYPE_GLOBAL     1
#define DUK_SYMBOL_TYPE_LOCAL      2
#define DUK_SYMBOL_TYPE_WELLKNOWN  3
170 
/* Validity assertion for a duk_hstring.  Without DUK_USE_ASSERTIONS the
 * macro expands to an empty statement and its argument is not evaluated;
 * with assertions enabled it calls duk_hstring_assert_valid().
 */
#if !defined(DUK_USE_ASSERTIONS)
#define DUK_HSTRING_ASSERT_VALID(h)  do {} while (0)
#else
DUK_INTERNAL_DECL void duk_hstring_assert_valid(duk_hstring *h);
#define DUK_HSTRING_ASSERT_VALID(h)  do { duk_hstring_assert_valid((h)); } while (0)
#endif
178 
179 /*
180  *  Misc
181  */
182 
183 struct duk_hstring {
184 	/* Smaller heaphdr than for other objects, because strings are held
185 	 * in string intern table which requires no link pointers.  Much of
186 	 * the 32-bit flags field is unused by flags, so we can stuff a 16-bit
187 	 * field in there.
188 	 */
189 	duk_heaphdr_string hdr;
190 
191 	/* String hash. */
192 #if defined(DUK_USE_STRHASH16)
193 	/* If 16-bit hash is in use, stuff it into duk_heaphdr_string flags. */
194 #else
195 	duk_uint32_t hash;
196 #endif
197 
198 	/* Precomputed array index (or DUK_HSTRING_NO_ARRAY_INDEX). */
199 #if defined(DUK_USE_HSTRING_ARRIDX)
200 	duk_uarridx_t arridx;
201 #endif
202 
203 	/* Length in bytes (not counting NUL term). */
204 #if defined(DUK_USE_STRLEN16)
205 	/* placed in duk_heaphdr_string */
206 #else
207 	duk_uint32_t blen;
208 #endif
209 
210 	/* Length in codepoints (must be E5 compatible). */
211 #if defined(DUK_USE_STRLEN16)
212 #if defined(DUK_USE_HSTRING_CLEN)
213 	duk_uint16_t clen16;
214 #else
215 	/* computed live */
216 #endif
217 #else
218 	duk_uint32_t clen;
219 #endif
220 
221 	/*
222 	 *  String data of 'blen+1' bytes follows (+1 for NUL termination
223 	 *  convenience for C API).  No alignment needs to be guaranteed
224 	 *  for strings, but fields above should guarantee alignment-by-4
225 	 *  (but not alignment-by-8).
226 	 */
227 };
228 
229 /* The external string struct is defined even when the feature is inactive. */
230 struct duk_hstring_external {
231 	duk_hstring str;
232 
233 	/*
234 	 *  For an external string, the NUL-terminated string data is stored
235 	 *  externally.  The user must guarantee that data behind this pointer
236 	 *  doesn't change while it's used.
237 	 */
238 
239 	const duk_uint8_t *extdata;
240 };
241 
242 /*
243  *  Prototypes
244  */
245 
246 DUK_INTERNAL_DECL duk_ucodepoint_t duk_hstring_char_code_at_raw(duk_hthread *thr, duk_hstring *h, duk_uint_t pos, duk_bool_t surrogate_aware);
247 DUK_INTERNAL_DECL duk_bool_t duk_hstring_equals_ascii_cstring(duk_hstring *h, const char *cstr);
248 DUK_INTERNAL_DECL duk_size_t duk_hstring_get_charlen(duk_hstring *h);
249 #if !defined(DUK_USE_HSTRING_LAZY_CLEN)
250 DUK_INTERNAL_DECL void duk_hstring_init_charlen(duk_hstring *h);
251 #endif
252 
253 #endif  /* DUK_HSTRING_H_INCLUDED */
254