1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
2  *
3  * ***** BEGIN LICENSE BLOCK *****
4  * Version: MPL 1.1/GPL 2.0/LGPL 2.1
5  *
6  * The contents of this file are subject to the Mozilla Public License Version
7  * 1.1 (the "License"); you may not use this file except in compliance with
8  * the License. You may obtain a copy of the License at
9  * http://www.mozilla.org/MPL/
10  *
11  * Software distributed under the License is distributed on an "AS IS" basis,
12  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
13  * for the specific language governing rights and limitations under the
14  * License.
15  *
16  * The Original Code is Mozilla Communicator client code, released
17  * March 31, 1998.
18  *
19  * The Initial Developer of the Original Code is
20  * Netscape Communications Corporation.
21  * Portions created by the Initial Developer are Copyright (C) 1998
22  * the Initial Developer. All Rights Reserved.
23  *
24  * Contributor(s):
25  *
26  * Alternatively, the contents of this file may be used under the terms of
27  * either of the GNU General Public License Version 2 or later (the "GPL"),
28  * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
29  * in which case the provisions of the GPL or the LGPL are applicable instead
30  * of those above. If you wish to allow use of your version of this file only
31  * under the terms of either the GPL or the LGPL, and not to allow others to
32  * use your version of this file under the terms of the MPL, indicate your
33  * decision by deleting the provisions above and replace them with the notice
34  * and other provisions required by the GPL or the LGPL. If you do not delete
35  * the provisions above, a recipient may use your version of this file under
36  * the terms of any one of the MPL, the GPL or the LGPL.
37  *
38  * ***** END LICENSE BLOCK ***** */
39 
40 #ifndef jsstr_h___
41 #define jsstr_h___
42 /*
43  * JS string type implementation.
44  *
45  * A JS string is a counted array of unicode characters.  To support handoff
46  * of API client memory, the chars are allocated separately from the length,
47  * necessitating a pointer after the count, to form a separately allocated
48  * string descriptor.  String descriptors are GC'ed, while their chars are
49  * allocated from the malloc heap.
50  *
51  * When a string is treated as an object (by following it with . or []), the
52  * runtime wraps it with a JSObject whose valueOf method returns the unwrapped
53  * string descriptor.
54  */
55 #include <ctype.h>
56 #include "jspubtd.h"
57 #include "jsprvtd.h"
58 #include "jshash.h"
59 
60 JS_BEGIN_EXTERN_C
61 
62 /*
63  * The original GC-thing "string" type, a flat character string owned by its
64  * GC-thing descriptor.  The chars member points to a vector having byte size
65  * (length + 1) * sizeof(jschar), terminated at index length by a zero jschar.
66  * The terminator is purely a backstop, in case the chars pointer flows out to
67  * native code that requires \u0000 termination.
68  *
69  * NB: Always use the JSSTRING_LENGTH and JSSTRING_CHARS accessor macros,
70  * unless you guard str->member uses with !JSSTRING_IS_DEPENDENT(str).
71  */
72 struct JSString {
73     size_t          length;
74     jschar          *chars;
75 };
76 
77 /*
78  * Overlay structure for a string that depends on another string's characters.
79  * Distinguished by the JSSTRFLAG_DEPENDENT bit being set in length.  The base
80  * member may point to another dependent string if JSSTRING_CHARS has not been
81  * called yet.  The length chars in a dependent string are stored starting at
82  * base->chars + start, and are not necessarily zero-terminated.  If start is
83  * 0, it is not stored, length is a full size_t (minus the JSSTRFLAG_* bits in
84  * the high two positions), and the JSSTRFLAG_PREFIX flag is set.
85  */
86 struct JSDependentString {
87     size_t          length;
88     JSString        *base;
89 };
90 
/*
 * Layout of JSString.length: the low JSSTRING_LENGTH_BITS bits hold the
 * length and the JSSTRFLAG_BITS high-order bits hold the JSSTRFLAG_* flags.
 */
#define JSSTRFLAG_BITS              2
#define JSSTRING_LENGTH_BITS        (sizeof(size_t) * JS_BITS_PER_BYTE        \
                                     - JSSTRFLAG_BITS)

/* Single-bit and low-order bit-mask builders that compute in size_t. */
#define JSSTRING_BIT(n)             ((size_t)1 << (n))
#define JSSTRING_BITMASK(n)         (JSSTRING_BIT(n) - 1)
#define JSSTRING_LENGTH_MASK        JSSTRING_BITMASK(JSSTRING_LENGTH_BITS)

/* Definitions for flags stored in the high order bits of JSString.length. */
#define JSSTRFLAG_SHIFT(flg)        ((size_t)(flg) << JSSTRING_LENGTH_BITS)
#define JSSTRFLAG_MASK              JSSTRFLAG_SHIFT(JS_BITMASK(JSSTRFLAG_BITS))
#define JSSTRFLAG_DEPENDENT         JSSTRFLAG_SHIFT(1)
#define JSSTRFLAG_PREFIX            JSSTRFLAG_SHIFT(2)

/*
 * Universal JSString type inquiry and accessor macros.  JSSTRING_CHARS and
 * JSSTRING_LENGTH accept flat and dependent strings alike: they test the
 * JSSTRFLAG_DEPENDENT bit and defer to the JSSTRDEP_* forms when it is set.
 * Both evaluate str more than once.
 */
#define JSSTRING_HAS_FLAG(str,flg)  ((str)->length & (flg))
#define JSSTRING_IS_DEPENDENT(str)  JSSTRING_HAS_FLAG(str, JSSTRFLAG_DEPENDENT)
#define JSSTRING_IS_PREFIX(str)     JSSTRING_HAS_FLAG(str, JSSTRFLAG_PREFIX)
#define JSSTRING_CHARS(str)         (JSSTRING_IS_DEPENDENT(str)               \
                                     ? JSSTRDEP_CHARS(str)                    \
                                     : (str)->chars)
#define JSSTRING_LENGTH(str)        (JSSTRING_IS_DEPENDENT(str)               \
                                     ? JSSTRDEP_LENGTH(str)                   \
                                     : (str)->length)
113 
/*
 * Specific JSDependentString shift/mask accessor and mutator macros.  For a
 * dependent string that is not a prefix, the JSSTRING_LENGTH_BITS length bits
 * are split in two: the low JSSTRDEP_LENGTH_BITS bits hold the length and the
 * JSSTRDEP_START_BITS bits above them hold the start offset.
 */
#define JSSTRDEP_LENGTH_BITS        (JSSTRING_LENGTH_BITS / 2)
#define JSSTRDEP_LENGTH_MASK        JSSTRING_BITMASK(JSSTRDEP_LENGTH_BITS)
#define JSSTRDEP_START_BITS         (JSSTRING_LENGTH_BITS-JSSTRDEP_LENGTH_BITS)
#define JSSTRDEP_START_SHIFT        JSSTRDEP_LENGTH_BITS
#define JSSTRDEP_START_MASK         JSSTRING_BITMASK(JSSTRDEP_START_BITS)

/* View a string descriptor pointer as a JSDependentString pointer. */
#define JSSTRDEP(str)               ((JSDependentString *)(str))

/* Start offset stored in str's length word; always 0 for a prefix. */
#define JSSTRDEP_START(str)         (JSSTRING_IS_PREFIX(str) ? 0              \
                                     : ((JSSTRDEP(str)->length                \
                                         >> JSSTRDEP_START_SHIFT)             \
                                        & JSSTRDEP_START_MASK))

/*
 * Length of str with the flag bits masked away.  A prefix uses the whole
 * length field; any other dependent string uses only its low half.
 */
#define JSSTRDEP_LENGTH(str)        (JSSTRDEP(str)->length                    \
                                     & (JSSTRING_IS_PREFIX(str)               \
                                        ? JSSTRING_LENGTH_MASK                \
                                        : JSSTRDEP_LENGTH_MASK))
130 
/*
 * Mutators for the packed JSDependentString.length word.
 *
 * JSSTRDEP_SET_START_AND_LENGTH marks str dependent and stores off in the
 * start field and len in the length field.  off is converted to size_t before
 * it is shifted, so a caller passing a narrower integer type (e.g. an int
 * where size_t is 64 bits wide) cannot overflow the shift.  Neither off nor
 * len is masked: the caller must make sure each fits in its field.
 *
 * JSPREFIX_SET_LENGTH marks str as a dependent prefix: no start is stored and
 * len may use the whole length field.
 */
#define JSSTRDEP_SET_START_AND_LENGTH(str,off,len)                            \
    (JSSTRDEP(str)->length = JSSTRFLAG_DEPENDENT                              \
                           | ((size_t)(off) << JSSTRDEP_START_SHIFT)          \
                           | (size_t)(len))
#define JSPREFIX_SET_LENGTH(str,len)                                          \
    (JSSTRDEP(str)->length = JSSTRFLAG_DEPENDENT | JSSTRFLAG_PREFIX           \
                           | (size_t)(len))
137 
/*
 * Get or set the base pointer of a dependent string.  The JSPREFIX_* names
 * are the same operations, spelled for use on dependent prefix strings.
 */
#define JSSTRDEP_BASE(str)          (JSSTRDEP(str)->base)
#define JSSTRDEP_SET_BASE(str,bstr) (JSSTRDEP(str)->base = (bstr))
#define JSPREFIX_BASE(str)          (JSSTRDEP(str)->base)
#define JSPREFIX_SET_BASE(str,bstr) (JSSTRDEP(str)->base = (bstr))

/*
 * Characters of a dependent string.  When the base is flat they are found
 * directly at base->chars + start; when the base is itself dependent the
 * lookup is delegated to js_GetDependentStringChars.  Evaluates str more
 * than once.
 */
#define JSSTRDEP_CHARS(str)                                                   \
    (JSSTRING_IS_DEPENDENT(JSSTRDEP_BASE(str))                                \
     ? js_GetDependentStringChars(str)                                        \
     : JSSTRDEP_BASE(str)->chars + JSSTRDEP_START(str))
147 
148 extern size_t
149 js_MinimizeDependentStrings(JSString *str, int level, JSString **basep);
150 
151 extern jschar *
152 js_GetDependentStringChars(JSString *str);
153 
154 extern jschar *
155 js_GetStringChars(JSString *str);
156 
157 extern JSString *
158 js_ConcatStrings(JSContext *cx, JSString *left, JSString *right);
159 
160 extern const jschar *
161 js_UndependString(JSContext *cx, JSString *str);
162 
163 struct JSSubString {
164     size_t          length;
165     const jschar    *chars;
166 };
167 
168 extern jschar      js_empty_ucstr[];
169 extern JSSubString js_EmptySubString;
170 
171 /* Unicode character attribute lookup tables. */
172 extern const uint8 js_X[];
173 extern const uint8 js_Y[];
174 extern const uint32 js_A[];
175 
/*
 * Enumerated Unicode general category types.  The numeric values matter:
 * the classification macros in this header use them as bit positions and
 * compare them against JS_CTYPE(c).  No category has the value 17
 * (NOTE(review): presumably to mirror java.lang.Character -- confirm).
 */
typedef enum JSCharType {
    JSCT_UNASSIGNED             = 0,    /* Cn */

    /* Letters. */
    JSCT_UPPERCASE_LETTER       = 1,    /* Lu */
    JSCT_LOWERCASE_LETTER       = 2,    /* Ll */
    JSCT_TITLECASE_LETTER       = 3,    /* Lt */
    JSCT_MODIFIER_LETTER        = 4,    /* Lm */
    JSCT_OTHER_LETTER           = 5,    /* Lo */

    /* Marks. */
    JSCT_NON_SPACING_MARK       = 6,    /* Mn */
    JSCT_ENCLOSING_MARK         = 7,    /* Me */
    JSCT_COMBINING_SPACING_MARK = 8,    /* Mc */

    /* Numbers. */
    JSCT_DECIMAL_DIGIT_NUMBER   = 9,    /* Nd */
    JSCT_LETTER_NUMBER          = 10,   /* Nl */
    JSCT_OTHER_NUMBER           = 11,   /* No */

    /* Separators. */
    JSCT_SPACE_SEPARATOR        = 12,   /* Zs */
    JSCT_LINE_SEPARATOR         = 13,   /* Zl */
    JSCT_PARAGRAPH_SEPARATOR    = 14,   /* Zp */

    /* Other. */
    JSCT_CONTROL                = 15,   /* Cc */
    JSCT_FORMAT                 = 16,   /* Cf */
    JSCT_PRIVATE_USE            = 18,   /* Co */
    JSCT_SURROGATE              = 19,   /* Cs */

    /* Punctuation. */
    JSCT_DASH_PUNCTUATION       = 20,   /* Pd */
    JSCT_START_PUNCTUATION      = 21,   /* Ps */
    JSCT_END_PUNCTUATION        = 22,   /* Pe */
    JSCT_CONNECTOR_PUNCTUATION  = 23,   /* Pc */
    JSCT_OTHER_PUNCTUATION      = 24,   /* Po */

    /* Symbols. */
    JSCT_MATH_SYMBOL            = 25,   /* Sm */
    JSCT_CURRENCY_SYMBOL        = 26,   /* Sc */
    JSCT_MODIFIER_SYMBOL        = 27,   /* Sk */
    JSCT_OTHER_SYMBOL           = 28    /* So */
} JSCharType;
207 
/*
 * Character classifying and mapping macros, based on java.lang.Character.
 *
 * JS_CCODE(c) fetches the attribute word for c through the js_X, js_Y and
 * js_A tables; JS_CTYPE(c) extracts its low five bits, which the macros below
 * compare against JSCharType values.  c is evaluated twice by JS_CCODE.
 */
#define JS_CCODE(c)                                                           \
    (js_A[js_Y[(js_X[(uint16)(c) >> 6] << 6) | ((c) & 0x3F)]])
#define JS_CTYPE(c)     (JS_CCODE(c) & 0x1F)
211 
/*
 * Category tests.  Each one builds a bit set with one bit per accepted
 * JSCharType value, shifts the bit selected by JS_CTYPE(c) down to bit 0 and
 * masks it, so the result is exactly 1 or 0.
 */

/* A letter: upper, lower, title case, modifier or other letter. */
#define JS_ISALPHA(c)   (((  (1 << JSCT_UPPERCASE_LETTER)                     \
                           | (1 << JSCT_LOWERCASE_LETTER)                     \
                           | (1 << JSCT_TITLECASE_LETTER)                     \
                           | (1 << JSCT_MODIFIER_LETTER)                      \
                           | (1 << JSCT_OTHER_LETTER))                        \
                          >> JS_CTYPE(c)) & 1)

/* A letter as for JS_ISALPHA, or a decimal digit. */
#define JS_ISALNUM(c)   (((  (1 << JSCT_UPPERCASE_LETTER)                     \
                           | (1 << JSCT_LOWERCASE_LETTER)                     \
                           | (1 << JSCT_TITLECASE_LETTER)                     \
                           | (1 << JSCT_MODIFIER_LETTER)                      \
                           | (1 << JSCT_OTHER_LETTER)                         \
                           | (1 << JSCT_DECIMAL_DIGIT_NUMBER))                \
                          >> JS_CTYPE(c)) & 1)

/* A unicode letter, suitable for use in an identifier. */
#define JS_ISLETTER(c)  (((  (1 << JSCT_UPPERCASE_LETTER)                     \
                           | (1 << JSCT_LOWERCASE_LETTER)                     \
                           | (1 << JSCT_TITLECASE_LETTER)                     \
                           | (1 << JSCT_MODIFIER_LETTER)                      \
                           | (1 << JSCT_OTHER_LETTER)                         \
                           | (1 << JSCT_LETTER_NUMBER))                       \
                          >> JS_CTYPE(c)) & 1)

/*
 * 'IdentifierPart' from ECMA grammar, is Unicode letter or combining mark or
 * digit or connector punctuation.
 */
#define JS_ISIDPART(c)  (((  (1 << JSCT_UPPERCASE_LETTER)                     \
                           | (1 << JSCT_LOWERCASE_LETTER)                     \
                           | (1 << JSCT_TITLECASE_LETTER)                     \
                           | (1 << JSCT_MODIFIER_LETTER)                      \
                           | (1 << JSCT_OTHER_LETTER)                         \
                           | (1 << JSCT_LETTER_NUMBER)                        \
                           | (1 << JSCT_NON_SPACING_MARK)                     \
                           | (1 << JSCT_COMBINING_SPACING_MARK)               \
                           | (1 << JSCT_DECIMAL_DIGIT_NUMBER)                 \
                           | (1 << JSCT_CONNECTOR_PUNCTUATION))               \
                          >> JS_CTYPE(c)) & 1)

/* Unicode control-format characters, ignored in input */
#define JS_ISFORMAT(c)  (((1 << JSCT_FORMAT) >> JS_CTYPE(c)) & 1)
254 
/*
 * Per ECMA-262 15.10.2.6, these characters are the only ones that make up a
 * "word", as far as a RegExp is concerned.  If we want a Unicode-friendlier
 * definition of "word", we should rename this macro to something regexp-y.
 *
 * The range check is done on the unsigned value of c so that a negative
 * argument is rejected instead of being handed to isalnum(), whose behavior
 * is undefined for values that are neither EOF nor representable as an
 * unsigned char.  c is evaluated more than once.
 */
#define JS_ISWORD(c)    ((unsigned)(c) < 128 && (isalnum(c) || (c) == '_'))
261 
/*
 * Identifier character tests: JS_ISIDSTART accepts whatever JS_ISLETTER
 * accepts, plus '_' and '$'; JS_ISIDENT accepts whatever JS_ISIDPART accepts,
 * plus the same two characters.  c is evaluated more than once.
 */
#define JS_ISIDSTART(c)                                                       \
    (JS_ISLETTER(c) || (c) == '_' || (c) == '$')
#define JS_ISIDENT(c)                                                         \
    (JS_ISIDPART(c) || (c) == '_' || (c) == '$')
264 
/*
 * XML character tests.  JS_ISXMLSPACE accepts space, tab, carriage return and
 * newline.  JS_ISXMLNSSTART and JS_ISXMLNS test the 0x00000100 and 0x00000080
 * bits of the JS_CCODE attribute word respectively, and additionally accept
 * '_' (NSSTART) or '.', '-' and '_' (NS).  The *NAME* forms accept the same
 * characters as the corresponding *NS* forms, plus ':'.
 */
#define JS_ISXMLSPACE(c)                                                      \
    ((c) == ' ' || (c) == '\t' || (c) == '\r' || (c) == '\n')
#define JS_ISXMLNSSTART(c)                                                    \
    ((JS_CCODE(c) & 0x00000100) || (c) == '_')
#define JS_ISXMLNS(c)                                                         \
    ((JS_CCODE(c) & 0x00000080) || (c) == '.' || (c) == '-' || (c) == '_')
#define JS_ISXMLNAMESTART(c)    (JS_ISXMLNSSTART(c) || (c) == ':')
#define JS_ISXMLNAME(c)         (JS_ISXMLNS(c) || (c) == ':')
272 
/* True if the tables classify c as a decimal digit. */
#define JS_ISDIGIT(c)   (JS_CTYPE(c) == JSCT_DECIMAL_DIGIT_NUMBER)

/* XXXbe unify on A/X/Y tbls, avoid ctype.h? */
/* XXXbe fs, etc. ? */
/* True if bits 0x00070000 of c's attribute word equal 0x00040000. */
#define JS_ISSPACE(c)   ((JS_CCODE(c) & 0x00070000) == 0x00040000)

/*
 * True if c is below 128 and isprint() accepts it.  The range check is done
 * on the unsigned value of c so that a negative argument is rejected rather
 * than passed to isprint(), whose behavior is undefined for values that are
 * neither EOF nor representable as an unsigned char.  c is evaluated twice.
 */
#define JS_ISPRINT(c)   ((unsigned)(c) < 128 && isprint(c))

/* True if the tables classify c as an upper (resp. lower) case letter. */
#define JS_ISUPPER(c)   (JS_CTYPE(c) == JSCT_UPPERCASE_LETTER)
#define JS_ISLOWER(c)   (JS_CTYPE(c) == JSCT_LOWERCASE_LETTER)
282 
/*
 * Case mapping.  If c's attribute word has the 0x00100000 bit set, JS_TOUPPER
 * subtracts from c the signed value held in the word's bits 22 and up; if it
 * has the 0x00200000 bit set, JS_TOLOWER adds that value.  Otherwise c is
 * returned unchanged.  The result is a jschar; c is evaluated more than once.
 */
#define JS_TOUPPER(c)                                                         \
    ((jschar) ((JS_CCODE(c) & 0x00100000)                                     \
               ? (c) - ((int32)JS_CCODE(c) >> 22)                             \
               : (c)))
#define JS_TOLOWER(c)                                                         \
    ((jschar) ((JS_CCODE(c) & 0x00200000)                                     \
               ? (c) + ((int32)JS_CCODE(c) >> 22)                             \
               : (c)))
289 
/*
 * Shorthands for ASCII (7-bit) decimal and hex conversion.
 * Manually inline isdigit for performance; MSVC doesn't do this for us.
 *
 * JS7_ISHEX and JS7_ISLET range-check the unsigned value of c, so a negative
 * argument is rejected instead of reaching isxdigit()/isalpha(), whose
 * behavior is undefined for values that are neither EOF nor representable as
 * an unsigned char.  JS7_UNDEC and JS7_UNHEX do no checking of their own:
 * callers are expected to test with JS7_ISDEC or JS7_ISHEX first.  Apart
 * from JS7_ISDEC and JS7_UNDEC, these macros evaluate c more than once.
 */
#define JS7_ISDEC(c)    ((((unsigned)(c)) - '0') <= 9)
#define JS7_UNDEC(c)    ((c) - '0')
#define JS7_ISHEX(c)    ((unsigned)(c) < 128 && isxdigit(c))
#define JS7_UNHEX(c)                                                          \
    ((uintN)(JS7_ISDEC(c) ? (c) - '0' : 10 + tolower(c) - 'a'))
#define JS7_ISLET(c)    ((unsigned)(c) < 128 && isalpha(c))
299 
300 /* Initialize per-runtime string state for the first context in the runtime. */
301 extern JSBool
302 js_InitRuntimeStringState(JSContext *cx);
303 
304 extern void
305 js_FinishRuntimeStringState(JSContext *cx);
306 
307 extern void
308 js_FinishDeflatedStringCache(JSRuntime *rt);
309 
310 /* Initialize the String class, returning its prototype object. */
311 extern JSClass js_StringClass;
312 
313 extern JSObject *
314 js_InitStringClass(JSContext *cx, JSObject *obj);
315 
/*
 * Constant C strings defined outside this header.
 * NOTE(review): presumably each holds the function name spelled in its
 * identifier ("escape", "unescape", ...) -- confirm against the definitions.
 */
extern const char js_escape_str[];
extern const char js_unescape_str[];
extern const char js_uneval_str[];
extern const char js_decodeURI_str[];
extern const char js_encodeURI_str[];
extern const char js_decodeURIComponent_str[];
extern const char js_encodeURIComponent_str[];
323 
324 /* GC-allocate a string descriptor for the given malloc-allocated chars. */
325 extern JSString *
326 js_NewString(JSContext *cx, jschar *chars, size_t length, uintN gcflag);
327 
328 extern JSString *
329 js_NewDependentString(JSContext *cx, JSString *base, size_t start,
330                       size_t length, uintN gcflag);
331 
332 /* Copy a counted string and GC-allocate a descriptor for it. */
333 extern JSString *
334 js_NewStringCopyN(JSContext *cx, const jschar *s, size_t n, uintN gcflag);
335 
336 /* Copy a C string and GC-allocate a descriptor for it. */
337 extern JSString *
338 js_NewStringCopyZ(JSContext *cx, const jschar *s, uintN gcflag);
339 
340 /* Free the chars held by str when it is finalized by the GC. */
341 extern void
342 js_FinalizeString(JSContext *cx, JSString *str);
343 
344 extern void
345 js_FinalizeStringRT(JSRuntime *rt, JSString *str);
346 
347 /* Wrap a string value in a String object. */
348 extern JSObject *
349 js_StringToObject(JSContext *cx, JSString *str);
350 
351 /*
352  * Convert a value to a printable C string.
353  */
354 typedef JSString *(*JSValueToStringFun)(JSContext *cx, jsval v);
355 
356 extern JS_FRIEND_API(const char *)
357 js_ValueToPrintable(JSContext *cx, jsval v, JSValueToStringFun v2sfun);
358 
359 #define js_ValueToPrintableString(cx,v) \
360     js_ValueToPrintable(cx, v, js_ValueToString)
361 
362 #define js_ValueToPrintableSource(cx,v) \
363     js_ValueToPrintable(cx, v, js_ValueToSource)
364 
365 /*
366  * Convert a value to a string, returning null after reporting an error,
367  * otherwise returning a new string reference.
368  */
369 extern JS_FRIEND_API(JSString *)
370 js_ValueToString(JSContext *cx, jsval v);
371 
372 /*
373  * Convert a value to its source expression, returning null after reporting
374  * an error, otherwise returning a new string reference.
375  */
376 extern JS_FRIEND_API(JSString *)
377 js_ValueToSource(JSContext *cx, jsval v);
378 
379 #ifdef HT_ENUMERATE_NEXT        /* XXX don't require jshash.h */
380 /*
381  * Compute a hash function from str.
382  */
383 extern JSHashNumber
384 js_HashString(JSString *str);
385 #endif
386 
387 /*
388  * Return less than, equal to, or greater than zero depending on whether
389  * str1 is less than, equal to, or greater than str2.
390  */
391 extern intN
392 js_CompareStrings(JSString *str1, JSString *str2);
393 
394 /*
395  * Test if strings are equal.
396  */
397 extern JSBool
398 js_EqualStrings(JSString *str1, JSString *str2);
399 
400 /*
401  * Boyer-Moore-Horspool superlinear search for pat:patlen in text:textlen.
402  * The patlen argument must be positive and no greater than BMH_PATLEN_MAX.
403  * The start argument tells where in text to begin the search.
404  *
405  * Return the index of pat in text, or -1 if not found.
406  */
407 #define BMH_CHARSET_SIZE 256    /* ISO-Latin-1 */
408 #define BMH_PATLEN_MAX   255    /* skip table element is uint8 */
409 
410 #define BMH_BAD_PATTERN  (-2)   /* return value if pat is not ISO-Latin-1 */
411 
412 extern jsint
413 js_BoyerMooreHorspool(const jschar *text, jsint textlen,
414                       const jschar *pat, jsint patlen,
415                       jsint start);
416 
417 extern size_t
418 js_strlen(const jschar *s);
419 
420 extern jschar *
421 js_strchr(const jschar *s, jschar c);
422 
423 extern jschar *
424 js_strchr_limit(const jschar *s, jschar c, const jschar *limit);
425 
/*
 * Copy exactly n jschars from s to t with memcpy.  Unlike strncpy, this does
 * not stop at a zero jschar and does not pad or terminate t.
 */
#define js_strncpy(t, s, n)     memcpy((t), (s), sizeof(jschar) * (n))
427 
428 /*
429  * Return s advanced past any Unicode white space characters.
430  */
431 extern const jschar *
432 js_SkipWhiteSpace(const jschar *s);
433 
434 /*
435  * Inflate bytes to JS chars and vice versa.  Report out of memory via cx
436  * and return null on error, otherwise return the jschar or byte vector that
437  * was JS_malloc'ed. length is updated with the length of the new string in jschars.
438  */
439 extern jschar *
440 js_InflateString(JSContext *cx, const char *bytes, size_t *length);
441 
442 extern char *
443 js_DeflateString(JSContext *cx, const jschar *chars, size_t length);
444 
445 /*
446  * Inflate bytes to JS chars into a buffer.
447  * 'chars' must be large enough for 'length' jschars.
448  * The buffer is NOT null-terminated.
449  * cx may be NULL, which means no errors are thrown.
450  * The destination length needs to be initialized with the buffer size, takes
451  * the number of chars moved.
452  */
453 extern JSBool
454 js_InflateStringToBuffer(JSContext* cx, const char *bytes, size_t length,
455                          jschar *chars, size_t* charsLength);
456 
457 /*
458  * Deflate JS chars to bytes into a buffer.
459  * 'bytes' must be large enough for 'length chars.
460  * The buffer is NOT null-terminated.
461  * cx may be NULL, which means no errors are thrown.
462  * The destination length needs to be initialized with the buffer size, takes
463  * the number of bytes moved.
464  */
465 extern JSBool
466 js_DeflateStringToBuffer(JSContext* cx, const jschar *chars,
467                          size_t charsLength, char *bytes, size_t* length);
468 
469 /*
470  * Associate bytes with str in the deflated string cache, returning true on
471  * successful association, false on out of memory.
472  */
473 extern JSBool
474 js_SetStringBytes(JSRuntime *rt, JSString *str, char *bytes, size_t length);
475 
476 /*
477  * Find or create a deflated string cache entry for str that contains its
478  * characters chopped from Unicode code points into bytes.
479  */
480 extern char *
481 js_GetStringBytes(JSRuntime *rt, JSString *str);
482 
483 /* Remove a deflated string cache entry associated with str if any. */
484 extern void
485 js_PurgeDeflatedStringCache(JSRuntime *rt, JSString *str);
486 
487 JSBool
488 js_str_escape(JSContext *cx, JSObject *obj, uintN argc, jsval *argv,
489               jsval *rval);
490 
491 /*
492  * Convert one UCS-4 char and write it into a UTF-8 buffer, which must be at
493  * least 6 bytes long.  Return the number of UTF-8 bytes of data written.
494  */
495 extern int
496 js_OneUcs4ToUtf8Char(uint8 *utf8Buffer, uint32 ucs4Char);
497 
498 JS_END_EXTERN_C
499 
500 #endif /* jsstr_h___ */
501