1 /* -*- Mode: C; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*- 2 * 3 * ***** BEGIN LICENSE BLOCK ***** 4 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 5 * 6 * The contents of this file are subject to the Mozilla Public License Version 7 * 1.1 (the "License"); you may not use this file except in compliance with 8 * the License. You may obtain a copy of the License at 9 * http://www.mozilla.org/MPL/ 10 * 11 * Software distributed under the License is distributed on an "AS IS" basis, 12 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License 13 * for the specific language governing rights and limitations under the 14 * License. 15 * 16 * The Original Code is Mozilla Communicator client code, released 17 * March 31, 1998. 18 * 19 * The Initial Developer of the Original Code is 20 * Netscape Communications Corporation. 21 * Portions created by the Initial Developer are Copyright (C) 1998 22 * the Initial Developer. All Rights Reserved. 23 * 24 * Contributor(s): 25 * 26 * Alternatively, the contents of this file may be used under the terms of 27 * either of the GNU General Public License Version 2 or later (the "GPL"), 28 * or the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), 29 * in which case the provisions of the GPL or the LGPL are applicable instead 30 * of those above. If you wish to allow use of your version of this file only 31 * under the terms of either the GPL or the LGPL, and not to allow others to 32 * use your version of this file under the terms of the MPL, indicate your 33 * decision by deleting the provisions above and replace them with the notice 34 * and other provisions required by the GPL or the LGPL. If you do not delete 35 * the provisions above, a recipient may use your version of this file under 36 * the terms of any one of the MPL, the GPL or the LGPL. 37 * 38 * ***** END LICENSE BLOCK ***** */ 39 40 #ifndef jsstr_h___ 41 #define jsstr_h___ 42 /* 43 * JS string type implementation. 44 * 45 * A JS string is a counted array of unicode characters. To support handoff 46 * of API client memory, the chars are allocated separately from the length, 47 * necessitating a pointer after the count, to form a separately allocated 48 * string descriptor. String descriptors are GC'ed, while their chars are 49 * allocated from the malloc heap. 50 * 51 * When a string is treated as an object (by following it with . or []), the 52 * runtime wraps it with a JSObject whose valueOf method returns the unwrapped 53 * string descriptor. 54 */ 55 #include <ctype.h> 56 #include "jspubtd.h" 57 #include "jsprvtd.h" 58 #include "jshash.h" 59 60 JS_BEGIN_EXTERN_C 61 62 /* 63 * The original GC-thing "string" type, a flat character string owned by its 64 * GC-thing descriptor. The chars member points to a vector having byte size 65 * (length + 1) * sizeof(jschar), terminated at index length by a zero jschar. 66 * The terminator is purely a backstop, in case the chars pointer flows out to 67 * native code that requires \u0000 termination. 68 * 69 * NB: Always use the JSSTRING_LENGTH and JSSTRING_CHARS accessor macros, 70 * unless you guard str->member uses with !JSSTRING_IS_DEPENDENT(str). 71 */ 72 struct JSString { 73 size_t length; 74 jschar *chars; 75 }; 76 77 /* 78 * Overlay structure for a string that depends on another string's characters. 79 * Distinguished by the JSSTRFLAG_DEPENDENT bit being set in length. The base 80 * member may point to another dependent string if JSSTRING_CHARS has not been 81 * called yet. The length chars in a dependent string are stored starting at 82 * base->chars + start, and are not necessarily zero-terminated. If start is 83 * 0, it is not stored, length is a full size_t (minus the JSSTRFLAG_* bits in 84 * the high two positions), and the JSSTRFLAG_PREFIX flag is set. 85 */ 86 struct JSDependentString { 87 size_t length; 88 JSString *base; 89 }; 90 91 /* Definitions for flags stored in the high order bits of JSString.length. */ 92 #define JSSTRFLAG_BITS 2 93 #define JSSTRFLAG_SHIFT(flg) ((size_t)(flg) << JSSTRING_LENGTH_BITS) 94 #define JSSTRFLAG_MASK JSSTRFLAG_SHIFT(JS_BITMASK(JSSTRFLAG_BITS)) 95 #define JSSTRFLAG_DEPENDENT JSSTRFLAG_SHIFT(1) 96 #define JSSTRFLAG_PREFIX JSSTRFLAG_SHIFT(2) 97 98 /* Universal JSString type inquiry and accessor macros. */ 99 #define JSSTRING_BIT(n) ((size_t)1 << (n)) 100 #define JSSTRING_BITMASK(n) (JSSTRING_BIT(n) - 1) 101 #define JSSTRING_HAS_FLAG(str,flg) ((str)->length & (flg)) 102 #define JSSTRING_IS_DEPENDENT(str) JSSTRING_HAS_FLAG(str, JSSTRFLAG_DEPENDENT) 103 #define JSSTRING_IS_PREFIX(str) JSSTRING_HAS_FLAG(str, JSSTRFLAG_PREFIX) 104 #define JSSTRING_CHARS(str) (JSSTRING_IS_DEPENDENT(str) \ 105 ? JSSTRDEP_CHARS(str) \ 106 : (str)->chars) 107 #define JSSTRING_LENGTH(str) (JSSTRING_IS_DEPENDENT(str) \ 108 ? JSSTRDEP_LENGTH(str) \ 109 : (str)->length) 110 #define JSSTRING_LENGTH_BITS (sizeof(size_t) * JS_BITS_PER_BYTE \ 111 - JSSTRFLAG_BITS) 112 #define JSSTRING_LENGTH_MASK JSSTRING_BITMASK(JSSTRING_LENGTH_BITS) 113 114 /* Specific JSDependentString shift/mask accessor and mutator macros. */ 115 #define JSSTRDEP_START_BITS (JSSTRING_LENGTH_BITS-JSSTRDEP_LENGTH_BITS) 116 #define JSSTRDEP_START_SHIFT JSSTRDEP_LENGTH_BITS 117 #define JSSTRDEP_START_MASK JSSTRING_BITMASK(JSSTRDEP_START_BITS) 118 #define JSSTRDEP_LENGTH_BITS (JSSTRING_LENGTH_BITS / 2) 119 #define JSSTRDEP_LENGTH_MASK JSSTRING_BITMASK(JSSTRDEP_LENGTH_BITS) 120 121 #define JSSTRDEP(str) ((JSDependentString *)(str)) 122 #define JSSTRDEP_START(str) (JSSTRING_IS_PREFIX(str) ? 0 \ 123 : ((JSSTRDEP(str)->length \ 124 >> JSSTRDEP_START_SHIFT) \ 125 & JSSTRDEP_START_MASK)) 126 #define JSSTRDEP_LENGTH(str) (JSSTRDEP(str)->length \ 127 & (JSSTRING_IS_PREFIX(str) \ 128 ? JSSTRING_LENGTH_MASK \ 129 : JSSTRDEP_LENGTH_MASK)) 130 131 #define JSSTRDEP_SET_START_AND_LENGTH(str,off,len) \ 132 (JSSTRDEP(str)->length = JSSTRFLAG_DEPENDENT \ 133 | ((off) << JSSTRDEP_START_SHIFT) \ 134 | (len)) 135 #define JSPREFIX_SET_LENGTH(str,len) \ 136 (JSSTRDEP(str)->length = JSSTRFLAG_DEPENDENT | JSSTRFLAG_PREFIX | (len)) 137 138 #define JSSTRDEP_BASE(str) (JSSTRDEP(str)->base) 139 #define JSSTRDEP_SET_BASE(str,bstr) (JSSTRDEP(str)->base = (bstr)) 140 #define JSPREFIX_BASE(str) JSSTRDEP_BASE(str) 141 #define JSPREFIX_SET_BASE(str,bstr) JSSTRDEP_SET_BASE(str,bstr) 142 143 #define JSSTRDEP_CHARS(str) \ 144 (JSSTRING_IS_DEPENDENT(JSSTRDEP_BASE(str)) \ 145 ? js_GetDependentStringChars(str) \ 146 : JSSTRDEP_BASE(str)->chars + JSSTRDEP_START(str)) 147 148 extern size_t 149 js_MinimizeDependentStrings(JSString *str, int level, JSString **basep); 150 151 extern jschar * 152 js_GetDependentStringChars(JSString *str); 153 154 extern jschar * 155 js_GetStringChars(JSString *str); 156 157 extern JSString * 158 js_ConcatStrings(JSContext *cx, JSString *left, JSString *right); 159 160 extern const jschar * 161 js_UndependString(JSContext *cx, JSString *str); 162 163 struct JSSubString { 164 size_t length; 165 const jschar *chars; 166 }; 167 168 extern jschar js_empty_ucstr[]; 169 extern JSSubString js_EmptySubString; 170 171 /* Unicode character attribute lookup tables. */ 172 extern const uint8 js_X[]; 173 extern const uint8 js_Y[]; 174 extern const uint32 js_A[]; 175 176 /* Enumerated Unicode general category types. */ 177 typedef enum JSCharType { 178 JSCT_UNASSIGNED = 0, 179 JSCT_UPPERCASE_LETTER = 1, 180 JSCT_LOWERCASE_LETTER = 2, 181 JSCT_TITLECASE_LETTER = 3, 182 JSCT_MODIFIER_LETTER = 4, 183 JSCT_OTHER_LETTER = 5, 184 JSCT_NON_SPACING_MARK = 6, 185 JSCT_ENCLOSING_MARK = 7, 186 JSCT_COMBINING_SPACING_MARK = 8, 187 JSCT_DECIMAL_DIGIT_NUMBER = 9, 188 JSCT_LETTER_NUMBER = 10, 189 JSCT_OTHER_NUMBER = 11, 190 JSCT_SPACE_SEPARATOR = 12, 191 JSCT_LINE_SEPARATOR = 13, 192 JSCT_PARAGRAPH_SEPARATOR = 14, 193 JSCT_CONTROL = 15, 194 JSCT_FORMAT = 16, 195 JSCT_PRIVATE_USE = 18, 196 JSCT_SURROGATE = 19, 197 JSCT_DASH_PUNCTUATION = 20, 198 JSCT_START_PUNCTUATION = 21, 199 JSCT_END_PUNCTUATION = 22, 200 JSCT_CONNECTOR_PUNCTUATION = 23, 201 JSCT_OTHER_PUNCTUATION = 24, 202 JSCT_MATH_SYMBOL = 25, 203 JSCT_CURRENCY_SYMBOL = 26, 204 JSCT_MODIFIER_SYMBOL = 27, 205 JSCT_OTHER_SYMBOL = 28 206 } JSCharType; 207 208 /* Character classifying and mapping macros, based on java.lang.Character. */ 209 #define JS_CCODE(c) (js_A[js_Y[(js_X[(uint16)(c)>>6]<<6)|((c)&0x3F)]]) 210 #define JS_CTYPE(c) (JS_CCODE(c) & 0x1F) 211 212 #define JS_ISALPHA(c) ((((1 << JSCT_UPPERCASE_LETTER) | \ 213 (1 << JSCT_LOWERCASE_LETTER) | \ 214 (1 << JSCT_TITLECASE_LETTER) | \ 215 (1 << JSCT_MODIFIER_LETTER) | \ 216 (1 << JSCT_OTHER_LETTER)) \ 217 >> JS_CTYPE(c)) & 1) 218 219 #define JS_ISALNUM(c) ((((1 << JSCT_UPPERCASE_LETTER) | \ 220 (1 << JSCT_LOWERCASE_LETTER) | \ 221 (1 << JSCT_TITLECASE_LETTER) | \ 222 (1 << JSCT_MODIFIER_LETTER) | \ 223 (1 << JSCT_OTHER_LETTER) | \ 224 (1 << JSCT_DECIMAL_DIGIT_NUMBER)) \ 225 >> JS_CTYPE(c)) & 1) 226 227 /* A unicode letter, suitable for use in an identifier. */ 228 #define JS_ISLETTER(c) ((((1 << JSCT_UPPERCASE_LETTER) | \ 229 (1 << JSCT_LOWERCASE_LETTER) | \ 230 (1 << JSCT_TITLECASE_LETTER) | \ 231 (1 << JSCT_MODIFIER_LETTER) | \ 232 (1 << JSCT_OTHER_LETTER) | \ 233 (1 << JSCT_LETTER_NUMBER)) \ 234 >> JS_CTYPE(c)) & 1) 235 236 /* 237 * 'IdentifierPart' from ECMA grammar, is Unicode letter or combining mark or 238 * digit or connector punctuation. 239 */ 240 #define JS_ISIDPART(c) ((((1 << JSCT_UPPERCASE_LETTER) | \ 241 (1 << JSCT_LOWERCASE_LETTER) | \ 242 (1 << JSCT_TITLECASE_LETTER) | \ 243 (1 << JSCT_MODIFIER_LETTER) | \ 244 (1 << JSCT_OTHER_LETTER) | \ 245 (1 << JSCT_LETTER_NUMBER) | \ 246 (1 << JSCT_NON_SPACING_MARK) | \ 247 (1 << JSCT_COMBINING_SPACING_MARK) | \ 248 (1 << JSCT_DECIMAL_DIGIT_NUMBER) | \ 249 (1 << JSCT_CONNECTOR_PUNCTUATION)) \ 250 >> JS_CTYPE(c)) & 1) 251 252 /* Unicode control-format characters, ignored in input */ 253 #define JS_ISFORMAT(c) (((1 << JSCT_FORMAT) >> JS_CTYPE(c)) & 1) 254 255 /* 256 * Per ECMA-262 15.10.2.6, these characters are the only ones that make up a 257 * "word", as far as a RegExp is concerned. If we want a Unicode-friendlier 258 * definition of "word", we should rename this macro to something regexp-y. 259 */ 260 #define JS_ISWORD(c) ((c) < 128 && (isalnum(c) || (c) == '_')) 261 262 #define JS_ISIDSTART(c) (JS_ISLETTER(c) || (c) == '_' || (c) == '$') 263 #define JS_ISIDENT(c) (JS_ISIDPART(c) || (c) == '_' || (c) == '$') 264 265 #define JS_ISXMLSPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\r' || \ 266 (c) == '\n') 267 #define JS_ISXMLNSSTART(c) ((JS_CCODE(c) & 0x00000100) || (c) == '_') 268 #define JS_ISXMLNS(c) ((JS_CCODE(c) & 0x00000080) || (c) == '.' || \ 269 (c) == '-' || (c) == '_') 270 #define JS_ISXMLNAMESTART(c) (JS_ISXMLNSSTART(c) || (c) == ':') 271 #define JS_ISXMLNAME(c) (JS_ISXMLNS(c) || (c) == ':') 272 273 #define JS_ISDIGIT(c) (JS_CTYPE(c) == JSCT_DECIMAL_DIGIT_NUMBER) 274 275 /* XXXbe unify on A/X/Y tbls, avoid ctype.h? */ 276 /* XXXbe fs, etc. ? */ 277 #define JS_ISSPACE(c) ((JS_CCODE(c) & 0x00070000) == 0x00040000) 278 #define JS_ISPRINT(c) ((c) < 128 && isprint(c)) 279 280 #define JS_ISUPPER(c) (JS_CTYPE(c) == JSCT_UPPERCASE_LETTER) 281 #define JS_ISLOWER(c) (JS_CTYPE(c) == JSCT_LOWERCASE_LETTER) 282 283 #define JS_TOUPPER(c) ((jschar) ((JS_CCODE(c) & 0x00100000) \ 284 ? (c) - ((int32)JS_CCODE(c) >> 22) \ 285 : (c))) 286 #define JS_TOLOWER(c) ((jschar) ((JS_CCODE(c) & 0x00200000) \ 287 ? (c) + ((int32)JS_CCODE(c) >> 22) \ 288 : (c))) 289 290 /* 291 * Shorthands for ASCII (7-bit) decimal and hex conversion. 292 * Manually inline isdigit for performance; MSVC doesn't do this for us. 293 */ 294 #define JS7_ISDEC(c) ((((unsigned)(c)) - '0') <= 9) 295 #define JS7_UNDEC(c) ((c) - '0') 296 #define JS7_ISHEX(c) ((c) < 128 && isxdigit(c)) 297 #define JS7_UNHEX(c) (uintN)(JS7_ISDEC(c) ? (c) - '0' : 10 + tolower(c) - 'a') 298 #define JS7_ISLET(c) ((c) < 128 && isalpha(c)) 299 300 /* Initialize per-runtime string state for the first context in the runtime. */ 301 extern JSBool 302 js_InitRuntimeStringState(JSContext *cx); 303 304 extern void 305 js_FinishRuntimeStringState(JSContext *cx); 306 307 extern void 308 js_FinishDeflatedStringCache(JSRuntime *rt); 309 310 /* Initialize the String class, returning its prototype object. */ 311 extern JSClass js_StringClass; 312 313 extern JSObject * 314 js_InitStringClass(JSContext *cx, JSObject *obj); 315 316 extern const char js_escape_str[]; 317 extern const char js_unescape_str[]; 318 extern const char js_uneval_str[]; 319 extern const char js_decodeURI_str[]; 320 extern const char js_encodeURI_str[]; 321 extern const char js_decodeURIComponent_str[]; 322 extern const char js_encodeURIComponent_str[]; 323 324 /* GC-allocate a string descriptor for the given malloc-allocated chars. */ 325 extern JSString * 326 js_NewString(JSContext *cx, jschar *chars, size_t length, uintN gcflag); 327 328 extern JSString * 329 js_NewDependentString(JSContext *cx, JSString *base, size_t start, 330 size_t length, uintN gcflag); 331 332 /* Copy a counted string and GC-allocate a descriptor for it. */ 333 extern JSString * 334 js_NewStringCopyN(JSContext *cx, const jschar *s, size_t n, uintN gcflag); 335 336 /* Copy a C string and GC-allocate a descriptor for it. */ 337 extern JSString * 338 js_NewStringCopyZ(JSContext *cx, const jschar *s, uintN gcflag); 339 340 /* Free the chars held by str when it is finalized by the GC. */ 341 extern void 342 js_FinalizeString(JSContext *cx, JSString *str); 343 344 extern void 345 js_FinalizeStringRT(JSRuntime *rt, JSString *str); 346 347 /* Wrap a string value in a String object. */ 348 extern JSObject * 349 js_StringToObject(JSContext *cx, JSString *str); 350 351 /* 352 * Convert a value to a printable C string. 353 */ 354 typedef JSString *(*JSValueToStringFun)(JSContext *cx, jsval v); 355 356 extern JS_FRIEND_API(const char *) 357 js_ValueToPrintable(JSContext *cx, jsval v, JSValueToStringFun v2sfun); 358 359 #define js_ValueToPrintableString(cx,v) \ 360 js_ValueToPrintable(cx, v, js_ValueToString) 361 362 #define js_ValueToPrintableSource(cx,v) \ 363 js_ValueToPrintable(cx, v, js_ValueToSource) 364 365 /* 366 * Convert a value to a string, returning null after reporting an error, 367 * otherwise returning a new string reference. 368 */ 369 extern JS_FRIEND_API(JSString *) 370 js_ValueToString(JSContext *cx, jsval v); 371 372 /* 373 * Convert a value to its source expression, returning null after reporting 374 * an error, otherwise returning a new string reference. 375 */ 376 extern JS_FRIEND_API(JSString *) 377 js_ValueToSource(JSContext *cx, jsval v); 378 379 #ifdef HT_ENUMERATE_NEXT /* XXX don't require jshash.h */ 380 /* 381 * Compute a hash function from str. 382 */ 383 extern JSHashNumber 384 js_HashString(JSString *str); 385 #endif 386 387 /* 388 * Return less than, equal to, or greater than zero depending on whether 389 * str1 is less than, equal to, or greater than str2. 390 */ 391 extern intN 392 js_CompareStrings(JSString *str1, JSString *str2); 393 394 /* 395 * Test if strings are equal. 396 */ 397 extern JSBool 398 js_EqualStrings(JSString *str1, JSString *str2); 399 400 /* 401 * Boyer-Moore-Horspool superlinear search for pat:patlen in text:textlen. 402 * The patlen argument must be positive and no greater than BMH_PATLEN_MAX. 403 * The start argument tells where in text to begin the search. 404 * 405 * Return the index of pat in text, or -1 if not found. 406 */ 407 #define BMH_CHARSET_SIZE 256 /* ISO-Latin-1 */ 408 #define BMH_PATLEN_MAX 255 /* skip table element is uint8 */ 409 410 #define BMH_BAD_PATTERN (-2) /* return value if pat is not ISO-Latin-1 */ 411 412 extern jsint 413 js_BoyerMooreHorspool(const jschar *text, jsint textlen, 414 const jschar *pat, jsint patlen, 415 jsint start); 416 417 extern size_t 418 js_strlen(const jschar *s); 419 420 extern jschar * 421 js_strchr(const jschar *s, jschar c); 422 423 extern jschar * 424 js_strchr_limit(const jschar *s, jschar c, const jschar *limit); 425 426 #define js_strncpy(t, s, n) memcpy((t), (s), (n) * sizeof(jschar)) 427 428 /* 429 * Return s advanced past any Unicode white space characters. 430 */ 431 extern const jschar * 432 js_SkipWhiteSpace(const jschar *s); 433 434 /* 435 * Inflate bytes to JS chars and vice versa. Report out of memory via cx 436 * and return null on error, otherwise return the jschar or byte vector that 437 * was JS_malloc'ed. length is updated with the length of the new string in jschars. 438 */ 439 extern jschar * 440 js_InflateString(JSContext *cx, const char *bytes, size_t *length); 441 442 extern char * 443 js_DeflateString(JSContext *cx, const jschar *chars, size_t length); 444 445 /* 446 * Inflate bytes to JS chars into a buffer. 447 * 'chars' must be large enough for 'length' jschars. 448 * The buffer is NOT null-terminated. 449 * cx may be NULL, which means no errors are thrown. 450 * The destination length needs to be initialized with the buffer size, takes 451 * the number of chars moved. 452 */ 453 extern JSBool 454 js_InflateStringToBuffer(JSContext* cx, const char *bytes, size_t length, 455 jschar *chars, size_t* charsLength); 456 457 /* 458 * Deflate JS chars to bytes into a buffer. 459 * 'bytes' must be large enough for 'length chars. 460 * The buffer is NOT null-terminated. 461 * cx may be NULL, which means no errors are thrown. 462 * The destination length needs to be initialized with the buffer size, takes 463 * the number of bytes moved. 464 */ 465 extern JSBool 466 js_DeflateStringToBuffer(JSContext* cx, const jschar *chars, 467 size_t charsLength, char *bytes, size_t* length); 468 469 /* 470 * Associate bytes with str in the deflated string cache, returning true on 471 * successful association, false on out of memory. 472 */ 473 extern JSBool 474 js_SetStringBytes(JSRuntime *rt, JSString *str, char *bytes, size_t length); 475 476 /* 477 * Find or create a deflated string cache entry for str that contains its 478 * characters chopped from Unicode code points into bytes. 479 */ 480 extern char * 481 js_GetStringBytes(JSRuntime *rt, JSString *str); 482 483 /* Remove a deflated string cache entry associated with str if any. */ 484 extern void 485 js_PurgeDeflatedStringCache(JSRuntime *rt, JSString *str); 486 487 JSBool 488 js_str_escape(JSContext *cx, JSObject *obj, uintN argc, jsval *argv, 489 jsval *rval); 490 491 /* 492 * Convert one UCS-4 char and write it into a UTF-8 buffer, which must be at 493 * least 6 bytes long. Return the number of UTF-8 bytes of data written. 494 */ 495 extern int 496 js_OneUcs4ToUtf8Char(uint8 *utf8Buffer, uint32 ucs4Char); 497 498 JS_END_EXTERN_C 499 500 #endif /* jsstr_h___ */ 501