1 /* 2 * Unicode helpers 3 */ 4 5 #ifndef DUK_UNICODE_H_INCLUDED 6 #define DUK_UNICODE_H_INCLUDED 7 8 /* 9 * UTF-8 / XUTF-8 / CESU-8 constants 10 */ 11 12 #define DUK_UNICODE_MAX_XUTF8_LENGTH 7 /* up to 36 bit codepoints */ 13 #define DUK_UNICODE_MAX_XUTF8_BMP_LENGTH 3 /* all codepoints up to U+FFFF */ 14 #define DUK_UNICODE_MAX_CESU8_LENGTH 6 /* all codepoints up to U+10FFFF */ 15 #define DUK_UNICODE_MAX_CESU8_BMP_LENGTH 3 /* all codepoints up to U+FFFF */ 16 17 /* 18 * Useful Unicode codepoints 19 * 20 * Integer constants must be signed to avoid unexpected coercions 21 * in comparisons. 22 */ 23 24 #define DUK_UNICODE_CP_ZWNJ 0x200cL /* zero-width non-joiner */ 25 #define DUK_UNICODE_CP_ZWJ 0x200dL /* zero-width joiner */ 26 #define DUK_UNICODE_CP_REPLACEMENT_CHARACTER 0xfffdL /* http://en.wikipedia.org/wiki/Replacement_character#Replacement_character */ 27 28 /* 29 * ASCII character constants 30 * 31 * C character literals like 'x' have a platform specific value and do 32 * not match ASCII (UTF-8) values on e.g. EBCDIC platforms. So, use 33 * these (admittedly awkward) constants instead. These constants must 34 * also have signed values to avoid unexpected coercions in comparisons. 35 * 36 * http://en.wikipedia.org/wiki/ASCII 37 */ 38 39 #define DUK_ASC_NUL 0x00 40 #define DUK_ASC_SOH 0x01 41 #define DUK_ASC_STX 0x02 42 #define DUK_ASC_ETX 0x03 43 #define DUK_ASC_EOT 0x04 44 #define DUK_ASC_ENQ 0x05 45 #define DUK_ASC_ACK 0x06 46 #define DUK_ASC_BEL 0x07 47 #define DUK_ASC_BS 0x08 48 #define DUK_ASC_HT 0x09 49 #define DUK_ASC_LF 0x0a 50 #define DUK_ASC_VT 0x0b 51 #define DUK_ASC_FF 0x0c 52 #define DUK_ASC_CR 0x0d 53 #define DUK_ASC_SO 0x0e 54 #define DUK_ASC_SI 0x0f 55 #define DUK_ASC_DLE 0x10 56 #define DUK_ASC_DC1 0x11 57 #define DUK_ASC_DC2 0x12 58 #define DUK_ASC_DC3 0x13 59 #define DUK_ASC_DC4 0x14 60 #define DUK_ASC_NAK 0x15 61 #define DUK_ASC_SYN 0x16 62 #define DUK_ASC_ETB 0x17 63 #define DUK_ASC_CAN 0x18 64 #define DUK_ASC_EM 0x19 65 #define DUK_ASC_SUB 0x1a 66 #define DUK_ASC_ESC 0x1b 67 #define DUK_ASC_FS 0x1c 68 #define DUK_ASC_GS 0x1d 69 #define DUK_ASC_RS 0x1e 70 #define DUK_ASC_US 0x1f 71 #define DUK_ASC_SPACE 0x20 72 #define DUK_ASC_EXCLAMATION 0x21 73 #define DUK_ASC_DOUBLEQUOTE 0x22 74 #define DUK_ASC_HASH 0x23 75 #define DUK_ASC_DOLLAR 0x24 76 #define DUK_ASC_PERCENT 0x25 77 #define DUK_ASC_AMP 0x26 78 #define DUK_ASC_SINGLEQUOTE 0x27 79 #define DUK_ASC_LPAREN 0x28 80 #define DUK_ASC_RPAREN 0x29 81 #define DUK_ASC_STAR 0x2a 82 #define DUK_ASC_PLUS 0x2b 83 #define DUK_ASC_COMMA 0x2c 84 #define DUK_ASC_MINUS 0x2d 85 #define DUK_ASC_PERIOD 0x2e 86 #define DUK_ASC_SLASH 0x2f 87 #define DUK_ASC_0 0x30 88 #define DUK_ASC_1 0x31 89 #define DUK_ASC_2 0x32 90 #define DUK_ASC_3 0x33 91 #define DUK_ASC_4 0x34 92 #define DUK_ASC_5 0x35 93 #define DUK_ASC_6 0x36 94 #define DUK_ASC_7 0x37 95 #define DUK_ASC_8 0x38 96 #define DUK_ASC_9 0x39 97 #define DUK_ASC_COLON 0x3a 98 #define DUK_ASC_SEMICOLON 0x3b 99 #define DUK_ASC_LANGLE 0x3c 100 #define DUK_ASC_EQUALS 0x3d 101 #define DUK_ASC_RANGLE 0x3e 102 #define DUK_ASC_QUESTION 0x3f 103 #define DUK_ASC_ATSIGN 0x40 104 #define DUK_ASC_UC_A 0x41 105 #define DUK_ASC_UC_B 0x42 106 #define DUK_ASC_UC_C 0x43 107 #define DUK_ASC_UC_D 0x44 108 #define DUK_ASC_UC_E 0x45 109 #define DUK_ASC_UC_F 0x46 110 #define DUK_ASC_UC_G 0x47 111 #define DUK_ASC_UC_H 0x48 112 #define DUK_ASC_UC_I 0x49 113 #define DUK_ASC_UC_J 0x4a 114 #define DUK_ASC_UC_K 0x4b 115 #define DUK_ASC_UC_L 0x4c 116 #define DUK_ASC_UC_M 0x4d 117 #define DUK_ASC_UC_N 0x4e 118 #define DUK_ASC_UC_O 0x4f 119 #define DUK_ASC_UC_P 0x50 120 #define DUK_ASC_UC_Q 0x51 121 #define DUK_ASC_UC_R 0x52 122 #define DUK_ASC_UC_S 0x53 123 #define DUK_ASC_UC_T 0x54 124 #define DUK_ASC_UC_U 0x55 125 #define DUK_ASC_UC_V 0x56 126 #define DUK_ASC_UC_W 0x57 127 #define DUK_ASC_UC_X 0x58 128 #define DUK_ASC_UC_Y 0x59 129 #define DUK_ASC_UC_Z 0x5a 130 #define DUK_ASC_LBRACKET 0x5b 131 #define DUK_ASC_BACKSLASH 0x5c 132 #define DUK_ASC_RBRACKET 0x5d 133 #define DUK_ASC_CARET 0x5e 134 #define DUK_ASC_UNDERSCORE 0x5f 135 #define DUK_ASC_GRAVE 0x60 136 #define DUK_ASC_LC_A 0x61 137 #define DUK_ASC_LC_B 0x62 138 #define DUK_ASC_LC_C 0x63 139 #define DUK_ASC_LC_D 0x64 140 #define DUK_ASC_LC_E 0x65 141 #define DUK_ASC_LC_F 0x66 142 #define DUK_ASC_LC_G 0x67 143 #define DUK_ASC_LC_H 0x68 144 #define DUK_ASC_LC_I 0x69 145 #define DUK_ASC_LC_J 0x6a 146 #define DUK_ASC_LC_K 0x6b 147 #define DUK_ASC_LC_L 0x6c 148 #define DUK_ASC_LC_M 0x6d 149 #define DUK_ASC_LC_N 0x6e 150 #define DUK_ASC_LC_O 0x6f 151 #define DUK_ASC_LC_P 0x70 152 #define DUK_ASC_LC_Q 0x71 153 #define DUK_ASC_LC_R 0x72 154 #define DUK_ASC_LC_S 0x73 155 #define DUK_ASC_LC_T 0x74 156 #define DUK_ASC_LC_U 0x75 157 #define DUK_ASC_LC_V 0x76 158 #define DUK_ASC_LC_W 0x77 159 #define DUK_ASC_LC_X 0x78 160 #define DUK_ASC_LC_Y 0x79 161 #define DUK_ASC_LC_Z 0x7a 162 #define DUK_ASC_LCURLY 0x7b 163 #define DUK_ASC_PIPE 0x7c 164 #define DUK_ASC_RCURLY 0x7d 165 #define DUK_ASC_TILDE 0x7e 166 #define DUK_ASC_DEL 0x7f 167 168 /* 169 * Unicode tables 170 */ 171 172 #ifdef DUK_USE_SOURCE_NONBMP 173 /* 174 * Automatically generated by extract_chars.py, do not edit! 175 */ 176 177 extern const duk_uint8_t duk_unicode_ids_noa[791]; 178 #else 179 /* 180 * Automatically generated by extract_chars.py, do not edit! 181 */ 182 183 extern const duk_uint8_t duk_unicode_ids_noabmp[611]; 184 #endif 185 186 #ifdef DUK_USE_SOURCE_NONBMP 187 /* 188 * Automatically generated by extract_chars.py, do not edit! 189 */ 190 191 extern const duk_uint8_t duk_unicode_ids_m_let_noa[42]; 192 #else 193 /* 194 * Automatically generated by extract_chars.py, do not edit! 195 */ 196 197 extern const duk_uint8_t duk_unicode_ids_m_let_noabmp[24]; 198 #endif 199 200 #ifdef DUK_USE_SOURCE_NONBMP 201 /* 202 * Automatically generated by extract_chars.py, do not edit! 203 */ 204 205 extern const duk_uint8_t duk_unicode_idp_m_ids_noa[397]; 206 #else 207 /* 208 * Automatically generated by extract_chars.py, do not edit! 209 */ 210 211 extern const duk_uint8_t duk_unicode_idp_m_ids_noabmp[348]; 212 #endif 213 214 /* 215 * Automatically generated by extract_caseconv.py, do not edit! 216 */ 217 218 extern const duk_uint8_t duk_unicode_caseconv_uc[1288]; 219 extern const duk_uint8_t duk_unicode_caseconv_lc[616]; 220 221 #if defined(DUK_USE_REGEXP_CANON_WORKAROUND) 222 /* 223 * Automatically generated by extract_caseconv.py, do not edit! 224 */ 225 226 extern const duk_uint16_t duk_unicode_re_canon_lookup[65536]; 227 #endif 228 229 /* 230 * Extern 231 */ 232 233 /* duk_unicode_support.c */ 234 #if !defined(DUK_SINGLE_FILE) 235 DUK_INTERNAL_DECL const duk_uint8_t duk_unicode_xutf8_markers[7]; 236 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_digit[2]; 237 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_white[22]; 238 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_wordchar[8]; 239 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_not_digit[4]; 240 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_not_white[24]; 241 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_not_wordchar[10]; 242 DUK_INTERNAL_DECL const duk_int8_t duk_is_idchar_tab[128]; 243 #endif /* !DUK_SINGLE_FILE */ 244 245 /* 246 * Prototypes 247 */ 248 249 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp); 250 #if defined(DUK_USE_ASSERTIONS) 251 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp); 252 #endif 253 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out); 254 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out); 255 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp); 256 DUK_INTERNAL_DECL duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end); 257 DUK_INTERNAL_DECL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen); 258 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp); 259 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp); 260 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp); 261 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp); 262 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp); 263 DUK_INTERNAL_DECL void duk_unicode_case_convert_string(duk_hthread *thr, duk_bool_t uppercase); 264 DUK_INTERNAL_DECL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp); 265 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t cp); 266 267 #endif /* DUK_UNICODE_H_INCLUDED */ 268