1 /*
2  *  Unicode helpers
3  */
4 
5 #ifndef DUK_UNICODE_H_INCLUDED
6 #define DUK_UNICODE_H_INCLUDED
7 
8 /*
9  *  UTF-8 / XUTF-8 / CESU-8 constants: maximum encoded length of one codepoint
10  */
11 
12 #define DUK_UNICODE_MAX_XUTF8_LENGTH      7   /* longest XUTF-8 encoding: up to 36 bit codepoints */
13 #define DUK_UNICODE_MAX_XUTF8_BMP_LENGTH  3   /* XUTF-8, BMP only: all codepoints up to U+FFFF */
14 #define DUK_UNICODE_MAX_CESU8_LENGTH      6   /* longest CESU-8 encoding: all codepoints up to U+10FFFF */
15 #define DUK_UNICODE_MAX_CESU8_BMP_LENGTH  3   /* CESU-8, BMP only: all codepoints up to U+FFFF */
16 
17 /*
18  *  Useful Unicode codepoints
19  *
20  *  Integer constants must be signed to avoid unexpected coercions
21  *  in comparisons; the literals below therefore use the (signed) L suffix.
22  */
23 
24 #define DUK_UNICODE_CP_ZWNJ                   0x200cL  /* U+200C, zero-width non-joiner */
25 #define DUK_UNICODE_CP_ZWJ                    0x200dL  /* U+200D, zero-width joiner */
26 #define DUK_UNICODE_CP_REPLACEMENT_CHARACTER  0xfffdL  /* U+FFFD, see http://en.wikipedia.org/wiki/Replacement_character#Replacement_character */
27 
28 /*
29  *  ASCII character constants
30  *
31  *  C character literals like 'x' have a platform specific value and do
32  *  not match ASCII (UTF-8) values on e.g. EBCDIC platforms.  So, use
33  *  these (admittedly awkward) constants instead.  These constants must
34  *  also have signed values to avoid unexpected coercions in comparisons.
35  *
36  *  http://en.wikipedia.org/wiki/ASCII
37  */
38 
39 #define DUK_ASC_NUL              0x00  /* null */
40 #define DUK_ASC_SOH              0x01  /* start of heading */
41 #define DUK_ASC_STX              0x02  /* start of text */
42 #define DUK_ASC_ETX              0x03  /* end of text */
43 #define DUK_ASC_EOT              0x04  /* end of transmission */
44 #define DUK_ASC_ENQ              0x05  /* enquiry */
45 #define DUK_ASC_ACK              0x06  /* acknowledge */
46 #define DUK_ASC_BEL              0x07  /* bell */
47 #define DUK_ASC_BS               0x08  /* backspace */
48 #define DUK_ASC_HT               0x09  /* horizontal tab */
49 #define DUK_ASC_LF               0x0a  /* line feed */
50 #define DUK_ASC_VT               0x0b  /* vertical tab */
51 #define DUK_ASC_FF               0x0c  /* form feed */
52 #define DUK_ASC_CR               0x0d  /* carriage return */
53 #define DUK_ASC_SO               0x0e  /* shift out */
54 #define DUK_ASC_SI               0x0f  /* shift in */
55 #define DUK_ASC_DLE              0x10  /* data link escape */
56 #define DUK_ASC_DC1              0x11  /* device control 1 */
57 #define DUK_ASC_DC2              0x12  /* device control 2 */
58 #define DUK_ASC_DC3              0x13  /* device control 3 */
59 #define DUK_ASC_DC4              0x14  /* device control 4 */
60 #define DUK_ASC_NAK              0x15  /* negative acknowledge */
61 #define DUK_ASC_SYN              0x16  /* synchronous idle */
62 #define DUK_ASC_ETB              0x17  /* end of transmission block */
63 #define DUK_ASC_CAN              0x18  /* cancel */
64 #define DUK_ASC_EM               0x19  /* end of medium */
65 #define DUK_ASC_SUB              0x1a  /* substitute */
66 #define DUK_ASC_ESC              0x1b  /* escape */
67 #define DUK_ASC_FS               0x1c  /* file separator */
68 #define DUK_ASC_GS               0x1d  /* group separator */
69 #define DUK_ASC_RS               0x1e  /* record separator */
70 #define DUK_ASC_US               0x1f  /* unit separator */
71 #define DUK_ASC_SPACE            0x20  /* first printable character */
72 #define DUK_ASC_EXCLAMATION      0x21
73 #define DUK_ASC_DOUBLEQUOTE      0x22
74 #define DUK_ASC_HASH             0x23
75 #define DUK_ASC_DOLLAR           0x24
76 #define DUK_ASC_PERCENT          0x25
77 #define DUK_ASC_AMP              0x26  /* ampersand */
78 #define DUK_ASC_SINGLEQUOTE      0x27
79 #define DUK_ASC_LPAREN           0x28
80 #define DUK_ASC_RPAREN           0x29
81 #define DUK_ASC_STAR             0x2a  /* asterisk */
82 #define DUK_ASC_PLUS             0x2b
83 #define DUK_ASC_COMMA            0x2c
84 #define DUK_ASC_MINUS            0x2d  /* hyphen-minus */
85 #define DUK_ASC_PERIOD           0x2e
86 #define DUK_ASC_SLASH            0x2f
87 #define DUK_ASC_0                0x30  /* decimal digits 0-9 are contiguous: 0x30-0x39 */
88 #define DUK_ASC_1                0x31
89 #define DUK_ASC_2                0x32
90 #define DUK_ASC_3                0x33
91 #define DUK_ASC_4                0x34
92 #define DUK_ASC_5                0x35
93 #define DUK_ASC_6                0x36
94 #define DUK_ASC_7                0x37
95 #define DUK_ASC_8                0x38
96 #define DUK_ASC_9                0x39
97 #define DUK_ASC_COLON            0x3a
98 #define DUK_ASC_SEMICOLON        0x3b
99 #define DUK_ASC_LANGLE           0x3c  /* less-than sign */
100 #define DUK_ASC_EQUALS           0x3d
101 #define DUK_ASC_RANGLE           0x3e  /* greater-than sign */
102 #define DUK_ASC_QUESTION         0x3f
103 #define DUK_ASC_ATSIGN           0x40
104 #define DUK_ASC_UC_A             0x41  /* UC = upper case; A-Z are contiguous: 0x41-0x5a */
105 #define DUK_ASC_UC_B             0x42
106 #define DUK_ASC_UC_C             0x43
107 #define DUK_ASC_UC_D             0x44
108 #define DUK_ASC_UC_E             0x45
109 #define DUK_ASC_UC_F             0x46
110 #define DUK_ASC_UC_G             0x47
111 #define DUK_ASC_UC_H             0x48
112 #define DUK_ASC_UC_I             0x49
113 #define DUK_ASC_UC_J             0x4a
114 #define DUK_ASC_UC_K             0x4b
115 #define DUK_ASC_UC_L             0x4c
116 #define DUK_ASC_UC_M             0x4d
117 #define DUK_ASC_UC_N             0x4e
118 #define DUK_ASC_UC_O             0x4f
119 #define DUK_ASC_UC_P             0x50
120 #define DUK_ASC_UC_Q             0x51
121 #define DUK_ASC_UC_R             0x52
122 #define DUK_ASC_UC_S             0x53
123 #define DUK_ASC_UC_T             0x54
124 #define DUK_ASC_UC_U             0x55
125 #define DUK_ASC_UC_V             0x56
126 #define DUK_ASC_UC_W             0x57
127 #define DUK_ASC_UC_X             0x58
128 #define DUK_ASC_UC_Y             0x59
129 #define DUK_ASC_UC_Z             0x5a
130 #define DUK_ASC_LBRACKET         0x5b
131 #define DUK_ASC_BACKSLASH        0x5c
132 #define DUK_ASC_RBRACKET         0x5d
133 #define DUK_ASC_CARET            0x5e
134 #define DUK_ASC_UNDERSCORE       0x5f
135 #define DUK_ASC_GRAVE            0x60  /* grave accent (backtick) */
136 #define DUK_ASC_LC_A             0x61  /* LC = lower case; a-z are contiguous: 0x61-0x7a */
137 #define DUK_ASC_LC_B             0x62
138 #define DUK_ASC_LC_C             0x63
139 #define DUK_ASC_LC_D             0x64
140 #define DUK_ASC_LC_E             0x65
141 #define DUK_ASC_LC_F             0x66
142 #define DUK_ASC_LC_G             0x67
143 #define DUK_ASC_LC_H             0x68
144 #define DUK_ASC_LC_I             0x69
145 #define DUK_ASC_LC_J             0x6a
146 #define DUK_ASC_LC_K             0x6b
147 #define DUK_ASC_LC_L             0x6c
148 #define DUK_ASC_LC_M             0x6d
149 #define DUK_ASC_LC_N             0x6e
150 #define DUK_ASC_LC_O             0x6f
151 #define DUK_ASC_LC_P             0x70
152 #define DUK_ASC_LC_Q             0x71
153 #define DUK_ASC_LC_R             0x72
154 #define DUK_ASC_LC_S             0x73
155 #define DUK_ASC_LC_T             0x74
156 #define DUK_ASC_LC_U             0x75
157 #define DUK_ASC_LC_V             0x76
158 #define DUK_ASC_LC_W             0x77
159 #define DUK_ASC_LC_X             0x78
160 #define DUK_ASC_LC_Y             0x79
161 #define DUK_ASC_LC_Z             0x7a
162 #define DUK_ASC_LCURLY           0x7b  /* left curly brace */
163 #define DUK_ASC_PIPE             0x7c  /* vertical bar */
164 #define DUK_ASC_RCURLY           0x7d  /* right curly brace */
165 #define DUK_ASC_TILDE            0x7e
166 #define DUK_ASC_DEL              0x7f  /* delete */
167 
168 /*
169  *  Unicode tables
170  */
171 
172 #ifdef DUK_USE_SOURCE_NONBMP
173 /*
174  *  Automatically generated by extract_chars.py, do not edit!
175  */
176 
177 extern const duk_uint8_t duk_unicode_ids_noa[791];  /* NOTE(review): name suggests an identifier-start table; encoding is defined by the generator -- confirm */
178 #else  /* !DUK_USE_SOURCE_NONBMP */
179 /*
180  *  Automatically generated by extract_chars.py, do not edit!
181  */
182 
183 extern const duk_uint8_t duk_unicode_ids_noabmp[611];  /* smaller variant declared when DUK_USE_SOURCE_NONBMP is not defined; presumably BMP-only data -- confirm */
184 #endif  /* DUK_USE_SOURCE_NONBMP */
185 
186 #ifdef DUK_USE_SOURCE_NONBMP
187 /*
188  *  Automatically generated by extract_chars.py, do not edit!
189  */
190 
191 extern const duk_uint8_t duk_unicode_ids_m_let_noa[42];  /* NOTE(review): "ids_m_let" presumably means identifier-start minus letters -- confirm against extract_chars.py */
192 #else  /* !DUK_USE_SOURCE_NONBMP */
193 /*
194  *  Automatically generated by extract_chars.py, do not edit!
195  */
196 
197 extern const duk_uint8_t duk_unicode_ids_m_let_noabmp[24];  /* smaller variant declared when DUK_USE_SOURCE_NONBMP is not defined */
198 #endif  /* DUK_USE_SOURCE_NONBMP */
199 
200 #ifdef DUK_USE_SOURCE_NONBMP
201 /*
202  *  Automatically generated by extract_chars.py, do not edit!
203  */
204 
205 extern const duk_uint8_t duk_unicode_idp_m_ids_noa[397];  /* NOTE(review): "idp_m_ids" presumably means identifier-part minus identifier-start -- confirm against extract_chars.py */
206 #else  /* !DUK_USE_SOURCE_NONBMP */
207 /*
208  *  Automatically generated by extract_chars.py, do not edit!
209  */
210 
211 extern const duk_uint8_t duk_unicode_idp_m_ids_noabmp[348];  /* smaller variant declared when DUK_USE_SOURCE_NONBMP is not defined */
212 #endif  /* DUK_USE_SOURCE_NONBMP */
213 
214 /*
215  *  Automatically generated by extract_caseconv.py, do not edit!  Declared unconditionally (no feature guard).
216  */
217 
218 extern const duk_uint8_t duk_unicode_caseconv_uc[1288];  /* NOTE(review): presumably upper-case conversion data for duk_unicode_case_convert_string() -- confirm */
219 extern const duk_uint8_t duk_unicode_caseconv_lc[616];  /* NOTE(review): presumably the lower-case counterpart -- confirm */
220 
221 #if defined(DUK_USE_REGEXP_CANON_WORKAROUND)
222 /*
223  *  Automatically generated by extract_caseconv.py, do not edit!
224  */
225 
226 extern const duk_uint16_t duk_unicode_re_canon_lookup[65536];  /* 65536 16-bit entries; NOTE(review): presumably one per BMP codepoint for duk_unicode_re_canonicalize_char() -- confirm */
227 #endif  /* DUK_USE_REGEXP_CANON_WORKAROUND */
228 
229 /*
230  *  Extern declarations for lookup tables; omitted when DUK_SINGLE_FILE is defined
231  */
232 
233 /* duk_unicode_support.c */
234 #if !defined(DUK_SINGLE_FILE)
235 DUK_INTERNAL_DECL const duk_uint8_t duk_unicode_xutf8_markers[7];  /* 7 entries, same count as DUK_UNICODE_MAX_XUTF8_LENGTH */
236 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_digit[2];
237 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_white[22];
238 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_wordchar[8];
239 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_not_digit[4];
240 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_not_white[24];
241 DUK_INTERNAL_DECL const duk_uint16_t duk_unicode_re_ranges_not_wordchar[10];
242 DUK_INTERNAL_DECL const duk_int8_t duk_is_idchar_tab[128];  /* 128 entries, same count as the DUK_ASC_* range 0x00-0x7f */
243 #endif  /* !DUK_SINGLE_FILE */
244 
245 /*
246  *  Prototypes.  Encoding/length helpers take an unsigned duk_ucodepoint_t; classification and regexp helpers take a signed duk_codepoint_t.
247  */
248 
249 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_xutf8_length(duk_ucodepoint_t cp);
250 #if defined(DUK_USE_ASSERTIONS)
251 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_get_cesu8_length(duk_ucodepoint_t cp);  /* declared only when DUK_USE_ASSERTIONS is defined */
252 #endif  /* DUK_USE_ASSERTIONS */
253 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_xutf8(duk_ucodepoint_t cp, duk_uint8_t *out);
254 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_encode_cesu8(duk_ucodepoint_t cp, duk_uint8_t *out);
255 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_decode_xutf8(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end, duk_ucodepoint_t *out_cp);
256 DUK_INTERNAL_DECL duk_ucodepoint_t duk_unicode_decode_xutf8_checked(duk_hthread *thr, const duk_uint8_t **ptr, const duk_uint8_t *ptr_start, const duk_uint8_t *ptr_end);
257 DUK_INTERNAL_DECL duk_size_t duk_unicode_unvalidated_utf8_length(const duk_uint8_t *data, duk_size_t blen);
258 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_whitespace(duk_codepoint_t cp);
259 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_line_terminator(duk_codepoint_t cp);
260 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_identifier_start(duk_codepoint_t cp);
261 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_identifier_part(duk_codepoint_t cp);
262 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_is_letter(duk_codepoint_t cp);
263 DUK_INTERNAL_DECL void duk_unicode_case_convert_string(duk_hthread *thr, duk_bool_t uppercase);
264 DUK_INTERNAL_DECL duk_codepoint_t duk_unicode_re_canonicalize_char(duk_hthread *thr, duk_codepoint_t cp);
265 DUK_INTERNAL_DECL duk_small_int_t duk_unicode_re_is_wordchar(duk_codepoint_t cp);
266 
267 #endif  /* DUK_UNICODE_H_INCLUDED */
268