1 
2 /*
3  * Copyright (C) Igor Sysoev
4  * Copyright (C) NGINX, Inc.
5  */
6 
7 #ifndef _NJS_STRING_H_INCLUDED_
8 #define _NJS_STRING_H_INCLUDED_
9 
10 
11 /*
12  * nJSVM supports two string variants:
13  *
 * 1) short strings whose size is less than or equal to 14 (NJS_STRING_SHORT)
 *    bytes; these strings are stored inside njs_value_t (see njs_vm.h for
 *    details);
17  *
18  * 2) and long strings using additional njs_string_t structure.
19  *    This structure has the start field to support external strings.
20  *    The long strings can have optional UTF-8 offset map.
21  *
22  * The number of the string variants is limited to 2 variants to minimize
23  * overhead of processing string fields.
24  */
25 
/* The maximum signed int32_t. */
#define NJS_STRING_MAX_LENGTH  0x7fffffff

/*
 * NJS_STRING_MAP_STRIDE should be a power of two so that shift and binary
 * AND operations can be used instead of division and remainder operations,
 * but no less than 16 because the maximum length of a short string inlined
 * in njs_value_t is less than 16 bytes.
 */
#define NJS_STRING_MAP_STRIDE  32

/* Passes size to njs_align_size() with uint32_t granularity. */
#define njs_string_map_offset(size)  njs_align_size((size), sizeof(uint32_t))

/* Passes p to njs_align_ptr() and yields the result as a uint32_t pointer. */
#define njs_string_map_start(p)                                               \
    ((uint32_t *) njs_align_ptr((p), sizeof(uint32_t)))

/*
 * Size in bytes of the offset map for a string of the given length:
 * one uint32_t entry per ((length - 1) / NJS_STRING_MAP_STRIDE).
 *
 * The argument is parenthesized so that compound expressions (for example
 * a conditional expression) are decremented as a whole.  The macro is
 * meant for a non-zero length: with an unsigned zero the subtraction wraps.
 */
#define njs_string_map_size(length)                                           \
    ((((length) - 1) / NJS_STRING_MAP_STRIDE) * sizeof(uint32_t))
44 
45 /*
46  * ECMAScript strings are stored in UTF-16.  nJSVM however, allows to store
47  * any byte sequences in strings.  A size of string in bytes is stored in the
48  * size field.  If byte sequence is valid UTF-8 string then its length is
49  * stored in the UTF-8 length field.  Otherwise, the length field is zero.
 * If a string is a UTF-8 string then string functions use UTF-8 character
 * positions and lengths.  Otherwise they use byte positions and lengths.
52  * Using UTF-8 encoding does not allow to get quickly a character at specified
53  * position.  To speed up this search a map of offsets is stored after the
54  * UTF-8 string.  The map is aligned to uint32_t and contains byte positions
55  * of each NJS_STRING_MAP_STRIDE UTF-8 character except zero position.  The
 * map can be initialized on demand.  An uninitialized map is marked with a
 * zero value in the first map element.  If a string comes from outside
 * JavaScript as a byte string just to be concatenated or to match regular
 * expressions, the offset map is not required.
60  *
61  * The map is not allocated:
62  * 1) if string length is zero hence string is a byte string;
63  * 2) if string size and length are equal so the string contains only
64  *    ASCII characters and map is not required;
65  * 3) if string length is less than NJS_STRING_MAP_STRIDE.
66  *
67  * The current implementation does not support Unicode surrogate pairs.
68  * It can be implemented later if it will be required using the following
69  * algorithm: if offset in map points to surrogate pair then the previous
70  * offset should be used and so on until start of the string.
71  */
72 
73 struct njs_string_s {
74     u_char    *start;
75     uint32_t  length;   /* Length in UTF-8 characters. */
76     uint32_t  retain;   /* Link counter. */
77 };
78 
79 
80 typedef struct {
81     size_t    size;
82     size_t    length;
83     u_char    *start;
84 } njs_string_prop_t;
85 
86 
/*
 * Parameters of a slice operation.
 * NOTE(review): field meanings below are inferred from the names and from
 * the njs_string_slice*() prototypes -- confirm against the implementation.
 */
typedef struct {
    /* Presumably the position where the slice begins. */
    size_t    start;

    /* Presumably the length of the slice. */
    size_t    length;

    /* Presumably the length of the string being sliced. */
    size_t    string_length;
} njs_slice_prop_t;
92 
93 
/*
 * Kind of string content, as consumed by njs_string_calc_length():
 * a byte string reports length 0, an ASCII string reports its byte size,
 * and a UTF-8 string has its length computed by njs_utf8_length().
 */
typedef enum {
    NJS_STRING_BYTE  = 0,
    NJS_STRING_ASCII = 1,
    NJS_STRING_UTF8  = 2,
} njs_utf8_t;
99 
100 
101 njs_inline njs_bool_t
njs_is_byte_string(njs_string_prop_t * string)102 njs_is_byte_string(njs_string_prop_t *string)
103 {
104     return (string->length == 0 && string->size != 0);
105 }
106 
107 
108 njs_inline uint32_t
njs_string_calc_length(njs_utf8_t utf8,const u_char * start,size_t size)109 njs_string_calc_length(njs_utf8_t utf8, const u_char *start, size_t size)
110 {
111     ssize_t  length;
112 
113     switch (utf8) {
114 
115     case NJS_STRING_BYTE:
116         return 0;
117 
118     case NJS_STRING_ASCII:
119         return size;
120 
121     case NJS_STRING_UTF8:
122     default:
123         length = njs_utf8_length(start, size);
124 
125         return (length >= 0) ? length : 0;
126     }
127 }
128 
129 
130 njs_inline uint32_t
njs_string_length(njs_value_t * string)131 njs_string_length(njs_value_t *string)
132 {
133     uint32_t  length, size;
134 
135     if (string->short_string.size != NJS_STRING_LONG) {
136         size = string->short_string.size;
137         length = string->short_string.length;
138 
139     } else {
140         size = string->long_string.size;
141         length = string->long_string.data->length;
142     }
143 
144     return (length == 0) ? size : length;
145 }
146 
147 
148 njs_inline njs_bool_t
njs_need_escape(const uint32_t * escape,uint32_t byte)149 njs_need_escape(const uint32_t *escape, uint32_t byte)
150 {
151     return ((escape[byte >> 5] & ((uint32_t) 1 << (byte & 0x1f))) != 0);
152 }
153 
154 
155 njs_inline u_char *
njs_string_encode(const uint32_t * escape,size_t size,const u_char * src,u_char * dst)156 njs_string_encode(const uint32_t *escape, size_t size, const u_char *src,
157     u_char *dst)
158 {
159     uint8_t              byte;
160     static const u_char  hex[16] = "0123456789ABCDEF";
161 
162     do {
163         byte = *src++;
164 
165         if (njs_need_escape(escape, byte)) {
166             *dst++ = '%';
167             *dst++ = hex[byte >> 4];
168             *dst++ = hex[byte & 0xf];
169 
170         } else {
171             *dst++ = byte;
172         }
173 
174         size--;
175 
176     } while (size != 0);
177 
178     return dst;
179 }
180 
181 
/*
 * Creation of string values.  All of these take the VM and the destination
 * njs_value_t.  njs_string_alloc() returns a u_char pointer rather than an
 * njs_int_t result (presumably the writable storage for the caller to
 * fill -- confirm in the implementation).
 */
njs_int_t njs_string_set(njs_vm_t *vm, njs_value_t *value, const u_char *start,
    uint32_t size);
u_char *njs_string_alloc(njs_vm_t *vm, njs_value_t *value, uint64_t size,
    uint64_t length);
njs_int_t njs_string_new(njs_vm_t *vm, njs_value_t *value, const u_char *start,
    uint32_t size, uint32_t length);
njs_int_t njs_string_create(njs_vm_t *vm, njs_value_t *value, const char *src,
    size_t size);
190 
/*
 * Codecs working on njs_str_t buffers.  Every encoder/decoder is paired
 * with a *_length() function that takes the same source plus an out_size
 * pointer.
 * NOTE(review): the pairing suggests that callers size dst using the
 * *_length() function before calling the codec -- confirm against callers.
 */
void njs_encode_hex(njs_str_t *dst, const njs_str_t *src);
size_t njs_encode_hex_length(const njs_str_t *src, size_t *out_size);
void njs_encode_base64(njs_str_t *dst, const njs_str_t *src);
size_t njs_encode_base64_length(const njs_str_t *src, size_t *out_size);

void njs_decode_utf8(njs_str_t *dst, const njs_str_t *src);
size_t njs_decode_utf8_length(const njs_str_t *src, size_t *out_size);
void njs_decode_hex(njs_str_t *dst, const njs_str_t *src);
size_t njs_decode_hex_length(const njs_str_t *src, size_t *out_size);
void njs_decode_base64(njs_str_t *dst, const njs_str_t *src);
size_t njs_decode_base64_length(const njs_str_t *src, size_t *out_size);
void njs_decode_base64url(njs_str_t *dst, const njs_str_t *src);
size_t njs_decode_base64url_length(const njs_str_t *src, size_t *out_size);
204 
/*
 * Codec entry points that operate on an njs_value_t: each takes the VM,
 * a value and the source njs_str_t, and returns an njs_int_t result.
 */
njs_int_t njs_string_hex(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_base64(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_base64url(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_decode_utf8(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_decode_hex(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_decode_base64(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
njs_int_t njs_string_decode_base64url(njs_vm_t *vm, njs_value_t *value,
    const njs_str_t *src);
/*
 * Remaining non-inline string routines exported through this header.
 * Only the declarations are visible here; their semantics are defined by
 * the implementation file.
 */
void njs_string_truncate(njs_value_t *value, uint32_t size, uint32_t length);
void njs_string_copy(njs_value_t *dst, njs_value_t *src);
njs_int_t njs_string_validate(njs_vm_t *vm, njs_string_prop_t *string,
    njs_value_t *value);
size_t njs_string_prop(njs_string_prop_t *string, const njs_value_t *value);
njs_int_t njs_string_cmp(const njs_value_t *val1, const njs_value_t *val2);
void njs_string_slice_string_prop(njs_string_prop_t *dst,
    const njs_string_prop_t *string, const njs_slice_prop_t *slice);
njs_int_t njs_string_slice(njs_vm_t *vm, njs_value_t *dst,
    const njs_string_prop_t *string, const njs_slice_prop_t *slice);
const u_char *njs_string_offset(const u_char *start, const u_char *end,
    size_t index);
uint32_t njs_string_index(njs_string_prop_t *string, uint32_t offset);
void njs_string_offset_map_init(const u_char *start, size_t size);
double njs_string_to_index(const njs_value_t *value);
const char *njs_string_to_c_string(njs_vm_t *vm, njs_value_t *value);
njs_int_t njs_string_encode_uri(njs_vm_t *vm, njs_value_t *args,
    njs_uint_t nargs, njs_index_t component);
njs_int_t njs_string_decode_uri(njs_vm_t *vm, njs_value_t *args,
    njs_uint_t nargs, njs_index_t component);

njs_int_t njs_string_prototype_concat(njs_vm_t *vm, njs_value_t *args,
    njs_uint_t nargs, njs_index_t unused);
njs_int_t njs_string_split_part_add(njs_vm_t *vm, njs_array_t *array,
    njs_utf8_t utf8, const u_char *start, size_t size);
njs_int_t njs_string_get_substitution(njs_vm_t *vm, njs_value_t *matched,
    njs_value_t *string, int64_t pos, njs_value_t *captures, int64_t ncaptures,
    njs_value_t *groups, njs_value_t *replacement, njs_value_t *retval);
247 
248 
/*
 * Initializer objects for string instances and for the string type.
 * They are declared extern here and defined elsewhere.
 */
extern const njs_object_init_t  njs_string_instance_init;
extern const njs_object_type_init_t  njs_string_type_init;
251 
252 
253 #endif /* _NJS_STRING_H_INCLUDED_ */
254