1 /*
2 Copyright (C) 2004-2014, Parrot Foundation.
3
4 =head1 NAME
5
6 src/string/encoding.c - global encoding functions
7
8 =head1 DESCRIPTION
9
10 These are parrot's generic encoding handling functions
11
12 =over 4
13
14 =cut
15
16 */
17
18 #include "parrot/encoding.h"
19 #include "parrot/namealias.h"
20 #if PARROT_HAS_ICU
21 # include <unicode/uchar.h>
22 #endif
23 #include "encoding.str"
24
25 STR_VTABLE *Parrot_default_encoding_ptr = NULL;
26 STR_VTABLE *Parrot_platform_encoding_ptr = NULL;
27
28 static STR_VTABLE **encodings;
29 static int n_encodings = 0;
30 static STRING *platform_str;
31 /* for backwards compatibility */
32 static STRING *unicode_str;
33 static STRING *fixed_8_str;
34
35 #define ENC_NAME_PLATFORM "platform"
36 #define ENC_NAME_UNICODE "unicode"
37 #define ENC_NAME_FIXED8 "fixed_8"
38
39 /* HEADERIZER HFILE: include/parrot/encoding.h */
40
41 /* HEADERIZER BEGIN: static */
42 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
43
44 PARROT_PURE_FUNCTION
45 PARROT_WARN_UNUSED_RESULT
46 PARROT_CAN_RETURN_NULL
47 static const STR_VTABLE * find_encoding(PARROT_INTERP,
48 ARGIN(const STRING *encodingname))
49 __attribute__nonnull__(1)
50 __attribute__nonnull__(2);
51
52 #define ASSERT_ARGS_find_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
53 PARROT_ASSERT_ARG(interp) \
54 , PARROT_ASSERT_ARG(encodingname))
55 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END. Your changes will be lost. */
56 /* HEADERIZER END: static */
57
58
59 /*
60
61 =item C<void Parrot_deinit_encodings(PARROT_INTERP)>
62
63 Deinitialize encodings and free all memory used by them.
64
65 =cut
66
67 */
68
69 void
Parrot_deinit_encodings(PARROT_INTERP)70 Parrot_deinit_encodings(PARROT_INTERP)
71 {
72 ASSERT_ARGS(Parrot_deinit_encodings)
73
74 mem_gc_free(interp, encodings);
75 encodings = NULL;
76 n_encodings = 0;
77 }
78
79 /*
80
81 =item C<STR_VTABLE * Parrot_new_encoding(PARROT_INTERP)>
82
83 Allocates the memory for a new string vtable from the system.
84
85 =cut
86
87 */
88
89 PARROT_EXPORT
90 PARROT_MALLOC
91 PARROT_CANNOT_RETURN_NULL
92 STR_VTABLE *
Parrot_new_encoding(PARROT_INTERP)93 Parrot_new_encoding(PARROT_INTERP)
94 {
95 ASSERT_ARGS(Parrot_new_encoding)
96 return mem_gc_allocate_typed(interp, STR_VTABLE);
97 }
98
99 /*
100
101 =item C<static const STR_VTABLE * find_encoding(PARROT_INTERP, const STRING
102 *encodingname)>
103
104 Finds an encoding with the STRING name C<encodingname>. Returns the encoding
105 if it is successfully found, returns NULL otherwise.
106
107 =cut
108
109 */
110 PARROT_PURE_FUNCTION
111 PARROT_WARN_UNUSED_RESULT
112 PARROT_CAN_RETURN_NULL
113 static const STR_VTABLE *
find_encoding(PARROT_INTERP,ARGIN (const STRING * encodingname))114 find_encoding(PARROT_INTERP, ARGIN(const STRING *encodingname))
115 {
116 ASSERT_ARGS(find_encoding)
117 const int n = n_encodings;
118 int i;
119
120 for (i = 0; i < n; ++i)
121 if (STRING_equal(interp, encodings[i]->name_str, encodingname))
122 return encodings[i];
123
124 /* backwards compatibility */
125 if (STRING_equal(interp, encodingname, unicode_str))
126 return Parrot_utf8_encoding_ptr;
127
128 if (STRING_equal(interp, encodingname, platform_str))
129 return Parrot_platform_encoding_ptr;
130
131 if (STRING_equal(interp, encodingname, fixed_8_str))
132 return Parrot_ascii_encoding_ptr;
133
134 return NULL;
135 }
136
137 /*
138
139 =item C<const STR_VTABLE * Parrot_find_encoding(PARROT_INTERP, const char
140 *encodingname)>
141
142 Finds an encoding with the C string name C<encodingname>. Returns the encoding
143 if it is successfully found, returns NULL otherwise.
144
145 =cut
146
147 */
148
149 PARROT_EXPORT
150 PARROT_PURE_FUNCTION
151 PARROT_WARN_UNUSED_RESULT
152 PARROT_CAN_RETURN_NULL
153 const STR_VTABLE *
Parrot_find_encoding(SHIM_INTERP,ARGIN (const char * encodingname))154 Parrot_find_encoding(SHIM_INTERP, ARGIN(const char *encodingname))
155 {
156 ASSERT_ARGS(Parrot_find_encoding)
157 const int n = n_encodings;
158 int i;
159
160 for (i = 0; i < n; ++i)
161 if (STREQ(encodings[i]->name, encodingname))
162 return encodings[i];
163
164 /* backwards compatibility */
165 if (strcmp(encodingname, ENC_NAME_UNICODE) == 0)
166 return Parrot_utf8_encoding_ptr;
167
168 if (strcmp(encodingname, ENC_NAME_PLATFORM) == 0)
169 return Parrot_platform_encoding_ptr;
170
171 if (strcmp(encodingname, ENC_NAME_FIXED8) == 0)
172 return Parrot_ascii_encoding_ptr;
173
174 return NULL;
175 }
176
177
178 /*
179
180 =item C<const STR_VTABLE * Parrot_find_encoding_by_string(PARROT_INTERP, STRING
181 *encodingname)>
182
183 Finds an encoding with the STRING name C<encodingname>. Returns the encoding
184 if it is successfully found, throws an exception otherwise. Returns the
185 default encoding for the NULL string.
186
187 =cut
188
189 */
190
191 PARROT_EXPORT
192 PARROT_PURE_FUNCTION
193 PARROT_WARN_UNUSED_RESULT
194 PARROT_CAN_RETURN_NULL
195 const STR_VTABLE *
Parrot_find_encoding_by_string(PARROT_INTERP,ARGIN (STRING * encodingname))196 Parrot_find_encoding_by_string(PARROT_INTERP, ARGIN(STRING *encodingname))
197 {
198 ASSERT_ARGS(Parrot_find_encoding_by_string)
199
200 if (STRING_IS_NULL(encodingname))
201 return Parrot_default_encoding_ptr;
202 else {
203 const STR_VTABLE * const result = find_encoding(interp, encodingname);
204 if (result)
205 return result;
206 }
207 Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
208 "invalid encoding '%Ss'", encodingname);
209 }
210
211
212 /*
213
214 =item C<const STR_VTABLE * Parrot_load_encoding(PARROT_INTERP, const char
215 *encodingname)>
216
217 Loads an encoding. Currently throws an exception because we cannot load
218 encodings. See https://trac.parrot.org/parrot/wiki/StringsTasklist.
219
220 =cut
221
222 */
223
224 /* Yep, this needs to be a char * parameter -- it's tough to load in
225 encodings and such for strings if we can't be sure we've got enough
226 info set up to actually build strings...
227
228 Also remember to use PARROT_WARN_UNUSED_RESULT and
229 PARROT_CANNOT_RETURN_NULL when this actually works.
230 */
231
232 PARROT_EXPORT
233 PARROT_DOES_NOT_RETURN
234 PARROT_CANNOT_RETURN_NULL
235 const STR_VTABLE *
Parrot_load_encoding(PARROT_INTERP,SHIM (const char * encodingname))236 Parrot_load_encoding(PARROT_INTERP, SHIM(const char *encodingname))
237 {
238 ASSERT_ARGS(Parrot_load_encoding)
239 Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_UNIMPLEMENTED,
240 "Can't load encodings yet");
241 }
242
243 /*
244
245 =item C<INTVAL Parrot_encoding_number(PARROT_INTERP, const STRING
246 *encodingname)>
247
248 Return the number of the encoding or -1 if not found.
249
250 =cut
251
252 */
253
254 PARROT_EXPORT
255 PARROT_PURE_FUNCTION
256 PARROT_WARN_UNUSED_RESULT
257 INTVAL
Parrot_encoding_number(PARROT_INTERP,ARGIN (const STRING * encodingname))258 Parrot_encoding_number(PARROT_INTERP, ARGIN(const STRING *encodingname))
259 {
260 ASSERT_ARGS(Parrot_encoding_number)
261 const STR_VTABLE * const result = find_encoding(interp, encodingname);
262 return result ? result->num : -1;
263 }
264
265 /*
266
267 =item C<INTVAL Parrot_encoding_number_of_str(PARROT_INTERP, const STRING *src)>
268
269 Return the number of the encoding of the given string or -1 if not found.
270
271 This could be converted to a macro.
272
273 =cut
274
275 */
276
277 PARROT_EXPORT
278 PARROT_PURE_FUNCTION
279 PARROT_WARN_UNUSED_RESULT
280 INTVAL
Parrot_encoding_number_of_str(SHIM_INTERP,ARGIN (const STRING * src))281 Parrot_encoding_number_of_str(SHIM_INTERP, ARGIN(const STRING *src))
282 {
283 ASSERT_ARGS(Parrot_encoding_number_of_str)
284
285 return src->encoding->num;
286 }
287
288 /*
289
290 =item C<STRING* Parrot_encoding_name(PARROT_INTERP, INTVAL number_of_encoding)>
291
292 Returns the name of a character encoding based on the INTVAL index
293 C<number_of_encoding> to the All_encodings array.
294
295 This could be converted to a macro.
296
297 =cut
298
299 */
300
301 PARROT_EXPORT
302 PARROT_PURE_FUNCTION
303 PARROT_WARN_UNUSED_RESULT
304 PARROT_CAN_RETURN_NULL
305 STRING*
Parrot_encoding_name(SHIM_INTERP,INTVAL number_of_encoding)306 Parrot_encoding_name(SHIM_INTERP, INTVAL number_of_encoding)
307 {
308 ASSERT_ARGS(Parrot_encoding_name)
309 if (number_of_encoding >= n_encodings ||
310 number_of_encoding < 0)
311 return NULL;
312 return encodings[number_of_encoding]->name_str;
313 }
314
315 /*
316
317 =item C<const STR_VTABLE* Parrot_get_encoding(PARROT_INTERP, INTVAL
318 number_of_encoding)>
319
320 Returns the encoding given by the INTVAL index C<number_of_encoding>.
321
322 =cut
323
324 */
325
326 PARROT_EXPORT
327 PARROT_PURE_FUNCTION
328 PARROT_WARN_UNUSED_RESULT
329 PARROT_CAN_RETURN_NULL
330 const STR_VTABLE*
Parrot_get_encoding(SHIM_INTERP,INTVAL number_of_encoding)331 Parrot_get_encoding(SHIM_INTERP, INTVAL number_of_encoding)
332 {
333 ASSERT_ARGS(Parrot_get_encoding)
334 if (number_of_encoding >= n_encodings ||
335 number_of_encoding < 0)
336 return NULL;
337 return encodings[number_of_encoding];
338 }
339
340 /*
341
342 =item C<const char * Parrot_encoding_c_name(PARROT_INTERP, INTVAL
343 number_of_encoding)>
344
345 Returns the NULL-terminated C string representation of the encodings name
346 given by the C<number_of_encoding>.
347
348 =cut
349
350 */
351
352 PARROT_EXPORT
353 PARROT_PURE_FUNCTION
354 PARROT_WARN_UNUSED_RESULT
355 PARROT_CAN_RETURN_NULL
356 const char *
Parrot_encoding_c_name(SHIM_INTERP,INTVAL number_of_encoding)357 Parrot_encoding_c_name(SHIM_INTERP, INTVAL number_of_encoding)
358 {
359 ASSERT_ARGS(Parrot_encoding_c_name)
360 if (number_of_encoding >= n_encodings ||
361 number_of_encoding < 0)
362 return NULL;
363 return encodings[number_of_encoding]->name;
364 }
365
366 /*
367
368 =item C<void Parrot_str_internal_register_encoding_names(PARROT_INTERP)>
369
370 Helper function for initializing characterset encoding names. We can't create
371 the STRING names until the default encodings are already initted,
372 so the name generation is split into a second init stage.
373
374 =cut
375
376 */
377
378
379 void
Parrot_str_internal_register_encoding_names(PARROT_INTERP)380 Parrot_str_internal_register_encoding_names(PARROT_INTERP)
381 {
382 ASSERT_ARGS(Parrot_str_internal_register_encoding_names)
383 int n;
384 for (n = 0; n < n_encodings; ++n)
385 encodings[n]->name_str =
386 Parrot_str_new_constant(interp, encodings[n]->name);
387 /* Can't use CONST_STRING here, not setup yet */
388 unicode_str = Parrot_str_new_constant(interp, ENC_NAME_UNICODE);
389 fixed_8_str = Parrot_str_new_constant(interp, ENC_NAME_FIXED8);
390 platform_str = Parrot_str_new_constant(interp, ENC_NAME_PLATFORM);
391 }
392
393 /*
394
395 =item C<INTVAL Parrot_register_encoding(PARROT_INTERP, STR_VTABLE *encoding)>
396
397 Registers a character encoding C<encoding> with name C<encodingname>.
398 Only allows one of 5 possibilities: fixed_8, utf8, utf16, ucs2 and ucs4.
399
400 =cut
401
402 */
403
404 PARROT_EXPORT
405 INTVAL
Parrot_register_encoding(PARROT_INTERP,ARGIN (STR_VTABLE * encoding))406 Parrot_register_encoding(PARROT_INTERP, ARGIN(STR_VTABLE *encoding))
407 {
408 ASSERT_ARGS(Parrot_register_encoding)
409 int i;
410 int n = n_encodings;
411
412 for (i = 0; i < n_encodings; ++i) {
413 if (STREQ(encodings[i]->name, encoding->name))
414 return 0;
415 }
416
417 if (!n)
418 encodings = mem_gc_allocate_zeroed_typed(interp, STR_VTABLE *);
419 else
420 encodings = mem_gc_realloc_n_typed_zeroed(interp,
421 encodings, n + 1, n, STR_VTABLE *);
422
423 encoding->num = n;
424 encodings[n] = encoding;
425 ++n_encodings;
426
427 return 1;
428 }
429
430 /*
431
432 =item C<void Parrot_encodings_init(PARROT_INTERP)>
433
434 Creates the initial encodings.
435
436 =cut
437
438 */
439
440 PARROT_EXPORT
441 void
Parrot_encodings_init(PARROT_INTERP)442 Parrot_encodings_init(PARROT_INTERP)
443 {
444 ASSERT_ARGS(Parrot_encodings_init)
445
446 Parrot_register_encoding(interp, Parrot_ascii_encoding_ptr);
447 Parrot_register_encoding(interp, Parrot_latin1_encoding_ptr);
448 Parrot_register_encoding(interp, Parrot_binary_encoding_ptr);
449 Parrot_register_encoding(interp, Parrot_utf8_encoding_ptr);
450 Parrot_register_encoding(interp, Parrot_utf16_encoding_ptr);
451 Parrot_register_encoding(interp, Parrot_ucs2_encoding_ptr);
452 Parrot_register_encoding(interp, Parrot_ucs4_encoding_ptr);
453
454 Parrot_default_encoding_ptr = Parrot_ascii_encoding_ptr;
455 Parrot_init_platform_encoding(interp);
456
457 /* Now that the plugins are registered, we can create STRING
458 * names for them. */
459 Parrot_str_internal_register_encoding_names(interp);
460 }
461
462 /*
463
464 =item C<INTVAL Parrot_make_default_encoding(PARROT_INTERP, const char
465 *encodingname, STR_VTABLE *encoding)>
466
467 Sets the default encoding to C<encoding> with name C<encodingname>.
468
469 =cut
470
471 */
472
473 PARROT_EXPORT
474 INTVAL
Parrot_make_default_encoding(SHIM_INTERP,ARGIN (SHIM (const char * encodingname)),ARGIN (STR_VTABLE * encoding))475 Parrot_make_default_encoding(SHIM_INTERP, ARGIN(SHIM(const char *encodingname)),
476 ARGIN(STR_VTABLE *encoding))
477 {
478 ASSERT_ARGS(Parrot_make_default_encoding)
479 Parrot_default_encoding_ptr = encoding;
480 return 1;
481 }
482
483 /*
484
485 =item C<const STR_VTABLE * Parrot_default_encoding(PARROT_INTERP)>
486
487 Gets the default encoding.
488
489 =cut
490
491 */
492
493 PARROT_EXPORT
494 PARROT_PURE_FUNCTION
495 PARROT_WARN_UNUSED_RESULT
496 PARROT_CANNOT_RETURN_NULL
497 const STR_VTABLE *
Parrot_default_encoding(SHIM_INTERP)498 Parrot_default_encoding(SHIM_INTERP)
499 {
500 ASSERT_ARGS(Parrot_default_encoding)
501 return Parrot_default_encoding_ptr;
502 }
503
504 /*
505
506 =item C<INTVAL Parrot_str_internal_find_codepoint(PARROT_INTERP, const STRING
507 *name)>
508
509 Helper function for string.ops in the ICU and non-ICU variant.
510
511 At first search for ICU names.
512 This will not find name aliases for control characters starting with ICU 5.2.
513 U_CHAR_NAME_ALIAS started with ICU 4.4,
514 U_UNICODE_10_CHAR_NAME (the "old name" like "LINE FEED") was deprecated with ICU 4.9,
515 but U_CHAR_NAME_CHOICE_COUNT is stable since 2.0.
516
517 =cut
518
519 */
520
521 PARROT_PURE_FUNCTION
522 PARROT_WARN_UNUSED_RESULT
523 INTVAL
Parrot_str_internal_find_codepoint(PARROT_INTERP,ARGIN (const STRING * name))524 Parrot_str_internal_find_codepoint(PARROT_INTERP, ARGIN(const STRING *name))
525 {
526 ASSERT_ARGS(Parrot_str_internal_find_codepoint)
527 INTVAL retval = -1;
528 char * const cstr = Parrot_str_to_cstring(interp, name);
529 #if PARROT_HAS_ICU
530 UErrorCode err = U_ZERO_ERROR;
531 unsigned int i = 0;
532 for (; i < U_CHAR_NAME_CHOICE_COUNT; i++) {
533 UChar32 codepoint = u_charFromName((UCharNameChoice)i, cstr, &err);
534 if (U_SUCCESS(err)) {
535 retval = (INTVAL) codepoint;
536 goto found;
537 }
538 }
539 #endif
540 {
541 const struct Parrot_namealias *namealias
542 = Parrot_namealias_lookup(cstr, STRING_byte_length(name));
543 if (namealias)
544 retval = (INTVAL) namealias->codepoint;
545 }
546 found:
547 Parrot_str_free_cstring(cstr);
548 return retval;
549 }
550
551 /*
552
553 =back
554
555 */
556
557 /*
558 * Local variables:
559 * c-file-style: "parrot"
560 * End:
561 * vim: expandtab shiftwidth=4 cinoptions='\:2=2' :
562 */
563