1 /*
2 Copyright (C) 2004-2014, Parrot Foundation.
3 
4 =head1 NAME
5 
6 src/string/encoding.c - global encoding functions
7 
8 =head1 DESCRIPTION
9 
These are Parrot's generic encoding handling functions.
11 
12 =over 4
13 
14 =cut
15 
16 */
17 
18 #include "parrot/encoding.h"
19 #include "parrot/namealias.h"
20 #if PARROT_HAS_ICU
21 #  include <unicode/uchar.h>
22 #endif
23 #include "encoding.str"
24 
25 STR_VTABLE *Parrot_default_encoding_ptr  = NULL;
26 STR_VTABLE *Parrot_platform_encoding_ptr = NULL;
27 
28 static STR_VTABLE **encodings;
29 static int          n_encodings = 0;
30 static STRING      *platform_str;
31 /* for backwards compatibility */
32 static STRING      *unicode_str;
33 static STRING      *fixed_8_str;
34 
35 #define ENC_NAME_PLATFORM "platform"
36 #define ENC_NAME_UNICODE  "unicode"
37 #define ENC_NAME_FIXED8   "fixed_8"
38 
39 /* HEADERIZER HFILE: include/parrot/encoding.h */
40 
41 /* HEADERIZER BEGIN: static */
42 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
43 
44 PARROT_PURE_FUNCTION
45 PARROT_WARN_UNUSED_RESULT
46 PARROT_CAN_RETURN_NULL
47 static const STR_VTABLE * find_encoding(PARROT_INTERP,
48     ARGIN(const STRING *encodingname))
49         __attribute__nonnull__(1)
50         __attribute__nonnull__(2);
51 
52 #define ASSERT_ARGS_find_encoding __attribute__unused__ int _ASSERT_ARGS_CHECK = (\
53        PARROT_ASSERT_ARG(interp) \
54     , PARROT_ASSERT_ARG(encodingname))
55 /* Don't modify between HEADERIZER BEGIN / HEADERIZER END.  Your changes will be lost. */
56 /* HEADERIZER END: static */
57 
58 
59 /*
60 
61 =item C<void Parrot_deinit_encodings(PARROT_INTERP)>
62 
63 Deinitialize encodings and free all memory used by them.
64 
65 =cut
66 
67 */
68 
69 void
Parrot_deinit_encodings(PARROT_INTERP)70 Parrot_deinit_encodings(PARROT_INTERP)
71 {
72     ASSERT_ARGS(Parrot_deinit_encodings)
73 
74     mem_gc_free(interp, encodings);
75     encodings   = NULL;
76     n_encodings = 0;
77 }
78 
79 /*
80 
81 =item C<STR_VTABLE * Parrot_new_encoding(PARROT_INTERP)>
82 
83 Allocates the memory for a new string vtable from the system.
84 
85 =cut
86 
87 */
88 
89 PARROT_EXPORT
90 PARROT_MALLOC
91 PARROT_CANNOT_RETURN_NULL
92 STR_VTABLE *
Parrot_new_encoding(PARROT_INTERP)93 Parrot_new_encoding(PARROT_INTERP)
94 {
95     ASSERT_ARGS(Parrot_new_encoding)
96     return mem_gc_allocate_typed(interp, STR_VTABLE);
97 }
98 
99 /*
100 
101 =item C<static const STR_VTABLE * find_encoding(PARROT_INTERP, const STRING
102 *encodingname)>
103 
104 Finds an encoding with the STRING name C<encodingname>. Returns the encoding
105 if it is successfully found, returns NULL otherwise.
106 
107 =cut
108 
109 */
110 PARROT_PURE_FUNCTION
111 PARROT_WARN_UNUSED_RESULT
112 PARROT_CAN_RETURN_NULL
113 static const STR_VTABLE *
find_encoding(PARROT_INTERP,ARGIN (const STRING * encodingname))114 find_encoding(PARROT_INTERP, ARGIN(const STRING *encodingname))
115 {
116     ASSERT_ARGS(find_encoding)
117     const int n = n_encodings;
118     int i;
119 
120     for (i = 0; i < n; ++i)
121         if (STRING_equal(interp, encodings[i]->name_str, encodingname))
122             return encodings[i];
123 
124     /* backwards compatibility */
125     if (STRING_equal(interp, encodingname, unicode_str))
126         return Parrot_utf8_encoding_ptr;
127 
128     if (STRING_equal(interp, encodingname, platform_str))
129         return Parrot_platform_encoding_ptr;
130 
131     if (STRING_equal(interp, encodingname, fixed_8_str))
132         return Parrot_ascii_encoding_ptr;
133 
134     return NULL;
135 }
136 
137 /*
138 
139 =item C<const STR_VTABLE * Parrot_find_encoding(PARROT_INTERP, const char
140 *encodingname)>
141 
142 Finds an encoding with the C string name C<encodingname>. Returns the encoding
143 if it is successfully found, returns NULL otherwise.
144 
145 =cut
146 
147 */
148 
149 PARROT_EXPORT
150 PARROT_PURE_FUNCTION
151 PARROT_WARN_UNUSED_RESULT
152 PARROT_CAN_RETURN_NULL
153 const STR_VTABLE *
Parrot_find_encoding(SHIM_INTERP,ARGIN (const char * encodingname))154 Parrot_find_encoding(SHIM_INTERP, ARGIN(const char *encodingname))
155 {
156     ASSERT_ARGS(Parrot_find_encoding)
157     const int n = n_encodings;
158     int i;
159 
160     for (i = 0; i < n; ++i)
161         if (STREQ(encodings[i]->name, encodingname))
162             return encodings[i];
163 
164     /* backwards compatibility */
165     if (strcmp(encodingname, ENC_NAME_UNICODE) == 0)
166         return Parrot_utf8_encoding_ptr;
167 
168     if (strcmp(encodingname, ENC_NAME_PLATFORM) == 0)
169         return Parrot_platform_encoding_ptr;
170 
171     if (strcmp(encodingname, ENC_NAME_FIXED8) == 0)
172         return Parrot_ascii_encoding_ptr;
173 
174     return NULL;
175 }
176 
177 
178 /*
179 
180 =item C<const STR_VTABLE * Parrot_find_encoding_by_string(PARROT_INTERP, STRING
181 *encodingname)>
182 
183 Finds an encoding with the STRING name C<encodingname>. Returns the encoding
184 if it is successfully found, throws an exception otherwise. Returns the
185 default encoding for the NULL string.
186 
187 =cut
188 
189 */
190 
191 PARROT_EXPORT
192 PARROT_PURE_FUNCTION
193 PARROT_WARN_UNUSED_RESULT
194 PARROT_CAN_RETURN_NULL
195 const STR_VTABLE *
Parrot_find_encoding_by_string(PARROT_INTERP,ARGIN (STRING * encodingname))196 Parrot_find_encoding_by_string(PARROT_INTERP, ARGIN(STRING *encodingname))
197 {
198     ASSERT_ARGS(Parrot_find_encoding_by_string)
199 
200     if (STRING_IS_NULL(encodingname))
201         return Parrot_default_encoding_ptr;
202     else {
203         const STR_VTABLE * const result = find_encoding(interp, encodingname);
204         if (result)
205             return result;
206     }
207     Parrot_ex_throw_from_c_args(interp, NULL, EXCEPTION_INVALID_ENCODING,
208             "invalid encoding '%Ss'", encodingname);
209 }
210 
211 
212 /*
213 
214 =item C<const STR_VTABLE * Parrot_load_encoding(PARROT_INTERP, const char
215 *encodingname)>
216 
217 Loads an encoding. Currently throws an exception because we cannot load
218 encodings. See https://trac.parrot.org/parrot/wiki/StringsTasklist.
219 
220 =cut
221 
222 */
223 
224 /* Yep, this needs to be a char * parameter -- it's tough to load in
225    encodings and such for strings if we can't be sure we've got enough
226    info set up to actually build strings...
227 
228     Also remember to use PARROT_WARN_UNUSED_RESULT and
229     PARROT_CANNOT_RETURN_NULL when this actually works.
230  */
231 
232 PARROT_EXPORT
233 PARROT_DOES_NOT_RETURN
234 PARROT_CANNOT_RETURN_NULL
235 const STR_VTABLE *
Parrot_load_encoding(PARROT_INTERP,SHIM (const char * encodingname))236 Parrot_load_encoding(PARROT_INTERP, SHIM(const char *encodingname))
237 {
238     ASSERT_ARGS(Parrot_load_encoding)
239     Parrot_ex_throw_from_c_noargs(interp, EXCEPTION_UNIMPLEMENTED,
240         "Can't load encodings yet");
241 }
242 
243 /*
244 
245 =item C<INTVAL Parrot_encoding_number(PARROT_INTERP, const STRING
246 *encodingname)>
247 
248 Return the number of the encoding or -1 if not found.
249 
250 =cut
251 
252 */
253 
254 PARROT_EXPORT
255 PARROT_PURE_FUNCTION
256 PARROT_WARN_UNUSED_RESULT
257 INTVAL
Parrot_encoding_number(PARROT_INTERP,ARGIN (const STRING * encodingname))258 Parrot_encoding_number(PARROT_INTERP, ARGIN(const STRING *encodingname))
259 {
260     ASSERT_ARGS(Parrot_encoding_number)
261     const STR_VTABLE * const result = find_encoding(interp, encodingname);
262     return result ? result->num : -1;
263 }
264 
265 /*
266 
267 =item C<INTVAL Parrot_encoding_number_of_str(PARROT_INTERP, const STRING *src)>
268 
269 Return the number of the encoding of the given string or -1 if not found.
270 
271 This could be converted to a macro.
272 
273 =cut
274 
275 */
276 
277 PARROT_EXPORT
278 PARROT_PURE_FUNCTION
279 PARROT_WARN_UNUSED_RESULT
280 INTVAL
Parrot_encoding_number_of_str(SHIM_INTERP,ARGIN (const STRING * src))281 Parrot_encoding_number_of_str(SHIM_INTERP, ARGIN(const STRING *src))
282 {
283     ASSERT_ARGS(Parrot_encoding_number_of_str)
284 
285     return src->encoding->num;
286 }
287 
288 /*
289 
290 =item C<STRING* Parrot_encoding_name(PARROT_INTERP, INTVAL number_of_encoding)>
291 
292 Returns the name of a character encoding based on the INTVAL index
293 C<number_of_encoding> to the All_encodings array.
294 
295 This could be converted to a macro.
296 
297 =cut
298 
299 */
300 
301 PARROT_EXPORT
302 PARROT_PURE_FUNCTION
303 PARROT_WARN_UNUSED_RESULT
304 PARROT_CAN_RETURN_NULL
305 STRING*
Parrot_encoding_name(SHIM_INTERP,INTVAL number_of_encoding)306 Parrot_encoding_name(SHIM_INTERP, INTVAL number_of_encoding)
307 {
308     ASSERT_ARGS(Parrot_encoding_name)
309     if (number_of_encoding >= n_encodings ||
310         number_of_encoding < 0)
311         return NULL;
312     return encodings[number_of_encoding]->name_str;
313 }
314 
315 /*
316 
317 =item C<const STR_VTABLE* Parrot_get_encoding(PARROT_INTERP, INTVAL
318 number_of_encoding)>
319 
320 Returns the encoding given by the INTVAL index C<number_of_encoding>.
321 
322 =cut
323 
324 */
325 
326 PARROT_EXPORT
327 PARROT_PURE_FUNCTION
328 PARROT_WARN_UNUSED_RESULT
329 PARROT_CAN_RETURN_NULL
330 const STR_VTABLE*
Parrot_get_encoding(SHIM_INTERP,INTVAL number_of_encoding)331 Parrot_get_encoding(SHIM_INTERP, INTVAL number_of_encoding)
332 {
333     ASSERT_ARGS(Parrot_get_encoding)
334     if (number_of_encoding >= n_encodings ||
335         number_of_encoding < 0)
336         return NULL;
337     return encodings[number_of_encoding];
338 }
339 
340 /*
341 
342 =item C<const char * Parrot_encoding_c_name(PARROT_INTERP, INTVAL
343 number_of_encoding)>
344 
345 Returns the NULL-terminated C string representation of the encodings name
346 given by the C<number_of_encoding>.
347 
348 =cut
349 
350 */
351 
352 PARROT_EXPORT
353 PARROT_PURE_FUNCTION
354 PARROT_WARN_UNUSED_RESULT
355 PARROT_CAN_RETURN_NULL
356 const char *
Parrot_encoding_c_name(SHIM_INTERP,INTVAL number_of_encoding)357 Parrot_encoding_c_name(SHIM_INTERP, INTVAL number_of_encoding)
358 {
359     ASSERT_ARGS(Parrot_encoding_c_name)
360     if (number_of_encoding >= n_encodings ||
361         number_of_encoding < 0)
362         return NULL;
363     return encodings[number_of_encoding]->name;
364 }
365 
366 /*
367 
368 =item C<void Parrot_str_internal_register_encoding_names(PARROT_INTERP)>
369 
370 Helper function for initializing characterset encoding names. We can't create
371 the STRING names until the default encodings are already initted,
372 so the name generation is split into a second init stage.
373 
374 =cut
375 
376 */
377 
378 
379 void
Parrot_str_internal_register_encoding_names(PARROT_INTERP)380 Parrot_str_internal_register_encoding_names(PARROT_INTERP)
381 {
382     ASSERT_ARGS(Parrot_str_internal_register_encoding_names)
383     int n;
384     for (n = 0; n < n_encodings; ++n)
385         encodings[n]->name_str =
386             Parrot_str_new_constant(interp, encodings[n]->name);
387     /* Can't use CONST_STRING here, not setup yet */
388     unicode_str  = Parrot_str_new_constant(interp, ENC_NAME_UNICODE);
389     fixed_8_str  = Parrot_str_new_constant(interp, ENC_NAME_FIXED8);
390     platform_str = Parrot_str_new_constant(interp, ENC_NAME_PLATFORM);
391 }
392 
393 /*
394 
395 =item C<INTVAL Parrot_register_encoding(PARROT_INTERP, STR_VTABLE *encoding)>
396 
397 Registers a character encoding C<encoding> with name C<encodingname>.
398 Only allows one of 5 possibilities: fixed_8, utf8, utf16, ucs2 and ucs4.
399 
400 =cut
401 
402 */
403 
404 PARROT_EXPORT
405 INTVAL
Parrot_register_encoding(PARROT_INTERP,ARGIN (STR_VTABLE * encoding))406 Parrot_register_encoding(PARROT_INTERP, ARGIN(STR_VTABLE *encoding))
407 {
408     ASSERT_ARGS(Parrot_register_encoding)
409     int i;
410     int n = n_encodings;
411 
412     for (i = 0; i < n_encodings; ++i) {
413         if (STREQ(encodings[i]->name, encoding->name))
414             return 0;
415     }
416 
417     if (!n)
418         encodings = mem_gc_allocate_zeroed_typed(interp, STR_VTABLE *);
419     else
420         encodings = mem_gc_realloc_n_typed_zeroed(interp,
421                 encodings, n + 1, n, STR_VTABLE *);
422 
423     encoding->num = n;
424     encodings[n]  = encoding;
425     ++n_encodings;
426 
427     return 1;
428 }
429 
430 /*
431 
432 =item C<void Parrot_encodings_init(PARROT_INTERP)>
433 
434 Creates the initial encodings.
435 
436 =cut
437 
438 */
439 
440 PARROT_EXPORT
441 void
Parrot_encodings_init(PARROT_INTERP)442 Parrot_encodings_init(PARROT_INTERP)
443 {
444     ASSERT_ARGS(Parrot_encodings_init)
445 
446     Parrot_register_encoding(interp, Parrot_ascii_encoding_ptr);
447     Parrot_register_encoding(interp, Parrot_latin1_encoding_ptr);
448     Parrot_register_encoding(interp, Parrot_binary_encoding_ptr);
449     Parrot_register_encoding(interp, Parrot_utf8_encoding_ptr);
450     Parrot_register_encoding(interp, Parrot_utf16_encoding_ptr);
451     Parrot_register_encoding(interp, Parrot_ucs2_encoding_ptr);
452     Parrot_register_encoding(interp, Parrot_ucs4_encoding_ptr);
453 
454     Parrot_default_encoding_ptr = Parrot_ascii_encoding_ptr;
455     Parrot_init_platform_encoding(interp);
456 
457     /* Now that the plugins are registered, we can create STRING
458      * names for them.  */
459     Parrot_str_internal_register_encoding_names(interp);
460 }
461 
462 /*
463 
464 =item C<INTVAL Parrot_make_default_encoding(PARROT_INTERP, const char
465 *encodingname, STR_VTABLE *encoding)>
466 
467 Sets the default encoding to C<encoding> with name C<encodingname>.
468 
469 =cut
470 
471 */
472 
473 PARROT_EXPORT
474 INTVAL
Parrot_make_default_encoding(SHIM_INTERP,ARGIN (SHIM (const char * encodingname)),ARGIN (STR_VTABLE * encoding))475 Parrot_make_default_encoding(SHIM_INTERP, ARGIN(SHIM(const char *encodingname)),
476         ARGIN(STR_VTABLE *encoding))
477 {
478     ASSERT_ARGS(Parrot_make_default_encoding)
479     Parrot_default_encoding_ptr = encoding;
480     return 1;
481 }
482 
483 /*
484 
485 =item C<const STR_VTABLE * Parrot_default_encoding(PARROT_INTERP)>
486 
487 Gets the default encoding.
488 
489 =cut
490 
491 */
492 
493 PARROT_EXPORT
494 PARROT_PURE_FUNCTION
495 PARROT_WARN_UNUSED_RESULT
496 PARROT_CANNOT_RETURN_NULL
497 const STR_VTABLE *
Parrot_default_encoding(SHIM_INTERP)498 Parrot_default_encoding(SHIM_INTERP)
499 {
500     ASSERT_ARGS(Parrot_default_encoding)
501     return Parrot_default_encoding_ptr;
502 }
503 
504 /*
505 
506 =item C<INTVAL Parrot_str_internal_find_codepoint(PARROT_INTERP, const STRING
507 *name)>
508 
509 Helper function for string.ops in the ICU and non-ICU variant.
510 
511 At first search for ICU names.
512 This will not find name aliases for control characters starting with ICU 5.2.
513 U_CHAR_NAME_ALIAS started with ICU 4.4,
514 U_UNICODE_10_CHAR_NAME (the "old name" like "LINE FEED") was deprecated with ICU 4.9,
515 but U_CHAR_NAME_CHOICE_COUNT is stable since 2.0.
516 
517 =cut
518 
519 */
520 
521 PARROT_PURE_FUNCTION
522 PARROT_WARN_UNUSED_RESULT
523 INTVAL
Parrot_str_internal_find_codepoint(PARROT_INTERP,ARGIN (const STRING * name))524 Parrot_str_internal_find_codepoint(PARROT_INTERP, ARGIN(const STRING *name))
525 {
526     ASSERT_ARGS(Parrot_str_internal_find_codepoint)
527     INTVAL retval = -1;
528     char * const cstr      = Parrot_str_to_cstring(interp, name);
529 #if PARROT_HAS_ICU
530     UErrorCode   err       = U_ZERO_ERROR;
531     unsigned int i = 0;
532     for (; i < U_CHAR_NAME_CHOICE_COUNT; i++) {
533         UChar32 codepoint = u_charFromName((UCharNameChoice)i, cstr, &err);
534         if (U_SUCCESS(err)) {
535             retval = (INTVAL) codepoint;
536             goto found;
537         }
538     }
539 #endif
540     {
541         const struct Parrot_namealias *namealias
542             = Parrot_namealias_lookup(cstr, STRING_byte_length(name));
543         if (namealias)
544             retval = (INTVAL) namealias->codepoint;
545     }
546   found:
547     Parrot_str_free_cstring(cstr);
548     return retval;
549 }
550 
551 /*
552 
553 =back
554 
555 */
556 
557 /*
558  * Local variables:
559  *   c-file-style: "parrot"
560  * End:
561  * vim: expandtab shiftwidth=4 cinoptions='\:2=2' :
562  */
563