1 #include "schpriv.h"
2 #include "racket_version.h"
3 #include "schrktio.h"
4 #include <string.h>
5 #include <ctype.h>
6 #ifdef NO_ERRNO_GLOBAL
7 # define errno -1
8 #else
9 # include <errno.h>
10 #endif
11
12 #ifndef SCHEME_PLATFORM_LIBRARY_SUBPATH
13 # include "schsys.h"
14 #endif
15
16 #ifndef SPLS_SUFFIX
17 # define SPLS_SUFFIX ""
18 #endif
19
20 #include "schustr.inc"
21
22 #ifdef MACOS_UNICODE_SUPPORT
23 # define mzLOCALE_IS_UTF_8(s) (!s || !(*s))
24 #endif
25 #ifdef WINDOWS_UNICODE_SUPPORT
26 # define mzLOCALE_IS_UTF_8(s) (!s || !(*s))
27 #endif
28 #ifndef mzLOCALE_IS_UTF_8
29 # define mzLOCALE_IS_UTF_8(s) (!(rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED))
30 #endif
31
32 #ifdef WINDOWS_UNICODE_SUPPORT
33 # define WIN_UTF16_AS_WTF16(utf16) utf16
34 #else
35 # define WIN_UTF16_AS_WTF16(utf16) 0
36 #endif
37
38 #define mzICONV_KIND 0
39 #define mzUTF8_KIND 1
40 #define mzUTF8_TO_UTF16_KIND 2
41 #define mzUTF16_TO_UTF8_KIND 3
42
43 typedef struct Scheme_Converter {
44 Scheme_Object so;
45 short closed;
46 short kind;
47 rktio_converter_t *cd;
48 int permissive, wtf;
49 Scheme_Custodian_Reference *mref;
50 } Scheme_Converter;
51
52 Scheme_Object *scheme_system_type_proc;
53
54 static Scheme_Object *make_string (int argc, Scheme_Object *argv[]);
55 static Scheme_Object *string (int argc, Scheme_Object *argv[]);
56 static Scheme_Object *string_p (int argc, Scheme_Object *argv[]);
57 static Scheme_Object *string_length (int argc, Scheme_Object *argv[]);
58 static Scheme_Object *string_eq (int argc, Scheme_Object *argv[]);
59 static Scheme_Object *string_locale_eq (int argc, Scheme_Object *argv[]);
60 static Scheme_Object *string_ci_eq (int argc, Scheme_Object *argv[]);
61 static Scheme_Object *string_locale_ci_eq (int argc, Scheme_Object *argv[]);
62 static Scheme_Object *string_lt (int argc, Scheme_Object *argv[]);
63 static Scheme_Object *string_locale_lt (int argc, Scheme_Object *argv[]);
64 static Scheme_Object *string_gt (int argc, Scheme_Object *argv[]);
65 static Scheme_Object *string_locale_gt (int argc, Scheme_Object *argv[]);
66 static Scheme_Object *string_lt_eq (int argc, Scheme_Object *argv[]);
67 static Scheme_Object *string_gt_eq (int argc, Scheme_Object *argv[]);
68 static Scheme_Object *string_ci_lt (int argc, Scheme_Object *argv[]);
69 static Scheme_Object *string_locale_ci_lt (int argc, Scheme_Object *argv[]);
70 static Scheme_Object *string_ci_gt (int argc, Scheme_Object *argv[]);
71 static Scheme_Object *string_locale_ci_gt (int argc, Scheme_Object *argv[]);
72 static Scheme_Object *string_ci_lt_eq (int argc, Scheme_Object *argv[]);
73 static Scheme_Object *string_ci_gt_eq (int argc, Scheme_Object *argv[]);
74 static Scheme_Object *string_upcase (int argc, Scheme_Object *argv[]);
75 static Scheme_Object *string_downcase (int argc, Scheme_Object *argv[]);
76 static Scheme_Object *string_titlecase (int argc, Scheme_Object *argv[]);
77 static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[]);
78 static Scheme_Object *string_locale_upcase (int argc, Scheme_Object *argv[]);
79 static Scheme_Object *string_locale_downcase (int argc, Scheme_Object *argv[]);
80 static Scheme_Object *substring (int argc, Scheme_Object *argv[]);
81 static Scheme_Object *string_append (int argc, Scheme_Object *argv[]);
82 static Scheme_Object *string_append_immutable (int argc, Scheme_Object *argv[]);
83 static Scheme_Object *string_to_list (int argc, Scheme_Object *argv[]);
84 static Scheme_Object *list_to_string (int argc, Scheme_Object *argv[]);
85 static Scheme_Object *string_copy (int argc, Scheme_Object *argv[]);
86 static Scheme_Object *string_copy_bang (int argc, Scheme_Object *argv[]);
87 static Scheme_Object *string_fill (int argc, Scheme_Object *argv[]);
88 static Scheme_Object *string_to_immutable (int argc, Scheme_Object *argv[]);
89 static Scheme_Object *string_normalize_c (int argc, Scheme_Object *argv[]);
90 static Scheme_Object *string_normalize_kc (int argc, Scheme_Object *argv[]);
91 static Scheme_Object *string_normalize_d (int argc, Scheme_Object *argv[]);
92 static Scheme_Object *string_normalize_kd (int argc, Scheme_Object *argv[]);
93
94 static Scheme_Object *make_shared_byte_string (int argc, Scheme_Object *argv[]);
95 static Scheme_Object *shared_byte_string (int argc, Scheme_Object *argv[]);
96
97 static Scheme_Object *make_byte_string (int argc, Scheme_Object *argv[]);
98 static Scheme_Object *byte_string (int argc, Scheme_Object *argv[]);
99 static Scheme_Object *byte_p (int argc, Scheme_Object *argv[]);
100 static Scheme_Object *byte_string_p (int argc, Scheme_Object *argv[]);
101 static Scheme_Object *byte_string_length (int argc, Scheme_Object *argv[]);
102 static Scheme_Object *byte_string_eq (int argc, Scheme_Object *argv[]);
103 static Scheme_Object *byte_string_lt (int argc, Scheme_Object *argv[]);
104 static Scheme_Object *byte_string_gt (int argc, Scheme_Object *argv[]);
105 static Scheme_Object *byte_substring (int argc, Scheme_Object *argv[]);
106 static Scheme_Object *byte_string_append (int argc, Scheme_Object *argv[]);
107 static Scheme_Object *byte_string_to_list (int argc, Scheme_Object *argv[]);
108 static Scheme_Object *list_to_byte_string (int argc, Scheme_Object *argv[]);
109 static Scheme_Object *byte_string_copy (int argc, Scheme_Object *argv[]);
110 static Scheme_Object *byte_string_copy_bang (int argc, Scheme_Object *argv[]);
111 static Scheme_Object *byte_string_fill (int argc, Scheme_Object *argv[]);
112 static Scheme_Object *byte_string_to_immutable (int argc, Scheme_Object *argv[]);
113
114 static Scheme_Object *byte_string_utf8_index (int argc, Scheme_Object *argv[]);
115 static Scheme_Object *byte_string_utf8_ref (int argc, Scheme_Object *argv[]);
116 static Scheme_Object *byte_string_utf8_length (int argc, Scheme_Object *argv[]);
117
118 static Scheme_Object *byte_string_to_char_string (int argc, Scheme_Object *argv[]);
119 static Scheme_Object *byte_string_to_char_string_locale (int argc, Scheme_Object *argv[]);
120 static Scheme_Object *byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[]);
121 static Scheme_Object *char_string_to_byte_string (int argc, Scheme_Object *argv[]);
122 static Scheme_Object *char_string_to_byte_string_locale (int argc, Scheme_Object *argv[]);
123 static Scheme_Object *char_string_to_byte_string_latin1 (int argc, Scheme_Object *argv[]);
124 static Scheme_Object *char_string_utf8_length (int argc, Scheme_Object *argv[]);
125
126 static Scheme_Object *version(int argc, Scheme_Object *argv[]);
127 static Scheme_Object *format(int argc, Scheme_Object *argv[]);
128 static Scheme_Object *sch_printf(int argc, Scheme_Object *argv[]);
129 static Scheme_Object *sch_eprintf(int argc, Scheme_Object *argv[]);
130 static Scheme_Object *sch_fprintf(int argc, Scheme_Object *argv[]);
131 static Scheme_Object *banner(int argc, Scheme_Object *argv[]);
132 static Scheme_Object *env_p(int argc, Scheme_Object *argv[]);
133 static Scheme_Object *sch_getenv(int argc, Scheme_Object *argv[]);
134 static Scheme_Object *sch_getenv_names(int argc, Scheme_Object *argv[]);
135 static Scheme_Object *sch_putenv(int argc, Scheme_Object *argv[]);
136 static Scheme_Object *env_copy(int argc, Scheme_Object *argv[]);
137 static Scheme_Object *env_make(int argc, Scheme_Object *argv[]);
138 static Scheme_Object *current_environment_variables(int argc, Scheme_Object *argv[]);
139 static Scheme_Object *system_type(int argc, Scheme_Object *argv[]);
140 static Scheme_Object *system_library_subpath(int argc, Scheme_Object *argv[]);
141 static Scheme_Object *cmdline_args(int argc, Scheme_Object *argv[]);
142 static Scheme_Object *current_locale(int argc, Scheme_Object *argv[]);
143 static Scheme_Object *locale_string_encoding(int argc, Scheme_Object *argv[]);
144 static Scheme_Object *system_language_country(int argc, Scheme_Object *argv[]);
145
146 static Scheme_Object *byte_string_open_converter(int argc, Scheme_Object *argv[]);
147 static Scheme_Object *byte_string_close_converter(int argc, Scheme_Object *argv[]);
148 static Scheme_Object *byte_string_convert(int argc, Scheme_Object *argv[]);
149 static Scheme_Object *byte_string_convert_end(int argc, Scheme_Object *argv[]);
150 static Scheme_Object *byte_converter_p(int argc, Scheme_Object *argv[]);
151
152 static Scheme_Object *path_lt (int argc, Scheme_Object *argv[]);
153
154 #ifdef MZ_PRECISE_GC
155 static void register_traversers(void);
156 #endif
157
158 static int mz_char_strcmp(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2, int locale, int size_shortcut);
159 static int mz_char_strcmp_ci(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2, int locale, int size_shortcut);
160 static int mz_strcmp(const char *who, unsigned char *str1, intptr_t l1, unsigned char *str2, intptr_t l2);
161
162 XFORM_NONGCING static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t end,
163 unsigned int *us, intptr_t dstart, intptr_t dend,
164 intptr_t *ipos, intptr_t *jpos,
165 char compact, char utf16,
166 int *state, int might_continue, int permissive, int wtf);
167 XFORM_NONGCING static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t end,
168 unsigned char *s, intptr_t dstart, intptr_t dend,
169 intptr_t *_ipos, intptr_t *_opos, char utf16, int wtf);
170
171 static char *string_to_from_locale(int to_bytes,
172 char *in, intptr_t delta, intptr_t len,
173 intptr_t *olen, int perm,
174 int *no_cvt);
175
176 static void cache_locale_or_close(int to_bytes, rktio_converter_t *cd, char *le);
177
178 #define portable_isspace(x) (((x) < 128) && isspace(x))
179
180 ROSYM static Scheme_Object *sys_symbol, *sys_os_symbol, *sys_arch_symbol;
181 ROSYM static Scheme_Object *link_symbol, *machine_symbol, *vm_symbol, *gc_symbol;
182 ROSYM static Scheme_Object *so_suffix_symbol, *so_mode_symbol, *word_symbol;
183 ROSYM static Scheme_Object *os_symbol, *os_star_symbol, *arch_symbol;
184 ROSYM static Scheme_Object *fs_change_symbol, *target_machine_symbol, *cross_symbol;
185 ROSYM static Scheme_Object *racket_symbol, *cgc_symbol, *_3m_symbol, *cs_symbol;
186 ROSYM static Scheme_Object *force_symbol, *infer_symbol;
187 ROSYM static Scheme_Object *platform_3m_path, *platform_cgc_path, *platform_cs_path;
188 READ_ONLY static Scheme_Object *zero_length_char_string;
189 READ_ONLY static Scheme_Object *zero_length_char_immutable_string;
190 READ_ONLY static Scheme_Object *zero_length_byte_string;
191
192 SHARED_OK static char *embedding_banner;
193 SHARED_OK static Scheme_Object *vers_str;
194 SHARED_OK static Scheme_Object *banner_str;
195
196 THREAD_LOCAL_DECL(static Scheme_Object *fs_change_props);
197
198 THREAD_LOCAL_DECL(static char *cached_locale_encoding_name);
199 THREAD_LOCAL_DECL(struct rktio_converter_t *cached_locale_to_converter);
200 THREAD_LOCAL_DECL(struct rktio_converter_t *cached_locale_from_converter);
201
202 READ_ONLY static Scheme_Object *complete_symbol, *continues_symbol, *aborts_symbol, *error_symbol;
203
204 READ_ONLY Scheme_Object *scheme_string_p_proc;
205 READ_ONLY Scheme_Object *scheme_byte_string_p_proc;
206
207 READ_ONLY static int cross_compile_mode;
208
209 /* These two locale variables are only valid when reset_locale()
210 is called after continuation marks (and hence parameterization)
211 may have changed. Similarly, setlocale() is only up-to-date
212 when reset_locale() has been called. */
213 THREAD_LOCAL_DECL(static int locale_on);
214 THREAD_LOCAL_DECL(static void *current_locale_name_ptr);
215 static void reset_locale(void);
216
217 #define current_locale_name ((const mzchar *)current_locale_name_ptr)
218
219 static const mzchar empty_char_string[1] = { 0 };
220 static const mzchar xes_char_string[2] = { 0x78787878, 0 };
221
222 void
scheme_init_string(Scheme_Startup_Env * env)223 scheme_init_string (Scheme_Startup_Env *env)
224 {
225 Scheme_Object *p;
226
227 REGISTER_SO(sys_symbol);
228 REGISTER_SO(sys_os_symbol);
229 REGISTER_SO(sys_arch_symbol);
230 sys_symbol = scheme_intern_symbol(SYSTEM_TYPE_NAME);
231 sys_os_symbol = scheme_intern_symbol(SCHEME_OS);
232 sys_arch_symbol = scheme_intern_symbol(SCHEME_ARCH);
233
234 REGISTER_SO(link_symbol);
235 REGISTER_SO(machine_symbol);
236 REGISTER_SO(gc_symbol);
237 REGISTER_SO(vm_symbol);
238 REGISTER_SO(so_suffix_symbol);
239 REGISTER_SO(so_mode_symbol);
240 REGISTER_SO(word_symbol);
241 REGISTER_SO(os_symbol);
242 REGISTER_SO(os_star_symbol);
243 REGISTER_SO(arch_symbol);
244 REGISTER_SO(fs_change_symbol);
245 REGISTER_SO(target_machine_symbol);
246 REGISTER_SO(cross_symbol);
247 link_symbol = scheme_intern_symbol("link");
248 machine_symbol = scheme_intern_symbol("machine");
249 vm_symbol = scheme_intern_symbol("vm");
250 gc_symbol = scheme_intern_symbol("gc");
251 so_suffix_symbol = scheme_intern_symbol("so-suffix");
252 so_mode_symbol = scheme_intern_symbol("so-mode");
253 word_symbol = scheme_intern_symbol("word");
254 os_symbol = scheme_intern_symbol("os");
255 os_star_symbol = scheme_intern_symbol("os*");
256 arch_symbol = scheme_intern_symbol("arch");
257 fs_change_symbol = scheme_intern_symbol("fs-change");
258 target_machine_symbol = scheme_intern_symbol("target-machine");
259 cross_symbol = scheme_intern_symbol("cross");
260
261 REGISTER_SO(racket_symbol);
262 REGISTER_SO(cgc_symbol);
263 REGISTER_SO(_3m_symbol);
264 REGISTER_SO(cs_symbol);
265 racket_symbol = scheme_intern_symbol("racket");
266 cgc_symbol = scheme_intern_symbol("cgc");
267 _3m_symbol = scheme_intern_symbol("3m");
268 cs_symbol = scheme_intern_symbol("cs");
269
270 REGISTER_SO(force_symbol);
271 REGISTER_SO(infer_symbol);
272 force_symbol = scheme_intern_symbol("force");
273 infer_symbol = scheme_intern_symbol("infer");
274
275 REGISTER_SO(zero_length_char_string);
276 REGISTER_SO(zero_length_char_immutable_string);
277 REGISTER_SO(zero_length_byte_string);
278 zero_length_char_string = scheme_alloc_char_string(0, 0);
279 zero_length_char_immutable_string = scheme_alloc_char_string(0, 0);
280 SCHEME_SET_CHAR_STRING_IMMUTABLE(zero_length_char_immutable_string);
281 zero_length_byte_string = scheme_alloc_byte_string(0, 0);
282
283 REGISTER_SO(complete_symbol);
284 REGISTER_SO(continues_symbol);
285 REGISTER_SO(aborts_symbol);
286 REGISTER_SO(error_symbol);
287 complete_symbol = scheme_intern_symbol("complete");
288 continues_symbol = scheme_intern_symbol("continues");
289 aborts_symbol = scheme_intern_symbol("aborts");
290 error_symbol = scheme_intern_symbol("error");
291
292 REGISTER_SO(platform_3m_path);
293 # ifdef DOS_FILE_SYSTEM
294 # define MZ3M_SUBDIR "\\3m"
295 # define MZCS_SUBDIR "\\cs"
296 # else
297 # define MZ3M_SUBDIR "/3m"
298 # define MZCS_SUBDIR "/cs"
299 #endif
300 REGISTER_SO(platform_3m_path);
301 REGISTER_SO(platform_cgc_path);
302 REGISTER_SO(platform_cs_path);
303 platform_cgc_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH SPLS_SUFFIX);
304 platform_3m_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH SPLS_SUFFIX MZ3M_SUBDIR);
305 platform_cs_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH SPLS_SUFFIX MZCS_SUBDIR);
306
307 REGISTER_SO(embedding_banner);
308 REGISTER_SO(vers_str);
309 REGISTER_SO(banner_str);
310
311 vers_str = scheme_make_utf8_string(scheme_version());
312 SCHEME_SET_CHAR_STRING_IMMUTABLE(vers_str);
313 banner_str = scheme_make_utf8_string(scheme_banner());
314 SCHEME_SET_CHAR_STRING_IMMUTABLE(banner_str);
315
316 REGISTER_SO(scheme_string_p_proc);
317 p = scheme_make_folding_prim(string_p, "string?", 1, 1, 1);
318 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_UNARY_INLINED
319 | SCHEME_PRIM_IS_OMITABLE
320 | SCHEME_PRIM_PRODUCES_BOOL);
321 scheme_addto_prim_instance("string?", p, env);
322 scheme_string_p_proc = p;
323
324 scheme_addto_prim_instance("make-string",
325 scheme_make_immed_prim(make_string,
326 "make-string",
327 1, 2),
328 env);
329 scheme_addto_prim_instance("string",
330 scheme_make_immed_prim(string,
331 "string",
332 0, -1),
333 env);
334
335 p = scheme_make_folding_prim(string_length, "string-length", 1, 1, 1);
336 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_UNARY_INLINED
337 |SCHEME_PRIM_PRODUCES_FIXNUM
338 | SCHEME_PRIM_AD_HOC_OPT);
339 scheme_addto_prim_instance("string-length", p,
340 env);
341
342 p = scheme_make_immed_prim(scheme_checked_string_ref, "string-ref", 2, 2);
343 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_BINARY_INLINED
344 | SCHEME_PRIM_AD_HOC_OPT);
345 scheme_addto_prim_instance("string-ref", p, env);
346
347 p = scheme_make_immed_prim(scheme_checked_string_set, "string-set!", 3, 3);
348 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_NARY_INLINED
349 | SCHEME_PRIM_AD_HOC_OPT);
350 scheme_addto_prim_instance("string-set!", p, env);
351
352 p = scheme_make_immed_prim(string_eq, "string=?", 1, -1);
353 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_BINARY_INLINED
354 | SCHEME_PRIM_PRODUCES_BOOL
355 | SCHEME_PRIM_PRODUCES_BOOL);
356 scheme_addto_prim_instance("string=?", p, env);
357
358 scheme_addto_prim_instance("string-locale=?",
359 scheme_make_immed_prim(string_locale_eq,
360 "string-locale=?",
361 1, -1),
362 env);
363 scheme_addto_prim_instance("string-ci=?",
364 scheme_make_immed_prim(string_ci_eq,
365 "string-ci=?",
366 1, -1),
367 env);
368 scheme_addto_prim_instance("string-locale-ci=?",
369 scheme_make_immed_prim(string_locale_ci_eq,
370 "string-locale-ci=?",
371 1, -1),
372 env);
373 scheme_addto_prim_instance("string<?",
374 scheme_make_immed_prim(string_lt,
375 "string<?",
376 1, -1),
377 env);
378 scheme_addto_prim_instance("string-locale<?",
379 scheme_make_immed_prim(string_locale_lt,
380 "string-locale<?",
381 1, -1),
382 env);
383 scheme_addto_prim_instance("string>?",
384 scheme_make_immed_prim(string_gt,
385 "string>?",
386 1, -1),
387 env);
388 scheme_addto_prim_instance("string-locale>?",
389 scheme_make_immed_prim(string_locale_gt,
390 "string-locale>?",
391 1, -1),
392 env);
393 scheme_addto_prim_instance("string<=?",
394 scheme_make_immed_prim(string_lt_eq,
395 "string<=?",
396 1, -1),
397 env);
398 scheme_addto_prim_instance("string>=?",
399 scheme_make_immed_prim(string_gt_eq,
400 "string>=?",
401 1, -1),
402 env);
403 scheme_addto_prim_instance("string-ci<?",
404 scheme_make_immed_prim(string_ci_lt,
405 "string-ci<?",
406 1, -1),
407 env);
408 scheme_addto_prim_instance("string-locale-ci<?",
409 scheme_make_immed_prim(string_locale_ci_lt,
410 "string-locale-ci<?",
411 1, -1),
412 env);
413 scheme_addto_prim_instance("string-ci>?",
414 scheme_make_immed_prim(string_ci_gt,
415 "string-ci>?",
416 1, -1),
417 env);
418 scheme_addto_prim_instance("string-locale-ci>?",
419 scheme_make_immed_prim(string_locale_ci_gt,
420 "string-locale-ci>?",
421 1, -1),
422 env);
423 scheme_addto_prim_instance("string-ci<=?",
424 scheme_make_immed_prim(string_ci_lt_eq,
425 "string-ci<=?",
426 1, -1),
427 env);
428 scheme_addto_prim_instance("string-ci>=?",
429 scheme_make_immed_prim(string_ci_gt_eq,
430 "string-ci>=?",
431 1, -1),
432 env);
433
434 scheme_addto_prim_instance("substring",
435 scheme_make_immed_prim(substring,
436 "substring",
437 2, 3),
438 env);
439
440 p = scheme_make_immed_prim(string_append, "string-append", 0, -1);
441 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_AD_HOC_OPT);
442 scheme_addto_prim_instance("string-append", p, env);
443
444 p = scheme_make_immed_prim(string_append_immutable, "string-append-immutable", 0, -1);
445 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_AD_HOC_OPT);
446 scheme_addto_prim_instance("string-append-immutable", p, env);
447
448 scheme_addto_prim_instance("string->list",
449 scheme_make_immed_prim(string_to_list,
450 "string->list",
451 1, 1),
452 env);
453 scheme_addto_prim_instance("list->string",
454 scheme_make_immed_prim(list_to_string,
455 "list->string",
456 1, 1),
457 env);
458 scheme_addto_prim_instance("string-copy",
459 scheme_make_immed_prim(string_copy,
460 "string-copy",
461 1, 1),
462 env);
463 scheme_addto_prim_instance("string-copy!",
464 scheme_make_immed_prim(string_copy_bang,
465 "string-copy!",
466 3, 5),
467 env);
468 scheme_addto_prim_instance("string-fill!",
469 scheme_make_immed_prim(string_fill,
470 "string-fill!",
471 2, 2),
472 env);
473
474 p = scheme_make_immed_prim(string_to_immutable, "string->immutable-string", 1, 1);
475 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_AD_HOC_OPT);
476 scheme_addto_prim_instance("string->immutable-string", p, env);
477
478 scheme_addto_prim_instance("string-normalize-nfc",
479 scheme_make_immed_prim(string_normalize_c,
480 "string-normalize-nfc",
481 1, 1),
482 env);
483 scheme_addto_prim_instance("string-normalize-nfkc",
484 scheme_make_immed_prim(string_normalize_kc,
485 "string-normalize-nfkc",
486 1, 1),
487 env);
488 scheme_addto_prim_instance("string-normalize-nfd",
489 scheme_make_immed_prim(string_normalize_d,
490 "string-normalize-nfd",
491 1, 1),
492 env);
493 scheme_addto_prim_instance("string-normalize-nfkd",
494 scheme_make_immed_prim(string_normalize_kd,
495 "string-normalize-nfkd",
496 1, 1),
497 env);
498
499 scheme_addto_prim_instance("string-upcase",
500 scheme_make_immed_prim(string_upcase,
501 "string-upcase",
502 1, 1),
503 env);
504 scheme_addto_prim_instance("string-downcase",
505 scheme_make_immed_prim(string_downcase,
506 "string-downcase",
507 1, 1),
508 env);
509 scheme_addto_prim_instance("string-titlecase",
510 scheme_make_immed_prim(string_titlecase,
511 "string-titlecase",
512 1, 1),
513 env);
514 scheme_addto_prim_instance("string-foldcase",
515 scheme_make_immed_prim(string_foldcase,
516 "string-foldcase",
517 1, 1),
518 env);
519
520 scheme_addto_prim_instance("string-locale-upcase",
521 scheme_make_immed_prim(string_locale_upcase,
522 "string-locale-upcase",
523 1, 1),
524 env);
525 scheme_addto_prim_instance("string-locale-downcase",
526 scheme_make_immed_prim(string_locale_downcase,
527 "string-locale-downcase",
528 1, 1),
529 env);
530
531 scheme_addto_prim_instance("current-locale",
532 scheme_register_parameter(current_locale,
533 "current-locale",
534 MZCONFIG_LOCALE),
535 env);
536 scheme_addto_prim_instance("locale-string-encoding",
537 scheme_make_immed_prim(locale_string_encoding,
538 "locale-string-encoding",
539 0, 0),
540 env);
541 scheme_addto_prim_instance("system-language+country",
542 scheme_make_immed_prim(system_language_country,
543 "system-language+country",
544 0, 0),
545 env);
546
547 scheme_addto_prim_instance("bytes-converter?",
548 scheme_make_immed_prim(byte_converter_p,
549 "bytes-converter?",
550 1, 1),
551 env);
552 scheme_addto_prim_instance("bytes-convert",
553 scheme_make_prim_w_arity2(byte_string_convert,
554 "bytes-convert",
555 2, 7,
556 3, 3),
557 env);
558 scheme_addto_prim_instance("bytes-convert-end",
559 scheme_make_prim_w_arity2(byte_string_convert_end,
560 "bytes-convert-end",
561 1, 4,
562 2, 2),
563 env);
564 scheme_addto_prim_instance("bytes-open-converter",
565 scheme_make_immed_prim(byte_string_open_converter,
566 "bytes-open-converter",
567 2, 2),
568 env);
569 scheme_addto_prim_instance("bytes-close-converter",
570 scheme_make_immed_prim(byte_string_close_converter,
571 "bytes-close-converter",
572 1, 1),
573 env);
574
575 scheme_addto_prim_instance("format",
576 scheme_make_noncm_prim(format,
577 "format",
578 1, -1),
579 env);
580 scheme_addto_prim_instance("printf",
581 scheme_make_noncm_prim(sch_printf,
582 "printf",
583 1, -1),
584 env);
585 scheme_addto_prim_instance("eprintf",
586 scheme_make_noncm_prim(sch_eprintf,
587 "eprintf",
588 1, -1),
589 env);
590 scheme_addto_prim_instance("fprintf",
591 scheme_make_noncm_prim(sch_fprintf,
592 "fprintf",
593 2, -1),
594 env);
595
596 scheme_addto_prim_instance("byte?",
597 scheme_make_folding_prim(byte_p,
598 "byte?",
599 1, 1, 1),
600 env);
601
602 REGISTER_SO(scheme_byte_string_p_proc);
603 p = scheme_make_folding_prim(byte_string_p, "bytes?", 1, 1, 1);
604 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_UNARY_INLINED
605 | SCHEME_PRIM_IS_OMITABLE
606 | SCHEME_PRIM_PRODUCES_BOOL);
607 scheme_addto_prim_instance("bytes?", p, env);
608 scheme_byte_string_p_proc = p;
609
610 scheme_addto_prim_instance("make-bytes",
611 scheme_make_immed_prim(make_byte_string,
612 "make-bytes",
613 1, 2),
614 env);
615 scheme_addto_prim_instance("bytes",
616 scheme_make_immed_prim(byte_string,
617 "bytes",
618 0, -1),
619 env);
620
621 ADD_PRIM_W_ARITY("make-shared-bytes", make_shared_byte_string, 1, 2, env);
622 ADD_PRIM_W_ARITY("shared-bytes", shared_byte_string, 0, -1, env);
623
624 p = scheme_make_folding_prim(byte_string_length, "bytes-length", 1, 1, 1);
625 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_UNARY_INLINED
626 | SCHEME_PRIM_PRODUCES_FIXNUM
627 | SCHEME_PRIM_AD_HOC_OPT);
628 scheme_addto_prim_instance("bytes-length", p, env);
629
630 p = scheme_make_immed_prim(scheme_checked_byte_string_ref, "bytes-ref", 2, 2);
631 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_BINARY_INLINED
632 | SCHEME_PRIM_PRODUCES_FIXNUM
633 | SCHEME_PRIM_AD_HOC_OPT);
634 scheme_addto_prim_instance("bytes-ref", p, env);
635
636 p = scheme_make_immed_prim(scheme_checked_byte_string_set, "bytes-set!", 3, 3);
637 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_NARY_INLINED
638 | SCHEME_PRIM_AD_HOC_OPT);
639 scheme_addto_prim_instance("bytes-set!", p, env);
640
641 p = scheme_make_immed_prim(byte_string_eq, "bytes=?", 1, -1);
642 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_BINARY_INLINED
643 | SCHEME_PRIM_PRODUCES_BOOL);
644 scheme_addto_prim_instance("bytes=?", p, env);
645
646 scheme_addto_prim_instance("bytes<?",
647 scheme_make_immed_prim(byte_string_lt,
648 "bytes<?",
649 1, -1),
650 env);
651 scheme_addto_prim_instance("bytes>?",
652 scheme_make_immed_prim(byte_string_gt,
653 "bytes>?",
654 1, -1),
655 env);
656
657 scheme_addto_prim_instance("subbytes",
658 scheme_make_immed_prim(byte_substring,
659 "subbytes",
660 2, 3),
661 env);
662
663 p = scheme_make_immed_prim(byte_string_append, "bytes-append", 0, -1);
664 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_AD_HOC_OPT);
665 scheme_addto_prim_instance("bytes-append", p, env);
666
667 scheme_addto_prim_instance("bytes->list",
668 scheme_make_immed_prim(byte_string_to_list,
669 "bytes->list",
670 1, 1),
671 env);
672 scheme_addto_prim_instance("list->bytes",
673 scheme_make_immed_prim(list_to_byte_string,
674 "list->bytes",
675 1, 1),
676 env);
677 scheme_addto_prim_instance("bytes-copy",
678 scheme_make_immed_prim(byte_string_copy,
679 "bytes-copy",
680 1, 1),
681 env);
682 scheme_addto_prim_instance("bytes-copy!",
683 scheme_make_immed_prim(byte_string_copy_bang,
684 "bytes-copy!",
685 3, 5),
686 env);
687 scheme_addto_prim_instance("bytes-fill!",
688 scheme_make_immed_prim(byte_string_fill,
689 "bytes-fill!",
690 2, 2),
691 env);
692
693 p = scheme_make_immed_prim(byte_string_to_immutable, "bytes->immutable-bytes", 1, 1);
694 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_AD_HOC_OPT);
695 scheme_addto_prim_instance("bytes->immutable-bytes", p, env);
696
697 p = scheme_make_immed_prim(byte_string_utf8_index, "bytes-utf-8-index", 2, 5);
698 /* Incorrect, since the result can be #f:
699 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_PRODUCES_FIXNUM); */
700 scheme_addto_prim_instance("bytes-utf-8-index", p, env);
701
702 p = scheme_make_immed_prim(byte_string_utf8_length, "bytes-utf-8-length", 1, 4);
703 /* Incorrect, since the result can be #f:
704 SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_PRODUCES_FIXNUM); */
705 scheme_addto_prim_instance("bytes-utf-8-length", p, env);
706
707 scheme_addto_prim_instance("bytes-utf-8-ref",
708 scheme_make_immed_prim(byte_string_utf8_ref,
709 "bytes-utf-8-ref",
710 2, 5),
711 env);
712
713 scheme_addto_prim_instance("bytes->string/utf-8",
714 scheme_make_immed_prim(byte_string_to_char_string,
715 "bytes->string/utf-8",
716 1, 4),
717 env);
718 scheme_addto_prim_instance("bytes->string/locale",
719 scheme_make_immed_prim(byte_string_to_char_string_locale,
720 "bytes->string/locale",
721 1, 4),
722 env);
723 scheme_addto_prim_instance("bytes->string/latin-1",
724 scheme_make_immed_prim(byte_string_to_char_string_latin1,
725 "bytes->string/latin-1",
726 1, 4),
727 env);
728 scheme_addto_prim_instance("string->bytes/utf-8",
729 scheme_make_immed_prim(char_string_to_byte_string,
730 "string->bytes/utf-8",
731 1, 4),
732 env);
733 scheme_addto_prim_instance("string->bytes/locale",
734 scheme_make_immed_prim(char_string_to_byte_string_locale,
735 "string->bytes/locale",
736 1, 4),
737 env);
738 scheme_addto_prim_instance("string->bytes/latin-1",
739 scheme_make_immed_prim(char_string_to_byte_string_latin1,
740 "string->bytes/latin-1",
741 1, 4),
742 env);
743
744 scheme_addto_prim_instance("string-utf-8-length",
745 scheme_make_immed_prim(char_string_utf8_length,
746 "string-utf-8-length",
747 1, 3),
748 env);
749
750
751 /* In principle, `version' could be foldable, but it invites
752 more problems than it solves... */
753
754 scheme_addto_prim_instance("version",
755 scheme_make_immed_prim(version,
756 "version",
757 0, 0),
758 env);
759 scheme_addto_prim_instance("banner",
760 scheme_make_immed_prim(banner,
761 "banner",
762 0, 0),
763 env);
764
765 /* Environment variables */
766
767 scheme_addto_prim_instance("environment-variables?",
768 scheme_make_folding_prim(env_p,
769 "environment-variables?",
770 1, 1, 1),
771 env);
772
773 scheme_addto_prim_instance("current-environment-variables",
774 scheme_register_parameter(current_environment_variables,
775 "current-environment-variables",
776 MZCONFIG_CURRENT_ENV_VARS),
777 env);
778
779 scheme_addto_prim_instance("environment-variables-ref",
780 scheme_make_immed_prim(sch_getenv,
781 "environment-variables-ref",
782 2, 2),
783 env);
784
785 scheme_addto_prim_instance("environment-variables-set!",
786 scheme_make_prim_w_arity(sch_putenv,
787 "environment-variables-set!",
788 3, 4),
789 env);
790
791 scheme_addto_prim_instance("environment-variables-names",
792 scheme_make_immed_prim(sch_getenv_names,
793 "environment-variables-names",
794 1, 1),
795 env);
796
797 scheme_addto_prim_instance("environment-variables-copy",
798 scheme_make_immed_prim(env_copy,
799 "environment-variables-copy",
800 1, 1),
801 env);
802
803 scheme_addto_prim_instance("make-environment-variables",
804 scheme_make_immed_prim(env_make,
805 "make-environment-variables",
806 0, -1),
807 env);
808
809 /* Don't make these folding, since they're platform-specific: */
810
811 REGISTER_SO(scheme_system_type_proc);
812 scheme_system_type_proc = scheme_make_immed_prim(system_type,
813 "system-type",
814 0, 1);
815 scheme_addto_prim_instance("system-type", scheme_system_type_proc, env);
816
817 scheme_addto_prim_instance("system-library-subpath",
818 scheme_make_immed_prim(system_library_subpath,
819 "system-library-subpath",
820 0, 1),
821 env);
822
823 scheme_addto_prim_instance("current-command-line-arguments",
824 scheme_register_parameter(cmdline_args,
825 "current-command-line-arguments",
826 MZCONFIG_CMDLINE_ARGS),
827 env);
828
829
830 scheme_addto_prim_instance("path<?",
831 scheme_make_immed_prim(path_lt,
832 "path<?",
833 1, -1),
834 env);
835
836 #ifdef MZ_PRECISE_GC
837 register_traversers();
838 #endif
839 }
840
scheme_init_string_places(void)841 void scheme_init_string_places(void) {
842 REGISTER_SO(current_locale_name_ptr);
843 current_locale_name_ptr = (void *)xes_char_string;
844
845 REGISTER_SO(fs_change_props);
846 {
847 int supported, scalable, low_latency, file_level;
848 Scheme_Object *s;
849 scheme_fs_change_properties(&supported, &scalable, &low_latency, &file_level);
850 fs_change_props = scheme_make_vector(4, scheme_false);
851 if (supported) {
852 s = scheme_intern_symbol("supported");
853 SCHEME_VEC_ELS(fs_change_props)[0] = s;
854 }
855 if (scalable) {
856 s = scheme_intern_symbol("scalable");
857 SCHEME_VEC_ELS(fs_change_props)[1] = s;
858 }
859 if (low_latency) {
860 s = scheme_intern_symbol("low-latency");
861 SCHEME_VEC_ELS(fs_change_props)[2] = s;
862 }
863 if (file_level) {
864 s = scheme_intern_symbol("file-level");
865 SCHEME_VEC_ELS(fs_change_props)[3] = s;
866 }
867 SCHEME_SET_IMMUTABLE(fs_change_props);
868 }
869 }
870
871 /**********************************************************************/
872 /* UTF-8 char constructors */
873 /**********************************************************************/
874
scheme_make_sized_offset_utf8_string(char * chars,intptr_t d,intptr_t len)875 Scheme_Object *scheme_make_sized_offset_utf8_string(char *chars, intptr_t d, intptr_t len)
876 {
877 intptr_t ulen;
878 mzchar *us;
879
880 if (len) {
881 ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
882 NULL, 0, -1,
883 NULL, 0 /* not UTF-16 */, 0xFFFD);
884 us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
885 scheme_utf8_decode((unsigned char *)chars, d, d + len,
886 us, 0, -1,
887 NULL, 0 /* not UTF-16 */, 0xFFFD);
888 us[ulen] = 0;
889 } else {
890 us = (mzchar *)empty_char_string;
891 ulen = 0;
892 }
893 return scheme_make_sized_offset_char_string(us, 0, ulen, 0);
894 }
895
896 Scheme_Object *
scheme_make_sized_utf8_string(char * chars,intptr_t len)897 scheme_make_sized_utf8_string(char *chars, intptr_t len)
898 {
899 return scheme_make_sized_offset_utf8_string(chars, 0, len);
900 }
901
902 Scheme_Object *
scheme_make_immutable_sized_utf8_string(char * chars,intptr_t len)903 scheme_make_immutable_sized_utf8_string(char *chars, intptr_t len)
904 {
905 Scheme_Object *s;
906
907 s = scheme_make_sized_offset_utf8_string(chars, 0, len);
908 if (len)
909 SCHEME_SET_CHAR_STRING_IMMUTABLE(s);
910
911 return s;
912 }
913
914 Scheme_Object *
scheme_make_utf8_string(const char * chars)915 scheme_make_utf8_string(const char *chars)
916 {
917 return scheme_make_sized_offset_utf8_string((char *)chars, 0, -1);
918 }
919
920 Scheme_Object *
scheme_make_locale_string(const char * chars)921 scheme_make_locale_string(const char *chars)
922 {
923 return scheme_byte_string_to_char_string_locale(scheme_make_byte_string((char *)chars));
924 }
925
scheme_append_strings(Scheme_Object * s1,Scheme_Object * s2)926 Scheme_Object *scheme_append_strings(Scheme_Object *s1, Scheme_Object *s2)
927 {
928 Scheme_Object *a[2];
929 a[0] = s1;
930 a[1] = s2;
931 return string_append(2, a);
932 }
933
934 /**********************************************************************/
935 /* index helpers */
936 /**********************************************************************/
937
scheme_extract_index(const char * name,int pos,int argc,Scheme_Object ** argv,intptr_t top,int false_ok)938 intptr_t scheme_extract_index(const char *name, int pos, int argc, Scheme_Object **argv, intptr_t top, int false_ok)
939 {
940 intptr_t i;
941 int is_top = 0;
942
943 if (SCHEME_INTP(argv[pos])) {
944 i = SCHEME_INT_VAL(argv[pos]);
945 } else if (SCHEME_BIGNUMP(argv[pos])) {
946 if (SCHEME_BIGPOS(argv[pos])) {
947 i = top; /* out-of-bounds */
948 is_top = 1;
949 } else
950 i = -1; /* negative */
951 } else
952 i = -1;
953
954 if (!is_top && (i < 0))
955 scheme_wrong_contract(name,
956 (false_ok ? "(or/c exact-nonnegative-integer? #f)" : "exact-nonnegative-integer?"),
957 pos, argc, argv);
958
959 return i;
960 }
961
scheme_get_substring_indices(const char * name,Scheme_Object * str,int argc,Scheme_Object ** argv,int spos,int fpos,intptr_t * _start,intptr_t * _finish)962 void scheme_get_substring_indices(const char *name, Scheme_Object *str,
963 int argc, Scheme_Object **argv,
964 int spos, int fpos, intptr_t *_start, intptr_t *_finish)
965 {
966 intptr_t len;
967 intptr_t start, finish;
968
969 if (SCHEME_CHAPERONE_VECTORP(str))
970 len = SCHEME_CHAPERONE_VEC_SIZE(str);
971 else if (SCHEME_CHAR_STRINGP(str))
972 len = SCHEME_CHAR_STRTAG_VAL(str);
973 else
974 len = SCHEME_BYTE_STRTAG_VAL(str);
975
976 if (argc > spos)
977 start = scheme_extract_index(name, spos, argc, argv, len + 1, 0);
978 else
979 start = 0;
980 if (argc > fpos)
981 finish = scheme_extract_index(name, fpos, argc, argv, len + 1, 0);
982 else
983 finish = len;
984
985 if (!(start <= len)) {
986 scheme_out_of_range(name, NULL, (fpos < 100) ? "starting " : "", argv[spos], str, 0, len);
987 }
988 if (!(finish >= start && finish <= len)) {
989 scheme_out_of_range(name, NULL, "ending ", argv[fpos], str, start, len);
990 }
991
992 *_start = start;
993 *_finish = finish;
994 }
995
scheme_do_get_substring_indices(const char * name,Scheme_Object * str,int argc,Scheme_Object ** argv,int spos,int fpos,intptr_t * _start,intptr_t * _finish,intptr_t len)996 void scheme_do_get_substring_indices(const char *name, Scheme_Object *str,
997 int argc, Scheme_Object **argv,
998 int spos, int fpos, intptr_t *_start, intptr_t *_finish, intptr_t len)
999 {
1000 if (argc > spos) {
1001 if (SCHEME_INTP(argv[spos])) {
1002 intptr_t start = SCHEME_INT_VAL(argv[spos]);
1003 if ((start >= 0) && (start < len)) {
1004 *_start = start;
1005 if (argc > fpos) {
1006 intptr_t finish = SCHEME_INT_VAL(argv[fpos]);
1007 if ((finish >= start) && (finish <= len)) {
1008 *_finish = finish;
1009 return;
1010 }
1011 } else {
1012 *_finish = len;
1013 return;
1014 }
1015 }
1016 }
1017 } else {
1018 *_start = 0;
1019 *_finish = len;
1020 return;
1021 }
1022
1023 scheme_get_substring_indices(name, str, argc, argv, spos, fpos, _start, _finish);
1024 }
1025
1026 /**********************************************************************/
1027 /* char strings */
1028 /**********************************************************************/
1029
1030 #define SCHEME_X_STR_VAL(x) SCHEME_CHAR_STR_VAL(x)
1031 #define SCHEME_X_STRTAG_VAL(x) SCHEME_CHAR_STRTAG_VAL(x)
1032 #define SCHEME_X_STRINGP(x) SCHEME_CHAR_STRINGP(x)
1033 #define SCHEME_MUTABLE_X_STRINGP(x) SCHEME_MUTABLE_CHAR_STRINGP(x)
1034 #define SCHEME_SET_X_STRING_IMMUTABLE(x) SCHEME_SET_CHAR_STRING_IMMUTABLE(x)
1035 #define scheme_x_string_type scheme_char_string_type
1036 #define X(a, b) a##_char##b
1037 #define X_(a, b) a##_##b
1038 #define X__(a) a
1039 #define EMPTY (mzchar *)"\0\0\0"
1040 #define Xchar mzchar
1041 #define uXchar mzchar
1042 #define XSTR ""
1043 #define IS_STR "string?"
1044 #define XSTRINGSTR "string"
1045 #define SUBXSTR "substring"
1046 #define CHARP(x) SCHEME_CHARP(x)
1047 #define CHAR_VAL(x) SCHEME_CHAR_VAL(x)
1048 #define CHAR_STR "char?"
1049 #define MAKE_CHAR(x) _scheme_make_char(x)
1050 #define xstrlen scheme_char_strlen
1051 #include "strops.inc"
1052
1053 #define GEN_STRING_COMP(name, scheme_name, comp, op, ul, size_shortcut) \
1054 static Scheme_Object * name (int argc, Scheme_Object *argv[]) \
1055 { mzchar *s, *prev; int i, sl, pl; int falz = 0;\
1056 if (!SCHEME_CHAR_STRINGP(argv[0])) \
1057 scheme_wrong_contract(scheme_name, "string?", 0, argc, argv); \
1058 prev = SCHEME_CHAR_STR_VAL(argv[0]); pl = SCHEME_CHAR_STRTAG_VAL(argv[0]); \
1059 for (i = 1; i < argc; i++) { \
1060 if (!SCHEME_CHAR_STRINGP(argv[i])) \
1061 scheme_wrong_contract(scheme_name, "string?", i, argc, argv); \
1062 s = SCHEME_CHAR_STR_VAL(argv[i]); sl = SCHEME_CHAR_STRTAG_VAL(argv[i]); \
1063 if (!falz) if (!(comp(scheme_name, \
1064 prev, pl, \
1065 s, sl, ul, size_shortcut) op 0)) falz = 1; \
1066 prev = s; pl = sl; \
1067 } \
1068 return falz ? scheme_false : scheme_true; \
1069 }
1070
1071 GEN_STRING_COMP(string_eq, "string=?", mz_char_strcmp, ==, 0, 1)
1072 GEN_STRING_COMP(string_lt, "string<?", mz_char_strcmp, <, 0, 0)
1073 GEN_STRING_COMP(string_gt, "string>?", mz_char_strcmp, >, 0, 0)
1074 GEN_STRING_COMP(string_lt_eq, "string<=?", mz_char_strcmp, <=, 0, 0)
1075 GEN_STRING_COMP(string_gt_eq, "string>=?", mz_char_strcmp, >=, 0, 0)
1076
1077 GEN_STRING_COMP(string_ci_eq, "string-ci=?", mz_char_strcmp_ci, ==, 0, 0)
1078 GEN_STRING_COMP(string_ci_lt, "string-ci<?", mz_char_strcmp_ci, <, 0, 0)
1079 GEN_STRING_COMP(string_ci_gt, "string-ci>?", mz_char_strcmp_ci, >, 0, 0)
1080 GEN_STRING_COMP(string_ci_lt_eq, "string-ci<=?", mz_char_strcmp_ci, <=, 0, 0)
1081 GEN_STRING_COMP(string_ci_gt_eq, "string-ci>=?", mz_char_strcmp_ci, >=, 0, 0)
1082
1083 GEN_STRING_COMP(string_locale_eq, "string-locale=?", mz_char_strcmp, ==, 1, 0)
1084 GEN_STRING_COMP(string_locale_lt, "string-locale<?", mz_char_strcmp, <, 1, 0)
1085 GEN_STRING_COMP(string_locale_gt, "string-locale>?", mz_char_strcmp, >, 1, 0)
1086 GEN_STRING_COMP(string_locale_ci_eq, "string-locale-ci=?", mz_char_strcmp_ci, ==, 1, 0)
1087 GEN_STRING_COMP(string_locale_ci_lt, "string-locale-ci<?", mz_char_strcmp_ci, <, 1, 0)
1088 GEN_STRING_COMP(string_locale_ci_gt, "string-locale-ci>?", mz_char_strcmp_ci, >, 1, 0)
1089
scheme_string_eq_2(Scheme_Object * str1,Scheme_Object * str2)1090 Scheme_Object *scheme_string_eq_2(Scheme_Object *str1, Scheme_Object *str2)
1091 {
1092 Scheme_Object *a[2];
1093 a[0] = str1;
1094 a[1] = str2;
1095 return string_eq(2, a);
1096 }
1097
string_append_immutable(int argc,Scheme_Object * argv[])1098 Scheme_Object *string_append_immutable(int argc, Scheme_Object *argv[])
1099 {
1100 Scheme_Object *r;
1101
1102 r = do_string_append("string-append-immutable", argc, argv);
1103
1104 if (r == zero_length_char_string)
1105 return zero_length_char_immutable_string;
1106
1107 SCHEME_SET_CHAR_STRING_IMMUTABLE(r);
1108
1109 return r;
1110 }
1111
1112 /**********************************************************************/
1113 /* byte strings */
1114 /**********************************************************************/
1115
1116 #define SCHEME_BYTEP(x) ((SCHEME_INTP(x)) && (SCHEME_INT_VAL(x) >= 0) && (SCHEME_INT_VAL(x) <= 255))
1117
1118 static Scheme_Object *
byte_p(int argc,Scheme_Object * argv[])1119 byte_p(int argc, Scheme_Object *argv[])
1120 {
1121 return (SCHEME_BYTEP(argv[0]) ? scheme_true : scheme_false);
1122 }
1123
1124 #define SCHEME_X_STR_VAL(x) SCHEME_BYTE_STR_VAL(x)
1125 #define SCHEME_X_STRTAG_VAL(x) SCHEME_BYTE_STRTAG_VAL(x)
1126 #define SCHEME_X_STRINGP(x) SCHEME_BYTE_STRINGP(x)
1127 #define SCHEME_MUTABLE_X_STRINGP(x) SCHEME_MUTABLE_BYTE_STRINGP(x)
1128 #define SCHEME_SET_X_STRING_IMMUTABLE(x) SCHEME_SET_BYTE_STRING_IMMUTABLE(x)
1129 #define scheme_x_string_type scheme_byte_string_type
1130 #define X(a, b) a##_byte##b
1131 #define X_(a, b) a##_byte_##b
1132 #define X__(a) byte_##a
1133 #define EMPTY ""
1134 #define Xchar char
1135 #define uXchar unsigned char
1136 #define XSTR "byte "
1137 #define IS_STR "bytes?"
1138 #define XSTRINGSTR "bytes"
1139 #define SUBXSTR "subbytes"
1140 #define CHARP(x) SCHEME_BYTEP(x)
1141 #define CHAR_VAL(x) SCHEME_INT_VAL(x)
1142 #define CHAR_STR "byte?"
1143 #define MAKE_CHAR(x) scheme_make_integer_value(x)
1144 #define xstrlen strlen
1145 #define GENERATING_BYTE
1146 #include "strops.inc"
1147 #undef GENERATING_BYTE
1148
1149 /* comparisons */
1150
1151 #define GEN_BYTE_STRING_PATH_COMP(name, scheme_name, comp, op, PRED, contract) \
1152 static Scheme_Object * name (int argc, Scheme_Object *argv[]) \
1153 { char *s, *prev; int i, sl, pl; int falz = 0;\
1154 if (!PRED(argv[0])) \
1155 scheme_wrong_contract(scheme_name, contract, 0, argc, argv); \
1156 prev = SCHEME_BYTE_STR_VAL(argv[0]); pl = SCHEME_BYTE_STRTAG_VAL(argv[0]); \
1157 for (i = 1; i < argc; i++) { \
1158 if (!PRED(argv[i])) \
1159 scheme_wrong_contract(scheme_name, contract, i, argc, argv); \
1160 s = SCHEME_BYTE_STR_VAL(argv[i]); sl = SCHEME_BYTE_STRTAG_VAL(argv[i]); \
1161 if (!falz) if (!(comp(scheme_name, \
1162 (unsigned char *)prev, pl, \
1163 (unsigned char *)s, sl) op 0)) falz = 1; \
1164 prev = s; pl = sl; \
1165 } \
1166 return falz ? scheme_false : scheme_true; \
1167 }
1168
1169 #define GEN_BYTE_STRING_COMP(name, scheme_name, comp, op) \
1170 GEN_BYTE_STRING_PATH_COMP(name, scheme_name, comp, op, SCHEME_BYTE_STRINGP, "bytes?") \
1171
1172 GEN_BYTE_STRING_COMP(byte_string_eq, "bytes=?", mz_strcmp, ==)
1173 GEN_BYTE_STRING_COMP(byte_string_lt, "bytes<?", mz_strcmp, <)
1174 GEN_BYTE_STRING_COMP(byte_string_gt, "bytes>?", mz_strcmp, >)
1175
1176 GEN_BYTE_STRING_PATH_COMP(path_lt, "path<?", mz_strcmp, <, SCHEME_PATHP, "path?")
1177
scheme_byte_string_eq_2(Scheme_Object * str1,Scheme_Object * str2)1178 Scheme_Object *scheme_byte_string_eq_2(Scheme_Object *str1, Scheme_Object *str2)
1179 {
1180 Scheme_Object *a[2];
1181 a[0] = str1;
1182 a[1] = str2;
1183 return byte_string_eq(2, a);
1184 }
1185
1186 /**********************************************************************/
1187 /* byte string <-> char string */
1188 /**********************************************************************/
1189
1190 /************************* bytes->string *************************/
1191
1192 static Scheme_Object *
do_byte_string_to_char_string(const char * who,Scheme_Object * bstr,intptr_t istart,intptr_t ifinish,int perm,int as_locale)1193 do_byte_string_to_char_string(const char *who,
1194 Scheme_Object *bstr,
1195 intptr_t istart, intptr_t ifinish,
1196 int perm, int as_locale)
1197 {
1198 int i, ulen;
1199 char *chars;
1200 unsigned int *v;
1201
1202 chars = SCHEME_BYTE_STR_VAL(bstr);
1203
1204 ulen = utf8_decode_x((unsigned char *)chars, istart, ifinish,
1205 NULL, 0, -1,
1206 NULL, NULL, 0, 0,
1207 NULL, 0,
1208 (perm > -1) ? 0xD800 : 0, 0);
1209 if (ulen < 0) {
1210 scheme_contract_error(who,
1211 "string is not a well-formed UTF-8 encoding",
1212 "string", 1, bstr,
1213 NULL);
1214 }
1215
1216 v = (unsigned int *)scheme_malloc_atomic((ulen + 1) * sizeof(unsigned int));
1217 utf8_decode_x((unsigned char *)chars, istart, ifinish,
1218 v, 0, -1,
1219 NULL, NULL, 0, 0,
1220 NULL, 0,
1221 (perm > -1) ? 0xD800 : 0, 0);
1222
1223 if (perm > -1) {
1224 for (i = 0; i < ulen; i++) {
1225 if (v[i] == 0xD800)
1226 v[i] = perm;
1227 }
1228 }
1229 v[ulen] = 0;
1230
1231 return scheme_make_sized_char_string(v, ulen, 0);
1232 }
1233
1234 static Scheme_Object *
do_byte_string_to_char_string_locale(const char * who,Scheme_Object * bstr,intptr_t istart,intptr_t ifinish,int perm)1235 do_byte_string_to_char_string_locale(const char *who,
1236 Scheme_Object *bstr,
1237 intptr_t istart, intptr_t ifinish,
1238 int perm)
1239 {
1240 char *us;
1241 intptr_t olen;
1242
1243 reset_locale();
1244
1245 if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on
1246 || !(rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED))
1247 return do_byte_string_to_char_string(who, bstr, istart, ifinish, perm, 1);
1248
1249 if (istart < ifinish) {
1250 int no_cvt;
1251
1252 us = string_to_from_locale(0, SCHEME_BYTE_STR_VAL(bstr),
1253 istart, ifinish - istart,
1254 &olen, perm, &no_cvt);
1255
1256 if (!us) {
1257 if (no_cvt) {
1258 return do_byte_string_to_char_string(who, bstr, istart, ifinish, perm, 1);
1259 } else {
1260 scheme_contract_error(who,
1261 "byte string is not a valid encoding for the current locale",
1262 "byte string", 1, bstr,
1263 NULL);
1264 }
1265 }
1266 ((mzchar *)us)[olen] = 0;
1267 } else {
1268 us = "\0\0\0";
1269 olen = 0;
1270 }
1271
1272 return scheme_make_sized_char_string((mzchar *)us, olen, 0);
1273 }
1274
1275 static Scheme_Object *
do_string_to_vector(const char * who,int mode,int argc,Scheme_Object * argv[])1276 do_string_to_vector(const char *who, int mode, int argc, Scheme_Object *argv[])
1277 {
1278 int permc;
1279 intptr_t istart, ifinish;
1280
1281 if (!SCHEME_BYTE_STRINGP(argv[0]))
1282 scheme_wrong_contract(who, "bytes?", 0, argc, argv);
1283
1284 if ((argc < 2) || SCHEME_FALSEP(argv[1]))
1285 permc = -1;
1286 else {
1287 if (!SCHEME_CHARP(argv[1]))
1288 scheme_wrong_contract(who, "(or/c char? #f)", 1, argc, argv);
1289 permc = SCHEME_CHAR_VAL(argv[1]);
1290 }
1291
1292 scheme_get_substring_indices(who, argv[0], argc, argv,
1293 2, 3,
1294 &istart, &ifinish);
1295
1296 if (mode == 0)
1297 return do_byte_string_to_char_string(who, argv[0], istart, ifinish, permc, 0);
1298 else if (mode == 1)
1299 return do_byte_string_to_char_string_locale(who, argv[0], istart, ifinish, permc);
1300 else {
1301 /* Latin-1 */
1302 mzchar *us;
1303 unsigned char *s;
1304 intptr_t i, len;
1305 len = ifinish - istart;
1306 s = (unsigned char *)SCHEME_BYTE_STR_VAL(argv[0]);
1307 us = (mzchar *)scheme_malloc_atomic((len + 1) * sizeof(mzchar));
1308 for (i = istart; i < ifinish; i++) {
1309 us[i - istart] = s[i];
1310 }
1311 us[len] = 0;
1312
1313 return scheme_make_sized_char_string(us, len, 0);
1314 }
1315 }
1316
1317
1318 static Scheme_Object *
byte_string_to_char_string(int argc,Scheme_Object * argv[])1319 byte_string_to_char_string (int argc, Scheme_Object *argv[])
1320 {
1321 return do_string_to_vector("bytes->string/utf-8", 0, argc, argv);
1322 }
1323
1324 static Scheme_Object *
byte_string_to_char_string_locale(int argc,Scheme_Object * argv[])1325 byte_string_to_char_string_locale (int argc, Scheme_Object *argv[])
1326 {
1327 return do_string_to_vector("bytes->string/locale", 1, argc, argv);
1328 }
1329
1330 static Scheme_Object *
byte_string_to_char_string_latin1(int argc,Scheme_Object * argv[])1331 byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[])
1332 {
1333 return do_string_to_vector("bytes->string/latin-1", 2, argc, argv);
1334 }
1335
scheme_byte_string_to_char_string(Scheme_Object * o)1336 Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
1337 {
1338 return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD, 0);
1339 }
1340
scheme_byte_string_to_char_string_locale(Scheme_Object * o)1341 Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
1342 {
1343 return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD);
1344 }
1345
1346 /************************* string->bytes *************************/
1347
do_char_string_to_byte_string(Scheme_Object * s,intptr_t istart,intptr_t ifinish,int as_locale)1348 static Scheme_Object *do_char_string_to_byte_string(Scheme_Object *s, intptr_t istart, intptr_t ifinish,
1349 int as_locale)
1350 {
1351 char *bs;
1352 int slen;
1353
1354 slen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(s), istart, ifinish,
1355 NULL, 0,
1356 0 /* UTF-16 */);
1357 bs = (char *)scheme_malloc_atomic(slen + 1);
1358 scheme_utf8_encode(SCHEME_CHAR_STR_VAL(s), istart, ifinish,
1359 (unsigned char *)bs, 0,
1360 0 /* UTF-16 */);
1361 bs[slen] = 0;
1362
1363 return scheme_make_sized_byte_string(bs, slen, 0);
1364 }
1365
1366 static Scheme_Object *
do_char_string_to_byte_string_locale(const char * who,Scheme_Object * cstr,intptr_t istart,intptr_t ifinish,int perm)1367 do_char_string_to_byte_string_locale(const char *who,
1368 Scheme_Object *cstr,
1369 intptr_t istart, intptr_t ifinish,
1370 int perm)
1371 {
1372 char *s;
1373 intptr_t olen;
1374
1375 reset_locale();
1376
1377 if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on
1378 || !(rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED))
1379 return do_char_string_to_byte_string(cstr, istart, ifinish, 1);
1380
1381 if (istart < ifinish) {
1382 int no_cvt;
1383
1384 s = string_to_from_locale(1, (char *)SCHEME_CHAR_STR_VAL(cstr),
1385 istart, ifinish - istart,
1386 &olen, perm, &no_cvt);
1387
1388 if (!s) {
1389 if (no_cvt) {
1390 return do_char_string_to_byte_string(cstr, istart, ifinish, 1);
1391 } else {
1392 scheme_contract_error(who,
1393 "string cannot be encoded for the current locale",
1394 "string", 1, cstr,
1395 NULL);
1396 }
1397 }
1398 s[olen] = 0;
1399 } else {
1400 s = "";
1401 olen = 0;
1402 }
1403
1404 return scheme_make_sized_byte_string(s, olen, 0);
1405 }
1406
1407
scheme_char_string_to_byte_string(Scheme_Object * s)1408 Scheme_Object *scheme_char_string_to_byte_string(Scheme_Object *s)
1409 {
1410 return do_char_string_to_byte_string(s, 0, SCHEME_CHAR_STRLEN_VAL(s), 0);
1411 }
1412
scheme_char_string_to_byte_string_locale(Scheme_Object * s)1413 Scheme_Object *scheme_char_string_to_byte_string_locale(Scheme_Object *s)
1414 {
1415 return do_char_string_to_byte_string_locale("s->s", s, 0, SCHEME_CHAR_STRLEN_VAL(s), '?');
1416 }
1417
do_chars_to_bytes(const char * who,int mode,int argc,Scheme_Object * argv[])1418 static Scheme_Object *do_chars_to_bytes(const char *who, int mode,
1419 int argc, Scheme_Object *argv[])
1420 {
1421 intptr_t istart, ifinish;
1422 int permc;
1423
1424 if (!SCHEME_CHAR_STRINGP(argv[0]))
1425 scheme_wrong_contract(who, "string?", 0, argc, argv);
1426
1427 if ((argc < 2) || SCHEME_FALSEP(argv[1]))
1428 permc = -1;
1429 else {
1430 if (!SCHEME_BYTEP(argv[1]))
1431 scheme_wrong_contract(who, "(or/c byte? #f)", 1, argc, argv);
1432 permc = SCHEME_INT_VAL(argv[1]);
1433 }
1434
1435 scheme_get_substring_indices(who, argv[0], argc, argv,
1436 2, 3, &istart, &ifinish);
1437
1438 if (mode == 1)
1439 return do_char_string_to_byte_string_locale(who, argv[0], istart, ifinish, permc);
1440 else if (mode == 0)
1441 return do_char_string_to_byte_string(argv[0], istart, ifinish, 0);
1442 else {
1443 /* Latin-1 */
1444 mzchar *us;
1445 unsigned char *s;
1446 intptr_t i, len;
1447 len = ifinish - istart;
1448 us = SCHEME_CHAR_STR_VAL(argv[0]);
1449 s = (unsigned char *)scheme_malloc_atomic(len + 1);
1450 for (i = istart; i < ifinish; i++) {
1451 if (us[i] < 256)
1452 s[i - istart] = us[i];
1453 else if (permc >= 0) {
1454 s[i - istart] = permc;
1455 } else {
1456 scheme_contract_error(who,
1457 "string cannot be encoded in Latin-1",
1458 "string", 1, argv[0],
1459 NULL);
1460 }
1461 }
1462 s[len] = 0;
1463
1464 return scheme_make_sized_byte_string((char *)s, len, 0);
1465 }
1466 }
1467
char_string_to_byte_string(int argc,Scheme_Object * argv[])1468 static Scheme_Object *char_string_to_byte_string(int argc, Scheme_Object *argv[])
1469 {
1470 return do_chars_to_bytes("string->bytes/utf-8", 0, argc, argv);
1471 }
1472
char_string_to_byte_string_locale(int argc,Scheme_Object * argv[])1473 static Scheme_Object *char_string_to_byte_string_locale(int argc, Scheme_Object *argv[])
1474 {
1475 return do_chars_to_bytes("string->bytes/locale", 1, argc, argv);
1476 }
1477
char_string_to_byte_string_latin1(int argc,Scheme_Object * argv[])1478 static Scheme_Object *char_string_to_byte_string_latin1(int argc, Scheme_Object *argv[])
1479 {
1480 return do_chars_to_bytes("string->bytes/latin-1", 2, argc, argv);
1481 }
1482
1483 /************************* Other *************************/
1484
char_string_utf8_length(int argc,Scheme_Object * argv[])1485 static Scheme_Object *char_string_utf8_length (int argc, Scheme_Object *argv[])
1486 {
1487 intptr_t istart, ifinish, len;
1488
1489 if (!SCHEME_CHAR_STRINGP(argv[0]))
1490 scheme_wrong_contract("string-utf-8-length", "string?", 0, argc, argv);
1491
1492 scheme_get_substring_indices("string-utf-8-length", argv[0], argc, argv,
1493 1, 2, &istart, &ifinish);
1494
1495 len = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[0]), istart, ifinish,
1496 NULL, 0, 0);
1497
1498 return scheme_make_integer(len);
1499 }
1500
1501 static Scheme_Object *
byte_string_utf8_length(int argc,Scheme_Object * argv[])1502 byte_string_utf8_length (int argc, Scheme_Object *argv[])
1503 {
1504 int len, perm;
1505 intptr_t istart, ifinish;
1506 char *chars;
1507
1508 if (!SCHEME_BYTE_STRINGP(argv[0]))
1509 scheme_wrong_contract("bytes-utf-8-length", "string?", 0, argc, argv);
1510
1511 chars = SCHEME_BYTE_STR_VAL(argv[0]);
1512
1513 if ((argc > 1) && !SCHEME_FALSEP(argv[1])) {
1514 if (!SCHEME_CHARP(argv[1]))
1515 scheme_wrong_contract("bytes-utf-8-length", "(or/c char? #f)", 1, argc, argv);
1516 perm = 1;
1517 } else
1518 perm = 0;
1519
1520 scheme_get_substring_indices("bytes-utf-8-length", argv[0], argc, argv,
1521 2, 3,
1522 &istart, &ifinish);
1523
1524 len = scheme_utf8_decode((unsigned char *)chars, istart, ifinish,
1525 NULL, 0, -1,
1526 NULL, 0, perm);
1527
1528 if (len < 0)
1529 return scheme_false;
1530 else
1531 return scheme_make_integer(len);
1532 }
1533
1534 static Scheme_Object *
byte_string_utf8_index(int argc,Scheme_Object * argv[])1535 byte_string_utf8_index(int argc, Scheme_Object *argv[])
1536 {
1537 intptr_t istart, ifinish, pos = -1, opos, ipos;
1538 int result, perm;
1539 char *chars;
1540
1541 if (!SCHEME_BYTE_STRINGP(argv[0]))
1542 scheme_wrong_contract("bytes-utf-8-index", "bytes?", 0, argc, argv);
1543
1544 chars = SCHEME_BYTE_STR_VAL(argv[0]);
1545
1546 if (SCHEME_INTP(argv[1])) {
1547 pos = SCHEME_INT_VAL(argv[1]);
1548 } else if (SCHEME_BIGNUMP(argv[1])) {
1549 if (SCHEME_BIGPOS(argv[1]))
1550 pos = 0x7FFFFFFF;
1551 }
1552
1553 if (pos < 0) {
1554 scheme_wrong_contract("bytes-utf-8-index", "exact-nonnegative-integer?", 1, argc, argv);
1555 }
1556
1557 if ((argc > 2) && !SCHEME_FALSEP(argv[2])) {
1558 if (!SCHEME_CHARP(argv[2]))
1559 scheme_wrong_contract("bytes-utf-8-index", "(or/c char? #f)", 1, argc, argv);
1560 perm = 1;
1561 } else
1562 perm = 0;
1563
1564 scheme_get_substring_indices("bytes-utf-8-index", argv[0], argc, argv,
1565 3, 4,
1566 &istart, &ifinish);
1567
1568 result = utf8_decode_x((unsigned char *)chars, istart, ifinish,
1569 NULL, 0, pos,
1570 &ipos, &opos,
1571 0, 0, NULL, 0, perm ? 1 : 0, 0);
1572
1573 if (((result < 0) && (result != -3))
1574 || ((ipos == ifinish) && (opos <= pos)))
1575 return scheme_false;
1576 else
1577 return scheme_make_integer(ipos);
1578 }
1579
1580 static Scheme_Object *
byte_string_utf8_ref(int argc,Scheme_Object * argv[])1581 byte_string_utf8_ref(int argc, Scheme_Object *argv[])
1582 {
1583 intptr_t istart, ifinish, pos = -1, opos, ipos;
1584 char *chars;
1585 unsigned int us[1];
1586 Scheme_Object *perm;
1587
1588 if (!SCHEME_BYTE_STRINGP(argv[0]))
1589 scheme_wrong_contract("bytes-utf-8-ref", "bytes?", 0, argc, argv);
1590
1591 chars = SCHEME_BYTE_STR_VAL(argv[0]);
1592
1593 if (SCHEME_INTP(argv[1])) {
1594 pos = SCHEME_INT_VAL(argv[1]);
1595 } else if (SCHEME_BIGNUMP(argv[1])) {
1596 if (SCHEME_BIGPOS(argv[1]))
1597 pos = 0x7FFFFFFF;
1598 }
1599
1600 if (pos < 0) {
1601 scheme_wrong_contract("bytes-utf-8-ref", "exact-nonnegative-integer?", 1, argc, argv);
1602 }
1603
1604 if ((argc > 2) && !SCHEME_FALSEP(argv[2])) {
1605 if (!SCHEME_CHARP(argv[2]))
1606 scheme_wrong_contract("bytes-utf-8-ref", "(or/c char? #f)", 1, argc, argv);
1607 perm = argv[2];
1608 } else
1609 perm = 0;
1610
1611 scheme_get_substring_indices("bytes-utf-8-ref", argv[0], argc, argv,
1612 3, 4,
1613 &istart, &ifinish);
1614
1615 if (pos > 0) {
1616 utf8_decode_x((unsigned char *)chars, istart, ifinish,
1617 NULL, 0, pos,
1618 &ipos, &opos,
1619 0, 0, NULL, 0, perm ? 1 : 0, 0);
1620 if (opos < pos)
1621 return scheme_false;
1622 istart = ipos;
1623 }
1624
1625 utf8_decode_x((unsigned char *)chars, istart, ifinish,
1626 us, 0, 1,
1627 &ipos, &opos,
1628 0, 0, NULL, 0, perm ? 0xFFFFFF : 0, 0);
1629
1630 if (opos < 1)
1631 return scheme_false;
1632 else if (us[0] == 0xFFFFFF)
1633 return perm;
1634 else
1635 return scheme_make_character(us[0]);
1636 }
1637
1638 /********************************************************************/
1639 /* format */
1640 /********************************************************************/
1641
scheme_do_format(const char * procname,Scheme_Object * port,const mzchar * format,int flen,int fpos,int offset,int argc,Scheme_Object ** argv)1642 void scheme_do_format(const char *procname, Scheme_Object *port,
1643 const mzchar *format, int flen,
1644 int fpos, int offset, int argc, Scheme_Object **argv)
1645 {
1646 int i, start, end;
1647 int used = offset;
1648 int num_err = 0, char_err = 0, end_ok = 0;
1649 Scheme_Object *a[2];
1650
1651 if (!format) {
1652 if (!SCHEME_CHAR_STRINGP(argv[fpos])) {
1653 scheme_wrong_contract(procname, "string?", fpos, argc, argv);
1654 return;
1655 }
1656 format = SCHEME_CHAR_STR_VAL(argv[fpos]);
1657 flen = SCHEME_CHAR_STRTAG_VAL(argv[fpos]);
1658 } else if (flen == -1)
1659 flen = strlen((char *)format);
1660
1661 /* Check string first: */
1662 end = flen - 1;
1663 for (i = 0; i < end; i++) {
1664 if (format[i] == '~') {
1665 i++;
1666 if (scheme_isspace(format[i])) {
1667 /* skip spaces... */
1668 } else switch (format[i]) {
1669 case '~':
1670 if (i == end)
1671 end_ok = 1;
1672 break;
1673 case '%':
1674 case 'n':
1675 case 'N':
1676 break;
1677 case 'a':
1678 case 'A':
1679 case 's':
1680 case 'S':
1681 case 'v':
1682 case 'V':
1683 case 'e':
1684 case 'E':
1685 used++;
1686 break;
1687 case '.':
1688 switch (format[i+1]) {
1689 case 'a':
1690 case 'A':
1691 case 's':
1692 case 'S':
1693 case 'v':
1694 case 'V':
1695 break;
1696 default:
1697 scheme_contract_error(procname,
1698 "ill-formed pattern string",
1699 "explanation", 0, "tag `~.' not followed by `a', `s', or `v'",
1700 "pattern string", 1, argv[fpos],
1701 NULL);
1702 break;
1703 }
1704 used++;
1705 break;
1706 case 'x':
1707 case 'X':
1708 case 'o':
1709 case 'O':
1710 case 'b':
1711 case 'B':
1712 if (!num_err && !char_err && (used < argc)) {
1713 Scheme_Object *o = argv[used];
1714 if (!SCHEME_EXACT_REALP(o)
1715 && (!SCHEME_COMPLEXP(o)
1716 || !SCHEME_EXACT_REALP(scheme_complex_real_part(o))))
1717 num_err = used + 1;
1718 }
1719 used++;
1720 break;
1721 case 'c':
1722 case 'C':
1723 if (!num_err && !char_err && (used < argc)) {
1724 if (!SCHEME_CHARP(argv[used]))
1725 char_err = used + 1;
1726 }
1727 used++;
1728 break;
1729 default:
1730 {
1731 char buffer[64];
1732 sprintf(buffer, "tag `~%c' not allowed", format[i]);
1733 scheme_contract_error(procname,
1734 "ill-formed pattern string",
1735 "explanation", 0, buffer,
1736 "pattern string", 1, argv[fpos],
1737 NULL);
1738 return;
1739 }
1740 }
1741 }
1742 }
1743 if ((format[end] == '~') && !end_ok) {
1744 scheme_contract_error(procname,
1745 "ill-formed pattern string",
1746 "explanation", 0, "cannot end in `~'",
1747 "pattern string", 1, argv[fpos],
1748 NULL);
1749 return;
1750 }
1751 if (used != argc) {
1752 char *args;
1753 intptr_t alen;
1754
1755 args = scheme_make_args_string("", -1, argc, argv, &alen);
1756
1757 if (used > argc) {
1758 scheme_raise_exn(MZEXN_FAIL_CONTRACT,
1759 "%s: format string requires %d arguments, given %d%t",
1760 procname, used - offset, argc - offset, args, alen);
1761 } else {
1762 scheme_raise_exn(MZEXN_FAIL_CONTRACT,
1763 "%s: format string requires %d arguments, given %d%t",
1764 procname, used - offset, argc - offset, args, alen);
1765 }
1766 return;
1767 }
1768 if (num_err || char_err) {
1769 int pos = (num_err ? num_err : char_err) - 1;
1770 char *args, *bstr;
1771 intptr_t alen;
1772 intptr_t blen;
1773 char *type = (num_err ? "exact-number" : "character");
1774 Scheme_Object *bad = argv[pos];
1775
1776 args = scheme_make_args_string("other ", pos, argc, argv, &alen);
1777 bstr = scheme_make_provided_string(bad, 1, &blen);
1778 scheme_raise_exn(MZEXN_FAIL_CONTRACT,
1779 "%s: format string requires argument of type <%s>, given %t%t",
1780 procname, type,
1781 bstr, blen,
1782 args, alen);
1783 return;
1784 }
1785
1786 for (used = offset, i = start = 0; i < flen; i++) {
1787 if (format[i] == '~') {
1788 if (start < i) {
1789 (void)scheme_put_char_string(procname, port, format, start, i - start);
1790 }
1791 i++;
1792 if (scheme_isspace(format[i])) {
1793 /* skip spaces (at most one newline) */
1794 do {
1795 if ((format[i] == '\n') || (format[i] == '\r')) {
1796 /* got one */
1797 if ((format[i] == '\r') && (format[i + 1] == '\n'))
1798 i++; /* Windows-style CR-NL */
1799 i++;
1800 while (portable_isspace(format[i])
1801 && !((format[i] == '\n') || (format[i] == '\r'))) {
1802 i++;
1803 }
1804 break;
1805 } else
1806 i++;
1807 } while (scheme_isspace(format[i]));
1808 --i; /* back up over something */
1809 } else switch (format[i]) {
1810 case '~':
1811 scheme_write_byte_string("~", 1, port);
1812 break;
1813 case '%':
1814 case 'n':
1815 case 'N':
1816 scheme_write_byte_string("\n", 1, port);
1817 break;
1818 case 'c':
1819 case 'C':
1820 case 'a':
1821 case 'A':
1822 a[0] = argv[used++];
1823 a[1] = port;
1824 _scheme_apply(scheme_display_proc, 2, a);
1825 break;
1826 case 's':
1827 case 'S':
1828 a[0] = argv[used++];
1829 a[1] = port;
1830 _scheme_apply(scheme_write_proc, 2, a);
1831 break;
1832 case 'v':
1833 case 'V':
1834 a[0] = argv[used++];
1835 a[1] = port;
1836 _scheme_apply(scheme_print_proc, 2, a);
1837 break;
1838 case 'e':
1839 case 'E':
1840 {
1841 intptr_t len;
1842 char *s;
1843 s = scheme_make_provided_string(argv[used++], 0, &len);
1844 scheme_write_byte_string(s, len, port);
1845 }
1846 break;
1847 case '.':
1848 {
1849 intptr_t len;
1850 char *s;
1851 len = scheme_get_print_width();
1852 i++;
1853 switch (format[i]) {
1854 case 'a':
1855 case 'A':
1856 s = scheme_display_to_string_w_max(argv[used++], &len, len);
1857 break;
1858 case 's':
1859 case 'S':
1860 s = scheme_write_to_string_w_max(argv[used++], &len, len);
1861 break;
1862 case 'v':
1863 case 'V':
1864 s = scheme_print_to_string_w_max(argv[used++], &len, len);
1865 break;
1866 default:
1867 s = "???";
1868 len = 3;
1869 }
1870 scheme_write_byte_string(s, len, port);
1871 }
1872 break;
1873 case 'x':
1874 case 'X':
1875 case 'o':
1876 case 'O':
1877 case 'b':
1878 case 'B':
1879 {
1880 char *s;
1881 int radix;
1882
1883 switch(format[i]) {
1884 case 'x':
1885 case 'X':
1886 radix = 16;
1887 break;
1888 case 'o':
1889 case 'O':
1890 radix = 8;
1891 break;
1892 default:
1893 case 'b':
1894 case 'B':
1895 radix = 2;
1896 break;
1897 }
1898 s = scheme_number_to_string(radix, argv[used++]);
1899
1900 scheme_write_byte_string(s, strlen(s), port);
1901 }
1902 break;
1903 }
1904 SCHEME_USE_FUEL(1);
1905 start = i + 1;
1906 }
1907 }
1908
1909 SCHEME_USE_FUEL(flen);
1910
1911 if (start < i) {
1912 (void)scheme_put_char_string(procname, port, format, start, i - start);
1913 }
1914 }
1915
scheme_format(mzchar * format,int flen,int argc,Scheme_Object ** argv,intptr_t * rlen)1916 char *scheme_format(mzchar *format, int flen, int argc, Scheme_Object **argv, intptr_t *rlen)
1917 {
1918 Scheme_Object *port;
1919 port = scheme_make_byte_string_output_port();
1920 scheme_do_format("format", port, format, flen, 0, 0, argc, argv);
1921 return scheme_get_sized_byte_string_output(port, rlen);
1922 }
1923
scheme_printf(mzchar * format,int flen,int argc,Scheme_Object ** argv)1924 void scheme_printf(mzchar *format, int flen, int argc, Scheme_Object **argv)
1925 {
1926 scheme_do_format("printf", scheme_get_param(scheme_current_config(), MZCONFIG_OUTPUT_PORT),
1927 format, flen, 0, 0, argc, argv);
1928 }
1929
scheme_format_utf8(char * format,int flen,int argc,Scheme_Object ** argv,intptr_t * rlen)1930 char *scheme_format_utf8(char *format, int flen, int argc, Scheme_Object **argv, intptr_t *rlen)
1931 {
1932 mzchar *s;
1933 intptr_t srlen;
1934 if (flen == -1)
1935 flen = strlen(format);
1936 s = scheme_utf8_decode_to_buffer_len((unsigned char *)format, flen, NULL, 0, &srlen);
1937 if (s)
1938 return scheme_format(s, srlen, argc, argv, rlen);
1939 else
1940 return "";
1941 }
1942
scheme_printf_utf8(char * format,int flen,int argc,Scheme_Object ** argv)1943 void scheme_printf_utf8(char *format, int flen, int argc, Scheme_Object **argv)
1944 {
1945 mzchar *s;
1946 intptr_t srlen;
1947 if (flen == -1)
1948 flen = strlen(format);
1949 s = scheme_utf8_decode_to_buffer_len((unsigned char *)format, flen, NULL, 0, &srlen);
1950 if (s)
1951 scheme_printf(s, srlen, argc, argv);
1952 }
1953
1954
1955 static Scheme_Object *
format(int argc,Scheme_Object * argv[])1956 format(int argc, Scheme_Object *argv[])
1957 {
1958 Scheme_Object *port;
1959 char *s;
1960 intptr_t len;
1961
1962 port = scheme_make_byte_string_output_port();
1963
1964 scheme_do_format("format", port, NULL, 0, 0, 1, argc, argv);
1965
1966 s = scheme_get_sized_byte_string_output(port, &len);
1967 return scheme_make_sized_utf8_string(s, len);
1968 }
1969
1970 #ifdef INSTRUMENT_PRIMITIVES
1971 extern int g_print_prims;
1972 #endif
1973
1974 static Scheme_Object *
sch_printf(int argc,Scheme_Object * argv[])1975 sch_printf(int argc, Scheme_Object *argv[])
1976 {
1977 scheme_do_format("printf", scheme_get_param(scheme_current_config(), MZCONFIG_OUTPUT_PORT),
1978 NULL, 0, 0, 1, argc, argv);
1979 return scheme_void;
1980 }
1981
1982 static Scheme_Object *
sch_eprintf(int argc,Scheme_Object * argv[])1983 sch_eprintf(int argc, Scheme_Object *argv[])
1984 {
1985 scheme_do_format("eprintf", scheme_get_param(scheme_current_config(), MZCONFIG_ERROR_PORT),
1986 NULL, 0, 0, 1, argc, argv);
1987 return scheme_void;
1988 }
1989
1990 static Scheme_Object *
sch_fprintf(int argc,Scheme_Object * argv[])1991 sch_fprintf(int argc, Scheme_Object *argv[])
1992 {
1993 if (!SCHEME_OUTPUT_PORTP(argv[0]))
1994 scheme_wrong_contract("fprintf", "output-port?", 0, argc, argv);
1995
1996 scheme_do_format("fprintf", argv[0], NULL, 0, 1, 2, argc, argv);
1997 return scheme_void;
1998 }
1999
2000 /********************************************************************/
2001 /* misc */
2002 /********************************************************************/
2003
2004 static Scheme_Object *
version(int argc,Scheme_Object * argv[])2005 version(int argc, Scheme_Object *argv[])
2006 {
2007 return vers_str;
2008 }
2009
2010 static Scheme_Object *
banner(int argc,Scheme_Object * argv[])2011 banner(int argc, Scheme_Object *argv[])
2012 {
2013 return banner_str;
2014 }
2015
scheme_version(void)2016 char *scheme_version(void)
2017 {
2018 return MZSCHEME_VERSION;
2019 }
2020
2021 #ifdef MZ_PRECISE_GC
2022 # define VERSION_SUFFIX " [bc]"
2023 #else
2024 # ifdef USE_SENORA_GC
2025 # define VERSION_SUFFIX " [cgc]"
2026 # else
2027 # define VERSION_SUFFIX " [cgc/b]"
2028 # endif
2029 #endif
2030
scheme_banner(void)2031 char *scheme_banner(void)
2032 {
2033 if (embedding_banner)
2034 return embedding_banner;
2035 else
2036 return ("Welcome to Racket"
2037 " v" MZSCHEME_VERSION VERSION_SUFFIX
2038 ".\n");
2039 }
2040
scheme_set_banner(char * s)2041 void scheme_set_banner(char *s)
2042 {
2043 embedding_banner = s;
2044 }
2045
scheme_byte_string_has_null(Scheme_Object * o)2046 int scheme_byte_string_has_null(Scheme_Object *o)
2047 {
2048 const char *s = SCHEME_BYTE_STR_VAL(o);
2049 int i = SCHEME_BYTE_STRTAG_VAL(o);
2050 while (i--) {
2051 if (!s[i])
2052 return 1;
2053 }
2054 return 0;
2055 }
2056
scheme_any_string_has_null(Scheme_Object * o)2057 int scheme_any_string_has_null(Scheme_Object *o)
2058 {
2059 if (SCHEME_BYTE_STRINGP(o))
2060 return scheme_byte_string_has_null(o);
2061 else {
2062 const mzchar *s = SCHEME_CHAR_STR_VAL(o);
2063 int i = SCHEME_CHAR_STRTAG_VAL(o);
2064 while (i--) {
2065 if (!s[i])
2066 return 1;
2067 }
2068 return 0;
2069 }
2070 }
2071
2072 /***********************************************************************/
2073 /* Environment Variables */
2074 /***********************************************************************/
2075
2076 /* A `scheme_environment_variables_type` record wraps a hash table
2077 that maps normalized keys to (cons <key> <val>), where the key
2078 in the pair preserves its original case. */
2079
2080 #define SCHEME_ENVVARS_TABLE(ev) ((Scheme_Hash_Tree *)SCHEME_PTR_VAL(ev))
2081
scheme_make_environment_variables(Scheme_Hash_Tree * ht)2082 Scheme_Object *scheme_make_environment_variables(Scheme_Hash_Tree *ht)
2083 {
2084 Scheme_Object *ev;
2085
2086 ev = scheme_alloc_small_object();
2087 ev->type = scheme_environment_variables_type;
2088 SCHEME_PTR_VAL(ev) = (Scheme_Object *)ht;
2089
2090 return ev;
2091 }
2092
env_p(int argc,Scheme_Object * argv[])2093 static Scheme_Object *env_p(int argc, Scheme_Object *argv[])
2094 {
2095 return (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_environment_variables_type)
2096 ? scheme_true
2097 : scheme_false);
2098 }
2099
current_environment_variables(int argc,Scheme_Object * argv[])2100 static Scheme_Object *current_environment_variables(int argc, Scheme_Object *argv[])
2101 {
2102 Scheme_Object *v;
2103
2104 v = scheme_param_config2("current-environment-variables",
2105 scheme_make_integer(MZCONFIG_CURRENT_ENV_VARS),
2106 argc, argv,
2107 -1, env_p, "environment-variables?", 0);
2108
2109 return v;
2110 }
2111
2112 static int sch_bool_getenv(const char* name);
2113
scheme_init_getenv(void)2114 void scheme_init_getenv(void)
2115 {
2116 if (sch_bool_getenv("PLTNOMZJIT")) {
2117 scheme_set_startup_use_jit(0);
2118 }
2119 if (sch_bool_getenv("PLT_SHOW_BUILTIN_CONTEXT")) {
2120 scheme_keep_builtin_context = 1;
2121 }
2122 }
2123
sch_bool_getenv(const char * name)2124 static int sch_bool_getenv(const char* name)
2125 {
2126 if (rktio_getenv(scheme_rktio, name))
2127 return 1;
2128 else
2129 return 0;
2130 }
2131
byte_string_ok_name(Scheme_Object * o)2132 int byte_string_ok_name(Scheme_Object *o)
2133 {
2134 const char *s = SCHEME_BYTE_STR_VAL(o);
2135 int i = SCHEME_BYTE_STRTAG_VAL(o);
2136
2137 while (i--) {
2138 if (!s[i])
2139 return 0;
2140 }
2141
2142 return rktio_is_ok_envvar_name(scheme_rktio, s);
2143
2144 return 1;
2145 }
2146
normalize_env_case(Scheme_Object * bs)2147 static Scheme_Object *normalize_env_case(Scheme_Object *bs)
2148 {
2149 if (rktio_are_envvar_names_case_insensitive(scheme_rktio)) {
2150 bs = scheme_byte_string_to_char_string(bs);
2151 bs = string_locale_downcase(1, &bs);
2152 bs = scheme_char_string_to_byte_string(bs);
2153 }
2154 return bs;
2155 }
2156
scheme_getenv(char * name)2157 char *scheme_getenv(char *name)
2158 {
2159 char *s;
2160 s = rktio_getenv(scheme_rktio, name);
2161 if (s)
2162 return scheme_strdup_and_free(s);
2163 else
2164 return NULL;
2165 }
2166
sch_getenv(int argc,Scheme_Object * argv[])2167 static Scheme_Object *sch_getenv(int argc, Scheme_Object *argv[])
2168 {
2169 char *name;
2170 char *value;
2171 Scheme_Object *bs, *ev, *val;
2172 Scheme_Hash_Tree *ht;
2173
2174 if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_environment_variables_type))
2175 scheme_wrong_contract("environment-variables-ref", "environment-variables?", 0, argc, argv);
2176
2177 bs = argv[1];
2178 if (!SCHEME_BYTE_STRINGP(bs)
2179 || !byte_string_ok_name(bs))
2180 scheme_wrong_contract("environment-variables-ref", "bytes-environment-variable-name?", 1, argc, argv);
2181
2182 ev = argv[0];
2183 ht = SCHEME_ENVVARS_TABLE(ev);
2184
2185 if (!ht) {
2186 name = SCHEME_BYTE_STR_VAL(bs);
2187
2188 value = rktio_getenv(scheme_rktio, name);
2189 if (value) {
2190 val = scheme_make_byte_string(value);
2191 free(value);
2192 } else
2193 val = scheme_false;
2194
2195 return val;
2196 } else {
2197 bs = normalize_env_case(bs);
2198 val = scheme_hash_tree_get(ht, bs);
2199 return val ? SCHEME_CDR(val) : scheme_false;
2200 }
2201 }
2202
sch_putenv(int argc,Scheme_Object * argv[])2203 static Scheme_Object *sch_putenv(int argc, Scheme_Object *argv[])
2204 {
2205 Scheme_Object *varbs, *valbs, *norm_varbs, *ev;
2206 Scheme_Hash_Tree *ht;
2207 char *var;
2208 char *val;
2209
2210 if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_environment_variables_type))
2211 scheme_wrong_contract("environment-variables-set!", "environment-variables?", 0, argc, argv);
2212
2213 varbs = argv[1];
2214 if (!SCHEME_BYTE_STRINGP(varbs)
2215 || !byte_string_ok_name(varbs))
2216 scheme_wrong_contract("environment-variables-set!", "bytes-environment-variable-name?", 1, argc, argv);
2217
2218 valbs = argv[2];
2219 if (!SCHEME_FALSEP(valbs)
2220 && (!SCHEME_BYTE_STRINGP(valbs)
2221 || scheme_byte_string_has_null(valbs)))
2222 scheme_wrong_contract("environment-variables-set!", "(or/c bytes-no-nuls? #f)", 2, argc, argv);
2223 if (argc > 3)
2224 scheme_check_proc_arity("environment-variables-set!", 0, 3, argc, argv);
2225
2226 ev = argv[0];
2227 ht = SCHEME_ENVVARS_TABLE(ev);
2228
2229 if (ht) {
2230 norm_varbs = normalize_env_case(varbs);
2231
2232 if (SCHEME_FALSEP(valbs)) {
2233 ht = scheme_hash_tree_set(ht, norm_varbs, NULL);
2234 } else {
2235 if (SAME_OBJ(varbs, norm_varbs)) {
2236 varbs = byte_string_to_immutable(1, &varbs);
2237 norm_varbs = varbs;
2238 } else {
2239 varbs = byte_string_to_immutable(1, &varbs);
2240 norm_varbs = byte_string_to_immutable(1, &norm_varbs);
2241 }
2242 valbs = byte_string_to_immutable(1, &valbs);
2243 ht = scheme_hash_tree_set(ht, norm_varbs, scheme_make_pair(varbs, valbs));
2244 }
2245
2246 SCHEME_PTR_VAL(ev) = (Scheme_Object *)ht;
2247
2248 return scheme_void;
2249 } else {
2250 var = SCHEME_BYTE_STR_VAL(varbs);
2251
2252 if (SCHEME_FALSEP(valbs)) {
2253 val = NULL;
2254 } else {
2255 val = SCHEME_BYTE_STR_VAL(valbs);
2256 }
2257
2258 if (!rktio_setenv(scheme_rktio, var, val)) {
2259 if (argc > 3)
2260 return _scheme_tail_apply(argv[3], 0, NULL);
2261 else {
2262 scheme_raise_exn(MZEXN_FAIL,
2263 "environment-variables-set!: change failed\n"
2264 " system error: %R");
2265 }
2266 }
2267
2268 return scheme_void;
2269 }
2270 }
2271
env_copy(int argc,Scheme_Object * argv[])2272 static Scheme_Object *env_copy(int argc, Scheme_Object *argv[])
2273 {
2274 Scheme_Hash_Tree *ht;
2275
2276 if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_environment_variables_type))
2277 scheme_wrong_contract("environment-variables-copy", "environment-variables?", 0, argc, argv);
2278
2279 ht = SCHEME_ENVVARS_TABLE(argv[0]);
2280 if (ht)
2281 return scheme_make_environment_variables(ht);
2282
2283 /* copy system environment variables into a hash table: */
2284 ht = scheme_make_hash_tree(SCHEME_hashtr_equal);
2285
2286 {
2287 intptr_t i;
2288 rktio_envvars_t *envvars;
2289 Scheme_Object *var, *val, *norm_var;
2290
2291 envvars = rktio_envvars(scheme_rktio);
2292 for (i = rktio_envvars_count(scheme_rktio, envvars); i--; ) {
2293 var = scheme_make_immutable_sized_byte_string(rktio_envvars_name_ref(scheme_rktio, envvars, i), -1, 1);
2294 val = scheme_make_immutable_sized_byte_string(rktio_envvars_value_ref(scheme_rktio, envvars, i), -1, 1);
2295 norm_var = normalize_env_case(var);
2296 if (!SAME_OBJ(var, norm_var))
2297 norm_var = byte_string_to_immutable(1, &norm_var);
2298 ht = scheme_hash_tree_set(ht, norm_var, scheme_make_pair(var, val));
2299 }
2300
2301 rktio_envvars_free(scheme_rktio, envvars);
2302 }
2303
2304 return scheme_make_environment_variables(ht);
2305 }
2306
env_make(int argc,Scheme_Object * argv[])2307 static Scheme_Object *env_make(int argc, Scheme_Object *argv[])
2308 {
2309 Scheme_Hash_Tree *ht;
2310 Scheme_Object *varbs, *valbs, *norm_varbs;
2311 int i;
2312
2313 ht = scheme_make_hash_tree(SCHEME_hashtr_equal);
2314
2315 for (i = 0; i < argc; i += 2) {
2316 varbs = argv[i];
2317 if (!SCHEME_BYTE_STRINGP(varbs)
2318 || !byte_string_ok_name(varbs))
2319 scheme_wrong_contract("make-environment-variables", "bytes-environment-variable-name?", i, argc, argv);
2320
2321 if (i+1 >= argc) {
2322 scheme_contract_error("make-environment-variables",
2323 "key does not have a value (i.e., an odd number of arguments were provided)",
2324 "key", 1, argv[i],
2325 NULL);
2326 return NULL;
2327 }
2328
2329 valbs = argv[i+1];
2330 if (!SCHEME_FALSEP(valbs)
2331 && (!SCHEME_BYTE_STRINGP(valbs)
2332 || scheme_byte_string_has_null(valbs)))
2333 scheme_wrong_contract("make-environment-variables", "(or/c bytes-no-nuls? #f)", i+1, argc, argv);
2334
2335 varbs = byte_string_to_immutable(1, &varbs);
2336 valbs = byte_string_to_immutable(1, &valbs);
2337 norm_varbs = normalize_env_case(varbs);
2338 if (!SAME_OBJ(varbs, norm_varbs))
2339 norm_varbs = byte_string_to_immutable(1, &norm_varbs);
2340 ht = scheme_hash_tree_set(ht, norm_varbs, scheme_make_pair(varbs, valbs));
2341 }
2342
2343 return scheme_make_environment_variables(ht);
2344 }
2345
sch_getenv_names(int argc,Scheme_Object * argv[])2346 static Scheme_Object *sch_getenv_names(int argc, Scheme_Object *argv[])
2347 {
2348 Scheme_Object *ev, *r = scheme_null, *key, *val;
2349 Scheme_Hash_Tree *ht;
2350 mzlonglong i;
2351
2352 ev = argv[0];
2353 if (!SAME_TYPE(SCHEME_TYPE(ev), scheme_environment_variables_type))
2354 scheme_wrong_contract("environment-variables-names", "environment-variables?", 0, argc, argv);
2355
2356 ht = SCHEME_ENVVARS_TABLE(ev);
2357 if (!ht) {
2358 ev = env_copy(1, argv);
2359 ht = SCHEME_ENVVARS_TABLE(ev);
2360 }
2361
2362 for (i = scheme_hash_tree_next(ht, -1); i != -1; i = scheme_hash_tree_next(ht, i)) {
2363 scheme_hash_tree_index(ht, i, &key, &val);
2364 r = scheme_make_pair(SCHEME_CAR(val), r);
2365 }
2366
2367 return r;
2368 }
2369
scheme_environment_variables_to_envvars(Scheme_Object * ev)2370 rktio_envvars_t *scheme_environment_variables_to_envvars(Scheme_Object *ev)
2371 {
2372 Scheme_Hash_Tree *ht = SCHEME_ENVVARS_TABLE(ev);
2373 rktio_envvars_t *envvars;
2374 mzlonglong i;
2375 Scheme_Object *key, *val;
2376
2377 if (!ht)
2378 return NULL;
2379
2380 envvars = rktio_empty_envvars(scheme_rktio);
2381
2382 for (i = scheme_hash_tree_next(ht, -1); i != -1; i = scheme_hash_tree_next(ht, i)) {
2383 scheme_hash_tree_index(ht, i, &key, &val);
2384
2385 rktio_envvars_set(scheme_rktio,
2386 envvars,
2387 SCHEME_BYTE_STR_VAL(SCHEME_CAR(val)),
2388 SCHEME_BYTE_STR_VAL(SCHEME_CDR(val)));
2389 }
2390
2391 return envvars;
2392 }
2393
2394 /***********************************************************************/
2395 /* End Environment Variables */
2396 /***********************************************************************/
2397
scheme_set_cross_compile_mode(int v)2398 void scheme_set_cross_compile_mode(int v)
2399 {
2400 cross_compile_mode = v;
2401 }
2402
2403 #include "systype.inc"
2404
system_type(int argc,Scheme_Object * argv[])2405 static Scheme_Object *system_type(int argc, Scheme_Object *argv[])
2406 {
2407 if (argc) {
2408 if (SAME_OBJ(argv[0], link_symbol)) {
2409 return scheme_intern_symbol(MZ_SYSTEM_TYPE_LINK);
2410 }
2411
2412 if (SAME_OBJ(argv[0], machine_symbol)) {
2413 char *s;
2414 Scheme_Object *str;
2415
2416 s = rktio_uname(scheme_rktio);
2417 str = scheme_make_locale_string(s);
2418 rktio_free(s);
2419
2420 return str;
2421 }
2422
2423 if (SAME_OBJ(argv[0], gc_symbol)) {
2424 #ifdef MZ_PRECISE_GC
2425 return _3m_symbol;
2426 #else
2427 return cgc_symbol;
2428 #endif
2429 }
2430
2431 if (SAME_OBJ(argv[0], vm_symbol)) {
2432 return racket_symbol;
2433 }
2434
2435 if (SAME_OBJ(argv[0], so_suffix_symbol)) {
2436 return scheme_make_byte_string(MZ_SYSTEM_TYPE_SO_SUFFIX);
2437 }
2438
2439 if (SAME_OBJ(argv[0], so_mode_symbol)) {
2440 return scheme_intern_symbol(MZ_SYSTEM_TYPE_SO_MODE);
2441 }
2442
2443
2444 if (SAME_OBJ(argv[0], word_symbol)) {
2445 return scheme_make_integer(sizeof(void*)*8);
2446 }
2447
2448 if (SAME_OBJ(argv[0], fs_change_symbol)) {
2449 return fs_change_props;
2450 }
2451
2452 if (SAME_OBJ(argv[0], target_machine_symbol)) {
2453 return racket_symbol;
2454 }
2455
2456 if (SAME_OBJ(argv[0], cross_symbol)) {
2457 return (cross_compile_mode ? force_symbol : infer_symbol);
2458 }
2459
2460 if (SAME_OBJ(argv[0], os_star_symbol)) {
2461 return sys_os_symbol;
2462 }
2463
2464 if (SAME_OBJ(argv[0], arch_symbol)) {
2465 return sys_arch_symbol;
2466 }
2467
2468 if (!SAME_OBJ(argv[0], os_symbol)) {
2469 scheme_wrong_contract("system-type",
2470 ("(or/c 'os 'os* 'arch 'word 'link 'machine 'target-machine\n"
2471 " 'vm 'gc 'so-suffix 'so-mode 'word 'fs-change 'cross)"),
2472 0, argc, argv);
2473 return NULL;
2474 }
2475 }
2476
2477 return sys_symbol;
2478 }
2479
system_library_subpath(int argc,Scheme_Object * argv[])2480 static Scheme_Object *system_library_subpath(int argc, Scheme_Object *argv[])
2481 {
2482 if (argc > 0) {
2483 if (SCHEME_FALSEP(argv[0]))
2484 return platform_cgc_path;
2485
2486 if (SAME_OBJ(cgc_symbol, argv[0]))
2487 return platform_cgc_path;
2488
2489 if (SAME_OBJ(_3m_symbol, argv[0]))
2490 return platform_3m_path;
2491
2492 if (SAME_OBJ(cs_symbol, argv[0]))
2493 return platform_cs_path;
2494
2495 scheme_wrong_contract("system-library-subpath", "(or/c 'cgc '3m 'cs #f)", 0, argc, argv);
2496 return NULL;
2497 } else {
2498 #ifdef MZ_PRECISE_GC
2499 return platform_3m_path;
2500 #else
2501 return platform_cgc_path;
2502 #endif
2503 }
2504 }
2505
scheme_system_library_subpath()2506 const char *scheme_system_library_subpath()
2507 {
2508 return SCHEME_PLATFORM_LIBRARY_SUBPATH SPLS_SUFFIX;
2509 }
2510
2511 /* Our own strncpy - which would be really stupid, except the one for
2512 the implementation in Solaris 2.6 is broken (it doesn't always stop
2513 at the null terminator). */
scheme_strncmp(const char * a,const char * b,int len)2514 int scheme_strncmp(const char *a, const char *b, int len)
2515 {
2516 while (len-- && (*a == *b) && *a) {
2517 a++;
2518 b++;
2519 }
2520
2521 if (len < 0)
2522 return 0;
2523 else
2524 return *a - *b;
2525 }
2526
ok_cmdline(int argc,Scheme_Object ** argv)2527 static Scheme_Object *ok_cmdline(int argc, Scheme_Object **argv)
2528 {
2529 if (SCHEME_CHAPERONE_VECTORP(argv[0])) {
2530 Scheme_Object *vec = argv[0], *vec2, *str;
2531 int i, size = SCHEME_CHAPERONE_VEC_SIZE(vec);
2532
2533 if (!size)
2534 return vec;
2535
2536 /* Make sure vector and strings are immutable: */
2537 vec2 = scheme_make_vector(size, NULL);
2538 if (size)
2539 SCHEME_SET_VECTOR_IMMUTABLE(vec2);
2540 for (i = 0; i < size; i++) {
2541 if (SCHEME_VECTORP(vec))
2542 str = SCHEME_VEC_ELS(vec)[i];
2543 else
2544 str = scheme_chaperone_vector_ref(vec, i);
2545 if (!SCHEME_CHAR_STRINGP(str))
2546 return NULL;
2547 if (!SCHEME_IMMUTABLE_CHAR_STRINGP(str)) {
2548 str = scheme_make_sized_char_string(SCHEME_CHAR_STR_VAL(str), SCHEME_CHAR_STRLEN_VAL(str), 0);
2549 SCHEME_SET_CHAR_STRING_IMMUTABLE(str);
2550 }
2551 SCHEME_VEC_ELS(vec2)[i] = str;
2552 }
2553
2554 return vec2;
2555 }
2556
2557 return NULL;
2558 }
2559
cmdline_args(int argc,Scheme_Object * argv[])2560 static Scheme_Object *cmdline_args(int argc, Scheme_Object *argv[])
2561 {
2562 return scheme_param_config2("current-command-line-arguments",
2563 scheme_make_integer(MZCONFIG_CMDLINE_ARGS),
2564 argc, argv,
2565 -1, ok_cmdline, "(vectorof string?)", 1);
2566 }
2567
2568 /**********************************************************************/
2569 /* locale ops */
2570 /**********************************************************************/
2571
ok_locale(int argc,Scheme_Object ** argv)2572 static Scheme_Object *ok_locale(int argc, Scheme_Object **argv)
2573 {
2574 if (SCHEME_FALSEP(argv[0]))
2575 return argv[0];
2576 else if (SCHEME_CHAR_STRINGP(argv[0])) {
2577 if (SCHEME_IMMUTABLEP(argv[0]))
2578 return argv[0];
2579 else {
2580 Scheme_Object *str = argv[0];
2581 str = scheme_make_immutable_sized_char_string(SCHEME_CHAR_STR_VAL(str), SCHEME_CHAR_STRLEN_VAL(str), 1);
2582 return str;
2583 }
2584 }
2585
2586 return NULL;
2587 }
2588
current_locale(int argc,Scheme_Object * argv[])2589 static Scheme_Object *current_locale(int argc, Scheme_Object *argv[])
2590 {
2591 Scheme_Object *v;
2592
2593 v = scheme_param_config2("current-locale",
2594 scheme_make_integer(MZCONFIG_LOCALE),
2595 argc, argv,
2596 -1, ok_locale, "(or/c #f string?)", 1);
2597
2598 return v;
2599 }
2600
locale_string_encoding(int argc,Scheme_Object * argv[])2601 static Scheme_Object *locale_string_encoding(int argc, Scheme_Object *argv[])
2602 {
2603 char *enc;
2604 Scheme_Object *s;
2605
2606 reset_locale();
2607 if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on)
2608 return scheme_make_utf8_string("UTF-8");
2609
2610 enc = rktio_locale_encoding(scheme_rktio);
2611 s = scheme_make_utf8_string(enc);
2612 free(enc);
2613
2614 return s;
2615 }
2616
system_language_country(int argc,Scheme_Object * argv[])2617 static Scheme_Object *system_language_country(int argc, Scheme_Object *argv[])
2618 {
2619 char *lc;
2620 Scheme_Object *s;
2621
2622 lc = rktio_system_language_country(scheme_rktio);
2623 s = scheme_make_utf8_string(lc);
2624 free(lc);
2625
2626 return s;
2627 }
2628
do_convert_close(rktio_converter_t * cd,int cache_mode,const char * to_e,const char * from_e)2629 static void do_convert_close(rktio_converter_t *cd, int cache_mode, const char *to_e, const char *from_e)
2630 /* If `cache_mode` is -1, then `to_e` needs to be freed (or cached).
2631 If `cache_mode` is 1, then `from_e` needs to be freed (or cached). */
2632 {
2633 if (cache_mode == -1)
2634 cache_locale_or_close(1, cd, (char *)to_e);
2635 else if (cache_mode == 1)
2636 cache_locale_or_close(0, cd, (char *)from_e);
2637 else if (!cache_mode)
2638 rktio_converter_close(scheme_rktio, cd);
2639 }
2640
do_convert(rktio_converter_t * cd,const char * from_e,const char * to_e,int to_from_utf8,char * in,int id,int iilen,char * out,int od,int iolen,int grow,int add_end_shift,int extra,intptr_t * oilen,intptr_t * oolen,int * status)2641 static char *do_convert(rktio_converter_t *cd,
2642 /* if !cd and either from_e or to_e can be NULL, then
2643 reset_locale() must have been called */
2644 const char *from_e, const char *to_e,
2645 /* 1 => UCS-4 -> UTF-8; 2 => UTF-8 -> UCS-4; 0 => other */
2646 int to_from_utf8,
2647 /* in can be NULL to output just a shift; in that case,
2648 id should be 0, too */
2649 char *in, int id, int iilen,
2650 char *out, int od, int iolen,
2651 /* if grow, then reallocate when out isn't big enough */
2652 int grow,
2653 /* if add_end_shift, add a shift sequence to the end;
2654 not useful if in is already NULL to indicate a shift */
2655 int add_end_shift,
2656 /* extra specifies the length of a terminator,
2657 not included in iolen or *oolen */
2658 int extra,
2659 /* these two report actual read/wrote sizes: */
2660 intptr_t *oilen, intptr_t *oolen,
2661 /* status is set to
2662 0 for complete,
2663 -1 for partial input,
2664 -2 for error,
2665 1 for more avail */
2666 int *status)
2667 {
2668 int dip, dop, close_it = 0, cache_mode = 0, mz_utf8 = 0;
2669 intptr_t il, ol, r;
2670 GC_CAN_IGNORE char *ip, *op;
2671
2672 /* Defaults: */
2673 *status = -1;
2674 if (oilen)
2675 *oilen = 0;
2676 *oolen = 0;
2677
2678 if (!cd) {
2679 if (rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED) {
2680 char *tmp_from_e = NULL, *tmp_to_e = NULL;
2681
2682 if (!to_e && !strcmp(from_e, MZ_UCS4_NAME))
2683 cache_mode = -1;
2684 else if (!from_e && !strcmp(to_e, MZ_UCS4_NAME))
2685 cache_mode = 1;
2686
2687 if (!from_e) {
2688 tmp_from_e = rktio_locale_encoding(scheme_rktio);
2689 from_e = tmp_from_e;
2690 }
2691 if (!to_e) {
2692 tmp_to_e = rktio_locale_encoding(scheme_rktio);
2693 to_e = tmp_to_e;
2694 }
2695
2696 if ((cache_mode == -1)
2697 && cached_locale_to_converter
2698 && !strcmp(to_e, cached_locale_encoding_name)) {
2699 cd = cached_locale_to_converter;
2700 cached_locale_to_converter = NULL;
2701 } else if ((cache_mode == 1)
2702 && cached_locale_from_converter
2703 && !strcmp(from_e, cached_locale_encoding_name)) {
2704 cd = cached_locale_from_converter;
2705 cached_locale_from_converter = NULL;
2706 } else {
2707 cd = rktio_converter_open(scheme_rktio, to_e, from_e);
2708 }
2709 close_it = 1;
2710 if (tmp_from_e && ((cache_mode != 1) || !cd)) free(tmp_from_e);
2711 if (tmp_to_e && ((cache_mode != -1) || !cd)) free(tmp_to_e);
2712 } else if (to_from_utf8) {
2713 /* Assume UTF-8 */
2714 mz_utf8 = 1;
2715 }
2716 }
2717
2718 if (!cd && !mz_utf8) {
2719 if (out) {
2720 while (extra--) {
2721 out[extra] = 0;
2722 }
2723 }
2724 return out;
2725 }
2726
2727 /* The converter is ready. Allocate out space, if necessary */
2728
2729 if (!out) {
2730 if (iolen <= 0)
2731 iolen = iilen;
2732 out = (char *)scheme_malloc_atomic(iolen + extra);
2733 od = 0;
2734 }
2735
2736 /* il and ol are the number of available chars */
2737 il = iilen;
2738 ol = iolen;
2739 /* dip and dop are the number of characters read so far;
2740 we use these and NULL out the ip and op pointers
2741 for the sake of precise GC */
2742 dip = 0;
2743 dop = 0;
2744 if (!in)
2745 add_end_shift = 0;
2746
2747 while (1) {
2748 int icerr;
2749
2750 if (mz_utf8) {
2751 /* Use our UTF-8 routines as if they were iconv */
2752 if (to_from_utf8 == 1) {
2753 /* UCS-4 -> UTF-8 */
2754 /* We assume that in + id and iilen are mzchar-aligned */
2755 int opos, uid, uilen;
2756 uid = (id + dip) >> 2;
2757 uilen = (iilen - dip) >> 2;
2758 opos = scheme_utf8_encode((const unsigned int *)in, uid, uilen,
2759 NULL, 0,
2760 0);
2761 if (opos <= iolen) {
2762 opos = scheme_utf8_encode((const unsigned int *)in, uid, uilen,
2763 (unsigned char *)out, od + dop,
2764 0);
2765 dop += opos;
2766 dip += iilen;
2767 icerr = 0;
2768 r = (size_t)opos;
2769 } else {
2770 icerr = E2BIG;
2771 r = (size_t)-1;
2772 }
2773 } else {
2774 /* UTF-8 -> UCS-4 */
2775 /* We assume that out + od is mzchar-aligned */
2776 intptr_t ipos, opos;
2777
2778 r = utf8_decode_x((unsigned char *)in, id + dip, iilen,
2779 (unsigned int *)out, (od + dop) >> 2, iolen >> 2,
2780 &ipos, &opos,
2781 0, 0, NULL, 0, 0, 0);
2782
2783 opos <<= 2;
2784 dop = (opos - od);
2785 dip = (ipos - id);
2786
2787 if ((r == -1) || (r == -2)) {
2788 r = (size_t)-1;
2789 icerr = RKTIO_ERROR_CONVERT_BAD_SEQUENCE;
2790 } else if (r == -3) {
2791 icerr = RKTIO_ERROR_CONVERT_NOT_ENOUGH_SPACE;
2792 r = (size_t)-1;
2793 } else
2794 icerr = 0;
2795 }
2796 } else {
2797 ip = in XFORM_OK_PLUS id + dip;
2798 op = out XFORM_OK_PLUS od + dop;
2799 r = rktio_convert(scheme_rktio, cd, &ip, &il, &op, &ol);
2800 dip = ip - (in XFORM_OK_PLUS id);
2801 dop = op - (out XFORM_OK_PLUS od);
2802 ip = op = NULL;
2803 icerr = rktio_get_last_error(scheme_rktio);
2804 }
2805
2806 /* Record how many chars processed, now */
2807 if (oilen)
2808 *oilen = dip;
2809 *oolen = dop;
2810
2811 /* Got all the chars? */
2812 if (r == RKTIO_CONVERT_ERROR) {
2813 if (icerr == RKTIO_ERROR_CONVERT_NOT_ENOUGH_SPACE) {
2814 if (grow) {
2815 /* Double the string size and try again */
2816 char *naya;
2817 naya = (char *)scheme_malloc_atomic((iolen * 2) + extra);
2818 memcpy(naya, out + od, *oolen);
2819 ol += iolen;
2820 iolen += iolen;
2821 out = naya;
2822 od = 0;
2823 } else {
2824 *status = 1;
2825 if (close_it)
2826 do_convert_close(cd, cache_mode, to_e, from_e);
2827 while (extra--) {
2828 out[od + dop + extra] = 0;
2829 }
2830 return out;
2831 }
2832 } else {
2833 /* Either EINVAL (premature end) or EILSEQ (bad sequence) */
2834 if (icerr == RKTIO_ERROR_CONVERT_BAD_SEQUENCE)
2835 *status = -2;
2836 if (close_it)
2837 do_convert_close(cd, cache_mode, to_e, from_e);
2838 while (extra--) {
2839 out[od + dop + extra] = 0;
2840 }
2841 return out;
2842 }
2843 } else {
2844 /* All done... */
2845 if (add_end_shift) {
2846 add_end_shift = 0;
2847 in = NULL;
2848 dip = 0;
2849 id = 0;
2850 il = 0; /* should be redundant */
2851 oilen = NULL; /* so it doesn't get set to 0 */
2852 } else {
2853 *status = 0;
2854 if (close_it)
2855 do_convert_close(cd, cache_mode, to_e, from_e);
2856 while (extra--) {
2857 out[od + dop + extra] = 0;
2858 }
2859 return out;
2860 }
2861 }
2862 }
2863 }
2864
2865 #define MZ_SC_BUF_SIZE 32
2866
string_to_from_locale(int to_bytes,char * in,intptr_t delta,intptr_t len,intptr_t * olen,int perm,int * no_cvt)2867 static char *string_to_from_locale(int to_bytes,
2868 char *in, intptr_t delta, intptr_t len,
2869 intptr_t *olen, int perm,
2870 int *no_cvt)
2871 /* Call this function only when iconv is available, and only when
2872 reset_locale() has been called */
2873 {
2874 Scheme_Object *parts = scheme_null, *one;
2875 char *c, *le;
2876 intptr_t clen, used;
2877 int status;
2878 rktio_converter_t *cd;
2879
2880 le = rktio_locale_encoding(scheme_rktio);
2881 if (cached_locale_encoding_name
2882 && !strcmp(le, cached_locale_encoding_name)
2883 && (to_bytes ? cached_locale_to_converter : cached_locale_from_converter)) {
2884 if (to_bytes) {
2885 cd = cached_locale_to_converter;
2886 cached_locale_to_converter = NULL;
2887 } else {
2888 cd = cached_locale_from_converter;
2889 cached_locale_from_converter = NULL;
2890 }
2891 } else {
2892 if (to_bytes)
2893 cd = rktio_converter_open(scheme_rktio, le, MZ_UCS4_NAME);
2894 else
2895 cd = rktio_converter_open(scheme_rktio, MZ_UCS4_NAME, le);
2896 }
2897
2898 if (!cd) {
2899 free(le);
2900 *no_cvt = 1;
2901 return NULL;
2902 }
2903 *no_cvt = 0;
2904
2905 status = 0;
2906
2907 while (len) {
2908 /* We might have conversion errors... */
2909 c = do_convert(cd, NULL, NULL, 0,
2910 (char *)in, (to_bytes ? 4 : 1) * delta, (to_bytes ? 4 : 1) * len,
2911 NULL, 0, (to_bytes ? 1 : 4) * (len + 1),
2912 1 /* grow */, 1, (to_bytes ? 1 : 4) /* terminator size */,
2913 &used, &clen,
2914 &status);
2915
2916 if (to_bytes)
2917 used >>= 2;
2918
2919 if ((perm < 0) && (used < len)) {
2920 rktio_converter_close(scheme_rktio, cd);
2921 free(le);
2922 return NULL;
2923 }
2924
2925 delta += used;
2926 len -= used;
2927
2928 if (!len && SCHEME_NULLP(parts)) {
2929 if (to_bytes) {
2930 *olen = clen;
2931 c[*olen] = 0;
2932 } else {
2933 *olen = (clen >> 2);
2934 ((mzchar *)c)[*olen] = 0;
2935 }
2936 cache_locale_or_close(to_bytes, cd, le);
2937 return c;
2938 }
2939
2940 /* We can get here if there was some conversion error at some
2941 point. We're building up a list of parts. */
2942
2943 if (to_bytes) {
2944 one = scheme_make_sized_byte_string(c, clen, 0);
2945 } else {
2946 one = scheme_make_sized_char_string((mzchar *)c, clen >> 2, 0);
2947 }
2948
2949 parts = scheme_make_pair(one, parts);
2950
2951 if (len) {
2952 /* Conversion error, so skip one char. */
2953 if (to_bytes) {
2954 char bc[1];
2955 bc[0] = perm;
2956 one = scheme_make_sized_byte_string(bc, 1, 1);
2957 } else {
2958 mzchar bc[1];
2959 bc[0] = perm;
2960 one = scheme_make_sized_char_string(bc, 1, 1);
2961 }
2962 parts = scheme_make_pair(one, parts);
2963 delta += 1;
2964 len -= 1;
2965 }
2966 }
2967
2968 cache_locale_or_close(to_bytes, cd, le);
2969
2970 if (to_bytes) {
2971 parts = append_all_byte_strings_backwards(parts);
2972 *olen = SCHEME_BYTE_STRTAG_VAL(parts);
2973
2974 return SCHEME_BYTE_STR_VAL(parts);
2975 } else {
2976 parts = append_all_strings_backwards(parts);
2977 *olen = SCHEME_CHAR_STRTAG_VAL(parts);
2978
2979 return (char *)SCHEME_CHAR_STR_VAL(parts);
2980 }
2981 }
2982
cache_locale_or_close(int to_bytes,rktio_converter_t * cd,char * le)2983 void cache_locale_or_close(int to_bytes, rktio_converter_t *cd, char *le)
2984 {
2985 if (to_bytes ? cached_locale_to_converter : cached_locale_from_converter) {
2986 rktio_converter_close(scheme_rktio, cd);
2987 free(le);
2988 } else {
2989 if (!cached_locale_encoding_name || strcmp(le, cached_locale_encoding_name)) {
2990 scheme_clear_locale_cache();
2991 cached_locale_encoding_name = le;
2992 } else
2993 free(le);
2994
2995 rktio_convert_reset(scheme_rktio, cd);
2996 if (to_bytes)
2997 cached_locale_to_converter = cd;
2998 else
2999 cached_locale_from_converter = cd;
3000 }
3001 }
3002
scheme_clear_locale_cache(void)3003 void scheme_clear_locale_cache(void)
3004 {
3005 if (cached_locale_encoding_name) {
3006 if (cached_locale_to_converter) {
3007 rktio_converter_close(scheme_rktio, cached_locale_to_converter);
3008 cached_locale_to_converter = NULL;
3009 }
3010 if (cached_locale_from_converter) {
3011 rktio_converter_close(scheme_rktio, cached_locale_from_converter);
3012 cached_locale_from_converter = NULL;
3013 }
3014 free(cached_locale_encoding_name);
3015 cached_locale_encoding_name = NULL;
3016 }
3017 }
3018
locale_recase(int to_up,char * in,int id,int iilen,char * out,int od,int iolen,intptr_t * oolen)3019 static char *locale_recase(int to_up,
3020 /* in must be null-terminated, iilen doesn't include it */
3021 char *in, int id, int iilen,
3022 /* iolen, in contrast, includes the terminator */
3023 char *out, int od, int iolen,
3024 intptr_t *oolen)
3025 /* Assumes that reset_locale() has been called */
3026 {
3027 char *s, *s2;
3028 intptr_t len;
3029 s = rktio_locale_recase(scheme_rktio, to_up, in XFORM_OK_PLUS id);
3030 len = strlen(s);
3031 if ((len+1) < iolen) {
3032 memcpy(out XFORM_OK_PLUS od, s, len+1);
3033 s2 = out;
3034 } else {
3035 s2 = scheme_malloc_atomic(len+1);
3036 memcpy(s2, s, len+1);
3037 }
3038 free(s);
3039 *oolen = len;
3040 return s2;
3041 }
3042
mz_locale_strcoll(char * s1,int d1,int l1,char * s2,int d2,int l2,int cvt_case)3043 int mz_locale_strcoll(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case)
3044 /* The s1 and s2 arguments are actually UCS-4.
3045 Assumes that reset_locale() has been called. */
3046 {
3047 intptr_t clen1, clen2, used1, used2, origl1, origl2;
3048 char *c1, *c2, buf1[MZ_SC_BUF_SIZE], buf2[MZ_SC_BUF_SIZE];
3049 char case_buf1[MZ_SC_BUF_SIZE], case_buf2[MZ_SC_BUF_SIZE];
3050 int status, got_more;
3051
3052 /* First, convert UCS-4 to locale-specific encoding. If some
3053 characters don't fit into the encoding, then we'll have leftover
3054 characters. Count unconvertable charc as greater than anything
3055 that can be converted */
3056
3057 origl1 = l1;
3058 origl2 = l2;
3059
3060 /* Loop to check both convertable and unconvertable parts */
3061 while (1) {
3062 if (!origl1 && !origl2)
3063 return 0;
3064 if (!origl1)
3065 return -1;
3066 if (!origl2)
3067 return 1;
3068
3069 /* Loop to get consistent parts of the wto strings, in case
3070 a conversion fails. */
3071 got_more = 0;
3072 l1 = origl1;
3073 l2 = origl2;
3074 while (1) {
3075 c1 = do_convert(NULL, MZ_UCS4_NAME, NULL, 1,
3076 s1, d1 * 4, 4 * l1,
3077 buf1, 0, MZ_SC_BUF_SIZE - 1,
3078 1 /* grow */, 0, 1 /* terminator size */,
3079 &used1, &clen1,
3080 &status);
3081 c2 = do_convert(NULL, MZ_UCS4_NAME, NULL, 1,
3082 s2, d2 * 4, 4 * l2,
3083 buf2, 0, MZ_SC_BUF_SIZE - 1,
3084 1 /* grow */, 0, 1 /* terminator size */,
3085 &used2, &clen2,
3086 &status);
3087
3088 if ((used1 < 4 * l1) || (used2 < 4 * l2)) {
3089 if (got_more) {
3090 /* Something went wrong. We've already tried to
3091 even out the parts that work. Let's give up
3092 on the first characters */
3093 clen1 = clen2 = 0;
3094 break;
3095 } else if (used1 == used2) {
3096 /* Not everything, but both ended at the same point */
3097 break;
3098 } else {
3099 /* Pick the smallest */
3100 if (used2 < used1) {
3101 used1 = used2;
3102 got_more = 1;
3103 } else
3104 got_more = 2;
3105 l2 = (used1 >> 2);
3106 l1 = (used1 >> 2);
3107
3108 if (!l1) {
3109 /* Nothing to get this time. */
3110 clen1 = clen2 = 0;
3111 c1 = c2 = "";
3112 used1 = used2 = 0;
3113 break;
3114 }
3115 }
3116 } else
3117 /* Got all that we wanted */
3118 break;
3119 }
3120
3121 if (cvt_case) {
3122 if (clen1)
3123 c1 = locale_recase(0, c1, 0, clen1,
3124 case_buf1, 0, MZ_SC_BUF_SIZE - 1,
3125 &clen1);
3126 else
3127 c1 = NULL;
3128 if (clen2)
3129 c2 = locale_recase(0, c2, 0, clen2,
3130 case_buf2, 0, MZ_SC_BUF_SIZE - 1,
3131 &clen2);
3132 else
3133 c2 = NULL;
3134 /* There shouldn't have been conversion errors, but just in
3135 case, care of NULL. */
3136 if (!c1) c1 = "";
3137 if (!c2) c2 = "";
3138 }
3139
3140 /* Collate, finally. */
3141 status = rktio_locale_strcoll(scheme_rktio, c1, c2);
3142
3143 /* If one is bigger than the other, we're done. */
3144 if (status)
3145 return status;
3146
3147 /* Otherwise, is there more to check? */
3148 origl1 -= (used1 >> 2);
3149 origl2 -= (used2 >> 2);
3150 d1 += (used1 >> 2);
3151 d2 += (used2 >> 2);
3152 if (!origl1 && !origl2)
3153 return 0;
3154
3155 /* There's more. It must be that the next character wasn't
3156 convertable in one of the encodings. */
3157 if (got_more)
3158 return ((got_more == 2) ? 1 : -1);
3159
3160 if (!origl1)
3161 return -1;
3162
3163 /* Compare an unconverable character directly. No case conversions
3164 if it's outside the locale. */
3165 if (((unsigned int *)s1)[d1] > ((unsigned int *)s2)[d2])
3166 return 1;
3167 else if (((unsigned int *)s1)[d1] < ((unsigned int *)s2)[d2])
3168 return -1;
3169 else {
3170 /* We've skipped one unconvertable char, and they still look the
3171 same. Now try again. */
3172 origl1 -= 1;
3173 origl2 -= 1;
3174 d1 += 1;
3175 d2 += 1;
3176 }
3177 }
3178 }
3179
mz_native_strcoll(char * s1,int d1,int l1,char * s2,int d2,int l2,int cvt_case)3180 int mz_native_strcoll(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case)
3181 /* The s1 and s2 arguments are actually UTF-16. */
3182 {
3183 return rktio_strcoll_utf16(scheme_rktio,
3184 (rktio_char16_t *)s1 XFORM_OK_PLUS d1, l1,
3185 (rktio_char16_t *)s2 XFORM_OK_PLUS d2, l2,
3186 cvt_case);
3187 }
3188
3189 typedef int (*strcoll_proc)(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case);
3190
do_locale_comp(const char * who,const mzchar * us1,intptr_t ul1,const mzchar * us2,intptr_t ul2,int cvt_case)3191 int do_locale_comp(const char *who, const mzchar *us1, intptr_t ul1, const mzchar *us2, intptr_t ul2, int cvt_case)
3192 {
3193 int xl1;
3194 int v, endres, utf16 = 0;
3195 GC_CAN_IGNORE strcoll_proc mz_strcoll = mz_locale_strcoll;
3196
3197 if (current_locale_name
3198 && !*current_locale_name
3199 && (rktio_convert_properties(scheme_rktio) & RKTIO_CONVERT_STRCOLL_UTF16)) {
3200 utf16 = 1;
3201 mz_strcoll = mz_native_strcoll;
3202 }
3203
3204 if (utf16) {
3205 us1 = (mzchar *)scheme_ucs4_to_utf16(us1, 0, ul1, NULL, 0, &ul1, 1);
3206 us2 = (mzchar *)scheme_ucs4_to_utf16(us2, 0, ul2, NULL, 0, &ul2, 1);
3207 ((short *)us1)[ul1] = 0;
3208 ((short *)us2)[ul2] = 0;
3209 }
3210
3211 if (ul1 > ul2) {
3212 ul1 = ul2;
3213 endres = 1;
3214 } else {
3215 if (ul2 > ul1)
3216 endres = -1;
3217 else
3218 endres = 0;
3219 }
3220
3221 /* Walk back through the strings looking for nul characters. If we
3222 find one, compare the part after the null character to update
3223 endres, then continue. Unfortunately, we do too much work if an
3224 earlier part of the string (tested later) determines the result,
3225 but hopefully nul characters are rare. */
3226
3227 xl1 = 0;
3228 while (ul1--) {
3229 if ((utf16 && (!(((short *)us1)[ul1]) || !(((short *)us2)[ul1])))
3230 || (!utf16 && (!(us1[ul1]) || !(us2[ul1])))) {
3231 if (utf16) {
3232 if (((short *)us1)[ul1])
3233 endres = 1;
3234 else if (((short *)us2)[ul1])
3235 endres = -1;
3236 } else {
3237 if (us1[ul1])
3238 endres = 1;
3239 else if (us2[ul1])
3240 endres = -1;
3241 }
3242
3243 if (xl1)
3244 v = mz_strcoll((char *)us1, ul1 + 1, xl1, (char *)us2, ul1 + 1, xl1, cvt_case);
3245 else
3246 v = 0;
3247
3248 if (v)
3249 endres = v;
3250 xl1 = 0;
3251 } else {
3252 xl1++;
3253 }
3254 }
3255
3256 v = mz_strcoll((char *)us1, 0, xl1, (char *)us2, 0, xl1, cvt_case);
3257 if (v)
3258 endres = v;
3259
3260 return endres;
3261 }
3262
do_locale_recase(int to_up,mzchar * in,int delta,int len,intptr_t * olen)3263 mzchar *do_locale_recase(int to_up, mzchar *in, int delta, int len, intptr_t *olen)
3264 {
3265 Scheme_Object *parts = scheme_null;
3266 char *c, buf[MZ_SC_BUF_SIZE], case_buf[MZ_SC_BUF_SIZE];
3267 intptr_t clen, used;
3268 int status;
3269
3270 while (len) {
3271 /* We might have conversion errors... */
3272 c = do_convert(NULL, MZ_UCS4_NAME, NULL, 1,
3273 (char *)in, 4 * delta, 4 * len,
3274 buf, 0, MZ_SC_BUF_SIZE - 1,
3275 1 /* grow */, 0, 1 /* terminator size */,
3276 &used, &clen,
3277 &status);
3278
3279 used >>= 2;
3280 delta += used;
3281 len -= used;
3282
3283 c = locale_recase(to_up, c, 0, clen,
3284 case_buf, 0, MZ_SC_BUF_SIZE - 1,
3285 &clen);
3286 if (!c)
3287 clen = 0;
3288
3289 c = do_convert(NULL, NULL, MZ_UCS4_NAME, 2,
3290 c, 0, clen,
3291 NULL, 0, 0,
3292 1 /* grow */, 0, sizeof(mzchar) /* terminator size */,
3293 &used, &clen,
3294 &status);
3295
3296 if (!len && SCHEME_NULLP(parts)) {
3297 *olen = (clen >> 2);
3298 ((mzchar *)c)[*olen] = 0;
3299 return (mzchar *)c;
3300 }
3301
3302 /* We can get here if there was some conversion error at some
3303 point. We're building up a list of parts. */
3304
3305 parts = scheme_make_pair(scheme_make_sized_char_string((mzchar *)c, clen >> 2, 0),
3306 parts);
3307
3308 if (len) {
3309 /* Conversion error, so skip one char. */
3310 parts = scheme_make_pair(scheme_make_sized_offset_char_string(in, delta, 1, 1),
3311 parts);
3312 delta += 1;
3313 len -= 1;
3314 }
3315 }
3316
3317 parts = append_all_strings_backwards(parts);
3318 *olen = SCHEME_CHAR_STRTAG_VAL(parts);
3319
3320 return SCHEME_CHAR_STR_VAL(parts);
3321 }
3322
do_native_recase(int to_up,mzchar * in,int delta,int len,intptr_t * olen)3323 mzchar *do_native_recase(int to_up, mzchar *in, int delta, int len, intptr_t *olen)
3324 /* The in argument is actually UTF-16. */
3325 {
3326 rktio_char16_t *s, *s2;
3327 intptr_t ol;
3328
3329 s = rktio_recase_utf16(scheme_rktio, to_up, (rktio_char16_t *)in XFORM_OK_PLUS delta, len, &ol);
3330
3331 s2 = scheme_malloc_atomic(sizeof(rktio_char16_t) * (ol+1));
3332 memcpy(s2, s, sizeof(rktio_char16_t) * (ol+1));
3333 free(s);
3334
3335 *olen = ol;
3336
3337 return (mzchar *)s2;
3338 }
3339
3340 typedef mzchar *(*recase_proc)(int to_up, mzchar *in, int delta, int len, intptr_t *olen);
3341
mz_recase(const char * who,int to_up,mzchar * us,intptr_t ulen)3342 static Scheme_Object *mz_recase(const char *who, int to_up, mzchar *us, intptr_t ulen)
3343 {
3344 intptr_t ulen1;
3345 int utf16 = 0, i, delta = 0;
3346 mzchar *us1;
3347 recase_proc mz_do_recase = do_locale_recase;
3348 Scheme_Object *s, *parts = scheme_null;
3349
3350 reset_locale();
3351
3352 if (current_locale_name
3353 && !*current_locale_name
3354 && (rktio_convert_properties(scheme_rktio) & RKTIO_CONVERT_RECASE_UTF16)) {
3355 utf16 = 1;
3356 mz_do_recase = do_native_recase;
3357 }
3358
3359 if (utf16) {
3360 us = (mzchar *)scheme_ucs4_to_utf16(us, 0, ulen, NULL, 0, &ulen, 1);
3361 ((short *)us)[ulen] = 0;
3362 }
3363
3364 /* If there are nuls in the string, then we have to make multiple
3365 calls to mz_do_recase */
3366 i = 0;
3367 while (1) {
3368 for (; i < ulen; i++) {
3369 if (utf16) {
3370 if (!((short *)us)[i])
3371 break;
3372 } else if (!us[i])
3373 break;
3374 }
3375
3376 us1 = mz_do_recase(to_up, us, delta, i - delta, &ulen1);
3377
3378 if (utf16) {
3379 us1 = scheme_utf16_to_ucs4((unsigned short *)us1, 0, ulen1, NULL, 0, &ulen1, 1);
3380 us1[ulen1] = 0;
3381 }
3382
3383 s = scheme_make_sized_char_string((mzchar *)us1, ulen1, 0);
3384
3385 if (SCHEME_NULLP(parts) && (i == ulen))
3386 return s;
3387
3388 parts = scheme_make_pair(s, parts);
3389
3390 if (i == ulen)
3391 break;
3392
3393 /* upcasing and encoding a nul char is easy: */
3394 s = scheme_make_sized_char_string((mzchar *)"\0\0\0\0", 1, 0);
3395 parts = scheme_make_pair(s, parts);
3396 i++;
3397 delta = i;
3398
3399 if (i == ulen)
3400 break;
3401 }
3402
3403 return append_all_strings_backwards(parts);
3404 }
3405
3406 static Scheme_Object *
unicode_recase(const char * who,int to_up,int argc,Scheme_Object * argv[])3407 unicode_recase(const char *who, int to_up, int argc, Scheme_Object *argv[])
3408 {
3409 intptr_t len;
3410 mzchar *chars;
3411
3412 if (!SCHEME_CHAR_STRINGP(argv[0]))
3413 scheme_wrong_contract(who, "string?", 0, argc, argv);
3414
3415 chars = SCHEME_CHAR_STR_VAL(argv[0]);
3416 len = SCHEME_CHAR_STRTAG_VAL(argv[0]);
3417
3418 return mz_recase(who, to_up, chars, len);
3419 }
3420
3421 static Scheme_Object *
string_locale_upcase(int argc,Scheme_Object * argv[])3422 string_locale_upcase(int argc, Scheme_Object *argv[])
3423 {
3424 return unicode_recase("string-locale-upcase", 1, argc, argv);
3425 }
3426
3427 static Scheme_Object *
string_locale_downcase(int argc,Scheme_Object * argv[])3428 string_locale_downcase(int argc, Scheme_Object *argv[])
3429 {
3430 return unicode_recase("string-locale-downcase", 0, argc, argv);
3431 }
3432
reset_locale(void)3433 static void reset_locale(void)
3434 {
3435 Scheme_Object *v;
3436 const mzchar *name;
3437
3438 /* This function needs to work before threads are set up: */
3439 if (scheme_current_thread) {
3440 v = scheme_get_param(scheme_current_config(), MZCONFIG_LOCALE);
3441 } else {
3442 v = scheme_make_immutable_sized_utf8_string("", 0);
3443 }
3444 locale_on = SCHEME_TRUEP(v);
3445
3446 if (locale_on) {
3447 name = SCHEME_CHAR_STR_VAL(v);
3448 if ((current_locale_name != name)
3449 && (!current_locale_name
3450 || mz_char_strcmp("result-locale",
3451 current_locale_name, scheme_char_strlen(current_locale_name),
3452 name, SCHEME_CHAR_STRLEN_VAL(v),
3453 0, 1))) {
3454 char *n, buf[32];
3455
3456 n = scheme_utf8_encode_to_buffer(name, SCHEME_CHAR_STRLEN_VAL(v), buf, 32);
3457
3458 rktio_set_locale(scheme_rktio, n);
3459 }
3460
3461 current_locale_name_ptr = (void *)name;
3462 }
3463 }
3464
scheme_push_c_numeric_locale()3465 char *scheme_push_c_numeric_locale()
3466 {
3467 return rktio_push_c_numeric_locale(scheme_rktio);
3468 }
3469
scheme_pop_c_numeric_locale(char * prev)3470 void scheme_pop_c_numeric_locale(char *prev)
3471 {
3472 rktio_pop_c_numeric_locale(scheme_rktio, prev);
3473 }
3474
scheme_set_default_locale(void)3475 void scheme_set_default_locale(void)
3476 {
3477 rktio_set_default_locale("");
3478 }
3479
find_special_casing(int ch)3480 static int find_special_casing(int ch)
3481 {
3482 /* Binary search */
3483 int i, lo, hi, j;
3484
3485 i = NUM_SPECIAL_CASINGS >> 1;
3486 lo = i;
3487 hi = NUM_SPECIAL_CASINGS - i - 1;
3488
3489 while (1) {
3490 if (uchar_special_casings[i * 10] == ch)
3491 return i * 10;
3492 if (uchar_special_casings[i * 10] > ch) {
3493 j = i - lo;
3494 i = j + (lo >> 1);
3495 hi = lo - (i - j) - 1;
3496 lo = i - j;
3497 } else {
3498 j = i + 1;
3499 i = j + (hi >> 1);
3500 lo = i - j;
3501 hi = hi - (i - j) - 1;
3502 }
3503 }
3504 }
3505
is_final_sigma(int mode,mzchar * s,int d,int i,int len)3506 static int is_final_sigma(int mode, mzchar *s, int d, int i, int len)
3507 {
3508 int j;
3509
3510 if (mode == 3)
3511 return 1;
3512
3513 /* find a cased char before, skipping case-ignorable: */
3514 for (j = i - 1; j >= d; j--) {
3515 if (!scheme_iscaseignorable(s[j])) {
3516 if (scheme_iscased(s[j]))
3517 break;
3518 else
3519 return 0;
3520 }
3521 }
3522 if (j < d)
3523 return 0;
3524
3525 /* next non-case-ignorable must not be cased: */
3526 for (j = i + 1; j < d + len; j++) {
3527 if (!scheme_iscaseignorable(s[j])) {
3528 return !scheme_iscased(s[j]);
3529 }
3530 }
3531
3532 return 1;
3533 }
3534
scheme_string_recase(mzchar * s,int d,int len,int mode,int inplace,int * _len)3535 mzchar *scheme_string_recase(mzchar *s, int d, int len, int mode, int inplace, int *_len)
3536 {
3537 mzchar *t;
3538 int i, extra = 0, pos, special = 0, td, prev_was_cased = 0, xmode = mode;
3539
3540 for (i = 0; i < len; i++) {
3541 if (scheme_isspecialcasing(s[d+i])) {
3542 pos = find_special_casing(s[d+i]);
3543 if (!uchar_special_casings[pos + 9] || is_final_sigma(xmode, s, d, i, len)) {
3544 special = 1;
3545 extra += (uchar_special_casings[pos + 1 + (xmode << 1)] - 1);
3546 }
3547 }
3548 if (mode == 2) {
3549 if (!scheme_iscaseignorable(s[d+i]))
3550 prev_was_cased = scheme_iscased(s[d+i]);
3551 xmode = (prev_was_cased ? 0 : 2);
3552 }
3553 }
3554
3555 if (_len)
3556 *_len = len + extra;
3557
3558 if (!extra && inplace) {
3559 t = s;
3560 td = d;
3561 } else {
3562 t = scheme_malloc_atomic(sizeof(mzchar) * (len + extra + 1));
3563 td = 0;
3564 }
3565
3566 if (!special) {
3567 if (mode == 0) {
3568 for (i = 0; i < len; i++) {
3569 t[i+td] = scheme_tolower(s[i+d]);
3570 }
3571 } else if (mode == 1) {
3572 for (i = 0; i < len; i++) {
3573 t[i+td] = scheme_toupper(s[i+d]);
3574 }
3575 } else if (mode == 2) {
3576 prev_was_cased = 0;
3577 for (i = 0; i < len; i++) {
3578 if (!prev_was_cased)
3579 t[i+td] = scheme_totitle(s[i+d]);
3580 else
3581 t[i+td] = scheme_tolower(s[i+d]);
3582 if (!scheme_iscaseignorable(s[i+d]))
3583 prev_was_cased = scheme_iscased(s[i+d]);
3584 }
3585 } else /* if (mode == 3) */ {
3586 for (i = 0; i < len; i++) {
3587 t[i+td] = scheme_tofold(s[i+d]);
3588 }
3589 }
3590 } else {
3591 int j = 0, c;
3592 prev_was_cased = 0;
3593 for (i = 0; i < len; i++) {
3594 if (mode == 0) {
3595 t[j+td] = scheme_tolower(s[i+d]);
3596 } else if (mode == 1) {
3597 t[j+td] = scheme_toupper(s[i+d]);
3598 } else if (mode == 2) {
3599 if (!prev_was_cased) {
3600 xmode = 2;
3601 t[j+td] = scheme_totitle(s[i+d]);
3602 } else {
3603 xmode = 0;
3604 t[j+td] = scheme_tolower(s[i+d]);
3605 }
3606 if (!scheme_iscaseignorable(s[i+d]))
3607 prev_was_cased = scheme_iscased(s[i+d]);
3608 } else /* if (mode == 3) */ {
3609 t[j+td] = scheme_tofold(s[i+d]);
3610 }
3611
3612 if (scheme_isspecialcasing(s[i+d])) {
3613 pos = find_special_casing(s[i+d]);
3614 if (!uchar_special_casings[pos + 9] || is_final_sigma(xmode, s, d, i, len)) {
3615 c = uchar_special_casings[pos + 1 + (xmode << 1)];
3616 pos = uchar_special_casings[pos + 2 + (xmode << 1)];
3617 while (c--) {
3618 t[(j++)+td] = uchar_special_casing_data[pos++];
3619 }
3620 } else
3621 j++;
3622 } else
3623 j++;
3624 }
3625 }
3626 t[len+extra+td] = 0;
3627
3628 return t;
3629 }
3630
string_recase(const char * name,int argc,Scheme_Object * argv[],int mode)3631 static Scheme_Object *string_recase (const char *name, int argc, Scheme_Object *argv[], int mode)
3632 {
3633 mzchar *s;
3634 int len;
3635
3636 if (!SCHEME_CHAR_STRINGP(argv[0]))
3637 scheme_wrong_contract(name, "string?", 0, argc, argv);
3638
3639 s = SCHEME_CHAR_STR_VAL(argv[0]);
3640 len = SCHEME_CHAR_STRLEN_VAL(argv[0]);
3641
3642 s = scheme_string_recase(s, 0, len, mode, 0, &len);
3643
3644 return scheme_make_sized_char_string(s, len, 0);
3645 }
3646
string_upcase(int argc,Scheme_Object * argv[])3647 static Scheme_Object *string_upcase (int argc, Scheme_Object *argv[])
3648 {
3649 return string_recase("string-upcase", argc, argv, 1);
3650 }
3651
string_downcase(int argc,Scheme_Object * argv[])3652 static Scheme_Object *string_downcase (int argc, Scheme_Object *argv[])
3653 {
3654 return string_recase("string-downcase", argc, argv, 0);
3655 }
3656
string_titlecase(int argc,Scheme_Object * argv[])3657 static Scheme_Object *string_titlecase (int argc, Scheme_Object *argv[])
3658 {
3659 return string_recase("string-titlecase", argc, argv, 2);
3660 }
3661
string_foldcase(int argc,Scheme_Object * argv[])3662 static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[])
3663 {
3664 return string_recase("string-foldcase", argc, argv, 3);
3665 }
3666
3667 /**********************************************************************/
3668 /* normalization */
3669 /**********************************************************************/
3670
3671 #define MZ_JAMO_INITIAL_CONSONANT_START 0x1100
3672 #define MZ_JAMO_INITIAL_CONSONANT_COUNT 19
3673 #define MZ_JAMO_INITIAL_CONSONANT_END (MZ_JAMO_INITIAL_CONSONANT_START + MZ_JAMO_INITIAL_CONSONANT_COUNT - 1)
3674
3675 #define MZ_JAMO_VOWEL_START 0x1161
3676 #define MZ_JAMO_VOWEL_COUNT 21
3677 #define MZ_JAMO_VOWEL_END (MZ_JAMO_VOWEL_START + MZ_JAMO_VOWEL_COUNT - 1)
3678
3679 /* First in this range is not actually a consonant, but a placeholder for "no consonant" */
3680 #define MZ_JAMO_TRAILING_CONSONANT_START 0x11A7
3681 #define MZ_JAMO_TRAILING_CONSONANT_COUNT 28
3682 #define MZ_JAMO_TRAILING_CONSONANT_END (MZ_JAMO_TRAILING_CONSONANT_START + MZ_JAMO_TRAILING_CONSONANT_COUNT - 1)
3683
3684 #define MZ_JAMO_SYLLABLE_START 0xAC00
3685 #define MZ_JAMO_SYLLABLE_END (MZ_JAMO_SYLLABLE_START + 11171)
3686
get_composition(mzchar a,mzchar b)3687 XFORM_NONGCING static mzchar get_composition(mzchar a, mzchar b)
3688 {
3689 if ((a > 0xFFFF) || (b > 0xFFFF)) {
3690 /* Look in long-composes table. */
3691 mzlonglong key = ((((mzlonglong)a & 0x1F0000) << 21)
3692 | (((mzlonglong)a & 0xFFFF) << 16)
3693 | (((mzlonglong)b & 0x1F0000) << 16)
3694 | ((mzlonglong)b & 0xFFFF));
3695 int pos = (LONG_COMPOSE_TABLE_SIZE >> 1), new_pos;
3696 int below_len = pos;
3697 int above_len = (LONG_COMPOSE_TABLE_SIZE - pos - 1);
3698
3699 /* Binary search: */
3700 while (key != utable_canon_compose_long_pairs[pos]) {
3701 if (key > utable_canon_compose_long_pairs[pos]) {
3702 if (!above_len)
3703 return 0;
3704 new_pos = pos + (above_len >> 1) + 1;
3705 below_len = (new_pos - pos - 1);
3706 above_len = (above_len - below_len - 1);
3707 pos = new_pos;
3708 } else if (key < utable_canon_compose_long_pairs[pos]) {
3709 if (!below_len)
3710 return 0;
3711 new_pos = pos - ((below_len >> 1) + 1);
3712 above_len = (pos - new_pos - 1);
3713 below_len = (below_len - above_len - 1);
3714 pos = new_pos;
3715 }
3716 }
3717
3718 return utable_canon_compose_long_result[pos];
3719 } else {
3720 uintptr_t key = (a << 16) | b;
3721 int pos = (COMPOSE_TABLE_SIZE >> 1), new_pos;
3722 int below_len = pos;
3723 int above_len = (COMPOSE_TABLE_SIZE - pos - 1);
3724
3725 /* Binary search: */
3726 while (key != utable_compose_pairs[pos]) {
3727 if (key > utable_compose_pairs[pos]) {
3728 if (!above_len)
3729 return 0;
3730 new_pos = pos + (above_len >> 1) + 1;
3731 below_len = (new_pos - pos - 1);
3732 above_len = (above_len - below_len - 1);
3733 pos = new_pos;
3734 } else if (key < utable_compose_pairs[pos]) {
3735 if (!below_len)
3736 return 0;
3737 new_pos = pos - ((below_len >> 1) + 1);
3738 above_len = (pos - new_pos - 1);
3739 below_len = (below_len - above_len - 1);
3740 pos = new_pos;
3741 }
3742 }
3743
3744 return utable_compose_result[pos];
3745 }
3746 }
3747
get_canon_decomposition(mzchar key,mzchar * b)3748 XFORM_NONGCING mzchar get_canon_decomposition(mzchar key, mzchar *b)
3749 {
3750 int pos = (DECOMPOSE_TABLE_SIZE >> 1), new_pos;
3751 int below_len = pos;
3752 int above_len = (DECOMPOSE_TABLE_SIZE - pos - 1);
3753
3754 /* Binary search: */
3755 while (key != utable_decomp_keys[pos]) {
3756 if (key > utable_decomp_keys[pos]) {
3757 if (!above_len)
3758 return 0;
3759 new_pos = pos + (above_len >> 1) + 1;
3760 below_len = (new_pos - pos - 1);
3761 above_len = (above_len - below_len - 1);
3762 pos = new_pos;
3763 } else if (key < utable_decomp_keys[pos]) {
3764 if (!below_len)
3765 return 0;
3766 new_pos = pos - ((below_len >> 1) + 1);
3767 above_len = (pos - new_pos - 1);
3768 below_len = (below_len - above_len - 1);
3769 pos = new_pos;
3770 }
3771 }
3772
3773 pos = utable_decomp_indices[pos];
3774 if (pos < 0) {
3775 pos = -(pos + 1);
3776 pos <<= 1;
3777 *b = utable_compose_long_pairs[pos + 1];
3778 return utable_compose_long_pairs[pos];
3779 } else {
3780 key = utable_compose_pairs[pos];
3781 *b = (key & 0xFFFF);
3782 return (key >> 16);
3783 }
3784 }
3785
get_kompat_decomposition(mzchar key,unsigned short ** chars)3786 XFORM_NONGCING int get_kompat_decomposition(mzchar key, unsigned short **chars)
3787 {
3788 int pos = (KOMPAT_DECOMPOSE_TABLE_SIZE >> 1), new_pos;
3789 int below_len = pos;
3790 int above_len = (KOMPAT_DECOMPOSE_TABLE_SIZE - pos - 1);
3791
3792 /* Binary search: */
3793 while (key != utable_kompat_decomp_keys[pos]) {
3794 if (key > utable_kompat_decomp_keys[pos]) {
3795 if (!above_len)
3796 return 0;
3797 new_pos = pos + (above_len >> 1) + 1;
3798 below_len = (new_pos - pos - 1);
3799 above_len = (above_len - below_len - 1);
3800 pos = new_pos;
3801 } else if (key < utable_kompat_decomp_keys[pos]) {
3802 if (!below_len)
3803 return 0;
3804 new_pos = pos - ((below_len >> 1) + 1);
3805 above_len = (pos - new_pos - 1);
3806 below_len = (below_len - above_len - 1);
3807 pos = new_pos;
3808 }
3809 }
3810
3811 *chars = utable_kompat_decomp_strs XFORM_OK_PLUS utable_kompat_decomp_indices[pos];
3812 return utable_kompat_decomp_lens[pos];
3813 }
3814
normalize_c(Scheme_Object * o)3815 static Scheme_Object *normalize_c(Scheme_Object *o)
3816 /* Assumes then given string is in normal form D */
3817 {
3818 mzchar *s, *s2, tmp, last_c0 = 0;
3819 int len, i, j = 0, last_c0_pos = 0, last_cc = 0;
3820
3821 s = SCHEME_CHAR_STR_VAL(o);
3822 len = SCHEME_CHAR_STRLEN_VAL(o);
3823
3824 s2 = (mzchar *)scheme_malloc_atomic((len + 1) * sizeof(mzchar));
3825 memcpy(s2, s, len * sizeof(mzchar));
3826
3827 for (i = 0; i < len; i++) {
3828 if ((i + 1 < len)
3829 && (s2[i] >= MZ_JAMO_INITIAL_CONSONANT_START)
3830 && (s2[i] <= MZ_JAMO_INITIAL_CONSONANT_END)
3831 && (s2[i+1] >= MZ_JAMO_VOWEL_START)
3832 && (s2[i+1] <= MZ_JAMO_VOWEL_END)) {
3833 /* Need Hangul composition */
3834 if ((i + 2 < len)
3835 && (s2[i+2] > MZ_JAMO_TRAILING_CONSONANT_START)
3836 && (s2[i+2] <= MZ_JAMO_TRAILING_CONSONANT_END)) {
3837 /* 3-char composition */
3838 tmp = (MZ_JAMO_SYLLABLE_START
3839 + ((s2[i] - MZ_JAMO_INITIAL_CONSONANT_START)
3840 * MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)
3841 + ((s2[i+1] - MZ_JAMO_VOWEL_START)
3842 * MZ_JAMO_TRAILING_CONSONANT_COUNT)
3843 + (s2[i+2] - MZ_JAMO_TRAILING_CONSONANT_START));
3844 i += 2;
3845 } else {
3846 /* 2-char composition */
3847 tmp = (MZ_JAMO_SYLLABLE_START
3848 + ((s2[i] - MZ_JAMO_INITIAL_CONSONANT_START)
3849 * MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)
3850 + ((s2[i+1] - MZ_JAMO_VOWEL_START)
3851 * MZ_JAMO_TRAILING_CONSONANT_COUNT));
3852 i++;
3853 }
3854 last_c0 = tmp;
3855 last_c0_pos = j;
3856 last_cc = 0;
3857 s2[j++] = tmp;
3858 } else {
3859 int cc;
3860
3861 cc = scheme_combining_class(s2[i]);
3862 if (last_c0 && (cc > last_cc))
3863 tmp = get_composition(last_c0, s2[i]);
3864 else
3865 tmp = 0;
3866
3867 if (tmp) {
3868 /* Need to compose */
3869 s2[last_c0_pos] = tmp;
3870 last_c0 = tmp;
3871 } else if (!cc) {
3872 /* Reset last_c0... */
3873 tmp = s2[i];
3874 if (scheme_needs_maybe_compose(tmp)) {
3875 last_c0 = tmp;
3876 last_c0_pos = j;
3877 } else {
3878 last_c0 = 0;
3879 }
3880 last_cc = -1;
3881 s2[j++] = tmp;
3882 } else {
3883 s2[j++] = s2[i];
3884 last_cc = cc;
3885 }
3886 }
3887 }
3888
3889 s2[j] = 0;
3890 if (len - j > 16) {
3891 s = (mzchar *)scheme_malloc_atomic((j + 1) * sizeof(mzchar));
3892 memcpy(s, s2, (j + 1) * sizeof(mzchar));
3893 s2 = s;
3894 }
3895
3896 return scheme_make_sized_char_string(s2, j, 0);
3897 }
3898
normalize_d(Scheme_Object * o,int kompat)3899 static Scheme_Object *normalize_d(Scheme_Object *o, int kompat)
3900 {
3901 mzchar *s, tmp, *s2;
3902 int len, i, delta, j, swapped;
3903
3904 s = SCHEME_CHAR_STR_VAL(o);
3905 len = SCHEME_CHAR_STRLEN_VAL(o);
3906
3907 /* Run through string list to predict expansion: */
3908 delta = 0;
3909 for (i = 0; i < len; i++) {
3910 if (scheme_needs_decompose(s[i])) {
3911 int klen;
3912 mzchar snd;
3913 GC_CAN_IGNORE unsigned short *start;
3914
3915 tmp = s[i];
3916 while (scheme_needs_decompose(tmp)) {
3917 if (kompat)
3918 klen = get_kompat_decomposition(tmp, &start);
3919 else
3920 klen = 0;
3921 if (klen) {
3922 delta += (klen - 1);
3923 break;
3924 } else {
3925 tmp = get_canon_decomposition(tmp, &snd);
3926 if (tmp) {
3927 if (snd) {
3928 delta++;
3929 if (kompat) {
3930 klen = get_kompat_decomposition(snd, &start);
3931 if (klen)
3932 delta += (klen - 1);
3933 }
3934 }
3935 } else
3936 break;
3937 }
3938 }
3939 } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
3940 && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
3941 tmp = s[i];
3942 tmp -= MZ_JAMO_SYLLABLE_START;
3943 if (tmp % MZ_JAMO_TRAILING_CONSONANT_COUNT)
3944 delta += 2;
3945 else
3946 delta += 1;
3947 }
3948 }
3949
3950 s2 = (mzchar *)scheme_malloc_atomic((len + delta + 1) * sizeof(mzchar));
3951
3952 j = 0;
3953 for (i = 0; i < len; i++) {
3954 if (scheme_needs_decompose(s[i])) {
3955 mzchar snd, tmp2;
3956 int snds = 0, klen = 0, k;
3957 GC_CAN_IGNORE unsigned short*start;
3958
3959 tmp = s[i];
3960 while (scheme_needs_decompose(tmp)) {
3961 if (kompat)
3962 klen = get_kompat_decomposition(tmp, &start);
3963 else
3964 klen = 0;
3965 if (klen) {
3966 for (k = 0; k < klen; k++) {
3967 s2[j++] = start[k];
3968 }
3969 break;
3970 } else {
3971 tmp2 = get_canon_decomposition(tmp, &snd);
3972 if (tmp2) {
3973 tmp = tmp2;
3974 if (snd) {
3975 if (kompat)
3976 klen = get_kompat_decomposition(snd, &start);
3977 else
3978 klen = 0;
3979 if (klen) {
3980 snds += klen;
3981 for (k = 0; k < klen; k++) {
3982 s2[len + delta - snds + k] = start[k];
3983 }
3984 klen = 0;
3985 } else {
3986 snds++;
3987 s2[len + delta - snds] = snd;
3988 }
3989 }
3990 } else
3991 break;
3992 }
3993 }
3994 if (!klen)
3995 s2[j++] = tmp;
3996 memmove(s2 + j, s2 + len + delta - snds, snds * sizeof(mzchar));
3997 j += snds;
3998 } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
3999 && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
4000 int l, v, t;
4001 tmp = s[i];
4002 tmp -= MZ_JAMO_SYLLABLE_START;
4003 l = tmp / (MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT);
4004 v = (tmp % (MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)) / MZ_JAMO_TRAILING_CONSONANT_COUNT;
4005 t = tmp % MZ_JAMO_TRAILING_CONSONANT_COUNT;
4006 s2[j++] = MZ_JAMO_INITIAL_CONSONANT_START + l;
4007 s2[j++] = MZ_JAMO_VOWEL_START + v;
4008 if (t) {
4009 s2[j++] = MZ_JAMO_TRAILING_CONSONANT_START + t;
4010 }
4011 } else {
4012 s2[j++] = s[i];
4013 }
4014 }
4015 s2[j] = 0;
4016 len += delta;
4017
4018 /* Reorder pass: */
4019 do {
4020 swapped = 0;
4021 for (i = 0; i < len; i++) {
4022 if ((i + 1 < len)
4023 && scheme_combining_class(s2[i])
4024 && scheme_combining_class(s2[i+1])
4025 && (scheme_combining_class(s2[i+1]) < scheme_combining_class(s2[i]))) {
4026 /* Reorder and try again: */
4027 tmp = s2[i + 1];
4028 s2[i + 1] = s2[i];
4029 s2[i] = tmp;
4030 i--;
4031 swapped = 1;
4032 }
4033 }
4034 } while (swapped);
4035
4036 return scheme_make_sized_char_string(s2, len, 0);
4037 }
4038
do_string_normalize_c(const char * who,int argc,Scheme_Object * argv[],int kompat)4039 static Scheme_Object *do_string_normalize_c (const char *who, int argc, Scheme_Object *argv[], int kompat)
4040 {
4041 Scheme_Object *o;
4042 mzchar *s, last_c0 = 0, snd;
4043 int len, i, last_cc = 0;
4044
4045 o = argv[0];
4046 if (!SCHEME_CHAR_STRINGP(o))
4047 scheme_wrong_contract(who, "string?", 0, argc, argv);
4048
4049 s = SCHEME_CHAR_STR_VAL(o);
4050 len = SCHEME_CHAR_STRLEN_VAL(o);
4051
4052 for (i = 0; i < len; i++) {
4053 if (scheme_needs_decompose(s[i])
4054 && (kompat || get_canon_decomposition(s[i], &snd))) {
4055 /* Decomposition may expose a different composition */
4056 break;
4057 } else if ((i + 1 < len)
4058 && scheme_combining_class(s[i])
4059 && scheme_combining_class(s[i+1])
4060 && (scheme_combining_class(s[i+1]) < scheme_combining_class(s[i]))) {
4061 /* Need to reorder */
4062 break;
4063 } else if ((s[i] >= MZ_JAMO_INITIAL_CONSONANT_START)
4064 && (s[i] <= MZ_JAMO_INITIAL_CONSONANT_END)
4065 && (s[i+1] >= MZ_JAMO_VOWEL_START)
4066 && (s[i+1] <= MZ_JAMO_VOWEL_END)) {
4067 /* Need Hangul composition */
4068 break;
4069 } else if (last_c0
4070 && get_composition(last_c0, s[i])
4071 && (scheme_combining_class(s[i]) > last_cc)) {
4072 /* Need to compose */
4073 break;
4074 } else {
4075 int cc;
4076
4077 cc = scheme_combining_class(s[i]);
4078
4079 if (!cc) {
4080 if (scheme_needs_maybe_compose(s[i]))
4081 last_c0 = s[i];
4082 else
4083 last_c0 = 0;
4084 last_cc = -1;
4085 } else
4086 last_cc = cc;
4087 }
4088 }
4089
4090 if (i < len) {
4091 o = normalize_c(normalize_d(o, kompat));
4092 }
4093
4094 return o;
4095 }
4096
string_normalize_c(int argc,Scheme_Object * argv[])4097 static Scheme_Object *string_normalize_c (int argc, Scheme_Object *argv[])
4098 {
4099 return do_string_normalize_c("string-normalize-nfc", argc, argv, 0);
4100 }
4101
string_normalize_kc(int argc,Scheme_Object * argv[])4102 static Scheme_Object *string_normalize_kc (int argc, Scheme_Object *argv[])
4103 {
4104 return do_string_normalize_c("string-normalize-nfkc", argc, argv, 1);
4105 }
4106
do_string_normalize_d(const char * who,int argc,Scheme_Object * argv[],int kompat)4107 static Scheme_Object *do_string_normalize_d (const char *who, int argc, Scheme_Object *argv[], int kompat)
4108 {
4109 Scheme_Object *o;
4110 mzchar *s;
4111 int len, i;
4112
4113 o = argv[0];
4114 if (!SCHEME_CHAR_STRINGP(o))
4115 scheme_wrong_contract(who, "string?", 0, argc, argv);
4116
4117 s = SCHEME_CHAR_STR_VAL(o);
4118 len = SCHEME_CHAR_STRLEN_VAL(o);
4119
4120 for (i = len; i--; ) {
4121 if (scheme_needs_decompose(s[i])) {
4122 /* Need to decompose */
4123 mzchar snd;
4124 if (kompat || get_canon_decomposition(s[i], &snd))
4125 break;
4126 } else if ((i + 1 < len)
4127 && scheme_combining_class(s[i])
4128 && scheme_combining_class(s[i+1])
4129 && (scheme_combining_class(s[i+1]) < scheme_combining_class(s[i]))) {
4130 /* Need to reorder */
4131 break;
4132 } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
4133 && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
4134 /* Need Hangul decomposition */
4135 break;
4136 }
4137 }
4138
4139 if (i >= 0) {
4140 o = normalize_d(o, kompat);
4141 }
4142
4143 return o;
4144 }
4145
string_normalize_d(int argc,Scheme_Object * argv[])4146 static Scheme_Object *string_normalize_d (int argc, Scheme_Object *argv[])
4147 {
4148 return do_string_normalize_d("string-normalize-nfd", argc, argv, 0);
4149 }
4150
string_normalize_kd(int argc,Scheme_Object * argv[])4151 static Scheme_Object *string_normalize_kd (int argc, Scheme_Object *argv[])
4152 {
4153 return do_string_normalize_d("string-normalize-nfkd", argc, argv, 1);
4154 }
4155
4156 /**********************************************************************/
4157 /* strcmps */
4158 /**********************************************************************/
4159
scheme_char_strlen(const mzchar * s)4160 intptr_t scheme_char_strlen(const mzchar *s)
4161 {
4162 intptr_t i;
4163 if ((intptr_t)s & 0x3)
4164 abort();
4165 for (i = 0; s[i]; i++) {
4166 }
4167 return i;
4168 }
4169
mz_char_strcmp(const char * who,const mzchar * str1,intptr_t l1,const mzchar * str2,intptr_t l2,int use_locale,int size_shortcut)4170 static int mz_char_strcmp(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2,
4171 int use_locale, int size_shortcut)
4172 {
4173 intptr_t endres;
4174
4175 if (size_shortcut && (l1 != l2))
4176 return 1;
4177
4178 if (use_locale) {
4179 reset_locale();
4180 if (locale_on) {
4181 return do_locale_comp(who, str1, l1, str2, l2, 0);
4182 }
4183 }
4184
4185 if (l1 > l2) {
4186 l1 = l2;
4187 endres = 1;
4188 } else {
4189 if (l2 > l1)
4190 endres = -1;
4191 else
4192 endres = 0;
4193 }
4194
4195 while (l1--) {
4196 unsigned int a, b;
4197
4198 a = *(str1++);
4199 b = *(str2++);
4200
4201 a = a - b;
4202 if (a)
4203 return a;
4204 }
4205
4206 return endres;
4207 }
4208
mz_char_strcmp_ci(const char * who,const mzchar * str1,intptr_t l1,const mzchar * str2,intptr_t l2,int use_locale,int size_shortcut)4209 static int mz_char_strcmp_ci(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2,
4210 int use_locale, int size_shortcut)
4211 {
4212 intptr_t p1, p2, sp1, sp2, a, b;
4213 mzchar spec1[SPECIAL_CASE_FOLD_MAX], spec2[SPECIAL_CASE_FOLD_MAX];
4214
4215 if (size_shortcut && (l1 != l2))
4216 return 1;
4217
4218 if (use_locale) {
4219 reset_locale();
4220 if (locale_on) {
4221 return do_locale_comp(who, str1, l1, str2, l2, 1);
4222 }
4223 }
4224
4225 p1 = sp1 = 0;
4226 p2 = sp2 = 0;
4227
4228 while (((p1 < l1) || sp1) && ((p2 < l2) || sp2)) {
4229 if (sp1) {
4230 a = spec1[--sp1];
4231 } else {
4232 a = str1[p1];
4233 if (scheme_isspecialcasing(a)) {
4234 int pos, i;
4235 pos = find_special_casing(a);
4236 sp1 = uchar_special_casings[pos + 7];
4237 pos = uchar_special_casings[pos + 8];
4238 for (i = sp1; i--; pos++) {
4239 spec1[i] = uchar_special_casing_data[pos];
4240 }
4241 a = spec1[--sp1];
4242 } else {
4243 a = scheme_tofold(a);
4244 }
4245 p1++;
4246 }
4247
4248 if (sp2) {
4249 b = spec2[--sp2];
4250 } else {
4251 b = str2[p2];
4252 if (scheme_isspecialcasing(b)) {
4253 int pos, i;
4254 pos = find_special_casing(b);
4255 sp2 = uchar_special_casings[pos + 7];
4256 pos = uchar_special_casings[pos + 8];
4257 for (i = sp2; i--; pos++) {
4258 spec2[i] = uchar_special_casing_data[pos];
4259 }
4260 b = spec2[--sp2];
4261 } else {
4262 b = scheme_tofold(b);
4263 }
4264 p2++;
4265 }
4266
4267 a = a - b;
4268 if (a)
4269 return a;
4270 }
4271
4272 return ((p1 < l1) || sp1) - ((p2 < l2) || sp2);
4273 }
4274
mz_strcmp(const char * who,unsigned char * str1,intptr_t l1,unsigned char * str2,intptr_t l2)4275 static int mz_strcmp(const char *who, unsigned char *str1, intptr_t l1, unsigned char *str2, intptr_t l2)
4276 {
4277 intptr_t endres;
4278
4279 if (l1 > l2) {
4280 l1 = l2;
4281 endres = 1;
4282 } else {
4283 if (l2 > l1)
4284 endres = -1;
4285 else
4286 endres = 0;
4287 }
4288
4289 while (l1--) {
4290 unsigned int a, b;
4291
4292 a = *(str1++);
4293 b = *(str2++);
4294
4295 a = a - b;
4296 if (a)
4297 return a;
4298 }
4299
4300 return endres;
4301 }
4302
scheme_string_compare(Scheme_Object * a,Scheme_Object * b)4303 int scheme_string_compare(Scheme_Object *a, Scheme_Object *b)
4304 {
4305 return mz_char_strcmp(NULL,
4306 SCHEME_CHAR_STR_VAL(a), SCHEME_CHAR_STRTAG_VAL(a),
4307 SCHEME_CHAR_STR_VAL(b), SCHEME_CHAR_STRTAG_VAL(b),
4308 0, 0);
4309 }
4310
scheme_bytes_compare(Scheme_Object * a,Scheme_Object * b)4311 int scheme_bytes_compare(Scheme_Object *a, Scheme_Object *b)
4312 {
4313 return mz_strcmp(NULL,
4314 (unsigned char *)SCHEME_BYTE_STR_VAL(a), SCHEME_BYTE_STRTAG_VAL(a),
4315 (unsigned char *)SCHEME_BYTE_STR_VAL(b), SCHEME_BYTE_STRTAG_VAL(b));
4316 }
4317
4318 /**********************************************************************/
4319 /* byte string conversion */
4320 /**********************************************************************/
4321
close_converter(Scheme_Object * o,void * data)4322 static void close_converter(Scheme_Object *o, void *data)
4323 {
4324 Scheme_Converter *c = (Scheme_Converter *)o;
4325
4326 if (!c->closed) {
4327 c->closed = 1;
4328 if (c->kind == mzICONV_KIND) {
4329 rktio_converter_close(scheme_rktio, c->cd);
4330 c->cd = NULL;
4331 }
4332 if (c->mref) {
4333 scheme_remove_managed(c->mref, (Scheme_Object *)c);
4334 c->mref = NULL;
4335 }
4336 }
4337 }
4338
scheme_open_converter(const char * from_e,const char * to_e)4339 Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
4340 {
4341 Scheme_Converter *c;
4342 rktio_converter_t *cd;
4343 int kind;
4344 int permissive, wtf;
4345 int need_regis = 1;
4346 Scheme_Custodian_Reference *mref;
4347
4348 if (!*to_e || !*from_e)
4349 reset_locale();
4350
4351 if ((!strcmp(from_e, "UTF-8")
4352 || !strcmp(from_e, "UTF-8-permissive")
4353 || (!*from_e && mzLOCALE_IS_UTF_8(current_locale_name)))
4354 && (!strcmp(to_e, "UTF-8")
4355 || (!*to_e && mzLOCALE_IS_UTF_8(current_locale_name)))) {
4356 /* Use the built-in UTF-8<->UTF-8 converter: */
4357 kind = mzUTF8_KIND;
4358 if (!strcmp(from_e, "UTF-8-permissive"))
4359 permissive = 0xFFFD;
4360 else
4361 permissive = 0;
4362 cd = NULL;
4363 need_regis = 0;
4364 wtf = 0;
4365 } else if ((!strcmp(from_e, "platform-UTF-8")
4366 || !strcmp(from_e, "platform-UTF-8-permissive"))
4367 && !strcmp(to_e, "platform-UTF-16")) {
4368 kind = mzUTF8_TO_UTF16_KIND;
4369 if (!strcmp(from_e, "platform-UTF-8-permissive"))
4370 permissive = 0xFFFD;
4371 else
4372 permissive = 0;
4373 cd = NULL;
4374 need_regis = 0;
4375 wtf = WIN_UTF16_AS_WTF16(1);
4376 } else if ((!strcmp(from_e, "WTF-8")
4377 || !strcmp(from_e, "WTF-8-permissive"))
4378 && !strcmp(to_e, "WTF-16")) {
4379 kind = mzUTF8_TO_UTF16_KIND;
4380 if (!strcmp(from_e, "WTF-8-permissive"))
4381 permissive = 0xFFFD;
4382 else
4383 permissive = 0;
4384 cd = NULL;
4385 need_regis = 0;
4386 wtf = 1;
4387 } else if (!strcmp(from_e, "platform-UTF-16")
4388 && !strcmp(to_e, "platform-UTF-8")) {
4389 kind = mzUTF16_TO_UTF8_KIND;
4390 permissive = 0;
4391 cd = NULL;
4392 need_regis = 0;
4393 wtf = WIN_UTF16_AS_WTF16(1);
4394 } else if (!strcmp(from_e, "WTF-16")
4395 && !strcmp(to_e, "WTF-8")) {
4396 kind = mzUTF16_TO_UTF8_KIND;
4397 permissive = 0;
4398 cd = NULL;
4399 need_regis = 0;
4400 wtf = 1;
4401 } else {
4402 char *tmp_from_e = NULL, *tmp_to_e = NULL;
4403
4404 if (!(rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED))
4405 return scheme_false;
4406
4407 if (!*from_e || !*to_e)
4408 reset_locale();
4409
4410 if (!*from_e) {
4411 tmp_from_e = rktio_locale_encoding(scheme_rktio);
4412 from_e = tmp_from_e;
4413 }
4414 if (!*to_e) {
4415 tmp_to_e = rktio_locale_encoding(scheme_rktio);
4416 to_e = tmp_to_e;
4417 }
4418 cd = rktio_converter_open(scheme_rktio, to_e, from_e);
4419
4420 if (tmp_from_e) free(tmp_from_e);
4421 if (tmp_to_e) free(tmp_to_e);
4422
4423 if (!cd)
4424 return scheme_false;
4425
4426 kind = mzICONV_KIND;
4427 permissive = 0;
4428 wtf = 0;
4429 }
4430
4431 c = MALLOC_ONE_TAGGED(Scheme_Converter);
4432 c->so.type = scheme_string_converter_type;
4433 c->closed = 0;
4434 c->kind = kind;
4435 c->permissive = permissive;
4436 c->wtf = wtf;
4437 c->cd = cd;
4438 if (!need_regis)
4439 mref = NULL;
4440 else
4441 mref = scheme_add_managed(NULL,
4442 (Scheme_Object *)c,
4443 close_converter,
4444 NULL, 1);
4445 c->mref = mref;
4446
4447 return (Scheme_Object *)c;
4448 }
4449
byte_string_open_converter(int argc,Scheme_Object ** argv)4450 static Scheme_Object *byte_string_open_converter(int argc, Scheme_Object **argv)
4451 {
4452 Scheme_Object *s1, *s2;
4453 char *from_e, *to_e;
4454
4455 if (!SCHEME_CHAR_STRINGP(argv[0]))
4456 scheme_wrong_contract("bytes-open-converter", "bytes?", 0, argc, argv);
4457 if (!SCHEME_CHAR_STRINGP(argv[1]))
4458 scheme_wrong_contract("bytes-open-converter", "bytes?", 1, argc, argv);
4459
4460 scheme_custodian_check_available(NULL, "bytes-open-converter", "converter");
4461
4462 s1 = scheme_char_string_to_byte_string(argv[0]);
4463 s2 = scheme_char_string_to_byte_string(argv[1]);
4464
4465 if (scheme_byte_string_has_null(s1))
4466 return scheme_false;
4467 if (scheme_byte_string_has_null(s2))
4468 return scheme_false;
4469
4470 from_e = SCHEME_BYTE_STR_VAL(s1);
4471 to_e = SCHEME_BYTE_STR_VAL(s2);
4472
4473 return scheme_open_converter(from_e, to_e);
4474 }
4475
convert_one(const char * who,int opos,int argc,Scheme_Object * argv[])4476 static Scheme_Object *convert_one(const char *who, int opos, int argc, Scheme_Object *argv[])
4477 {
4478 char *r, *instr;
4479 int status;
4480 intptr_t amt_read, amt_wrote;
4481 intptr_t istart, ifinish, ostart, ofinish;
4482 Scheme_Object *a[3], *status_sym;
4483 Scheme_Converter *c;
4484
4485 if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type))
4486 scheme_wrong_contract(who, "bytes-converter?", 0, argc, argv);
4487
4488 if (opos > 1) {
4489 if (!SCHEME_BYTE_STRINGP(argv[1]))
4490 scheme_wrong_contract(who, "bytes?", 1, argc, argv);
4491 scheme_get_substring_indices(who, argv[1], argc, argv, 2, 3, &istart, &ifinish);
4492 } else {
4493 istart = 0;
4494 ifinish = 0;
4495 }
4496
4497 if (argc > opos) {
4498 if (SCHEME_TRUEP(argv[opos])) {
4499 if (!SCHEME_MUTABLE_BYTE_STRINGP(argv[opos]))
4500 scheme_wrong_contract(who, "(and/c bytes? (not/c immutable?))", opos, argc, argv);
4501 r = SCHEME_BYTE_STR_VAL(argv[opos]);
4502 scheme_get_substring_indices(who, argv[opos], argc, argv, opos + 1, opos + 2, &ostart, &ofinish);
4503 } else {
4504 int ip;
4505 r = NULL;
4506 for (ip = opos + 1; ip <= opos + 2; ip++) {
4507 if (argc > ip) {
4508 int ok = 0;
4509 if (SCHEME_INTP(argv[ip]))
4510 ok = SCHEME_INT_VAL(argv[ip]) >= 0;
4511 else if (SCHEME_BIGNUMP(argv[ip]))
4512 ok = SCHEME_BIGPOS(argv[ip]);
4513 else if ((ip == opos + 2) && SCHEME_FALSEP(argv[ip]))
4514 ok = 1;
4515 if (!ok)
4516 scheme_wrong_contract(who,
4517 ((ip == opos + 2)
4518 ? "(or/c exact-nonnegative-integer? #f)"
4519 : "exact-nonnegative-integer?"),
4520 ip, argc, argv);
4521 }
4522 }
4523 if ((argc > opos + 2) && SCHEME_TRUEP(argv[opos + 2])) {
4524 Scheme_Object *delta;
4525 if (scheme_bin_lt(argv[opos + 2], argv[opos + 1])) {
4526 scheme_contract_error(who,
4527 "ending index is less than the starting index",
4528 "staring index", 1, argv[opos + 1],
4529 "ending index", 1, argv[opos + 2],
4530 NULL);
4531 }
4532 delta = scheme_bin_minus(argv[opos + 2], argv[opos + 1]);
4533 if (SCHEME_BIGNUMP(delta))
4534 ofinish = -1;
4535 else
4536 ofinish = SCHEME_INT_VAL(delta);
4537 ostart = 0;
4538 } else {
4539 ostart = 0;
4540 ofinish = -1;
4541 }
4542 }
4543 } else {
4544 r = NULL;
4545 ostart = 0;
4546 ofinish = -1;
4547 }
4548
4549 c = (Scheme_Converter *)argv[0];
4550 if (c->closed)
4551 scheme_contract_error(who, "converter is closed",
4552 "converter", 1, argv[0],
4553 NULL);
4554
4555 instr = ((opos > 1) ? SCHEME_BYTE_STR_VAL(argv[1]) : NULL);
4556
4557 if (c->kind == mzUTF16_TO_UTF8_KIND) {
4558 if (istart & 0x1) {
4559 /* Copy to word-align */
4560 char *c2;
4561 c2 = (char *)scheme_malloc_atomic(ifinish - istart);
4562 memcpy(c2, instr XFORM_OK_PLUS istart, ifinish - istart);
4563 ifinish = ifinish - istart;
4564 istart = 0;
4565 instr = c2;
4566 }
4567
4568 status = utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1,
4569 (unsigned char *)r, ostart, ofinish,
4570 &amt_read, &amt_wrote, 1, c->wtf);
4571
4572 amt_read -= (istart >> 1);
4573
4574 if (amt_read) {
4575 if (!r) {
4576 /* Need to allocate, then do it again: */
4577 r = (char *)scheme_malloc_atomic(amt_wrote + 1);
4578 utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1,
4579 (unsigned char *)r, ostart, ofinish,
4580 NULL, NULL, 1, c->wtf);
4581 r[amt_wrote] = 0;
4582 }
4583 amt_read <<= 1;
4584 }
4585
4586 /* We might get a -1 result because the input has an odd number of
4587 bytes, and 2nd+next-to-last bytes form an unpaired
4588 surrogate. In that case, the transformer normally needs one
4589 more byte: Windows is little-endian, so we need the byte to
4590 tell whether the surrogate is paired, and for all other
4591 platforms (where we assume that surrogates are paired), we need
4592 the byte to generate output. Technically, on a big-endian
4593 non-Windows machine, we could generate the first byte of UTF-8
4594 output and keep the byte as state, but we don't. */
4595
4596 if (status != -1) {
4597 if (amt_read < ((ifinish - istart) & ~0x1)) {
4598 /* Must have run out of output space */
4599 status = 1;
4600 } else {
4601 /* Read all of input --- but it wasn't really all if there
4602 was an odd number of bytes. */
4603 if ((ifinish - istart) & 0x1)
4604 status = -1;
4605 else
4606 status = 0;
4607 }
4608 }
4609 } else if (c->kind != mzICONV_KIND) {
4610 /* UTF-8 -> UTF-{8,16} "identity" converter, but maybe permissive */
4611 if (instr) {
4612 intptr_t _ostart, _ofinish;
4613 int utf16;
4614
4615 if (c->kind == mzUTF8_TO_UTF16_KIND) {
4616 _ostart = ostart;
4617 _ofinish = ofinish;
4618 if (_ostart & 0x1)
4619 _ostart++;
4620 _ostart >>= 1;
4621 if (_ofinish > 0)
4622 _ofinish >>= 1;
4623 utf16 = 1;
4624 } else {
4625 _ostart = ostart;
4626 _ofinish = ofinish;
4627 utf16 = 0;
4628 }
4629
4630 status = utf8_decode_x((unsigned char *)instr, istart, ifinish,
4631 (unsigned int *)r, _ostart, _ofinish,
4632 &amt_read, &amt_wrote,
4633 1, utf16, NULL, 1, c->permissive, c->wtf);
4634
4635 if (utf16) {
4636 _ostart <<= 1;
4637 amt_wrote <<= 1;
4638 if ((ostart & 0x1) && (amt_wrote > _ostart)) {
4639 /* Shift down one byte: */
4640 memmove(r XFORM_OK_PLUS ostart, r XFORM_OK_PLUS _ostart, amt_wrote - _ostart);
4641 }
4642 }
4643
4644 amt_read -= istart;
4645 amt_wrote -= _ostart;
4646 if (status == -3) {
4647 /* r is not NULL; ran out of room */
4648 status = 1;
4649 } else {
4650 if (amt_wrote) {
4651 if (!r) {
4652 /* Need to allocate, then do it again: */
4653 r = (char *)scheme_malloc_atomic(amt_wrote + 1);
4654 utf8_decode_x((unsigned char *)instr, istart, ifinish,
4655 (unsigned int *)r, ostart, _ofinish,
4656 NULL, NULL,
4657 1, utf16, NULL, 1, c->permissive, c->wtf);
4658 r[amt_wrote] = 0;
4659 }
4660 } else if (!r)
4661 r = "";
4662 if (status > 0)
4663 status = 0;
4664 }
4665 } else {
4666 r = "";
4667 status = 0;
4668 amt_read = 0;
4669 amt_wrote = 0;
4670 }
4671 } else {
4672 r = do_convert(c->cd, NULL, NULL, 0,
4673 instr, istart, ifinish-istart,
4674 r, ostart, ofinish-ostart,
4675 !r, /* grow? */
4676 0,
4677 (r ? 0 : 1), /* terminator */
4678 &amt_read, &amt_wrote,
4679 &status);
4680 }
4681
4682 if (status == 0) {
4683 /* Converted all input without error */
4684 status_sym = complete_symbol;
4685 } else if (status == 1) {
4686 /* Filled output, more input ready */
4687 status_sym = continues_symbol;
4688 } else if (status == -1) {
4689 /* Input ends in the middle of an encoding */
4690 status_sym = aborts_symbol;
4691 } else {
4692 /* Assert: status == -2 */
4693 /* Input has error (that won't be fixed by
4694 adding more characters */
4695 status_sym = error_symbol;
4696 }
4697
4698 if (argc <= opos) {
4699 a[0] = scheme_make_sized_byte_string(r, amt_wrote, 0);
4700 } else {
4701 a[0] = scheme_make_integer(amt_wrote);
4702 }
4703 if (opos > 1) {
4704 a[1] = scheme_make_integer(amt_read);
4705 a[2] = status_sym;
4706 return scheme_values(3, a);
4707 } else {
4708 a[1] = status_sym;
4709 return scheme_values(2, a);
4710 }
4711 }
4712
byte_string_convert(int argc,Scheme_Object * argv[])4713 static Scheme_Object *byte_string_convert(int argc, Scheme_Object *argv[])
4714 {
4715 return convert_one("bytes-convert", 4, argc, argv);
4716 }
4717
byte_string_convert_end(int argc,Scheme_Object * argv[])4718 static Scheme_Object *byte_string_convert_end(int argc, Scheme_Object *argv[])
4719 {
4720 return convert_one("bytes-convert-end", 1, argc, argv);
4721 }
4722
scheme_close_converter(Scheme_Object * conv)4723 void scheme_close_converter(Scheme_Object *conv)
4724 {
4725 close_converter(conv, NULL);
4726 }
4727
byte_string_close_converter(int argc,Scheme_Object ** argv)4728 static Scheme_Object *byte_string_close_converter(int argc, Scheme_Object **argv)
4729 {
4730 if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type))
4731 scheme_wrong_contract("bytes-close-converter", "bytes-converter?", 0, argc, argv);
4732
4733 scheme_close_converter(argv[0]);
4734
4735 return scheme_void;
4736 }
4737
4738 static Scheme_Object *
byte_converter_p(int argc,Scheme_Object * argv[])4739 byte_converter_p(int argc, Scheme_Object *argv[])
4740 {
4741 return (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type)
4742 ? scheme_true
4743 : scheme_false);
4744 }
4745
4746 /**********************************************************************/
4747 /* utf8 converter */
4748 /**********************************************************************/
4749
utf8_decode_x(const unsigned char * s,intptr_t start,intptr_t end,unsigned int * us,intptr_t dstart,intptr_t dend,intptr_t * ipos,intptr_t * jpos,char compact,char utf16,int * _state,int might_continue,int permissive,int wtf)4750 static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t end,
4751 unsigned int *us, intptr_t dstart, intptr_t dend,
4752 intptr_t *ipos, intptr_t *jpos,
4753 char compact, char utf16, int *_state,
4754 int might_continue, int permissive, int wtf)
4755 /* Results:
4756 non-negative => translation complete, = number of produced chars
4757 -1 => input ended in middle of encoding (only if might_continue)
4758 -2 => encoding error (only if permissive is 0)
4759 -3 => not enough output room
4760
4761 ipos & jpos are filled with ending positions (between [d]start
4762 and [d]end) before return, unless they are NULL.
4763
4764 compact => UTF-8 to UTF-8 or UTF-16 --- the latter if utf16;
4765 for utf16 and wtf, decode extended UTF-8 that allows surrogates
4766
4767 _state provides initial state and is filled with ending state;
4768 when it's not NULL, the us must be NULL
4769
4770 might_continue => allows -1 result without consuming characters
4771
4772 permissive is non-zero => use permissive as value for bad byte
4773 sequences. When generating UTF-8, this must be an ASCII character
4774 or U+FFFD. */
4775 {
4776 intptr_t i, j, oki;
4777 int failmode = -3, state;
4778 int init_doki;
4779 int nextbits, v;
4780 unsigned int sc;
4781 int pending_surrogate = 0;
4782
4783 if (_state) {
4784 state = (*_state) & 0x7;
4785 init_doki = (((*_state) >> 3) & 0x7);
4786 nextbits = ((((*_state) >> 6) & 0xF) << 2);
4787 /* Need v to detect 0xD800 through 0xDFFF
4788 Note that we have 22 bits to work with, which is
4789 is enough to detect > 0x10FFFF */
4790 v = ((*_state) >> 10);
4791 } else {
4792 state = 0;
4793 init_doki = 0;
4794 nextbits = 0;
4795 v = 0;
4796 }
4797
4798 /* In non-permissive mode, a negative result means ill-formed input.
4799 Permissive mode accepts anything and tries to convert it. In
4800 that case, the strategy for illegal sequences is to convert
4801 anything bad to the given "permissive" value. */
4802
4803 if (end < 0)
4804 end = strlen((char *)s);
4805 if (dend < 0)
4806 dend = 0x7FFFFFFF;
4807
4808 # define ENCFAIL i = oki; failmode = -2; break
4809
4810 oki = start;
4811 j = dstart;
4812 i = start;
4813 if (j < dend) {
4814 while (i < end) {
4815 sc = s[i];
4816 if (sc < 0x80) {
4817 if (state) {
4818 /* In a sequence, but didn't continue */
4819 state = 0;
4820 nextbits = 0;
4821 if (permissive) {
4822 v = permissive;
4823 i = oki;
4824 j += init_doki;
4825 } else {
4826 ENCFAIL;
4827 }
4828 } else {
4829 v = sc;
4830 }
4831 } else if ((sc & 0xC0) == 0x80) {
4832 /* Continues a sequence ... */
4833 if (state) {
4834 /* ... and we're in one ... */
4835 if (!nextbits || (sc & nextbits)) {
4836 /* and we have required bits. */
4837 v = (v << 6) + (sc & 0x3F);
4838 nextbits = 0;
4839 --state;
4840 if (state) {
4841 i++;
4842 continue;
4843 }
4844 /* We finished. One last check: */
4845 if ((((v >= 0xD800) && (v <= 0xDFFF))
4846 || (v > 0x10FFFF))
4847 && (!wtf
4848 || !utf16
4849 /* If WTF-16, just apply upper-limit check */
4850 || (v > 0x10FFFF))) {
4851 /* UTF-16 surrogates or other illegal code units */
4852 if (permissive) {
4853 v = permissive;
4854 j += init_doki;
4855 i = oki;
4856 } else {
4857 ENCFAIL;
4858 }
4859 }
4860 } else {
4861 /* ... but we're missing required bits. */
4862 state = 0;
4863 nextbits = 0;
4864 if (permissive) {
4865 v = permissive;
4866 j += init_doki;
4867 i = oki;
4868 } else {
4869 ENCFAIL;
4870 }
4871 }
4872 } else {
4873 /* ... but we're not in one */
4874 if (permissive) {
4875 v = permissive;
4876 } else {
4877 ENCFAIL;
4878 }
4879 }
4880 } else if (state) {
4881 /* bad: already in a sequence */
4882 state = 0;
4883 if (permissive) {
4884 v = permissive;
4885 i = oki;
4886 j += init_doki;
4887 } else {
4888 ENCFAIL;
4889 }
4890 } else {
4891 if ((sc & 0xE0) == 0xC0) {
4892 if (sc & 0x1E) {
4893 state = 1;
4894 v = (sc & 0x1F);
4895 i++;
4896 continue;
4897 }
4898 /* else too small */
4899 } else if ((sc & 0xF0) == 0xE0) {
4900 state = 2;
4901 v = (sc & 0xF);
4902 if (!v)
4903 nextbits = 0x20;
4904 i++;
4905 continue;
4906 } else if ((sc & 0xF8) == 0xF0) {
4907 v = (sc & 0x7);
4908 if (v <= 4) {
4909 state = 3;
4910 if (!v)
4911 nextbits = 0x30;
4912 i++;
4913 continue;
4914 }
4915 /* Else will be larger than 0x10FFFF, so fail */
4916 }
4917 /* Too small, or 0xFF or 0xFe, or start of a 5- or 6-byte sequence */
4918 if (permissive) {
4919 v = permissive;
4920 } else {
4921 ENCFAIL;
4922 }
4923 }
4924
4925 /* If we get here, we're supposed to output v */
4926
4927 if (compact) {
4928 if (utf16) {
4929 if (v > 0xFFFF) {
4930 if (pending_surrogate) {
4931 if (us)
4932 ((unsigned short *)us)[j] = pending_surrogate;
4933 j++; /* Accept previously written unpaired surrogate */
4934 pending_surrogate = 0;
4935 }
4936 if (j + 1 >= dend)
4937 break;
4938 if (us) {
4939 v -= 0x10000;
4940 ((unsigned short *)us)[j] = 0xD800 | ((v >> 10) & 0x3FF);
4941 ((unsigned short *)us)[j+1] = 0xDC00 | (v & 0x3FF);
4942 }
4943 j++;
4944 } else if (wtf) {
4945 /* We allow a surrogate by itself, but don't allow
4946 a 0xDC00 after a 0xD800, otherwise multiple encodings can
4947 map to the same thing. */
4948 if ((v >= 0xD800) && (v <= 0xDFFF)) {
4949 if (pending_surrogate && ((v & 0xDC00) == 0xDC00)) {
4950 /* This looks like a surrogate pair, so disallow it. */
4951 if (permissive) {
4952 /* We need to fill in 6 permissive substitutions,
4953 one for each input byte. If we can't put all 6,
4954 then don't use any input. */
4955 if (j + 5 >= dend) {
4956 break;
4957 } else {
4958 int p;
4959 if (us) {
4960 for (p = 0; p < 5; p++) {
4961 if (j + p >= dend)
4962 break;
4963 ((unsigned short *)us)[j+p] = permissive;
4964 }
4965 }
4966 j += 5;
4967 v = permissive;
4968 }
4969 } else {
4970 ENCFAIL;
4971 }
4972 pending_surrogate = 0;
4973 } else {
4974 if (pending_surrogate) {
4975 if (us)
4976 ((unsigned short *)us)[j] = pending_surrogate;
4977 j++; /* Accept previousy written unpaired surrogate */
4978 pending_surrogate = 0;
4979 if (j >= dend)
4980 break;
4981 }
4982 if ((v & 0xDC00) == 0xD800)
4983 pending_surrogate = v;
4984 else
4985 pending_surrogate = 0;
4986 }
4987 } else {
4988 if (pending_surrogate) {
4989 if (us)
4990 ((unsigned short *)us)[j] = pending_surrogate;
4991 j++; /* Accept previousy written unpaired surrogate */
4992 pending_surrogate = 0;
4993 if (j >= dend)
4994 break;
4995 }
4996 }
4997
4998 if (pending_surrogate)
4999 --j; /* don't accept unpaired surrogate, yet */
5000 else if (us)
5001 ((unsigned short *)us)[j] = v;
5002 } else {
5003 if (us)
5004 ((unsigned short *)us)[j] = v;
5005 }
5006 } else {
5007 intptr_t delta;
5008 delta = (i - oki);
5009 if (delta) {
5010 if (j + delta + 1 < dend) {
5011 if (us)
5012 memcpy(((char *)us) + j, s + oki, delta + 1);
5013 j += delta;
5014 } else
5015 break;
5016 } else if (v == 0xFFFD) {
5017 if (j + 3 < dend) {
5018 if (us) {
5019 ((unsigned char *)us)[j] = 0xEF;
5020 ((unsigned char *)us)[j+1] = 0xBF;
5021 ((unsigned char *)us)[j+2] = 0xBD;
5022 }
5023 j += 2;
5024 } else
5025 break;
5026 } else if (us) {
5027 ((unsigned char *)us)[j] = v;
5028 }
5029 }
5030 } else if (us) {
5031 us[j] = v;
5032 }
5033 j++;
5034 i++;
5035 oki = i;
5036 init_doki = 0;
5037 if (j >= dend)
5038 break;
5039 }
5040 }
5041
5042 if (_state) {
5043 if (!state)
5044 *_state = 0;
5045 else
5046 *_state = (state
5047 | (((end - oki) + init_doki) << 3)
5048 | ((nextbits >> 2) << 6)
5049 | (v << 10));
5050 } else if (state) {
5051 if (might_continue || !permissive) {
5052 failmode = -1;
5053 i = end - 1; /* to ensure that failmode is returned */
5054 } else if (permissive) {
5055 if (pending_surrogate) {
5056 /* Unpaired surrogate before permissive replacements */
5057 if (utf16 && (j < dend)) {
5058 if (us)
5059 ((unsigned short *)us)[j] = pending_surrogate;
5060 j++;
5061 }
5062 pending_surrogate = 0;
5063 }
5064 for (i = oki; i < end; i++) {
5065 if (j < dend) {
5066 if (us) {
5067 if (compact) {
5068 if (utf16)
5069 ((unsigned short *)us)[j] = permissive;
5070 else
5071 ((unsigned char *)us)[j] = permissive;
5072 } else
5073 us[j] = permissive;
5074 }
5075 j++;
5076 } else
5077 break;
5078 }
5079 oki = i;
5080 }
5081 }
5082
5083 if (pending_surrogate) {
5084 if (!might_continue) {
5085 /* Accept unpaired surrogate at end of input */
5086 if (j < dend) {
5087 if (us)
5088 ((unsigned short *)us)[j] = pending_surrogate;
5089 j++;
5090 }
5091 } else {
5092 oki -= 3;
5093 }
5094 }
5095
5096 if (ipos)
5097 *ipos = oki;
5098 if (jpos)
5099 *jpos = j;
5100
5101 if (i < end)
5102 return failmode;
5103
5104 if (pending_surrogate) {
5105 /* input must have ended right after surrogate */
5106 return -1;
5107 }
5108
5109 return j - dstart;
5110 }
5111
scheme_utf8_decode(const unsigned char * s,intptr_t start,intptr_t end,unsigned int * us,intptr_t dstart,intptr_t dend,intptr_t * ipos,char utf16,int permissive)5112 intptr_t scheme_utf8_decode(const unsigned char *s, intptr_t start, intptr_t end,
5113 unsigned int *us, intptr_t dstart, intptr_t dend,
5114 intptr_t *ipos, char utf16, int permissive)
5115 {
5116 return utf8_decode_x(s, start, end, us, dstart, dend,
5117 ipos, NULL, utf16, utf16, NULL, 0, permissive, WIN_UTF16_AS_WTF16(utf16));
5118 }
5119
scheme_utf8_decode_offset_prefix(const unsigned char * s,intptr_t start,intptr_t end,unsigned int * us,intptr_t dstart,intptr_t dend,intptr_t * ipos,char utf16,int permissive)5120 intptr_t scheme_utf8_decode_offset_prefix(const unsigned char *s, intptr_t start, intptr_t end,
5121 unsigned int *us, intptr_t dstart, intptr_t dend,
5122 intptr_t *ipos, char utf16, int permissive)
5123 {
5124 return utf8_decode_x(s, start, end, us, dstart, dend,
5125 ipos, NULL, utf16, utf16, NULL, 1, permissive, WIN_UTF16_AS_WTF16(utf16));
5126 }
5127
scheme_utf8_decode_as_prefix(const unsigned char * s,intptr_t start,intptr_t end,unsigned int * us,intptr_t dstart,intptr_t dend,intptr_t * ipos,char utf16,int permissive)5128 intptr_t scheme_utf8_decode_as_prefix(const unsigned char *s, intptr_t start, intptr_t end,
5129 unsigned int *us, intptr_t dstart, intptr_t dend,
5130 intptr_t *ipos, char utf16, int permissive)
5131 /* Always returns number of read characters, not error codes. */
5132 {
5133 intptr_t opos;
5134 utf8_decode_x(s, start, end, us, dstart, dend,
5135 ipos, &opos, utf16, utf16, NULL, 1, permissive, WIN_UTF16_AS_WTF16(utf16));
5136 return opos - dstart;
5137 }
5138
scheme_utf8_decode_all(const unsigned char * s,intptr_t len,unsigned int * us,int permissive)5139 intptr_t scheme_utf8_decode_all(const unsigned char *s, intptr_t len, unsigned int *us, int permissive)
5140 {
5141 return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 0, permissive, 0);
5142 }
5143
scheme_utf8_decode_prefix(const unsigned char * s,intptr_t len,unsigned int * us,int permissive)5144 intptr_t scheme_utf8_decode_prefix(const unsigned char *s, intptr_t len, unsigned int *us, int permissive)
5145 /* us != NULL */
5146 {
5147 {
5148 /* Try fast path (all ASCII) */
5149 intptr_t i;
5150 for (i = 0; i < len; i++) {
5151 if (s[i] < 128)
5152 us[i] = s[i];
5153 else
5154 break;
5155 }
5156 if (i == len)
5157 return len;
5158 }
5159
5160 return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 1, permissive, 0);
5161 }
5162
scheme_utf8_decode_to_buffer_len(const unsigned char * s,intptr_t len,mzchar * buf,intptr_t blen,intptr_t * _ulen)5163 mzchar *scheme_utf8_decode_to_buffer_len(const unsigned char *s, intptr_t len,
5164 mzchar *buf, intptr_t blen, intptr_t *_ulen)
5165 {
5166 intptr_t ulen;
5167
5168 ulen = utf8_decode_x(s, 0, len, NULL, 0, -1,
5169 NULL, NULL, 0, 0,
5170 NULL, 0, 0, 0);
5171 if (ulen < 0)
5172 return NULL;
5173 if (ulen + 1 > blen) {
5174 buf = (mzchar *)scheme_malloc_atomic((ulen + 1) * sizeof(mzchar));
5175 }
5176 utf8_decode_x(s, 0, len, buf, 0, -1,
5177 NULL, NULL, 0, 0,
5178 NULL, 0, 0, 0);
5179 buf[ulen] = 0;
5180 *_ulen = ulen;
5181 return buf;
5182 }
5183
scheme_utf8_decode_to_buffer(const unsigned char * s,intptr_t len,mzchar * buf,intptr_t blen)5184 mzchar *scheme_utf8_decode_to_buffer(const unsigned char *s, intptr_t len,
5185 mzchar *buf, intptr_t blen)
5186 {
5187 intptr_t ulen;
5188 return scheme_utf8_decode_to_buffer_len(s, len, buf, blen, &ulen);
5189 }
5190
scheme_utf8_decode_count(const unsigned char * s,intptr_t start,intptr_t end,int * _state,int might_continue,int permissive)5191 intptr_t scheme_utf8_decode_count(const unsigned char *s, intptr_t start, intptr_t end,
5192 int *_state, int might_continue, int permissive)
5193 {
5194 intptr_t pos = 0;
5195
5196 if (!_state || !*_state) {
5197 /* Try fast path (all ASCII): */
5198 intptr_t i;
5199 for (i = start; i < end; i++) {
5200 if (s[i] > 127)
5201 break;
5202 }
5203 if (i == end)
5204 return end - start;
5205 }
5206
5207 utf8_decode_x(s, start, end,
5208 NULL, 0, -1,
5209 NULL, &pos,
5210 0, 0, _state,
5211 might_continue, permissive, 0);
5212
5213 return pos;
5214 }
5215
utf8_encode_x(const unsigned int * us,intptr_t start,intptr_t end,unsigned char * s,intptr_t dstart,intptr_t dend,intptr_t * _ipos,intptr_t * _opos,char utf16,int wtf)5216 static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t end,
5217 unsigned char *s, intptr_t dstart, intptr_t dend,
5218 intptr_t *_ipos, intptr_t *_opos, char utf16, int wtf)
5219 /* Results:
5220 -1 => input ended in the middle of an encoding - only when utf16 and _opos
5221 non-negative => reports number of bytes/code-units produced */
5222 {
5223 intptr_t i, j, done = start;
5224
5225 if (dend < 0)
5226 dend = 0x7FFFFFFF;
5227
5228 if (!s) {
5229 unsigned int wc;
5230 j = 0;
5231 for (i = start; i < end; i++) {
5232 if (utf16) {
5233 wc = ((unsigned short *)us)[i];
5234 if ((wc & 0xF800) == 0xD800) {
5235 /* Unparse surrogates. We assume that the surrogates are
5236 well formed, unless this is Windows or if we're at the
5237 end and _opos is 0. The well-formedness assumption was
5238 probably not a good idea, but note that it's explicitly
5239 documented to behave that way. */
5240 # define UNPAIRED_MASK(wtf) (wtf ? 0xFC00 : 0xF800)
5241 if (((i + 1) == end) && ((wc & UNPAIRED_MASK(wtf)) == 0xD800) && _opos) {
5242 /* Ended in the middle of a surrogate pair */
5243 *_opos = j;
5244 if (_ipos)
5245 *_ipos = i;
5246 return -1;
5247 }
5248 if (wtf && ((wc & 0xFC00) != 0xD800)) {
5249 /* Count as one */
5250 } else if (wtf && ((i + 1 >= end)
5251 || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00))) {
5252 } else {
5253 i++;
5254 wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF);
5255 wc += 0x10000;
5256 }
5257 }
5258 } else {
5259 wc = us[i];
5260 }
5261 if (wc < 0x80) {
5262 j += 1;
5263 } else if (wc < 0x800) {
5264 j += 2;
5265 } else if (wc < 0x10000) {
5266 j += 3;
5267 } else if (wc < 0x200000) {
5268 j += 4;
5269 } else if (wc < 0x4000000) {
5270 j += 5;
5271 } else {
5272 j += 6;
5273 }
5274 }
5275 if (_ipos)
5276 *_ipos = i;
5277 if (_opos)
5278 *_opos = j + dstart;
5279 return j;
5280 } else {
5281 unsigned int wc;
5282 j = dstart;
5283 for (i = start; i < end; i++) {
5284 if (utf16) {
5285 wc = ((unsigned short *)us)[i];
5286 if ((wc & 0xF800) == 0xD800) {
5287 /* Unparse surrogates. We assume that the surrogates are
5288 well formed on non-Windows platforms, but when _opos,
5289 we detect ending in the middle of an surrogate pair. */
5290 if (((i + 1) == end) && ((wc & UNPAIRED_MASK(wtf)) == 0xD800) && _opos) {
5291 /* Ended in the middle of a surrogate pair */
5292 *_opos = j;
5293 if (_ipos)
5294 *_ipos = i;
5295 return -1;
5296 }
5297 if (wtf && ((wc & 0xFC00) != 0xD800)) {
5298 /* Let the misplaced surrogate through */
5299 } else if (wtf && ((i + 1 >= end)
5300 || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00))) {
5301 /* Let the misplaced surrogate through */
5302 } else {
5303 i++;
5304 wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF);
5305 wc += 0x10000;
5306 }
5307 }
5308 } else {
5309 wc = us[i];
5310 }
5311
5312 if (wc < 0x80) {
5313 if (j + 1 > dend)
5314 break;
5315 s[j++] = wc;
5316 } else if (wc < 0x800) {
5317 if (j + 2 > dend)
5318 break;
5319 s[j++] = 0xC0 | ((wc & 0x7C0) >> 6);
5320 s[j++] = 0x80 | (wc & 0x3F);
5321 } else if (wc < 0x10000) {
5322 if (j + 3 > dend)
5323 break;
5324 s[j++] = 0xE0 | ((wc & 0xF000) >> 12);
5325 s[j++] = 0x80 | ((wc & 0x0FC0) >> 6);
5326 s[j++] = 0x80 | (wc & 0x3F);
5327 } else if (wc < 0x200000) {
5328 if (j + 4 > dend)
5329 break;
5330 s[j++] = 0xF0 | ((wc & 0x1C0000) >> 18);
5331 s[j++] = 0x80 | ((wc & 0x03F000) >> 12);
5332 s[j++] = 0x80 | ((wc & 0x000FC0) >> 6);
5333 s[j++] = 0x80 | (wc & 0x3F);
5334 } else if (wc < 0x4000000) {
5335 if (j + 5 > dend)
5336 break;
5337 s[j++] = 0xF8 | ((wc & 0x3000000) >> 24);
5338 s[j++] = 0x80 | ((wc & 0x0FC0000) >> 18);
5339 s[j++] = 0x80 | ((wc & 0x003F000) >> 12);
5340 s[j++] = 0x80 | ((wc & 0x0000FC0) >> 6);
5341 s[j++] = 0x80 | (wc & 0x3F);
5342 } else {
5343 if (j + 6 > dend)
5344 break;
5345 s[j++] = 0xFC | ((wc & 0x40000000) >> 30);
5346 s[j++] = 0x80 | ((wc & 0x3F000000) >> 24);
5347 s[j++] = 0x80 | ((wc & 0x00FC0000) >> 18);
5348 s[j++] = 0x80 | ((wc & 0x0003F000) >> 12);
5349 s[j++] = 0x80 | ((wc & 0x00000FC0) >> 6);
5350 s[j++] = 0x80 | (wc & 0x3F);
5351 }
5352 done = i;
5353 }
5354 if (_ipos)
5355 *_ipos = done;
5356 if (_opos)
5357 *_opos = j;
5358 return j - dstart;
5359 }
5360 }
5361
scheme_utf8_encode(const unsigned int * us,intptr_t start,intptr_t end,unsigned char * s,intptr_t dstart,char utf16)5362 intptr_t scheme_utf8_encode(const unsigned int *us, intptr_t start, intptr_t end,
5363 unsigned char *s, intptr_t dstart,
5364 char utf16)
5365 {
5366 return utf8_encode_x(us, start, end,
5367 s, dstart, -1,
5368 NULL, NULL, utf16, WIN_UTF16_AS_WTF16(utf16));
5369 }
5370
scheme_utf8_encode_all(const unsigned int * us,intptr_t len,unsigned char * s)5371 intptr_t scheme_utf8_encode_all(const unsigned int *us, intptr_t len, unsigned char *s)
5372 {
5373 return utf8_encode_x(us, 0, len, s, 0, -1, NULL, NULL, 0 /* utf16 */, 0);
5374 }
5375
scheme_utf8_encode_to_buffer_len(const mzchar * s,intptr_t len,char * buf,intptr_t blen,intptr_t * _slen)5376 char *scheme_utf8_encode_to_buffer_len(const mzchar *s, intptr_t len,
5377 char *buf, intptr_t blen,
5378 intptr_t *_slen)
5379 {
5380 intptr_t slen;
5381
5382 /* ASCII with len < blen is a common case: */
5383 if (len < blen) {
5384 for (slen = 0; slen < len; slen++) {
5385 if (s[slen] > 127)
5386 break;
5387 else
5388 buf[slen] = s[slen];
5389 }
5390 if (slen == len) {
5391 buf[slen] = 0;
5392 *_slen = slen;
5393 return buf;
5394 }
5395 }
5396
5397 slen = utf8_encode_x(s, 0, len, NULL, 0, -1, NULL, NULL, 0, 0);
5398 if (slen + 1 > blen) {
5399 buf = (char *)scheme_malloc_atomic(slen + 1);
5400 }
5401 utf8_encode_x(s, 0, len, (unsigned char *)buf, 0, -1, NULL, NULL, 0, 0);
5402 buf[slen] = 0;
5403 *_slen = slen;
5404 return buf;
5405 }
5406
scheme_utf8_encode_to_buffer(const mzchar * s,intptr_t len,char * buf,intptr_t blen)5407 char *scheme_utf8_encode_to_buffer(const mzchar *s, intptr_t len,
5408 char *buf, intptr_t blen)
5409 {
5410 intptr_t slen;
5411 return scheme_utf8_encode_to_buffer_len(s, len, buf, blen, &slen);
5412 }
5413
scheme_ucs4_to_utf16(const mzchar * text,intptr_t start,intptr_t end,unsigned short * buf,intptr_t bufsize,intptr_t * ulen,intptr_t term_size)5414 unsigned short *scheme_ucs4_to_utf16(const mzchar *text, intptr_t start, intptr_t end,
5415 unsigned short *buf, intptr_t bufsize,
5416 intptr_t *ulen, intptr_t term_size)
5417 {
5418 mzchar v;
5419 intptr_t extra, i, j;
5420 unsigned short *utf16;
5421
5422 /* Count characters that fall outside UCS-2: */
5423 for (i = start, extra = 0; i < end; i++) {
5424 if (text[i] > 0xFFFF)
5425 extra++;
5426 }
5427
5428 if ((end - start) + extra + term_size < bufsize)
5429 utf16 = buf;
5430 else
5431 utf16 = (unsigned short *)scheme_malloc_atomic(sizeof(unsigned short) * ((end - start) + extra + term_size));
5432
5433 for (i = start, j = 0; i < end; i++) {
5434 v = text[i];
5435 if (v > 0xFFFF) {
5436 v -= 0x10000;
5437 utf16[j++] = 0xD800 | ((v >> 10) & 0x3FF);
5438 utf16[j++] = 0xDC00 | (v & 0x3FF);
5439 } else
5440 utf16[j++] = v;
5441 }
5442
5443 *ulen = j;
5444
5445 return utf16;
5446 }
5447
scheme_utf16_to_ucs4(const unsigned short * text,intptr_t start,intptr_t end,mzchar * buf,intptr_t bufsize,intptr_t * ulen,intptr_t term_size)5448 mzchar *scheme_utf16_to_ucs4(const unsigned short *text, intptr_t start, intptr_t end,
5449 mzchar *buf, intptr_t bufsize,
5450 intptr_t *ulen, intptr_t term_size)
5451 {
5452 int wc;
5453 intptr_t i, j;
5454
5455 for (i = start, j = 0; i < end; i++) {
5456 wc = text[i];
5457 if ((wc & 0xF800) == 0xD800) {
5458 i++;
5459 }
5460 j++;
5461 }
5462
5463 if (j + term_size >= bufsize)
5464 buf = (mzchar *)scheme_malloc_atomic((j + term_size) * sizeof(mzchar));
5465
5466 for (i = start, j = 0; i < end; i++) {
5467 wc = text[i];
5468 if ((wc & 0xF800) == 0xD800) {
5469 i++;
5470 wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)text)[i]) & 0x3FF);
5471 wc += 0x10000;
5472 }
5473 buf[j++] = wc;
5474 }
5475
5476 *ulen = j;
5477
5478 return buf;
5479 }
5480
5481 /**********************************************************************/
5482 /* Precise GC */
5483 /**********************************************************************/
5484
5485 #ifdef MZ_PRECISE_GC
5486
5487 START_XFORM_SKIP;
5488
5489 #include "mzmark_string.inc"
5490
register_traversers(void)5491 static void register_traversers(void)
5492 {
5493 GC_REG_TRAV(scheme_string_converter_type, mark_string_convert);
5494 }
5495
5496 END_XFORM_SKIP;
5497
5498 #endif
5499