1 #include "schpriv.h"
2 #include "racket_version.h"
3 #include "schrktio.h"
4 #include <string.h>
5 #include <ctype.h>
6 #ifdef NO_ERRNO_GLOBAL
7 # define errno -1
8 #else
9 # include <errno.h>
10 #endif
11 
12 #ifndef SCHEME_PLATFORM_LIBRARY_SUBPATH
13 # include "schsys.h"
14 #endif
15 
16 #ifndef SPLS_SUFFIX
17 # define SPLS_SUFFIX ""
18 #endif
19 
20 #include "schustr.inc"
21 
22 #ifdef MACOS_UNICODE_SUPPORT
23 # define mzLOCALE_IS_UTF_8(s) (!s || !(*s))
24 #endif
25 #ifdef WINDOWS_UNICODE_SUPPORT
26 # define mzLOCALE_IS_UTF_8(s) (!s || !(*s))
27 #endif
28 #ifndef mzLOCALE_IS_UTF_8
29 # define mzLOCALE_IS_UTF_8(s) (!(rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED))
30 #endif
31 
32 #ifdef WINDOWS_UNICODE_SUPPORT
33 # define WIN_UTF16_AS_WTF16(utf16) utf16
34 #else
35 # define WIN_UTF16_AS_WTF16(utf16) 0
36 #endif
37 
38 #define mzICONV_KIND 0
39 #define mzUTF8_KIND 1
40 #define mzUTF8_TO_UTF16_KIND 2
41 #define mzUTF16_TO_UTF8_KIND 3
42 
43 typedef struct Scheme_Converter {
44   Scheme_Object so;
45   short closed;
46   short kind;
47   rktio_converter_t *cd;
48   int permissive, wtf;
49   Scheme_Custodian_Reference *mref;
50 } Scheme_Converter;
51 
52 Scheme_Object *scheme_system_type_proc;
53 
54 static Scheme_Object *make_string (int argc, Scheme_Object *argv[]);
55 static Scheme_Object *string (int argc, Scheme_Object *argv[]);
56 static Scheme_Object *string_p (int argc, Scheme_Object *argv[]);
57 static Scheme_Object *string_length (int argc, Scheme_Object *argv[]);
58 static Scheme_Object *string_eq (int argc, Scheme_Object *argv[]);
59 static Scheme_Object *string_locale_eq (int argc, Scheme_Object *argv[]);
60 static Scheme_Object *string_ci_eq (int argc, Scheme_Object *argv[]);
61 static Scheme_Object *string_locale_ci_eq (int argc, Scheme_Object *argv[]);
62 static Scheme_Object *string_lt (int argc, Scheme_Object *argv[]);
63 static Scheme_Object *string_locale_lt (int argc, Scheme_Object *argv[]);
64 static Scheme_Object *string_gt (int argc, Scheme_Object *argv[]);
65 static Scheme_Object *string_locale_gt (int argc, Scheme_Object *argv[]);
66 static Scheme_Object *string_lt_eq (int argc, Scheme_Object *argv[]);
67 static Scheme_Object *string_gt_eq (int argc, Scheme_Object *argv[]);
68 static Scheme_Object *string_ci_lt (int argc, Scheme_Object *argv[]);
69 static Scheme_Object *string_locale_ci_lt (int argc, Scheme_Object *argv[]);
70 static Scheme_Object *string_ci_gt (int argc, Scheme_Object *argv[]);
71 static Scheme_Object *string_locale_ci_gt (int argc, Scheme_Object *argv[]);
72 static Scheme_Object *string_ci_lt_eq (int argc, Scheme_Object *argv[]);
73 static Scheme_Object *string_ci_gt_eq (int argc, Scheme_Object *argv[]);
74 static Scheme_Object *string_upcase (int argc, Scheme_Object *argv[]);
75 static Scheme_Object *string_downcase (int argc, Scheme_Object *argv[]);
76 static Scheme_Object *string_titlecase (int argc, Scheme_Object *argv[]);
77 static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[]);
78 static Scheme_Object *string_locale_upcase (int argc, Scheme_Object *argv[]);
79 static Scheme_Object *string_locale_downcase (int argc, Scheme_Object *argv[]);
80 static Scheme_Object *substring (int argc, Scheme_Object *argv[]);
81 static Scheme_Object *string_append (int argc, Scheme_Object *argv[]);
82 static Scheme_Object *string_append_immutable (int argc, Scheme_Object *argv[]);
83 static Scheme_Object *string_to_list (int argc, Scheme_Object *argv[]);
84 static Scheme_Object *list_to_string (int argc, Scheme_Object *argv[]);
85 static Scheme_Object *string_copy (int argc, Scheme_Object *argv[]);
86 static Scheme_Object *string_copy_bang (int argc, Scheme_Object *argv[]);
87 static Scheme_Object *string_fill (int argc, Scheme_Object *argv[]);
88 static Scheme_Object *string_to_immutable (int argc, Scheme_Object *argv[]);
89 static Scheme_Object *string_normalize_c (int argc, Scheme_Object *argv[]);
90 static Scheme_Object *string_normalize_kc (int argc, Scheme_Object *argv[]);
91 static Scheme_Object *string_normalize_d (int argc, Scheme_Object *argv[]);
92 static Scheme_Object *string_normalize_kd (int argc, Scheme_Object *argv[]);
93 
94 static Scheme_Object *make_shared_byte_string (int argc, Scheme_Object *argv[]);
95 static Scheme_Object *shared_byte_string (int argc, Scheme_Object *argv[]);
96 
97 static Scheme_Object *make_byte_string (int argc, Scheme_Object *argv[]);
98 static Scheme_Object *byte_string (int argc, Scheme_Object *argv[]);
99 static Scheme_Object *byte_p (int argc, Scheme_Object *argv[]);
100 static Scheme_Object *byte_string_p (int argc, Scheme_Object *argv[]);
101 static Scheme_Object *byte_string_length (int argc, Scheme_Object *argv[]);
102 static Scheme_Object *byte_string_eq (int argc, Scheme_Object *argv[]);
103 static Scheme_Object *byte_string_lt (int argc, Scheme_Object *argv[]);
104 static Scheme_Object *byte_string_gt (int argc, Scheme_Object *argv[]);
105 static Scheme_Object *byte_substring (int argc, Scheme_Object *argv[]);
106 static Scheme_Object *byte_string_append (int argc, Scheme_Object *argv[]);
107 static Scheme_Object *byte_string_to_list (int argc, Scheme_Object *argv[]);
108 static Scheme_Object *list_to_byte_string (int argc, Scheme_Object *argv[]);
109 static Scheme_Object *byte_string_copy (int argc, Scheme_Object *argv[]);
110 static Scheme_Object *byte_string_copy_bang (int argc, Scheme_Object *argv[]);
111 static Scheme_Object *byte_string_fill (int argc, Scheme_Object *argv[]);
112 static Scheme_Object *byte_string_to_immutable (int argc, Scheme_Object *argv[]);
113 
114 static Scheme_Object *byte_string_utf8_index (int argc, Scheme_Object *argv[]);
115 static Scheme_Object *byte_string_utf8_ref (int argc, Scheme_Object *argv[]);
116 static Scheme_Object *byte_string_utf8_length (int argc, Scheme_Object *argv[]);
117 
118 static Scheme_Object *byte_string_to_char_string (int argc, Scheme_Object *argv[]);
119 static Scheme_Object *byte_string_to_char_string_locale (int argc, Scheme_Object *argv[]);
120 static Scheme_Object *byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[]);
121 static Scheme_Object *char_string_to_byte_string (int argc, Scheme_Object *argv[]);
122 static Scheme_Object *char_string_to_byte_string_locale (int argc, Scheme_Object *argv[]);
123 static Scheme_Object *char_string_to_byte_string_latin1 (int argc, Scheme_Object *argv[]);
124 static Scheme_Object *char_string_utf8_length (int argc, Scheme_Object *argv[]);
125 
126 static Scheme_Object *version(int argc, Scheme_Object *argv[]);
127 static Scheme_Object *format(int argc, Scheme_Object *argv[]);
128 static Scheme_Object *sch_printf(int argc, Scheme_Object *argv[]);
129 static Scheme_Object *sch_eprintf(int argc, Scheme_Object *argv[]);
130 static Scheme_Object *sch_fprintf(int argc, Scheme_Object *argv[]);
131 static Scheme_Object *banner(int argc, Scheme_Object *argv[]);
132 static Scheme_Object *env_p(int argc, Scheme_Object *argv[]);
133 static Scheme_Object *sch_getenv(int argc, Scheme_Object *argv[]);
134 static Scheme_Object *sch_getenv_names(int argc, Scheme_Object *argv[]);
135 static Scheme_Object *sch_putenv(int argc, Scheme_Object *argv[]);
136 static Scheme_Object *env_copy(int argc, Scheme_Object *argv[]);
137 static Scheme_Object *env_make(int argc, Scheme_Object *argv[]);
138 static Scheme_Object *current_environment_variables(int argc, Scheme_Object *argv[]);
139 static Scheme_Object *system_type(int argc, Scheme_Object *argv[]);
140 static Scheme_Object *system_library_subpath(int argc, Scheme_Object *argv[]);
141 static Scheme_Object *cmdline_args(int argc, Scheme_Object *argv[]);
142 static Scheme_Object *current_locale(int argc, Scheme_Object *argv[]);
143 static Scheme_Object *locale_string_encoding(int argc, Scheme_Object *argv[]);
144 static Scheme_Object *system_language_country(int argc, Scheme_Object *argv[]);
145 
146 static Scheme_Object *byte_string_open_converter(int argc, Scheme_Object *argv[]);
147 static Scheme_Object *byte_string_close_converter(int argc, Scheme_Object *argv[]);
148 static Scheme_Object *byte_string_convert(int argc, Scheme_Object *argv[]);
149 static Scheme_Object *byte_string_convert_end(int argc, Scheme_Object *argv[]);
150 static Scheme_Object *byte_converter_p(int argc, Scheme_Object *argv[]);
151 
152 static Scheme_Object *path_lt (int argc, Scheme_Object *argv[]);
153 
154 #ifdef MZ_PRECISE_GC
155 static void register_traversers(void);
156 #endif
157 
158 static int mz_char_strcmp(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2, int locale, int size_shortcut);
159 static int mz_char_strcmp_ci(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2, int locale, int size_shortcut);
160 static int mz_strcmp(const char *who, unsigned char *str1, intptr_t l1, unsigned char *str2, intptr_t l2);
161 
162 XFORM_NONGCING static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t end,
163                                              unsigned int *us, intptr_t dstart, intptr_t dend,
164                                              intptr_t *ipos, intptr_t *jpos,
165                                              char compact, char utf16,
166                                              int *state, int might_continue, int permissive, int wtf);
167 XFORM_NONGCING static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t end,
168                                              unsigned char *s, intptr_t dstart, intptr_t dend,
169                                              intptr_t *_ipos, intptr_t *_opos, char utf16, int wtf);
170 
171 static char *string_to_from_locale(int to_bytes,
172 				   char *in, intptr_t delta, intptr_t len,
173 				   intptr_t *olen, int perm,
174 				   int *no_cvt);
175 
176 static void cache_locale_or_close(int to_bytes, rktio_converter_t *cd, char *le);
177 
178 #define portable_isspace(x) (((x) < 128) && isspace(x))
179 
180 ROSYM static Scheme_Object *sys_symbol, *sys_os_symbol, *sys_arch_symbol;
181 ROSYM static Scheme_Object *link_symbol, *machine_symbol, *vm_symbol, *gc_symbol;
182 ROSYM static Scheme_Object *so_suffix_symbol, *so_mode_symbol, *word_symbol;
183 ROSYM static Scheme_Object *os_symbol, *os_star_symbol, *arch_symbol;
184 ROSYM static Scheme_Object *fs_change_symbol, *target_machine_symbol, *cross_symbol;
185 ROSYM static Scheme_Object *racket_symbol, *cgc_symbol, *_3m_symbol, *cs_symbol;
186 ROSYM static Scheme_Object *force_symbol, *infer_symbol;
187 ROSYM static Scheme_Object *platform_3m_path, *platform_cgc_path, *platform_cs_path;
188 READ_ONLY static Scheme_Object *zero_length_char_string;
189 READ_ONLY static Scheme_Object *zero_length_char_immutable_string;
190 READ_ONLY static Scheme_Object *zero_length_byte_string;
191 
192 SHARED_OK static char *embedding_banner;
193 SHARED_OK static Scheme_Object *vers_str;
194 SHARED_OK static Scheme_Object *banner_str;
195 
196 THREAD_LOCAL_DECL(static Scheme_Object *fs_change_props);
197 
198 THREAD_LOCAL_DECL(static char *cached_locale_encoding_name);
199 THREAD_LOCAL_DECL(struct rktio_converter_t *cached_locale_to_converter);
200 THREAD_LOCAL_DECL(struct rktio_converter_t *cached_locale_from_converter);
201 
202 READ_ONLY static Scheme_Object *complete_symbol, *continues_symbol, *aborts_symbol, *error_symbol;
203 
204 READ_ONLY Scheme_Object *scheme_string_p_proc;
205 READ_ONLY Scheme_Object *scheme_byte_string_p_proc;
206 
207 READ_ONLY static int cross_compile_mode;
208 
209 /* These two locale variables are only valid when reset_locale()
210    is called after continuation marks (and hence parameterization)
211    may have changed. Similarly, setlocale() is only up-to-date
212    when reset_locale() has been called. */
213 THREAD_LOCAL_DECL(static int locale_on);
214 THREAD_LOCAL_DECL(static void *current_locale_name_ptr);
215 static void reset_locale(void);
216 
217 #define current_locale_name ((const mzchar *)current_locale_name_ptr)
218 
219 static const mzchar empty_char_string[1] = { 0 };
220 static const mzchar xes_char_string[2] = { 0x78787878, 0 };
221 
222 void
scheme_init_string(Scheme_Startup_Env * env)223 scheme_init_string (Scheme_Startup_Env *env)
224 {
225   Scheme_Object *p;
226 
227   REGISTER_SO(sys_symbol);
228   REGISTER_SO(sys_os_symbol);
229   REGISTER_SO(sys_arch_symbol);
230   sys_symbol = scheme_intern_symbol(SYSTEM_TYPE_NAME);
231   sys_os_symbol = scheme_intern_symbol(SCHEME_OS);
232   sys_arch_symbol = scheme_intern_symbol(SCHEME_ARCH);
233 
234   REGISTER_SO(link_symbol);
235   REGISTER_SO(machine_symbol);
236   REGISTER_SO(gc_symbol);
237   REGISTER_SO(vm_symbol);
238   REGISTER_SO(so_suffix_symbol);
239   REGISTER_SO(so_mode_symbol);
240   REGISTER_SO(word_symbol);
241   REGISTER_SO(os_symbol);
242   REGISTER_SO(os_star_symbol);
243   REGISTER_SO(arch_symbol);
244   REGISTER_SO(fs_change_symbol);
245   REGISTER_SO(target_machine_symbol);
246   REGISTER_SO(cross_symbol);
247   link_symbol = scheme_intern_symbol("link");
248   machine_symbol = scheme_intern_symbol("machine");
249   vm_symbol = scheme_intern_symbol("vm");
250   gc_symbol = scheme_intern_symbol("gc");
251   so_suffix_symbol = scheme_intern_symbol("so-suffix");
252   so_mode_symbol = scheme_intern_symbol("so-mode");
253   word_symbol = scheme_intern_symbol("word");
254   os_symbol = scheme_intern_symbol("os");
255   os_star_symbol = scheme_intern_symbol("os*");
256   arch_symbol = scheme_intern_symbol("arch");
257   fs_change_symbol = scheme_intern_symbol("fs-change");
258   target_machine_symbol = scheme_intern_symbol("target-machine");
259   cross_symbol = scheme_intern_symbol("cross");
260 
261   REGISTER_SO(racket_symbol);
262   REGISTER_SO(cgc_symbol);
263   REGISTER_SO(_3m_symbol);
264   REGISTER_SO(cs_symbol);
265   racket_symbol = scheme_intern_symbol("racket");
266   cgc_symbol = scheme_intern_symbol("cgc");
267   _3m_symbol = scheme_intern_symbol("3m");
268   cs_symbol = scheme_intern_symbol("cs");
269 
270   REGISTER_SO(force_symbol);
271   REGISTER_SO(infer_symbol);
272   force_symbol = scheme_intern_symbol("force");
273   infer_symbol = scheme_intern_symbol("infer");
274 
275   REGISTER_SO(zero_length_char_string);
276   REGISTER_SO(zero_length_char_immutable_string);
277   REGISTER_SO(zero_length_byte_string);
278   zero_length_char_string = scheme_alloc_char_string(0, 0);
279   zero_length_char_immutable_string = scheme_alloc_char_string(0, 0);
280   SCHEME_SET_CHAR_STRING_IMMUTABLE(zero_length_char_immutable_string);
281   zero_length_byte_string = scheme_alloc_byte_string(0, 0);
282 
283   REGISTER_SO(complete_symbol);
284   REGISTER_SO(continues_symbol);
285   REGISTER_SO(aborts_symbol);
286   REGISTER_SO(error_symbol);
287   complete_symbol = scheme_intern_symbol("complete");
288   continues_symbol = scheme_intern_symbol("continues");
289   aborts_symbol = scheme_intern_symbol("aborts");
290   error_symbol = scheme_intern_symbol("error");
291 
292   REGISTER_SO(platform_3m_path);
293 # ifdef DOS_FILE_SYSTEM
294 #  define MZ3M_SUBDIR "\\3m"
295 #  define MZCS_SUBDIR "\\cs"
296 # else
297 #  define MZ3M_SUBDIR "/3m"
298 #  define MZCS_SUBDIR "/cs"
299 #endif
300   REGISTER_SO(platform_3m_path);
301   REGISTER_SO(platform_cgc_path);
302   REGISTER_SO(platform_cs_path);
303   platform_cgc_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH SPLS_SUFFIX);
304   platform_3m_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH SPLS_SUFFIX MZ3M_SUBDIR);
305   platform_cs_path = scheme_make_path(SCHEME_PLATFORM_LIBRARY_SUBPATH SPLS_SUFFIX MZCS_SUBDIR);
306 
307   REGISTER_SO(embedding_banner);
308   REGISTER_SO(vers_str);
309   REGISTER_SO(banner_str);
310 
311   vers_str = scheme_make_utf8_string(scheme_version());
312   SCHEME_SET_CHAR_STRING_IMMUTABLE(vers_str);
313   banner_str = scheme_make_utf8_string(scheme_banner());
314   SCHEME_SET_CHAR_STRING_IMMUTABLE(banner_str);
315 
316   REGISTER_SO(scheme_string_p_proc);
317   p = scheme_make_folding_prim(string_p, "string?", 1, 1, 1);
318   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_UNARY_INLINED
319                                                             | SCHEME_PRIM_IS_OMITABLE
320                                                             | SCHEME_PRIM_PRODUCES_BOOL);
321   scheme_addto_prim_instance("string?", p, env);
322   scheme_string_p_proc = p;
323 
324   scheme_addto_prim_instance("make-string",
325 			     scheme_make_immed_prim(make_string,
326 						    "make-string",
327 						    1, 2),
328 			     env);
329   scheme_addto_prim_instance("string",
330 			     scheme_make_immed_prim(string,
331 						    "string",
332 						    0, -1),
333 			     env);
334 
335   p = scheme_make_folding_prim(string_length, "string-length", 1, 1, 1);
336   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_UNARY_INLINED
337                                                             |SCHEME_PRIM_PRODUCES_FIXNUM
338                                                             | SCHEME_PRIM_AD_HOC_OPT);
339   scheme_addto_prim_instance("string-length", p,
340 			     env);
341 
342   p = scheme_make_immed_prim(scheme_checked_string_ref, "string-ref", 2, 2);
343   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_BINARY_INLINED
344                                                             | SCHEME_PRIM_AD_HOC_OPT);
345   scheme_addto_prim_instance("string-ref", p, env);
346 
347   p = scheme_make_immed_prim(scheme_checked_string_set, "string-set!", 3, 3);
348   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_NARY_INLINED
349                                                             | SCHEME_PRIM_AD_HOC_OPT);
350   scheme_addto_prim_instance("string-set!", p, env);
351 
352   p = scheme_make_immed_prim(string_eq, "string=?", 1, -1);
353   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_BINARY_INLINED
354                                                             | SCHEME_PRIM_PRODUCES_BOOL
355                                                             | SCHEME_PRIM_PRODUCES_BOOL);
356   scheme_addto_prim_instance("string=?", p, env);
357 
358   scheme_addto_prim_instance("string-locale=?",
359 			     scheme_make_immed_prim(string_locale_eq,
360 						    "string-locale=?",
361 						    1, -1),
362 			     env);
363   scheme_addto_prim_instance("string-ci=?",
364 			     scheme_make_immed_prim(string_ci_eq,
365 						    "string-ci=?",
366 						    1, -1),
367 			     env);
368   scheme_addto_prim_instance("string-locale-ci=?",
369 			     scheme_make_immed_prim(string_locale_ci_eq,
370 						    "string-locale-ci=?",
371 						    1, -1),
372 			     env);
373   scheme_addto_prim_instance("string<?",
374 			     scheme_make_immed_prim(string_lt,
375 						    "string<?",
376 						    1, -1),
377 			     env);
378   scheme_addto_prim_instance("string-locale<?",
379 			     scheme_make_immed_prim(string_locale_lt,
380 						    "string-locale<?",
381 						    1, -1),
382 			     env);
383   scheme_addto_prim_instance("string>?",
384 			     scheme_make_immed_prim(string_gt,
385 						    "string>?",
386 						    1, -1),
387 			     env);
388   scheme_addto_prim_instance("string-locale>?",
389 			     scheme_make_immed_prim(string_locale_gt,
390 						    "string-locale>?",
391 						    1, -1),
392 			     env);
393   scheme_addto_prim_instance("string<=?",
394 			     scheme_make_immed_prim(string_lt_eq,
395 						    "string<=?",
396 						    1, -1),
397 			     env);
398   scheme_addto_prim_instance("string>=?",
399 			     scheme_make_immed_prim(string_gt_eq,
400 						    "string>=?",
401 						    1, -1),
402 			     env);
403   scheme_addto_prim_instance("string-ci<?",
404 			     scheme_make_immed_prim(string_ci_lt,
405 						    "string-ci<?",
406 						    1, -1),
407 			     env);
408   scheme_addto_prim_instance("string-locale-ci<?",
409 			     scheme_make_immed_prim(string_locale_ci_lt,
410 						    "string-locale-ci<?",
411 						    1, -1),
412 			     env);
413   scheme_addto_prim_instance("string-ci>?",
414 			     scheme_make_immed_prim(string_ci_gt,
415 						    "string-ci>?",
416 						    1, -1),
417 			     env);
418   scheme_addto_prim_instance("string-locale-ci>?",
419 			     scheme_make_immed_prim(string_locale_ci_gt,
420 						    "string-locale-ci>?",
421 						    1, -1),
422 			     env);
423   scheme_addto_prim_instance("string-ci<=?",
424 			     scheme_make_immed_prim(string_ci_lt_eq,
425 						    "string-ci<=?",
426 						    1, -1),
427 			     env);
428   scheme_addto_prim_instance("string-ci>=?",
429 			     scheme_make_immed_prim(string_ci_gt_eq,
430 						    "string-ci>=?",
431 						    1, -1),
432 			     env);
433 
434   scheme_addto_prim_instance("substring",
435 			     scheme_make_immed_prim(substring,
436 						    "substring",
437 						    2, 3),
438 			     env);
439 
440   p = scheme_make_immed_prim(string_append, "string-append", 0, -1);
441   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_AD_HOC_OPT);
442   scheme_addto_prim_instance("string-append", p, env);
443 
444   p = scheme_make_immed_prim(string_append_immutable, "string-append-immutable", 0, -1);
445   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_AD_HOC_OPT);
446   scheme_addto_prim_instance("string-append-immutable", p, env);
447 
448   scheme_addto_prim_instance("string->list",
449 			     scheme_make_immed_prim(string_to_list,
450 						    "string->list",
451 						    1, 1),
452 			     env);
453   scheme_addto_prim_instance("list->string",
454 			     scheme_make_immed_prim(list_to_string,
455 						    "list->string",
456 						    1, 1),
457 			     env);
458   scheme_addto_prim_instance("string-copy",
459 			     scheme_make_immed_prim(string_copy,
460 						    "string-copy",
461 						    1, 1),
462 			     env);
463   scheme_addto_prim_instance("string-copy!",
464 			     scheme_make_immed_prim(string_copy_bang,
465 						    "string-copy!",
466 						    3, 5),
467 			     env);
468   scheme_addto_prim_instance("string-fill!",
469 			     scheme_make_immed_prim(string_fill,
470 						    "string-fill!",
471 						    2, 2),
472 			     env);
473 
474   p = scheme_make_immed_prim(string_to_immutable, "string->immutable-string", 1, 1);
475   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_AD_HOC_OPT);
476   scheme_addto_prim_instance("string->immutable-string", p, env);
477 
478   scheme_addto_prim_instance("string-normalize-nfc",
479 			     scheme_make_immed_prim(string_normalize_c,
480 						    "string-normalize-nfc",
481 						    1, 1),
482 			     env);
483   scheme_addto_prim_instance("string-normalize-nfkc",
484 			     scheme_make_immed_prim(string_normalize_kc,
485 						    "string-normalize-nfkc",
486 						    1, 1),
487 			     env);
488   scheme_addto_prim_instance("string-normalize-nfd",
489 			     scheme_make_immed_prim(string_normalize_d,
490 						    "string-normalize-nfd",
491 						    1, 1),
492 			     env);
493   scheme_addto_prim_instance("string-normalize-nfkd",
494 			     scheme_make_immed_prim(string_normalize_kd,
495 						    "string-normalize-nfkd",
496 						    1, 1),
497 			     env);
498 
499   scheme_addto_prim_instance("string-upcase",
500 			     scheme_make_immed_prim(string_upcase,
501 						    "string-upcase",
502 						    1, 1),
503 			     env);
504   scheme_addto_prim_instance("string-downcase",
505 			     scheme_make_immed_prim(string_downcase,
506 						    "string-downcase",
507 						    1, 1),
508 			     env);
509   scheme_addto_prim_instance("string-titlecase",
510 			     scheme_make_immed_prim(string_titlecase,
511 						    "string-titlecase",
512 						    1, 1),
513 			     env);
514   scheme_addto_prim_instance("string-foldcase",
515 			     scheme_make_immed_prim(string_foldcase,
516 						    "string-foldcase",
517 						    1, 1),
518 			     env);
519 
520   scheme_addto_prim_instance("string-locale-upcase",
521 			     scheme_make_immed_prim(string_locale_upcase,
522 						    "string-locale-upcase",
523 						    1, 1),
524 			     env);
525   scheme_addto_prim_instance("string-locale-downcase",
526 			     scheme_make_immed_prim(string_locale_downcase,
527 						    "string-locale-downcase",
528 						    1, 1),
529 			     env);
530 
531   scheme_addto_prim_instance("current-locale",
532 			     scheme_register_parameter(current_locale,
533 						       "current-locale",
534 						       MZCONFIG_LOCALE),
535 			     env);
536   scheme_addto_prim_instance("locale-string-encoding",
537 			     scheme_make_immed_prim(locale_string_encoding,
538 						    "locale-string-encoding",
539 						    0, 0),
540 			     env);
541   scheme_addto_prim_instance("system-language+country",
542 			     scheme_make_immed_prim(system_language_country,
543 						    "system-language+country",
544 						    0, 0),
545 			     env);
546 
547   scheme_addto_prim_instance("bytes-converter?",
548 			     scheme_make_immed_prim(byte_converter_p,
549 						    "bytes-converter?",
550 						    1, 1),
551 			     env);
552   scheme_addto_prim_instance("bytes-convert",
553 			     scheme_make_prim_w_arity2(byte_string_convert,
554 						       "bytes-convert",
555 						       2, 7,
556 						       3, 3),
557 			     env);
558   scheme_addto_prim_instance("bytes-convert-end",
559 			     scheme_make_prim_w_arity2(byte_string_convert_end,
560 						       "bytes-convert-end",
561 						       1, 4,
562 						       2, 2),
563 			     env);
564   scheme_addto_prim_instance("bytes-open-converter",
565 			     scheme_make_immed_prim(byte_string_open_converter,
566 						    "bytes-open-converter",
567 						    2, 2),
568 			     env);
569   scheme_addto_prim_instance("bytes-close-converter",
570 			     scheme_make_immed_prim(byte_string_close_converter,
571 						    "bytes-close-converter",
572 						    1, 1),
573 			     env);
574 
575   scheme_addto_prim_instance("format",
576 			     scheme_make_noncm_prim(format,
577                                                     "format",
578                                                     1, -1),
579 			     env);
580   scheme_addto_prim_instance("printf",
581 			     scheme_make_noncm_prim(sch_printf,
582                                                     "printf",
583                                                     1, -1),
584 			     env);
585   scheme_addto_prim_instance("eprintf",
586 			     scheme_make_noncm_prim(sch_eprintf,
587                                                     "eprintf",
588                                                     1, -1),
589 			     env);
590   scheme_addto_prim_instance("fprintf",
591 			     scheme_make_noncm_prim(sch_fprintf,
592                                                     "fprintf",
593                                                     2, -1),
594 			     env);
595 
596   scheme_addto_prim_instance("byte?",
597 			     scheme_make_folding_prim(byte_p,
598 						      "byte?",
599 						      1, 1, 1),
600 			     env);
601 
602   REGISTER_SO(scheme_byte_string_p_proc);
603   p = scheme_make_folding_prim(byte_string_p, "bytes?", 1, 1, 1);
604   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_UNARY_INLINED
605                                                             | SCHEME_PRIM_IS_OMITABLE
606                                                             | SCHEME_PRIM_PRODUCES_BOOL);
607   scheme_addto_prim_instance("bytes?", p, env);
608   scheme_byte_string_p_proc = p;
609 
610   scheme_addto_prim_instance("make-bytes",
611 			     scheme_make_immed_prim(make_byte_string,
612 						    "make-bytes",
613 						    1, 2),
614 			     env);
615   scheme_addto_prim_instance("bytes",
616 			     scheme_make_immed_prim(byte_string,
617 						    "bytes",
618 						    0, -1),
619 			     env);
620 
621   ADD_PRIM_W_ARITY("make-shared-bytes", make_shared_byte_string, 1, 2, env);
622   ADD_PRIM_W_ARITY("shared-bytes", shared_byte_string, 0, -1, env);
623 
624   p = scheme_make_folding_prim(byte_string_length, "bytes-length", 1, 1, 1);
625   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_UNARY_INLINED
626                                                             | SCHEME_PRIM_PRODUCES_FIXNUM
627                                                             | SCHEME_PRIM_AD_HOC_OPT);
628   scheme_addto_prim_instance("bytes-length", p, env);
629 
630   p = scheme_make_immed_prim(scheme_checked_byte_string_ref, "bytes-ref", 2, 2);
631   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_BINARY_INLINED
632                                                             | SCHEME_PRIM_PRODUCES_FIXNUM
633                                                             | SCHEME_PRIM_AD_HOC_OPT);
634   scheme_addto_prim_instance("bytes-ref", p, env);
635 
636   p = scheme_make_immed_prim(scheme_checked_byte_string_set, "bytes-set!", 3, 3);
637   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_NARY_INLINED
638                                                             | SCHEME_PRIM_AD_HOC_OPT);
639   scheme_addto_prim_instance("bytes-set!", p, env);
640 
641   p = scheme_make_immed_prim(byte_string_eq, "bytes=?", 1, -1);
642   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_IS_BINARY_INLINED
643                                                             | SCHEME_PRIM_PRODUCES_BOOL);
644   scheme_addto_prim_instance("bytes=?", p, env);
645 
646   scheme_addto_prim_instance("bytes<?",
647 			     scheme_make_immed_prim(byte_string_lt,
648 						    "bytes<?",
649 						    1, -1),
650 			     env);
651   scheme_addto_prim_instance("bytes>?",
652 			     scheme_make_immed_prim(byte_string_gt,
653 						    "bytes>?",
654 						    1, -1),
655 			     env);
656 
657   scheme_addto_prim_instance("subbytes",
658 			     scheme_make_immed_prim(byte_substring,
659 						    "subbytes",
660 						    2, 3),
661 			     env);
662 
663   p = scheme_make_immed_prim(byte_string_append, "bytes-append", 0, -1);
664   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_AD_HOC_OPT);
665   scheme_addto_prim_instance("bytes-append", p, env);
666 
667   scheme_addto_prim_instance("bytes->list",
668 			     scheme_make_immed_prim(byte_string_to_list,
669 						    "bytes->list",
670 						    1, 1),
671 			     env);
672   scheme_addto_prim_instance("list->bytes",
673 			     scheme_make_immed_prim(list_to_byte_string,
674 						    "list->bytes",
675 						    1, 1),
676 			     env);
677   scheme_addto_prim_instance("bytes-copy",
678 			     scheme_make_immed_prim(byte_string_copy,
679 						    "bytes-copy",
680 						    1, 1),
681 			     env);
682   scheme_addto_prim_instance("bytes-copy!",
683 			     scheme_make_immed_prim(byte_string_copy_bang,
684 						    "bytes-copy!",
685 						    3, 5),
686 			     env);
687   scheme_addto_prim_instance("bytes-fill!",
688 			     scheme_make_immed_prim(byte_string_fill,
689 						    "bytes-fill!",
690 						    2, 2),
691 			     env);
692 
693   p = scheme_make_immed_prim(byte_string_to_immutable, "bytes->immutable-bytes", 1, 1);
694   SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_AD_HOC_OPT);
695   scheme_addto_prim_instance("bytes->immutable-bytes", p, env);
696 
697   p = scheme_make_immed_prim(byte_string_utf8_index, "bytes-utf-8-index", 2, 5);
698   /* Incorrect, since the result can be #f:
699      SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_PRODUCES_FIXNUM); */
700   scheme_addto_prim_instance("bytes-utf-8-index", p, env);
701 
702   p = scheme_make_immed_prim(byte_string_utf8_length, "bytes-utf-8-length", 1, 4);
703   /* Incorrect, since the result can be #f:
704      SCHEME_PRIM_PROC_FLAGS(p) |= scheme_intern_prim_opt_flags(SCHEME_PRIM_PRODUCES_FIXNUM); */
705   scheme_addto_prim_instance("bytes-utf-8-length", p, env);
706 
707   scheme_addto_prim_instance("bytes-utf-8-ref",
708 			     scheme_make_immed_prim(byte_string_utf8_ref,
709 						    "bytes-utf-8-ref",
710 						    2, 5),
711 			     env);
712 
713   scheme_addto_prim_instance("bytes->string/utf-8",
714 			     scheme_make_immed_prim(byte_string_to_char_string,
715 						    "bytes->string/utf-8",
716 						    1, 4),
717 			     env);
718   scheme_addto_prim_instance("bytes->string/locale",
719 			     scheme_make_immed_prim(byte_string_to_char_string_locale,
720 						    "bytes->string/locale",
721 						    1, 4),
722 			     env);
723   scheme_addto_prim_instance("bytes->string/latin-1",
724 			     scheme_make_immed_prim(byte_string_to_char_string_latin1,
725 						    "bytes->string/latin-1",
726 						    1, 4),
727 			     env);
728   scheme_addto_prim_instance("string->bytes/utf-8",
729 			     scheme_make_immed_prim(char_string_to_byte_string,
730 						    "string->bytes/utf-8",
731 						    1, 4),
732 			     env);
733   scheme_addto_prim_instance("string->bytes/locale",
734 			     scheme_make_immed_prim(char_string_to_byte_string_locale,
735 						    "string->bytes/locale",
736 						    1, 4),
737 			     env);
738   scheme_addto_prim_instance("string->bytes/latin-1",
739 			     scheme_make_immed_prim(char_string_to_byte_string_latin1,
740 						    "string->bytes/latin-1",
741 						    1, 4),
742 			     env);
743 
744   scheme_addto_prim_instance("string-utf-8-length",
745 			     scheme_make_immed_prim(char_string_utf8_length,
746 						    "string-utf-8-length",
747 						    1, 3),
748 			     env);
749 
750 
751   /* In principle, `version' could be foldable, but it invites
752      more problems than it solves... */
753 
754   scheme_addto_prim_instance("version",
755 			     scheme_make_immed_prim(version,
756 						    "version",
757 						    0, 0),
758 			     env);
759   scheme_addto_prim_instance("banner",
760 			     scheme_make_immed_prim(banner,
761 						    "banner",
762 						    0, 0),
763 			     env);
764 
765   /* Environment variables */
766 
767   scheme_addto_prim_instance("environment-variables?",
768 			     scheme_make_folding_prim(env_p,
769                                                       "environment-variables?",
770                                                       1, 1, 1),
771 			     env);
772 
773   scheme_addto_prim_instance("current-environment-variables",
774 			     scheme_register_parameter(current_environment_variables,
775 						       "current-environment-variables",
776 						       MZCONFIG_CURRENT_ENV_VARS),
777 			     env);
778 
779   scheme_addto_prim_instance("environment-variables-ref",
780 			     scheme_make_immed_prim(sch_getenv,
781 						    "environment-variables-ref",
782 						    2, 2),
783 			     env);
784 
785   scheme_addto_prim_instance("environment-variables-set!",
786 			     scheme_make_prim_w_arity(sch_putenv,
787                                                       "environment-variables-set!",
788                                                       3, 4),
789 			     env);
790 
791   scheme_addto_prim_instance("environment-variables-names",
792 			     scheme_make_immed_prim(sch_getenv_names,
793 						    "environment-variables-names",
794 						    1, 1),
795 			     env);
796 
797   scheme_addto_prim_instance("environment-variables-copy",
798 			     scheme_make_immed_prim(env_copy,
799 						    "environment-variables-copy",
800 						    1, 1),
801 			     env);
802 
803   scheme_addto_prim_instance("make-environment-variables",
804 			     scheme_make_immed_prim(env_make,
805 						    "make-environment-variables",
806 						    0, -1),
807 			     env);
808 
809   /* Don't make these folding, since they're platform-specific: */
810 
811   REGISTER_SO(scheme_system_type_proc);
812   scheme_system_type_proc = scheme_make_immed_prim(system_type,
813                                                    "system-type",
814                                                    0, 1);
815   scheme_addto_prim_instance("system-type", scheme_system_type_proc, env);
816 
817   scheme_addto_prim_instance("system-library-subpath",
818 			     scheme_make_immed_prim(system_library_subpath,
819 						    "system-library-subpath",
820 						    0, 1),
821 			     env);
822 
823   scheme_addto_prim_instance("current-command-line-arguments",
824 			     scheme_register_parameter(cmdline_args,
825 						       "current-command-line-arguments",
826 						       MZCONFIG_CMDLINE_ARGS),
827 			     env);
828 
829 
830   scheme_addto_prim_instance("path<?",
831 			     scheme_make_immed_prim(path_lt,
832 						    "path<?",
833 						    1, -1),
834 			     env);
835 
836 #ifdef MZ_PRECISE_GC
837   register_traversers();
838 #endif
839 }
840 
scheme_init_string_places(void)841 void scheme_init_string_places(void) {
842   REGISTER_SO(current_locale_name_ptr);
843   current_locale_name_ptr = (void *)xes_char_string;
844 
845   REGISTER_SO(fs_change_props);
846   {
847     int supported, scalable, low_latency, file_level;
848     Scheme_Object *s;
849     scheme_fs_change_properties(&supported, &scalable, &low_latency, &file_level);
850     fs_change_props = scheme_make_vector(4, scheme_false);
851     if (supported) {
852       s = scheme_intern_symbol("supported");
853       SCHEME_VEC_ELS(fs_change_props)[0] = s;
854     }
855     if (scalable) {
856       s = scheme_intern_symbol("scalable");
857       SCHEME_VEC_ELS(fs_change_props)[1] = s;
858     }
859     if (low_latency) {
860       s = scheme_intern_symbol("low-latency");
861       SCHEME_VEC_ELS(fs_change_props)[2] = s;
862     }
863     if (file_level) {
864       s = scheme_intern_symbol("file-level");
865       SCHEME_VEC_ELS(fs_change_props)[3] = s;
866     }
867     SCHEME_SET_IMMUTABLE(fs_change_props);
868   }
869 }
870 
871 /**********************************************************************/
872 /*                     UTF-8 char constructors                        */
873 /**********************************************************************/
874 
scheme_make_sized_offset_utf8_string(char * chars,intptr_t d,intptr_t len)875 Scheme_Object *scheme_make_sized_offset_utf8_string(char *chars, intptr_t d, intptr_t len)
876 {
877   intptr_t ulen;
878   mzchar *us;
879 
880   if (len) {
881     ulen = scheme_utf8_decode((unsigned char *)chars, d, d + len,
882 			      NULL, 0, -1,
883 			      NULL, 0 /* not UTF-16 */, 0xFFFD);
884     us = scheme_malloc_atomic(sizeof(mzchar) * (ulen + 1));
885     scheme_utf8_decode((unsigned char *)chars, d, d + len,
886                        us, 0, -1,
887                        NULL, 0 /* not UTF-16 */, 0xFFFD);
888     us[ulen] = 0;
889   } else {
890     us = (mzchar *)empty_char_string;
891     ulen = 0;
892   }
893   return scheme_make_sized_offset_char_string(us, 0, ulen, 0);
894 }
895 
896 Scheme_Object *
scheme_make_sized_utf8_string(char * chars,intptr_t len)897 scheme_make_sized_utf8_string(char *chars, intptr_t len)
898 {
899   return scheme_make_sized_offset_utf8_string(chars, 0, len);
900 }
901 
902 Scheme_Object *
scheme_make_immutable_sized_utf8_string(char * chars,intptr_t len)903 scheme_make_immutable_sized_utf8_string(char *chars, intptr_t len)
904 {
905   Scheme_Object *s;
906 
907   s = scheme_make_sized_offset_utf8_string(chars, 0, len);
908   if (len)
909     SCHEME_SET_CHAR_STRING_IMMUTABLE(s);
910 
911   return s;
912 }
913 
914 Scheme_Object *
scheme_make_utf8_string(const char * chars)915 scheme_make_utf8_string(const char *chars)
916 {
917   return scheme_make_sized_offset_utf8_string((char *)chars, 0, -1);
918 }
919 
920 Scheme_Object *
scheme_make_locale_string(const char * chars)921 scheme_make_locale_string(const char *chars)
922 {
923   return scheme_byte_string_to_char_string_locale(scheme_make_byte_string((char *)chars));
924 }
925 
scheme_append_strings(Scheme_Object * s1,Scheme_Object * s2)926 Scheme_Object *scheme_append_strings(Scheme_Object *s1, Scheme_Object *s2)
927 {
928   Scheme_Object *a[2];
929   a[0] = s1;
930   a[1] = s2;
931   return string_append(2, a);
932 }
933 
934 /**********************************************************************/
935 /*                         index helpers                              */
936 /**********************************************************************/
937 
scheme_extract_index(const char * name,int pos,int argc,Scheme_Object ** argv,intptr_t top,int false_ok)938 intptr_t scheme_extract_index(const char *name, int pos, int argc, Scheme_Object **argv, intptr_t top, int false_ok)
939 {
940   intptr_t i;
941   int is_top = 0;
942 
943   if (SCHEME_INTP(argv[pos])) {
944     i = SCHEME_INT_VAL(argv[pos]);
945   } else if (SCHEME_BIGNUMP(argv[pos])) {
946     if (SCHEME_BIGPOS(argv[pos])) {
947       i = top; /* out-of-bounds */
948       is_top = 1;
949     } else
950       i = -1; /* negative */
951   } else
952     i = -1;
953 
954   if (!is_top && (i < 0))
955     scheme_wrong_contract(name,
956                           (false_ok ? "(or/c exact-nonnegative-integer? #f)" : "exact-nonnegative-integer?"),
957                           pos, argc, argv);
958 
959   return i;
960 }
961 
scheme_get_substring_indices(const char * name,Scheme_Object * str,int argc,Scheme_Object ** argv,int spos,int fpos,intptr_t * _start,intptr_t * _finish)962 void scheme_get_substring_indices(const char *name, Scheme_Object *str,
963                                   int argc, Scheme_Object **argv,
964                                   int spos, int fpos, intptr_t *_start, intptr_t *_finish)
965 {
966   intptr_t len;
967   intptr_t start, finish;
968 
969   if (SCHEME_CHAPERONE_VECTORP(str))
970     len = SCHEME_CHAPERONE_VEC_SIZE(str);
971   else if (SCHEME_CHAR_STRINGP(str))
972     len = SCHEME_CHAR_STRTAG_VAL(str);
973   else
974     len = SCHEME_BYTE_STRTAG_VAL(str);
975 
976   if (argc > spos)
977     start = scheme_extract_index(name, spos, argc, argv, len + 1, 0);
978   else
979     start = 0;
980   if (argc > fpos)
981     finish = scheme_extract_index(name, fpos, argc, argv, len + 1, 0);
982   else
983     finish = len;
984 
985   if (!(start <= len)) {
986     scheme_out_of_range(name, NULL, (fpos < 100) ? "starting " : "", argv[spos], str, 0, len);
987   }
988   if (!(finish >= start && finish <= len)) {
989     scheme_out_of_range(name, NULL, "ending ", argv[fpos], str, start, len);
990   }
991 
992   *_start = start;
993   *_finish = finish;
994 }
995 
scheme_do_get_substring_indices(const char * name,Scheme_Object * str,int argc,Scheme_Object ** argv,int spos,int fpos,intptr_t * _start,intptr_t * _finish,intptr_t len)996 void scheme_do_get_substring_indices(const char *name, Scheme_Object *str,
997                                      int argc, Scheme_Object **argv,
998                                      int spos, int fpos, intptr_t *_start, intptr_t *_finish, intptr_t len)
999 {
1000   if (argc > spos) {
1001     if (SCHEME_INTP(argv[spos])) {
1002       intptr_t start = SCHEME_INT_VAL(argv[spos]);
1003       if ((start >= 0) && (start < len)) {
1004         *_start = start;
1005         if (argc > fpos) {
1006           intptr_t finish = SCHEME_INT_VAL(argv[fpos]);
1007           if ((finish >= start) && (finish <= len)) {
1008             *_finish = finish;
1009             return;
1010           }
1011         } else {
1012           *_finish = len;
1013           return;
1014         }
1015       }
1016     }
1017   } else {
1018     *_start = 0;
1019     *_finish = len;
1020     return;
1021   }
1022 
1023   scheme_get_substring_indices(name, str, argc, argv, spos, fpos, _start, _finish);
1024 }
1025 
1026 /**********************************************************************/
1027 /*                          char strings                              */
1028 /**********************************************************************/
1029 
1030 #define SCHEME_X_STR_VAL(x) SCHEME_CHAR_STR_VAL(x)
1031 #define SCHEME_X_STRTAG_VAL(x) SCHEME_CHAR_STRTAG_VAL(x)
1032 #define SCHEME_X_STRINGP(x) SCHEME_CHAR_STRINGP(x)
1033 #define SCHEME_MUTABLE_X_STRINGP(x) SCHEME_MUTABLE_CHAR_STRINGP(x)
1034 #define SCHEME_SET_X_STRING_IMMUTABLE(x) SCHEME_SET_CHAR_STRING_IMMUTABLE(x)
1035 #define scheme_x_string_type scheme_char_string_type
1036 #define X(a, b) a##_char##b
1037 #define X_(a, b) a##_##b
1038 #define X__(a) a
1039 #define EMPTY (mzchar *)"\0\0\0"
1040 #define Xchar mzchar
1041 #define uXchar mzchar
1042 #define XSTR ""
1043 #define IS_STR "string?"
1044 #define XSTRINGSTR "string"
1045 #define SUBXSTR "substring"
1046 #define CHARP(x) SCHEME_CHARP(x)
1047 #define CHAR_VAL(x) SCHEME_CHAR_VAL(x)
1048 #define CHAR_STR "char?"
1049 #define MAKE_CHAR(x) _scheme_make_char(x)
1050 #define xstrlen scheme_char_strlen
1051 #include "strops.inc"
1052 
1053 #define GEN_STRING_COMP(name, scheme_name, comp, op, ul, size_shortcut)     \
1054 static Scheme_Object * name (int argc, Scheme_Object *argv[]) \
1055 {  mzchar *s, *prev; int i, sl, pl; int falz = 0;\
1056    if (!SCHEME_CHAR_STRINGP(argv[0])) \
1057     scheme_wrong_contract(scheme_name, "string?", 0, argc, argv); \
1058    prev = SCHEME_CHAR_STR_VAL(argv[0]); pl = SCHEME_CHAR_STRTAG_VAL(argv[0]); \
1059    for (i = 1; i < argc; i++) { \
1060      if (!SCHEME_CHAR_STRINGP(argv[i])) \
1061       scheme_wrong_contract(scheme_name, "string?", i, argc, argv); \
1062      s = SCHEME_CHAR_STR_VAL(argv[i]); sl = SCHEME_CHAR_STRTAG_VAL(argv[i]); \
1063      if (!falz) if (!(comp(scheme_name, \
1064                            prev, pl, \
1065                            s, sl, ul, size_shortcut) op 0)) falz = 1; \
1066      prev = s; pl = sl; \
1067   } \
1068   return falz ? scheme_false : scheme_true; \
1069 }
1070 
1071 GEN_STRING_COMP(string_eq, "string=?", mz_char_strcmp, ==, 0, 1)
1072 GEN_STRING_COMP(string_lt, "string<?", mz_char_strcmp, <, 0, 0)
1073 GEN_STRING_COMP(string_gt, "string>?", mz_char_strcmp, >, 0, 0)
1074 GEN_STRING_COMP(string_lt_eq, "string<=?", mz_char_strcmp, <=, 0, 0)
1075 GEN_STRING_COMP(string_gt_eq, "string>=?", mz_char_strcmp, >=, 0, 0)
1076 
1077 GEN_STRING_COMP(string_ci_eq, "string-ci=?", mz_char_strcmp_ci, ==, 0, 0)
1078 GEN_STRING_COMP(string_ci_lt, "string-ci<?", mz_char_strcmp_ci, <, 0, 0)
1079 GEN_STRING_COMP(string_ci_gt, "string-ci>?", mz_char_strcmp_ci, >, 0, 0)
1080 GEN_STRING_COMP(string_ci_lt_eq, "string-ci<=?", mz_char_strcmp_ci, <=, 0, 0)
1081 GEN_STRING_COMP(string_ci_gt_eq, "string-ci>=?", mz_char_strcmp_ci, >=, 0, 0)
1082 
1083 GEN_STRING_COMP(string_locale_eq, "string-locale=?", mz_char_strcmp, ==, 1, 0)
1084 GEN_STRING_COMP(string_locale_lt, "string-locale<?", mz_char_strcmp, <, 1, 0)
1085 GEN_STRING_COMP(string_locale_gt, "string-locale>?", mz_char_strcmp, >, 1, 0)
1086 GEN_STRING_COMP(string_locale_ci_eq, "string-locale-ci=?", mz_char_strcmp_ci, ==, 1, 0)
1087 GEN_STRING_COMP(string_locale_ci_lt, "string-locale-ci<?", mz_char_strcmp_ci, <, 1, 0)
1088 GEN_STRING_COMP(string_locale_ci_gt, "string-locale-ci>?", mz_char_strcmp_ci, >, 1, 0)
1089 
scheme_string_eq_2(Scheme_Object * str1,Scheme_Object * str2)1090 Scheme_Object *scheme_string_eq_2(Scheme_Object *str1, Scheme_Object *str2)
1091 {
1092   Scheme_Object *a[2];
1093   a[0] = str1;
1094   a[1] = str2;
1095   return string_eq(2, a);
1096 }
1097 
string_append_immutable(int argc,Scheme_Object * argv[])1098 Scheme_Object *string_append_immutable(int argc, Scheme_Object *argv[])
1099 {
1100   Scheme_Object *r;
1101 
1102   r = do_string_append("string-append-immutable", argc, argv);
1103 
1104   if (r == zero_length_char_string)
1105     return zero_length_char_immutable_string;
1106 
1107   SCHEME_SET_CHAR_STRING_IMMUTABLE(r);
1108 
1109   return r;
1110 }
1111 
1112 /**********************************************************************/
1113 /*                         byte strings                               */
1114 /**********************************************************************/
1115 
1116 #define SCHEME_BYTEP(x) ((SCHEME_INTP(x)) && (SCHEME_INT_VAL(x) >= 0) && (SCHEME_INT_VAL(x) <= 255))
1117 
1118 static Scheme_Object *
byte_p(int argc,Scheme_Object * argv[])1119 byte_p(int argc, Scheme_Object *argv[])
1120 {
1121   return (SCHEME_BYTEP(argv[0]) ? scheme_true : scheme_false);
1122 }
1123 
1124 #define SCHEME_X_STR_VAL(x) SCHEME_BYTE_STR_VAL(x)
1125 #define SCHEME_X_STRTAG_VAL(x) SCHEME_BYTE_STRTAG_VAL(x)
1126 #define SCHEME_X_STRINGP(x) SCHEME_BYTE_STRINGP(x)
1127 #define SCHEME_MUTABLE_X_STRINGP(x) SCHEME_MUTABLE_BYTE_STRINGP(x)
1128 #define SCHEME_SET_X_STRING_IMMUTABLE(x) SCHEME_SET_BYTE_STRING_IMMUTABLE(x)
1129 #define scheme_x_string_type scheme_byte_string_type
1130 #define X(a, b) a##_byte##b
1131 #define X_(a, b) a##_byte_##b
1132 #define X__(a) byte_##a
1133 #define EMPTY ""
1134 #define Xchar char
1135 #define uXchar unsigned char
1136 #define XSTR "byte "
1137 #define IS_STR "bytes?"
1138 #define XSTRINGSTR "bytes"
1139 #define SUBXSTR "subbytes"
1140 #define CHARP(x) SCHEME_BYTEP(x)
1141 #define CHAR_VAL(x) SCHEME_INT_VAL(x)
1142 #define CHAR_STR "byte?"
1143 #define MAKE_CHAR(x) scheme_make_integer_value(x)
1144 #define xstrlen strlen
1145 #define GENERATING_BYTE
1146 #include "strops.inc"
1147 #undef GENERATING_BYTE
1148 
1149 /* comparisons */
1150 
1151 #define GEN_BYTE_STRING_PATH_COMP(name, scheme_name, comp, op, PRED, contract)     \
1152 static Scheme_Object * name (int argc, Scheme_Object *argv[]) \
1153 {  char *s, *prev; int i, sl, pl; int falz = 0;\
1154   if (!PRED(argv[0])) \
1155     scheme_wrong_contract(scheme_name, contract, 0, argc, argv); \
1156    prev = SCHEME_BYTE_STR_VAL(argv[0]); pl = SCHEME_BYTE_STRTAG_VAL(argv[0]); \
1157    for (i = 1; i < argc; i++) { \
1158      if (!PRED(argv[i])) \
1159       scheme_wrong_contract(scheme_name, contract, i, argc, argv); \
1160      s = SCHEME_BYTE_STR_VAL(argv[i]); sl = SCHEME_BYTE_STRTAG_VAL(argv[i]); \
1161      if (!falz) if (!(comp(scheme_name, \
1162                            (unsigned char *)prev, pl, \
1163                            (unsigned char *)s, sl) op 0)) falz = 1; \
1164      prev = s; pl = sl; \
1165   } \
1166   return falz ? scheme_false : scheme_true; \
1167 }
1168 
1169 #define GEN_BYTE_STRING_COMP(name, scheme_name, comp, op) \
1170   GEN_BYTE_STRING_PATH_COMP(name, scheme_name, comp, op, SCHEME_BYTE_STRINGP, "bytes?") \
1171 
1172 GEN_BYTE_STRING_COMP(byte_string_eq, "bytes=?", mz_strcmp, ==)
1173 GEN_BYTE_STRING_COMP(byte_string_lt, "bytes<?", mz_strcmp, <)
1174 GEN_BYTE_STRING_COMP(byte_string_gt, "bytes>?", mz_strcmp, >)
1175 
1176 GEN_BYTE_STRING_PATH_COMP(path_lt, "path<?", mz_strcmp, <, SCHEME_PATHP, "path?")
1177 
scheme_byte_string_eq_2(Scheme_Object * str1,Scheme_Object * str2)1178 Scheme_Object *scheme_byte_string_eq_2(Scheme_Object *str1, Scheme_Object *str2)
1179 {
1180   Scheme_Object *a[2];
1181   a[0] = str1;
1182   a[1] = str2;
1183   return byte_string_eq(2, a);
1184 }
1185 
1186 /**********************************************************************/
1187 /*                   byte string <-> char string                      */
1188 /**********************************************************************/
1189 
1190 /************************* bytes->string *************************/
1191 
1192 static Scheme_Object *
do_byte_string_to_char_string(const char * who,Scheme_Object * bstr,intptr_t istart,intptr_t ifinish,int perm,int as_locale)1193 do_byte_string_to_char_string(const char *who,
1194 			      Scheme_Object *bstr,
1195 			      intptr_t istart, intptr_t ifinish,
1196 			      int perm, int as_locale)
1197 {
1198   int i, ulen;
1199   char *chars;
1200   unsigned int *v;
1201 
1202   chars = SCHEME_BYTE_STR_VAL(bstr);
1203 
1204   ulen = utf8_decode_x((unsigned char *)chars, istart, ifinish,
1205 		       NULL, 0, -1,
1206 		       NULL, NULL, 0, 0,
1207 		       NULL, 0,
1208 		       (perm > -1) ? 0xD800 : 0, 0);
1209   if (ulen < 0) {
1210     scheme_contract_error(who,
1211                           "string is not a well-formed UTF-8 encoding",
1212                           "string", 1, bstr,
1213                           NULL);
1214   }
1215 
1216   v = (unsigned int *)scheme_malloc_atomic((ulen + 1) * sizeof(unsigned int));
1217   utf8_decode_x((unsigned char *)chars, istart, ifinish,
1218 		v, 0, -1,
1219 		NULL, NULL, 0, 0,
1220 		NULL, 0,
1221 		(perm > -1) ? 0xD800 : 0, 0);
1222 
1223   if (perm > -1) {
1224     for (i = 0; i < ulen; i++) {
1225       if (v[i] == 0xD800)
1226 	v[i] = perm;
1227     }
1228   }
1229   v[ulen] = 0;
1230 
1231   return scheme_make_sized_char_string(v, ulen, 0);
1232 }
1233 
1234 static Scheme_Object *
do_byte_string_to_char_string_locale(const char * who,Scheme_Object * bstr,intptr_t istart,intptr_t ifinish,int perm)1235 do_byte_string_to_char_string_locale(const char *who,
1236 				     Scheme_Object *bstr,
1237 				     intptr_t istart, intptr_t ifinish,
1238 				     int perm)
1239 {
1240   char *us;
1241   intptr_t olen;
1242 
1243   reset_locale();
1244 
1245   if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on
1246       || !(rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED))
1247     return do_byte_string_to_char_string(who, bstr, istart, ifinish, perm, 1);
1248 
1249   if (istart < ifinish) {
1250     int no_cvt;
1251 
1252     us = string_to_from_locale(0, SCHEME_BYTE_STR_VAL(bstr),
1253 			       istart, ifinish - istart,
1254 			       &olen, perm, &no_cvt);
1255 
1256     if (!us) {
1257       if (no_cvt) {
1258 	return do_byte_string_to_char_string(who, bstr, istart, ifinish, perm, 1);
1259       } else {
1260 	scheme_contract_error(who,
1261                               "byte string is not a valid encoding for the current locale",
1262                               "byte string", 1, bstr,
1263                               NULL);
1264       }
1265     }
1266     ((mzchar *)us)[olen] = 0;
1267   } else {
1268     us = "\0\0\0";
1269     olen = 0;
1270   }
1271 
1272   return scheme_make_sized_char_string((mzchar *)us, olen, 0);
1273 }
1274 
1275 static Scheme_Object *
do_string_to_vector(const char * who,int mode,int argc,Scheme_Object * argv[])1276 do_string_to_vector(const char *who, int mode, int argc, Scheme_Object *argv[])
1277 {
1278   int permc;
1279   intptr_t istart, ifinish;
1280 
1281   if (!SCHEME_BYTE_STRINGP(argv[0]))
1282     scheme_wrong_contract(who, "bytes?", 0, argc, argv);
1283 
1284   if ((argc < 2) || SCHEME_FALSEP(argv[1]))
1285     permc = -1;
1286   else {
1287     if (!SCHEME_CHARP(argv[1]))
1288       scheme_wrong_contract(who, "(or/c char? #f)", 1, argc, argv);
1289     permc = SCHEME_CHAR_VAL(argv[1]);
1290   }
1291 
1292   scheme_get_substring_indices(who, argv[0], argc, argv,
1293 			       2, 3,
1294 			       &istart, &ifinish);
1295 
1296   if (mode == 0)
1297     return do_byte_string_to_char_string(who, argv[0], istart, ifinish, permc, 0);
1298   else if (mode == 1)
1299     return do_byte_string_to_char_string_locale(who, argv[0], istart, ifinish, permc);
1300   else {
1301     /* Latin-1 */
1302     mzchar *us;
1303     unsigned char *s;
1304     intptr_t i, len;
1305     len = ifinish - istart;
1306     s = (unsigned char *)SCHEME_BYTE_STR_VAL(argv[0]);
1307     us = (mzchar *)scheme_malloc_atomic((len + 1) * sizeof(mzchar));
1308     for (i = istart; i < ifinish; i++) {
1309       us[i - istart] = s[i];
1310     }
1311     us[len] = 0;
1312 
1313     return scheme_make_sized_char_string(us, len, 0);
1314   }
1315 }
1316 
1317 
1318 static Scheme_Object *
byte_string_to_char_string(int argc,Scheme_Object * argv[])1319 byte_string_to_char_string (int argc, Scheme_Object *argv[])
1320 {
1321   return do_string_to_vector("bytes->string/utf-8", 0, argc, argv);
1322 }
1323 
1324 static Scheme_Object *
byte_string_to_char_string_locale(int argc,Scheme_Object * argv[])1325 byte_string_to_char_string_locale (int argc, Scheme_Object *argv[])
1326 {
1327   return do_string_to_vector("bytes->string/locale", 1, argc, argv);
1328 }
1329 
1330 static Scheme_Object *
byte_string_to_char_string_latin1(int argc,Scheme_Object * argv[])1331 byte_string_to_char_string_latin1 (int argc, Scheme_Object *argv[])
1332 {
1333   return do_string_to_vector("bytes->string/latin-1", 2, argc, argv);
1334 }
1335 
scheme_byte_string_to_char_string(Scheme_Object * o)1336 Scheme_Object *scheme_byte_string_to_char_string(Scheme_Object *o)
1337 {
1338   return do_byte_string_to_char_string("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD, 0);
1339 }
1340 
scheme_byte_string_to_char_string_locale(Scheme_Object * o)1341 Scheme_Object *scheme_byte_string_to_char_string_locale(Scheme_Object *o)
1342 {
1343   return do_byte_string_to_char_string_locale("s->s", o, 0, SCHEME_BYTE_STRLEN_VAL(o), 0xFFFD);
1344 }
1345 
1346 /************************* string->bytes *************************/
1347 
do_char_string_to_byte_string(Scheme_Object * s,intptr_t istart,intptr_t ifinish,int as_locale)1348 static Scheme_Object *do_char_string_to_byte_string(Scheme_Object *s, intptr_t istart, intptr_t ifinish,
1349 						    int as_locale)
1350 {
1351   char *bs;
1352   int slen;
1353 
1354   slen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(s), istart, ifinish,
1355 			    NULL, 0,
1356 			    0 /* UTF-16 */);
1357   bs = (char *)scheme_malloc_atomic(slen + 1);
1358   scheme_utf8_encode(SCHEME_CHAR_STR_VAL(s), istart, ifinish,
1359 		     (unsigned char *)bs, 0,
1360 		     0 /* UTF-16 */);
1361   bs[slen] = 0;
1362 
1363   return scheme_make_sized_byte_string(bs, slen, 0);
1364 }
1365 
1366 static Scheme_Object *
do_char_string_to_byte_string_locale(const char * who,Scheme_Object * cstr,intptr_t istart,intptr_t ifinish,int perm)1367 do_char_string_to_byte_string_locale(const char *who,
1368 				     Scheme_Object *cstr,
1369 				     intptr_t istart, intptr_t ifinish,
1370 				     int perm)
1371 {
1372   char *s;
1373   intptr_t olen;
1374 
1375   reset_locale();
1376 
1377   if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on
1378       || !(rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED))
1379     return do_char_string_to_byte_string(cstr, istart, ifinish, 1);
1380 
1381   if (istart < ifinish) {
1382     int no_cvt;
1383 
1384     s = string_to_from_locale(1, (char *)SCHEME_CHAR_STR_VAL(cstr),
1385 			      istart, ifinish - istart,
1386 			      &olen, perm, &no_cvt);
1387 
1388     if (!s) {
1389       if (no_cvt) {
1390 	return do_char_string_to_byte_string(cstr, istart, ifinish, 1);
1391       } else {
1392 	scheme_contract_error(who,
1393                               "string cannot be encoded for the current locale",
1394                               "string", 1, cstr,
1395                               NULL);
1396       }
1397     }
1398     s[olen] = 0;
1399   } else {
1400     s = "";
1401     olen = 0;
1402   }
1403 
1404   return scheme_make_sized_byte_string(s, olen, 0);
1405 }
1406 
1407 
scheme_char_string_to_byte_string(Scheme_Object * s)1408 Scheme_Object *scheme_char_string_to_byte_string(Scheme_Object *s)
1409 {
1410   return do_char_string_to_byte_string(s, 0, SCHEME_CHAR_STRLEN_VAL(s), 0);
1411 }
1412 
scheme_char_string_to_byte_string_locale(Scheme_Object * s)1413 Scheme_Object *scheme_char_string_to_byte_string_locale(Scheme_Object *s)
1414 {
1415   return do_char_string_to_byte_string_locale("s->s", s, 0, SCHEME_CHAR_STRLEN_VAL(s), '?');
1416 }
1417 
do_chars_to_bytes(const char * who,int mode,int argc,Scheme_Object * argv[])1418 static Scheme_Object *do_chars_to_bytes(const char *who, int mode,
1419 					int argc, Scheme_Object *argv[])
1420 {
1421   intptr_t istart, ifinish;
1422   int permc;
1423 
1424   if (!SCHEME_CHAR_STRINGP(argv[0]))
1425     scheme_wrong_contract(who, "string?", 0, argc, argv);
1426 
1427   if ((argc < 2) || SCHEME_FALSEP(argv[1]))
1428     permc = -1;
1429   else {
1430     if (!SCHEME_BYTEP(argv[1]))
1431       scheme_wrong_contract(who, "(or/c byte? #f)", 1, argc, argv);
1432     permc = SCHEME_INT_VAL(argv[1]);
1433   }
1434 
1435   scheme_get_substring_indices(who, argv[0], argc, argv,
1436 			       2, 3, &istart, &ifinish);
1437 
1438   if (mode == 1)
1439     return do_char_string_to_byte_string_locale(who, argv[0], istart, ifinish, permc);
1440   else if (mode == 0)
1441     return do_char_string_to_byte_string(argv[0], istart, ifinish, 0);
1442   else {
1443     /* Latin-1 */
1444     mzchar *us;
1445     unsigned char *s;
1446     intptr_t i, len;
1447     len = ifinish - istart;
1448     us = SCHEME_CHAR_STR_VAL(argv[0]);
1449     s = (unsigned char *)scheme_malloc_atomic(len + 1);
1450     for (i = istart; i < ifinish; i++) {
1451       if (us[i] < 256)
1452 	s[i - istart] = us[i];
1453       else if (permc >= 0) {
1454 	s[i - istart] = permc;
1455       } else {
1456 	scheme_contract_error(who,
1457                               "string cannot be encoded in Latin-1",
1458                               "string", 1, argv[0],
1459                               NULL);
1460       }
1461     }
1462     s[len] = 0;
1463 
1464     return scheme_make_sized_byte_string((char *)s, len, 0);
1465   }
1466 }
1467 
char_string_to_byte_string(int argc,Scheme_Object * argv[])1468 static Scheme_Object *char_string_to_byte_string(int argc, Scheme_Object *argv[])
1469 {
1470   return do_chars_to_bytes("string->bytes/utf-8", 0, argc, argv);
1471 }
1472 
char_string_to_byte_string_locale(int argc,Scheme_Object * argv[])1473 static Scheme_Object *char_string_to_byte_string_locale(int argc, Scheme_Object *argv[])
1474 {
1475   return do_chars_to_bytes("string->bytes/locale", 1, argc, argv);
1476 }
1477 
char_string_to_byte_string_latin1(int argc,Scheme_Object * argv[])1478 static Scheme_Object *char_string_to_byte_string_latin1(int argc, Scheme_Object *argv[])
1479 {
1480   return do_chars_to_bytes("string->bytes/latin-1", 2, argc, argv);
1481 }
1482 
1483 /************************* Other *************************/
1484 
char_string_utf8_length(int argc,Scheme_Object * argv[])1485 static Scheme_Object *char_string_utf8_length (int argc, Scheme_Object *argv[])
1486 {
1487   intptr_t istart, ifinish, len;
1488 
1489   if (!SCHEME_CHAR_STRINGP(argv[0]))
1490     scheme_wrong_contract("string-utf-8-length", "string?", 0, argc, argv);
1491 
1492   scheme_get_substring_indices("string-utf-8-length", argv[0], argc, argv,
1493 			       1, 2, &istart, &ifinish);
1494 
1495   len = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[0]), istart, ifinish,
1496 			   NULL, 0, 0);
1497 
1498   return scheme_make_integer(len);
1499 }
1500 
1501 static Scheme_Object *
byte_string_utf8_length(int argc,Scheme_Object * argv[])1502 byte_string_utf8_length (int argc, Scheme_Object *argv[])
1503 {
1504   int len, perm;
1505   intptr_t istart, ifinish;
1506   char *chars;
1507 
1508   if (!SCHEME_BYTE_STRINGP(argv[0]))
1509     scheme_wrong_contract("bytes-utf-8-length", "string?", 0, argc, argv);
1510 
1511   chars = SCHEME_BYTE_STR_VAL(argv[0]);
1512 
1513   if ((argc > 1) && !SCHEME_FALSEP(argv[1])) {
1514     if (!SCHEME_CHARP(argv[1]))
1515       scheme_wrong_contract("bytes-utf-8-length", "(or/c char? #f)", 1, argc, argv);
1516     perm = 1;
1517   } else
1518     perm = 0;
1519 
1520   scheme_get_substring_indices("bytes-utf-8-length", argv[0], argc, argv,
1521 			       2, 3,
1522 			       &istart, &ifinish);
1523 
1524   len = scheme_utf8_decode((unsigned char *)chars, istart, ifinish,
1525 			   NULL, 0, -1,
1526 			   NULL, 0, perm);
1527 
1528   if (len < 0)
1529     return scheme_false;
1530   else
1531     return scheme_make_integer(len);
1532 }
1533 
1534 static Scheme_Object *
byte_string_utf8_index(int argc,Scheme_Object * argv[])1535 byte_string_utf8_index(int argc, Scheme_Object *argv[])
1536 {
1537   intptr_t istart, ifinish, pos = -1, opos, ipos;
1538   int result, perm;
1539   char *chars;
1540 
1541   if (!SCHEME_BYTE_STRINGP(argv[0]))
1542     scheme_wrong_contract("bytes-utf-8-index", "bytes?", 0, argc, argv);
1543 
1544   chars = SCHEME_BYTE_STR_VAL(argv[0]);
1545 
1546   if (SCHEME_INTP(argv[1])) {
1547     pos = SCHEME_INT_VAL(argv[1]);
1548   } else if (SCHEME_BIGNUMP(argv[1])) {
1549     if (SCHEME_BIGPOS(argv[1]))
1550       pos = 0x7FFFFFFF;
1551   }
1552 
1553   if (pos < 0) {
1554     scheme_wrong_contract("bytes-utf-8-index", "exact-nonnegative-integer?", 1, argc, argv);
1555   }
1556 
1557   if ((argc > 2) && !SCHEME_FALSEP(argv[2])) {
1558     if (!SCHEME_CHARP(argv[2]))
1559       scheme_wrong_contract("bytes-utf-8-index", "(or/c char? #f)", 1, argc, argv);
1560     perm = 1;
1561   } else
1562     perm = 0;
1563 
1564   scheme_get_substring_indices("bytes-utf-8-index", argv[0], argc, argv,
1565 			       3, 4,
1566 			       &istart, &ifinish);
1567 
1568   result = utf8_decode_x((unsigned char *)chars, istart, ifinish,
1569 			 NULL, 0, pos,
1570 			 &ipos, &opos,
1571 			 0, 0, NULL, 0, perm ? 1 : 0, 0);
1572 
1573   if (((result < 0) && (result != -3))
1574       || ((ipos == ifinish) && (opos <= pos)))
1575     return scheme_false;
1576   else
1577     return scheme_make_integer(ipos);
1578 }
1579 
1580 static Scheme_Object *
byte_string_utf8_ref(int argc,Scheme_Object * argv[])1581 byte_string_utf8_ref(int argc, Scheme_Object *argv[])
1582 {
1583   intptr_t istart, ifinish, pos = -1, opos, ipos;
1584   char *chars;
1585   unsigned int us[1];
1586   Scheme_Object *perm;
1587 
1588   if (!SCHEME_BYTE_STRINGP(argv[0]))
1589     scheme_wrong_contract("bytes-utf-8-ref", "bytes?", 0, argc, argv);
1590 
1591   chars = SCHEME_BYTE_STR_VAL(argv[0]);
1592 
1593   if (SCHEME_INTP(argv[1])) {
1594     pos = SCHEME_INT_VAL(argv[1]);
1595   } else if (SCHEME_BIGNUMP(argv[1])) {
1596     if (SCHEME_BIGPOS(argv[1]))
1597       pos = 0x7FFFFFFF;
1598   }
1599 
1600   if (pos < 0) {
1601     scheme_wrong_contract("bytes-utf-8-ref", "exact-nonnegative-integer?", 1, argc, argv);
1602   }
1603 
1604   if ((argc > 2) && !SCHEME_FALSEP(argv[2])) {
1605     if (!SCHEME_CHARP(argv[2]))
1606       scheme_wrong_contract("bytes-utf-8-ref", "(or/c char? #f)", 1, argc, argv);
1607     perm = argv[2];
1608   } else
1609     perm = 0;
1610 
1611   scheme_get_substring_indices("bytes-utf-8-ref", argv[0], argc, argv,
1612 			       3, 4,
1613 			       &istart, &ifinish);
1614 
1615   if (pos > 0) {
1616     utf8_decode_x((unsigned char *)chars, istart, ifinish,
1617 		  NULL, 0, pos,
1618 		  &ipos, &opos,
1619 		  0, 0, NULL, 0, perm ? 1 : 0, 0);
1620     if (opos < pos)
1621       return scheme_false;
1622     istart = ipos;
1623   }
1624 
1625   utf8_decode_x((unsigned char *)chars, istart, ifinish,
1626 		us, 0, 1,
1627 		&ipos, &opos,
1628 		0, 0, NULL, 0, perm ? 0xFFFFFF : 0, 0);
1629 
1630   if (opos < 1)
1631     return scheme_false;
1632   else if (us[0] == 0xFFFFFF)
1633     return perm;
1634   else
1635     return scheme_make_character(us[0]);
1636 }
1637 
1638 /********************************************************************/
1639 /*                            format                                */
1640 /********************************************************************/
1641 
scheme_do_format(const char * procname,Scheme_Object * port,const mzchar * format,int flen,int fpos,int offset,int argc,Scheme_Object ** argv)1642 void scheme_do_format(const char *procname, Scheme_Object *port,
1643 		      const mzchar *format, int flen,
1644 		      int fpos, int offset, int argc, Scheme_Object **argv)
1645 {
1646   int i, start, end;
1647   int used = offset;
1648   int num_err = 0, char_err = 0, end_ok = 0;
1649   Scheme_Object *a[2];
1650 
1651   if (!format) {
1652     if (!SCHEME_CHAR_STRINGP(argv[fpos])) {
1653       scheme_wrong_contract(procname, "string?", fpos, argc, argv);
1654       return;
1655     }
1656     format = SCHEME_CHAR_STR_VAL(argv[fpos]);
1657     flen = SCHEME_CHAR_STRTAG_VAL(argv[fpos]);
1658   } else if (flen == -1)
1659     flen = strlen((char *)format);
1660 
1661   /* Check string first: */
1662   end = flen - 1;
1663   for (i = 0; i < end; i++) {
1664     if (format[i] == '~') {
1665       i++;
1666       if (scheme_isspace(format[i])) {
1667 	/* skip spaces... */
1668       } else switch (format[i]) {
1669       case '~':
1670 	if (i == end)
1671 	  end_ok = 1;
1672 	break;
1673       case '%':
1674       case 'n':
1675       case 'N':
1676 	break;
1677       case 'a':
1678       case 'A':
1679       case 's':
1680       case 'S':
1681       case 'v':
1682       case 'V':
1683       case 'e':
1684       case 'E':
1685 	used++;
1686 	break;
1687       case '.':
1688         switch (format[i+1]) {
1689         case 'a':
1690         case 'A':
1691         case 's':
1692         case 'S':
1693         case 'v':
1694         case 'V':
1695           break;
1696         default:
1697 	  scheme_contract_error(procname,
1698                                 "ill-formed pattern string",
1699                                 "explanation", 0, "tag `~.' not followed by `a', `s', or `v'",
1700                                 "pattern string", 1, argv[fpos],
1701                                 NULL);
1702           break;
1703         }
1704         used++;
1705         break;
1706       case 'x':
1707       case 'X':
1708       case 'o':
1709       case 'O':
1710       case 'b':
1711       case 'B':
1712 	if (!num_err && !char_err && (used < argc)) {
1713 	  Scheme_Object *o = argv[used];
1714 	  if (!SCHEME_EXACT_REALP(o)
1715 	      && (!SCHEME_COMPLEXP(o)
1716 		  || !SCHEME_EXACT_REALP(scheme_complex_real_part(o))))
1717 	    num_err = used + 1;
1718 	}
1719 	used++;
1720 	break;
1721       case 'c':
1722       case 'C':
1723 	if (!num_err && !char_err && (used < argc)) {
1724 	  if (!SCHEME_CHARP(argv[used]))
1725 	    char_err = used + 1;
1726 	}
1727 	used++;
1728 	break;
1729       default:
1730 	{
1731 	  char buffer[64];
1732 	  sprintf(buffer, "tag `~%c' not allowed", format[i]);
1733 	  scheme_contract_error(procname,
1734                                 "ill-formed pattern string",
1735                                 "explanation", 0, buffer,
1736                                 "pattern string", 1, argv[fpos],
1737                                 NULL);
1738 	  return;
1739 	}
1740       }
1741     }
1742   }
1743   if ((format[end] == '~') && !end_ok) {
1744     scheme_contract_error(procname,
1745                           "ill-formed pattern string",
1746                           "explanation", 0, "cannot end in `~'",
1747                           "pattern string", 1, argv[fpos],
1748                           NULL);
1749     return;
1750   }
1751   if (used != argc) {
1752     char *args;
1753     intptr_t alen;
1754 
1755     args = scheme_make_args_string("", -1, argc, argv, &alen);
1756 
1757     if (used > argc) {
1758       scheme_raise_exn(MZEXN_FAIL_CONTRACT,
1759 		       "%s: format string requires %d arguments, given %d%t",
1760 		       procname, used - offset, argc - offset, args, alen);
1761     } else {
1762       scheme_raise_exn(MZEXN_FAIL_CONTRACT,
1763 		       "%s: format string requires %d arguments, given %d%t",
1764 		       procname, used - offset, argc - offset, args, alen);
1765     }
1766     return;
1767   }
1768   if (num_err || char_err) {
1769     int pos = (num_err ? num_err : char_err) - 1;
1770     char *args, *bstr;
1771     intptr_t alen;
1772     intptr_t blen;
1773     char *type = (num_err ? "exact-number" : "character");
1774     Scheme_Object *bad = argv[pos];
1775 
1776     args = scheme_make_args_string("other ", pos, argc, argv, &alen);
1777     bstr = scheme_make_provided_string(bad, 1, &blen);
1778     scheme_raise_exn(MZEXN_FAIL_CONTRACT,
1779 		     "%s: format string requires argument of type <%s>, given %t%t",
1780 		     procname, type,
1781 		     bstr, blen,
1782 		     args, alen);
1783     return;
1784   }
1785 
1786   for (used = offset, i = start = 0; i < flen; i++) {
1787     if (format[i] == '~') {
1788       if (start < i) {
1789 	(void)scheme_put_char_string(procname, port, format, start, i - start);
1790       }
1791       i++;
1792       if (scheme_isspace(format[i])) {
1793 	/* skip spaces (at most one newline) */
1794 	do {
1795 	  if ((format[i] == '\n') || (format[i] == '\r')) {
1796 	    /* got one */
1797 	    if ((format[i] == '\r') && (format[i + 1] == '\n'))
1798 	      i++; /* Windows-style CR-NL */
1799 	    i++;
1800 	    while (portable_isspace(format[i])
1801 		   && !((format[i] == '\n') || (format[i] == '\r'))) {
1802 	      i++;
1803 	    }
1804 	    break;
1805 	  } else
1806 	    i++;
1807 	} while (scheme_isspace(format[i]));
1808 	--i; /* back up over something */
1809       } else switch (format[i]) {
1810       case '~':
1811 	scheme_write_byte_string("~", 1, port);
1812 	break;
1813       case '%':
1814       case 'n':
1815       case 'N':
1816 	scheme_write_byte_string("\n", 1, port);
1817 	break;
1818       case 'c':
1819       case 'C':
1820       case 'a':
1821       case 'A':
1822 	a[0] = argv[used++];
1823 	a[1] = port;
1824 	_scheme_apply(scheme_display_proc, 2, a);
1825 	break;
1826       case 's':
1827       case 'S':
1828 	a[0] = argv[used++];
1829 	a[1] = port;
1830 	_scheme_apply(scheme_write_proc, 2, a);
1831 	break;
1832       case 'v':
1833       case 'V':
1834 	a[0] = argv[used++];
1835 	a[1] = port;
1836 	_scheme_apply(scheme_print_proc, 2, a);
1837 	break;
1838       case 'e':
1839       case 'E':
1840 	{
1841 	  intptr_t len;
1842 	  char *s;
1843 	  s = scheme_make_provided_string(argv[used++], 0, &len);
1844 	  scheme_write_byte_string(s, len, port);
1845 	}
1846 	break;
1847       case '.':
1848 	{
1849 	  intptr_t len;
1850 	  char *s;
1851           len = scheme_get_print_width();
1852           i++;
1853           switch (format[i]) {
1854           case 'a':
1855           case 'A':
1856             s = scheme_display_to_string_w_max(argv[used++], &len, len);
1857             break;
1858           case 's':
1859           case 'S':
1860             s = scheme_write_to_string_w_max(argv[used++], &len, len);
1861             break;
1862           case 'v':
1863           case 'V':
1864             s = scheme_print_to_string_w_max(argv[used++], &len, len);
1865             break;
1866           default:
1867             s = "???";
1868             len = 3;
1869           }
1870 	  scheme_write_byte_string(s, len, port);
1871 	}
1872 	break;
1873       case 'x':
1874       case 'X':
1875       case 'o':
1876       case 'O':
1877       case 'b':
1878       case 'B':
1879 	{
1880 	  char *s;
1881 	  int radix;
1882 
1883 	  switch(format[i]) {
1884 	  case 'x':
1885 	  case 'X':
1886 	    radix = 16;
1887 	    break;
1888 	  case 'o':
1889 	  case 'O':
1890 	    radix = 8;
1891 	    break;
1892 	  default:
1893 	  case 'b':
1894 	  case 'B':
1895 	    radix = 2;
1896 	    break;
1897 	  }
1898 	  s = scheme_number_to_string(radix, argv[used++]);
1899 
1900 	  scheme_write_byte_string(s, strlen(s), port);
1901 	}
1902 	break;
1903       }
1904       SCHEME_USE_FUEL(1);
1905       start = i + 1;
1906     }
1907   }
1908 
1909   SCHEME_USE_FUEL(flen);
1910 
1911   if (start < i) {
1912     (void)scheme_put_char_string(procname, port, format, start, i - start);
1913   }
1914 }
1915 
scheme_format(mzchar * format,int flen,int argc,Scheme_Object ** argv,intptr_t * rlen)1916 char *scheme_format(mzchar *format, int flen, int argc, Scheme_Object **argv, intptr_t *rlen)
1917 {
1918   Scheme_Object *port;
1919   port = scheme_make_byte_string_output_port();
1920   scheme_do_format("format", port, format, flen, 0, 0, argc, argv);
1921   return scheme_get_sized_byte_string_output(port, rlen);
1922 }
1923 
scheme_printf(mzchar * format,int flen,int argc,Scheme_Object ** argv)1924 void scheme_printf(mzchar *format, int flen, int argc, Scheme_Object **argv)
1925 {
1926   scheme_do_format("printf", scheme_get_param(scheme_current_config(), MZCONFIG_OUTPUT_PORT),
1927 		   format, flen, 0, 0, argc, argv);
1928 }
1929 
scheme_format_utf8(char * format,int flen,int argc,Scheme_Object ** argv,intptr_t * rlen)1930 char *scheme_format_utf8(char *format, int flen, int argc, Scheme_Object **argv, intptr_t *rlen)
1931 {
1932   mzchar *s;
1933   intptr_t srlen;
1934   if (flen == -1)
1935     flen = strlen(format);
1936   s = scheme_utf8_decode_to_buffer_len((unsigned char *)format, flen, NULL, 0, &srlen);
1937   if (s)
1938     return scheme_format(s, srlen, argc, argv, rlen);
1939   else
1940     return "";
1941 }
1942 
scheme_printf_utf8(char * format,int flen,int argc,Scheme_Object ** argv)1943 void scheme_printf_utf8(char *format, int flen, int argc, Scheme_Object **argv)
1944 {
1945   mzchar *s;
1946   intptr_t srlen;
1947   if (flen == -1)
1948     flen = strlen(format);
1949   s = scheme_utf8_decode_to_buffer_len((unsigned char *)format, flen, NULL, 0, &srlen);
1950   if (s)
1951     scheme_printf(s, srlen, argc, argv);
1952 }
1953 
1954 
1955 static Scheme_Object *
format(int argc,Scheme_Object * argv[])1956 format(int argc, Scheme_Object *argv[])
1957 {
1958   Scheme_Object *port;
1959   char *s;
1960   intptr_t len;
1961 
1962   port = scheme_make_byte_string_output_port();
1963 
1964   scheme_do_format("format", port, NULL, 0, 0, 1, argc, argv);
1965 
1966   s = scheme_get_sized_byte_string_output(port, &len);
1967   return scheme_make_sized_utf8_string(s, len);
1968 }
1969 
1970 #ifdef INSTRUMENT_PRIMITIVES
1971 extern int g_print_prims;
1972 #endif
1973 
1974 static Scheme_Object *
sch_printf(int argc,Scheme_Object * argv[])1975 sch_printf(int argc, Scheme_Object *argv[])
1976 {
1977   scheme_do_format("printf", scheme_get_param(scheme_current_config(), MZCONFIG_OUTPUT_PORT),
1978 		   NULL, 0, 0, 1, argc, argv);
1979   return scheme_void;
1980 }
1981 
1982 static Scheme_Object *
sch_eprintf(int argc,Scheme_Object * argv[])1983 sch_eprintf(int argc, Scheme_Object *argv[])
1984 {
1985   scheme_do_format("eprintf", scheme_get_param(scheme_current_config(), MZCONFIG_ERROR_PORT),
1986 		   NULL, 0, 0, 1, argc, argv);
1987   return scheme_void;
1988 }
1989 
1990 static Scheme_Object *
sch_fprintf(int argc,Scheme_Object * argv[])1991 sch_fprintf(int argc, Scheme_Object *argv[])
1992 {
1993   if (!SCHEME_OUTPUT_PORTP(argv[0]))
1994     scheme_wrong_contract("fprintf", "output-port?", 0, argc, argv);
1995 
1996   scheme_do_format("fprintf", argv[0], NULL, 0, 1, 2, argc, argv);
1997   return scheme_void;
1998 }
1999 
2000 /********************************************************************/
2001 /*                              misc                                */
2002 /********************************************************************/
2003 
2004 static Scheme_Object *
version(int argc,Scheme_Object * argv[])2005 version(int argc, Scheme_Object *argv[])
2006 {
2007   return vers_str;
2008 }
2009 
2010 static Scheme_Object *
banner(int argc,Scheme_Object * argv[])2011 banner(int argc, Scheme_Object *argv[])
2012 {
2013   return banner_str;
2014 }
2015 
scheme_version(void)2016 char *scheme_version(void)
2017 {
2018   return MZSCHEME_VERSION;
2019 }
2020 
2021 #ifdef MZ_PRECISE_GC
2022 # define VERSION_SUFFIX " [bc]"
2023 #else
2024 # ifdef USE_SENORA_GC
2025 #  define VERSION_SUFFIX " [cgc]"
2026 # else
2027 #  define VERSION_SUFFIX " [cgc/b]"
2028 # endif
2029 #endif
2030 
scheme_banner(void)2031 char *scheme_banner(void)
2032 {
2033   if (embedding_banner)
2034     return embedding_banner;
2035   else
2036     return ("Welcome to Racket"
2037             " v" MZSCHEME_VERSION VERSION_SUFFIX
2038             ".\n");
2039 }
2040 
scheme_set_banner(char * s)2041 void scheme_set_banner(char *s)
2042 {
2043   embedding_banner = s;
2044 }
2045 
scheme_byte_string_has_null(Scheme_Object * o)2046 int scheme_byte_string_has_null(Scheme_Object *o)
2047 {
2048   const char *s = SCHEME_BYTE_STR_VAL(o);
2049   int i = SCHEME_BYTE_STRTAG_VAL(o);
2050   while (i--) {
2051     if (!s[i])
2052       return 1;
2053   }
2054   return 0;
2055 }
2056 
scheme_any_string_has_null(Scheme_Object * o)2057 int scheme_any_string_has_null(Scheme_Object *o)
2058 {
2059   if (SCHEME_BYTE_STRINGP(o))
2060     return scheme_byte_string_has_null(o);
2061   else {
2062     const mzchar *s = SCHEME_CHAR_STR_VAL(o);
2063     int i = SCHEME_CHAR_STRTAG_VAL(o);
2064     while (i--) {
2065       if (!s[i])
2066 	return 1;
2067     }
2068     return 0;
2069   }
2070 }
2071 
2072 /***********************************************************************/
2073 /* Environment Variables                                               */
2074 /***********************************************************************/
2075 
2076 /* A `scheme_environment_variables_type` record wraps a hash table
2077    that maps normalized keys to (cons <key> <val>), where the key
2078    in the pair preserves its original case. */
2079 
2080 #define SCHEME_ENVVARS_TABLE(ev) ((Scheme_Hash_Tree *)SCHEME_PTR_VAL(ev))
2081 
scheme_make_environment_variables(Scheme_Hash_Tree * ht)2082 Scheme_Object *scheme_make_environment_variables(Scheme_Hash_Tree *ht)
2083 {
2084   Scheme_Object *ev;
2085 
2086   ev = scheme_alloc_small_object();
2087   ev->type = scheme_environment_variables_type;
2088   SCHEME_PTR_VAL(ev) = (Scheme_Object *)ht;
2089 
2090   return ev;
2091 }
2092 
env_p(int argc,Scheme_Object * argv[])2093 static Scheme_Object *env_p(int argc, Scheme_Object *argv[])
2094 {
2095   return (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_environment_variables_type)
2096           ? scheme_true
2097           : scheme_false);
2098 }
2099 
current_environment_variables(int argc,Scheme_Object * argv[])2100 static Scheme_Object *current_environment_variables(int argc, Scheme_Object *argv[])
2101 {
2102   Scheme_Object *v;
2103 
2104   v = scheme_param_config2("current-environment-variables",
2105                            scheme_make_integer(MZCONFIG_CURRENT_ENV_VARS),
2106                            argc, argv,
2107                            -1, env_p, "environment-variables?", 0);
2108 
2109   return v;
2110 }
2111 
2112 static int sch_bool_getenv(const char* name);
2113 
scheme_init_getenv(void)2114 void scheme_init_getenv(void)
2115 {
2116   if (sch_bool_getenv("PLTNOMZJIT")) {
2117     scheme_set_startup_use_jit(0);
2118   }
2119   if (sch_bool_getenv("PLT_SHOW_BUILTIN_CONTEXT")) {
2120     scheme_keep_builtin_context = 1;
2121   }
2122 }
2123 
sch_bool_getenv(const char * name)2124 static int sch_bool_getenv(const char* name)
2125 {
2126   if (rktio_getenv(scheme_rktio, name))
2127     return 1;
2128   else
2129     return 0;
2130 }
2131 
byte_string_ok_name(Scheme_Object * o)2132 int byte_string_ok_name(Scheme_Object *o)
2133 {
2134   const char *s = SCHEME_BYTE_STR_VAL(o);
2135   int i = SCHEME_BYTE_STRTAG_VAL(o);
2136 
2137   while (i--) {
2138     if (!s[i])
2139       return 0;
2140   }
2141 
2142   return rktio_is_ok_envvar_name(scheme_rktio, s);
2143 
2144   return 1;
2145 }
2146 
normalize_env_case(Scheme_Object * bs)2147 static Scheme_Object *normalize_env_case(Scheme_Object *bs)
2148 {
2149   if (rktio_are_envvar_names_case_insensitive(scheme_rktio)) {
2150     bs = scheme_byte_string_to_char_string(bs);
2151     bs = string_locale_downcase(1, &bs);
2152     bs = scheme_char_string_to_byte_string(bs);
2153   }
2154   return bs;
2155 }
2156 
scheme_getenv(char * name)2157 char *scheme_getenv(char *name)
2158 {
2159   char *s;
2160   s = rktio_getenv(scheme_rktio, name);
2161   if (s)
2162     return scheme_strdup_and_free(s);
2163   else
2164     return NULL;
2165 }
2166 
sch_getenv(int argc,Scheme_Object * argv[])2167 static Scheme_Object *sch_getenv(int argc, Scheme_Object *argv[])
2168 {
2169   char *name;
2170   char *value;
2171   Scheme_Object *bs, *ev, *val;
2172   Scheme_Hash_Tree *ht;
2173 
2174   if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_environment_variables_type))
2175     scheme_wrong_contract("environment-variables-ref", "environment-variables?", 0, argc, argv);
2176 
2177   bs = argv[1];
2178   if (!SCHEME_BYTE_STRINGP(bs)
2179       || !byte_string_ok_name(bs))
2180     scheme_wrong_contract("environment-variables-ref", "bytes-environment-variable-name?", 1, argc, argv);
2181 
2182   ev = argv[0];
2183   ht = SCHEME_ENVVARS_TABLE(ev);
2184 
2185   if (!ht) {
2186     name = SCHEME_BYTE_STR_VAL(bs);
2187 
2188     value = rktio_getenv(scheme_rktio, name);
2189     if (value) {
2190       val = scheme_make_byte_string(value);
2191       free(value);
2192     } else
2193     val = scheme_false;
2194 
2195     return val;
2196   } else {
2197     bs = normalize_env_case(bs);
2198     val = scheme_hash_tree_get(ht, bs);
2199     return val ? SCHEME_CDR(val) : scheme_false;
2200   }
2201 }
2202 
sch_putenv(int argc,Scheme_Object * argv[])2203 static Scheme_Object *sch_putenv(int argc, Scheme_Object *argv[])
2204 {
2205   Scheme_Object *varbs, *valbs, *norm_varbs, *ev;
2206   Scheme_Hash_Tree *ht;
2207   char *var;
2208   char *val;
2209 
2210   if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_environment_variables_type))
2211     scheme_wrong_contract("environment-variables-set!", "environment-variables?", 0, argc, argv);
2212 
2213   varbs = argv[1];
2214   if (!SCHEME_BYTE_STRINGP(varbs)
2215       || !byte_string_ok_name(varbs))
2216     scheme_wrong_contract("environment-variables-set!", "bytes-environment-variable-name?", 1, argc, argv);
2217 
2218   valbs = argv[2];
2219   if (!SCHEME_FALSEP(valbs)
2220       && (!SCHEME_BYTE_STRINGP(valbs)
2221           || scheme_byte_string_has_null(valbs)))
2222     scheme_wrong_contract("environment-variables-set!", "(or/c bytes-no-nuls? #f)", 2, argc, argv);
2223   if (argc > 3)
2224     scheme_check_proc_arity("environment-variables-set!", 0, 3, argc, argv);
2225 
2226   ev = argv[0];
2227   ht = SCHEME_ENVVARS_TABLE(ev);
2228 
2229   if (ht) {
2230     norm_varbs = normalize_env_case(varbs);
2231 
2232     if (SCHEME_FALSEP(valbs)) {
2233       ht = scheme_hash_tree_set(ht, norm_varbs, NULL);
2234     } else {
2235       if (SAME_OBJ(varbs, norm_varbs)) {
2236         varbs = byte_string_to_immutable(1, &varbs);
2237         norm_varbs = varbs;
2238       } else {
2239         varbs = byte_string_to_immutable(1, &varbs);
2240         norm_varbs = byte_string_to_immutable(1, &norm_varbs);
2241       }
2242       valbs = byte_string_to_immutable(1, &valbs);
2243       ht = scheme_hash_tree_set(ht, norm_varbs, scheme_make_pair(varbs, valbs));
2244     }
2245 
2246     SCHEME_PTR_VAL(ev) = (Scheme_Object *)ht;
2247 
2248     return scheme_void;
2249   } else {
2250     var = SCHEME_BYTE_STR_VAL(varbs);
2251 
2252     if (SCHEME_FALSEP(valbs)) {
2253       val = NULL;
2254     } else {
2255       val = SCHEME_BYTE_STR_VAL(valbs);
2256     }
2257 
2258     if (!rktio_setenv(scheme_rktio, var, val)) {
2259       if (argc > 3)
2260         return _scheme_tail_apply(argv[3], 0, NULL);
2261       else {
2262         scheme_raise_exn(MZEXN_FAIL,
2263                          "environment-variables-set!: change failed\n"
2264                          "  system error: %R");
2265       }
2266     }
2267 
2268     return scheme_void;
2269   }
2270 }
2271 
env_copy(int argc,Scheme_Object * argv[])2272 static Scheme_Object *env_copy(int argc, Scheme_Object *argv[])
2273 {
2274   Scheme_Hash_Tree *ht;
2275 
2276   if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_environment_variables_type))
2277     scheme_wrong_contract("environment-variables-copy", "environment-variables?", 0, argc, argv);
2278 
2279   ht = SCHEME_ENVVARS_TABLE(argv[0]);
2280   if (ht)
2281     return scheme_make_environment_variables(ht);
2282 
2283   /* copy system environment variables into a hash table: */
2284   ht = scheme_make_hash_tree(SCHEME_hashtr_equal);
2285 
2286   {
2287     intptr_t i;
2288     rktio_envvars_t *envvars;
2289     Scheme_Object *var, *val, *norm_var;
2290 
2291     envvars = rktio_envvars(scheme_rktio);
2292     for (i = rktio_envvars_count(scheme_rktio, envvars); i--; ) {
2293       var = scheme_make_immutable_sized_byte_string(rktio_envvars_name_ref(scheme_rktio, envvars, i), -1, 1);
2294       val = scheme_make_immutable_sized_byte_string(rktio_envvars_value_ref(scheme_rktio, envvars, i), -1, 1);
2295       norm_var = normalize_env_case(var);
2296       if (!SAME_OBJ(var, norm_var))
2297         norm_var = byte_string_to_immutable(1, &norm_var);
2298       ht = scheme_hash_tree_set(ht, norm_var, scheme_make_pair(var, val));
2299     }
2300 
2301     rktio_envvars_free(scheme_rktio, envvars);
2302   }
2303 
2304   return scheme_make_environment_variables(ht);
2305 }
2306 
env_make(int argc,Scheme_Object * argv[])2307 static Scheme_Object *env_make(int argc, Scheme_Object *argv[])
2308 {
2309   Scheme_Hash_Tree *ht;
2310   Scheme_Object *varbs, *valbs, *norm_varbs;
2311   int i;
2312 
2313   ht = scheme_make_hash_tree(SCHEME_hashtr_equal);
2314 
2315   for (i = 0; i < argc; i += 2) {
2316     varbs = argv[i];
2317     if (!SCHEME_BYTE_STRINGP(varbs)
2318         || !byte_string_ok_name(varbs))
2319       scheme_wrong_contract("make-environment-variables", "bytes-environment-variable-name?", i, argc, argv);
2320 
2321     if (i+1 >= argc) {
2322       scheme_contract_error("make-environment-variables",
2323                             "key does not have a value (i.e., an odd number of arguments were provided)",
2324                             "key", 1, argv[i],
2325                             NULL);
2326       return NULL;
2327     }
2328 
2329     valbs = argv[i+1];
2330     if (!SCHEME_FALSEP(valbs)
2331         && (!SCHEME_BYTE_STRINGP(valbs)
2332             || scheme_byte_string_has_null(valbs)))
2333       scheme_wrong_contract("make-environment-variables", "(or/c bytes-no-nuls? #f)", i+1, argc, argv);
2334 
2335     varbs = byte_string_to_immutable(1, &varbs);
2336     valbs = byte_string_to_immutable(1, &valbs);
2337     norm_varbs = normalize_env_case(varbs);
2338     if (!SAME_OBJ(varbs, norm_varbs))
2339       norm_varbs = byte_string_to_immutable(1, &norm_varbs);
2340     ht = scheme_hash_tree_set(ht, norm_varbs, scheme_make_pair(varbs, valbs));
2341   }
2342 
2343   return scheme_make_environment_variables(ht);
2344 }
2345 
sch_getenv_names(int argc,Scheme_Object * argv[])2346 static Scheme_Object *sch_getenv_names(int argc, Scheme_Object *argv[])
2347 {
2348   Scheme_Object *ev, *r = scheme_null, *key, *val;
2349   Scheme_Hash_Tree *ht;
2350   mzlonglong i;
2351 
2352   ev = argv[0];
2353   if (!SAME_TYPE(SCHEME_TYPE(ev), scheme_environment_variables_type))
2354     scheme_wrong_contract("environment-variables-names", "environment-variables?", 0, argc, argv);
2355 
2356   ht = SCHEME_ENVVARS_TABLE(ev);
2357   if (!ht) {
2358     ev = env_copy(1, argv);
2359     ht = SCHEME_ENVVARS_TABLE(ev);
2360   }
2361 
2362   for (i = scheme_hash_tree_next(ht, -1); i != -1; i = scheme_hash_tree_next(ht, i)) {
2363     scheme_hash_tree_index(ht, i, &key, &val);
2364     r = scheme_make_pair(SCHEME_CAR(val), r);
2365   }
2366 
2367   return r;
2368 }
2369 
scheme_environment_variables_to_envvars(Scheme_Object * ev)2370 rktio_envvars_t *scheme_environment_variables_to_envvars(Scheme_Object *ev)
2371 {
2372   Scheme_Hash_Tree *ht = SCHEME_ENVVARS_TABLE(ev);
2373   rktio_envvars_t *envvars;
2374   mzlonglong i;
2375   Scheme_Object *key, *val;
2376 
2377   if (!ht)
2378     return NULL;
2379 
2380   envvars = rktio_empty_envvars(scheme_rktio);
2381 
2382   for (i = scheme_hash_tree_next(ht, -1); i != -1; i = scheme_hash_tree_next(ht, i)) {
2383     scheme_hash_tree_index(ht, i, &key, &val);
2384 
2385     rktio_envvars_set(scheme_rktio,
2386                       envvars,
2387                       SCHEME_BYTE_STR_VAL(SCHEME_CAR(val)),
2388                       SCHEME_BYTE_STR_VAL(SCHEME_CDR(val)));
2389   }
2390 
2391   return envvars;
2392 }
2393 
2394 /***********************************************************************/
2395 /* End Environment Variables                                           */
2396 /***********************************************************************/
2397 
scheme_set_cross_compile_mode(int v)2398 void scheme_set_cross_compile_mode(int v)
2399 {
2400   cross_compile_mode = v;
2401 }
2402 
2403 #include "systype.inc"
2404 
system_type(int argc,Scheme_Object * argv[])2405 static Scheme_Object *system_type(int argc, Scheme_Object *argv[])
2406 {
2407   if (argc) {
2408     if (SAME_OBJ(argv[0], link_symbol)) {
2409       return scheme_intern_symbol(MZ_SYSTEM_TYPE_LINK);
2410     }
2411 
2412     if (SAME_OBJ(argv[0], machine_symbol)) {
2413       char *s;
2414       Scheme_Object *str;
2415 
2416       s = rktio_uname(scheme_rktio);
2417       str = scheme_make_locale_string(s);
2418       rktio_free(s);
2419 
2420       return str;
2421     }
2422 
2423     if (SAME_OBJ(argv[0], gc_symbol)) {
2424 #ifdef MZ_PRECISE_GC
2425       return _3m_symbol;
2426 #else
2427       return cgc_symbol;
2428 #endif
2429     }
2430 
2431     if (SAME_OBJ(argv[0], vm_symbol)) {
2432       return racket_symbol;
2433     }
2434 
2435     if (SAME_OBJ(argv[0], so_suffix_symbol)) {
2436       return scheme_make_byte_string(MZ_SYSTEM_TYPE_SO_SUFFIX);
2437     }
2438 
2439     if (SAME_OBJ(argv[0], so_mode_symbol)) {
2440       return scheme_intern_symbol(MZ_SYSTEM_TYPE_SO_MODE);
2441     }
2442 
2443 
2444     if (SAME_OBJ(argv[0], word_symbol)) {
2445       return scheme_make_integer(sizeof(void*)*8);
2446     }
2447 
2448     if (SAME_OBJ(argv[0], fs_change_symbol)) {
2449       return fs_change_props;
2450     }
2451 
2452     if (SAME_OBJ(argv[0], target_machine_symbol)) {
2453       return racket_symbol;
2454     }
2455 
2456     if (SAME_OBJ(argv[0], cross_symbol)) {
2457       return (cross_compile_mode ? force_symbol : infer_symbol);
2458     }
2459 
2460     if (SAME_OBJ(argv[0], os_star_symbol)) {
2461       return sys_os_symbol;
2462     }
2463 
2464     if (SAME_OBJ(argv[0], arch_symbol)) {
2465       return sys_arch_symbol;
2466     }
2467 
2468     if (!SAME_OBJ(argv[0], os_symbol)) {
2469       scheme_wrong_contract("system-type",
2470                             ("(or/c 'os 'os* 'arch 'word 'link 'machine 'target-machine\n"
2471                              "      'vm 'gc 'so-suffix 'so-mode 'word 'fs-change 'cross)"),
2472                             0, argc, argv);
2473       return NULL;
2474     }
2475   }
2476 
2477   return sys_symbol;
2478 }
2479 
system_library_subpath(int argc,Scheme_Object * argv[])2480 static Scheme_Object *system_library_subpath(int argc, Scheme_Object *argv[])
2481 {
2482   if (argc > 0) {
2483     if (SCHEME_FALSEP(argv[0]))
2484       return platform_cgc_path;
2485 
2486     if (SAME_OBJ(cgc_symbol, argv[0]))
2487       return platform_cgc_path;
2488 
2489     if (SAME_OBJ(_3m_symbol, argv[0]))
2490       return platform_3m_path;
2491 
2492     if (SAME_OBJ(cs_symbol, argv[0]))
2493       return platform_cs_path;
2494 
2495     scheme_wrong_contract("system-library-subpath", "(or/c 'cgc '3m 'cs #f)", 0, argc, argv);
2496     return NULL;
2497   } else {
2498 #ifdef MZ_PRECISE_GC
2499     return platform_3m_path;
2500 #else
2501     return platform_cgc_path;
2502 #endif
2503   }
2504 }
2505 
scheme_system_library_subpath()2506 const char *scheme_system_library_subpath()
2507 {
2508   return SCHEME_PLATFORM_LIBRARY_SUBPATH SPLS_SUFFIX;
2509 }
2510 
2511 /* Our own strncpy - which would be really stupid, except the one for
2512    the implementation in Solaris 2.6 is broken (it doesn't always stop
2513    at the null terminator). */
scheme_strncmp(const char * a,const char * b,int len)2514 int scheme_strncmp(const char *a, const char *b, int len)
2515 {
2516   while (len-- && (*a == *b) && *a) {
2517     a++;
2518     b++;
2519   }
2520 
2521   if (len < 0)
2522     return 0;
2523   else
2524     return *a - *b;
2525 }
2526 
ok_cmdline(int argc,Scheme_Object ** argv)2527 static Scheme_Object *ok_cmdline(int argc, Scheme_Object **argv)
2528 {
2529   if (SCHEME_CHAPERONE_VECTORP(argv[0])) {
2530     Scheme_Object *vec = argv[0], *vec2, *str;
2531     int i, size = SCHEME_CHAPERONE_VEC_SIZE(vec);
2532 
2533     if (!size)
2534       return vec;
2535 
2536     /* Make sure vector and strings are immutable: */
2537     vec2 = scheme_make_vector(size, NULL);
2538     if (size)
2539       SCHEME_SET_VECTOR_IMMUTABLE(vec2);
2540     for (i = 0; i < size; i++) {
2541       if (SCHEME_VECTORP(vec))
2542         str = SCHEME_VEC_ELS(vec)[i];
2543       else
2544         str = scheme_chaperone_vector_ref(vec, i);
2545       if (!SCHEME_CHAR_STRINGP(str))
2546         return NULL;
2547       if (!SCHEME_IMMUTABLE_CHAR_STRINGP(str)) {
2548 	str = scheme_make_sized_char_string(SCHEME_CHAR_STR_VAL(str), SCHEME_CHAR_STRLEN_VAL(str), 0);
2549 	SCHEME_SET_CHAR_STRING_IMMUTABLE(str);
2550       }
2551       SCHEME_VEC_ELS(vec2)[i] = str;
2552     }
2553 
2554     return vec2;
2555   }
2556 
2557   return NULL;
2558 }
2559 
cmdline_args(int argc,Scheme_Object * argv[])2560 static Scheme_Object *cmdline_args(int argc, Scheme_Object *argv[])
2561 {
2562   return scheme_param_config2("current-command-line-arguments",
2563                               scheme_make_integer(MZCONFIG_CMDLINE_ARGS),
2564                               argc, argv,
2565                               -1, ok_cmdline, "(vectorof string?)", 1);
2566 }
2567 
2568 /**********************************************************************/
2569 /*                           locale ops                               */
2570 /**********************************************************************/
2571 
ok_locale(int argc,Scheme_Object ** argv)2572 static Scheme_Object *ok_locale(int argc, Scheme_Object **argv)
2573 {
2574   if (SCHEME_FALSEP(argv[0]))
2575     return argv[0];
2576   else if (SCHEME_CHAR_STRINGP(argv[0])) {
2577     if (SCHEME_IMMUTABLEP(argv[0]))
2578       return argv[0];
2579     else {
2580       Scheme_Object *str = argv[0];
2581       str = scheme_make_immutable_sized_char_string(SCHEME_CHAR_STR_VAL(str), SCHEME_CHAR_STRLEN_VAL(str), 1);
2582       return str;
2583     }
2584   }
2585 
2586   return NULL;
2587 }
2588 
current_locale(int argc,Scheme_Object * argv[])2589 static Scheme_Object *current_locale(int argc, Scheme_Object *argv[])
2590 {
2591   Scheme_Object *v;
2592 
2593   v = scheme_param_config2("current-locale",
2594                            scheme_make_integer(MZCONFIG_LOCALE),
2595                            argc, argv,
2596                            -1, ok_locale, "(or/c #f string?)", 1);
2597 
2598   return v;
2599 }
2600 
locale_string_encoding(int argc,Scheme_Object * argv[])2601 static Scheme_Object *locale_string_encoding(int argc, Scheme_Object *argv[])
2602 {
2603   char *enc;
2604   Scheme_Object *s;
2605 
2606   reset_locale();
2607   if (mzLOCALE_IS_UTF_8(current_locale_name) || !locale_on)
2608     return scheme_make_utf8_string("UTF-8");
2609 
2610   enc = rktio_locale_encoding(scheme_rktio);
2611   s = scheme_make_utf8_string(enc);
2612   free(enc);
2613 
2614   return s;
2615 }
2616 
system_language_country(int argc,Scheme_Object * argv[])2617 static Scheme_Object *system_language_country(int argc, Scheme_Object *argv[])
2618 {
2619   char *lc;
2620   Scheme_Object *s;
2621 
2622   lc = rktio_system_language_country(scheme_rktio);
2623   s = scheme_make_utf8_string(lc);
2624   free(lc);
2625 
2626   return s;
2627 }
2628 
do_convert_close(rktio_converter_t * cd,int cache_mode,const char * to_e,const char * from_e)2629 static void do_convert_close(rktio_converter_t *cd, int cache_mode, const char *to_e, const char *from_e)
2630 /* If `cache_mode` is -1, then `to_e` needs to be freed (or cached).
2631    If `cache_mode` is 1, then `from_e` needs to be freed (or cached). */
2632 {
2633   if (cache_mode == -1)
2634     cache_locale_or_close(1, cd, (char *)to_e);
2635   else if (cache_mode == 1)
2636     cache_locale_or_close(0, cd, (char *)from_e);
2637   else if (!cache_mode)
2638     rktio_converter_close(scheme_rktio, cd);
2639 }
2640 
do_convert(rktio_converter_t * cd,const char * from_e,const char * to_e,int to_from_utf8,char * in,int id,int iilen,char * out,int od,int iolen,int grow,int add_end_shift,int extra,intptr_t * oilen,intptr_t * oolen,int * status)2641 static char *do_convert(rktio_converter_t *cd,
2642 			/* if !cd and either from_e or to_e can be NULL, then
2643 			   reset_locale() must have been called */
2644 			const char *from_e, const char *to_e,
2645 			/* 1 => UCS-4 -> UTF-8; 2 => UTF-8 -> UCS-4; 0 => other */
2646 			int to_from_utf8,
2647 			/* in can be NULL to output just a shift; in that case,
2648 			   id should be 0, too */
2649 			char *in, int id, int iilen,
2650 			char *out, int od, int iolen,
2651 			/* if grow, then reallocate when out isn't big enough */
2652 			int grow,
2653 			/* if add_end_shift, add a shift sequence to the end;
2654 			   not useful if in is already NULL to indicate a shift */
2655 			int add_end_shift,
2656 			/* extra specifies the length of a terminator,
2657 			   not included in iolen or *oolen */
2658 			int extra,
2659 			/* these two report actual read/wrote sizes: */
2660 			intptr_t *oilen, intptr_t *oolen,
2661 			/* status is set to
2662 			   0 for complete,
2663 			   -1 for partial input,
2664 			   -2 for error,
2665 			   1 for more avail */
2666 			int *status)
2667 {
2668   int dip, dop, close_it = 0, cache_mode = 0, mz_utf8 = 0;
2669   intptr_t il, ol, r;
2670   GC_CAN_IGNORE char *ip, *op;
2671 
2672   /* Defaults: */
2673   *status = -1;
2674   if (oilen)
2675     *oilen = 0;
2676   *oolen = 0;
2677 
2678   if (!cd) {
2679     if (rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED) {
2680       char *tmp_from_e = NULL, *tmp_to_e = NULL;
2681 
2682       if (!to_e && !strcmp(from_e, MZ_UCS4_NAME))
2683         cache_mode = -1;
2684       else if (!from_e && !strcmp(to_e, MZ_UCS4_NAME))
2685         cache_mode = 1;
2686 
2687       if (!from_e) {
2688 	tmp_from_e = rktio_locale_encoding(scheme_rktio);
2689         from_e = tmp_from_e;
2690       }
2691       if (!to_e) {
2692 	tmp_to_e = rktio_locale_encoding(scheme_rktio);
2693         to_e = tmp_to_e;
2694       }
2695 
2696       if ((cache_mode == -1)
2697           && cached_locale_to_converter
2698           && !strcmp(to_e, cached_locale_encoding_name)) {
2699         cd = cached_locale_to_converter;
2700         cached_locale_to_converter = NULL;
2701       } else if ((cache_mode == 1)
2702                  && cached_locale_from_converter
2703                  && !strcmp(from_e, cached_locale_encoding_name)) {
2704         cd = cached_locale_from_converter;
2705         cached_locale_from_converter = NULL;
2706       } else {
2707         cd = rktio_converter_open(scheme_rktio, to_e, from_e);
2708       }
2709       close_it = 1;
2710       if (tmp_from_e && ((cache_mode != 1) || !cd)) free(tmp_from_e);
2711       if (tmp_to_e && ((cache_mode != -1) || !cd)) free(tmp_to_e);
2712     } else if (to_from_utf8) {
2713       /* Assume UTF-8 */
2714       mz_utf8 = 1;
2715     }
2716   }
2717 
2718   if (!cd && !mz_utf8) {
2719     if (out) {
2720       while (extra--) {
2721 	out[extra] = 0;
2722       }
2723     }
2724     return out;
2725   }
2726 
2727   /* The converter is ready. Allocate out space, if necessary */
2728 
2729   if (!out) {
2730     if (iolen <= 0)
2731       iolen = iilen;
2732     out = (char *)scheme_malloc_atomic(iolen + extra);
2733     od = 0;
2734   }
2735 
2736   /* il and ol are the number of available chars */
2737   il = iilen;
2738   ol = iolen;
2739   /* dip and dop are the number of characters read so far;
2740      we use these and NULL out the ip and op pointers
2741      for the sake of precise GC */
2742   dip = 0;
2743   dop = 0;
2744   if (!in)
2745     add_end_shift = 0;
2746 
2747   while (1) {
2748     int icerr;
2749 
2750     if (mz_utf8) {
2751       /* Use our UTF-8 routines as if they were iconv */
2752       if (to_from_utf8 == 1) {
2753 	/* UCS-4 -> UTF-8 */
2754 	/* We assume that in + id and iilen are mzchar-aligned */
2755 	int opos, uid, uilen;
2756 	uid = (id + dip) >> 2;
2757 	uilen = (iilen - dip) >> 2;
2758 	opos = scheme_utf8_encode((const unsigned int *)in, uid, uilen,
2759 				  NULL, 0,
2760 				  0);
2761 	if (opos <= iolen) {
2762 	  opos = scheme_utf8_encode((const unsigned int *)in, uid, uilen,
2763 				    (unsigned char *)out, od + dop,
2764 				    0);
2765 	  dop += opos;
2766 	  dip += iilen;
2767 	  icerr = 0;
2768 	  r = (size_t)opos;
2769 	} else {
2770 	  icerr = E2BIG;
2771 	  r = (size_t)-1;
2772 	}
2773       } else {
2774 	/* UTF-8 -> UCS-4 */
2775 	/* We assume that out + od is mzchar-aligned */
2776 	intptr_t ipos, opos;
2777 
2778 	r = utf8_decode_x((unsigned char *)in, id + dip, iilen,
2779 			  (unsigned int *)out, (od + dop) >> 2, iolen >> 2,
2780 			  &ipos, &opos,
2781 			  0, 0, NULL, 0, 0, 0);
2782 
2783 	opos <<= 2;
2784 	dop = (opos - od);
2785 	dip = (ipos - id);
2786 
2787 	if ((r == -1) || (r == -2)) {
2788 	  r = (size_t)-1;
2789 	  icerr = RKTIO_ERROR_CONVERT_BAD_SEQUENCE;
2790 	} else if (r == -3) {
2791 	  icerr = RKTIO_ERROR_CONVERT_NOT_ENOUGH_SPACE;
2792 	  r = (size_t)-1;
2793 	} else
2794 	  icerr = 0;
2795       }
2796     } else  {
2797       ip = in XFORM_OK_PLUS id + dip;
2798       op = out XFORM_OK_PLUS od + dop;
2799       r = rktio_convert(scheme_rktio, cd, &ip, &il, &op, &ol);
2800       dip = ip - (in XFORM_OK_PLUS id);
2801       dop = op - (out XFORM_OK_PLUS od);
2802       ip = op = NULL;
2803       icerr = rktio_get_last_error(scheme_rktio);
2804     }
2805 
2806     /* Record how many chars processed, now */
2807     if (oilen)
2808       *oilen = dip;
2809     *oolen = dop;
2810 
2811     /* Got all the chars? */
2812     if (r == RKTIO_CONVERT_ERROR) {
2813       if (icerr == RKTIO_ERROR_CONVERT_NOT_ENOUGH_SPACE) {
2814 	if (grow) {
2815 	  /* Double the string size and try again */
2816 	  char *naya;
2817 	  naya = (char *)scheme_malloc_atomic((iolen * 2) + extra);
2818 	  memcpy(naya, out + od, *oolen);
2819 	  ol += iolen;
2820 	  iolen += iolen;
2821 	  out = naya;
2822 	  od = 0;
2823 	} else {
2824 	  *status = 1;
2825 	  if (close_it)
2826             do_convert_close(cd, cache_mode, to_e, from_e);
2827 	  while (extra--) {
2828 	    out[od + dop + extra] = 0;
2829 	  }
2830 	  return out;
2831 	}
2832       } else {
2833 	/* Either EINVAL (premature end) or EILSEQ (bad sequence) */
2834 	if (icerr == RKTIO_ERROR_CONVERT_BAD_SEQUENCE)
2835 	  *status = -2;
2836 	if (close_it)
2837           do_convert_close(cd, cache_mode, to_e, from_e);
2838 	while (extra--) {
2839 	  out[od + dop + extra] = 0;
2840 	}
2841 	return out;
2842       }
2843     } else {
2844       /* All done... */
2845       if (add_end_shift) {
2846 	add_end_shift = 0;
2847 	in = NULL;
2848 	dip = 0;
2849 	id = 0;
2850 	il = 0; /* should be redundant */
2851 	oilen = NULL; /* so it doesn't get set to 0 */
2852       } else {
2853 	*status = 0;
2854 	if (close_it)
2855           do_convert_close(cd, cache_mode, to_e, from_e);
2856 	while (extra--) {
2857 	  out[od + dop + extra] = 0;
2858 	}
2859 	return out;
2860       }
2861     }
2862   }
2863 }
2864 
2865 #define MZ_SC_BUF_SIZE 32
2866 
string_to_from_locale(int to_bytes,char * in,intptr_t delta,intptr_t len,intptr_t * olen,int perm,int * no_cvt)2867 static char *string_to_from_locale(int to_bytes,
2868 				   char *in, intptr_t delta, intptr_t len,
2869 				   intptr_t *olen, int perm,
2870 				   int *no_cvt)
2871      /* Call this function only when iconv is available, and only when
2872 	reset_locale() has been called */
2873 {
2874   Scheme_Object *parts = scheme_null, *one;
2875   char *c, *le;
2876   intptr_t clen, used;
2877   int status;
2878   rktio_converter_t *cd;
2879 
2880   le = rktio_locale_encoding(scheme_rktio);
2881   if (cached_locale_encoding_name
2882       && !strcmp(le, cached_locale_encoding_name)
2883       && (to_bytes ? cached_locale_to_converter : cached_locale_from_converter)) {
2884     if (to_bytes) {
2885       cd = cached_locale_to_converter;
2886       cached_locale_to_converter = NULL;
2887     } else {
2888       cd = cached_locale_from_converter;
2889       cached_locale_from_converter = NULL;
2890     }
2891   } else {
2892     if (to_bytes)
2893       cd = rktio_converter_open(scheme_rktio, le, MZ_UCS4_NAME);
2894     else
2895       cd = rktio_converter_open(scheme_rktio, MZ_UCS4_NAME, le);
2896   }
2897 
2898   if (!cd) {
2899     free(le);
2900     *no_cvt = 1;
2901     return NULL;
2902   }
2903   *no_cvt = 0;
2904 
2905   status = 0;
2906 
2907   while (len) {
2908     /* We might have conversion errors... */
2909     c = do_convert(cd, NULL, NULL, 0,
2910 		   (char *)in, (to_bytes ? 4 : 1) * delta, (to_bytes ? 4 : 1) * len,
2911 		   NULL, 0, (to_bytes ? 1 : 4) * (len + 1),
2912 		   1 /* grow */, 1, (to_bytes ? 1 : 4) /* terminator size */,
2913 		   &used, &clen,
2914 		   &status);
2915 
2916     if (to_bytes)
2917       used >>= 2;
2918 
2919     if ((perm < 0) && (used < len)) {
2920       rktio_converter_close(scheme_rktio, cd);
2921       free(le);
2922       return NULL;
2923     }
2924 
2925     delta += used;
2926     len -= used;
2927 
2928     if (!len && SCHEME_NULLP(parts)) {
2929       if (to_bytes) {
2930 	*olen = clen;
2931 	c[*olen] = 0;
2932       } else {
2933 	*olen = (clen >> 2);
2934 	((mzchar *)c)[*olen] = 0;
2935       }
2936       cache_locale_or_close(to_bytes, cd, le);
2937       return c;
2938     }
2939 
2940     /* We can get here if there was some conversion error at some
2941        point. We're building up a list of parts. */
2942 
2943     if (to_bytes) {
2944       one = scheme_make_sized_byte_string(c, clen, 0);
2945     } else {
2946       one = scheme_make_sized_char_string((mzchar *)c, clen >> 2, 0);
2947     }
2948 
2949     parts = scheme_make_pair(one, parts);
2950 
2951     if (len) {
2952       /* Conversion error, so skip one char. */
2953       if (to_bytes) {
2954 	char bc[1];
2955 	bc[0] = perm;
2956 	one = scheme_make_sized_byte_string(bc, 1, 1);
2957       } else {
2958 	mzchar bc[1];
2959 	bc[0] = perm;
2960 	one = scheme_make_sized_char_string(bc, 1, 1);
2961       }
2962       parts = scheme_make_pair(one, parts);
2963       delta += 1;
2964       len -= 1;
2965     }
2966   }
2967 
2968   cache_locale_or_close(to_bytes, cd, le);
2969 
2970   if (to_bytes) {
2971     parts = append_all_byte_strings_backwards(parts);
2972     *olen = SCHEME_BYTE_STRTAG_VAL(parts);
2973 
2974     return SCHEME_BYTE_STR_VAL(parts);
2975   } else {
2976     parts = append_all_strings_backwards(parts);
2977     *olen = SCHEME_CHAR_STRTAG_VAL(parts);
2978 
2979     return (char *)SCHEME_CHAR_STR_VAL(parts);
2980   }
2981 }
2982 
cache_locale_or_close(int to_bytes,rktio_converter_t * cd,char * le)2983 void cache_locale_or_close(int to_bytes, rktio_converter_t *cd, char *le)
2984 {
2985   if (to_bytes ? cached_locale_to_converter : cached_locale_from_converter) {
2986     rktio_converter_close(scheme_rktio, cd);
2987     free(le);
2988   } else {
2989     if (!cached_locale_encoding_name || strcmp(le, cached_locale_encoding_name)) {
2990       scheme_clear_locale_cache();
2991       cached_locale_encoding_name = le;
2992     } else
2993       free(le);
2994 
2995     rktio_convert_reset(scheme_rktio, cd);
2996     if (to_bytes)
2997       cached_locale_to_converter = cd;
2998     else
2999       cached_locale_from_converter = cd;
3000   }
3001 }
3002 
scheme_clear_locale_cache(void)3003 void scheme_clear_locale_cache(void)
3004 {
3005   if (cached_locale_encoding_name) {
3006     if (cached_locale_to_converter) {
3007       rktio_converter_close(scheme_rktio, cached_locale_to_converter);
3008       cached_locale_to_converter = NULL;
3009     }
3010     if (cached_locale_from_converter) {
3011       rktio_converter_close(scheme_rktio, cached_locale_from_converter);
3012       cached_locale_from_converter = NULL;
3013     }
3014     free(cached_locale_encoding_name);
3015     cached_locale_encoding_name = NULL;
3016   }
3017 }
3018 
locale_recase(int to_up,char * in,int id,int iilen,char * out,int od,int iolen,intptr_t * oolen)3019 static char *locale_recase(int to_up,
3020 			   /* in must be null-terminated, iilen doesn't include it */
3021 			   char *in, int id, int iilen,
3022 			   /* iolen, in contrast, includes the terminator */
3023 			   char *out, int od, int iolen,
3024 			   intptr_t *oolen)
3025      /* Assumes that reset_locale() has been called */
3026 {
3027   char *s, *s2;
3028   intptr_t len;
3029   s = rktio_locale_recase(scheme_rktio, to_up, in XFORM_OK_PLUS id);
3030   len = strlen(s);
3031   if ((len+1) < iolen) {
3032     memcpy(out XFORM_OK_PLUS od, s, len+1);
3033     s2 = out;
3034   } else {
3035     s2 = scheme_malloc_atomic(len+1);
3036     memcpy(s2, s, len+1);
3037   }
3038   free(s);
3039   *oolen = len;
3040   return s2;
3041 }
3042 
mz_locale_strcoll(char * s1,int d1,int l1,char * s2,int d2,int l2,int cvt_case)3043 int mz_locale_strcoll(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case)
3044      /* The s1 and s2 arguments are actually UCS-4.
3045         Assumes that reset_locale() has been called. */
3046 {
3047   intptr_t clen1, clen2, used1, used2, origl1, origl2;
3048   char *c1, *c2, buf1[MZ_SC_BUF_SIZE], buf2[MZ_SC_BUF_SIZE];
3049   char case_buf1[MZ_SC_BUF_SIZE], case_buf2[MZ_SC_BUF_SIZE];
3050   int status, got_more;
3051 
3052   /* First, convert UCS-4 to locale-specific encoding. If some
3053      characters don't fit into the encoding, then we'll have leftover
3054      characters. Count unconvertable charc as greater than anything
3055      that can be converted */
3056 
3057   origl1 = l1;
3058   origl2 = l2;
3059 
3060   /* Loop to check both convertable and unconvertable parts */
3061   while (1) {
3062     if (!origl1 && !origl2)
3063       return 0;
3064     if (!origl1)
3065       return -1;
3066     if (!origl2)
3067       return 1;
3068 
3069     /* Loop to get consistent parts of the wto strings, in case
3070        a conversion fails. */
3071     got_more = 0;
3072     l1 = origl1;
3073     l2 = origl2;
3074     while (1) {
3075       c1 = do_convert(NULL, MZ_UCS4_NAME, NULL, 1,
3076 		      s1, d1 * 4, 4 * l1,
3077 		      buf1, 0, MZ_SC_BUF_SIZE - 1,
3078 		      1 /* grow */, 0, 1 /* terminator size */,
3079 		      &used1, &clen1,
3080 		      &status);
3081       c2 = do_convert(NULL, MZ_UCS4_NAME, NULL, 1,
3082 		      s2, d2 * 4, 4 * l2,
3083 		      buf2, 0, MZ_SC_BUF_SIZE - 1,
3084 		      1 /* grow */, 0, 1 /* terminator size */,
3085 		      &used2, &clen2,
3086 		      &status);
3087 
3088       if ((used1 < 4 * l1) || (used2 < 4 * l2)) {
3089 	if (got_more) {
3090 	  /* Something went wrong. We've already tried to
3091 	     even out the parts that work. Let's give up
3092 	     on the first characters */
3093 	  clen1 = clen2 = 0;
3094 	  break;
3095 	} else if (used1 == used2) {
3096 	  /* Not everything, but both ended at the same point */
3097 	  break;
3098 	} else {
3099 	  /* Pick the smallest */
3100 	  if (used2 < used1) {
3101 	    used1 = used2;
3102 	    got_more = 1;
3103 	  } else
3104 	    got_more = 2;
3105 	  l2 = (used1 >> 2);
3106 	  l1 = (used1 >> 2);
3107 
3108 	  if (!l1) {
3109 	    /* Nothing to get this time. */
3110 	    clen1 = clen2 = 0;
3111 	    c1 = c2 = "";
3112 	    used1 = used2 = 0;
3113 	    break;
3114 	  }
3115 	}
3116       } else
3117 	/* Got all that we wanted */
3118 	break;
3119     }
3120 
3121     if (cvt_case) {
3122       if (clen1)
3123 	c1 = locale_recase(0, c1, 0, clen1,
3124 			   case_buf1, 0, MZ_SC_BUF_SIZE - 1,
3125 			   &clen1);
3126       else
3127 	c1 = NULL;
3128       if (clen2)
3129 	c2 = locale_recase(0, c2, 0, clen2,
3130 			   case_buf2, 0, MZ_SC_BUF_SIZE - 1,
3131 			   &clen2);
3132       else
3133 	c2 = NULL;
3134       /* There shouldn't have been conversion errors, but just in
3135 	 case, care of NULL. */
3136       if (!c1) c1 = "";
3137       if (!c2) c2 = "";
3138     }
3139 
3140     /* Collate, finally. */
3141     status = rktio_locale_strcoll(scheme_rktio, c1, c2);
3142 
3143     /* If one is bigger than the other, we're done. */
3144     if (status)
3145       return status;
3146 
3147     /* Otherwise, is there more to check? */
3148     origl1 -= (used1 >> 2);
3149     origl2 -= (used2 >> 2);
3150     d1 += (used1 >> 2);
3151     d2 += (used2 >> 2);
3152     if (!origl1 && !origl2)
3153       return 0;
3154 
3155     /* There's more. It must be that the next character wasn't
3156        convertable in one of the encodings. */
3157     if (got_more)
3158       return ((got_more == 2) ? 1 : -1);
3159 
3160     if (!origl1)
3161       return -1;
3162 
3163     /* Compare an unconverable character directly. No case conversions
3164        if it's outside the locale. */
3165     if (((unsigned int *)s1)[d1] > ((unsigned int *)s2)[d2])
3166       return 1;
3167     else if (((unsigned int *)s1)[d1] < ((unsigned int *)s2)[d2])
3168       return -1;
3169     else {
3170       /* We've skipped one unconvertable char, and they still look the
3171 	 same.  Now try again. */
3172       origl1 -= 1;
3173       origl2 -= 1;
3174       d1 += 1;
3175       d2 += 1;
3176     }
3177   }
3178 }
3179 
mz_native_strcoll(char * s1,int d1,int l1,char * s2,int d2,int l2,int cvt_case)3180 int mz_native_strcoll(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case)
3181      /* The s1 and s2 arguments are actually UTF-16. */
3182 {
3183   return rktio_strcoll_utf16(scheme_rktio,
3184                              (rktio_char16_t *)s1 XFORM_OK_PLUS d1, l1,
3185                              (rktio_char16_t *)s2 XFORM_OK_PLUS d2, l2,
3186                              cvt_case);
3187 }
3188 
3189 typedef int (*strcoll_proc)(char *s1, int d1, int l1, char *s2, int d2, int l2, int cvt_case);
3190 
do_locale_comp(const char * who,const mzchar * us1,intptr_t ul1,const mzchar * us2,intptr_t ul2,int cvt_case)3191 int do_locale_comp(const char *who, const mzchar *us1, intptr_t ul1, const mzchar *us2, intptr_t ul2, int cvt_case)
3192 {
3193   int xl1;
3194   int v, endres, utf16 = 0;
3195   GC_CAN_IGNORE strcoll_proc mz_strcoll = mz_locale_strcoll;
3196 
3197   if (current_locale_name
3198       && !*current_locale_name
3199       && (rktio_convert_properties(scheme_rktio) & RKTIO_CONVERT_STRCOLL_UTF16)) {
3200     utf16 = 1;
3201     mz_strcoll = mz_native_strcoll;
3202   }
3203 
3204   if (utf16) {
3205     us1 = (mzchar *)scheme_ucs4_to_utf16(us1, 0, ul1, NULL, 0, &ul1, 1);
3206     us2 = (mzchar *)scheme_ucs4_to_utf16(us2, 0, ul2, NULL, 0, &ul2, 1);
3207     ((short *)us1)[ul1] = 0;
3208     ((short *)us2)[ul2] = 0;
3209   }
3210 
3211   if (ul1 > ul2) {
3212     ul1 = ul2;
3213     endres = 1;
3214   } else {
3215     if (ul2 > ul1)
3216       endres = -1;
3217     else
3218       endres = 0;
3219   }
3220 
3221   /* Walk back through the strings looking for nul characters. If we
3222      find one, compare the part after the null character to update
3223      endres, then continue. Unfortunately, we do too much work if an
3224      earlier part of the string (tested later) determines the result,
3225      but hopefully nul characters are rare. */
3226 
3227   xl1 = 0;
3228   while (ul1--) {
3229     if ((utf16 && (!(((short *)us1)[ul1]) || !(((short *)us2)[ul1])))
3230 	|| (!utf16 && (!(us1[ul1]) || !(us2[ul1])))) {
3231       if (utf16) {
3232 	if (((short *)us1)[ul1])
3233 	  endres = 1;
3234 	else if (((short *)us2)[ul1])
3235 	  endres = -1;
3236       } else {
3237 	if (us1[ul1])
3238 	  endres = 1;
3239 	else if (us2[ul1])
3240 	  endres = -1;
3241       }
3242 
3243       if (xl1)
3244 	v = mz_strcoll((char *)us1, ul1 + 1, xl1, (char *)us2, ul1 + 1, xl1, cvt_case);
3245       else
3246 	v = 0;
3247 
3248       if (v)
3249 	endres = v;
3250       xl1 = 0;
3251     } else {
3252       xl1++;
3253     }
3254   }
3255 
3256   v = mz_strcoll((char *)us1, 0, xl1, (char *)us2, 0, xl1, cvt_case);
3257   if (v)
3258     endres = v;
3259 
3260   return endres;
3261 }
3262 
do_locale_recase(int to_up,mzchar * in,int delta,int len,intptr_t * olen)3263 mzchar *do_locale_recase(int to_up, mzchar *in, int delta, int len, intptr_t *olen)
3264 {
3265   Scheme_Object *parts = scheme_null;
3266   char *c, buf[MZ_SC_BUF_SIZE], case_buf[MZ_SC_BUF_SIZE];
3267   intptr_t clen, used;
3268   int status;
3269 
3270   while (len) {
3271     /* We might have conversion errors... */
3272     c = do_convert(NULL, MZ_UCS4_NAME, NULL, 1,
3273 		   (char *)in, 4 * delta, 4 * len,
3274 		   buf, 0, MZ_SC_BUF_SIZE - 1,
3275 		   1 /* grow */, 0, 1 /* terminator size */,
3276 		   &used, &clen,
3277 		   &status);
3278 
3279     used >>= 2;
3280     delta += used;
3281     len -= used;
3282 
3283     c = locale_recase(to_up, c, 0, clen,
3284 		      case_buf, 0, MZ_SC_BUF_SIZE - 1,
3285 		      &clen);
3286     if (!c)
3287       clen = 0;
3288 
3289     c = do_convert(NULL, NULL, MZ_UCS4_NAME, 2,
3290 		   c, 0, clen,
3291 		   NULL, 0, 0,
3292 		   1 /* grow */, 0, sizeof(mzchar) /* terminator size */,
3293 		   &used, &clen,
3294 		   &status);
3295 
3296     if (!len && SCHEME_NULLP(parts)) {
3297       *olen = (clen >> 2);
3298       ((mzchar *)c)[*olen] = 0;
3299       return (mzchar *)c;
3300     }
3301 
3302     /* We can get here if there was some conversion error at some
3303        point. We're building up a list of parts. */
3304 
3305     parts = scheme_make_pair(scheme_make_sized_char_string((mzchar *)c, clen >> 2, 0),
3306 			     parts);
3307 
3308     if (len) {
3309       /* Conversion error, so skip one char. */
3310       parts = scheme_make_pair(scheme_make_sized_offset_char_string(in, delta, 1, 1),
3311 			       parts);
3312       delta += 1;
3313       len -= 1;
3314     }
3315   }
3316 
3317   parts = append_all_strings_backwards(parts);
3318   *olen = SCHEME_CHAR_STRTAG_VAL(parts);
3319 
3320   return SCHEME_CHAR_STR_VAL(parts);
3321 }
3322 
do_native_recase(int to_up,mzchar * in,int delta,int len,intptr_t * olen)3323 mzchar *do_native_recase(int to_up, mzchar *in, int delta, int len, intptr_t *olen)
3324 /* The in argument is actually UTF-16. */
3325 {
3326   rktio_char16_t *s, *s2;
3327   intptr_t ol;
3328 
3329   s = rktio_recase_utf16(scheme_rktio, to_up, (rktio_char16_t *)in XFORM_OK_PLUS delta, len, &ol);
3330 
3331   s2 = scheme_malloc_atomic(sizeof(rktio_char16_t) * (ol+1));
3332   memcpy(s2, s, sizeof(rktio_char16_t) * (ol+1));
3333   free(s);
3334 
3335   *olen = ol;
3336 
3337   return (mzchar *)s2;
3338 }
3339 
3340 typedef mzchar *(*recase_proc)(int to_up, mzchar *in, int delta, int len, intptr_t *olen);
3341 
mz_recase(const char * who,int to_up,mzchar * us,intptr_t ulen)3342 static Scheme_Object *mz_recase(const char *who, int to_up, mzchar *us, intptr_t ulen)
3343 {
3344   intptr_t ulen1;
3345   int utf16 = 0, i, delta = 0;
3346   mzchar *us1;
3347   recase_proc mz_do_recase = do_locale_recase;
3348   Scheme_Object *s, *parts = scheme_null;
3349 
3350   reset_locale();
3351 
3352   if (current_locale_name
3353       && !*current_locale_name
3354       && (rktio_convert_properties(scheme_rktio) & RKTIO_CONVERT_RECASE_UTF16)) {
3355     utf16 = 1;
3356     mz_do_recase = do_native_recase;
3357   }
3358 
3359   if (utf16) {
3360     us = (mzchar *)scheme_ucs4_to_utf16(us, 0, ulen, NULL, 0, &ulen, 1);
3361     ((short *)us)[ulen] = 0;
3362   }
3363 
3364   /* If there are nuls in the string, then we have to make multiple
3365      calls to mz_do_recase */
3366   i = 0;
3367   while (1) {
3368     for (; i < ulen; i++) {
3369       if (utf16) {
3370 	if (!((short *)us)[i])
3371 	  break;
3372       } else if (!us[i])
3373 	break;
3374     }
3375 
3376     us1 = mz_do_recase(to_up, us, delta, i - delta, &ulen1);
3377 
3378     if (utf16) {
3379       us1 = scheme_utf16_to_ucs4((unsigned short *)us1, 0, ulen1, NULL, 0, &ulen1, 1);
3380       us1[ulen1] = 0;
3381     }
3382 
3383     s = scheme_make_sized_char_string((mzchar *)us1, ulen1, 0);
3384 
3385     if (SCHEME_NULLP(parts) && (i == ulen))
3386       return s;
3387 
3388     parts = scheme_make_pair(s, parts);
3389 
3390     if (i == ulen)
3391       break;
3392 
3393     /* upcasing and encoding a nul char is easy: */
3394     s = scheme_make_sized_char_string((mzchar *)"\0\0\0\0", 1, 0);
3395     parts = scheme_make_pair(s, parts);
3396     i++;
3397     delta = i;
3398 
3399     if (i == ulen)
3400       break;
3401   }
3402 
3403   return append_all_strings_backwards(parts);
3404 }
3405 
3406 static Scheme_Object *
unicode_recase(const char * who,int to_up,int argc,Scheme_Object * argv[])3407 unicode_recase(const char *who, int to_up, int argc, Scheme_Object *argv[])
3408 {
3409   intptr_t len;
3410   mzchar *chars;
3411 
3412   if (!SCHEME_CHAR_STRINGP(argv[0]))
3413     scheme_wrong_contract(who, "string?", 0, argc, argv);
3414 
3415   chars = SCHEME_CHAR_STR_VAL(argv[0]);
3416   len = SCHEME_CHAR_STRTAG_VAL(argv[0]);
3417 
3418   return mz_recase(who, to_up, chars, len);
3419 }
3420 
3421 static Scheme_Object *
string_locale_upcase(int argc,Scheme_Object * argv[])3422 string_locale_upcase(int argc, Scheme_Object *argv[])
3423 {
3424   return unicode_recase("string-locale-upcase", 1, argc, argv);
3425 }
3426 
3427 static Scheme_Object *
string_locale_downcase(int argc,Scheme_Object * argv[])3428 string_locale_downcase(int argc, Scheme_Object *argv[])
3429 {
3430   return unicode_recase("string-locale-downcase", 0, argc, argv);
3431 }
3432 
reset_locale(void)3433 static void reset_locale(void)
3434 {
3435   Scheme_Object *v;
3436   const mzchar *name;
3437 
3438   /* This function needs to work before threads are set up: */
3439   if (scheme_current_thread) {
3440     v = scheme_get_param(scheme_current_config(), MZCONFIG_LOCALE);
3441   } else {
3442     v = scheme_make_immutable_sized_utf8_string("", 0);
3443   }
3444   locale_on = SCHEME_TRUEP(v);
3445 
3446   if (locale_on) {
3447     name = SCHEME_CHAR_STR_VAL(v);
3448     if ((current_locale_name != name)
3449         && (!current_locale_name
3450             || mz_char_strcmp("result-locale",
3451                               current_locale_name, scheme_char_strlen(current_locale_name),
3452                               name, SCHEME_CHAR_STRLEN_VAL(v),
3453                               0, 1))) {
3454       char *n, buf[32];
3455 
3456       n = scheme_utf8_encode_to_buffer(name, SCHEME_CHAR_STRLEN_VAL(v), buf, 32);
3457 
3458       rktio_set_locale(scheme_rktio, n);
3459     }
3460 
3461     current_locale_name_ptr = (void *)name;
3462   }
3463 }
3464 
scheme_push_c_numeric_locale()3465 char *scheme_push_c_numeric_locale()
3466 {
3467   return rktio_push_c_numeric_locale(scheme_rktio);
3468 }
3469 
scheme_pop_c_numeric_locale(char * prev)3470 void scheme_pop_c_numeric_locale(char *prev)
3471 {
3472   rktio_pop_c_numeric_locale(scheme_rktio, prev);
3473 }
3474 
scheme_set_default_locale(void)3475 void scheme_set_default_locale(void)
3476 {
3477   rktio_set_default_locale("");
3478 }
3479 
find_special_casing(int ch)3480 static int find_special_casing(int ch)
3481 {
3482   /* Binary search */
3483   int i, lo, hi, j;
3484 
3485   i = NUM_SPECIAL_CASINGS >> 1;
3486   lo = i;
3487   hi = NUM_SPECIAL_CASINGS - i - 1;
3488 
3489   while (1) {
3490     if (uchar_special_casings[i * 10] == ch)
3491       return i * 10;
3492     if (uchar_special_casings[i * 10] > ch) {
3493       j = i - lo;
3494       i = j + (lo >> 1);
3495       hi = lo - (i - j) - 1;
3496       lo = i - j;
3497     } else {
3498       j = i + 1;
3499       i = j + (hi >> 1);
3500       lo = i - j;
3501       hi = hi - (i - j) - 1;
3502     }
3503   }
3504 }
3505 
is_final_sigma(int mode,mzchar * s,int d,int i,int len)3506 static int is_final_sigma(int mode, mzchar *s, int d, int i, int len)
3507 {
3508   int j;
3509 
3510   if (mode == 3)
3511     return 1;
3512 
3513   /* find a cased char before, skipping case-ignorable: */
3514   for (j = i - 1; j >= d; j--) {
3515     if (!scheme_iscaseignorable(s[j])) {
3516       if (scheme_iscased(s[j]))
3517 	break;
3518       else
3519 	return 0;
3520     }
3521   }
3522   if (j < d)
3523     return 0;
3524 
3525   /* next non-case-ignorable must not be cased: */
3526   for (j = i + 1; j < d + len; j++) {
3527     if (!scheme_iscaseignorable(s[j])) {
3528       return !scheme_iscased(s[j]);
3529     }
3530   }
3531 
3532   return 1;
3533 }
3534 
scheme_string_recase(mzchar * s,int d,int len,int mode,int inplace,int * _len)3535 mzchar *scheme_string_recase(mzchar *s, int d, int len, int mode, int inplace, int *_len)
3536 {
3537   mzchar *t;
3538   int i, extra = 0, pos, special = 0, td, prev_was_cased = 0, xmode = mode;
3539 
3540   for (i = 0; i < len; i++) {
3541     if (scheme_isspecialcasing(s[d+i])) {
3542       pos = find_special_casing(s[d+i]);
3543       if (!uchar_special_casings[pos + 9] || is_final_sigma(xmode, s, d, i, len)) {
3544 	special = 1;
3545 	extra += (uchar_special_casings[pos + 1 + (xmode << 1)] - 1);
3546       }
3547     }
3548     if (mode == 2) {
3549       if (!scheme_iscaseignorable(s[d+i]))
3550 	prev_was_cased = scheme_iscased(s[d+i]);
3551       xmode = (prev_was_cased ? 0 : 2);
3552     }
3553   }
3554 
3555   if (_len)
3556     *_len = len + extra;
3557 
3558   if (!extra && inplace) {
3559     t = s;
3560     td = d;
3561   } else {
3562     t = scheme_malloc_atomic(sizeof(mzchar) * (len + extra + 1));
3563     td = 0;
3564   }
3565 
3566   if (!special) {
3567     if (mode == 0) {
3568       for (i = 0; i < len; i++) {
3569 	t[i+td] = scheme_tolower(s[i+d]);
3570       }
3571     } else if (mode == 1) {
3572       for (i = 0; i < len; i++) {
3573 	t[i+td] = scheme_toupper(s[i+d]);
3574       }
3575     } else if (mode == 2) {
3576       prev_was_cased = 0;
3577       for (i = 0; i < len; i++) {
3578 	if (!prev_was_cased)
3579 	  t[i+td] = scheme_totitle(s[i+d]);
3580 	else
3581 	  t[i+td] = scheme_tolower(s[i+d]);
3582 	if (!scheme_iscaseignorable(s[i+d]))
3583 	  prev_was_cased = scheme_iscased(s[i+d]);
3584       }
3585     } else /* if (mode == 3) */ {
3586       for (i = 0; i < len; i++) {
3587 	t[i+td] = scheme_tofold(s[i+d]);
3588       }
3589     }
3590   } else {
3591     int j = 0, c;
3592     prev_was_cased = 0;
3593     for (i = 0; i < len; i++) {
3594       if (mode == 0) {
3595 	t[j+td] = scheme_tolower(s[i+d]);
3596       } else if (mode == 1) {
3597 	t[j+td] = scheme_toupper(s[i+d]);
3598       } else if (mode == 2) {
3599 	if (!prev_was_cased) {
3600 	  xmode = 2;
3601 	  t[j+td] = scheme_totitle(s[i+d]);
3602 	} else {
3603 	  xmode = 0;
3604 	  t[j+td] = scheme_tolower(s[i+d]);
3605 	}
3606 	if (!scheme_iscaseignorable(s[i+d]))
3607 	  prev_was_cased = scheme_iscased(s[i+d]);
3608       } else /* if (mode == 3) */ {
3609 	t[j+td] = scheme_tofold(s[i+d]);
3610       }
3611 
3612       if (scheme_isspecialcasing(s[i+d])) {
3613 	pos = find_special_casing(s[i+d]);
3614 	if (!uchar_special_casings[pos + 9] || is_final_sigma(xmode, s, d, i, len)) {
3615 	  c = uchar_special_casings[pos + 1 + (xmode << 1)];
3616 	  pos = uchar_special_casings[pos + 2 + (xmode << 1)];
3617 	  while (c--) {
3618 	    t[(j++)+td] = uchar_special_casing_data[pos++];
3619 	  }
3620 	} else
3621 	  j++;
3622       } else
3623 	j++;
3624     }
3625   }
3626   t[len+extra+td] = 0;
3627 
3628   return t;
3629 }
3630 
string_recase(const char * name,int argc,Scheme_Object * argv[],int mode)3631 static Scheme_Object *string_recase (const char *name, int argc, Scheme_Object *argv[], int mode)
3632 {
3633   mzchar *s;
3634   int len;
3635 
3636   if (!SCHEME_CHAR_STRINGP(argv[0]))
3637     scheme_wrong_contract(name, "string?", 0, argc, argv);
3638 
3639   s = SCHEME_CHAR_STR_VAL(argv[0]);
3640   len = SCHEME_CHAR_STRLEN_VAL(argv[0]);
3641 
3642   s = scheme_string_recase(s, 0, len, mode, 0, &len);
3643 
3644   return scheme_make_sized_char_string(s, len, 0);
3645 }
3646 
string_upcase(int argc,Scheme_Object * argv[])3647 static Scheme_Object *string_upcase (int argc, Scheme_Object *argv[])
3648 {
3649   return string_recase("string-upcase", argc, argv, 1);
3650 }
3651 
string_downcase(int argc,Scheme_Object * argv[])3652 static Scheme_Object *string_downcase (int argc, Scheme_Object *argv[])
3653 {
3654   return string_recase("string-downcase", argc, argv, 0);
3655 }
3656 
string_titlecase(int argc,Scheme_Object * argv[])3657 static Scheme_Object *string_titlecase (int argc, Scheme_Object *argv[])
3658 {
3659   return string_recase("string-titlecase", argc, argv, 2);
3660 }
3661 
string_foldcase(int argc,Scheme_Object * argv[])3662 static Scheme_Object *string_foldcase (int argc, Scheme_Object *argv[])
3663 {
3664   return string_recase("string-foldcase", argc, argv, 3);
3665 }
3666 
3667 /**********************************************************************/
3668 /*                          normalization                             */
3669 /**********************************************************************/
3670 
3671 #define MZ_JAMO_INITIAL_CONSONANT_START  0x1100
3672 #define MZ_JAMO_INITIAL_CONSONANT_COUNT  19
3673 #define MZ_JAMO_INITIAL_CONSONANT_END    (MZ_JAMO_INITIAL_CONSONANT_START + MZ_JAMO_INITIAL_CONSONANT_COUNT - 1)
3674 
3675 #define MZ_JAMO_VOWEL_START              0x1161
3676 #define MZ_JAMO_VOWEL_COUNT              21
3677 #define MZ_JAMO_VOWEL_END                (MZ_JAMO_VOWEL_START + MZ_JAMO_VOWEL_COUNT - 1)
3678 
3679 /* First in this range is not actually a consonant, but a placeholder for "no consonant" */
3680 #define MZ_JAMO_TRAILING_CONSONANT_START 0x11A7
3681 #define MZ_JAMO_TRAILING_CONSONANT_COUNT 28
3682 #define MZ_JAMO_TRAILING_CONSONANT_END   (MZ_JAMO_TRAILING_CONSONANT_START + MZ_JAMO_TRAILING_CONSONANT_COUNT - 1)
3683 
3684 #define MZ_JAMO_SYLLABLE_START           0xAC00
3685 #define MZ_JAMO_SYLLABLE_END             (MZ_JAMO_SYLLABLE_START + 11171)
3686 
get_composition(mzchar a,mzchar b)3687 XFORM_NONGCING static mzchar get_composition(mzchar a, mzchar b)
3688 {
3689   if ((a > 0xFFFF) || (b > 0xFFFF)) {
3690     /* Look in long-composes table. */
3691     mzlonglong key = ((((mzlonglong)a & 0x1F0000) << 21)
3692                       | (((mzlonglong)a & 0xFFFF) << 16)
3693                       | (((mzlonglong)b & 0x1F0000) << 16)
3694                       | ((mzlonglong)b & 0xFFFF));
3695     int pos = (LONG_COMPOSE_TABLE_SIZE >> 1), new_pos;
3696     int below_len = pos;
3697     int above_len = (LONG_COMPOSE_TABLE_SIZE - pos - 1);
3698 
3699     /* Binary search: */
3700     while (key != utable_canon_compose_long_pairs[pos]) {
3701       if (key > utable_canon_compose_long_pairs[pos]) {
3702         if (!above_len)
3703           return 0;
3704         new_pos = pos + (above_len >> 1) + 1;
3705         below_len = (new_pos - pos - 1);
3706         above_len = (above_len - below_len - 1);
3707         pos = new_pos;
3708       } else if (key < utable_canon_compose_long_pairs[pos]) {
3709         if (!below_len)
3710           return 0;
3711         new_pos = pos - ((below_len >> 1) + 1);
3712         above_len = (pos - new_pos - 1);
3713         below_len = (below_len - above_len - 1);
3714         pos = new_pos;
3715       }
3716     }
3717 
3718     return utable_canon_compose_long_result[pos];
3719   } else {
3720     uintptr_t key = (a << 16) | b;
3721     int pos = (COMPOSE_TABLE_SIZE >> 1), new_pos;
3722     int below_len = pos;
3723     int above_len = (COMPOSE_TABLE_SIZE - pos - 1);
3724 
3725     /* Binary search: */
3726     while (key != utable_compose_pairs[pos]) {
3727       if (key > utable_compose_pairs[pos]) {
3728         if (!above_len)
3729           return 0;
3730         new_pos = pos + (above_len >> 1) + 1;
3731         below_len = (new_pos - pos - 1);
3732         above_len = (above_len - below_len - 1);
3733         pos = new_pos;
3734       } else if (key < utable_compose_pairs[pos]) {
3735         if (!below_len)
3736           return 0;
3737         new_pos = pos - ((below_len >> 1) + 1);
3738         above_len = (pos - new_pos - 1);
3739         below_len = (below_len - above_len - 1);
3740         pos = new_pos;
3741       }
3742     }
3743 
3744     return utable_compose_result[pos];
3745   }
3746 }
3747 
get_canon_decomposition(mzchar key,mzchar * b)3748 XFORM_NONGCING mzchar get_canon_decomposition(mzchar key, mzchar *b)
3749 {
3750   int pos = (DECOMPOSE_TABLE_SIZE >> 1), new_pos;
3751   int below_len = pos;
3752   int above_len = (DECOMPOSE_TABLE_SIZE - pos - 1);
3753 
3754   /* Binary search: */
3755   while (key != utable_decomp_keys[pos]) {
3756     if (key > utable_decomp_keys[pos]) {
3757       if (!above_len)
3758 	return 0;
3759       new_pos = pos + (above_len >> 1) + 1;
3760       below_len = (new_pos - pos - 1);
3761       above_len = (above_len - below_len - 1);
3762       pos = new_pos;
3763     } else if (key < utable_decomp_keys[pos]) {
3764       if (!below_len)
3765 	return 0;
3766       new_pos = pos - ((below_len >> 1) + 1);
3767       above_len = (pos - new_pos - 1);
3768       below_len = (below_len - above_len - 1);
3769       pos = new_pos;
3770     }
3771   }
3772 
3773   pos = utable_decomp_indices[pos];
3774   if (pos < 0) {
3775     pos = -(pos + 1);
3776     pos <<= 1;
3777     *b = utable_compose_long_pairs[pos + 1];
3778     return utable_compose_long_pairs[pos];
3779   } else {
3780     key = utable_compose_pairs[pos];
3781     *b = (key & 0xFFFF);
3782     return (key >> 16);
3783   }
3784 }
3785 
get_kompat_decomposition(mzchar key,unsigned short ** chars)3786 XFORM_NONGCING int get_kompat_decomposition(mzchar key, unsigned short **chars)
3787 {
3788   int pos = (KOMPAT_DECOMPOSE_TABLE_SIZE >> 1), new_pos;
3789   int below_len = pos;
3790   int above_len = (KOMPAT_DECOMPOSE_TABLE_SIZE - pos - 1);
3791 
3792   /* Binary search: */
3793   while (key != utable_kompat_decomp_keys[pos]) {
3794     if (key > utable_kompat_decomp_keys[pos]) {
3795       if (!above_len)
3796 	return 0;
3797       new_pos = pos + (above_len >> 1) + 1;
3798       below_len = (new_pos - pos - 1);
3799       above_len = (above_len - below_len - 1);
3800       pos = new_pos;
3801     } else if (key < utable_kompat_decomp_keys[pos]) {
3802       if (!below_len)
3803 	return 0;
3804       new_pos = pos - ((below_len >> 1) + 1);
3805       above_len = (pos - new_pos - 1);
3806       below_len = (below_len - above_len - 1);
3807       pos = new_pos;
3808     }
3809   }
3810 
3811   *chars = utable_kompat_decomp_strs XFORM_OK_PLUS utable_kompat_decomp_indices[pos];
3812   return utable_kompat_decomp_lens[pos];
3813 }
3814 
normalize_c(Scheme_Object * o)3815 static Scheme_Object *normalize_c(Scheme_Object *o)
3816 /* Assumes then given string is in normal form D */
3817 {
3818   mzchar *s, *s2, tmp, last_c0 = 0;
3819   int len, i, j = 0, last_c0_pos = 0, last_cc = 0;
3820 
3821   s = SCHEME_CHAR_STR_VAL(o);
3822   len = SCHEME_CHAR_STRLEN_VAL(o);
3823 
3824   s2 = (mzchar *)scheme_malloc_atomic((len + 1) * sizeof(mzchar));
3825   memcpy(s2, s, len * sizeof(mzchar));
3826 
3827   for (i = 0; i < len; i++) {
3828     if ((i + 1 < len)
3829 	&& (s2[i] >= MZ_JAMO_INITIAL_CONSONANT_START)
3830 	&& (s2[i] <= MZ_JAMO_INITIAL_CONSONANT_END)
3831 	&& (s2[i+1] >= MZ_JAMO_VOWEL_START)
3832 	&& (s2[i+1] <= MZ_JAMO_VOWEL_END)) {
3833       /* Need Hangul composition */
3834       if ((i + 2 < len)
3835 	  && (s2[i+2] > MZ_JAMO_TRAILING_CONSONANT_START)
3836 	  && (s2[i+2] <= MZ_JAMO_TRAILING_CONSONANT_END)) {
3837 	/* 3-char composition */
3838 	tmp = (MZ_JAMO_SYLLABLE_START
3839 	       + ((s2[i] - MZ_JAMO_INITIAL_CONSONANT_START)
3840 		  * MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)
3841 	       + ((s2[i+1] - MZ_JAMO_VOWEL_START)
3842 		  * MZ_JAMO_TRAILING_CONSONANT_COUNT)
3843 	       + (s2[i+2] - MZ_JAMO_TRAILING_CONSONANT_START));
3844 	i += 2;
3845       } else {
3846 	/* 2-char composition */
3847 	tmp = (MZ_JAMO_SYLLABLE_START
3848 	       + ((s2[i] - MZ_JAMO_INITIAL_CONSONANT_START)
3849 		  * MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)
3850 	       + ((s2[i+1] - MZ_JAMO_VOWEL_START)
3851 		  * MZ_JAMO_TRAILING_CONSONANT_COUNT));
3852 	i++;
3853       }
3854       last_c0 = tmp;
3855       last_c0_pos = j;
3856       last_cc = 0;
3857       s2[j++] = tmp;
3858     } else {
3859       int cc;
3860 
3861       cc = scheme_combining_class(s2[i]);
3862       if (last_c0 && (cc > last_cc))
3863 	tmp = get_composition(last_c0, s2[i]);
3864       else
3865 	tmp = 0;
3866 
3867       if (tmp) {
3868 	/* Need to compose */
3869 	s2[last_c0_pos] = tmp;
3870 	last_c0 = tmp;
3871       } else if (!cc) {
3872 	/* Reset last_c0... */
3873 	tmp = s2[i];
3874 	if (scheme_needs_maybe_compose(tmp)) {
3875 	  last_c0 = tmp;
3876 	  last_c0_pos = j;
3877 	} else {
3878 	  last_c0 = 0;
3879 	}
3880 	last_cc = -1;
3881 	s2[j++] = tmp;
3882       } else {
3883 	s2[j++] = s2[i];
3884 	last_cc = cc;
3885       }
3886     }
3887   }
3888 
3889   s2[j] = 0;
3890   if (len - j > 16) {
3891     s = (mzchar *)scheme_malloc_atomic((j + 1) * sizeof(mzchar));
3892     memcpy(s, s2, (j + 1) * sizeof(mzchar));
3893     s2 = s;
3894   }
3895 
3896   return scheme_make_sized_char_string(s2, j, 0);
3897 }
3898 
normalize_d(Scheme_Object * o,int kompat)3899 static Scheme_Object *normalize_d(Scheme_Object *o, int kompat)
3900 {
3901   mzchar *s, tmp, *s2;
3902   int len, i, delta, j, swapped;
3903 
3904   s = SCHEME_CHAR_STR_VAL(o);
3905   len = SCHEME_CHAR_STRLEN_VAL(o);
3906 
3907   /* Run through string list to predict expansion: */
3908   delta = 0;
3909   for (i = 0; i < len; i++) {
3910     if (scheme_needs_decompose(s[i])) {
3911       int klen;
3912       mzchar snd;
3913       GC_CAN_IGNORE unsigned short *start;
3914 
3915       tmp = s[i];
3916       while (scheme_needs_decompose(tmp)) {
3917 	if (kompat)
3918 	  klen = get_kompat_decomposition(tmp, &start);
3919 	else
3920 	  klen = 0;
3921 	if (klen) {
3922 	  delta += (klen - 1);
3923 	  break;
3924 	} else {
3925 	  tmp = get_canon_decomposition(tmp, &snd);
3926 	  if (tmp) {
3927 	    if (snd) {
3928 	      delta++;
3929 	      if (kompat) {
3930 		klen = get_kompat_decomposition(snd, &start);
3931 		if (klen)
3932 		  delta += (klen - 1);
3933 	      }
3934 	    }
3935 	  } else
3936 	    break;
3937 	}
3938       }
3939     } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
3940 	       && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
3941       tmp = s[i];
3942       tmp -= MZ_JAMO_SYLLABLE_START;
3943       if (tmp % MZ_JAMO_TRAILING_CONSONANT_COUNT)
3944 	delta += 2;
3945       else
3946 	delta += 1;
3947     }
3948   }
3949 
3950   s2 = (mzchar *)scheme_malloc_atomic((len + delta + 1) * sizeof(mzchar));
3951 
3952   j = 0;
3953   for (i = 0; i < len; i++) {
3954     if (scheme_needs_decompose(s[i])) {
3955       mzchar snd, tmp2;
3956       int snds = 0, klen = 0, k;
3957       GC_CAN_IGNORE unsigned short*start;
3958 
3959       tmp = s[i];
3960       while (scheme_needs_decompose(tmp)) {
3961 	if (kompat)
3962 	  klen = get_kompat_decomposition(tmp, &start);
3963 	else
3964 	  klen = 0;
3965 	if (klen) {
3966 	  for (k = 0; k < klen; k++) {
3967 	    s2[j++] = start[k];
3968 	  }
3969 	  break;
3970 	} else {
3971 	  tmp2 = get_canon_decomposition(tmp, &snd);
3972 	  if (tmp2) {
3973 	    tmp = tmp2;
3974 	    if (snd) {
3975 	      if (kompat)
3976 		klen = get_kompat_decomposition(snd, &start);
3977 	      else
3978 		klen = 0;
3979 	      if (klen) {
3980 		snds += klen;
3981 		for (k = 0; k < klen; k++) {
3982 		  s2[len + delta - snds + k] = start[k];
3983 		}
3984 		klen = 0;
3985 	      } else {
3986 		snds++;
3987 		s2[len + delta - snds] = snd;
3988 	      }
3989 	    }
3990 	  } else
3991 	    break;
3992 	}
3993       }
3994       if (!klen)
3995 	s2[j++] = tmp;
3996       memmove(s2 + j, s2 + len + delta - snds, snds * sizeof(mzchar));
3997       j += snds;
3998     } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
3999 	       && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
4000       int l, v, t;
4001       tmp = s[i];
4002       tmp -= MZ_JAMO_SYLLABLE_START;
4003       l = tmp / (MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT);
4004       v = (tmp % (MZ_JAMO_VOWEL_COUNT * MZ_JAMO_TRAILING_CONSONANT_COUNT)) / MZ_JAMO_TRAILING_CONSONANT_COUNT;
4005       t = tmp % MZ_JAMO_TRAILING_CONSONANT_COUNT;
4006       s2[j++] = MZ_JAMO_INITIAL_CONSONANT_START + l;
4007       s2[j++] = MZ_JAMO_VOWEL_START + v;
4008       if (t) {
4009 	s2[j++] = MZ_JAMO_TRAILING_CONSONANT_START + t;
4010       }
4011     } else {
4012       s2[j++] = s[i];
4013     }
4014   }
4015   s2[j] = 0;
4016   len += delta;
4017 
4018   /* Reorder pass: */
4019   do {
4020     swapped = 0;
4021     for (i = 0; i < len; i++) {
4022       if ((i + 1 < len)
4023 	  && scheme_combining_class(s2[i])
4024 	  && scheme_combining_class(s2[i+1])
4025 	  && (scheme_combining_class(s2[i+1]) < scheme_combining_class(s2[i]))) {
4026 	/* Reorder and try again: */
4027 	tmp = s2[i + 1];
4028 	s2[i + 1] = s2[i];
4029 	s2[i] = tmp;
4030 	i--;
4031 	swapped = 1;
4032       }
4033     }
4034   } while (swapped);
4035 
4036   return scheme_make_sized_char_string(s2, len, 0);
4037 }
4038 
do_string_normalize_c(const char * who,int argc,Scheme_Object * argv[],int kompat)4039 static Scheme_Object *do_string_normalize_c (const char *who, int argc, Scheme_Object *argv[], int kompat)
4040 {
4041   Scheme_Object *o;
4042   mzchar *s, last_c0 = 0, snd;
4043   int len, i, last_cc = 0;
4044 
4045   o = argv[0];
4046   if (!SCHEME_CHAR_STRINGP(o))
4047     scheme_wrong_contract(who, "string?", 0, argc, argv);
4048 
4049   s = SCHEME_CHAR_STR_VAL(o);
4050   len = SCHEME_CHAR_STRLEN_VAL(o);
4051 
4052   for (i = 0; i < len; i++) {
4053     if (scheme_needs_decompose(s[i])
4054 	&& (kompat || get_canon_decomposition(s[i], &snd))) {
4055       /* Decomposition may expose a different composition */
4056       break;
4057     } else if ((i + 1 < len)
4058 	&& scheme_combining_class(s[i])
4059 	&& scheme_combining_class(s[i+1])
4060 	&& (scheme_combining_class(s[i+1]) < scheme_combining_class(s[i]))) {
4061       /* Need to reorder */
4062       break;
4063     } else if ((s[i] >= MZ_JAMO_INITIAL_CONSONANT_START)
4064 	       && (s[i] <= MZ_JAMO_INITIAL_CONSONANT_END)
4065 	       && (s[i+1] >= MZ_JAMO_VOWEL_START)
4066 	       && (s[i+1] <= MZ_JAMO_VOWEL_END)) {
4067       /* Need Hangul composition */
4068       break;
4069     } else if (last_c0
4070 	       && get_composition(last_c0, s[i])
4071 	       && (scheme_combining_class(s[i]) > last_cc)) {
4072       /* Need to compose */
4073       break;
4074     } else {
4075       int cc;
4076 
4077       cc = scheme_combining_class(s[i]);
4078 
4079       if (!cc) {
4080 	if (scheme_needs_maybe_compose(s[i]))
4081 	  last_c0 = s[i];
4082 	else
4083 	  last_c0 = 0;
4084 	last_cc = -1;
4085       } else
4086 	last_cc = cc;
4087     }
4088   }
4089 
4090   if (i < len) {
4091     o = normalize_c(normalize_d(o, kompat));
4092   }
4093 
4094   return o;
4095 }
4096 
string_normalize_c(int argc,Scheme_Object * argv[])4097 static Scheme_Object *string_normalize_c (int argc, Scheme_Object *argv[])
4098 {
4099   return do_string_normalize_c("string-normalize-nfc", argc, argv, 0);
4100 }
4101 
string_normalize_kc(int argc,Scheme_Object * argv[])4102 static Scheme_Object *string_normalize_kc (int argc, Scheme_Object *argv[])
4103 {
4104   return do_string_normalize_c("string-normalize-nfkc", argc, argv, 1);
4105 }
4106 
do_string_normalize_d(const char * who,int argc,Scheme_Object * argv[],int kompat)4107 static Scheme_Object *do_string_normalize_d (const char *who, int argc, Scheme_Object *argv[], int kompat)
4108 {
4109   Scheme_Object *o;
4110   mzchar *s;
4111   int len, i;
4112 
4113   o = argv[0];
4114   if (!SCHEME_CHAR_STRINGP(o))
4115     scheme_wrong_contract(who, "string?", 0, argc, argv);
4116 
4117   s = SCHEME_CHAR_STR_VAL(o);
4118   len = SCHEME_CHAR_STRLEN_VAL(o);
4119 
4120   for (i = len; i--; ) {
4121     if (scheme_needs_decompose(s[i])) {
4122       /* Need to decompose */
4123       mzchar snd;
4124       if (kompat || get_canon_decomposition(s[i], &snd))
4125 	break;
4126     } else if ((i + 1 < len)
4127 	       && scheme_combining_class(s[i])
4128 	       && scheme_combining_class(s[i+1])
4129 	       && (scheme_combining_class(s[i+1]) < scheme_combining_class(s[i]))) {
4130       /* Need to reorder */
4131       break;
4132     } else if ((s[i] >= MZ_JAMO_SYLLABLE_START)
4133 	       && (s[i] <= MZ_JAMO_SYLLABLE_END)) {
4134       /* Need Hangul decomposition */
4135       break;
4136     }
4137   }
4138 
4139   if (i >= 0) {
4140     o = normalize_d(o, kompat);
4141   }
4142 
4143   return o;
4144 }
4145 
string_normalize_d(int argc,Scheme_Object * argv[])4146 static Scheme_Object *string_normalize_d (int argc, Scheme_Object *argv[])
4147 {
4148   return do_string_normalize_d("string-normalize-nfd", argc, argv, 0);
4149 }
4150 
string_normalize_kd(int argc,Scheme_Object * argv[])4151 static Scheme_Object *string_normalize_kd (int argc, Scheme_Object *argv[])
4152 {
4153   return do_string_normalize_d("string-normalize-nfkd", argc, argv, 1);
4154 }
4155 
4156 /**********************************************************************/
4157 /*                            strcmps                                 */
4158 /**********************************************************************/
4159 
scheme_char_strlen(const mzchar * s)4160 intptr_t scheme_char_strlen(const mzchar *s)
4161 {
4162   intptr_t i;
4163   if ((intptr_t)s & 0x3)
4164     abort();
4165   for (i = 0; s[i]; i++) {
4166   }
4167   return i;
4168 }
4169 
mz_char_strcmp(const char * who,const mzchar * str1,intptr_t l1,const mzchar * str2,intptr_t l2,int use_locale,int size_shortcut)4170 static int mz_char_strcmp(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2,
4171 			  int use_locale, int size_shortcut)
4172 {
4173   intptr_t endres;
4174 
4175   if (size_shortcut && (l1 != l2))
4176     return 1;
4177 
4178   if (use_locale) {
4179     reset_locale();
4180     if (locale_on) {
4181       return do_locale_comp(who, str1, l1, str2, l2, 0);
4182     }
4183   }
4184 
4185   if (l1 > l2) {
4186     l1 = l2;
4187     endres = 1;
4188   } else {
4189     if (l2 > l1)
4190       endres = -1;
4191     else
4192       endres = 0;
4193   }
4194 
4195   while (l1--) {
4196     unsigned int a, b;
4197 
4198     a = *(str1++);
4199     b = *(str2++);
4200 
4201     a = a - b;
4202     if (a)
4203       return a;
4204   }
4205 
4206   return endres;
4207 }
4208 
mz_char_strcmp_ci(const char * who,const mzchar * str1,intptr_t l1,const mzchar * str2,intptr_t l2,int use_locale,int size_shortcut)4209 static int mz_char_strcmp_ci(const char *who, const mzchar *str1, intptr_t l1, const mzchar *str2, intptr_t l2,
4210 			     int use_locale, int size_shortcut)
4211 {
4212   intptr_t p1, p2, sp1, sp2, a, b;
4213   mzchar spec1[SPECIAL_CASE_FOLD_MAX], spec2[SPECIAL_CASE_FOLD_MAX];
4214 
4215   if (size_shortcut && (l1 != l2))
4216     return 1;
4217 
4218   if (use_locale) {
4219     reset_locale();
4220     if (locale_on) {
4221       return do_locale_comp(who, str1, l1, str2, l2, 1);
4222     }
4223   }
4224 
4225   p1 = sp1 = 0;
4226   p2 = sp2 = 0;
4227 
4228   while (((p1 < l1) || sp1) && ((p2 < l2) || sp2)) {
4229     if (sp1) {
4230       a = spec1[--sp1];
4231     } else {
4232       a = str1[p1];
4233       if (scheme_isspecialcasing(a)) {
4234 	int pos, i;
4235 	pos = find_special_casing(a);
4236 	sp1 = uchar_special_casings[pos + 7];
4237 	pos = uchar_special_casings[pos + 8];
4238 	for (i = sp1; i--; pos++) {
4239 	  spec1[i] = uchar_special_casing_data[pos];
4240 	}
4241 	a = spec1[--sp1];
4242       } else {
4243 	a = scheme_tofold(a);
4244       }
4245       p1++;
4246     }
4247 
4248     if (sp2) {
4249       b = spec2[--sp2];
4250     } else {
4251       b = str2[p2];
4252       if (scheme_isspecialcasing(b)) {
4253 	int pos, i;
4254 	pos = find_special_casing(b);
4255 	sp2 = uchar_special_casings[pos + 7];
4256 	pos = uchar_special_casings[pos + 8];
4257 	for (i = sp2; i--; pos++) {
4258 	  spec2[i] = uchar_special_casing_data[pos];
4259 	}
4260 	b = spec2[--sp2];
4261       } else {
4262 	b = scheme_tofold(b);
4263       }
4264       p2++;
4265     }
4266 
4267     a = a - b;
4268     if (a)
4269       return a;
4270   }
4271 
4272   return ((p1 < l1) || sp1) - ((p2 < l2) || sp2);
4273 }
4274 
mz_strcmp(const char * who,unsigned char * str1,intptr_t l1,unsigned char * str2,intptr_t l2)4275 static int mz_strcmp(const char *who, unsigned char *str1, intptr_t l1, unsigned char *str2, intptr_t l2)
4276 {
4277   intptr_t endres;
4278 
4279   if (l1 > l2) {
4280     l1 = l2;
4281     endres = 1;
4282   } else {
4283     if (l2 > l1)
4284       endres = -1;
4285     else
4286       endres = 0;
4287   }
4288 
4289   while (l1--) {
4290     unsigned int a, b;
4291 
4292     a = *(str1++);
4293     b = *(str2++);
4294 
4295     a = a - b;
4296     if (a)
4297       return a;
4298   }
4299 
4300   return endres;
4301 }
4302 
scheme_string_compare(Scheme_Object * a,Scheme_Object * b)4303 int scheme_string_compare(Scheme_Object *a, Scheme_Object *b)
4304 {
4305   return mz_char_strcmp(NULL,
4306                         SCHEME_CHAR_STR_VAL(a),  SCHEME_CHAR_STRTAG_VAL(a),
4307                         SCHEME_CHAR_STR_VAL(b),  SCHEME_CHAR_STRTAG_VAL(b),
4308                         0, 0);
4309 }
4310 
scheme_bytes_compare(Scheme_Object * a,Scheme_Object * b)4311 int scheme_bytes_compare(Scheme_Object *a, Scheme_Object *b)
4312 {
4313   return mz_strcmp(NULL,
4314                    (unsigned char *)SCHEME_BYTE_STR_VAL(a),  SCHEME_BYTE_STRTAG_VAL(a),
4315                    (unsigned char *)SCHEME_BYTE_STR_VAL(b),  SCHEME_BYTE_STRTAG_VAL(b));
4316 }
4317 
4318 /**********************************************************************/
4319 /*                  byte string conversion                            */
4320 /**********************************************************************/
4321 
close_converter(Scheme_Object * o,void * data)4322 static void close_converter(Scheme_Object *o, void *data)
4323 {
4324   Scheme_Converter *c = (Scheme_Converter *)o;
4325 
4326   if (!c->closed) {
4327     c->closed = 1;
4328     if (c->kind == mzICONV_KIND) {
4329       rktio_converter_close(scheme_rktio, c->cd);
4330       c->cd = NULL;
4331     }
4332     if (c->mref) {
4333       scheme_remove_managed(c->mref, (Scheme_Object *)c);
4334       c->mref = NULL;
4335     }
4336   }
4337 }
4338 
scheme_open_converter(const char * from_e,const char * to_e)4339 Scheme_Object *scheme_open_converter(const char *from_e, const char *to_e)
4340 {
4341   Scheme_Converter *c;
4342   rktio_converter_t *cd;
4343   int kind;
4344   int permissive, wtf;
4345   int need_regis = 1;
4346   Scheme_Custodian_Reference *mref;
4347 
4348   if (!*to_e || !*from_e)
4349     reset_locale();
4350 
4351   if ((!strcmp(from_e, "UTF-8")
4352        || !strcmp(from_e, "UTF-8-permissive")
4353        || (!*from_e && mzLOCALE_IS_UTF_8(current_locale_name)))
4354       && (!strcmp(to_e, "UTF-8")
4355 	  || (!*to_e && mzLOCALE_IS_UTF_8(current_locale_name)))) {
4356     /* Use the built-in UTF-8<->UTF-8 converter: */
4357     kind = mzUTF8_KIND;
4358     if (!strcmp(from_e, "UTF-8-permissive"))
4359       permissive = 0xFFFD;
4360     else
4361       permissive = 0;
4362     cd = NULL;
4363     need_regis = 0;
4364     wtf = 0;
4365   } else if ((!strcmp(from_e, "platform-UTF-8")
4366 	      || !strcmp(from_e, "platform-UTF-8-permissive"))
4367 	     && !strcmp(to_e, "platform-UTF-16")) {
4368     kind = mzUTF8_TO_UTF16_KIND;
4369     if (!strcmp(from_e, "platform-UTF-8-permissive"))
4370       permissive = 0xFFFD;
4371     else
4372       permissive = 0;
4373     cd = NULL;
4374     need_regis = 0;
4375     wtf = WIN_UTF16_AS_WTF16(1);
4376   } else if ((!strcmp(from_e, "WTF-8")
4377 	      || !strcmp(from_e, "WTF-8-permissive"))
4378 	     && !strcmp(to_e, "WTF-16")) {
4379     kind = mzUTF8_TO_UTF16_KIND;
4380     if (!strcmp(from_e, "WTF-8-permissive"))
4381       permissive = 0xFFFD;
4382     else
4383       permissive = 0;
4384     cd = NULL;
4385     need_regis = 0;
4386     wtf = 1;
4387   } else if (!strcmp(from_e, "platform-UTF-16")
4388 	     && !strcmp(to_e, "platform-UTF-8")) {
4389     kind = mzUTF16_TO_UTF8_KIND;
4390     permissive = 0;
4391     cd = NULL;
4392     need_regis = 0;
4393     wtf = WIN_UTF16_AS_WTF16(1);
4394   } else if (!strcmp(from_e, "WTF-16")
4395 	     && !strcmp(to_e, "WTF-8")) {
4396     kind = mzUTF16_TO_UTF8_KIND;
4397     permissive = 0;
4398     cd = NULL;
4399     need_regis = 0;
4400     wtf = 1;
4401   } else {
4402     char *tmp_from_e = NULL, *tmp_to_e = NULL;
4403 
4404     if (!(rktio_convert_properties(scheme_rktio) & RKTIO_CONVERTER_SUPPORTED))
4405       return scheme_false;
4406 
4407     if (!*from_e || !*to_e)
4408       reset_locale();
4409 
4410     if (!*from_e) {
4411       tmp_from_e = rktio_locale_encoding(scheme_rktio);
4412       from_e = tmp_from_e;
4413     }
4414     if (!*to_e) {
4415       tmp_to_e = rktio_locale_encoding(scheme_rktio);
4416       to_e = tmp_to_e;
4417     }
4418     cd = rktio_converter_open(scheme_rktio, to_e, from_e);
4419 
4420     if (tmp_from_e) free(tmp_from_e);
4421     if (tmp_to_e) free(tmp_to_e);
4422 
4423     if (!cd)
4424       return scheme_false;
4425 
4426     kind = mzICONV_KIND;
4427     permissive = 0;
4428     wtf = 0;
4429   }
4430 
4431   c = MALLOC_ONE_TAGGED(Scheme_Converter);
4432   c->so.type = scheme_string_converter_type;
4433   c->closed = 0;
4434   c->kind = kind;
4435   c->permissive = permissive;
4436   c->wtf = wtf;
4437   c->cd = cd;
4438   if (!need_regis)
4439     mref = NULL;
4440   else
4441     mref = scheme_add_managed(NULL,
4442 			      (Scheme_Object *)c,
4443 			      close_converter,
4444 			      NULL, 1);
4445   c->mref = mref;
4446 
4447   return (Scheme_Object *)c;
4448 }
4449 
byte_string_open_converter(int argc,Scheme_Object ** argv)4450 static Scheme_Object *byte_string_open_converter(int argc, Scheme_Object **argv)
4451 {
4452   Scheme_Object *s1, *s2;
4453   char *from_e, *to_e;
4454 
4455   if (!SCHEME_CHAR_STRINGP(argv[0]))
4456     scheme_wrong_contract("bytes-open-converter", "bytes?", 0, argc, argv);
4457   if (!SCHEME_CHAR_STRINGP(argv[1]))
4458     scheme_wrong_contract("bytes-open-converter", "bytes?", 1, argc, argv);
4459 
4460   scheme_custodian_check_available(NULL, "bytes-open-converter", "converter");
4461 
4462   s1 = scheme_char_string_to_byte_string(argv[0]);
4463   s2 = scheme_char_string_to_byte_string(argv[1]);
4464 
4465   if (scheme_byte_string_has_null(s1))
4466     return scheme_false;
4467   if (scheme_byte_string_has_null(s2))
4468     return scheme_false;
4469 
4470   from_e = SCHEME_BYTE_STR_VAL(s1);
4471   to_e = SCHEME_BYTE_STR_VAL(s2);
4472 
4473   return scheme_open_converter(from_e, to_e);
4474 }
4475 
convert_one(const char * who,int opos,int argc,Scheme_Object * argv[])4476 static Scheme_Object *convert_one(const char *who, int opos, int argc, Scheme_Object *argv[])
4477 {
4478   char *r, *instr;
4479   int status;
4480   intptr_t amt_read, amt_wrote;
4481   intptr_t istart, ifinish, ostart, ofinish;
4482   Scheme_Object *a[3], *status_sym;
4483   Scheme_Converter *c;
4484 
4485   if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type))
4486     scheme_wrong_contract(who, "bytes-converter?", 0, argc, argv);
4487 
4488   if (opos > 1) {
4489     if (!SCHEME_BYTE_STRINGP(argv[1]))
4490       scheme_wrong_contract(who, "bytes?", 1, argc, argv);
4491     scheme_get_substring_indices(who, argv[1], argc, argv, 2, 3, &istart, &ifinish);
4492   } else {
4493     istart = 0;
4494     ifinish = 0;
4495   }
4496 
4497   if (argc > opos) {
4498     if (SCHEME_TRUEP(argv[opos])) {
4499       if (!SCHEME_MUTABLE_BYTE_STRINGP(argv[opos]))
4500 	scheme_wrong_contract(who, "(and/c bytes? (not/c immutable?))", opos, argc, argv);
4501       r = SCHEME_BYTE_STR_VAL(argv[opos]);
4502       scheme_get_substring_indices(who, argv[opos], argc, argv, opos + 1, opos + 2, &ostart, &ofinish);
4503     } else {
4504       int ip;
4505       r = NULL;
4506       for (ip = opos + 1; ip <= opos + 2; ip++) {
4507 	if (argc > ip) {
4508 	  int ok = 0;
4509 	  if (SCHEME_INTP(argv[ip]))
4510 	    ok = SCHEME_INT_VAL(argv[ip]) >= 0;
4511 	  else if (SCHEME_BIGNUMP(argv[ip]))
4512 	    ok = SCHEME_BIGPOS(argv[ip]);
4513 	  else if ((ip == opos + 2) && SCHEME_FALSEP(argv[ip]))
4514 	    ok = 1;
4515 	  if (!ok)
4516 	    scheme_wrong_contract(who,
4517                                   ((ip == opos + 2)
4518                                    ? "(or/c exact-nonnegative-integer? #f)"
4519                                    : "exact-nonnegative-integer?"),
4520                                   ip, argc, argv);
4521 	}
4522       }
4523       if ((argc > opos + 2) && SCHEME_TRUEP(argv[opos + 2])) {
4524 	Scheme_Object *delta;
4525 	if (scheme_bin_lt(argv[opos + 2], argv[opos + 1])) {
4526 	  scheme_contract_error(who,
4527                                 "ending index is less than the starting index",
4528                                 "staring index", 1, argv[opos + 1],
4529                                 "ending index", 1, argv[opos + 2],
4530                                 NULL);
4531 	}
4532 	delta = scheme_bin_minus(argv[opos + 2], argv[opos + 1]);
4533 	if (SCHEME_BIGNUMP(delta))
4534 	  ofinish = -1;
4535 	else
4536 	  ofinish = SCHEME_INT_VAL(delta);
4537 	ostart = 0;
4538       } else {
4539 	ostart = 0;
4540 	ofinish = -1;
4541       }
4542     }
4543   } else {
4544     r = NULL;
4545     ostart = 0;
4546     ofinish = -1;
4547   }
4548 
4549   c = (Scheme_Converter *)argv[0];
4550   if (c->closed)
4551     scheme_contract_error(who, "converter is closed",
4552                           "converter", 1, argv[0],
4553                           NULL);
4554 
4555   instr = ((opos > 1) ? SCHEME_BYTE_STR_VAL(argv[1]) : NULL);
4556 
4557   if (c->kind == mzUTF16_TO_UTF8_KIND) {
4558     if (istart & 0x1) {
4559       /* Copy to word-align */
4560       char *c2;
4561       c2 = (char *)scheme_malloc_atomic(ifinish - istart);
4562       memcpy(c2, instr XFORM_OK_PLUS istart, ifinish - istart);
4563       ifinish = ifinish - istart;
4564       istart = 0;
4565       instr = c2;
4566     }
4567 
4568     status = utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1,
4569 			   (unsigned char *)r, ostart, ofinish,
4570 			   &amt_read, &amt_wrote, 1, c->wtf);
4571 
4572     amt_read -= (istart >> 1);
4573 
4574     if (amt_read) {
4575       if (!r) {
4576 	/* Need to allocate, then do it again: */
4577 	r = (char *)scheme_malloc_atomic(amt_wrote + 1);
4578 	utf8_encode_x((const unsigned int *)instr, istart >> 1, ifinish >> 1,
4579 		      (unsigned char *)r, ostart, ofinish,
4580 		      NULL, NULL, 1, c->wtf);
4581 	r[amt_wrote] = 0;
4582       }
4583       amt_read <<= 1;
4584     }
4585 
4586     /* We might get a -1 result because the input has an odd number of
4587        bytes, and 2nd+next-to-last bytes form an unpaired
4588        surrogate. In that case, the transformer normally needs one
4589        more byte: Windows is little-endian, so we need the byte to
4590        tell whether the surrogate is paired, and for all other
4591        platforms (where we assume that surrogates are paired), we need
4592        the byte to generate output. Technically, on a big-endian
4593        non-Windows machine, we could generate the first byte of UTF-8
4594        output and keep the byte as state, but we don't. */
4595 
4596     if (status != -1) {
4597       if (amt_read < ((ifinish - istart) & ~0x1)) {
4598 	/* Must have run out of output space */
4599 	status = 1;
4600       } else {
4601 	/* Read all of input --- but it wasn't really all if there
4602 	   was an odd number of bytes. */
4603 	if ((ifinish - istart) & 0x1)
4604 	  status = -1;
4605 	else
4606 	  status = 0;
4607       }
4608     }
4609   } else if (c->kind != mzICONV_KIND) {
4610     /* UTF-8 -> UTF-{8,16} "identity" converter, but maybe permissive */
4611     if (instr) {
4612       intptr_t _ostart, _ofinish;
4613       int utf16;
4614 
4615       if (c->kind == mzUTF8_TO_UTF16_KIND) {
4616 	_ostart = ostart;
4617 	_ofinish = ofinish;
4618 	if (_ostart & 0x1)
4619 	  _ostart++;
4620 	_ostart >>= 1;
4621 	if (_ofinish > 0)
4622 	  _ofinish >>= 1;
4623 	utf16 = 1;
4624       } else {
4625 	_ostart = ostart;
4626 	_ofinish = ofinish;
4627 	utf16 = 0;
4628       }
4629 
4630       status = utf8_decode_x((unsigned char *)instr, istart, ifinish,
4631 			     (unsigned int *)r, _ostart, _ofinish,
4632 			     &amt_read, &amt_wrote,
4633 			     1, utf16, NULL, 1, c->permissive, c->wtf);
4634 
4635       if (utf16) {
4636 	_ostart <<= 1;
4637 	amt_wrote <<= 1;
4638 	if ((ostart & 0x1) && (amt_wrote > _ostart)) {
4639 	  /* Shift down one byte: */
4640 	  memmove(r XFORM_OK_PLUS ostart, r XFORM_OK_PLUS _ostart, amt_wrote - _ostart);
4641 	}
4642       }
4643 
4644       amt_read -= istart;
4645       amt_wrote -= _ostart;
4646       if (status == -3) {
4647 	/* r is not NULL; ran out of room */
4648 	status = 1;
4649       } else {
4650 	if (amt_wrote) {
4651 	  if (!r) {
4652 	    /* Need to allocate, then do it again: */
4653 	    r = (char *)scheme_malloc_atomic(amt_wrote + 1);
4654 	    utf8_decode_x((unsigned char *)instr, istart, ifinish,
4655 			  (unsigned int *)r, ostart, _ofinish,
4656 			  NULL, NULL,
4657 			  1, utf16, NULL, 1, c->permissive, c->wtf);
4658 	    r[amt_wrote] = 0;
4659 	  }
4660 	} else if (!r)
4661 	  r = "";
4662 	if (status > 0)
4663 	  status = 0;
4664       }
4665     } else {
4666       r = "";
4667       status = 0;
4668       amt_read = 0;
4669       amt_wrote = 0;
4670     }
4671   } else {
4672     r = do_convert(c->cd, NULL, NULL, 0,
4673 		   instr, istart, ifinish-istart,
4674 		   r, ostart, ofinish-ostart,
4675 		   !r, /* grow? */
4676 		   0,
4677 		   (r ? 0 : 1), /* terminator */
4678 		   &amt_read, &amt_wrote,
4679 		   &status);
4680   }
4681 
4682   if (status == 0) {
4683     /* Converted all input without error */
4684     status_sym = complete_symbol;
4685   } else if (status == 1) {
4686     /* Filled output, more input ready */
4687     status_sym = continues_symbol;
4688   } else if (status == -1) {
4689     /* Input ends in the middle of an encoding */
4690     status_sym = aborts_symbol;
4691   } else {
4692     /* Assert: status == -2 */
4693     /* Input has error (that won't be fixed by
4694        adding more characters */
4695     status_sym = error_symbol;
4696   }
4697 
4698   if (argc <= opos) {
4699     a[0] = scheme_make_sized_byte_string(r, amt_wrote, 0);
4700   } else {
4701     a[0] = scheme_make_integer(amt_wrote);
4702   }
4703   if (opos > 1) {
4704     a[1] = scheme_make_integer(amt_read);
4705     a[2] = status_sym;
4706     return scheme_values(3, a);
4707   } else {
4708     a[1] = status_sym;
4709     return scheme_values(2, a);
4710   }
4711 }
4712 
byte_string_convert(int argc,Scheme_Object * argv[])4713 static Scheme_Object *byte_string_convert(int argc, Scheme_Object *argv[])
4714 {
4715   return convert_one("bytes-convert", 4, argc, argv);
4716 }
4717 
byte_string_convert_end(int argc,Scheme_Object * argv[])4718 static Scheme_Object *byte_string_convert_end(int argc, Scheme_Object *argv[])
4719 {
4720   return convert_one("bytes-convert-end", 1, argc, argv);
4721 }
4722 
scheme_close_converter(Scheme_Object * conv)4723 void scheme_close_converter(Scheme_Object *conv)
4724 {
4725   close_converter(conv, NULL);
4726 }
4727 
byte_string_close_converter(int argc,Scheme_Object ** argv)4728 static Scheme_Object *byte_string_close_converter(int argc, Scheme_Object **argv)
4729 {
4730   if (!SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type))
4731     scheme_wrong_contract("bytes-close-converter", "bytes-converter?", 0, argc, argv);
4732 
4733   scheme_close_converter(argv[0]);
4734 
4735   return scheme_void;
4736 }
4737 
4738 static Scheme_Object *
byte_converter_p(int argc,Scheme_Object * argv[])4739 byte_converter_p(int argc, Scheme_Object *argv[])
4740 {
4741   return (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_string_converter_type)
4742 	  ? scheme_true
4743 	  : scheme_false);
4744 }
4745 
4746 /**********************************************************************/
4747 /*                         utf8 converter                             */
4748 /**********************************************************************/
4749 
utf8_decode_x(const unsigned char * s,intptr_t start,intptr_t end,unsigned int * us,intptr_t dstart,intptr_t dend,intptr_t * ipos,intptr_t * jpos,char compact,char utf16,int * _state,int might_continue,int permissive,int wtf)4750 static intptr_t utf8_decode_x(const unsigned char *s, intptr_t start, intptr_t end,
4751                               unsigned int *us, intptr_t dstart, intptr_t dend,
4752                               intptr_t *ipos, intptr_t *jpos,
4753                               char compact, char utf16, int *_state,
4754                               int might_continue, int permissive, int wtf)
4755      /* Results:
4756 	non-negative => translation complete, = number of produced chars
4757 	-1 => input ended in middle of encoding (only if might_continue)
4758 	-2 => encoding error (only if permissive is 0)
4759 	-3 => not enough output room
4760 
4761 	ipos & jpos are filled with ending positions (between [d]start
4762 	and [d]end) before return, unless they are NULL.
4763 
4764 	compact => UTF-8 to UTF-8 or UTF-16 --- the latter if utf16;
4765 	for utf16 and wtf, decode extended UTF-8 that allows surrogates
4766 
4767 	_state provides initial state and is filled with ending state;
4768 	when it's not NULL, the us must be NULL
4769 
4770 	might_continue => allows -1 result without consuming characters
4771 
4772 	permissive is non-zero => use permissive as value for bad byte
4773 	sequences. When generating UTF-8, this must be an ASCII character
4774         or U+FFFD. */
4775 {
4776   intptr_t i, j, oki;
4777   int failmode = -3, state;
4778   int init_doki;
4779   int nextbits, v;
4780   unsigned int sc;
4781   int pending_surrogate = 0;
4782 
4783   if (_state) {
4784     state = (*_state) & 0x7;
4785     init_doki = (((*_state) >> 3) & 0x7);
4786     nextbits = ((((*_state) >> 6) & 0xF) << 2);
4787     /* Need v to detect 0xD800 through 0xDFFF
4788        Note that we have 22 bits to work with, which is
4789        is enough to detect > 0x10FFFF */
4790     v = ((*_state) >> 10);
4791   } else {
4792     state = 0;
4793     init_doki = 0;
4794     nextbits = 0;
4795     v = 0;
4796   }
4797 
4798   /* In non-permissive mode, a negative result means ill-formed input.
4799      Permissive mode accepts anything and tries to convert it.  In
4800      that case, the strategy for illegal sequences is to convert
4801      anything bad to the given "permissive" value. */
4802 
4803   if (end < 0)
4804     end = strlen((char *)s);
4805   if (dend < 0)
4806     dend = 0x7FFFFFFF;
4807 
4808 # define ENCFAIL i = oki; failmode = -2; break
4809 
4810   oki = start;
4811   j = dstart;
4812   i = start;
4813   if (j < dend) {
4814     while (i < end) {
4815       sc = s[i];
4816       if (sc < 0x80) {
4817 	if (state) {
4818 	  /* In a sequence, but didn't continue */
4819 	  state = 0;
4820 	  nextbits = 0;
4821 	  if (permissive) {
4822 	    v = permissive;
4823 	    i = oki;
4824 	    j += init_doki;
4825 	  } else {
4826 	    ENCFAIL;
4827 	  }
4828 	} else {
4829 	  v = sc;
4830 	}
4831       } else if ((sc & 0xC0) == 0x80) {
4832 	/* Continues a sequence ... */
4833 	if (state) {
4834 	  /* ... and we're in one ... */
4835 	  if (!nextbits || (sc & nextbits)) {
4836 	    /* and we have required bits. */
4837 	    v = (v << 6) + (sc & 0x3F);
4838 	    nextbits = 0;
4839 	    --state;
4840 	    if (state) {
4841 	      i++;
4842 	      continue;
4843 	    }
4844 	    /* We finished. One last check: */
4845 	    if ((((v >= 0xD800) && (v <= 0xDFFF))
4846 		 || (v > 0x10FFFF))
4847 		&& (!wtf
4848                     || !utf16
4849                     /* If WTF-16, just apply upper-limit check */
4850                     || (v > 0x10FFFF))) {
4851 	      /* UTF-16 surrogates or other illegal code units */
4852 	      if (permissive) {
4853 		v = permissive;
4854 		j += init_doki;
4855 		i = oki;
4856 	      } else {
4857 		ENCFAIL;
4858 	      }
4859 	    }
4860 	  } else {
4861 	    /* ... but we're missing required bits. */
4862 	    state = 0;
4863 	    nextbits = 0;
4864 	    if (permissive) {
4865 	      v = permissive;
4866 	      j += init_doki;
4867 	      i = oki;
4868 	    } else {
4869 	      ENCFAIL;
4870 	    }
4871 	  }
4872 	} else {
4873 	  /* ... but we're not in one */
4874 	  if (permissive) {
4875 	    v = permissive;
4876 	  } else {
4877 	    ENCFAIL;
4878 	  }
4879 	}
4880       } else if (state) {
4881 	/* bad: already in a sequence */
4882 	state = 0;
4883 	if (permissive) {
4884 	  v = permissive;
4885 	  i = oki;
4886 	  j += init_doki;
4887 	} else {
4888 	  ENCFAIL;
4889 	}
4890       } else {
4891 	if ((sc & 0xE0) == 0xC0) {
4892 	  if (sc & 0x1E) {
4893 	    state = 1;
4894 	    v = (sc & 0x1F);
4895 	    i++;
4896 	    continue;
4897 	  }
4898 	  /* else too small */
4899 	} else if ((sc & 0xF0) == 0xE0) {
4900 	  state = 2;
4901 	  v = (sc & 0xF);
4902 	  if (!v)
4903 	    nextbits = 0x20;
4904 	  i++;
4905 	  continue;
4906 	} else if ((sc & 0xF8) == 0xF0) {
4907 	  v = (sc & 0x7);
4908 	  if (v <= 4) {
4909 	    state = 3;
4910 	    if (!v)
4911 	      nextbits = 0x30;
4912 	    i++;
4913 	    continue;
4914 	  }
4915 	  /* Else will be larger than 0x10FFFF, so fail */
4916 	}
4917 	/* Too small, or 0xFF or 0xFe, or start of a 5- or 6-byte sequence */
4918 	if (permissive) {
4919 	  v = permissive;
4920 	} else {
4921 	  ENCFAIL;
4922 	}
4923       }
4924 
4925       /* If we get here, we're supposed to output v */
4926 
4927       if (compact) {
4928 	if (utf16) {
4929 	  if (v > 0xFFFF) {
4930 	    if (pending_surrogate) {
4931 	      if (us)
4932 		((unsigned short *)us)[j] = pending_surrogate;
4933 	      j++; /* Accept previously written unpaired surrogate */
4934 	      pending_surrogate = 0;
4935 	    }
4936 	    if (j + 1 >= dend)
4937 	      break;
4938 	    if (us) {
4939 	      v -= 0x10000;
4940 	      ((unsigned short *)us)[j] = 0xD800 | ((v >> 10) & 0x3FF);
4941 	      ((unsigned short *)us)[j+1] = 0xDC00 | (v & 0x3FF);
4942 	    }
4943 	    j++;
4944 	  } else if (wtf) {
4945 	    /* We allow a surrogate by itself, but don't allow
4946 	       a 0xDC00 after a 0xD800, otherwise multiple encodings can
4947 	       map to the same thing. */
4948 	    if ((v >= 0xD800) && (v <= 0xDFFF)) {
4949 	      if (pending_surrogate && ((v & 0xDC00) == 0xDC00)) {
4950 		/* This looks like a surrogate pair, so disallow it. */
4951 		if (permissive) {
4952 		  /* We need to fill in 6 permissive substitutions,
4953 		     one for each input byte. If we can't put all 6,
4954 		     then don't use any input. */
4955 		  if (j + 5 >= dend) {
4956 		    break;
4957 		  } else {
4958 		    int p;
4959 		    if (us) {
4960 		      for (p = 0; p < 5; p++) {
4961 			if (j + p >= dend)
4962 			  break;
4963 			((unsigned short *)us)[j+p] = permissive;
4964 		      }
4965 		    }
4966 		    j += 5;
4967 		    v = permissive;
4968 		  }
4969 		} else {
4970 		  ENCFAIL;
4971 		}
4972 		pending_surrogate = 0;
4973 	      } else {
4974 		if (pending_surrogate) {
4975 		  if (us)
4976 		    ((unsigned short *)us)[j] = pending_surrogate;
4977 		  j++; /* Accept previousy written unpaired surrogate */
4978 		  pending_surrogate = 0;
4979 		  if (j >= dend)
4980 		    break;
4981 		}
4982 		if ((v & 0xDC00) == 0xD800)
4983 		  pending_surrogate = v;
4984 		else
4985 		  pending_surrogate = 0;
4986 	      }
4987 	    } else {
4988 	      if (pending_surrogate) {
4989 		if (us)
4990 		  ((unsigned short *)us)[j] = pending_surrogate;
4991 		j++; /* Accept previousy written unpaired surrogate */
4992 		pending_surrogate = 0;
4993 		if (j >= dend)
4994 		  break;
4995 	      }
4996 	    }
4997 
4998 	    if (pending_surrogate)
4999 	      --j; /* don't accept unpaired surrogate, yet */
5000 	    else if (us)
5001 	      ((unsigned short *)us)[j] = v;
5002           } else {
5003 	    if (us)
5004 	      ((unsigned short *)us)[j] = v;
5005 	  }
5006 	} else {
5007 	  intptr_t delta;
5008 	  delta = (i - oki);
5009 	  if (delta) {
5010 	    if (j + delta + 1 < dend) {
5011 	      if (us)
5012 		memcpy(((char *)us) + j, s + oki, delta + 1);
5013 	      j += delta;
5014 	    } else
5015 	      break;
5016 	  } else if (v == 0xFFFD) {
5017             if (j + 3 < dend) {
5018               if (us) {
5019                 ((unsigned char *)us)[j] = 0xEF;
5020                 ((unsigned char *)us)[j+1] = 0xBF;
5021                 ((unsigned char *)us)[j+2] = 0xBD;
5022               }
5023               j += 2;
5024             } else
5025               break;
5026           } else if (us) {
5027             ((unsigned char *)us)[j] = v;
5028           }
5029 	}
5030       } else if (us) {
5031 	us[j] = v;
5032       }
5033       j++;
5034       i++;
5035       oki = i;
5036       init_doki = 0;
5037       if (j >= dend)
5038 	break;
5039     }
5040   }
5041 
5042   if (_state) {
5043     if (!state)
5044       *_state = 0;
5045     else
5046       *_state = (state
5047 		 | (((end - oki) + init_doki) << 3)
5048 		 | ((nextbits >> 2) << 6)
5049 		 | (v << 10));
5050   } else if (state) {
5051     if (might_continue || !permissive) {
5052       failmode = -1;
5053       i = end - 1; /* to ensure that failmode is returned */
5054     } else if (permissive) {
5055       if (pending_surrogate) {
5056         /* Unpaired surrogate before permissive replacements */
5057         if (utf16 && (j < dend)) {
5058           if (us)
5059             ((unsigned short *)us)[j] = pending_surrogate;
5060           j++;
5061         }
5062         pending_surrogate = 0;
5063       }
5064       for (i = oki; i < end; i++) {
5065 	if (j < dend) {
5066 	  if (us) {
5067 	    if (compact) {
5068 	      if (utf16)
5069 		((unsigned short *)us)[j] = permissive;
5070 	      else
5071 		((unsigned char *)us)[j] = permissive;
5072 	    } else
5073 	      us[j] = permissive;
5074 	  }
5075 	  j++;
5076 	} else
5077 	  break;
5078       }
5079       oki = i;
5080     }
5081   }
5082 
5083   if (pending_surrogate) {
5084     if (!might_continue) {
5085       /* Accept unpaired surrogate at end of input */
5086       if (j < dend) {
5087         if (us)
5088           ((unsigned short *)us)[j] = pending_surrogate;
5089         j++;
5090       }
5091     } else {
5092       oki -= 3;
5093     }
5094   }
5095 
5096   if (ipos)
5097     *ipos = oki;
5098   if (jpos)
5099     *jpos = j;
5100 
5101   if (i < end)
5102     return failmode;
5103 
5104   if (pending_surrogate) {
5105     /* input must have ended right after surrogate */
5106     return -1;
5107   }
5108 
5109   return j - dstart;
5110 }
5111 
scheme_utf8_decode(const unsigned char * s,intptr_t start,intptr_t end,unsigned int * us,intptr_t dstart,intptr_t dend,intptr_t * ipos,char utf16,int permissive)5112 intptr_t scheme_utf8_decode(const unsigned char *s, intptr_t start, intptr_t end,
5113                             unsigned int *us, intptr_t dstart, intptr_t dend,
5114                             intptr_t *ipos, char utf16, int permissive)
5115 {
5116   return utf8_decode_x(s, start, end, us, dstart, dend,
5117 		       ipos, NULL, utf16, utf16, NULL, 0, permissive, WIN_UTF16_AS_WTF16(utf16));
5118 }
5119 
scheme_utf8_decode_offset_prefix(const unsigned char * s,intptr_t start,intptr_t end,unsigned int * us,intptr_t dstart,intptr_t dend,intptr_t * ipos,char utf16,int permissive)5120 intptr_t scheme_utf8_decode_offset_prefix(const unsigned char *s, intptr_t start, intptr_t end,
5121                                           unsigned int *us, intptr_t dstart, intptr_t dend,
5122                                           intptr_t *ipos, char utf16, int permissive)
5123 {
5124   return utf8_decode_x(s, start, end, us, dstart, dend,
5125 		       ipos, NULL, utf16, utf16, NULL, 1, permissive, WIN_UTF16_AS_WTF16(utf16));
5126 }
5127 
scheme_utf8_decode_as_prefix(const unsigned char * s,intptr_t start,intptr_t end,unsigned int * us,intptr_t dstart,intptr_t dend,intptr_t * ipos,char utf16,int permissive)5128 intptr_t scheme_utf8_decode_as_prefix(const unsigned char *s, intptr_t start, intptr_t end,
5129                                       unsigned int *us, intptr_t dstart, intptr_t dend,
5130                                       intptr_t *ipos, char utf16, int permissive)
5131      /* Always returns number of read characters, not error codes. */
5132 {
5133   intptr_t opos;
5134   utf8_decode_x(s, start, end, us, dstart, dend,
5135 		ipos, &opos, utf16, utf16, NULL, 1, permissive, WIN_UTF16_AS_WTF16(utf16));
5136   return opos - dstart;
5137 }
5138 
scheme_utf8_decode_all(const unsigned char * s,intptr_t len,unsigned int * us,int permissive)5139 intptr_t scheme_utf8_decode_all(const unsigned char *s, intptr_t len, unsigned int *us, int permissive)
5140 {
5141   return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 0, permissive, 0);
5142 }
5143 
scheme_utf8_decode_prefix(const unsigned char * s,intptr_t len,unsigned int * us,int permissive)5144 intptr_t scheme_utf8_decode_prefix(const unsigned char *s, intptr_t len, unsigned int *us, int permissive)
5145      /* us != NULL */
5146 {
5147   {
5148     /* Try fast path (all ASCII) */
5149     intptr_t i;
5150     for (i = 0; i < len; i++) {
5151       if (s[i] < 128)
5152 	us[i] = s[i];
5153       else
5154 	break;
5155     }
5156     if (i == len)
5157       return len;
5158   }
5159 
5160   return utf8_decode_x(s, 0, len, us, 0, -1, NULL, NULL, 0, 0, NULL, 1, permissive, 0);
5161 }
5162 
scheme_utf8_decode_to_buffer_len(const unsigned char * s,intptr_t len,mzchar * buf,intptr_t blen,intptr_t * _ulen)5163 mzchar *scheme_utf8_decode_to_buffer_len(const unsigned char *s, intptr_t len,
5164 					 mzchar *buf, intptr_t blen, intptr_t *_ulen)
5165 {
5166   intptr_t ulen;
5167 
5168   ulen = utf8_decode_x(s, 0, len, NULL, 0, -1,
5169 		       NULL, NULL, 0, 0,
5170 		       NULL, 0, 0, 0);
5171   if (ulen < 0)
5172     return NULL;
5173   if (ulen + 1 > blen) {
5174     buf = (mzchar *)scheme_malloc_atomic((ulen + 1) * sizeof(mzchar));
5175   }
5176   utf8_decode_x(s, 0, len, buf, 0, -1,
5177 		NULL, NULL, 0, 0,
5178 		NULL, 0, 0, 0);
5179   buf[ulen] = 0;
5180   *_ulen = ulen;
5181   return buf;
5182 }
5183 
scheme_utf8_decode_to_buffer(const unsigned char * s,intptr_t len,mzchar * buf,intptr_t blen)5184 mzchar *scheme_utf8_decode_to_buffer(const unsigned char *s, intptr_t len,
5185 				     mzchar *buf, intptr_t blen)
5186 {
5187   intptr_t ulen;
5188   return scheme_utf8_decode_to_buffer_len(s, len, buf, blen, &ulen);
5189 }
5190 
scheme_utf8_decode_count(const unsigned char * s,intptr_t start,intptr_t end,int * _state,int might_continue,int permissive)5191 intptr_t scheme_utf8_decode_count(const unsigned char *s, intptr_t start, intptr_t end,
5192 			     int *_state, int might_continue, int permissive)
5193 {
5194   intptr_t pos = 0;
5195 
5196   if (!_state || !*_state) {
5197     /* Try fast path (all ASCII): */
5198     intptr_t i;
5199     for (i = start; i < end; i++) {
5200       if (s[i] > 127)
5201 	break;
5202     }
5203     if (i == end)
5204       return end - start;
5205   }
5206 
5207   utf8_decode_x(s, start, end,
5208 		NULL, 0, -1,
5209 		NULL, &pos,
5210 		0, 0, _state,
5211 		might_continue, permissive, 0);
5212 
5213   return pos;
5214 }
5215 
utf8_encode_x(const unsigned int * us,intptr_t start,intptr_t end,unsigned char * s,intptr_t dstart,intptr_t dend,intptr_t * _ipos,intptr_t * _opos,char utf16,int wtf)5216 static intptr_t utf8_encode_x(const unsigned int *us, intptr_t start, intptr_t end,
5217                               unsigned char *s, intptr_t dstart, intptr_t dend,
5218                               intptr_t *_ipos, intptr_t *_opos, char utf16, int wtf)
5219   /* Results:
5220         -1 => input ended in the middle of an encoding - only when utf16 and _opos
5221 	non-negative => reports number of bytes/code-units produced */
5222 {
5223   intptr_t i, j, done = start;
5224 
5225   if (dend < 0)
5226     dend = 0x7FFFFFFF;
5227 
5228   if (!s) {
5229     unsigned int wc;
5230     j = 0;
5231     for (i = start; i < end; i++) {
5232       if (utf16) {
5233 	wc = ((unsigned short *)us)[i];
5234 	if ((wc & 0xF800) == 0xD800) {
5235 	  /* Unparse surrogates. We assume that the surrogates are
5236 	     well formed, unless this is Windows or if we're at the
5237              end and _opos is 0. The well-formedness assumption was
5238              probably not a good idea, but note that it's explicitly
5239              documented to behave that way. */
5240 # define UNPAIRED_MASK(wtf) (wtf ? 0xFC00 : 0xF800)
5241 	  if (((i + 1) == end) && ((wc & UNPAIRED_MASK(wtf)) == 0xD800) && _opos) {
5242 	    /* Ended in the middle of a surrogate pair */
5243 	    *_opos = j;
5244 	    if (_ipos)
5245 	      *_ipos = i;
5246 	    return -1;
5247 	  }
5248 	  if (wtf && ((wc & 0xFC00) != 0xD800)) {
5249 	    /* Count as one */
5250 	  } else if (wtf && ((i + 1 >= end)
5251                              || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00))) {
5252 	  } else {
5253             i++;
5254             wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF);
5255             wc += 0x10000;
5256           }
5257 	}
5258       } else {
5259 	wc = us[i];
5260       }
5261       if (wc < 0x80) {
5262 	j += 1;
5263       } else if (wc < 0x800) {
5264 	j += 2;
5265       } else if (wc < 0x10000) {
5266 	j += 3;
5267       } else if (wc < 0x200000) {
5268 	j += 4;
5269       } else if (wc < 0x4000000) {
5270 	j += 5;
5271       } else {
5272 	j += 6;
5273       }
5274     }
5275     if (_ipos)
5276       *_ipos = i;
5277     if (_opos)
5278       *_opos = j + dstart;
5279     return j;
5280   } else {
5281     unsigned int wc;
5282     j = dstart;
5283     for (i = start; i < end; i++) {
5284       if (utf16) {
5285 	wc = ((unsigned short *)us)[i];
5286 	if ((wc & 0xF800) == 0xD800) {
5287 	  /* Unparse surrogates. We assume that the surrogates are
5288 	     well formed on non-Windows platforms, but when _opos,
5289 	     we detect ending in the middle of an surrogate pair. */
5290 	  if (((i + 1) == end) && ((wc & UNPAIRED_MASK(wtf)) == 0xD800) && _opos) {
5291 	    /* Ended in the middle of a surrogate pair */
5292 	    *_opos = j;
5293 	    if (_ipos)
5294 	      *_ipos = i;
5295 	    return -1;
5296 	  }
5297 	  if (wtf && ((wc & 0xFC00) != 0xD800)) {
5298 	    /* Let the misplaced surrogate through */
5299 	  } else if (wtf && ((i + 1 >= end)
5300                              || (((((unsigned short *)us)[i+1]) & 0xFC00) != 0xDC00))) {
5301 	    /* Let the misplaced surrogate through */
5302 	  } else {
5303             i++;
5304             wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)us)[i]) & 0x3FF);
5305             wc += 0x10000;
5306           }
5307 	}
5308       } else {
5309 	wc = us[i];
5310       }
5311 
5312       if (wc < 0x80) {
5313 	if (j + 1 > dend)
5314 	  break;
5315 	s[j++] = wc;
5316       } else if (wc < 0x800) {
5317 	if (j + 2 > dend)
5318 	  break;
5319 	s[j++] = 0xC0 | ((wc & 0x7C0) >> 6);
5320 	s[j++] = 0x80 | (wc & 0x3F);
5321       } else if (wc < 0x10000) {
5322 	if (j + 3 > dend)
5323 	  break;
5324 	s[j++] = 0xE0 | ((wc & 0xF000) >> 12);
5325 	s[j++] = 0x80 | ((wc & 0x0FC0) >> 6);
5326 	s[j++] = 0x80 | (wc & 0x3F);
5327       } else if (wc < 0x200000) {
5328 	if (j + 4 > dend)
5329 	  break;
5330 	s[j++] = 0xF0 | ((wc & 0x1C0000) >> 18);
5331 	s[j++] = 0x80 | ((wc & 0x03F000) >> 12);
5332 	s[j++] = 0x80 | ((wc & 0x000FC0) >> 6);
5333 	s[j++] = 0x80 | (wc & 0x3F);
5334       } else if (wc < 0x4000000) {
5335 	if (j + 5 > dend)
5336 	  break;
5337 	s[j++] = 0xF8 | ((wc & 0x3000000) >> 24);
5338 	s[j++] = 0x80 | ((wc & 0x0FC0000) >> 18);
5339 	s[j++] = 0x80 | ((wc & 0x003F000) >> 12);
5340 	s[j++] = 0x80 | ((wc & 0x0000FC0) >> 6);
5341 	s[j++] = 0x80 | (wc & 0x3F);
5342       } else {
5343 	if (j + 6 > dend)
5344 	  break;
5345 	s[j++] = 0xFC | ((wc & 0x40000000) >> 30);
5346 	s[j++] = 0x80 | ((wc & 0x3F000000) >> 24);
5347 	s[j++] = 0x80 | ((wc & 0x00FC0000) >> 18);
5348 	s[j++] = 0x80 | ((wc & 0x0003F000) >> 12);
5349 	s[j++] = 0x80 | ((wc & 0x00000FC0) >> 6);
5350 	s[j++] = 0x80 | (wc & 0x3F);
5351       }
5352       done = i;
5353     }
5354     if (_ipos)
5355       *_ipos = done;
5356     if (_opos)
5357       *_opos = j;
5358     return j - dstart;
5359   }
5360 }
5361 
scheme_utf8_encode(const unsigned int * us,intptr_t start,intptr_t end,unsigned char * s,intptr_t dstart,char utf16)5362 intptr_t scheme_utf8_encode(const unsigned int *us, intptr_t start, intptr_t end,
5363 		       unsigned char *s, intptr_t dstart,
5364 		       char utf16)
5365 {
5366   return utf8_encode_x(us, start, end,
5367 		       s, dstart, -1,
5368 		       NULL, NULL, utf16, WIN_UTF16_AS_WTF16(utf16));
5369 }
5370 
scheme_utf8_encode_all(const unsigned int * us,intptr_t len,unsigned char * s)5371 intptr_t scheme_utf8_encode_all(const unsigned int *us, intptr_t len, unsigned char *s)
5372 {
5373   return utf8_encode_x(us, 0, len, s, 0, -1, NULL, NULL, 0 /* utf16 */, 0);
5374 }
5375 
scheme_utf8_encode_to_buffer_len(const mzchar * s,intptr_t len,char * buf,intptr_t blen,intptr_t * _slen)5376 char *scheme_utf8_encode_to_buffer_len(const mzchar *s, intptr_t len,
5377 				       char *buf, intptr_t blen,
5378 				       intptr_t *_slen)
5379 {
5380   intptr_t slen;
5381 
5382   /* ASCII with len < blen is a common case: */
5383   if (len < blen) {
5384     for (slen = 0; slen < len; slen++) {
5385       if (s[slen] > 127)
5386         break;
5387       else
5388         buf[slen] = s[slen];
5389     }
5390     if (slen == len) {
5391       buf[slen] = 0;
5392       *_slen = slen;
5393       return buf;
5394     }
5395   }
5396 
5397   slen = utf8_encode_x(s, 0, len, NULL, 0, -1, NULL, NULL, 0, 0);
5398   if (slen + 1 > blen) {
5399     buf = (char *)scheme_malloc_atomic(slen + 1);
5400   }
5401   utf8_encode_x(s, 0, len, (unsigned char *)buf, 0, -1, NULL, NULL, 0, 0);
5402   buf[slen] = 0;
5403   *_slen = slen;
5404   return buf;
5405 }
5406 
scheme_utf8_encode_to_buffer(const mzchar * s,intptr_t len,char * buf,intptr_t blen)5407 char *scheme_utf8_encode_to_buffer(const mzchar *s, intptr_t len,
5408 				   char *buf, intptr_t blen)
5409 {
5410   intptr_t slen;
5411   return scheme_utf8_encode_to_buffer_len(s, len, buf, blen, &slen);
5412 }
5413 
scheme_ucs4_to_utf16(const mzchar * text,intptr_t start,intptr_t end,unsigned short * buf,intptr_t bufsize,intptr_t * ulen,intptr_t term_size)5414 unsigned short *scheme_ucs4_to_utf16(const mzchar *text, intptr_t start, intptr_t end,
5415 				     unsigned short *buf, intptr_t bufsize,
5416 				     intptr_t *ulen, intptr_t term_size)
5417 {
5418   mzchar v;
5419   intptr_t extra, i, j;
5420   unsigned short *utf16;
5421 
5422   /* Count characters that fall outside UCS-2: */
5423   for (i = start, extra = 0; i < end; i++) {
5424     if (text[i] > 0xFFFF)
5425       extra++;
5426   }
5427 
5428   if ((end - start) + extra + term_size < bufsize)
5429     utf16 = buf;
5430   else
5431     utf16 = (unsigned short *)scheme_malloc_atomic(sizeof(unsigned short) * ((end - start) + extra + term_size));
5432 
5433   for (i = start, j = 0; i < end; i++) {
5434     v = text[i];
5435     if (v > 0xFFFF) {
5436       v -= 0x10000;
5437       utf16[j++] = 0xD800 | ((v >> 10) & 0x3FF);
5438       utf16[j++] = 0xDC00 | (v & 0x3FF);
5439     } else
5440       utf16[j++] = v;
5441   }
5442 
5443   *ulen = j;
5444 
5445   return utf16;
5446 }
5447 
scheme_utf16_to_ucs4(const unsigned short * text,intptr_t start,intptr_t end,mzchar * buf,intptr_t bufsize,intptr_t * ulen,intptr_t term_size)5448 mzchar *scheme_utf16_to_ucs4(const unsigned short *text, intptr_t start, intptr_t end,
5449 			     mzchar *buf, intptr_t bufsize,
5450 			     intptr_t *ulen, intptr_t term_size)
5451 {
5452   int wc;
5453   intptr_t i, j;
5454 
5455   for (i = start, j = 0; i < end; i++) {
5456     wc = text[i];
5457     if ((wc & 0xF800) == 0xD800) {
5458       i++;
5459     }
5460     j++;
5461   }
5462 
5463   if (j + term_size >= bufsize)
5464     buf = (mzchar *)scheme_malloc_atomic((j + term_size) * sizeof(mzchar));
5465 
5466   for (i = start, j = 0; i < end; i++) {
5467     wc = text[i];
5468     if ((wc & 0xF800) == 0xD800) {
5469       i++;
5470       wc = ((wc & 0x3FF) << 10) + ((((unsigned short *)text)[i]) & 0x3FF);
5471       wc += 0x10000;
5472     }
5473     buf[j++] = wc;
5474   }
5475 
5476   *ulen = j;
5477 
5478   return buf;
5479 }
5480 
5481 /**********************************************************************/
5482 /*                           Precise GC                               */
5483 /**********************************************************************/
5484 
5485 #ifdef MZ_PRECISE_GC
5486 
5487 START_XFORM_SKIP;
5488 
5489 #include "mzmark_string.inc"
5490 
register_traversers(void)5491 static void register_traversers(void)
5492 {
5493   GC_REG_TRAV(scheme_string_converter_type, mark_string_convert);
5494 }
5495 
5496 END_XFORM_SKIP;
5497 
5498 #endif
5499