1 /* udstr.c:
2 *
3 ****************************************************************
4 * Copyright (C) 2004 Tom Lord
5 *
6 * See the file "COPYING" for further information about
7 * the copyright and warranty status of this work.
8 */
9
10
11 #include "hackerlab/mem/mem.h"
12 #include "hackerlab/bugs/panic.h"
13 #include "hackerlab/uni/invariant.h"
14 #include "hackerlab/strings/udstr.h"
15
16
17
18 /************************************************************************
19 *(h1 "Dynamic Unicode String Functions"
20 * :includes ("hackerlab/strings/udstr.h"))
21 *
22 *
23 */
24
25 /*(c t_udstr :category type)
26 * typedef <unspecified> t_udstr;
27 *
28 * `t_udstr' values are mutable, resizable Unicode strings.
29 *
30 * Internally, strings of this type may be of any supported encoding
31 * form and may contain any codepoint valid in that encoding. The
32 * length of an udstr (pronounced "uhd-stir") in both encoding values
33 * and code-points is explicitly recorded.
34 */
35 struct udstr_handle
36 {
37 int refs;
38
39 uni_string str;
40 enum uni_encoding_scheme enc;
41 ustr_cv_index_t cv_len;
42 ustr_cp_index_t cp_len;
43
44 alloc_limits limits;
45 };
46
47
48 /*(menu)
49 */
50
51
52
53 /* __STDC__ prototypes for static functions */
54 static enum uni_encoding_scheme pick_fw_of (enum uni_encoding_scheme a,
55 enum uni_encoding_scheme b);
56 static void take_new_data (t_udstr dstr, t_udstr new_data);
57
58
/************************************************************************
 *(h2 "udstr Naming Conventions")
 *
 * Some `udstr' functions require users to pass string indexes or lengths
 * as parameters.  Such parameters are always expressed in units of
 * code values within the relevant encoding form.
 *
 * Functions having the suffix `_x' work by modifying their first string
 * argument rather than by returning a newly allocated string.
 *
 * Finally, `_fw' functions are the "full width" variants (see below).
 *
 */
72
/************************************************************************
 *(h2 "Conservative Width vs. Full Width Unicode String Functions")
 *
 * Normally, `udstr' functions choose the encoding form of their
 * output strings (or the strings they modify) by copying the encoding
 * of the first string argument.  Thus, for example, concatenating a
 * UTF-8 string (on the left) and a UTF-16 string (on the right) produces
 * a UTF-8 string.
 *
 * Many `udstr' functions also have a variant whose name contains the
 * suffix `_fw' ("full width").  These choose output encoding forms by
 * choosing the _narrowest_ encoding wide enough so that each
 * codepoint in the string occupies exactly one coding value.  For example,
 * the concatenation of a full-width UTF-8 string (which must contain
 * only codepoints in the range 0..127) with a full-width UTF-16 string
 * (which can contain no surrogate pair codepoints or codepoints larger
 * than 16 bits) may be either UTF-8 or UTF-16, depending on whether the
 * UTF-16 argument contains any codepoints outside of the range 0..127.
 *
 * [Add table of precise conversion rules.]
 */
94
95
96
97 /************************************************************************
98 *(h2 "Constructors")
99 *
100 */
101
102
103 /*(c udstr_save)
104 * t_udstr udstr_save (alloc_limits limits,
105 * uni_string str,
106 * enum uni_encoding_scheme enc);
107 *
108 * Create a `t_udstr' from a 0-terminated unicode string
109 * in the indicated encoding form.
110 *
111 * The internal encoding form of the resulting `t_udstr'
112 * is the same as the encoding of `str', namely, `enc'.
113 */
114 t_udstr
udstr_save(alloc_limits limits,uni_string str,enum uni_encoding_scheme enc)115 udstr_save (alloc_limits limits,
116 uni_string str,
117 enum uni_encoding_scheme enc)
118 {
119 ustr_cv_index_t len;
120 ustr_cp_index_t cp_len;
121
122 len = ustr_lengths (&cp_len, 0, str, enc);
123 return udstr_save_generic (limits, enc, len, str, enc, len, cp_len);
124 }
125
126
127 /*(c udstr_save_n)
128 * t_udstr udstr_save_n (alloc_limits limits,
129 * uni_string str,
130 * enum uni_encoding_scheme enc,
131 * ustr_cv_index_t len);
132 *
133 *
134 * Create a `t_udstr' from a unicode string
135 * in the indicated encoding form and of the
136 * indicated length (in code values).
137 *
138 * The internal encoding form of the resulting `t_udstr'
139 * is the same as the encoding of `str', namely, `enc'.
140 */
141 t_udstr
udstr_save_n(alloc_limits limits,uni_string str,enum uni_encoding_scheme enc,ustr_cv_index_t len)142 udstr_save_n (alloc_limits limits,
143 uni_string str,
144 enum uni_encoding_scheme enc,
145 ustr_cv_index_t len)
146 {
147 ustr_cp_index_t cp_len;
148
149 cp_len = ustr_cp_length_n (0, str, enc, len);
150 return udstr_save_generic (limits, enc, len, str, enc, len, cp_len);
151 }
152
153
154
155 /*(c udstr_save_fw)
156 * t_udstr udstr_save_fw (alloc_limits limits,
157 * uni_string str,
158 * enum uni_encoding_scheme enc);
159 *
160 *
161 * Create a `t_udstr' from a 0-terminated unicode string
162 * in the indicated encoding form.
163 *
164 * The internal encoding form of the resulting `t_udstr'
165 * is the narrowest among:
166 *
167 * uni_iso8859_1
168 * uni_utf16
169 * uni_utf32
170 * uni_bogus32
171 *
172 * in which the data from `str' can be represented with
173 * exactly one code value per coding point.
174 */
175 t_udstr
udstr_save_fw(alloc_limits limits,uni_string str,enum uni_encoding_scheme enc)176 udstr_save_fw (alloc_limits limits,
177 uni_string str,
178 enum uni_encoding_scheme enc)
179 {
180 ustr_cv_index_t len;
181 ustr_cp_index_t cp_len;
182 ustr_cv_index_t dest_len;
183 enum uni_encoding_scheme fw_enc;
184
185 len = ustr_lengths (&cp_len, &fw_enc, str, enc);
186 dest_len.cv = cp_len.cp;
187 return udstr_save_generic (limits, fw_enc, dest_len, str, enc, len, cp_len);
188 }
189
190
191 /*(c udstr_save_fw_n)
192 * t_udstr udstr_save_fw_n (alloc_limits limits,
193 * uni_string str,
194 * enum uni_encoding_scheme enc,
195 * ustr_cv_index_t len);
196 *
197 * Create a `t_udstr' from a unicode string
198 * in the indicated encoding form and of the
199 * indicated length (in code values).
200 *
201 * The internal encoding form of the resulting `t_udstr'
202 * is the narrowest among:
203 *
204 * uni_iso8859_1
205 * uni_utf16
206 * uni_utf32
207 * uni_bogus32
208 *
209 * in which the data from `str' can be represented with
210 * exactly one code value per coding point.
211 */
212 t_udstr
udstr_save_fw_n(alloc_limits limits,uni_string str,enum uni_encoding_scheme enc,ustr_cv_index_t len)213 udstr_save_fw_n (alloc_limits limits,
214 uni_string str,
215 enum uni_encoding_scheme enc,
216 ustr_cv_index_t len)
217 {
218 ustr_cp_index_t cp_len;
219 ustr_cv_index_t dest_len;
220 enum uni_encoding_scheme fw_enc;
221
222 (void)ustr_lengths_n (&cp_len, &fw_enc, str, enc, len);
223 dest_len.cv = cp_len.cp;
224 return udstr_save_generic (limits, fw_enc, dest_len, str, enc, len, cp_len);
225 }
226
227
228
229
230 t_udstr
udstr_save_generic(alloc_limits limits,enum uni_encoding_scheme dest_enc,ustr_cv_index_t dest_len,uni_string str,enum uni_encoding_scheme enc,ustr_cv_index_t len,ustr_cp_index_t cp_len)231 udstr_save_generic (alloc_limits limits,
232 enum uni_encoding_scheme dest_enc,
233 ustr_cv_index_t dest_len,
234 uni_string str,
235 enum uni_encoding_scheme enc,
236 ustr_cv_index_t len,
237 ustr_cp_index_t cp_len)
238 {
239 t_udstr answer;
240 size_t dest_cv_sizeof;
241
242 answer = (t_udstr)lim_malloc (limits, sizeof (*answer));
243 if (!answer)
244 return 0;
245
246 answer->refs = 1;
247 answer->enc = dest_enc;
248 answer->cv_len = dest_len;
249 answer->cp_len = cp_len;
250
251 dest_cv_sizeof = uni_cv_sizeof (dest_enc);
252
253 answer->str.raw = lim_malloc (limits, (1 + dest_len.cv) * dest_cv_sizeof);
254 if (!answer->str.raw)
255 {
256 lim_free (limits, (t_uchar *)answer);
257 answer = 0;
258 }
259 else
260 {
261 if (dest_enc == enc)
262 {
263 mem_move (answer->str.raw, str.raw, len.cv * dest_cv_sizeof);
264 mem_set0 (answer->str.raw + (dest_len.cv * dest_cv_sizeof), dest_cv_sizeof);
265 }
266 else
267 {
268 ustr_copy_n (answer->str, dest_enc, dest_len, str, enc, len);
269 }
270 }
271
272 return answer;
273 }
274
275
276
277 /************************************************************************
278 *(h2 "udstr Memory Management")
279 *
280 * `t_udstr' values are reference counted objects.
281 * Constructors return objects with a reference count
282 * of 1.
283 *
284 */
285
286 /*(c udstr_ref)
287 * void udstr_ref (t_udstr d);
288 *
289 * Increment the reference count of `d'.
290 */
291 void
udstr_ref(t_udstr d)292 udstr_ref (t_udstr d)
293 {
294 if (!d)
295 return;
296
297 ++d->refs;
298 }
299
300
/*(c udstr_unref)
 * void udstr_unref (t_udstr d);
 *
 * Decrement the reference count of `d'.
 * If it drops to 0, free all storage associated
 * with `d' (thus invalidating all references
 * to `d').
 */
309 void
udstr_unref(t_udstr d)310 udstr_unref (t_udstr d)
311 {
312 if (!d)
313 return;
314
315 if (d->refs > 1)
316 --d->refs;
317 else
318 {
319 lim_free (d->limits, d->str.raw);
320 }
321 }
322
323
324
325 /************************************************************************
326 *(h2 "udstr Deconstruction")
327 *
328 *
329 *
330 */
331
332 /*(c udstr_cv_length)
333 * ustr_cv_index_t udstr_cv_length (t_udstr dstr);
334 *
335 * Return the length of `dstr', measured in coding values.
336 */
337 ustr_cv_index_t
udstr_cv_length(t_udstr dstr)338 udstr_cv_length (t_udstr dstr)
339 {
340 return dstr->cv_len;
341 }
342
343
344 /*(c udstr_cp_length)
345 * ustr_cp_index_t udstr_cp_length (t_udstr dstr);
346 *
347 * Return the length of `dstr', measured in codepoints.
348 */
349 ustr_cp_index_t
udstr_cp_length(t_udstr dstr)350 udstr_cp_length (t_udstr dstr)
351 {
352 return dstr->cp_len;
353 }
354
355
356 /*(c udstr_encoding)
357 * enum uni_encoding_scheme udstr_encoding (t_udstr dstr);
358 *
359 * Return the encoding form used internally for `dstr'.
360 */
361 enum uni_encoding_scheme
udstr_encoding(t_udstr dstr)362 udstr_encoding (t_udstr dstr)
363 {
364 return dstr->enc;
365 }
366
367
368 /*(c udstr_str)
369 * uni_string udstr_str (t_udstr dstr);
370 *
371 * Return a pointer to the string data used internally for `dstr'.
372 * Subsequent calls to `udstr' functions with `dstr' as a parameter
373 * can invalidate the return value of this function.
374 */
375 uni_string
udstr_str(t_udstr dstr)376 udstr_str (t_udstr dstr)
377 {
378 return dstr->str;
379 }
380
381
382 /*(c udstr_cv_ref)
383 * t_unicode udstr_cv_ref (ustr_cv_index_t * pos_after,
384 * t_udstr dstr,
385 * ustr_cv_index_t pos);
386 *
387 * Return the codepoint found at the indicated
388 * code value index in `dstr'. Optionally return the
389 * codevalue index of the subsequent character.
390 */
391 t_unicode
udstr_cv_ref(ustr_cv_index_t * pos_after,t_udstr dstr,ustr_cv_index_t pos)392 udstr_cv_ref (ustr_cv_index_t * pos_after,
393 t_udstr dstr,
394 ustr_cv_index_t pos)
395 {
396 return ustr_cv_ref_n (pos_after, dstr->str, dstr->enc, dstr->cv_len, pos);
397 }
398
399
400 /*(c udstr_cp_ref)
401 * t_unicode udstr_cp_ref (ustr_cv_index_t * pos_after,
402 * t_udstr dstr,
403 * ustr_cp_index_t pos);
404 *
405 * Return the codepoint found at the indicated
406 * codepoint index in `dstr'. Optionally return the
407 * codevalue index of the subsequent character.
408 */
409 t_unicode
udstr_cp_ref(ustr_cv_index_t * pos_after,t_udstr dstr,ustr_cp_index_t pos)410 udstr_cp_ref (ustr_cv_index_t * pos_after,
411 t_udstr dstr,
412 ustr_cp_index_t pos)
413 {
414 uni_string pos_str;
415 ustr_cv_index_t pos_cv;
416
417 if (pos.cp >= dstr->cp_len.cp)
418 {
419 return 0;
420 }
421 else
422 {
423 pos_str = ustr_cp_offset_n (dstr->str, dstr->enc, dstr->cv_len, pos);
424 pos_cv = ustr_str_subtract (pos_str, dstr->str, dstr->enc);
425 return udstr_cv_ref (pos_after, dstr, pos_cv);
426 }
427 }
428
429
430 /*(c udstr_cv_set)
431 * t_udstr udstr_cv_set (alloc_limits limits,
432 * t_udstr s,
433 * ustr_cv_index_t x,
434 * t_unicode c);
435 *
436 * Return a new copy of `s' with the character at the
437 * indicated code value replaced by `c'.
438 */
439 t_udstr
udstr_cv_set(alloc_limits limits,t_udstr s,ustr_cv_index_t x,t_unicode c)440 udstr_cv_set (alloc_limits limits,
441 t_udstr s,
442 ustr_cv_index_t x,
443 t_unicode c)
444 {
445 struct udstr_handle tmp;
446 ustr_cv_index_t end;
447
448 tmp.refs = 1;
449 tmp.str.utf32 = &c;
450 tmp.enc = uni_bogus32;
451 tmp.cv_len = ustr_cv_index (1);
452 tmp.cp_len = ustr_cp_index (1);
453 tmp.limits = 0;
454
455 end = udstr_cv_inc (s, x);
456
457 return udstr_cv_replace (limits,
458 s, x, end,
459 &tmp, ustr_cv_index (0), ustr_cv_index (1));
460 }
461
462
463 /*(c udstr_cv_set_fw)
464 * t_udstr udstr_cv_set_fw (alloc_limits limits,
465 * t_udstr s,
466 * ustr_cv_index_t x,
467 * t_unicode c);
468 *
469 *
470 * Return a new copy of `s' with the character at the
471 * indicated code value replaced by `c'.
472 *
473 * The returned string uses a full-width encoding.
474 */
475 t_udstr
udstr_cv_set_fw(alloc_limits limits,t_udstr s,ustr_cv_index_t x,t_unicode c)476 udstr_cv_set_fw (alloc_limits limits,
477 t_udstr s,
478 ustr_cv_index_t x,
479 t_unicode c)
480 {
481 struct udstr_handle tmp;
482 ustr_cv_index_t end;
483
484 tmp.refs = 1;
485 tmp.str.utf32 = &c;
486 tmp.enc = uni_bogus32;
487 tmp.cv_len = ustr_cv_index (1);
488 tmp.cp_len = ustr_cp_index (1);
489 tmp.limits = 0;
490
491 end = udstr_cv_inc (s, x);
492
493 return udstr_cv_replace_fw (limits,
494 s, x, end,
495 &tmp, ustr_cv_index (0), ustr_cv_index (1));
496 }
497
498
/*(c udstr_cp_set)
 * t_udstr udstr_cp_set (alloc_limits limits,
 *                       t_udstr s,
 *                       ustr_cp_index_t x,
 *                       t_unicode c);
 *
 *
 * Return a new copy of `s' with the character at the
 * indicated codepoint index replaced by `c'.
 */
509 t_udstr
udstr_cp_set(alloc_limits limits,t_udstr s,ustr_cp_index_t x,t_unicode c)510 udstr_cp_set (alloc_limits limits,
511 t_udstr s,
512 ustr_cp_index_t x,
513 t_unicode c)
514 {
515 struct udstr_handle tmp;
516
517 tmp.refs = 1;
518 tmp.str.utf32 = &c;
519 tmp.enc = uni_bogus32;
520 tmp.cv_len = ustr_cv_index (1);
521 tmp.cp_len = ustr_cp_index (1);
522 tmp.limits = 0;
523
524 return udstr_cp_replace (limits,
525 s, x, ustr_cp_index (x.cp + 1),
526 &tmp, ustr_cp_index (0), ustr_cp_index (1));
527 }
528
529
/*(c udstr_cp_set_fw)
 * t_udstr udstr_cp_set_fw (alloc_limits limits,
 *                          t_udstr s,
 *                          ustr_cp_index_t x,
 *                          t_unicode c);
 *
 *
 * Return a new copy of `s' with the character at the
 * indicated codepoint index replaced by `c'.
 *
 * The returned string uses a full-width encoding.
 */
542 t_udstr
udstr_cp_set_fw(alloc_limits limits,t_udstr s,ustr_cp_index_t x,t_unicode c)543 udstr_cp_set_fw (alloc_limits limits,
544 t_udstr s,
545 ustr_cp_index_t x,
546 t_unicode c)
547 {
548 struct udstr_handle tmp;
549
550 tmp.refs = 1;
551 tmp.str.utf32 = &c;
552 tmp.enc = uni_bogus32;
553 tmp.cv_len = ustr_cv_index (1);
554 tmp.cp_len = ustr_cp_index (1);
555 tmp.limits = 0;
556
557 return udstr_cp_replace_fw (limits,
558 s, x, ustr_cp_index (x.cp + 1),
559 &tmp, ustr_cp_index (0), ustr_cp_index (1));
560 }
561
562
563
564
565
566 /************************************************************************
567 *(h2 "Full-width Conversion")
568 *
569 */
570
571
572 /*(c udstr_fw_x)
573 * t_udstr udstr_fw_x (t_udstr d);
574 *
575 * Modify (if necessary) `d' to be of a narrowest encoding such that
576 * each codepoint in `d' occupies exactly one code value.
577 */
578 t_udstr
udstr_fw_x(t_udstr d)579 udstr_fw_x (t_udstr d)
580 {
581 enum uni_encoding_scheme fw_enc;
582
583 (void)ustr_cp_length_n (&fw_enc, d->str, d->enc, d->cv_len);
584
585 if (d->enc == fw_enc)
586 return d;
587
588 {
589 ustr_cv_index_t fw_len;
590 t_udstr new_data = 0;
591
592 fw_len = ustr_cv_length_in_encoding_n (fw_enc, d->str, d->enc, d->cv_len);
593
594 new_data = udstr_save_generic (d->limits, fw_enc, fw_len, d->str, d->enc, d->cv_len, d->cp_len);
595 if (!new_data)
596 return 0;
597
598 take_new_data (d, new_data);
599
600 return d;
601 }
602 }
603
604
605
606 /************************************************************************
607 *(h2 "udstr String Copying")
608 *
609 *
610 *
611 */
612
613 /*(c udstr_copy)
614 * t_udstr udstr_copy (alloc_limits limits, t_udstr dstr);
615 *
616 * Allocate a fresh copy of `dstr'.
617 */
618 t_udstr
udstr_copy(alloc_limits limits,t_udstr dstr)619 udstr_copy (alloc_limits limits, t_udstr dstr)
620 {
621 return udstr_save_generic (limits, dstr->enc, dstr->cv_len, dstr->str, dstr->enc, dstr->cv_len, dstr->cp_len);
622 }
623
624
625 /*(c udstr_copy_fw)
626 * t_udstr udstr_copy_fw (alloc_limits limits, t_udstr dstr);
627 *
628 * Allocate a fresh copy of `dstr', converting it (if necessary)
629 * to a full-width encoding.
630 */
631 t_udstr
udstr_copy_fw(alloc_limits limits,t_udstr dstr)632 udstr_copy_fw (alloc_limits limits, t_udstr dstr)
633 {
634 return udstr_save_fw_n (limits, dstr->str, dstr->enc, dstr->cv_len);
635 }
636
637
638 /************************************************************************
639 *(h2 "udstr Substrings")
640 *
641 *
642 *
643 */
644
645
646 /*(c udstr_cv_substr)
647 * t_udstr udstr_cv_substr (alloc_limits limits,
648 * t_udstr dstr,
649 * ustr_cv_index_t from,
650 * ustr_cv_index_t to);
651 *
652 * Return a freshly allocated substring of `dstr'
653 * containing the indicated half-open range of characters
654 * (measured in code values).
655 */
656 t_udstr
udstr_cv_substr(alloc_limits limits,t_udstr dstr,ustr_cv_index_t from,ustr_cv_index_t to)657 udstr_cv_substr (alloc_limits limits,
658 t_udstr dstr,
659 ustr_cv_index_t from,
660 ustr_cv_index_t to)
661 {
662 uni_string str;
663 ustr_cv_index_t len;
664
665 str = ustr_cv_offset (dstr->str, dstr->enc, from);
666 len = ustr_cv_index (to.cv - from.cv);
667
668 return udstr_save_n (limits, str, dstr->enc, len);
669 }
670
671
672 /*(c udstr_cv_substr_x)
673 * t_udstr udstr_cv_substr_x (t_udstr dstr,
674 * ustr_cv_index_t from,
675 * ustr_cv_index_t to);
676 *
677 * Modify `dstr' to contain only its indicated
678 * substring.
679 */
680 t_udstr
udstr_cv_substr_x(t_udstr dstr,ustr_cv_index_t from,ustr_cv_index_t to)681 udstr_cv_substr_x (t_udstr dstr,
682 ustr_cv_index_t from,
683 ustr_cv_index_t to)
684 {
685 t_udstr almost_answer;
686
687 almost_answer = udstr_cv_substr (dstr->limits, dstr, from, to);
688
689 if (!almost_answer)
690 {
691 return 0;
692 }
693 else
694 {
695 take_new_data (dstr, almost_answer);
696
697 return dstr;
698 }
699 }
700
701
/*(c udstr_cv_substr_fw)
 * t_udstr udstr_cv_substr_fw (alloc_limits limits,
 *                             t_udstr dstr,
 *                             ustr_cv_index_t from,
 *                             ustr_cv_index_t to);
 *
 *
 * Return a freshly allocated substring of `dstr'
 * containing the indicated half-open range of characters
 * (measured in code values).
 *
 * The returned string uses a full-width encoding (all of
 * its codepoints fit in exactly one code value).
 */
716 t_udstr
udstr_cv_substr_fw(alloc_limits limits,t_udstr dstr,ustr_cv_index_t from,ustr_cv_index_t to)717 udstr_cv_substr_fw (alloc_limits limits,
718 t_udstr dstr,
719 ustr_cv_index_t from,
720 ustr_cv_index_t to)
721 {
722 uni_string str;
723 ustr_cv_index_t len;
724
725 str = ustr_cv_offset (dstr->str, dstr->enc, from);
726 len = ustr_cv_index (to.cv - from.cv);
727
728 return udstr_save_fw_n (limits, str, dstr->enc, len);
729 }
730
731
732 /*(c udstr_cv_substr_fw_x)
733 * t_udstr udstr_cv_substr_fw_x (t_udstr dstr,
734 * ustr_cv_index_t from,
735 * ustr_cv_index_t to);
736 *
737 * Modify `dstr' to contain only its indicated substring
738 * and to be in a full-width encoding.
739 */
740 t_udstr
udstr_cv_substr_fw_x(t_udstr dstr,ustr_cv_index_t from,ustr_cv_index_t to)741 udstr_cv_substr_fw_x (t_udstr dstr,
742 ustr_cv_index_t from,
743 ustr_cv_index_t to)
744 {
745 t_udstr almost_answer;
746
747 almost_answer = udstr_cv_substr_fw (dstr->limits, dstr, from, to);
748
749 if (!almost_answer)
750 {
751 return 0;
752 }
753 else
754 {
755 take_new_data (dstr, almost_answer);
756
757 return dstr;
758 }
759 }
760
761
762 /*(c udstr_cp_substr)
763 * t_udstr udstr_cp_substr (alloc_limits limits,
764 * t_udstr dstr,
765 * ustr_cp_index_t from,
766 * ustr_cp_index_t to);
767 *
768 * Return a freshly allocated substring of `dstr'
769 * containing the indicated half-open range of characters
770 * (measured in codepoints).
771 */
772 t_udstr
udstr_cp_substr(alloc_limits limits,t_udstr dstr,ustr_cp_index_t from,ustr_cp_index_t to)773 udstr_cp_substr (alloc_limits limits,
774 t_udstr dstr,
775 ustr_cp_index_t from,
776 ustr_cp_index_t to)
777 {
778 ustr_cv_index_t f_v;
779 ustr_cv_index_t t_v;
780
781 udstr_cp_to_cv_range (&f_v, &t_v, dstr, from, to);
782 return udstr_cv_substr (limits, dstr, f_v, t_v);
783 }
784
785
786 /*(c udstr_cp_substr_x)
787 * t_udstr udstr_cp_substr_x (t_udstr dstr,
788 * ustr_cp_index_t from,
789 * ustr_cp_index_t to);
790 *
791 * Modify `dstr' to contain only its indicated substring.
792 */
793 t_udstr
udstr_cp_substr_x(t_udstr dstr,ustr_cp_index_t from,ustr_cp_index_t to)794 udstr_cp_substr_x (t_udstr dstr,
795 ustr_cp_index_t from,
796 ustr_cp_index_t to)
797 {
798 t_udstr almost_answer;
799
800 almost_answer = udstr_cp_substr (dstr->limits, dstr, from, to);
801
802 if (!almost_answer)
803 {
804 return 0;
805 }
806 else
807 {
808 take_new_data (dstr, almost_answer);
809
810 return dstr;
811 }
812 }
813
814
/*(c udstr_cp_substr_fw)
 * t_udstr udstr_cp_substr_fw (alloc_limits limits,
 *                             t_udstr dstr,
 *                             ustr_cp_index_t from,
 *                             ustr_cp_index_t to);
 *
 * Return a freshly allocated substring of `dstr'
 * containing the indicated half-open range of characters
 * (measured in codepoints).
 *
 * The returned string uses a full-width encoding (all of
 * its codepoints fit in exactly one code value).
 */
828 t_udstr
udstr_cp_substr_fw(alloc_limits limits,t_udstr dstr,ustr_cp_index_t from,ustr_cp_index_t to)829 udstr_cp_substr_fw (alloc_limits limits,
830 t_udstr dstr,
831 ustr_cp_index_t from,
832 ustr_cp_index_t to)
833 {
834 ustr_cv_index_t f_v;
835 ustr_cv_index_t t_v;
836
837 udstr_cp_to_cv_range (&f_v, &t_v, dstr, from, to);
838 return udstr_cv_substr_fw (limits, dstr, f_v, t_v);
839 }
840
841
842 /*(c udstr_cp_substr_fw_x)
843 * t_udstr udstr_cp_substr_fw_x (t_udstr dstr,
844 * ustr_cp_index_t from,
845 * ustr_cp_index_t to);
846 *
847 * Modify `dstr' to contain only its indicated substring
848 * and to be in a full-width encoding.
849 */
850 t_udstr
udstr_cp_substr_fw_x(t_udstr dstr,ustr_cp_index_t from,ustr_cp_index_t to)851 udstr_cp_substr_fw_x (t_udstr dstr,
852 ustr_cp_index_t from,
853 ustr_cp_index_t to)
854 {
855 t_udstr almost_answer;
856
857 almost_answer = udstr_cp_substr_fw (dstr->limits, dstr, from, to);
858
859 if (!almost_answer)
860 {
861 return 0;
862 }
863 else
864 {
865 take_new_data (dstr, almost_answer);
866
867 return dstr;
868 }
869 }
870
871
872
873
874
/************************************************************************
 *(h2 "udstr String Concatenation")
 *
 *
 *
 */
881
882
883
884 /*(c udstr_append)
885 * t_udstr udstr_append (alloc_limits limits,
886 * t_udstr a_dstr,
887 * t_udstr b_dstr);
888 *
889 * Return a freshly allocated string containing
890 * the concatenation of the argument strings.
891 */
892 t_udstr
udstr_append(alloc_limits limits,t_udstr a_dstr,t_udstr b_dstr)893 udstr_append (alloc_limits limits,
894 t_udstr a_dstr,
895 t_udstr b_dstr)
896 {
897 t_udstr answer = 0;
898
899 answer = udstr_copy (limits, a_dstr);
900
901 if (answer)
902 {
903 if (!udstr_append_x (answer, b_dstr))
904 {
905 udstr_unref (answer);
906 answer = 0;
907 }
908 }
909
910 return answer;
911 }
912
913
914 /*(c udstr_append_x)
915 * t_udstr udstr_append_x (t_udstr a_dstr,
916 * t_udstr b_dstr);
917 *
918 * Modify `a_dstr' to contain the concatenation
919 * of `a_dstr' and `b_dstr'.
920 */
921 t_udstr
udstr_append_x(t_udstr a_dstr,t_udstr b_dstr)922 udstr_append_x (t_udstr a_dstr,
923 t_udstr b_dstr)
924 {
925 ustr_cv_index_t b_len_in_a_enc;
926 ustr_cv_index_t total_len;
927 size_t a_enc_size;
928 size_t proper_a_size;
929 uni_string b_dest;
930
931 if (a_dstr->enc == b_dstr->enc)
932 {
933 b_len_in_a_enc = b_dstr->cv_len;
934 }
935 else
936 {
937 b_len_in_a_enc = ustr_cv_length_in_encoding_n (a_dstr->enc, b_dstr->str, b_dstr->enc, b_dstr->cv_len);
938 }
939
940 total_len = ustr_cv_index (b_len_in_a_enc.cv + a_dstr->cv_len.cv);
941
942 a_enc_size = uni_cv_sizeof (a_dstr->enc);
943 proper_a_size = a_dstr->cv_len.cv * a_enc_size;
944
945 {
946 t_uchar * resized;
947
948 resized = lim_realloc (a_dstr->limits, a_dstr->str.raw, proper_a_size);
949 if (!resized)
950 return 0;
951
952 a_dstr->str.raw = resized;
953 }
954
955 b_dest = ustr_cv_offset_n (0, a_dstr->str, a_dstr->enc, a_dstr->cv_len, a_dstr->cv_len);
956
957 ustr_copy_n (b_dest, a_dstr->enc, b_len_in_a_enc, b_dstr->str, b_dstr->enc, b_dstr->cv_len);
958
959 a_dstr->cp_len = ustr_cp_length_n (0, a_dstr->str, a_dstr->enc, a_dstr->cv_len);
960
961 return a_dstr;
962 }
963
964
965 /*(c udstr_append_fw)
966 * t_udstr udstr_append_fw (alloc_limits limits,
967 * t_udstr a_dstr,
968 * t_udstr b_dstr);
969 *
970 * Return a freshly allocated string containing
971 * the concatenation of the argument strings.
972 *
973 * The new string uses a full-width encoding.
974 */
975 t_udstr
udstr_append_fw(alloc_limits limits,t_udstr a_dstr,t_udstr b_dstr)976 udstr_append_fw (alloc_limits limits,
977 t_udstr a_dstr,
978 t_udstr b_dstr)
979 {
980 t_udstr answer = 0;
981
982 answer = udstr_copy (limits, a_dstr);
983
984 if (answer)
985 {
986 if (!udstr_append_fw_x (answer, b_dstr))
987 {
988 udstr_unref (answer);
989 answer = 0;
990 }
991 }
992
993 return answer;
994 }
995
996
997 /*(c udstr_append_fw_x)
998 * t_udstr udstr_append_fw_x (t_udstr a_dstr,
999 * t_udstr b_dstr);
1000 *
1001 * Modify `a_dstr' to contain the concatenation
1002 * of `a_dstr' and `b_dstr' and to use a full-width
1003 * encoding.
1004 */
1005 t_udstr
udstr_append_fw_x(t_udstr a_dstr,t_udstr b_dstr)1006 udstr_append_fw_x (t_udstr a_dstr,
1007 t_udstr b_dstr)
1008 {
1009 enum uni_encoding_scheme a_fw;
1010 enum uni_encoding_scheme b_fw;
1011 enum uni_encoding_scheme best_encoding;
1012
1013 ustr_lengths_n (0, &a_fw, a_dstr->str, a_dstr->enc, a_dstr->cv_len);
1014 ustr_lengths_n (0, &b_fw, a_dstr->str, a_dstr->enc, a_dstr->cv_len);
1015
1016 best_encoding = pick_fw_of (a_fw, b_fw);
1017
1018 if (a_dstr->enc != best_encoding)
1019 {
1020 t_udstr new_a;
1021
1022 new_a = udstr_save_generic (a_dstr->limits,
1023 best_encoding,
1024 ustr_cv_index (a_dstr->cp_len.cp),
1025 a_dstr->str,
1026 a_dstr->enc,
1027 a_dstr->cv_len,
1028 a_dstr->cp_len);
1029
1030 if (!new_a)
1031 return 0;
1032
1033 a_dstr->str.raw = new_a->str.raw;
1034 a_dstr->enc = new_a->enc;
1035
1036 lim_free (new_a->limits, new_a);
1037
1038 /* new_a dropped deliberately --- a_dstr took it over
1039 */
1040 }
1041
1042 return udstr_append_x (a_dstr, b_dstr);
1043 }
1044
1045
1046
1047
1048
1049 /************************************************************************
1050 *(h2 "udstr Substring Deletion")
1051 *
1052 *
1053 *
1054 */
1055
1056
1057 /*(c udstr_cv_delete)
1058 * t_udstr udstr_cv_delete (alloc_limits limits,
1059 * t_udstr d,
1060 * ustr_cv_index_t from,
1061 * ustr_cv_index_t to);
1062 *
1063 * Return a new string which is a copy of `d' with
1064 * code values in the half-open range `from' to `to'
1065 * removed.
1066 */
1067 t_udstr
udstr_cv_delete(alloc_limits limits,t_udstr d,ustr_cv_index_t from,ustr_cv_index_t to)1068 udstr_cv_delete (alloc_limits limits,
1069 t_udstr d,
1070 ustr_cv_index_t from,
1071 ustr_cv_index_t to)
1072 {
1073 uni_string right_source;
1074 ustr_cv_index_t right_len;
1075 ustr_cv_index_t total_len;
1076 ustr_cv_index_t left_len;
1077 uni_string right_dest;
1078 t_udstr answer = 0;
1079
1080
1081 right_source = ustr_cv_offset_n (&right_len, d->str, d->enc, d->cv_len, to);
1082 left_len = from;
1083 total_len = ustr_cv_index (left_len.cv + right_len.cv);
1084
1085
1086 answer = udstr_save_generic (limits, d->enc, total_len, d->str, d->enc, d->cv_len, d->cp_len);
1087 if (!answer)
1088 return 0;
1089 /*
1090 * answer is missing half of its data and has the wrong cp_len now.
1091 */
1092
1093 right_dest = ustr_cv_offset_n (0, answer->str, answer->enc, answer->cv_len, left_len);
1094 ustr_copy_n (right_dest, answer->enc, right_len, right_source, d->enc, right_len);
1095
1096 answer->cp_len = ustr_cp_length_n (0, answer->str, answer->enc, answer->cv_len);
1097
1098 return answer;
1099 }
1100
1101
1102 /*(c udstr_cp_delete)
1103 * t_udstr udstr_cp_delete (alloc_limits limits,
1104 * t_udstr d,
1105 * ustr_cp_index_t from,
1106 * ustr_cp_index_t to);
1107 *
1108 * Return a new string which is a copy of `d' with
1109 * codepoints in the half-open range `from' to `to'
1110 * removed.
1111 */
1112 t_udstr
udstr_cp_delete(alloc_limits limits,t_udstr d,ustr_cp_index_t from,ustr_cp_index_t to)1113 udstr_cp_delete (alloc_limits limits,
1114 t_udstr d,
1115 ustr_cp_index_t from,
1116 ustr_cp_index_t to)
1117 {
1118 ustr_cv_index_t fv;
1119 ustr_cv_index_t tv;
1120
1121 udstr_cp_to_cv_range (&fv, &tv, d, from, to);
1122 return udstr_cv_delete (limits, d, fv, tv);
1123 }
1124
1125
1126 /*(c udstr_cv_delete_x)
1127 * t_udstr udstr_cv_delete_x (t_udstr d,
1128 * ustr_cv_index_t from,
1129 * ustr_cv_index_t to);
1130 *
1131 * Modify `d' by removing code values in the half-open range `from' to
1132 * `to'.
1133 */
1134 t_udstr
udstr_cv_delete_x(t_udstr d,ustr_cv_index_t from,ustr_cv_index_t to)1135 udstr_cv_delete_x (t_udstr d,
1136 ustr_cv_index_t from,
1137 ustr_cv_index_t to)
1138 {
1139 size_t cv_sizeof;
1140 size_t from_offset;
1141 size_t to_offset;
1142 size_t current_length;
1143
1144 cv_sizeof = uni_cv_sizeof (d->enc);
1145
1146 from_offset = from.cv * cv_sizeof;
1147 to_offset = to.cv * cv_sizeof;
1148 current_length = d->cv_len.cv * cv_sizeof;
1149
1150 mem_move (d->str.raw + from_offset, d->str.raw + to_offset, current_length - to_offset);
1151
1152 d->str.raw = lim_realloc (d->limits, d->str.raw, current_length - (to_offset - from_offset));
1153 d->cv_len.cv = (ssize_t)(current_length - (to_offset - from_offset));
1154
1155 return d;
1156 }
1157
1158
1159 /*(c udstr_cp_delete_x)
1160 * t_udstr udstr_cp_delete_x (t_udstr d,
1161 * ustr_cp_index_t from,
1162 * ustr_cp_index_t to);
1163 *
1164 * Modify `d' by removing codepoints in the half-open range `from' to
1165 * `to'.
1166 */
1167 t_udstr
udstr_cp_delete_x(t_udstr d,ustr_cp_index_t from,ustr_cp_index_t to)1168 udstr_cp_delete_x (t_udstr d,
1169 ustr_cp_index_t from,
1170 ustr_cp_index_t to)
1171 {
1172 ustr_cv_index_t fv;
1173 ustr_cv_index_t tv;
1174
1175 udstr_cp_to_cv_range (&fv, &tv, d, from, to);
1176 return udstr_cv_delete_x (d, fv, tv);
1177 }
1178
1179
1180 /*(c udstr_cv_delete_fw)
1181 * t_udstr udstr_cv_delete_fw (alloc_limits limits,
1182 * t_udstr d,
1183 * ustr_cv_index_t from,
1184 * ustr_cv_index_t to);
1185 *
1186 * Return a new string which is a copy of `d' with
1187 * code values in the half-open range `from' to `to'
1188 * removed.
1189 *
1190 * The new string uses a full-width encoding.
1191 */
1192 t_udstr
udstr_cv_delete_fw(alloc_limits limits,t_udstr d,ustr_cv_index_t from,ustr_cv_index_t to)1193 udstr_cv_delete_fw (alloc_limits limits,
1194 t_udstr d,
1195 ustr_cv_index_t from,
1196 ustr_cv_index_t to)
1197 {
1198 uni_string right_source;
1199 ustr_cv_index_t right_len;
1200 ustr_cp_index_t right_cp_len;
1201 enum uni_encoding_scheme right_fw;
1202 ustr_cp_index_t left_cp_len;
1203 enum uni_encoding_scheme left_fw;
1204 enum uni_encoding_scheme actual_fw;
1205 ustr_cv_index_t right_len_fw;
1206 ustr_cv_index_t left_len_fw;
1207 ustr_cv_index_t len_fw;
1208 ustr_cp_index_t cp_len;
1209 t_udstr answer = 0;
1210 uni_string right_dest;
1211 size_t d_cv_sizeof;
1212 size_t d_right_sizeof;
1213
1214
1215 right_source = ustr_cv_offset_n (&right_len, d->str, d->enc, d->cv_len, to);
1216 right_cp_len = ustr_cp_length_n (&right_fw, right_source, d->enc, right_len);
1217
1218 left_cp_len = ustr_cp_length_n (&left_fw, d->str, d->enc, from);
1219
1220 actual_fw = pick_fw_of (right_fw, left_fw);
1221
1222 if (actual_fw == d->enc)
1223 {
1224 right_len_fw = right_len;
1225 left_len_fw = from;
1226 }
1227 else
1228 {
1229 right_len_fw = ustr_cv_length_in_encoding_n (actual_fw, right_source, d->enc, right_len);
1230 left_len_fw = ustr_cv_length_in_encoding_n (actual_fw, d->str, d->enc, from);
1231 }
1232
1233 len_fw = ustr_cv_index (right_len_fw.cv + left_len_fw.cv);
1234 cp_len = ustr_cp_index (right_cp_len.cp + left_cp_len.cp);
1235
1236 answer = udstr_save_generic (limits, actual_fw, len_fw, d->str, d->enc, from, left_cp_len);
1237 if (!answer)
1238 return 0;
1239
1240 answer->cp_len = cp_len;
1241
1242 right_dest = ustr_cv_offset_n (0, answer->str, answer->enc, answer->cv_len, left_len_fw);
1243 d_cv_sizeof = uni_cv_sizeof (d->enc);
1244 d_right_sizeof = right_len.cv * d_cv_sizeof;
1245 mem_move (right_dest.raw, right_source.raw, d_right_sizeof);
1246
1247 return answer;
1248 }
1249
1250
1251 /*(c udstr_cp_delete_fw)
1252 * t_udstr udstr_cp_delete_fw (alloc_limits limits,
1253 * t_udstr d,
1254 * ustr_cp_index_t from,
1255 * ustr_cp_index_t to);
1256 *
1257 * Return a new string which is a copy of `d' with
1258 * codepoints in the half-open range `from' to `to'
1259 * removed.
1260 *
1261 * The new string uses a full-width encoding.
1262 */
1263 t_udstr
udstr_cp_delete_fw(alloc_limits limits,t_udstr d,ustr_cp_index_t from,ustr_cp_index_t to)1264 udstr_cp_delete_fw (alloc_limits limits,
1265 t_udstr d,
1266 ustr_cp_index_t from,
1267 ustr_cp_index_t to)
1268 {
1269 ustr_cv_index_t fv;
1270 ustr_cv_index_t tv;
1271
1272 udstr_cp_to_cv_range (&fv, &tv, d, from, to);
1273 return udstr_cv_delete_fw (limits, d, fv, tv);
1274 }
1275
1276
1277 /*(c udstr_cv_delete_fw_x)
1278 * t_udstr udstr_cv_delete_fw_x (t_udstr d,
1279 * ustr_cv_index_t from,
1280 * ustr_cv_index_t to);
1281 *
1282 * Modify `d' by removing code values in the half-open range `from' to
1283 * `to' and ensuring that `d' uses a full-width encoding.
1284 */
1285 t_udstr
udstr_cv_delete_fw_x(t_udstr d,ustr_cv_index_t from,ustr_cv_index_t to)1286 udstr_cv_delete_fw_x (t_udstr d,
1287 ustr_cv_index_t from,
1288 ustr_cv_index_t to)
1289 {
1290 t_udstr almost_answer = 0;
1291
1292 almost_answer = udstr_cv_delete_fw (d->limits, d, from, to);
1293
1294 if (!almost_answer)
1295 return 0;
1296
1297 take_new_data (d, almost_answer);
1298
1299 return d;
1300 }
1301
1302
1303 /*(c udstr_cp_delete_fw_x)
1304 * t_udstr udstr_cp_delete_fw_x (t_udstr d,
1305 * ustr_cp_index_t from,
1306 * ustr_cp_index_t to);
1307 *
1308 * Modify `d' by removing codepoints in the half-open range `from' to
1309 * `to' and ensuring that `d' uses a full-width encoding.
1310 */
1311 t_udstr
udstr_cp_delete_fw_x(t_udstr d,ustr_cp_index_t from,ustr_cp_index_t to)1312 udstr_cp_delete_fw_x (t_udstr d,
1313 ustr_cp_index_t from,
1314 ustr_cp_index_t to)
1315 {
1316 ustr_cv_index_t fv;
1317 ustr_cv_index_t tv;
1318
1319 udstr_cp_to_cv_range (&fv, &tv, d, from, to);
1320 return udstr_cv_delete_fw_x (d, fv, tv);
1321 }
1322
1323
1324
1325
1326
1327 /************************************************************************
1328 *(h2 "udstr Substring Replacement")
1329 *
1330 *
1331 *
1332 */
1333
1334
1335 /*(c udstr_cv_replace)
1336 * t_udstr udstr_cv_replace (alloc_limits limits,
1337 * t_udstr into,
1338 * ustr_cv_index_t i_from,
1339 * ustr_cv_index_t i_to,
1340 * t_udstr from,
1341 * ustr_cv_index_t f_from,
1342 * ustr_cv_index_t f_to);
1343 *
1344 * Return a new string in which the indicated substring
1345 * of `into' is replaced by the indicated substring of `from'.
1346 */
1347 t_udstr
udstr_cv_replace(alloc_limits limits,t_udstr into,ustr_cv_index_t i_from,ustr_cv_index_t i_to,t_udstr from,ustr_cv_index_t f_from,ustr_cv_index_t f_to)1348 udstr_cv_replace (alloc_limits limits,
1349 t_udstr into,
1350 ustr_cv_index_t i_from,
1351 ustr_cv_index_t i_to,
1352 t_udstr from,
1353 ustr_cv_index_t f_from,
1354 ustr_cv_index_t f_to)
1355 {
1356 t_udstr answer = 0;
1357 t_udstr middle = 0;
1358 t_udstr end = 0;
1359
1360 answer = udstr_cv_substr (limits, into, ustr_cv_index (0), i_from);
1361 middle = udstr_cv_substr (limits, from, f_from, f_to);
1362 end = udstr_cv_substr (limits, into, i_to, into->cv_len);
1363
1364 if (!(answer && middle && end)
1365 || !udstr_append_x (answer, middle)
1366 || !udstr_append_x (answer, end))
1367 {
1368 udstr_unref (answer);
1369 udstr_unref (middle);
1370 udstr_unref (end);
1371 answer = 0;
1372 }
1373 else
1374 {
1375 udstr_unref (middle);
1376 udstr_unref (end);
1377 }
1378
1379 return answer;
1380 }
1381
1382
1383 /*(c udstr_cp_replace)
1384 * t_udstr udstr_cp_replace (alloc_limits limits,
1385 * t_udstr into,
1386 * ustr_cp_index_t i_from,
1387 * ustr_cp_index_t i_to,
1388 * t_udstr from,
1389 * ustr_cp_index_t f_from,
1390 * ustr_cp_index_t f_to);
1391 *
1392 * Return a new string in which the indicated substring
1393 * of `into' is replaced by the indicated substring of `from'.
1394 */
1395 t_udstr
udstr_cp_replace(alloc_limits limits,t_udstr into,ustr_cp_index_t i_from,ustr_cp_index_t i_to,t_udstr from,ustr_cp_index_t f_from,ustr_cp_index_t f_to)1396 udstr_cp_replace (alloc_limits limits,
1397 t_udstr into,
1398 ustr_cp_index_t i_from,
1399 ustr_cp_index_t i_to,
1400 t_udstr from,
1401 ustr_cp_index_t f_from,
1402 ustr_cp_index_t f_to)
1403 {
1404 ustr_cv_index_t i_f;
1405 ustr_cv_index_t i_t;
1406 ustr_cv_index_t f_f;
1407 ustr_cv_index_t f_t;
1408
1409 udstr_cp_to_cv_range (&i_f, &i_t, into, i_from, i_to);
1410 udstr_cp_to_cv_range (&f_f, &f_t, from, f_from, f_to);
1411
1412 return udstr_cv_replace (limits, into, i_f, i_t, from, f_f, f_t);
1413 }
1414
1415
1416
1417
1418 /*(c udstr_cv_replace_x)
1419 * t_udstr udstr_cv_replace_x (t_udstr into,
1420 * ustr_cv_index_t i_from,
1421 * ustr_cv_index_t i_to,
1422 * t_udstr from,
1423 * ustr_cv_index_t f_from,
1424 * ustr_cv_index_t f_to);
1425 *
1426 * Modify `into' by replacing the indicated substring with
1427 * the indicated substring of `from'.
1428 */
1429 t_udstr
udstr_cv_replace_x(t_udstr into,ustr_cv_index_t i_from,ustr_cv_index_t i_to,t_udstr from,ustr_cv_index_t f_from,ustr_cv_index_t f_to)1430 udstr_cv_replace_x (t_udstr into,
1431 ustr_cv_index_t i_from,
1432 ustr_cv_index_t i_to,
1433 t_udstr from,
1434 ustr_cv_index_t f_from,
1435 ustr_cv_index_t f_to)
1436 {
1437 t_udstr almost_answer;
1438
1439 almost_answer = udstr_cv_replace (into->limits, into, i_from, i_to, from, f_from, f_to);
1440
1441 if (!almost_answer)
1442 return 0;
1443
1444 take_new_data (into, almost_answer);
1445 return into;
1446 }
1447
1448
1449 /*(c udstr_cp_replace_x)
1450 * t_udstr udstr_cp_replace_x (t_udstr into,
1451 * ustr_cp_index_t i_from,
1452 * ustr_cp_index_t i_to,
1453 * t_udstr from,
1454 * ustr_cp_index_t f_from,
1455 * ustr_cp_index_t f_to);
1456 *
1457 * Modify `into' by replacing the indicated substring with
1458 * the indicated substring of `from'.
1459 */
1460 t_udstr
udstr_cp_replace_x(t_udstr into,ustr_cp_index_t i_from,ustr_cp_index_t i_to,t_udstr from,ustr_cp_index_t f_from,ustr_cp_index_t f_to)1461 udstr_cp_replace_x (t_udstr into,
1462 ustr_cp_index_t i_from,
1463 ustr_cp_index_t i_to,
1464 t_udstr from,
1465 ustr_cp_index_t f_from,
1466 ustr_cp_index_t f_to)
1467 {
1468 ustr_cv_index_t i_f;
1469 ustr_cv_index_t i_t;
1470 ustr_cv_index_t f_f;
1471 ustr_cv_index_t f_t;
1472
1473 udstr_cp_to_cv_range (&i_f, &i_t, into, i_from, i_to);
1474 udstr_cp_to_cv_range (&f_f, &f_t, from, f_from, f_to);
1475
1476 return udstr_cv_replace_x (into, i_f, i_t, from, f_f, f_t);
1477 }
1478
1479
1480 /*(c udstr_cv_replace_fw)
1481 * t_udstr udstr_cv_replace_fw (alloc_limits limits,
1482 * t_udstr into,
1483 * ustr_cv_index_t i_from,
1484 * ustr_cv_index_t i_to,
1485 * t_udstr from,
1486 * ustr_cv_index_t f_from,
1487 * ustr_cv_index_t f_to);
1488 *
1489 * Return a new string in which the indicated substring
1490 * of `into' is replaced by the indicated substring of `from'.
1491 *
1492 * The returned string uses a full-width encoding.
1493 */
1494 t_udstr
udstr_cv_replace_fw(alloc_limits limits,t_udstr into,ustr_cv_index_t i_from,ustr_cv_index_t i_to,t_udstr from,ustr_cv_index_t f_from,ustr_cv_index_t f_to)1495 udstr_cv_replace_fw (alloc_limits limits,
1496 t_udstr into,
1497 ustr_cv_index_t i_from,
1498 ustr_cv_index_t i_to,
1499 t_udstr from,
1500 ustr_cv_index_t f_from,
1501 ustr_cv_index_t f_to)
1502 {
1503 t_udstr answer = 0;
1504 t_udstr middle = 0;
1505 t_udstr end = 0;
1506
1507 answer = udstr_cv_substr_fw (limits, into, ustr_cv_index (0), i_from);
1508 middle = udstr_cv_substr (limits, from, f_from, f_to);
1509 end = udstr_cv_substr (limits, into, i_to, into->cv_len);
1510
1511 if (!(answer && middle && end)
1512 || !udstr_append_fw_x (answer, middle)
1513 || !udstr_append_fw_x (answer, end))
1514 {
1515 udstr_unref (answer);
1516 udstr_unref (middle);
1517 udstr_unref (end);
1518 answer = 0;
1519 }
1520 else
1521 {
1522 udstr_unref (middle);
1523 udstr_unref (end);
1524 }
1525
1526 return answer;
1527 }
1528
1529
1530 /*(c udstr_cp_replace_fw)
1531 * t_udstr udstr_cp_replace_fw (alloc_limits limits,
1532 * t_udstr into,
1533 * ustr_cp_index_t i_from,
1534 * ustr_cp_index_t i_to,
1535 * t_udstr from,
1536 * ustr_cp_index_t f_from,
1537 * ustr_cp_index_t f_to);
1538 *
1539 * Return a new string in which the indicated substring
1540 * of `into' is replaced by the indicated substring of `from'.
1541 *
1542 * The returned string uses a full-width encoding.
1543 */
1544 t_udstr
udstr_cp_replace_fw(alloc_limits limits,t_udstr into,ustr_cp_index_t i_from,ustr_cp_index_t i_to,t_udstr from,ustr_cp_index_t f_from,ustr_cp_index_t f_to)1545 udstr_cp_replace_fw (alloc_limits limits,
1546 t_udstr into,
1547 ustr_cp_index_t i_from,
1548 ustr_cp_index_t i_to,
1549 t_udstr from,
1550 ustr_cp_index_t f_from,
1551 ustr_cp_index_t f_to)
1552 {
1553 ustr_cv_index_t i_f;
1554 ustr_cv_index_t i_t;
1555 ustr_cv_index_t f_f;
1556 ustr_cv_index_t f_t;
1557
1558 udstr_cp_to_cv_range (&i_f, &i_t, into, i_from, i_to);
1559 udstr_cp_to_cv_range (&f_f, &f_t, from, f_from, f_to);
1560
1561 return udstr_cv_replace_fw (limits, into, i_f, i_t, from, f_f, f_t);
1562 }
1563
1564
1565 /*(c udstr_cv_replace_fw_x)
1566 * t_udstr udstr_cv_replace_fw_x (t_udstr into,
1567 * ustr_cv_index_t i_from,
1568 * ustr_cv_index_t i_to,
1569 * t_udstr from,
1570 * ustr_cv_index_t f_from,
1571 * ustr_cv_index_t f_to);
1572 *
1573 * Modify `into' by replacing the indicated substring with
1574 * the indicated substring of `from' and ensuring that
1575 * `into' uses a full-width encoding.
1576 */
1577 t_udstr
udstr_cv_replace_fw_x(t_udstr into,ustr_cv_index_t i_from,ustr_cv_index_t i_to,t_udstr from,ustr_cv_index_t f_from,ustr_cv_index_t f_to)1578 udstr_cv_replace_fw_x (t_udstr into,
1579 ustr_cv_index_t i_from,
1580 ustr_cv_index_t i_to,
1581 t_udstr from,
1582 ustr_cv_index_t f_from,
1583 ustr_cv_index_t f_to)
1584 {
1585 t_udstr almost_answer;
1586
1587 almost_answer = udstr_cv_replace_fw (into->limits, into, i_from, i_to, from, f_from, f_to);
1588
1589 if (!almost_answer)
1590 return 0;
1591
1592 take_new_data (into, almost_answer);
1593 return into;
1594 }
1595
1596
1597 /*(c udstr_cp_replace_fw_x)
1598 * t_udstr udstr_cp_replace_fw_x (t_udstr into,
1599 * ustr_cp_index_t i_from,
1600 * ustr_cp_index_t i_to,
1601 * t_udstr from,
1602 * ustr_cp_index_t f_from,
1603 * ustr_cp_index_t f_to);
1604 *
1605 * Modify `into' by replacing the indicated substring with
1606 * the indicated substring of `from' and ensuring that
1607 * `into' uses a full-width encoding.
1608 */
1609 t_udstr
udstr_cp_replace_fw_x(t_udstr into,ustr_cp_index_t i_from,ustr_cp_index_t i_to,t_udstr from,ustr_cp_index_t f_from,ustr_cp_index_t f_to)1610 udstr_cp_replace_fw_x (t_udstr into,
1611 ustr_cp_index_t i_from,
1612 ustr_cp_index_t i_to,
1613 t_udstr from,
1614 ustr_cp_index_t f_from,
1615 ustr_cp_index_t f_to)
1616 {
1617 ustr_cv_index_t i_f;
1618 ustr_cv_index_t i_t;
1619 ustr_cv_index_t f_f;
1620 ustr_cv_index_t f_t;
1621
1622 udstr_cp_to_cv_range (&i_f, &i_t, into, i_from, i_to);
1623 udstr_cp_to_cv_range (&f_f, &f_t, from, f_from, f_to);
1624
1625 return udstr_cv_replace_fw_x (into, i_f, i_t, from, f_f, f_t);
1626 }
1627
1628
1629
1630
1631
1632 /************************************************************************
1633 *(h2 "udstr Index Normalization")
1634 *
1635 *
1636 *
1637 */
1638
1639
1640 /*(c udstr_cv_normalize)
1641 * ustr_cv_index_t udstr_cv_normalize (t_udstr dstr,
1642 * ustr_cv_index_t orig_index);
1643 *
1644 * Return a code value index derived from `orig_index' by adjusting
1645 * it to point to the first code value in its codepoint.
1646 */
1647 ustr_cv_index_t
udstr_cv_normalize(t_udstr dstr,ustr_cv_index_t orig_index)1648 udstr_cv_normalize (t_udstr dstr,
1649 ustr_cv_index_t orig_index)
1650 {
1651 ustr_cv_index_t index = orig_index;
1652
1653
1654 switch (dstr->enc)
1655 {
1656 default:
1657 case uni_iso8859_1:
1658 case uni_utf32:
1659 case uni_utf32be:
1660 case uni_utf32le:
1661 case uni_bogus32:
1662 case uni_bogus32be:
1663 case uni_bogus32le:
1664 {
1665 if (index.cv > dstr->cv_len.cv)
1666 index.cv = dstr->cv_len.cv;
1667 else if (index.cv < 0)
1668 index.cv = 0;
1669
1670 return index;
1671 }
1672
1673 case uni_utf8:
1674 {
1675 if (index.cv > dstr->cv_len.cv)
1676 {
1677 index.cv = dstr->cv_len.cv;
1678 return index;
1679 }
1680 else if (index.cv <= 0)
1681 {
1682 index.cv = 0;
1683 return index;
1684 }
1685 else if (!(dstr->str.utf8[index.cv] & 0x80))
1686 {
1687 return index;
1688 }
1689 else
1690 {
1691 if (0x80 == (0xC0 & dstr->str.utf8[index.cv]))
1692 {
1693 int n_suffix_bytes = 1;
1694
1695 while (1)
1696 {
1697 /* invarients:
1698 *
1699 * index.cv >= 1
1700 *
1701 * str[index.cv] is some 10xxxxxx character
1702 *
1703 * 1 <= n_suffix_bytes <= 4
1704 */
1705
1706 /* Look at the previous character to decide
1707 * what to do.
1708 */
1709 switch (0xc0 & dstr->str.utf8[index.cv - 1])
1710 {
1711 case 0x80:
1712 {
1713 /* preceeding is also a non-first multi-byte sequence
1714 * code value. This preserves one of three loop
1715 * invarients.
1716 */
1717
1718 if (n_suffix_bytes == 4)
1719 {
1720 /* Then the one to the left means that there are 5 or
1721 * more, which is illegal, so, our original index is
1722 * just pointing at an ill-formed sequence.
1723 * This preserves the second loop invarient while
1724 * n_suffix_bytes counts up to detect ill-formed sequences.
1725 */
1726 return orig_index;
1727 }
1728 else if (index.cv == 1)
1729 {
1730 /* The string starts (at index 0) in the middle of
1731 * a multi-char sequence. So, once again, our
1732 * original index is pointing at an ill-formed sequence.
1733 * This preserves the final loop invarient.
1734 */
1735 return orig_index;
1736 }
1737 else
1738 {
1739 /* All invarients being preserved and having had made
1740 * progress towards finding the start of the character
1741 * sequence:
1742 */
1743 ++n_suffix_bytes;
1744 --index.cv;
1745 continue;
1746 }
1747 }
1748 case 0x40:
1749 case 0x00:
1750 {
1751 /* The place we started at is part of an ill-formed sequence (no
1752 * 0xc0 starting character. This preserves our loop invarients.
1753 */
1754 return orig_index;
1755 }
1756
1757 case 0xC0:
1758 {
1759 /* aha! the first character of a multi-byte sequence.
1760 */
1761 --index.cv;
1762 return index;
1763 }
1764 }
1765 }
1766 }
1767 }
1768 }
1769 case uni_utf16:
1770 {
1771 if (index.cv > dstr->cv_len.cv)
1772 {
1773 index.cv = dstr->cv_len.cv;
1774 return index;
1775 }
1776 else if (index.cv <= 0)
1777 {
1778 index.cv = 0;
1779 return index;
1780 }
1781 else if (uni_is_low_surrogate (dstr->str.utf16[index.cv]))
1782 {
1783 if (uni_is_high_surrogate (dstr->str.utf16[index.cv - 1]))
1784 {
1785 --index.cv;
1786 }
1787 return index;
1788 }
1789 else
1790 return index;
1791 }
1792 case uni_utf16be:
1793 {
1794 size_t pos;
1795 t_unicode c;
1796
1797 if (index.cv > dstr->cv_len.cv)
1798 {
1799 index.cv = dstr->cv_len.cv;
1800 return index;
1801 }
1802 else if (index.cv <= 0)
1803 {
1804 index.cv = 0;
1805 return index;
1806 }
1807
1808 pos = 0;
1809 c = uni_utf16be_iscan ((t_uchar *)(dstr->str.utf16 + index.cv), &pos, (size_t)2);
1810
1811 if (uni_is_low_surrogate (c))
1812 {
1813 pos = 0;
1814 c = uni_utf16be_iscan ((t_uchar *)(dstr->str.utf16 + index.cv - 1), &pos, (size_t)2);
1815
1816 if (uni_is_high_surrogate (c))
1817 {
1818 --index.cv;
1819 }
1820 return index;
1821 }
1822 else
1823 return index;
1824 }
1825 case uni_utf16le:
1826 {
1827 size_t pos;
1828 t_unicode c;
1829
1830 if (index.cv > dstr->cv_len.cv)
1831 {
1832 index.cv = dstr->cv_len.cv;
1833 return index;
1834 }
1835 else if (index.cv <= 0)
1836 {
1837 index.cv = 0;
1838 return index;
1839 }
1840
1841 pos = 0;
1842 c = uni_utf16le_iscan ((t_uchar *)(dstr->str.utf16 + index.cv), &pos, (size_t)2);
1843
1844 if (uni_is_low_surrogate (c))
1845 {
1846 pos = 0;
1847 c = uni_utf16le_iscan ((t_uchar *)(dstr->str.utf16 + index.cv - 1), &pos, (size_t)2);
1848
1849 if (uni_is_high_surrogate (c))
1850 {
1851 --index.cv;
1852 }
1853 return index;
1854 }
1855 else
1856 return index;
1857 }
1858 }
1859 }
1860
1861
1862
1863 /************************************************************************
1864 *(h2 "udstr String Iteration")
1865 *
1866 */
1867
1868
1869
1870
1871 /*(c udstr_cv_inc)
1872 * ustr_cv_index_t udstr_cv_inc (t_udstr dstr,
1873 * ustr_cv_index_t orig_index);
1874 *
1875 * Increment `orig_index' (presumed to be the code value index
1876 * of the first code value of a possibly multi-code value codepoint)
1877 * to be the code value index of the beginning of the _next_ codepoint.
1878 * (!)
1879 */
1880 ustr_cv_index_t
udstr_cv_inc(t_udstr dstr,ustr_cv_index_t orig_index)1881 udstr_cv_inc (t_udstr dstr,
1882 ustr_cv_index_t orig_index)
1883 {
1884 if (orig_index.cv >= dstr->cv_len.cv)
1885 return dstr->cv_len;
1886
1887 if (orig_index.cv < 0)
1888 return ustr_cv_index (0);
1889
1890 switch (dstr->enc)
1891 {
1892 default:
1893 case uni_utf32:
1894 case uni_utf32be:
1895 case uni_utf32le:
1896 case uni_bogus32:
1897 case uni_bogus32be:
1898 case uni_bogus32le:
1899 case uni_iso8859_1:
1900 {
1901 return ustr_cv_index (orig_index.cv + 1);
1902 }
1903
1904 #undef CASE_FOR
1905
1906 #define CASE_FOR(X) \
1907 case uni_ ## X: \
1908 { \
1909 size_t pos; \
1910 size_t len; \
1911 \
1912 pos = orig_index.cv; \
1913 len = dstr->cv_len.cv; \
1914 uni_ ## X ## _scan (dstr->str.X, &pos, len); \
1915 return ustr_cv_index ((ssize_t)pos); \
1916 }
1917
1918 CASE_FOR(utf8);
1919 CASE_FOR(utf16);
1920
1921
1922 #define ICASE_FOR(X) \
1923 case uni_ ## X: \
1924 { \
1925 size_t pos; \
1926 size_t len; \
1927 \
1928 pos = orig_index.cv * 2; \
1929 len = dstr->cv_len.cv * 2; \
1930 uni_ ## X ## _iscan (dstr->str.X, &pos, len); \
1931 return ustr_cv_index ((ssize_t)(pos / 2)); \
1932 }
1933
1934 ICASE_FOR(utf16be);
1935 ICASE_FOR(utf16le);
1936 }
1937 }
1938
1939
1940
/*(c udstr_cv_dec)
 * ustr_cv_index_t udstr_cv_dec (t_udstr dstr,
1943 * ustr_cv_index_t orig_index);
1944 *
1945 * Decrement `orig_index' (presumed to be the code value index
1946 * of the first code value of a possibly multi-code value codepoint)
1947 * to be the code value index of the beginning of the _previous_ codepoint.
1948 * (!)
1949 */
1950 ustr_cv_index_t
udstr_cv_dec(t_udstr dstr,ustr_cv_index_t orig_index)1951 udstr_cv_dec (t_udstr dstr,
1952 ustr_cv_index_t orig_index)
1953 {
1954 if (orig_index.cv > dstr->cv_len.cv)
1955 return dstr->cv_len;
1956
1957 if (orig_index.cv <= 0)
1958 return ustr_cv_index (0);
1959
1960 return udstr_cv_normalize (dstr, ustr_cv_index (orig_index.cv - 1));
1961 }
1962
1963
1964 /************************************************************************
 *(h2 "udstr Code Value and Codepoint Index Conversion")
1966 *
1967 *
1968 *
1969 */
1970
1971
1972 /*(c udstr_cp_to_cv)
1973 * ustr_cv_index_t udstr_cp_to_cv (t_udstr dstr,
1974 * ustr_cp_index_t cpi);
1975 *
1976 * Return the code value index of the indicated codepoint.
1977 */
1978 ustr_cv_index_t
udstr_cp_to_cv(t_udstr dstr,ustr_cp_index_t cpi)1979 udstr_cp_to_cv (t_udstr dstr,
1980 ustr_cp_index_t cpi)
1981 {
1982 uni_string addressed;
1983
1984 if (cpi.cp < 0)
1985 return ustr_cv_index (0);
1986
1987 if (cpi.cp >= dstr->cp_len.cp)
1988 return dstr->cv_len;
1989
1990 addressed = ustr_cp_offset_n (dstr->str, dstr->enc, dstr->cv_len, cpi);
1991 return ustr_str_subtract (addressed, dstr->str, dstr->enc);
1992 }
1993
1994
1995 /*(c udstr_cp_to_cv_range)
1996 * void udstr_cp_to_cv_range (ustr_cv_index_t * from_v,
1997 * ustr_cv_index_t * to_v,
1998 * t_udstr d,
1999 * ustr_cp_index_t from,
2000 * ustr_cp_index_t to);
2001 *
2002 * Return the code value indices of the indicated codepoint range.
2003 *
2004 */
2005 void
udstr_cp_to_cv_range(ustr_cv_index_t * from_v,ustr_cv_index_t * to_v,t_udstr d,ustr_cp_index_t from,ustr_cp_index_t to)2006 udstr_cp_to_cv_range (ustr_cv_index_t * from_v,
2007 ustr_cv_index_t * to_v,
2008 t_udstr d,
2009 ustr_cp_index_t from,
2010 ustr_cp_index_t to)
2011 {
2012 uni_string f_str;
2013 ustr_cv_index_t f_v;
2014 uni_string t_str;
2015 ustr_cv_index_t t_v;
2016
2017 invariant (from.cp <= to.cp);
2018
2019 f_str = ustr_cp_offset_n (d->str, d->enc, d->cv_len, from);
2020 f_v = ustr_str_subtract (f_str, d->str, d->enc);
2021 t_str = ustr_cp_offset_n (f_str,
2022 d->enc,
2023 ustr_cv_index (d->cv_len.cv - f_v.cv),
2024 ustr_cp_index (to.cp - from.cp));
2025 t_v = ustr_str_subtract (t_str, f_str, d->enc);
2026
2027 *from_v = f_v;
2028 *to_v = t_v;
2029 }
2030
2031
2032 /*(c udstr_cv_to_cp)
2033 * ustr_cp_index_t udstr_cv_to_cp (t_udstr dstr,
2034 * ustr_cv_index_t cvi);
2035 *
2036 * Return the codepoint index of the indicated code value.
2037 */
2038 ustr_cp_index_t
udstr_cv_to_cp(t_udstr dstr,ustr_cv_index_t cvi)2039 udstr_cv_to_cp (t_udstr dstr,
2040 ustr_cv_index_t cvi)
2041
2042 {
2043 if (cvi.cv < 0)
2044 return ustr_cp_index (0);
2045
2046 if (cvi.cv >= dstr->cv_len.cv)
2047 return dstr->cp_len;
2048
2049 return ustr_cp_length_n (0, dstr->str, dstr->enc, cvi);
2050 }
2051
2052
2053
2054
2055
2056 #if 0
2057 cv_set
2058 cv_set_x
2059 cp_set
2060 cp_set_x
2061
2062 #endif
2063
2064
2065 static enum uni_encoding_scheme
pick_fw_of(enum uni_encoding_scheme a,enum uni_encoding_scheme b)2066 pick_fw_of (enum uni_encoding_scheme a,
2067 enum uni_encoding_scheme b)
2068 {
2069 size_t a_size;
2070 size_t b_size;
2071
2072 if (a == b)
2073 return a;
2074
2075 a_size = uni_cv_sizeof (a);
2076 b_size = uni_cv_sizeof (b);
2077
2078 if (a_size > b_size)
2079 return a;
2080 if (b_size > a_size)
2081 return b;
2082
2083 switch (a)
2084 {
2085 case uni_iso8859_1: return uni_iso8859_1;
2086
2087 case uni_utf8: return uni_utf8;
2088
2089 case uni_utf16be:
2090 case uni_utf16le:
2091 case uni_utf16: return uni_utf16;
2092
2093 case uni_utf32be:
2094 case uni_utf32le:
2095 case uni_utf32: return uni_utf32;
2096
2097 default:
2098 case uni_bogus32be:
2099 case uni_bogus32le:
2100 case uni_bogus32: return uni_bogus32;
2101 }
2102 }
2103
2104 static void
take_new_data(t_udstr dstr,t_udstr new_data)2105 take_new_data (t_udstr dstr, t_udstr new_data)
2106 {
2107 new_data->refs = dstr->refs;
2108 lim_free (dstr->limits, dstr->str.raw);
2109 *dstr = *new_data;
2110 lim_free (new_data->limits, (void *)new_data);
2111 }
2112
2113
2114
2115 /* tag: Tom Lord Fri Jan 2 08:47:21 2004 (udstr.c)
2116 */
2117