1 /* Handling strings that are given partially in the source encoding and
2 partially in Unicode.
3 Copyright (C) 2001-2018 Free Software Foundation, Inc.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 /* Specification. */
23 #include "xg-mixed-string.h"
24
25 #include <assert.h>
26 #include <stdlib.h>
27 #include <string.h>
28
29 #include "error.h"
30 #include "error-progname.h"
31 #include "flexmember.h"
32 #include "msgl-ascii.h"
33 #include "po-charset.h"
34 #include "unistr.h"
35 #include "xalloc.h"
36
37 #include "xg-pos.h"
38
39 #include "gettext.h"
40 #define _(str) gettext (str)
41
42
43 /* Allocates a single segment. */
44 static inline struct mixed_string_segment *
segment_alloc(enum segment_type type,const char * string,size_t length)45 segment_alloc (enum segment_type type, const char *string, size_t length)
46 {
47 struct mixed_string_segment *segment =
48 (struct mixed_string_segment *)
49 xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents, length));
50 segment->type = type;
51 segment->length = length;
52 memcpy (segment->contents, string, length);
53 return segment;
54 }
55
56 /* Clones a single segment. */
57 static inline struct mixed_string_segment *
segment_clone(const struct mixed_string_segment * segment)58 segment_clone (const struct mixed_string_segment *segment)
59 {
60 return segment_alloc (segment->type, segment->contents, segment->length);
61 }
62
63 mixed_string_ty *
mixed_string_alloc_simple(const char * string,lexical_context_ty lcontext,const char * logical_file_name,int line_number)64 mixed_string_alloc_simple (const char *string,
65 lexical_context_ty lcontext,
66 const char *logical_file_name,
67 int line_number)
68 {
69 struct mixed_string *ms = XMALLOC (struct mixed_string);
70
71 if (*string == '\0')
72 {
73 /* An empty string. */
74 ms->segments = NULL;
75 ms->nsegments = 0;
76 }
77 else
78 {
79 ms->segments = XNMALLOC (1, struct mixed_string_segment *);
80 if ((xgettext_current_source_encoding == po_charset_ascii
81 || xgettext_current_source_encoding == po_charset_utf8)
82 && is_ascii_string (string))
83 /* An optimization. */
84 ms->segments[0] =
85 segment_alloc (utf8_encoded, string, strlen (string));
86 else
87 /* The general case. */
88 ms->segments[0] =
89 segment_alloc (source_encoded, string, strlen (string));
90 ms->nsegments = 1;
91 }
92 ms->lcontext = lcontext;
93 ms->logical_file_name = logical_file_name;
94 ms->line_number = line_number;
95
96 return ms;
97 }
98
99 mixed_string_ty *
mixed_string_alloc_utf8(const char * string,lexical_context_ty lcontext,const char * logical_file_name,int line_number)100 mixed_string_alloc_utf8 (const char *string,
101 lexical_context_ty lcontext,
102 const char *logical_file_name,
103 int line_number)
104 {
105 struct mixed_string *ms = XMALLOC (struct mixed_string);
106
107 if (*string == '\0')
108 {
109 /* An empty string. */
110 ms->segments = NULL;
111 ms->nsegments = 0;
112 }
113 else
114 {
115 ms->segments = XNMALLOC (1, struct mixed_string_segment *);
116 ms->segments[0] = segment_alloc (utf8_encoded, string, strlen (string));
117 ms->nsegments = 1;
118 }
119 ms->lcontext = lcontext;
120 ms->logical_file_name = logical_file_name;
121 ms->line_number = line_number;
122
123 return ms;
124 }
125
126 mixed_string_ty *
mixed_string_clone(const mixed_string_ty * ms1)127 mixed_string_clone (const mixed_string_ty *ms1)
128 {
129 struct mixed_string *ms = XMALLOC (struct mixed_string);
130 size_t nsegments = ms1->nsegments;
131
132 if (nsegments == 0)
133 {
134 ms->segments = NULL;
135 ms->nsegments = 0;
136 }
137 else
138 {
139 size_t i;
140
141 ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
142 for (i = 0; i < nsegments; i++)
143 ms->segments[i] = segment_clone (ms1->segments[i]);
144 ms->nsegments = nsegments;
145 }
146 ms->lcontext = ms1->lcontext;
147 ms->logical_file_name = ms1->logical_file_name;
148 ms->line_number = ms1->line_number;
149
150 return ms;
151 }
152
153 char *
mixed_string_contents(const mixed_string_ty * ms)154 mixed_string_contents (const mixed_string_ty *ms)
155 {
156 size_t nsegments = ms->nsegments;
157 /* Trivial cases. */
158 if (nsegments == 0)
159 return xstrdup ("");
160 if (nsegments == 1 && ms->segments[0]->type == utf8_encoded)
161 {
162 /* Return the segment, with a NUL at the end. */
163 size_t len = ms->segments[0]->length;
164 char *string = XNMALLOC (len + 1, char);
165 memcpy (string, ms->segments[0]->contents, len);
166 string[len] = '\0';
167 return string;
168 }
169 /* General case. */
170 {
171 size_t i;
172
173 for (i = 0; i < nsegments - 1; i++)
174 if (memchr (ms->segments[i]->contents, '\0', ms->segments[i]->length)
175 != NULL)
176 {
177 /* Segment i contains a NUL character. Ignore the remaining
178 segments. */
179 nsegments = i + 1;
180 break;
181 }
182 }
183 {
184 char **converted_segments = XNMALLOC (nsegments, char *);
185 size_t length;
186
187 length = 0;
188 {
189 size_t i;
190
191 for (i = 0; i < nsegments; i++)
192 if (ms->segments[i]->type == source_encoded)
193 {
194 char *source_encoded_string;
195 char *utf8_encoded_string;
196
197 /* Copy the segment's contents, with a NUL at the end. */
198 {
199 size_t len = ms->segments[i]->length;
200 source_encoded_string = XNMALLOC (len + 1, char);
201 memcpy (source_encoded_string, ms->segments[i]->contents, len);
202 source_encoded_string[len] = '\0';
203 }
204 /* Convert it to UTF-8 encoding. */
205 utf8_encoded_string =
206 from_current_source_encoding (source_encoded_string,
207 ms->lcontext,
208 ms->logical_file_name,
209 ms->line_number);
210 if (utf8_encoded_string != source_encoded_string)
211 free (source_encoded_string);
212 converted_segments[i] = utf8_encoded_string;
213 length += strlen (utf8_encoded_string);
214 }
215 else
216 length += ms->segments[i]->length;
217 }
218
219 {
220 char *string = XNMALLOC (length + 1, char);
221 {
222 char *p;
223 size_t i;
224
225 p = string;
226 for (i = 0; i < nsegments; i++)
227 if (ms->segments[i]->type == source_encoded)
228 {
229 p = stpcpy (p, converted_segments[i]);
230 free (converted_segments[i]);
231 }
232 else
233 {
234 memcpy (p, ms->segments[i]->contents, ms->segments[i]->length);
235 p += ms->segments[i]->length;
236 }
237 assert (p == string + length);
238 *p = '\0';
239 }
240
241 free (converted_segments);
242 return string;
243 }
244 }
245 }
246
247 void
mixed_string_free(mixed_string_ty * ms)248 mixed_string_free (mixed_string_ty *ms)
249 {
250 struct mixed_string_segment **segments = ms->segments;
251 size_t nsegments = ms->nsegments;
252 if (nsegments > 0)
253 {
254 size_t i;
255 for (i = 0; i < nsegments; i++)
256 free (segments[i]);
257 }
258 free (segments);
259 free (ms);
260 }
261
262 char *
mixed_string_contents_free1(mixed_string_ty * ms)263 mixed_string_contents_free1 (mixed_string_ty *ms)
264 {
265 char *contents = mixed_string_contents (ms);
266 mixed_string_free (ms);
267 return contents;
268 }
269
270 mixed_string_ty *
mixed_string_concat(const mixed_string_ty * ms1,const mixed_string_ty * ms2)271 mixed_string_concat (const mixed_string_ty *ms1,
272 const mixed_string_ty *ms2)
273 {
274 /* Trivial cases. */
275 if (ms2->nsegments == 0)
276 return mixed_string_clone (ms1);
277 if (ms1->nsegments == 0)
278 return mixed_string_clone (ms2);
279 /* General case. */
280 {
281 struct mixed_string *ms = XMALLOC (struct mixed_string);
282 size_t nsegments = ms1->nsegments + ms2->nsegments;
283 size_t j;
284 if (ms1->segments[ms1->nsegments-1]->type == ms2->segments[0]->type)
285 {
286 /* Combine the last segment of ms1 with the first segment of ms2. */
287 size_t i;
288
289 nsegments -= 1;
290 ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
291 j = 0;
292 for (i = 0; i < ms1->nsegments - 1; i++)
293 ms->segments[j++] = segment_clone (ms1->segments[i]);
294 {
295 size_t len1 = ms1->segments[i]->length;
296 size_t len2 = ms2->segments[0]->length;
297 struct mixed_string_segment *newseg =
298 (struct mixed_string_segment *)
299 xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents,
300 len1 + len2));
301 newseg->type = ms2->segments[0]->type;
302 newseg->length = len1 + len2;
303 memcpy (newseg->contents, ms1->segments[i]->contents, len1);
304 memcpy (newseg->contents + len1, ms2->segments[0]->contents, len2);
305 ms->segments[j++] = newseg;
306 }
307 for (i = 1; i < ms2->nsegments; i++)
308 ms->segments[j++] = segment_clone (ms2->segments[i]);
309 }
310 else
311 {
312 size_t i;
313
314 ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
315 j = 0;
316 for (i = 0; i < ms1->nsegments; i++)
317 ms->segments[j++] = segment_clone (ms1->segments[i]);
318 for (i = 0; i < ms2->nsegments; i++)
319 ms->segments[j++] = segment_clone (ms2->segments[i]);
320 }
321 assert (j == nsegments);
322 ms->nsegments = nsegments;
323 ms->lcontext = ms1->lcontext;
324 ms->logical_file_name = ms1->logical_file_name;
325 ms->line_number = ms1->line_number;
326
327 return ms;
328 }
329 }
330
331 mixed_string_ty *
mixed_string_concat_free1(mixed_string_ty * ms1,const mixed_string_ty * ms2)332 mixed_string_concat_free1 (mixed_string_ty *ms1, const mixed_string_ty *ms2)
333 {
334 /* Trivial cases. */
335 if (ms2->nsegments == 0)
336 return ms1;
337 if (ms1->nsegments == 0)
338 {
339 mixed_string_free (ms1);
340 return mixed_string_clone (ms2);
341 }
342 /* General case. */
343 {
344 struct mixed_string *ms = XMALLOC (struct mixed_string);
345 size_t nsegments = ms1->nsegments + ms2->nsegments;
346 size_t j;
347 if (ms1->segments[ms1->nsegments-1]->type == ms2->segments[0]->type)
348 {
349 /* Combine the last segment of ms1 with the first segment of ms2. */
350 size_t i;
351
352 nsegments -= 1;
353 ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
354 j = 0;
355 for (i = 0; i < ms1->nsegments - 1; i++)
356 ms->segments[j++] = ms1->segments[i];
357 {
358 size_t len1 = ms1->segments[i]->length;
359 size_t len2 = ms2->segments[0]->length;
360 struct mixed_string_segment *newseg =
361 (struct mixed_string_segment *)
362 xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents,
363 len1 + len2));
364 newseg->type = ms2->segments[0]->type;
365 newseg->length = len1 + len2;
366 memcpy (newseg->contents, ms1->segments[i]->contents, len1);
367 memcpy (newseg->contents + len1, ms2->segments[0]->contents, len2);
368 ms->segments[j++] = newseg;
369 }
370 free (ms1->segments[i]);
371 for (i = 1; i < ms2->nsegments; i++)
372 ms->segments[j++] = segment_clone (ms2->segments[i]);
373 }
374 else
375 {
376 size_t i;
377
378 ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
379 j = 0;
380 for (i = 0; i < ms1->nsegments; i++)
381 ms->segments[j++] = ms1->segments[i];
382 for (i = 0; i < ms2->nsegments; i++)
383 ms->segments[j++] = segment_clone (ms2->segments[i]);
384 }
385 assert (j == nsegments);
386 free (ms1->segments);
387 ms->nsegments = nsegments;
388 ms->lcontext = ms1->lcontext;
389 ms->logical_file_name = ms1->logical_file_name;
390 ms->line_number = ms1->line_number;
391 free (ms1);
392
393 return ms;
394 }
395 }
396
397
398 void
mixed_string_buffer_init(struct mixed_string_buffer * bp,lexical_context_ty lcontext,const char * logical_file_name,int line_number)399 mixed_string_buffer_init (struct mixed_string_buffer *bp,
400 lexical_context_ty lcontext,
401 const char *logical_file_name,
402 int line_number)
403 {
404 bp->segments = NULL;
405 bp->nsegments = 0;
406 bp->nsegments_allocated = 0;
407 bp->curr_type = -1;
408 bp->curr_buffer = NULL;
409 bp->curr_buflen = 0;
410 bp->curr_allocated = 0;
411 bp->utf16_surr = 0;
412 bp->lcontext = lcontext;
413 bp->logical_file_name = logical_file_name;
414 bp->line_number = line_number;
415 }
416
417 bool
mixed_string_buffer_is_empty(const struct mixed_string_buffer * bp)418 mixed_string_buffer_is_empty (const struct mixed_string_buffer *bp)
419 {
420 return (bp->nsegments == 0 && bp->curr_buflen == 0);
421 }
422
423 /* Auxiliary function: Ensure count more bytes are available in
424 bp->curr_buffer. */
425 static inline void
mixed_string_buffer_grow_curr_buffer(struct mixed_string_buffer * bp,size_t count)426 mixed_string_buffer_grow_curr_buffer (struct mixed_string_buffer *bp,
427 size_t count)
428 {
429 if (bp->curr_buflen + count > bp->curr_allocated)
430 {
431 size_t new_allocated = 2 * bp->curr_allocated + 10;
432 if (new_allocated < bp->curr_buflen + count)
433 new_allocated = bp->curr_buflen + count;
434 bp->curr_allocated = new_allocated;
435 bp->curr_buffer = xrealloc (bp->curr_buffer, new_allocated);
436 }
437 }
438
439 /* Auxiliary function: Append a byte to bp->curr. */
440 static inline void
mixed_string_buffer_append_to_curr_buffer(struct mixed_string_buffer * bp,unsigned char c)441 mixed_string_buffer_append_to_curr_buffer (struct mixed_string_buffer *bp,
442 unsigned char c)
443 {
444 if (bp->curr_buflen == bp->curr_allocated)
445 {
446 bp->curr_allocated = 2 * bp->curr_allocated + 10;
447 bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
448 }
449 bp->curr_buffer[bp->curr_buflen++] = c;
450 }
451
452 /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, append a
453 Unicode character to bp->curr_buffer. uc must be < 0x110000. */
454 static inline void
mixed_string_buffer_append_to_utf8_buffer(struct mixed_string_buffer * bp,ucs4_t uc)455 mixed_string_buffer_append_to_utf8_buffer (struct mixed_string_buffer *bp,
456 ucs4_t uc)
457 {
458 unsigned char utf8buf[6];
459 int count = u8_uctomb (utf8buf, uc, 6);
460
461 if (count < 0)
462 /* The caller should have ensured that uc is not out-of-range. */
463 abort ();
464
465 mixed_string_buffer_grow_curr_buffer (bp, count);
466 memcpy (bp->curr_buffer + bp->curr_buflen, utf8buf, count);
467 bp->curr_buflen += count;
468 }
469
470 /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, handle the
471 attempt to append a lone surrogate to bp->curr_buffer. */
472 static void
mixed_string_buffer_append_lone_surrogate(struct mixed_string_buffer * bp,ucs4_t uc)473 mixed_string_buffer_append_lone_surrogate (struct mixed_string_buffer *bp,
474 ucs4_t uc)
475 {
476 /* A half surrogate is invalid, therefore use U+FFFD instead.
477 It may be valid in a particular programming language.
478 But a half surrogate is invalid in UTF-8:
479 - RFC 3629 says
480 "The definition of UTF-8 prohibits encoding character
481 numbers between U+D800 and U+DFFF".
482 - Unicode 4.0 chapter 3
483 <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
484 section 3.9, p.77, says
485 "Because surrogate code points are not Unicode scalar
486 values, any UTF-8 byte sequence that would otherwise
487 map to code points D800..DFFF is ill-formed."
488 and in table 3-6, p. 78, does not mention D800..DFFF.
489 - The unicode.org FAQ question "How do I convert an unpaired
490 UTF-16 surrogate to UTF-8?" has the answer
491 "By representing such an unpaired surrogate on its own
492 as a 3-byte sequence, the resulting UTF-8 data stream
493 would become ill-formed."
494 So use U+FFFD instead. */
495 error_with_progname = false;
496 error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
497 logical_file_name, line_number, uc);
498 error_with_progname = true;
499 mixed_string_buffer_append_to_utf8_buffer (bp, 0xfffd);
500 }
501
502 /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, flush
503 bp->utf16_surr into bp->curr_buffer. */
504 static inline void
mixed_string_buffer_flush_utf16_surr(struct mixed_string_buffer * bp)505 mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
506 {
507 if (bp->utf16_surr != 0)
508 {
509 mixed_string_buffer_append_lone_surrogate (bp, bp->utf16_surr);
510 bp->utf16_surr = 0;
511 }
512 }
513
514 /* Auxiliary function: Append a segment to bp->segments. */
515 static inline void
mixed_string_buffer_add_segment(struct mixed_string_buffer * bp,struct mixed_string_segment * newseg)516 mixed_string_buffer_add_segment (struct mixed_string_buffer *bp,
517 struct mixed_string_segment *newseg)
518 {
519 if (bp->nsegments == bp->nsegments_allocated)
520 {
521 size_t new_allocated =
522 bp->nsegments_allocated = 2 * bp->nsegments_allocated + 1;
523 bp->segments =
524 (struct mixed_string_segment **)
525 xrealloc (bp->segments,
526 new_allocated * sizeof (struct mixed_string_segment *));
527 }
528 bp->segments[bp->nsegments++] = newseg;
529 }
530
531 /* Auxiliary function: Flush bp->curr_buffer and bp->utf16_surr into
532 bp->segments. */
533 static void
mixed_string_buffer_flush_curr(struct mixed_string_buffer * bp)534 mixed_string_buffer_flush_curr (struct mixed_string_buffer *bp)
535 {
536 if (bp->curr_type == utf8_encoded)
537 mixed_string_buffer_flush_utf16_surr (bp);
538 if (bp->curr_type != -1)
539 {
540 if (bp->curr_buflen > 0)
541 {
542 struct mixed_string_segment *segment =
543 segment_alloc (bp->curr_type, bp->curr_buffer, bp->curr_buflen);
544 mixed_string_buffer_add_segment (bp, segment);
545 }
546 bp->curr_buflen = 0;
547 }
548 }
549
550 void
mixed_string_buffer_append_char(struct mixed_string_buffer * bp,int c)551 mixed_string_buffer_append_char (struct mixed_string_buffer *bp, int c)
552 {
553 /* Switch to multibyte character mode. */
554 if (bp->curr_type != source_encoded)
555 {
556 mixed_string_buffer_flush_curr (bp);
557 bp->curr_type = source_encoded;
558 }
559
560 mixed_string_buffer_append_to_curr_buffer (bp, (unsigned char) c);
561 }
562
563 void
mixed_string_buffer_append_unicode(struct mixed_string_buffer * bp,int c)564 mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, int c)
565 {
566 /* Switch to Unicode character mode. */
567 if (bp->curr_type != utf8_encoded)
568 {
569 mixed_string_buffer_flush_curr (bp);
570 bp->curr_type = utf8_encoded;
571 assert (bp->utf16_surr == 0);
572 }
573
574 /* Test whether this character and the previous one form a Unicode
575 surrogate character pair. */
576 if (bp->utf16_surr != 0 && (c >= 0xdc00 && c < 0xe000))
577 {
578 unsigned short utf16buf[2];
579 ucs4_t uc;
580
581 utf16buf[0] = bp->utf16_surr;
582 utf16buf[1] = c;
583 if (u16_mbtouc (&uc, utf16buf, 2) != 2)
584 abort ();
585
586 mixed_string_buffer_append_to_utf8_buffer (bp, uc);
587 bp->utf16_surr = 0;
588 }
589 else
590 {
591 mixed_string_buffer_flush_utf16_surr (bp);
592
593 if (c >= 0xd800 && c < 0xdc00)
594 bp->utf16_surr = c;
595 else if (c >= 0xdc00 && c < 0xe000)
596 mixed_string_buffer_append_lone_surrogate (bp, c);
597 else
598 mixed_string_buffer_append_to_utf8_buffer (bp, c);
599 }
600 }
601
602 void
mixed_string_buffer_destroy(struct mixed_string_buffer * bp)603 mixed_string_buffer_destroy (struct mixed_string_buffer *bp)
604 {
605 struct mixed_string_segment **segments = bp->segments;
606 size_t nsegments = bp->nsegments;
607 if (nsegments > 0)
608 {
609 size_t i;
610 for (i = 0; i < nsegments; i++)
611 free (segments[i]);
612 }
613 free (segments);
614 free (bp->curr_buffer);
615 }
616
617 mixed_string_ty *
mixed_string_buffer_result(struct mixed_string_buffer * bp)618 mixed_string_buffer_result (struct mixed_string_buffer *bp)
619 {
620 mixed_string_buffer_flush_curr (bp);
621
622 {
623 struct mixed_string *ms = XMALLOC (struct mixed_string);
624 size_t nsegments = bp->nsegments;
625
626 if (nsegments > 0)
627 ms->segments =
628 (struct mixed_string_segment **)
629 xrealloc (bp->segments,
630 nsegments * sizeof (struct mixed_string_segment *));
631 else
632 {
633 assert (bp->segments == NULL);
634 ms->segments = NULL;
635 }
636 ms->nsegments = nsegments;
637 ms->lcontext = bp->lcontext;
638 ms->logical_file_name = bp->logical_file_name;
639 ms->line_number = bp->line_number;
640
641 free (bp->curr_buffer);
642
643 return ms;
644 }
645 }
646