1 /* Handling strings that are given partially in the source encoding and
2    partially in Unicode.
3    Copyright (C) 2001-2018 Free Software Foundation, Inc.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21 
22 /* Specification.  */
23 #include "xg-mixed-string.h"
24 
25 #include <assert.h>
26 #include <stdlib.h>
27 #include <string.h>
28 
29 #include "error.h"
30 #include "error-progname.h"
31 #include "flexmember.h"
32 #include "msgl-ascii.h"
33 #include "po-charset.h"
34 #include "unistr.h"
35 #include "xalloc.h"
36 
37 #include "xg-pos.h"
38 
39 #include "gettext.h"
40 #define _(str) gettext (str)
41 
42 
43 /* Allocates a single segment.  */
44 static inline struct mixed_string_segment *
segment_alloc(enum segment_type type,const char * string,size_t length)45 segment_alloc (enum segment_type type, const char *string, size_t length)
46 {
47   struct mixed_string_segment *segment =
48     (struct mixed_string_segment *)
49     xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents, length));
50   segment->type = type;
51   segment->length = length;
52   memcpy (segment->contents, string, length);
53   return segment;
54 }
55 
56 /* Clones a single segment.  */
57 static inline struct mixed_string_segment *
segment_clone(const struct mixed_string_segment * segment)58 segment_clone (const struct mixed_string_segment *segment)
59 {
60   return segment_alloc (segment->type, segment->contents, segment->length);
61 }
62 
63 mixed_string_ty *
mixed_string_alloc_simple(const char * string,lexical_context_ty lcontext,const char * logical_file_name,int line_number)64 mixed_string_alloc_simple (const char *string,
65                            lexical_context_ty lcontext,
66                            const char *logical_file_name,
67                            int line_number)
68 {
69   struct mixed_string *ms = XMALLOC (struct mixed_string);
70 
71   if (*string == '\0')
72     {
73       /* An empty string.  */
74       ms->segments = NULL;
75       ms->nsegments = 0;
76     }
77   else
78     {
79       ms->segments = XNMALLOC (1, struct mixed_string_segment *);
80       if ((xgettext_current_source_encoding == po_charset_ascii
81            || xgettext_current_source_encoding == po_charset_utf8)
82           && is_ascii_string (string))
83         /* An optimization.  */
84         ms->segments[0] =
85           segment_alloc (utf8_encoded, string, strlen (string));
86       else
87         /* The general case.  */
88         ms->segments[0] =
89           segment_alloc (source_encoded, string, strlen (string));
90       ms->nsegments = 1;
91     }
92   ms->lcontext = lcontext;
93   ms->logical_file_name = logical_file_name;
94   ms->line_number = line_number;
95 
96   return ms;
97 }
98 
99 mixed_string_ty *
mixed_string_alloc_utf8(const char * string,lexical_context_ty lcontext,const char * logical_file_name,int line_number)100 mixed_string_alloc_utf8 (const char *string,
101                          lexical_context_ty lcontext,
102                          const char *logical_file_name,
103                          int line_number)
104 {
105   struct mixed_string *ms = XMALLOC (struct mixed_string);
106 
107   if (*string == '\0')
108     {
109       /* An empty string.  */
110       ms->segments = NULL;
111       ms->nsegments = 0;
112     }
113   else
114     {
115       ms->segments = XNMALLOC (1, struct mixed_string_segment *);
116       ms->segments[0] = segment_alloc (utf8_encoded, string, strlen (string));
117       ms->nsegments = 1;
118     }
119   ms->lcontext = lcontext;
120   ms->logical_file_name = logical_file_name;
121   ms->line_number = line_number;
122 
123   return ms;
124 }
125 
126 mixed_string_ty *
mixed_string_clone(const mixed_string_ty * ms1)127 mixed_string_clone (const mixed_string_ty *ms1)
128 {
129   struct mixed_string *ms = XMALLOC (struct mixed_string);
130   size_t nsegments = ms1->nsegments;
131 
132   if (nsegments == 0)
133     {
134       ms->segments = NULL;
135       ms->nsegments = 0;
136     }
137   else
138     {
139       size_t i;
140 
141       ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
142       for (i = 0; i < nsegments; i++)
143         ms->segments[i] = segment_clone (ms1->segments[i]);
144       ms->nsegments = nsegments;
145     }
146   ms->lcontext = ms1->lcontext;
147   ms->logical_file_name = ms1->logical_file_name;
148   ms->line_number = ms1->line_number;
149 
150   return ms;
151 }
152 
153 char *
mixed_string_contents(const mixed_string_ty * ms)154 mixed_string_contents (const mixed_string_ty *ms)
155 {
156   size_t nsegments = ms->nsegments;
157   /* Trivial cases.  */
158   if (nsegments == 0)
159     return xstrdup ("");
160   if (nsegments == 1 && ms->segments[0]->type == utf8_encoded)
161     {
162       /* Return the segment, with a NUL at the end.  */
163       size_t len = ms->segments[0]->length;
164       char *string = XNMALLOC (len + 1, char);
165       memcpy (string, ms->segments[0]->contents, len);
166       string[len] = '\0';
167       return string;
168     }
169   /* General case.  */
170   {
171     size_t i;
172 
173     for (i = 0; i < nsegments - 1; i++)
174       if (memchr (ms->segments[i]->contents, '\0', ms->segments[i]->length)
175           != NULL)
176         {
177           /* Segment i contains a NUL character.  Ignore the remaining
178              segments.  */
179           nsegments = i + 1;
180           break;
181         }
182   }
183   {
184     char **converted_segments = XNMALLOC (nsegments, char *);
185     size_t length;
186 
187     length = 0;
188     {
189       size_t i;
190 
191       for (i = 0; i < nsegments; i++)
192         if (ms->segments[i]->type == source_encoded)
193           {
194             char *source_encoded_string;
195             char *utf8_encoded_string;
196 
197             /* Copy the segment's contents, with a NUL at the end.  */
198             {
199               size_t len = ms->segments[i]->length;
200               source_encoded_string = XNMALLOC (len + 1, char);
201               memcpy (source_encoded_string, ms->segments[i]->contents, len);
202               source_encoded_string[len] = '\0';
203             }
204             /* Convert it to UTF-8 encoding.  */
205             utf8_encoded_string =
206               from_current_source_encoding (source_encoded_string,
207                                             ms->lcontext,
208                                             ms->logical_file_name,
209                                             ms->line_number);
210             if (utf8_encoded_string != source_encoded_string)
211               free (source_encoded_string);
212             converted_segments[i] = utf8_encoded_string;
213             length += strlen (utf8_encoded_string);
214           }
215         else
216           length += ms->segments[i]->length;
217     }
218 
219     {
220       char *string = XNMALLOC (length + 1, char);
221       {
222         char *p;
223         size_t i;
224 
225         p = string;
226         for (i = 0; i < nsegments; i++)
227           if (ms->segments[i]->type == source_encoded)
228             {
229               p = stpcpy (p, converted_segments[i]);
230               free (converted_segments[i]);
231             }
232           else
233             {
234               memcpy (p, ms->segments[i]->contents, ms->segments[i]->length);
235               p += ms->segments[i]->length;
236             }
237         assert (p == string + length);
238         *p = '\0';
239       }
240 
241       free (converted_segments);
242       return string;
243     }
244   }
245 }
246 
247 void
mixed_string_free(mixed_string_ty * ms)248 mixed_string_free (mixed_string_ty *ms)
249 {
250   struct mixed_string_segment **segments = ms->segments;
251   size_t nsegments = ms->nsegments;
252   if (nsegments > 0)
253     {
254       size_t i;
255       for (i = 0; i < nsegments; i++)
256         free (segments[i]);
257     }
258   free (segments);
259   free (ms);
260 }
261 
262 char *
mixed_string_contents_free1(mixed_string_ty * ms)263 mixed_string_contents_free1 (mixed_string_ty *ms)
264 {
265   char *contents = mixed_string_contents (ms);
266   mixed_string_free (ms);
267   return contents;
268 }
269 
270 mixed_string_ty *
mixed_string_concat(const mixed_string_ty * ms1,const mixed_string_ty * ms2)271 mixed_string_concat (const mixed_string_ty *ms1,
272                      const mixed_string_ty *ms2)
273 {
274   /* Trivial cases.  */
275   if (ms2->nsegments == 0)
276     return mixed_string_clone (ms1);
277   if (ms1->nsegments == 0)
278     return mixed_string_clone (ms2);
279   /* General case.  */
280   {
281     struct mixed_string *ms = XMALLOC (struct mixed_string);
282     size_t nsegments = ms1->nsegments + ms2->nsegments;
283     size_t j;
284     if (ms1->segments[ms1->nsegments-1]->type == ms2->segments[0]->type)
285       {
286         /* Combine the last segment of ms1 with the first segment of ms2.  */
287         size_t i;
288 
289         nsegments -= 1;
290         ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
291         j = 0;
292         for (i = 0; i < ms1->nsegments - 1; i++)
293           ms->segments[j++] = segment_clone (ms1->segments[i]);
294         {
295           size_t len1 = ms1->segments[i]->length;
296           size_t len2 = ms2->segments[0]->length;
297           struct mixed_string_segment *newseg =
298             (struct mixed_string_segment *)
299             xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents,
300                                  len1 + len2));
301           newseg->type = ms2->segments[0]->type;
302           newseg->length = len1 + len2;
303           memcpy (newseg->contents, ms1->segments[i]->contents, len1);
304           memcpy (newseg->contents + len1, ms2->segments[0]->contents, len2);
305           ms->segments[j++] = newseg;
306         }
307         for (i = 1; i < ms2->nsegments; i++)
308           ms->segments[j++] = segment_clone (ms2->segments[i]);
309       }
310     else
311       {
312         size_t i;
313 
314         ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
315         j = 0;
316         for (i = 0; i < ms1->nsegments; i++)
317           ms->segments[j++] = segment_clone (ms1->segments[i]);
318         for (i = 0; i < ms2->nsegments; i++)
319           ms->segments[j++] = segment_clone (ms2->segments[i]);
320       }
321     assert (j == nsegments);
322     ms->nsegments = nsegments;
323     ms->lcontext = ms1->lcontext;
324     ms->logical_file_name = ms1->logical_file_name;
325     ms->line_number = ms1->line_number;
326 
327     return ms;
328   }
329 }
330 
331 mixed_string_ty *
mixed_string_concat_free1(mixed_string_ty * ms1,const mixed_string_ty * ms2)332 mixed_string_concat_free1 (mixed_string_ty *ms1, const mixed_string_ty *ms2)
333 {
334   /* Trivial cases.  */
335   if (ms2->nsegments == 0)
336     return ms1;
337   if (ms1->nsegments == 0)
338     {
339       mixed_string_free (ms1);
340       return mixed_string_clone (ms2);
341     }
342   /* General case.  */
343   {
344     struct mixed_string *ms = XMALLOC (struct mixed_string);
345     size_t nsegments = ms1->nsegments + ms2->nsegments;
346     size_t j;
347     if (ms1->segments[ms1->nsegments-1]->type == ms2->segments[0]->type)
348       {
349         /* Combine the last segment of ms1 with the first segment of ms2.  */
350         size_t i;
351 
352         nsegments -= 1;
353         ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
354         j = 0;
355         for (i = 0; i < ms1->nsegments - 1; i++)
356           ms->segments[j++] = ms1->segments[i];
357         {
358           size_t len1 = ms1->segments[i]->length;
359           size_t len2 = ms2->segments[0]->length;
360           struct mixed_string_segment *newseg =
361             (struct mixed_string_segment *)
362             xmalloc (FLEXSIZEOF (struct mixed_string_segment, contents,
363                                  len1 + len2));
364           newseg->type = ms2->segments[0]->type;
365           newseg->length = len1 + len2;
366           memcpy (newseg->contents, ms1->segments[i]->contents, len1);
367           memcpy (newseg->contents + len1, ms2->segments[0]->contents, len2);
368           ms->segments[j++] = newseg;
369         }
370         free (ms1->segments[i]);
371         for (i = 1; i < ms2->nsegments; i++)
372           ms->segments[j++] = segment_clone (ms2->segments[i]);
373       }
374     else
375       {
376         size_t i;
377 
378         ms->segments = XNMALLOC (nsegments, struct mixed_string_segment *);
379         j = 0;
380         for (i = 0; i < ms1->nsegments; i++)
381           ms->segments[j++] = ms1->segments[i];
382         for (i = 0; i < ms2->nsegments; i++)
383           ms->segments[j++] = segment_clone (ms2->segments[i]);
384       }
385     assert (j == nsegments);
386     free (ms1->segments);
387     ms->nsegments = nsegments;
388     ms->lcontext = ms1->lcontext;
389     ms->logical_file_name = ms1->logical_file_name;
390     ms->line_number = ms1->line_number;
391     free (ms1);
392 
393     return ms;
394   }
395 }
396 
397 
398 void
mixed_string_buffer_init(struct mixed_string_buffer * bp,lexical_context_ty lcontext,const char * logical_file_name,int line_number)399 mixed_string_buffer_init (struct mixed_string_buffer *bp,
400                           lexical_context_ty lcontext,
401                           const char *logical_file_name,
402                           int line_number)
403 {
404   bp->segments = NULL;
405   bp->nsegments = 0;
406   bp->nsegments_allocated = 0;
407   bp->curr_type = -1;
408   bp->curr_buffer = NULL;
409   bp->curr_buflen = 0;
410   bp->curr_allocated = 0;
411   bp->utf16_surr = 0;
412   bp->lcontext = lcontext;
413   bp->logical_file_name = logical_file_name;
414   bp->line_number = line_number;
415 }
416 
417 bool
mixed_string_buffer_is_empty(const struct mixed_string_buffer * bp)418 mixed_string_buffer_is_empty (const struct mixed_string_buffer *bp)
419 {
420   return (bp->nsegments == 0 && bp->curr_buflen == 0);
421 }
422 
423 /* Auxiliary function: Ensure count more bytes are available in
424    bp->curr_buffer.  */
425 static inline void
mixed_string_buffer_grow_curr_buffer(struct mixed_string_buffer * bp,size_t count)426 mixed_string_buffer_grow_curr_buffer (struct mixed_string_buffer *bp,
427                                       size_t count)
428 {
429   if (bp->curr_buflen + count > bp->curr_allocated)
430     {
431       size_t new_allocated = 2 * bp->curr_allocated + 10;
432       if (new_allocated < bp->curr_buflen + count)
433         new_allocated = bp->curr_buflen + count;
434       bp->curr_allocated = new_allocated;
435       bp->curr_buffer = xrealloc (bp->curr_buffer, new_allocated);
436     }
437 }
438 
439 /* Auxiliary function: Append a byte to bp->curr.  */
440 static inline void
mixed_string_buffer_append_to_curr_buffer(struct mixed_string_buffer * bp,unsigned char c)441 mixed_string_buffer_append_to_curr_buffer (struct mixed_string_buffer *bp,
442                                            unsigned char c)
443 {
444   if (bp->curr_buflen == bp->curr_allocated)
445     {
446       bp->curr_allocated = 2 * bp->curr_allocated + 10;
447       bp->curr_buffer = xrealloc (bp->curr_buffer, bp->curr_allocated);
448     }
449   bp->curr_buffer[bp->curr_buflen++] = c;
450 }
451 
452 /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, append a
453    Unicode character to bp->curr_buffer.  uc must be < 0x110000.  */
454 static inline void
mixed_string_buffer_append_to_utf8_buffer(struct mixed_string_buffer * bp,ucs4_t uc)455 mixed_string_buffer_append_to_utf8_buffer (struct mixed_string_buffer *bp,
456                                            ucs4_t uc)
457 {
458   unsigned char utf8buf[6];
459   int count = u8_uctomb (utf8buf, uc, 6);
460 
461   if (count < 0)
462     /* The caller should have ensured that uc is not out-of-range.  */
463     abort ();
464 
465   mixed_string_buffer_grow_curr_buffer (bp, count);
466   memcpy (bp->curr_buffer + bp->curr_buflen, utf8buf, count);
467   bp->curr_buflen += count;
468 }
469 
470 /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, handle the
471    attempt to append a lone surrogate to bp->curr_buffer.  */
472 static void
mixed_string_buffer_append_lone_surrogate(struct mixed_string_buffer * bp,ucs4_t uc)473 mixed_string_buffer_append_lone_surrogate (struct mixed_string_buffer *bp,
474                                            ucs4_t uc)
475 {
476   /* A half surrogate is invalid, therefore use U+FFFD instead.
477      It may be valid in a particular programming language.
478      But a half surrogate is invalid in UTF-8:
479        - RFC 3629 says
480            "The definition of UTF-8 prohibits encoding character
481             numbers between U+D800 and U+DFFF".
482        - Unicode 4.0 chapter 3
483          <http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf>
484          section 3.9, p.77, says
485            "Because surrogate code points are not Unicode scalar
486             values, any UTF-8 byte sequence that would otherwise
487             map to code points D800..DFFF is ill-formed."
488          and in table 3-6, p. 78, does not mention D800..DFFF.
489        - The unicode.org FAQ question "How do I convert an unpaired
490          UTF-16 surrogate to UTF-8?" has the answer
491            "By representing such an unpaired surrogate on its own
492             as a 3-byte sequence, the resulting UTF-8 data stream
493             would become ill-formed."
494      So use U+FFFD instead.  */
495   error_with_progname = false;
496   error (0, 0, _("%s:%d: warning: lone surrogate U+%04X"),
497          logical_file_name, line_number, uc);
498   error_with_progname = true;
499   mixed_string_buffer_append_to_utf8_buffer (bp, 0xfffd);
500 }
501 
502 /* Auxiliary function: Assuming bp->curr_type == utf8_encoded, flush
503    bp->utf16_surr into bp->curr_buffer.  */
504 static inline void
mixed_string_buffer_flush_utf16_surr(struct mixed_string_buffer * bp)505 mixed_string_buffer_flush_utf16_surr (struct mixed_string_buffer *bp)
506 {
507   if (bp->utf16_surr != 0)
508     {
509       mixed_string_buffer_append_lone_surrogate (bp, bp->utf16_surr);
510       bp->utf16_surr = 0;
511     }
512 }
513 
514 /* Auxiliary function: Append a segment to bp->segments.  */
515 static inline void
mixed_string_buffer_add_segment(struct mixed_string_buffer * bp,struct mixed_string_segment * newseg)516 mixed_string_buffer_add_segment (struct mixed_string_buffer *bp,
517                                  struct mixed_string_segment *newseg)
518 {
519   if (bp->nsegments == bp->nsegments_allocated)
520     {
521       size_t new_allocated =
522         bp->nsegments_allocated = 2 * bp->nsegments_allocated + 1;
523       bp->segments =
524         (struct mixed_string_segment **)
525         xrealloc (bp->segments,
526                   new_allocated * sizeof (struct mixed_string_segment *));
527     }
528   bp->segments[bp->nsegments++] = newseg;
529 }
530 
531 /* Auxiliary function: Flush bp->curr_buffer and bp->utf16_surr into
532    bp->segments.  */
533 static void
mixed_string_buffer_flush_curr(struct mixed_string_buffer * bp)534 mixed_string_buffer_flush_curr (struct mixed_string_buffer *bp)
535 {
536   if (bp->curr_type == utf8_encoded)
537     mixed_string_buffer_flush_utf16_surr (bp);
538   if (bp->curr_type != -1)
539     {
540       if (bp->curr_buflen > 0)
541         {
542           struct mixed_string_segment *segment =
543             segment_alloc (bp->curr_type, bp->curr_buffer, bp->curr_buflen);
544           mixed_string_buffer_add_segment (bp, segment);
545         }
546       bp->curr_buflen = 0;
547     }
548 }
549 
550 void
mixed_string_buffer_append_char(struct mixed_string_buffer * bp,int c)551 mixed_string_buffer_append_char (struct mixed_string_buffer *bp, int c)
552 {
553   /* Switch to multibyte character mode.  */
554   if (bp->curr_type != source_encoded)
555     {
556       mixed_string_buffer_flush_curr (bp);
557       bp->curr_type = source_encoded;
558     }
559 
560     mixed_string_buffer_append_to_curr_buffer (bp, (unsigned char) c);
561 }
562 
563 void
mixed_string_buffer_append_unicode(struct mixed_string_buffer * bp,int c)564 mixed_string_buffer_append_unicode (struct mixed_string_buffer *bp, int c)
565 {
566   /* Switch to Unicode character mode.  */
567   if (bp->curr_type != utf8_encoded)
568     {
569       mixed_string_buffer_flush_curr (bp);
570       bp->curr_type = utf8_encoded;
571       assert (bp->utf16_surr == 0);
572     }
573 
574   /* Test whether this character and the previous one form a Unicode
575      surrogate character pair.  */
576   if (bp->utf16_surr != 0 && (c >= 0xdc00 && c < 0xe000))
577     {
578       unsigned short utf16buf[2];
579       ucs4_t uc;
580 
581       utf16buf[0] = bp->utf16_surr;
582       utf16buf[1] = c;
583       if (u16_mbtouc (&uc, utf16buf, 2) != 2)
584         abort ();
585 
586       mixed_string_buffer_append_to_utf8_buffer (bp, uc);
587       bp->utf16_surr = 0;
588     }
589   else
590     {
591       mixed_string_buffer_flush_utf16_surr (bp);
592 
593       if (c >= 0xd800 && c < 0xdc00)
594         bp->utf16_surr = c;
595       else if (c >= 0xdc00 && c < 0xe000)
596         mixed_string_buffer_append_lone_surrogate (bp, c);
597       else
598         mixed_string_buffer_append_to_utf8_buffer (bp, c);
599     }
600 }
601 
602 void
mixed_string_buffer_destroy(struct mixed_string_buffer * bp)603 mixed_string_buffer_destroy (struct mixed_string_buffer *bp)
604 {
605   struct mixed_string_segment **segments = bp->segments;
606   size_t nsegments = bp->nsegments;
607   if (nsegments > 0)
608     {
609       size_t i;
610       for (i = 0; i < nsegments; i++)
611         free (segments[i]);
612     }
613   free (segments);
614   free (bp->curr_buffer);
615 }
616 
617 mixed_string_ty *
mixed_string_buffer_result(struct mixed_string_buffer * bp)618 mixed_string_buffer_result (struct mixed_string_buffer *bp)
619 {
620   mixed_string_buffer_flush_curr (bp);
621 
622   {
623     struct mixed_string *ms = XMALLOC (struct mixed_string);
624     size_t nsegments = bp->nsegments;
625 
626     if (nsegments > 0)
627       ms->segments =
628         (struct mixed_string_segment **)
629         xrealloc (bp->segments,
630                   nsegments * sizeof (struct mixed_string_segment *));
631     else
632       {
633         assert (bp->segments == NULL);
634         ms->segments = NULL;
635       }
636     ms->nsegments = nsegments;
637     ms->lcontext = bp->lcontext;
638     ms->logical_file_name = bp->logical_file_name;
639     ms->line_number = bp->line_number;
640 
641     free (bp->curr_buffer);
642 
643     return ms;
644   }
645 }
646