1 /* Perl format strings.
2    Copyright (C) 2004, 2006-2007, 2009, 2019-2020 Free Software Foundation, Inc.
3    Written by Bruno Haible <bruno@clisp.org>, 2003.
4 
5    This program is free software: you can redistribute it and/or modify
6    it under the terms of the GNU General Public License as published by
7    the Free Software Foundation; either version 3 of the License, or
8    (at your option) any later version.
9 
10    This program is distributed in the hope that it will be useful,
11    but WITHOUT ANY WARRANTY; without even the implied warranty of
12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13    GNU General Public License for more details.
14 
15    You should have received a copy of the GNU General Public License
16    along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
17 
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21 
22 #include <stdbool.h>
23 #include <stdlib.h>
24 
25 #include "format.h"
26 #include "c-ctype.h"
27 #include "xalloc.h"
28 #include "xvasprintf.h"
29 #include "format-invalid.h"
30 #include "gettext.h"
31 
32 #define _(str) gettext (str)
33 
34 /* Perl format strings are implemented in function Perl_sv_vcatpvfn in
35    perl-5.8.0/sv.c.
36    A directive
37    - starts with '%' or '%m$' where m is a positive integer starting with a
38      nonzero digit,
39    - is optionally followed by any of the characters '#', '0', '-', ' ', '+',
40      each of which acts as a flag,
41    - is optionally followed by a vector specification: 'v' or '*v' (reads an
42      argument) or '*m$v' where m is a positive integer starting with a nonzero
43      digit,
44    - is optionally followed by a width specification: '*' (reads an argument)
45      or '*m$' where m is a positive integer starting with a nonzero digit or
46      a nonempty digit sequence starting with a nonzero digit,
47    - is optionally followed by '.' and a precision specification: '*' (reads
48      an argument) or '*m$' where m is a positive integer starting with a
49      nonzero digit or a digit sequence,
50    - is optionally followed by a size specifier, one of 'h' 'l' 'll' 'L' 'q'
51      'V' 'I32' 'I64' 'I',
52    - is finished by a specifier
53        - '%', that needs no argument,
54        - 'c', that needs a small integer argument,
55        - 's', that needs a string argument,
56        - '_', that needs a scalar vector argument,
57        - 'p', that needs a pointer argument,
58        - 'i', 'd', 'D', that need an integer argument,
59        - 'u', 'U', 'b', 'o', 'O', 'x', 'X', that need an unsigned integer
60          argument,
61        - 'e', 'E', 'f', 'F', 'g', 'G', that need a floating-point argument,
62        - 'n', that needs a pointer to integer.
63    So there can be numbered argument specifications:
64    - '%m$' for the format string,
65    - '*m$v' for the vector,
66    - '*m$' for the width,
67    - '.*m$' for the precision.
68    Numbered and unnumbered argument specifications can be used in the same
69    string. The effect of '%m$' is to take argument number m, without affecting
70    the current argument number. The current argument number is incremented
71    after processing a directive with an unnumbered argument specification.
72  */
73 
/* Classification of the argument consumed by a directive.
   A value is one basic type (low three bits), optionally combined with the
   FAT_UNSIGNED flag (bit 3) and with one size code (bits 4..6).  */
enum format_arg_type
{
  /* No argument at all.  */
  FAT_NONE              = 0,

  /* Basic types, stored in bits 0..2.  */
  FAT_INTEGER           = 1,
  FAT_DOUBLE            = 2,
  FAT_CHAR              = 3,
  FAT_STRING            = 4,
  FAT_SCALAR_VECTOR     = 5,
  FAT_POINTER           = 6,
  FAT_COUNT_POINTER     = 7,

  /* Signedness flag, bit 3.  */
  FAT_UNSIGNED          = 0x08,

  /* Size codes, stored as a small number in bits 4..6.  */
  FAT_SIZE_SHORT        = 0x10,
  FAT_SIZE_V            = 0x20,
  FAT_SIZE_PTR          = 0x30,
  FAT_SIZE_LONG         = 0x40,
  FAT_SIZE_LONGLONG     = 0x50,

  /* Mask selecting the size code: the bitwise OR of all FAT_SIZE_* values.  */
  FAT_SIZE_MASK         = 0x70
};

/* In C++ the result of OR-ing enumerators is an int that does not convert
   back to the enum type implicitly, so a plain int is used there.  */
#ifdef __cplusplus
typedef int format_arg_type_t;
#else
typedef enum format_arg_type format_arg_type_t;
#endif
101 
/* One argument reference collected from a format string.  */
struct numbered_arg
{
  /* Position of the argument in the argument list, counted from 1.  */
  unsigned int number;
  /* Expected type of that argument: a basic FAT_* type, possibly combined
     with FAT_UNSIGNED and one of the FAT_SIZE_* codes.  */
  format_arg_type_t type;
};
107 
/* Result of parsing one format string (see format_parse).  */
struct spec
{
  /* Number of '%' directives encountered, including those that consume no
     argument.  */
  unsigned int directives;
  /* Number of valid entries in the 'numbered' array.  */
  unsigned int numbered_arg_count;
  /* Argument descriptions, sorted by ascending argument number with one
     entry per distinct number; NULL if the string references no argument.
     Released by format_free.  */
  struct numbered_arg *numbered;
};
114 
/* Locale independent test for a decimal digit.
   Argument can be 'char' or 'unsigned char'.  (Whereas the argument of
   <ctype.h> isdigit must be an 'unsigned char'.)
   Subtracting '0' and converting to unsigned makes every character below
   '0' wrap around to a large value, so a single comparison covers both
   ends of the range.  The argument is evaluated exactly once.  */
#undef isdigit
#define isdigit(c) ((unsigned int) ((c) - '0') < 10)

/* Locale independent test for a nonzero decimal digit ('1'..'9'), using the
   same single-comparison technique.  */
#define isnonzerodigit(c) ((unsigned int) ((c) - '1') < 9)
123 
124 
125 static int
numbered_arg_compare(const void * p1,const void * p2)126 numbered_arg_compare (const void *p1, const void *p2)
127 {
128   unsigned int n1 = ((const struct numbered_arg *) p1)->number;
129   unsigned int n2 = ((const struct numbered_arg *) p2)->number;
130 
131   return (n1 > n2 ? 1 : n1 < n2 ? -1 : 0);
132 }
133 
134 static void *
format_parse(const char * format,bool translated,char * fdi,char ** invalid_reason)135 format_parse (const char *format, bool translated, char *fdi,
136               char **invalid_reason)
137 {
138   const char *const format_start = format;
139   unsigned int directives;
140   unsigned int numbered_arg_count;
141   struct numbered_arg *numbered;
142   unsigned int numbered_allocated;
143   unsigned int unnumbered_arg_count;
144   struct spec *result;
145 
146   directives = 0;
147   numbered_arg_count = 0;
148   numbered = NULL;
149   numbered_allocated = 0;
150   unnumbered_arg_count = 0;
151 
152   for (; *format != '\0';)
153     if (*format++ == '%')
154       {
155         /* A directive.  */
156         unsigned int number = 0;
157         bool vectorize = false;
158         format_arg_type_t type;
159         format_arg_type_t size;
160 
161         FDI_SET (format - 1, FMTDIR_START);
162         directives++;
163 
164         if (isnonzerodigit (*format))
165           {
166             const char *f = format;
167             unsigned int m = 0;
168 
169             do
170               {
171                 m = 10 * m + (*f - '0');
172                 f++;
173               }
174             while (isdigit (*f));
175 
176             if (*f == '$')
177               {
178                 number = m;
179                 format = ++f;
180               }
181           }
182 
183         /* Parse flags.  */
184         while (*format == ' ' || *format == '+' || *format == '-'
185                || *format == '#' || *format == '0')
186           format++;
187 
188         /* Parse vector.  */
189         if (*format == 'v')
190           {
191             format++;
192             vectorize = true;
193           }
194         else if (*format == '*')
195           {
196             const char *f = format;
197 
198             f++;
199             if (*f == 'v')
200               {
201                 format = ++f;
202                 vectorize = true;
203 
204                 /* Unnumbered argument.  */
205                 if (numbered_allocated == numbered_arg_count)
206                   {
207                     numbered_allocated = 2 * numbered_allocated + 1;
208                     numbered = (struct numbered_arg *) xrealloc (numbered, numbered_allocated * sizeof (struct numbered_arg));
209                   }
210                 numbered[numbered_arg_count].number = ++unnumbered_arg_count;
211                 numbered[numbered_arg_count].type = FAT_SCALAR_VECTOR; /* or FAT_STRING? */
212                 numbered_arg_count++;
213               }
214             else if (isnonzerodigit (*f))
215               {
216                 unsigned int m = 0;
217 
218                 do
219                   {
220                     m = 10 * m + (*f - '0');
221                     f++;
222                   }
223                 while (isdigit (*f));
224 
225                 if (*f == '$')
226                   {
227                     f++;
228                     if (*f == 'v')
229                       {
230                         unsigned int vector_number = m;
231 
232                         format = ++f;
233                         vectorize = true;
234 
235                         /* Numbered argument.  */
236                         /* Note: As of perl-5.8.0, this is not correctly
237                            implemented in perl's sv.c.  */
238                         if (numbered_allocated == numbered_arg_count)
239                           {
240                             numbered_allocated = 2 * numbered_allocated + 1;
241                             numbered = (struct numbered_arg *) xrealloc (numbered, numbered_allocated * sizeof (struct numbered_arg));
242                           }
243                         numbered[numbered_arg_count].number = vector_number;
244                         numbered[numbered_arg_count].type = FAT_SCALAR_VECTOR; /* or FAT_STRING? */
245                         numbered_arg_count++;
246                       }
247                   }
248               }
249           }
250 
251         if (vectorize)
252           {
253             /* Numbered or unnumbered argument.  */
254             if (numbered_allocated == numbered_arg_count)
255               {
256                 numbered_allocated = 2 * numbered_allocated + 1;
257                 numbered = (struct numbered_arg *) xrealloc (numbered, numbered_allocated * sizeof (struct numbered_arg));
258               }
259             numbered[numbered_arg_count].number = (number ? number : ++unnumbered_arg_count);
260             numbered[numbered_arg_count].type = FAT_SCALAR_VECTOR;
261             numbered_arg_count++;
262           }
263 
264         /* Parse width.  */
265         if (*format == '*')
266           {
267             unsigned int width_number = 0;
268 
269             format++;
270 
271             if (isnonzerodigit (*format))
272               {
273                 const char *f = format;
274                 unsigned int m = 0;
275 
276                 do
277                   {
278                     m = 10 * m + (*f - '0');
279                     f++;
280                   }
281                 while (isdigit (*f));
282 
283                 if (*f == '$')
284                   {
285                     width_number = m;
286                     format = ++f;
287                   }
288               }
289 
290             /* Numbered or unnumbered argument.  */
291             /* Note: As of perl-5.8.0, this is not correctly
292                implemented in perl's sv.c.  */
293             if (numbered_allocated == numbered_arg_count)
294               {
295                 numbered_allocated = 2 * numbered_allocated + 1;
296                 numbered = (struct numbered_arg *) xrealloc (numbered, numbered_allocated * sizeof (struct numbered_arg));
297               }
298             numbered[numbered_arg_count].number = (width_number ? width_number : ++unnumbered_arg_count);
299             numbered[numbered_arg_count].type = FAT_INTEGER;
300             numbered_arg_count++;
301           }
302         else if (isnonzerodigit (*format))
303           {
304             do format++; while (isdigit (*format));
305           }
306 
307         /* Parse precision.  */
308         if (*format == '.')
309           {
310             format++;
311 
312             if (*format == '*')
313               {
314                 unsigned int precision_number = 0;
315 
316                 format++;
317 
318                 if (isnonzerodigit (*format))
319                   {
320                     const char *f = format;
321                     unsigned int m = 0;
322 
323                     do
324                       {
325                         m = 10 * m + (*f - '0');
326                         f++;
327                       }
328                     while (isdigit (*f));
329 
330                     if (*f == '$')
331                       {
332                         precision_number = m;
333                         format = ++f;
334                       }
335                   }
336 
337                 /* Numbered or unnumbered argument.  */
338                 if (numbered_allocated == numbered_arg_count)
339                   {
340                     numbered_allocated = 2 * numbered_allocated + 1;
341                     numbered = (struct numbered_arg *) xrealloc (numbered, numbered_allocated * sizeof (struct numbered_arg));
342                   }
343                 numbered[numbered_arg_count].number = (precision_number ? precision_number : ++unnumbered_arg_count);
344                 numbered[numbered_arg_count].type = FAT_INTEGER;
345                 numbered_arg_count++;
346               }
347             else
348               {
349                 while (isdigit (*format)) format++;
350               }
351           }
352 
353         /* Parse size.  */
354         size = 0;
355         if (*format == 'h')
356           {
357             size = FAT_SIZE_SHORT;
358             format++;
359           }
360         else if (*format == 'l')
361           {
362             if (format[1] == 'l')
363               {
364                 size = FAT_SIZE_LONGLONG;
365                 format += 2;
366               }
367             else
368               {
369                 size = FAT_SIZE_LONG;
370                 format++;
371               }
372           }
373         else if (*format == 'L' || *format == 'q')
374           {
375             size = FAT_SIZE_LONGLONG;
376             format++;
377           }
378         else if (*format == 'V')
379           {
380             size = FAT_SIZE_V;
381             format++;
382           }
383         else if (*format == 'I')
384           {
385             if (format[1] == '6' && format[2] == '4')
386               {
387                 size = FAT_SIZE_LONGLONG;
388                 format += 3;
389               }
390             else if (format[1] == '3' && format[2] == '2')
391               {
392                 size = 0; /* FAT_SIZE_INT */
393                 format += 3;
394               }
395             else
396               {
397                 size = FAT_SIZE_PTR;
398                 format++;
399               }
400           }
401 
402         switch (*format)
403           {
404           case '%':
405             type = FAT_NONE;
406             break;
407           case 'c':
408             type = FAT_CHAR;
409             break;
410           case 's':
411             type = FAT_STRING;
412             break;
413           case '_':
414             type = FAT_SCALAR_VECTOR;
415             break;
416           case 'D':
417             type = FAT_INTEGER | FAT_SIZE_V;
418             break;
419           case 'i': case 'd':
420             type = FAT_INTEGER | size;
421             break;
422           case 'U': case 'O':
423             type = FAT_INTEGER | FAT_UNSIGNED | FAT_SIZE_V;
424             break;
425           case 'u': case 'b': case 'o': case 'x': case 'X':
426             type = FAT_INTEGER | FAT_UNSIGNED | size;
427             break;
428           case 'e': case 'E': case 'f': case 'F': case 'g': case 'G':
429             if (size == FAT_SIZE_SHORT || size == FAT_SIZE_LONG)
430               {
431                 *invalid_reason =
432                   xasprintf (_("In the directive number %u, the size specifier is incompatible with the conversion specifier '%c'."), directives, *format);
433                 FDI_SET (format, FMTDIR_ERROR);
434                 goto bad_format;
435               }
436             type = FAT_DOUBLE | size;
437             break;
438           case 'p':
439             type = FAT_POINTER;
440             break;
441           case 'n':
442             type = FAT_COUNT_POINTER | size;
443             break;
444           default:
445             if (*format == '\0')
446               {
447                 *invalid_reason = INVALID_UNTERMINATED_DIRECTIVE ();
448                 FDI_SET (format - 1, FMTDIR_ERROR);
449               }
450             else
451               {
452                 *invalid_reason =
453                   INVALID_CONVERSION_SPECIFIER (directives, *format);
454                 FDI_SET (format, FMTDIR_ERROR);
455               }
456             goto bad_format;
457           }
458 
459         if (type != FAT_NONE && !vectorize)
460           {
461             /* Numbered or unnumbered argument.  */
462             if (numbered_allocated == numbered_arg_count)
463               {
464                 numbered_allocated = 2 * numbered_allocated + 1;
465                 numbered = (struct numbered_arg *) xrealloc (numbered, numbered_allocated * sizeof (struct numbered_arg));
466               }
467             numbered[numbered_arg_count].number = (number ? number : ++unnumbered_arg_count);
468             numbered[numbered_arg_count].type = type;
469             numbered_arg_count++;
470           }
471 
472         FDI_SET (format, FMTDIR_END);
473 
474         format++;
475       }
476 
477   /* Sort the numbered argument array, and eliminate duplicates.  */
478   if (numbered_arg_count > 1)
479     {
480       unsigned int i, j;
481       bool err;
482 
483       qsort (numbered, numbered_arg_count,
484              sizeof (struct numbered_arg), numbered_arg_compare);
485 
486       /* Remove duplicates: Copy from i to j, keeping 0 <= j <= i.  */
487       err = false;
488       for (i = j = 0; i < numbered_arg_count; i++)
489         if (j > 0 && numbered[i].number == numbered[j-1].number)
490           {
491             format_arg_type_t type1 = numbered[i].type;
492             format_arg_type_t type2 = numbered[j-1].type;
493             format_arg_type_t type_both;
494 
495             if (type1 == type2)
496               type_both = type1;
497             else
498               {
499                 /* Incompatible types.  */
500                 type_both = FAT_NONE;
501                 if (!err)
502                   *invalid_reason =
503                     INVALID_INCOMPATIBLE_ARG_TYPES (numbered[i].number);
504                 err = true;
505               }
506 
507             numbered[j-1].type = type_both;
508           }
509         else
510           {
511             if (j < i)
512               {
513                 numbered[j].number = numbered[i].number;
514                 numbered[j].type = numbered[i].type;
515               }
516             j++;
517           }
518       numbered_arg_count = j;
519       if (err)
520         /* *invalid_reason has already been set above.  */
521         goto bad_format;
522     }
523 
524   result = XMALLOC (struct spec);
525   result->directives = directives;
526   result->numbered_arg_count = numbered_arg_count;
527   result->numbered = numbered;
528   return result;
529 
530  bad_format:
531   if (numbered != NULL)
532     free (numbered);
533   return NULL;
534 }
535 
536 static void
format_free(void * descr)537 format_free (void *descr)
538 {
539   struct spec *spec = (struct spec *) descr;
540 
541   if (spec->numbered != NULL)
542     free (spec->numbered);
543   free (spec);
544 }
545 
546 static int
format_get_number_of_directives(void * descr)547 format_get_number_of_directives (void *descr)
548 {
549   struct spec *spec = (struct spec *) descr;
550 
551   return spec->directives;
552 }
553 
554 static bool
format_check(void * msgid_descr,void * msgstr_descr,bool equality,formatstring_error_logger_t error_logger,const char * pretty_msgid,const char * pretty_msgstr)555 format_check (void *msgid_descr, void *msgstr_descr, bool equality,
556               formatstring_error_logger_t error_logger,
557               const char *pretty_msgid, const char *pretty_msgstr)
558 {
559   struct spec *spec1 = (struct spec *) msgid_descr;
560   struct spec *spec2 = (struct spec *) msgstr_descr;
561   bool err = false;
562 
563   if (spec1->numbered_arg_count + spec2->numbered_arg_count > 0)
564     {
565       unsigned int i, j;
566       unsigned int n1 = spec1->numbered_arg_count;
567       unsigned int n2 = spec2->numbered_arg_count;
568 
569       /* Check the argument names are the same.
570          Both arrays are sorted.  We search for the first difference.  */
571       for (i = 0, j = 0; i < n1 || j < n2; )
572         {
573           int cmp = (i >= n1 ? 1 :
574                      j >= n2 ? -1 :
575                      spec1->numbered[i].number > spec2->numbered[j].number ? 1 :
576                      spec1->numbered[i].number < spec2->numbered[j].number ? -1 :
577                      0);
578 
579           if (cmp > 0)
580             {
581               if (error_logger)
582                 error_logger (_("a format specification for argument %u, as in '%s', doesn't exist in '%s'"),
583                               spec2->numbered[j].number, pretty_msgstr,
584                               pretty_msgid);
585               err = true;
586               break;
587             }
588           else if (cmp < 0)
589             {
590               if (equality)
591                 {
592                   if (error_logger)
593                     error_logger (_("a format specification for argument %u doesn't exist in '%s'"),
594                                   spec1->numbered[i].number, pretty_msgstr);
595                   err = true;
596                   break;
597                 }
598               else
599                 i++;
600             }
601           else
602             j++, i++;
603         }
604       /* Check the argument types are the same.  */
605       if (!err)
606         for (i = 0, j = 0; j < n2; )
607           {
608             if (spec1->numbered[i].number == spec2->numbered[j].number)
609               {
610                 if (spec1->numbered[i].type != spec2->numbered[j].type)
611                   {
612                     if (error_logger)
613                       error_logger (_("format specifications in '%s' and '%s' for argument %u are not the same"),
614                                     pretty_msgid, pretty_msgstr,
615                                     spec2->numbered[j].number);
616                     err = true;
617                     break;
618                   }
619                 j++, i++;
620               }
621             else
622               i++;
623           }
624     }
625 
626   return err;
627 }
628 
629 
/* Parser descriptor for Perl format strings, bundling the callbacks
   implemented in this file.
   NOTE(review): the meaning of each slot, including the one left NULL, is
   given by the declaration of 'struct formatstring_parser', which is not
   part of this file -- confirm against format.h.  */
struct formatstring_parser formatstring_perl =
{
  format_parse,
  format_free,
  format_get_number_of_directives,
  NULL,
  format_check
};
638 
639 
640 #ifdef TEST
641 
642 /* Test program: Print the argument list specification returned by
643    format_parse for strings read from standard input.  */
644 
645 #include <stdio.h>
646 
647 static void
format_print(void * descr)648 format_print (void *descr)
649 {
650   struct spec *spec = (struct spec *) descr;
651   unsigned int last;
652   unsigned int i;
653 
654   if (spec == NULL)
655     {
656       printf ("INVALID");
657       return;
658     }
659 
660   printf ("(");
661   last = 1;
662   for (i = 0; i < spec->numbered_arg_count; i++)
663     {
664       unsigned int number = spec->numbered[i].number;
665 
666       if (i > 0)
667         printf (" ");
668       if (number < last)
669         abort ();
670       for (; last < number; last++)
671         printf ("_ ");
672       if (spec->numbered[i].type & FAT_UNSIGNED)
673         printf ("[unsigned]");
674       switch (spec->numbered[i].type & FAT_SIZE_MASK)
675         {
676         case 0:
677           break;
678         case FAT_SIZE_SHORT:
679           printf ("[short]");
680           break;
681         case FAT_SIZE_V:
682           printf ("[IV]");
683           break;
684         case FAT_SIZE_PTR:
685           printf ("[PTR]");
686           break;
687         case FAT_SIZE_LONG:
688           printf ("[long]");
689           break;
690         case FAT_SIZE_LONGLONG:
691           printf ("[long long]");
692           break;
693         default:
694           abort ();
695         }
696       switch (spec->numbered[i].type & ~(FAT_UNSIGNED | FAT_SIZE_MASK))
697         {
698         case FAT_INTEGER:
699           printf ("i");
700           break;
701         case FAT_DOUBLE:
702           printf ("f");
703           break;
704         case FAT_CHAR:
705           printf ("c");
706           break;
707         case FAT_STRING:
708           printf ("s");
709           break;
710         case FAT_SCALAR_VECTOR:
711           printf ("sv");
712           break;
713         case FAT_POINTER:
714           printf ("p");
715           break;
716         case FAT_COUNT_POINTER:
717           printf ("n");
718           break;
719         default:
720           abort ();
721         }
722       last = number + 1;
723     }
724   printf (")");
725 }
726 
727 int
main()728 main ()
729 {
730   for (;;)
731     {
732       char *line = NULL;
733       size_t line_size = 0;
734       int line_len;
735       char *invalid_reason;
736       void *descr;
737 
738       line_len = getline (&line, &line_size, stdin);
739       if (line_len < 0)
740         break;
741       if (line_len > 0 && line[line_len - 1] == '\n')
742         line[--line_len] = '\0';
743 
744       invalid_reason = NULL;
745       descr = format_parse (line, false, NULL, &invalid_reason);
746 
747       format_print (descr);
748       printf ("\n");
749       if (descr == NULL)
750         printf ("%s\n", invalid_reason);
751 
752       free (invalid_reason);
753       free (line);
754     }
755 
756   return 0;
757 }
758 
759 /*
760  * For Emacs M-x compile
761  * Local Variables:
762  * compile-command: "/bin/sh ../libtool --tag=CC --mode=link gcc -o a.out -static -O -g -Wall -I.. -I../gnulib-lib -I../../gettext-runtime/intl -DHAVE_CONFIG_H -DTEST format-perl.c ../gnulib-lib/libgettextlib.la"
763  * End:
764  */
765 
766 #endif /* TEST */
767