xref: /openbsd/gnu/usr.bin/perl/dquote.c (revision e0680481)
1 /*    dquote.c
2  *
3  * This file contains functions that are related to
4  * parsing double-quotish expressions.
5  *
6 */
7 
8 #include "EXTERN.h"
9 #define PERL_IN_DQUOTE_C
10 #include "perl.h"
11 
12 /* XXX Add documentation after final interface and behavior is decided */
13 
14 bool
Perl_grok_bslash_c(pTHX_ const char source,U8 * result,const char ** message,U32 * packed_warn)15 Perl_grok_bslash_c(pTHX_ const char   source,
16                          U8 *         result,
17                          const char** message,
18                          U32 *        packed_warn)
19 {
20     PERL_ARGS_ASSERT_GROK_BSLASH_C;
21 
22     /* This returns TRUE if the \c? sequence is valid; FALSE otherwise.  If it
23      * is valid, the sequence evaluates to a single character, which will be
24      * stored into *result.
25      *
26      * source   is the character immediately after a '\c' sequence.
27      * result   points to a char variable into which this function will store
28      *          what the sequence evaluates to, if valid; unchanged otherwise.
29      * message  A pointer to any warning or error message will be stored into
30      *          this pointer; NULL if none.
31      * packed_warn if NULL on input asks that this routine display any warning
32      *          messages.  Otherwise, if the function found a warning, the
33      *          packed warning categories will be stored into *packed_warn (and
34      *          the corresponding message text into *message); 0 if none.
35      */
36 
37     *message = NULL;
38     if (packed_warn) *packed_warn = 0;
39 
40     if (! isPRINT_A(source)) {
41         *message = "Character following \"\\c\" must be printable ASCII";
42         return FALSE;
43     }
44 
45     if (source == '{') {
46         const char control = toCTRL('{');
47         if (isPRINT_A(control)) {
48             /* diag_listed_as: Use "%s" instead of "%s" */
49             *message = Perl_form(aTHX_ PERL_DIAG_DIE_SYNTAX("Use \"%c\" instead of \"\\c{\""), control);
50         }
51         else {
52             *message = "Sequence \"\\c{\" invalid";
53         }
54         return FALSE;
55     }
56 
57     *result = toCTRL(source);
58     if (isPRINT_A(*result) && ckWARN(WARN_SYNTAX)) {
59         U8 clearer[3];
60         U8 i = 0;
61         char format[] = PERL_DIAG_WARN_SYNTAX("\"\\c%c\" is more clearly written simply as \"%s\"");
62 
63         if (! isWORDCHAR(*result)) {
64             clearer[i++] = '\\';
65         }
66         clearer[i++] = *result;
67         clearer[i++] = '\0';
68 
69         if (packed_warn) {
70             *message = Perl_form(aTHX_ format, source, clearer);
71             *packed_warn = packWARN(WARN_SYNTAX);
72         }
73         else {
74             Perl_warner(aTHX_ packWARN(WARN_SYNTAX), format, source, clearer);
75         }
76     }
77 
78     return TRUE;
79 }
80 
81 const char *
Perl_form_alien_digit_msg(pTHX_ const U8 which,const STRLEN valids_len,const char * const first_bad,const char * const send,const bool UTF,const bool braced)82 Perl_form_alien_digit_msg(pTHX_
83         const U8 which,           /* 8 or 16 */
84         const STRLEN valids_len,  /* length of input before first bad char */
85         const char * const first_bad, /* Ptr to that bad char */
86         const char * const send,      /* End of input string */
87         const bool UTF,               /* Is it in UTF-8? */
88         const bool braced)            /* Is it enclosed in {} */
89 {
90     /* Generate a mortal SV containing an appropriate warning message about
91      * alien characters found in an octal or hex constant given by the inputs,
92      * and return a pointer to that SV's string.  The message looks like:
93      *
94      * Non-hex character '?' terminates \x early.  Resolved as "\x{...}"
95      *
96      */
97 
98     /* The usual worst case scenario: 2 chars to display per byte, plus \x{}
99      * (leading zeros could take up more space, and the scalar will
100      * automatically grow if necessary).  Space for NUL is added by the newSV()
101      * function */
102     SV * display_char = newSV(2 * UTF8_MAXBYTES + 4);
103     SV * message_sv = sv_newmortal();
104     char symbol;
105 
106     PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG;
107     assert(which == 8 || which == 16);
108 
109     /* Calculate the display form of the character */
110     if (    UVCHR_IS_INVARIANT(*first_bad)
111         || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send)))
112     {
113         pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad),
114                                                     (STRLEN) -1, UNI_DISPLAY_QQ);
115     }
116     else {  /* Is not UTF-8, or is illegal UTF-8.  Show just the one byte */
117 
118         /* It also isn't a UTF-8 invariant character, so no display shortcuts
119          * are available.  Use \\x{...} */
120         Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad);
121     }
122 
123     /* Ready to start building the message */
124     sv_setpvs(message_sv, "Non-");
125     if (which == 8) {
126         sv_catpvs(message_sv, "octal");
127         if (braced) {
128             symbol = 'o';
129         }
130         else {
131             symbol = '0';   /* \008, for example */
132         }
133     }
134     else {
135         sv_catpvs(message_sv, "hex");
136         symbol = 'x';
137     }
138     sv_catpvs(message_sv, " character ");
139 
140     if (isPRINT(*first_bad)) {
141         sv_catpvs(message_sv, "'");
142     }
143     sv_catsv(message_sv, display_char);
144     if (isPRINT(*first_bad)) {
145         sv_catpvs(message_sv, "'");
146     }
147     Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early.  Resolved as "
148                                      "\"\\%c", symbol, symbol);
149     if (braced) {
150         sv_catpvs(message_sv, "{");
151     }
152 
153     /* Octal constants have an extra leading 0, but \0 already includes that */
154     if (symbol == 'o' && valids_len < 3) {
155         sv_catpvs(message_sv, "0");
156     }
157     if (valids_len == 0) {  /* No legal digits at all */
158         sv_catpvs(message_sv, "00");
159     }
160     else if (valids_len == 1) { /* Just one is legal */
161         sv_catpvs(message_sv, "0");
162     }
163     sv_catpvn(message_sv, first_bad - valids_len, valids_len);
164 
165     if (braced) {
166         sv_catpvs(message_sv, "}");
167     }
168     else {
169         sv_catsv(message_sv, display_char);
170     }
171     sv_catpvs(message_sv, "\"");
172 
173     SvREFCNT_dec_NN(display_char);
174 
175     return SvPVX_const(message_sv);
176 }
177 
178 const char *
Perl_form_cp_too_large_msg(pTHX_ const U8 which,const char * string,const Size_t len,const UV cp)179 Perl_form_cp_too_large_msg(pTHX_
180         const U8 which,        /* 8 or 16 */
181         const char * string,   /* NULL, or the text that is supposed to
182                                   represent a code point */
183         const Size_t len,      /* length of 'string' if not NULL; else 0 */
184         const UV cp)           /* 0 if 'string' not NULL; else the too-large
185                                   code point */
186 {
187     /* Generate a mortal SV containing an appropriate warning message about
188      * code points that are too large for this system, given by the inputs,
189      * and return a pointer to that SV's string.  Either the text of the string
190      * to be converted to a code point is input, or a code point itself.  The
191      * former is needed to accurately represent something that overflows.
192      *
193      * The message looks like:
194      *
195      * Use of code point %s is not allowed; the permissible max is %s
196      *
197      */
198 
199     SV * message_sv = sv_newmortal();
200     const char * format;
201     const char * prefix;
202 
203     PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG;
204     assert(which == 8 || which == 16);
205 
206     /* One but not both must be non-zero */
207     assert((string != NULL) ^ (cp != 0));
208     assert((string == NULL) || len);
209 
210     if (which == 8) {
211         format = "%" UVof;
212         prefix = "0";
213     }
214     else {
215         format = "%" UVXf;
216         prefix = "0x";
217     }
218 
219     Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix);
220     if (string) {
221         Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string);
222     }
223     else {
224         Perl_sv_catpvf(aTHX_ message_sv, format, cp);
225     }
226     Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix);
227     Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP);
228 
229     return SvPVX_const(message_sv);
230 }
231 
232 bool
Perl_grok_bslash_o(pTHX_ char ** s,const char * const send,UV * uv,const char ** message,U32 * packed_warn,const bool strict,const bool allow_UV_MAX,const bool UTF)233 Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv,
234                       const char** message,
235                       U32 *      packed_warn,
236                       const bool strict,
237                       const bool allow_UV_MAX,
238                       const bool UTF)
239 {
240 
241 /*  Documentation to be supplied when interface nailed down finally
242  *  This returns FALSE if there is an error the caller should probably die
243  *  from; otherwise TRUE.
244  *	s   is the address of a pointer to a string.  **s is 'o', and the
245  *	    previous character was a backslash.  At exit, *s will be advanced
246  *	    to the byte just after those absorbed by this function.  Hence the
247  *	    caller can continue parsing from there.  In the case of an error
248  *	    when this function returns FALSE, continuing to parse is not an
249  *	    option, this routine has generally positioned *s to point just to
250  *	    the right of the first bad spot, so that a message that has a "<--"
251  *	    to mark the spot will be correctly positioned.
252  *	send - 1  gives a limit in *s that this function is not permitted to
253  *	    look beyond.  That is, the function may look at bytes only in the
254  *	    range *s..send-1
255  *	uv  points to a UV that will hold the output value, valid only if the
256  *	    return from the function is TRUE; may be changed from the input
257  *	    value even when FALSE is returned.
258  *      message  A pointer to any warning or error message will be stored into
259  *          this pointer; NULL if none.
260  *      packed_warn if NULL on input asks that this routine display any warning
261  *          messages.  Otherwise, if the function found a warning, the packed
262  *          warning categories will be stored into *packed_warn (and the
263  *          corresponding message text into *message); 0 if none.
264  *	strict is true if this should fail instead of warn if there are
265  *	    non-octal digits within the braces
266  *      allow_UV_MAX is true if this shouldn't fail if the input code point is
267  *          UV_MAX, which is normally illegal, reserved for internal use.
268  *	UTF is true iff the string *s is encoded in UTF-8.
269  */
270     char * e;
271     char * rbrace;
272     STRLEN numbers_len;
273     STRLEN trailing_blanks_len = 0;
274     I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
275               | PERL_SCAN_DISALLOW_PREFIX
276               | PERL_SCAN_SILENT_NON_PORTABLE
277               | PERL_SCAN_SILENT_ILLDIGIT
278               | PERL_SCAN_SILENT_OVERFLOW;
279 
280     PERL_ARGS_ASSERT_GROK_BSLASH_O;
281 
282     assert(*(*s - 1) == '\\');
283     assert(* *s       == 'o');
284 
285     *message = NULL;
286     if (packed_warn) *packed_warn = 0;
287 
288     (*s)++;
289 
290     if (send <= *s || **s != '{') {
291         *message = "Missing braces on \\o{}";
292         return FALSE;
293     }
294 
295     rbrace = (char *) memchr(*s, '}', send - *s);
296     if (!rbrace) {
297         (*s)++;  /* Move past the '{' */
298 
299         /* Position beyond the legal digits and blanks */
300         while (*s < send && isBLANK(**s)) {
301             (*s)++;
302         }
303 
304         while (*s < send && isOCTAL(**s)) {
305             (*s)++;
306         }
307 
308         *message = "Missing right brace on \\o{}";
309         return FALSE;
310     }
311 
312     /* Point to expected first digit (could be first byte of utf8 sequence if
313      * not a digit) */
314     (*s)++;
315     while (isBLANK(**s)) {
316         (*s)++;
317     }
318 
319     e = rbrace;
320     while (*s < e && isBLANK(*(e - 1))) {
321         e--;
322     }
323 
324     numbers_len = e - *s;
325     if (numbers_len == 0) {
326         (*s)++;    /* Move past the '}' */
327         *message = "Empty \\o{}";
328         return FALSE;
329     }
330 
331     *uv = grok_oct(*s, &numbers_len, &flags, NULL);
332     if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
333                  || (! allow_UV_MAX && *uv == UV_MAX)))
334     {
335         *message = form_cp_too_large_msg(8, *s, numbers_len, 0);
336         *s = rbrace + 1;
337         return FALSE;
338     }
339 
340     while (isBLANK(**s)) {
341         trailing_blanks_len++;
342         (*s)++;
343     }
344 
345     /* Note that if has non-octal, will ignore everything starting with that up
346      * to the '}' */
347     if (numbers_len + trailing_blanks_len != (STRLEN) (e - *s)) {
348         *s += numbers_len;
349         if (strict) {
350             *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
351             *message = "Non-octal character";
352             return FALSE;
353         }
354 
355         if (ckWARN(WARN_DIGIT)) {
356             const char * failure = form_alien_digit_msg(8, numbers_len, *s, send,
357                                                                       UTF, TRUE);
358             if (packed_warn) {
359                 *message = failure;
360                 *packed_warn = packWARN(WARN_DIGIT);
361             }
362             else {
363                 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
364             }
365         }
366     }
367 
368     /* Return past the '}' */
369     *s = rbrace + 1;
370 
371     return TRUE;
372 }
373 
374 bool
Perl_grok_bslash_x(pTHX_ char ** s,const char * const send,UV * uv,const char ** message,U32 * packed_warn,const bool strict,const bool allow_UV_MAX,const bool UTF)375 Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv,
376                       const char** message,
377                       U32 *      packed_warn,
378                       const bool strict,
379                       const bool allow_UV_MAX,
380                       const bool UTF)
381 {
382 
383 /*  Documentation to be supplied when interface nailed down finally
384  *  This returns FALSE if there is an error the caller should probably die
385  *  from; otherwise TRUE.
386  *  It guarantees that the returned codepoint, *uv, when expressed as
387  *  utf8 bytes, would fit within the skipped "\x{...}" bytes.
388  *
389  *  On input:
390  *	s   is the address of a pointer to a string.  **s is 'x', and the
391  *	    previous character was a backslash.  At exit, *s will be advanced
392  *	    to the byte just after those absorbed by this function.  Hence the
393  *	    caller can continue parsing from there.  In the case of an error,
394  *	    this routine has generally positioned *s to point just to the right
395  *	    of the first bad spot, so that a message that has a "<--" to mark
396  *	    the spot will be correctly positioned.
397  *	send - 1  gives a limit in *s that this function is not permitted to
398  *	    look beyond.  That is, the function may look at bytes only in the
399  *	    range *s..send-1
400  *	uv  points to a UV that will hold the output value, valid only if the
401  *	    return from the function is TRUE; may be changed from the input
402  *	    value even when FALSE is returned.
403  *      message  A pointer to any warning or error message will be stored into
404  *          this pointer; NULL if none.
405  *      packed_warn if NULL on input asks that this routine display any warning
406  *          messages.  Otherwise, if the function found a warning, the packed
407  *          warning categories will be stored into *packed_warn (and the
408  *          corresponding message text into *message); 0 if none.
409  *	strict is true if anything out of the ordinary should cause this to
410  *	    fail instead of warn or be silent.  For example, it requires
411  *	    exactly 2 digits following the \x (when there are no braces).
412  *	    3 digits could be a mistake, so is forbidden in this mode.
413  *      allow_UV_MAX is true if this shouldn't fail if the input code point is
414  *          UV_MAX, which is normally illegal, reserved for internal use.
415  *	UTF is true iff the string *s is encoded in UTF-8.
416  */
417     char* e;
418     char * rbrace;
419     STRLEN numbers_len;
420     STRLEN trailing_blanks_len = 0;
421     I32 flags = PERL_SCAN_DISALLOW_PREFIX
422               | PERL_SCAN_SILENT_ILLDIGIT
423               | PERL_SCAN_NOTIFY_ILLDIGIT
424               | PERL_SCAN_SILENT_NON_PORTABLE
425               | PERL_SCAN_SILENT_OVERFLOW;
426 
427     PERL_ARGS_ASSERT_GROK_BSLASH_X;
428 
429     assert(*(*s - 1) == '\\');
430     assert(* *s      == 'x');
431 
432     *message = NULL;
433     if (packed_warn) *packed_warn = 0;
434 
435     (*s)++;
436 
437     if (send <= *s) {
438         if (strict) {
439             *message = "Empty \\x";
440             return FALSE;
441         }
442 
443         /* Sadly, to preserve backcompat, an empty \x at the end of string is
444          * interpreted as a NUL */
445         *uv = 0;
446         return TRUE;
447     }
448 
449     if (**s != '{') {
450         numbers_len = (strict) ? 3 : 2;
451 
452         *uv = grok_hex(*s, &numbers_len, &flags, NULL);
453         *s += numbers_len;
454 
455         if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) {
456             if (numbers_len == 3) { /* numbers_len 3 only happens with strict */
457                 *message = "Use \\x{...} for more than two hex characters";
458                 return FALSE;
459             }
460             else if (strict) {
461                     *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
462                     *message = "Non-hex character";
463                     return FALSE;
464             }
465             else if (ckWARN(WARN_DIGIT)) {
466                 const char * failure = form_alien_digit_msg(16, numbers_len, *s,
467                                                               send, UTF, FALSE);
468 
469                 if (! packed_warn) {
470                     Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
471                 }
472                 else {
473                     *message = failure;
474                     *packed_warn = packWARN(WARN_DIGIT);
475                 }
476             }
477         }
478         return TRUE;
479     }
480 
481     rbrace = (char *) memchr(*s, '}', send - *s);
482     if (!rbrace) {
483         (*s)++;  /* Move past the '{' */
484 
485         /* Position beyond legal blanks and digits */
486         while (*s < send && isBLANK(**s)) {
487             (*s)++;
488         }
489 
490         while (*s < send && isXDIGIT(**s)) {
491             (*s)++;
492         }
493 
494         *message = "Missing right brace on \\x{}";
495         return FALSE;
496     }
497 
498     (*s)++;    /* Point to expected first digit (could be first byte of utf8
499                   sequence if not a digit) */
500     while (isBLANK(**s)) {
501         (*s)++;
502     }
503 
504     e = rbrace;
505     while (*s < e && isBLANK(*(e - 1))) {
506         e--;
507     }
508 
509     numbers_len = e - *s;
510     if (numbers_len == 0) {
511         if (strict) {
512             (*s)++;    /* Move past the } */
513             *message = "Empty \\x{}";
514             return FALSE;
515         }
516         *s = rbrace + 1;
517         *uv = 0;
518         return TRUE;
519     }
520 
521     flags |= PERL_SCAN_ALLOW_UNDERSCORES;
522 
523     *uv = grok_hex(*s, &numbers_len, &flags, NULL);
524     if (UNLIKELY(   (flags & PERL_SCAN_GREATER_THAN_UV_MAX)
525                  || (! allow_UV_MAX && *uv == UV_MAX)))
526     {
527         *message = form_cp_too_large_msg(16, *s, numbers_len, 0);
528         *s = e + 1;
529         return FALSE;
530     }
531 
532     while (isBLANK(**s)) {
533         trailing_blanks_len++;
534         (*s)++;
535     }
536 
537     if (numbers_len + trailing_blanks_len != (STRLEN) (e - *s)) {
538         *s += numbers_len;
539         if (strict) {
540             *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1;
541             *message = "Non-hex character";
542             return FALSE;
543         }
544 
545         if (ckWARN(WARN_DIGIT)) {
546             const char * failure = form_alien_digit_msg(16, numbers_len, *s,
547                                                                 send, UTF, TRUE);
548             if (! packed_warn) {
549                 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure);
550             }
551             else {
552                 *message = failure;
553                 *packed_warn = packWARN(WARN_DIGIT);
554             }
555         }
556     }
557 
558     /* Return past the '}' */
559     *s = rbrace + 1;
560 
561     return TRUE;
562 }
563 
564 /*
565  * ex: set ts=8 sts=4 sw=4 et:
566  */
567