1 /* dquote.c 2 * 3 * This file contains functions that are related to 4 * parsing double-quotish expressions. 5 * 6 */ 7 8 #include "EXTERN.h" 9 #define PERL_IN_DQUOTE_C 10 #include "perl.h" 11 12 /* XXX Add documentation after final interface and behavior is decided */ 13 14 bool 15 Perl_grok_bslash_c(pTHX_ const char source, 16 U8 * result, 17 const char** message, 18 U32 * packed_warn) 19 { 20 PERL_ARGS_ASSERT_GROK_BSLASH_C; 21 22 /* This returns TRUE if the \c? sequence is valid; FALSE otherwise. If it 23 * is valid, the sequence evaluates to a single character, which will be 24 * stored into *result. 25 * 26 * source is the character immediately after a '\c' sequence. 27 * result points to a char variable into which this function will store 28 * what the sequence evaluates to, if valid; unchanged otherwise. 29 * message A pointer to any warning or error message will be stored into 30 * this pointer; NULL if none. 31 * packed_warn if NULL on input asks that this routine display any warning 32 * messages. Otherwise, if the function found a warning, the 33 * packed warning categories will be stored into *packed_warn (and 34 * the corresponding message text into *message); 0 if none. 35 */ 36 37 *message = NULL; 38 if (packed_warn) *packed_warn = 0; 39 40 if (! isPRINT_A(source)) { 41 *message = "Character following \"\\c\" must be printable ASCII"; 42 return FALSE; 43 } 44 45 if (source == '{') { 46 const char control = toCTRL('{'); 47 if (isPRINT_A(control)) { 48 /* diag_listed_as: Use "%s" instead of "%s" */ 49 *message = Perl_form(aTHX_ "Use \"%c\" instead of \"\\c{\"", control); 50 } 51 else { 52 *message = "Sequence \"\\c{\" invalid"; 53 } 54 return FALSE; 55 } 56 57 *result = toCTRL(source); 58 if (isPRINT_A(*result) && ckWARN(WARN_SYNTAX)) { 59 U8 clearer[3]; 60 U8 i = 0; 61 char format[] = "\"\\c%c\" is more clearly written simply as \"%s\""; 62 63 if (! isWORDCHAR(*result)) { 64 clearer[i++] = '\\'; 65 } 66 clearer[i++] = *result; 67 clearer[i++] = '\0'; 68 69 if (packed_warn) { 70 *message = Perl_form(aTHX_ format, source, clearer); 71 *packed_warn = packWARN(WARN_SYNTAX); 72 } 73 else { 74 Perl_warner(aTHX_ packWARN(WARN_SYNTAX), format, source, clearer); 75 } 76 } 77 78 return TRUE; 79 } 80 81 const char * 82 Perl_form_alien_digit_msg(pTHX_ 83 const U8 which, /* 8 or 16 */ 84 const STRLEN valids_len, /* length of input before first bad char */ 85 const char * const first_bad, /* Ptr to that bad char */ 86 const char * const send, /* End of input string */ 87 const bool UTF, /* Is it in UTF-8? */ 88 const bool braced) /* Is it enclosed in {} */ 89 { 90 /* Generate a mortal SV containing an appropriate warning message about 91 * alien characters found in an octal or hex constant given by the inputs, 92 * and return a pointer to that SV's string. The message looks like: 93 * 94 * Non-hex character '?' terminates \x early. Resolved as "\x{...}" 95 * 96 */ 97 98 /* The usual worst case scenario: 2 chars to display per byte, plus \x{} 99 * (leading zeros could take up more space, and the scalar will 100 * automatically grow if necessary). Space for NUL is added by the newSV() 101 * function */ 102 SV * display_char = newSV(2 * UTF8_MAXBYTES + 4); 103 SV * message_sv = sv_newmortal(); 104 char symbol; 105 106 PERL_ARGS_ASSERT_FORM_ALIEN_DIGIT_MSG; 107 assert(which == 8 || which == 16); 108 109 /* Calculate the display form of the character */ 110 if ( UVCHR_IS_INVARIANT(*first_bad) 111 || (UTF && isUTF8_CHAR((U8 *) first_bad, (U8 *) send))) 112 { 113 pv_uni_display(display_char, (U8 *) first_bad, UTF8SKIP(first_bad), 114 (STRLEN) -1, UNI_DISPLAY_QQ); 115 } 116 else { /* Is not UTF-8, or is illegal UTF-8. Show just the one byte */ 117 118 /* It also isn't a UTF-8 invariant character, so no display shortcuts 119 * are available. Use \\x{...} */ 120 Perl_sv_setpvf(aTHX_ display_char, "\\x{%02x}", *first_bad); 121 } 122 123 /* Ready to start building the message */ 124 sv_setpvs(message_sv, "Non-"); 125 if (which == 8) { 126 sv_catpvs(message_sv, "octal"); 127 if (braced) { 128 symbol = 'o'; 129 } 130 else { 131 symbol = '0'; /* \008, for example */ 132 } 133 } 134 else { 135 sv_catpvs(message_sv, "hex"); 136 symbol = 'x'; 137 } 138 sv_catpvs(message_sv, " character "); 139 140 if (isPRINT(*first_bad)) { 141 sv_catpvs(message_sv, "'"); 142 } 143 sv_catsv(message_sv, display_char); 144 if (isPRINT(*first_bad)) { 145 sv_catpvs(message_sv, "'"); 146 } 147 Perl_sv_catpvf(aTHX_ message_sv, " terminates \\%c early. Resolved as " 148 "\"\\%c", symbol, symbol); 149 if (braced) { 150 sv_catpvs(message_sv, "{"); 151 } 152 153 /* Octal constants have an extra leading 0, but \0 already includes that */ 154 if (symbol == 'o' && valids_len < 3) { 155 sv_catpvs(message_sv, "0"); 156 } 157 if (valids_len == 0) { /* No legal digits at all */ 158 sv_catpvs(message_sv, "00"); 159 } 160 else if (valids_len == 1) { /* Just one is legal */ 161 sv_catpvs(message_sv, "0"); 162 } 163 sv_catpvn(message_sv, first_bad - valids_len, valids_len); 164 165 if (braced) { 166 sv_catpvs(message_sv, "}"); 167 } 168 else { 169 sv_catsv(message_sv, display_char); 170 } 171 sv_catpvs(message_sv, "\""); 172 173 SvREFCNT_dec_NN(display_char); 174 175 return SvPVX_const(message_sv); 176 } 177 178 const char * 179 Perl_form_cp_too_large_msg(pTHX_ 180 const U8 which, /* 8 or 16 */ 181 const char * string, /* NULL, or the text that is supposed to 182 represent a code point */ 183 const Size_t len, /* length of 'string' if not NULL; else 0 */ 184 const UV cp) /* 0 if 'string' not NULL; else the too-large 185 code point */ 186 { 187 /* Generate a mortal SV containing an appropriate warning message about 188 * code points that are too large for this system, given by the inputs, 189 * and return a pointer to that SV's string. Either the text of the string 190 * to be converted to a code point is input, or a code point itself. The 191 * former is needed to accurately represent something that overflows. 192 * 193 * The message looks like: 194 * 195 * Use of code point %s is not allowed; the permissible max is %s 196 * 197 */ 198 199 SV * message_sv = sv_newmortal(); 200 const char * format; 201 const char * prefix; 202 203 PERL_ARGS_ASSERT_FORM_CP_TOO_LARGE_MSG; 204 assert(which == 8 || which == 16); 205 206 /* One but not both must be non-zero */ 207 assert((string != NULL) ^ (cp != 0)); 208 assert((string == NULL) || len); 209 210 if (which == 8) { 211 format = "%" UVof; 212 prefix = "0"; 213 } 214 else { 215 format = "%" UVXf; 216 prefix = "0x"; 217 } 218 219 Perl_sv_setpvf(aTHX_ message_sv, "Use of code point %s", prefix); 220 if (string) { 221 Perl_sv_catpvf(aTHX_ message_sv, "%.*s", (int) len, string); 222 } 223 else { 224 Perl_sv_catpvf(aTHX_ message_sv, format, cp); 225 } 226 Perl_sv_catpvf(aTHX_ message_sv, " is not allowed; the permissible max is %s", prefix); 227 Perl_sv_catpvf(aTHX_ message_sv, format, MAX_LEGAL_CP); 228 229 return SvPVX_const(message_sv); 230 } 231 232 bool 233 Perl_grok_bslash_o(pTHX_ char **s, const char * const send, UV *uv, 234 const char** message, 235 U32 * packed_warn, 236 const bool strict, 237 const bool allow_UV_MAX, 238 const bool UTF) 239 { 240 241 /* Documentation to be supplied when interface nailed down finally 242 * This returns FALSE if there is an error the caller should probably die 243 * from; otherwise TRUE. 244 * s is the address of a pointer to a string. **s is 'o', and the 245 * previous character was a backslash. At exit, *s will be advanced 246 * to the byte just after those absorbed by this function. Hence the 247 * caller can continue parsing from there. In the case of an error 248 * when this function returns FALSE, continuing to parse is not an 249 * option, this routine has generally positioned *s to point just to 250 * the right of the first bad spot, so that a message that has a "<--" 251 * to mark the spot will be correctly positioned. 252 * send - 1 gives a limit in *s that this function is not permitted to 253 * look beyond. That is, the function may look at bytes only in the 254 * range *s..send-1 255 * uv points to a UV that will hold the output value, valid only if the 256 * return from the function is TRUE; may be changed from the input 257 * value even when FALSE is returned. 258 * message A pointer to any warning or error message will be stored into 259 * this pointer; NULL if none. 260 * packed_warn if NULL on input asks that this routine display any warning 261 * messages. Otherwise, if the function found a warning, the packed 262 * warning categories will be stored into *packed_warn (and the 263 * corresponding message text into *message); 0 if none. 264 * strict is true if this should fail instead of warn if there are 265 * non-octal digits within the braces 266 * allow_UV_MAX is true if this shouldn't fail if the input code point is 267 * UV_MAX, which is normally illegal, reserved for internal use. 268 * UTF is true iff the string *s is encoded in UTF-8. 269 */ 270 char* e; 271 STRLEN numbers_len; 272 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES 273 | PERL_SCAN_DISALLOW_PREFIX 274 | PERL_SCAN_SILENT_NON_PORTABLE 275 | PERL_SCAN_SILENT_ILLDIGIT 276 | PERL_SCAN_SILENT_OVERFLOW; 277 278 PERL_ARGS_ASSERT_GROK_BSLASH_O; 279 280 assert(*(*s - 1) == '\\'); 281 assert(* *s == 'o'); 282 283 *message = NULL; 284 if (packed_warn) *packed_warn = 0; 285 286 (*s)++; 287 288 if (send <= *s || **s != '{') { 289 *message = "Missing braces on \\o{}"; 290 return FALSE; 291 } 292 293 e = (char *) memchr(*s, '}', send - *s); 294 if (!e) { 295 (*s)++; /* Move past the '{' */ 296 while (isOCTAL(**s)) { /* Position beyond the legal digits */ 297 (*s)++; 298 } 299 *message = "Missing right brace on \\o{}"; 300 return FALSE; 301 } 302 303 (*s)++; /* Point to expected first digit (could be first byte of utf8 304 sequence if not a digit) */ 305 numbers_len = e - *s; 306 if (numbers_len == 0) { 307 (*s)++; /* Move past the '}' */ 308 *message = "Empty \\o{}"; 309 return FALSE; 310 } 311 312 *uv = grok_oct(*s, &numbers_len, &flags, NULL); 313 if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX) 314 || (! allow_UV_MAX && *uv == UV_MAX))) 315 { 316 *message = form_cp_too_large_msg(8, *s, numbers_len, 0); 317 *s = e + 1; 318 return FALSE; 319 } 320 321 /* Note that if has non-octal, will ignore everything starting with that up 322 * to the '}' */ 323 if (numbers_len != (STRLEN) (e - *s)) { 324 *s += numbers_len; 325 if (strict) { 326 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; 327 *message = "Non-octal character"; 328 return FALSE; 329 } 330 331 if (ckWARN(WARN_DIGIT)) { 332 const char * failure = form_alien_digit_msg(8, numbers_len, *s, send, 333 UTF, TRUE); 334 if (packed_warn) { 335 *message = failure; 336 *packed_warn = packWARN(WARN_DIGIT); 337 } 338 else { 339 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure); 340 } 341 } 342 } 343 344 /* Return past the '}' */ 345 *s = e + 1; 346 347 return TRUE; 348 } 349 350 bool 351 Perl_grok_bslash_x(pTHX_ char ** s, const char * const send, UV *uv, 352 const char** message, 353 U32 * packed_warn, 354 const bool strict, 355 const bool allow_UV_MAX, 356 const bool UTF) 357 { 358 359 /* Documentation to be supplied when interface nailed down finally 360 * This returns FALSE if there is an error the caller should probably die 361 * from; otherwise TRUE. 362 * It guarantees that the returned codepoint, *uv, when expressed as 363 * utf8 bytes, would fit within the skipped "\x{...}" bytes. 364 * 365 * On input: 366 * s is the address of a pointer to a string. **s is 'x', and the 367 * previous character was a backslash. At exit, *s will be advanced 368 * to the byte just after those absorbed by this function. Hence the 369 * caller can continue parsing from there. In the case of an error, 370 * this routine has generally positioned *s to point just to the right 371 * of the first bad spot, so that a message that has a "<--" to mark 372 * the spot will be correctly positioned. 373 * send - 1 gives a limit in *s that this function is not permitted to 374 * look beyond. That is, the function may look at bytes only in the 375 * range *s..send-1 376 * uv points to a UV that will hold the output value, valid only if the 377 * return from the function is TRUE; may be changed from the input 378 * value even when FALSE is returned. 379 * message A pointer to any warning or error message will be stored into 380 * this pointer; NULL if none. 381 * packed_warn if NULL on input asks that this routine display any warning 382 * messages. Otherwise, if the function found a warning, the packed 383 * warning categories will be stored into *packed_warn (and the 384 * corresponding message text into *message); 0 if none. 385 * strict is true if anything out of the ordinary should cause this to 386 * fail instead of warn or be silent. For example, it requires 387 * exactly 2 digits following the \x (when there are no braces). 388 * 3 digits could be a mistake, so is forbidden in this mode. 389 * allow_UV_MAX is true if this shouldn't fail if the input code point is 390 * UV_MAX, which is normally illegal, reserved for internal use. 391 * UTF is true iff the string *s is encoded in UTF-8. 392 */ 393 char* e; 394 STRLEN numbers_len; 395 I32 flags = PERL_SCAN_DISALLOW_PREFIX 396 | PERL_SCAN_SILENT_ILLDIGIT 397 | PERL_SCAN_NOTIFY_ILLDIGIT 398 | PERL_SCAN_SILENT_NON_PORTABLE 399 | PERL_SCAN_SILENT_OVERFLOW; 400 401 PERL_ARGS_ASSERT_GROK_BSLASH_X; 402 403 assert(*(*s - 1) == '\\'); 404 assert(* *s == 'x'); 405 406 *message = NULL; 407 if (packed_warn) *packed_warn = 0; 408 409 (*s)++; 410 411 if (send <= *s) { 412 if (strict) { 413 *message = "Empty \\x"; 414 return FALSE; 415 } 416 417 /* Sadly, to preserve backcompat, an empty \x at the end of string is 418 * interpreted as a NUL */ 419 *uv = 0; 420 return TRUE; 421 } 422 423 if (**s != '{') { 424 numbers_len = (strict) ? 3 : 2; 425 426 *uv = grok_hex(*s, &numbers_len, &flags, NULL); 427 *s += numbers_len; 428 429 if (numbers_len != 2 && (strict || (flags & PERL_SCAN_NOTIFY_ILLDIGIT))) { 430 if (numbers_len == 3) { /* numbers_len 3 only happens with strict */ 431 *message = "Use \\x{...} for more than two hex characters"; 432 return FALSE; 433 } 434 else if (strict) { 435 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; 436 *message = "Non-hex character"; 437 return FALSE; 438 } 439 else if (ckWARN(WARN_DIGIT)) { 440 const char * failure = form_alien_digit_msg(16, numbers_len, *s, 441 send, UTF, FALSE); 442 443 if (! packed_warn) { 444 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure); 445 } 446 else { 447 *message = failure; 448 *packed_warn = packWARN(WARN_DIGIT); 449 } 450 } 451 } 452 return TRUE; 453 } 454 455 e = (char *) memchr(*s, '}', send - *s); 456 if (!e) { 457 (*s)++; /* Move past the '{' */ 458 while (*s < send && isXDIGIT(**s)) { /* Position beyond legal digits */ 459 (*s)++; 460 } 461 *message = "Missing right brace on \\x{}"; 462 return FALSE; 463 } 464 465 (*s)++; /* Point to expected first digit (could be first byte of utf8 466 sequence if not a digit) */ 467 numbers_len = e - *s; 468 if (numbers_len == 0) { 469 if (strict) { 470 (*s)++; /* Move past the } */ 471 *message = "Empty \\x{}"; 472 return FALSE; 473 } 474 *s = e + 1; 475 *uv = 0; 476 return TRUE; 477 } 478 479 flags |= PERL_SCAN_ALLOW_UNDERSCORES; 480 481 *uv = grok_hex(*s, &numbers_len, &flags, NULL); 482 if (UNLIKELY( (flags & PERL_SCAN_GREATER_THAN_UV_MAX) 483 || (! allow_UV_MAX && *uv == UV_MAX))) 484 { 485 *message = form_cp_too_large_msg(16, *s, numbers_len, 0); 486 *s = e + 1; 487 return FALSE; 488 } 489 490 if (numbers_len != (STRLEN) (e - *s)) { 491 *s += numbers_len; 492 if (strict) { 493 *s += (UTF) ? UTF8_SAFE_SKIP(*s, send) : 1; 494 *message = "Non-hex character"; 495 return FALSE; 496 } 497 498 if (ckWARN(WARN_DIGIT)) { 499 const char * failure = form_alien_digit_msg(16, numbers_len, *s, 500 send, UTF, TRUE); 501 if (! packed_warn) { 502 Perl_warner(aTHX_ packWARN(WARN_DIGIT), "%s", failure); 503 } 504 else { 505 *message = failure; 506 *packed_warn = packWARN(WARN_DIGIT); 507 } 508 } 509 } 510 511 /* Return past the '}' */ 512 *s = e + 1; 513 514 return TRUE; 515 } 516 517 /* 518 * ex: set ts=8 sts=4 sw=4 et: 519 */ 520