1 /* toke.c
2 *
3 * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
4 * 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11 /*
12 * 'It all comes from here, the stench and the peril.' --Frodo
13 *
14 * [p.719 of _The Lord of the Rings_, IV/ix: "Shelob's Lair"]
15 */
16
17 /*
18 * This file is the lexer for Perl. It's closely linked to the
19 * parser, perly.y.
20 *
21 * The main routine is yylex(), which returns the next token.
22 */
23
24 /*
25 =head1 Lexer interface
26 This is the lower layer of the Perl parser, managing characters and tokens.
27
28 =for apidoc AmnU|yy_parser *|PL_parser
29
30 Pointer to a structure encapsulating the state of the parsing operation
31 currently in progress. The pointer can be locally changed to perform
32 a nested parse without interfering with the state of an outer parse.
33 Individual members of C<PL_parser> have their own documentation.
34
35 =cut
36 */
37
38 #include "EXTERN.h"
39 #define PERL_IN_TOKE_C
40 #include "perl.h"
41 #include "invlist_inline.h"
42
43 #define new_constant(a,b,c,d,e,f,g, h) \
44 S_new_constant(aTHX_ a,b,STR_WITH_LEN(c),d,e,f, g, h)
45
46 #define pl_yylval (PL_parser->yylval)
47
48 /* XXX temporary backwards compatibility */
49 #define PL_lex_brackets (PL_parser->lex_brackets)
50 #define PL_lex_allbrackets (PL_parser->lex_allbrackets)
51 #define PL_lex_fakeeof (PL_parser->lex_fakeeof)
52 #define PL_lex_brackstack (PL_parser->lex_brackstack)
53 #define PL_lex_casemods (PL_parser->lex_casemods)
54 #define PL_lex_casestack (PL_parser->lex_casestack)
55 #define PL_lex_dojoin (PL_parser->lex_dojoin)
56 #define PL_lex_formbrack (PL_parser->lex_formbrack)
57 #define PL_lex_inpat (PL_parser->lex_inpat)
58 #define PL_lex_inwhat (PL_parser->lex_inwhat)
59 #define PL_lex_op (PL_parser->lex_op)
60 #define PL_lex_repl (PL_parser->lex_repl)
61 #define PL_lex_starts (PL_parser->lex_starts)
62 #define PL_lex_stuff (PL_parser->lex_stuff)
63 #define PL_multi_start (PL_parser->multi_start)
64 #define PL_multi_open (PL_parser->multi_open)
65 #define PL_multi_close (PL_parser->multi_close)
66 #define PL_preambled (PL_parser->preambled)
67 #define PL_linestr (PL_parser->linestr)
68 #define PL_expect (PL_parser->expect)
69 #define PL_copline (PL_parser->copline)
70 #define PL_bufptr (PL_parser->bufptr)
71 #define PL_oldbufptr (PL_parser->oldbufptr)
72 #define PL_oldoldbufptr (PL_parser->oldoldbufptr)
73 #define PL_linestart (PL_parser->linestart)
74 #define PL_bufend (PL_parser->bufend)
75 #define PL_last_uni (PL_parser->last_uni)
76 #define PL_last_lop (PL_parser->last_lop)
77 #define PL_last_lop_op (PL_parser->last_lop_op)
78 #define PL_lex_state (PL_parser->lex_state)
79 #define PL_rsfp (PL_parser->rsfp)
80 #define PL_rsfp_filters (PL_parser->rsfp_filters)
81 #define PL_in_my (PL_parser->in_my)
82 #define PL_in_my_stash (PL_parser->in_my_stash)
83 #define PL_tokenbuf (PL_parser->tokenbuf)
84 #define PL_multi_end (PL_parser->multi_end)
85 #define PL_error_count (PL_parser->error_count)
86
87 # define PL_nexttoke (PL_parser->nexttoke)
88 # define PL_nexttype (PL_parser->nexttype)
89 # define PL_nextval (PL_parser->nextval)
90
91
92 #define SvEVALED(sv) \
93 (SvTYPE(sv) >= SVt_PVNV \
94 && ((XPVIV*)SvANY(sv))->xiv_u.xivu_eval_seen)
95
96 static const char ident_too_long[] = "Identifier too long";
97 static const char ident_var_zero_multi_digit[] = "Numeric variables with more than one digit may not start with '0'";
98
99 # define NEXTVAL_NEXTTOKE PL_nextval[PL_nexttoke]
100
101 #define XENUMMASK 0x3f
102 #define XFAKEEOF 0x40
103 #define XFAKEBRACK 0x80
104
105 #ifdef USE_UTF8_SCRIPTS
106 # define UTF cBOOL(!IN_BYTES)
107 #else
108 # define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8)))
109 #endif
110
111 /* The maximum number of characters preceding the unrecognized one to display */
112 #define UNRECOGNIZED_PRECEDE_COUNT 10
113
114 /* In variables named $^X, these are the legal values for X.
115 * 1999-02-27 mjd-perl-patch@plover.com */
116 #define isCONTROLVAR(x) (isUPPER(x) || memCHRs("[\\]^_?", (x)))
117
118 /* Non-identifier plugin infix operators are allowed any printing character
119 * except spaces, digits, or identifier chars
120 */
121 #define isPLUGINFIX(c) (c && !isSPACE(c) && !isDIGIT(c) && !isALPHA(c))
122 /* Plugin infix operators may not begin with a quote symbol */
123 #define isPLUGINFIX_FIRST(c) (isPLUGINFIX(c) && c != '"' && c != '\'')
124
125 #define PLUGINFIX_IS_ENABLED UNLIKELY(PL_infix_plugin != &Perl_infix_plugin_standard)
126
127 #define SPACE_OR_TAB(c) isBLANK_A(c)
128
129 #define HEXFP_PEEK(s) \
130 (((s[0] == '.') && \
131 (isXDIGIT(s[1]) || isALPHA_FOLD_EQ(s[1], 'p'))) || \
132 isALPHA_FOLD_EQ(s[0], 'p'))
133
134 /* LEX_* are values for PL_lex_state, the state of the lexer.
135 * They are arranged oddly so that the guard on the switch statement
136 * can get by with a single comparison (if the compiler is smart enough).
137 *
138 * These values refer to the various states within a sublex parse,
139 * i.e. within a double quotish string
140 */
141
142 /* #define LEX_NOTPARSING 11 is done in perl.h. */
143
144 #define LEX_NORMAL 10 /* normal code (ie not within "...") */
145 #define LEX_INTERPNORMAL 9 /* code within a string, eg "$foo[$x+1]" */
146 #define LEX_INTERPCASEMOD 8 /* expecting a \U, \Q or \E etc */
147 #define LEX_INTERPPUSH 7 /* starting a new sublex parse level */
148 #define LEX_INTERPSTART 6 /* expecting the start of a $var */
149
150 /* at end of code, eg "$x" followed by: */
151 #define LEX_INTERPEND 5 /* ... eg not one of [, { or -> */
152 #define LEX_INTERPENDMAYBE 4 /* ... eg one of [, { or -> */
153
154 #define LEX_INTERPCONCAT 3 /* expecting anything, eg at start of
155 string or after \E, $foo, etc */
156 #define LEX_INTERPCONST 2 /* NOT USED */
157 #define LEX_FORMLINE 1 /* expecting a format line */
158
159 /* returned to yyl_try() to request it to retry the parse loop, expected to only
160 be returned directly by yyl_fake_eof(), but functions that call yyl_fake_eof()
161 can also return it.
162
163 yylex (aka Perl_yylex) returns 0 on EOF rather than returning -1,
164 other token values are 258 or higher (see perly.h), so -1 should be
165 a safe value here.
166 */
167 #define YYL_RETRY (-1)
168
169 #ifdef DEBUGGING
170 static const char* const lex_state_names[] = {
171 "KNOWNEXT",
172 "FORMLINE",
173 "INTERPCONST",
174 "INTERPCONCAT",
175 "INTERPENDMAYBE",
176 "INTERPEND",
177 "INTERPSTART",
178 "INTERPPUSH",
179 "INTERPCASEMOD",
180 "INTERPNORMAL",
181 "NORMAL"
182 };
183 #endif
184
185 #include "keywords.h"
186
187 /* CLINE is a macro that ensures PL_copline has a sane value */
188
189 #define CLINE (PL_copline = (CopLINE(PL_curcop) < PL_copline ? CopLINE(PL_curcop) : PL_copline))
190
191 /*
192 * Convenience functions to return different tokens and prime the
193 * lexer for the next token. They all take an argument.
194 *
195 * TOKEN : generic token (used for '(', DOLSHARP, etc)
196 * OPERATOR : generic operator
197 * AOPERATOR : assignment operator
198 * PREBLOCK : beginning the block after an if, while, foreach, ...
199 * PRETERMBLOCK : beginning a non-code-defining {} block (eg, hash ref)
200 * PREREF : *EXPR where EXPR is not a simple identifier
201 * TERM : expression term
202 * POSTDEREF : postfix dereference (->$* ->@[...] etc.)
203 * LOOPX : loop exiting command (goto, last, dump, etc)
204 * FTST : file test operator
205 * FUN0 : zero-argument function
206 * FUN0OP : zero-argument function, with its op created in this file
207 * FUN1 : not used, except for not, which isn't a UNIOP
208 * BOop : bitwise or or xor
209 * BAop : bitwise and
210 * BCop : bitwise complement
211 * SHop : shift operator
212 * PWop : power operator
213 * PMop : pattern-matching operator
214 * Aop : addition-level operator
215 * AopNOASSIGN : addition-level operator that is never part of .=
216 * Mop : multiplication-level operator
217 * ChEop : chaining equality-testing operator
218 * NCEop : non-chaining comparison operator at equality precedence
219 * ChRop : chaining relational operator <= != gt
220 * NCRop : non-chaining relational operator isa
221 *
222 * Also see LOP and lop() below.
223 */
224
225 #ifdef DEBUGGING /* Serve -DT. */
226 # define REPORT(retval) tokereport((I32)retval, &pl_yylval)
227 #else
228 # define REPORT(retval) (retval)
229 #endif
230
231 #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval))
232 #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval))
233 #define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval))
234 #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval))
235 #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval))
236 #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval))
237 #define TERM(retval) return (CLINE, PL_expect = XOPERATOR, PL_bufptr = s, REPORT(retval))
238 #define PHASERBLOCK(f) return (pl_yylval.ival=f, PL_expect = XBLOCK, PL_bufptr = s, REPORT((int)PHASER))
239 #define POSTDEREF(f) return (PL_bufptr = s, S_postderef(aTHX_ REPORT(f),s[1]))
240 #define LOOPX(f) return (PL_bufptr = force_word(s,BAREWORD,TRUE,FALSE), \
241 pl_yylval.ival=f, \
242 PL_expect = PL_nexttoke ? XOPERATOR : XTERM, \
243 REPORT((int)LOOPEX))
244 #define FTST(f) return (pl_yylval.ival=f, PL_expect=XTERMORDORDOR, PL_bufptr=s, REPORT((int)UNIOP))
245 #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0))
246 #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP))
247 #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1))
248 #define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP))
249 #define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP))
250 #define BCop(f) return pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr = s, \
251 REPORT(PERLY_TILDE)
252 #define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP))
253 #define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP))
254 #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP))
255 #define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP))
256 #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP))
257 #define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP))
258 #define ChEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHEQOP))
259 #define NCEop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCEQOP))
260 #define ChRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)CHRELOP))
261 #define NCRop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)NCRELOP))
262
263 /* This bit of chicanery makes a unary function followed by
264 * a parenthesis into a function with one argument, highest precedence.
265 * The UNIDOR macro is for unary functions that can be followed by the //
266 * operator (such as C<shift // 0>).
267 */
268 #define UNI3(f,x,have_x) { \
269 pl_yylval.ival = f; \
270 if (have_x) PL_expect = x; \
271 PL_bufptr = s; \
272 PL_last_uni = PL_oldbufptr; \
273 PL_last_lop_op = (f) < 0 ? -(f) : (f); \
274 if (*s == '(') \
275 return REPORT( (int)FUNC1 ); \
276 s = skipspace(s); \
277 return REPORT( *s=='(' ? (int)FUNC1 : (int)UNIOP ); \
278 }
279 #define UNI(f) UNI3(f,XTERM,1)
280 #define UNIDOR(f) UNI3(f,XTERMORDORDOR,1)
281 #define UNIPROTO(f,optional) { \
282 if (optional) PL_last_uni = PL_oldbufptr; \
283 OPERATOR(f); \
284 }
285
286 #define UNIBRACK(f) UNI3(f,0,0)
287
288 /* return has special case parsing.
289 *
290 * List operators have low precedence. Functions have high precedence.
291 * Every built in, *except return*, if written with () around its arguments, is
292 * parsed as a function. Hence every other list built in:
293 *
294 * $ perl -lwe 'sub foo { join 2,4,6 * 1.5 } print for foo()' # join 2,4,9
295 * 429
296 * $ perl -lwe 'sub foo { join(2,4,6) * 1.5 } print for foo()' # 426 * 1.5
297 * 639
298 * $ perl -lwe 'sub foo { join+(2,4,6) * 1.5 } print for foo()'
299 * Useless use of a constant (2) in void context at -e line 1.
300 * Useless use of a constant (4) in void context at -e line 1.
301 *
302 * $
303 *
304 * empty line output because C<(2, 4, 6) * 1.5> is the comma operator, not a
305 * list. * forces scalar context, 6 * 1.5 is 9, and join(9) is the empty string.
306 *
307 * Whereas return:
308 *
309 * $ perl -lwe 'sub foo { return 2,4,6 * 1.5 } print for foo()'
310 * 2
311 * 4
312 * 9
313 * $ perl -lwe 'sub foo { return(2,4,6) * 1.5 } print for foo()'
314 * Useless use of a constant (2) in void context at -e line 1.
315 * Useless use of a constant (4) in void context at -e line 1.
316 * 9
317 * $ perl -lwe 'sub foo { return+(2,4,6) * 1.5 } print for foo()'
318 * Useless use of a constant (2) in void context at -e line 1.
319 * Useless use of a constant (4) in void context at -e line 1.
320 * 9
321 * $
322 *
323 * and:
324 * $ perl -lwe 'sub foo { return(2,4,6) } print for foo()'
325 * 2
326 * 4
327 * 6
328 *
329 * This last example is what we expect, but it's clearly inconsistent with how
330 * C<return(2,4,6) * 1.5> *ought* to behave, if the rules were consistently
331 * followed.
332 *
333 *
334 * Perl 3 attempted to be consistent:
335 *
336 * The rules are more consistent about where parens are needed and
337 * where they are not. In particular, unary operators and list operators now
338 * behave like functions if they're called like functions.
339 *
340 * However, the behaviour for return was reverted to the "old" parsing with
341 * patches 9-12:
342 *
343 * The construct
344 * return (1,2,3);
345 * did not do what was expected, since return was swallowing the
346 * parens in order to consider itself a function. The solution,
347 * since return never wants any trailing expression such as
348 * return (1,2,3) + 2;
349 * is to simply make return an exception to the paren-makes-a-function
350 * rule, and treat it the way it always was, so that it doesn't
351 * strip the parens.
352 *
353 * To demonstrate the special-case parsing, replace OLDLOP(OP_RETURN); with
354 * LOP(OP_RETURN, XTERM);
355 *
356 * and constructs such as
357 *
358 * return (Internals::V())[2]
359 *
360 * turn into syntax errors
361 */
362
363 #define OLDLOP(f) \
364 do { \
365 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) \
366 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; \
367 pl_yylval.ival = (f); \
368 PL_expect = XTERM; \
369 PL_bufptr = s; \
370 return (int)LSTOP; \
371 } while(0)
372
373 #define COPLINE_INC_WITH_HERELINES \
374 STMT_START { \
375 CopLINE_inc(PL_curcop); \
376 if (PL_parser->herelines) \
377 CopLINE(PL_curcop) += PL_parser->herelines, \
378 PL_parser->herelines = 0; \
379 } STMT_END
380 /* Called after scan_str to update CopLINE(PL_curcop), but only when there
381 * is no sublex_push to follow. */
382 #define COPLINE_SET_FROM_MULTI_END \
383 STMT_START { \
384 CopLINE_set(PL_curcop, PL_multi_end); \
385 if (PL_multi_end != PL_multi_start) \
386 PL_parser->herelines = 0; \
387 } STMT_END
388
389
390 /* A file-local structure for passing around information about subroutines and
391 * related definable words */
392 struct code {
393 SV *sv;
394 CV *cv;
395 GV *gv, **gvp;
396 OP *rv2cv_op;
397 PADOFFSET off;
398 bool lex;
399 };
400
401 static const struct code no_code = { NULL, NULL, NULL, NULL, NULL, 0, FALSE };
402
403 #ifdef DEBUGGING
404
405 /* how to interpret the pl_yylval associated with the token */
406 enum token_type {
407 TOKENTYPE_NONE,
408 TOKENTYPE_IVAL,
409 TOKENTYPE_OPNUM, /* pl_yylval.ival contains an opcode number */
410 TOKENTYPE_PVAL,
411 TOKENTYPE_OPVAL
412 };
413
414 #define DEBUG_TOKEN(Type, Name) \
415 { Name, TOKENTYPE_##Type, #Name }
416
417 static struct debug_tokens {
418 const int token;
419 enum token_type type;
420 const char *name;
421 } const debug_tokens[] =
422 {
423 DEBUG_TOKEN (OPNUM, ADDOP),
424 DEBUG_TOKEN (NONE, ANDAND),
425 DEBUG_TOKEN (NONE, ANDOP),
426 DEBUG_TOKEN (NONE, ARROW),
427 DEBUG_TOKEN (OPNUM, ASSIGNOP),
428 DEBUG_TOKEN (OPNUM, BITANDOP),
429 DEBUG_TOKEN (OPNUM, BITOROP),
430 DEBUG_TOKEN (OPNUM, CHEQOP),
431 DEBUG_TOKEN (OPNUM, CHRELOP),
432 DEBUG_TOKEN (NONE, COLONATTR),
433 DEBUG_TOKEN (NONE, DOLSHARP),
434 DEBUG_TOKEN (NONE, DORDOR),
435 DEBUG_TOKEN (IVAL, DOTDOT),
436 DEBUG_TOKEN (NONE, FORMLBRACK),
437 DEBUG_TOKEN (NONE, FORMRBRACK),
438 DEBUG_TOKEN (OPNUM, FUNC),
439 DEBUG_TOKEN (OPNUM, FUNC0),
440 DEBUG_TOKEN (OPVAL, FUNC0OP),
441 DEBUG_TOKEN (OPVAL, FUNC0SUB),
442 DEBUG_TOKEN (OPNUM, FUNC1),
443 DEBUG_TOKEN (NONE, HASHBRACK),
444 DEBUG_TOKEN (IVAL, KW_CATCH),
445 DEBUG_TOKEN (IVAL, KW_CLASS),
446 DEBUG_TOKEN (IVAL, KW_CONTINUE),
447 DEBUG_TOKEN (IVAL, KW_DEFAULT),
448 DEBUG_TOKEN (IVAL, KW_DO),
449 DEBUG_TOKEN (IVAL, KW_ELSE),
450 DEBUG_TOKEN (IVAL, KW_ELSIF),
451 DEBUG_TOKEN (IVAL, KW_FIELD),
452 DEBUG_TOKEN (IVAL, KW_GIVEN),
453 DEBUG_TOKEN (IVAL, KW_FOR),
454 DEBUG_TOKEN (IVAL, KW_FORMAT),
455 DEBUG_TOKEN (IVAL, KW_IF),
456 DEBUG_TOKEN (IVAL, KW_LOCAL),
457 DEBUG_TOKEN (IVAL, KW_METHOD_anon),
458 DEBUG_TOKEN (IVAL, KW_METHOD_named),
459 DEBUG_TOKEN (IVAL, KW_MY),
460 DEBUG_TOKEN (IVAL, KW_PACKAGE),
461 DEBUG_TOKEN (IVAL, KW_REQUIRE),
462 DEBUG_TOKEN (IVAL, KW_SUB_anon),
463 DEBUG_TOKEN (IVAL, KW_SUB_anon_sig),
464 DEBUG_TOKEN (IVAL, KW_SUB_named),
465 DEBUG_TOKEN (IVAL, KW_SUB_named_sig),
466 DEBUG_TOKEN (IVAL, KW_TRY),
467 DEBUG_TOKEN (IVAL, KW_USE_or_NO),
468 DEBUG_TOKEN (IVAL, KW_UNLESS),
469 DEBUG_TOKEN (IVAL, KW_UNTIL),
470 DEBUG_TOKEN (IVAL, KW_WHEN),
471 DEBUG_TOKEN (IVAL, KW_WHILE),
472 DEBUG_TOKEN (OPVAL, LABEL),
473 DEBUG_TOKEN (OPNUM, LOOPEX),
474 DEBUG_TOKEN (OPNUM, LSTOP),
475 DEBUG_TOKEN (OPVAL, LSTOPSUB),
476 DEBUG_TOKEN (OPNUM, MATCHOP),
477 DEBUG_TOKEN (OPVAL, METHCALL),
478 DEBUG_TOKEN (OPVAL, METHCALL0),
479 DEBUG_TOKEN (OPNUM, MULOP),
480 DEBUG_TOKEN (OPNUM, NCEQOP),
481 DEBUG_TOKEN (OPNUM, NCRELOP),
482 DEBUG_TOKEN (NONE, NOAMP),
483 DEBUG_TOKEN (NONE, NOTOP),
484 DEBUG_TOKEN (IVAL, OROP),
485 DEBUG_TOKEN (NONE, OROR),
486 DEBUG_TOKEN (IVAL, PERLY_AMPERSAND),
487 DEBUG_TOKEN (IVAL, PERLY_BRACE_CLOSE),
488 DEBUG_TOKEN (IVAL, PERLY_BRACE_OPEN),
489 DEBUG_TOKEN (IVAL, PERLY_BRACKET_CLOSE),
490 DEBUG_TOKEN (IVAL, PERLY_BRACKET_OPEN),
491 DEBUG_TOKEN (IVAL, PERLY_COLON),
492 DEBUG_TOKEN (IVAL, PERLY_COMMA),
493 DEBUG_TOKEN (IVAL, PERLY_DOT),
494 DEBUG_TOKEN (IVAL, PERLY_EQUAL_SIGN),
495 DEBUG_TOKEN (IVAL, PERLY_EXCLAMATION_MARK),
496 DEBUG_TOKEN (IVAL, PERLY_MINUS),
497 DEBUG_TOKEN (IVAL, PERLY_PAREN_OPEN),
498 DEBUG_TOKEN (IVAL, PERLY_PERCENT_SIGN),
499 DEBUG_TOKEN (IVAL, PERLY_PLUS),
500 DEBUG_TOKEN (IVAL, PERLY_QUESTION_MARK),
501 DEBUG_TOKEN (IVAL, PERLY_SEMICOLON),
502 DEBUG_TOKEN (IVAL, PERLY_SLASH),
503 DEBUG_TOKEN (IVAL, PERLY_SNAIL),
504 DEBUG_TOKEN (IVAL, PERLY_STAR),
505 DEBUG_TOKEN (IVAL, PERLY_TILDE),
506 DEBUG_TOKEN (OPVAL, PLUGEXPR),
507 DEBUG_TOKEN (OPVAL, PLUGSTMT),
508 DEBUG_TOKEN (PVAL, PLUGIN_ADD_OP),
509 DEBUG_TOKEN (PVAL, PLUGIN_ASSIGN_OP),
510 DEBUG_TOKEN (PVAL, PLUGIN_HIGH_OP),
511 DEBUG_TOKEN (PVAL, PLUGIN_LOGICAL_AND_OP),
512 DEBUG_TOKEN (PVAL, PLUGIN_LOGICAL_OR_OP),
513 DEBUG_TOKEN (PVAL, PLUGIN_LOGICAL_AND_LOW_OP),
514 DEBUG_TOKEN (PVAL, PLUGIN_LOGICAL_OR_LOW_OP),
515 DEBUG_TOKEN (PVAL, PLUGIN_LOW_OP),
516 DEBUG_TOKEN (PVAL, PLUGIN_MUL_OP),
517 DEBUG_TOKEN (PVAL, PLUGIN_POW_OP),
518 DEBUG_TOKEN (PVAL, PLUGIN_REL_OP),
519 DEBUG_TOKEN (OPVAL, PMFUNC),
520 DEBUG_TOKEN (NONE, POSTJOIN),
521 DEBUG_TOKEN (NONE, POSTDEC),
522 DEBUG_TOKEN (NONE, POSTINC),
523 DEBUG_TOKEN (OPNUM, POWOP),
524 DEBUG_TOKEN (NONE, PREDEC),
525 DEBUG_TOKEN (NONE, PREINC),
526 DEBUG_TOKEN (OPVAL, PRIVATEREF),
527 DEBUG_TOKEN (OPVAL, QWLIST),
528 DEBUG_TOKEN (NONE, REFGEN),
529 DEBUG_TOKEN (OPNUM, SHIFTOP),
530 DEBUG_TOKEN (NONE, SUBLEXEND),
531 DEBUG_TOKEN (NONE, SUBLEXSTART),
532 DEBUG_TOKEN (OPVAL, THING),
533 DEBUG_TOKEN (NONE, UMINUS),
534 DEBUG_TOKEN (OPNUM, UNIOP),
535 DEBUG_TOKEN (OPVAL, UNIOPSUB),
536 DEBUG_TOKEN (OPVAL, BAREWORD),
537 DEBUG_TOKEN (IVAL, YADAYADA),
538 { 0, TOKENTYPE_NONE, NULL }
539 };
540
541 #undef DEBUG_TOKEN
542
543 /* dump the returned token in rv, plus any optional arg in pl_yylval */
544
545 STATIC int
S_tokereport(pTHX_ I32 rv,const YYSTYPE * lvalp)546 S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp)
547 {
548 PERL_ARGS_ASSERT_TOKEREPORT;
549
550 if (DEBUG_T_TEST) {
551 const char *name = NULL;
552 enum token_type type = TOKENTYPE_NONE;
553 const struct debug_tokens *p;
554 SV* const report = newSVpvs("<== ");
555
556 for (p = debug_tokens; p->token; p++) {
557 if (p->token == (int)rv) {
558 name = p->name;
559 type = p->type;
560 break;
561 }
562 }
563 if (name)
564 Perl_sv_catpv(aTHX_ report, name);
565 else if (isGRAPH(rv))
566 {
567 Perl_sv_catpvf(aTHX_ report, "'%c'", (char)rv);
568 if ((char)rv == 'p')
569 sv_catpvs(report, " (pending identifier)");
570 }
571 else if (!rv)
572 sv_catpvs(report, "EOF");
573 else
574 Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv);
575 switch (type) {
576 case TOKENTYPE_NONE:
577 break;
578 case TOKENTYPE_IVAL:
579 Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival);
580 break;
581 case TOKENTYPE_OPNUM:
582 Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)",
583 PL_op_name[lvalp->ival]);
584 break;
585 case TOKENTYPE_PVAL:
586 Perl_sv_catpvf(aTHX_ report, "(pval=%p)", lvalp->pval);
587 break;
588 case TOKENTYPE_OPVAL:
589 if (lvalp->opval) {
590 Perl_sv_catpvf(aTHX_ report, "(opval=op_%s)",
591 PL_op_name[lvalp->opval->op_type]);
592 if (lvalp->opval->op_type == OP_CONST) {
593 Perl_sv_catpvf(aTHX_ report, " %s",
594 SvPEEK(cSVOPx_sv(lvalp->opval)));
595 }
596
597 }
598 else
599 sv_catpvs(report, "(opval=null)");
600 break;
601 }
602 PerlIO_printf(Perl_debug_log, "### %s\n\n", SvPV_nolen_const(report));
603 };
604 return (int)rv;
605 }
606
607
608 /* print the buffer with suitable escapes */
609
610 STATIC void
S_printbuf(pTHX_ const char * const fmt,const char * const s)611 S_printbuf(pTHX_ const char *const fmt, const char *const s)
612 {
613 SV* const tmp = newSVpvs("");
614
615 PERL_ARGS_ASSERT_PRINTBUF;
616
617 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral); /* fmt checked by caller */
618 PerlIO_printf(Perl_debug_log, fmt, pv_display(tmp, s, strlen(s), 0, 60));
619 GCC_DIAG_RESTORE_STMT;
620 SvREFCNT_dec(tmp);
621 }
622
623 #endif
624
625 /*
626 * S_ao
627 *
628 * This subroutine looks for an '=' next to the operator that has just been
629 * parsed and turns it into an ASSIGNOP if it finds one.
630 */
631
632 STATIC int
S_ao(pTHX_ int toketype)633 S_ao(pTHX_ int toketype)
634 {
635 if (*PL_bufptr == '=') {
636 PL_bufptr++;
637
638 switch (toketype) {
639 case ANDAND: pl_yylval.ival = OP_ANDASSIGN; break;
640 case OROR: pl_yylval.ival = OP_ORASSIGN; break;
641 case DORDOR: pl_yylval.ival = OP_DORASSIGN; break;
642 }
643
644 toketype = ASSIGNOP;
645 }
646 return REPORT(toketype);
647 }
648
649 /*
650 * S_no_op
651 * When Perl expects an operator and finds something else, no_op
652 * prints the warning. It always prints "<something> found where
653 * operator expected. It prints "Missing semicolon on previous line?"
654 * if the surprise occurs at the start of the line. "do you need to
655 * predeclare ..." is printed out for code like "sub bar; foo bar $x"
656 * where the compiler doesn't know if foo is a method call or a function.
657 * It prints "Missing operator before end of line" if there's nothing
658 * after the missing operator, or "... before <...>" if there is something
659 * after the missing operator.
660 *
661 * PL_bufptr is expected to point to the start of the thing that was found,
662 * and s after the next token or partial token.
663 */
664
665 STATIC void
S_no_op(pTHX_ const char * const what,char * s)666 S_no_op(pTHX_ const char *const what, char *s)
667 {
668 char * const oldbp = PL_bufptr;
669 const bool is_first = (PL_oldbufptr == PL_linestart);
670 SV *message = sv_2mortal( newSVpvf(
671 PERL_DIAG_WARN_SYNTAX("%s found where operator expected"),
672 what
673 ) );
674
675 PERL_ARGS_ASSERT_NO_OP;
676
677 if (!s)
678 s = oldbp;
679 else
680 PL_bufptr = s;
681
682 if (ckWARN_d(WARN_SYNTAX)) {
683 bool has_more = FALSE;
684 if (is_first) {
685 has_more = TRUE;
686 sv_catpvs(message,
687 " (Missing semicolon on previous line?)");
688 }
689 else if (PL_oldoldbufptr) {
690 /* yyerror (via yywarn) would do this itself, so we should too */
691 const char *t;
692 for (t = PL_oldoldbufptr;
693 t < PL_bufptr && isSPACE(*t);
694 t += UTF ? UTF8SKIP(t) : 1)
695 {
696 NOOP;
697 }
698 /* see if we can identify the cause of the warning */
699 if (isIDFIRST_lazy_if_safe(t,PL_bufend,UTF))
700 {
701 const char *t_start= t;
702 for ( ;
703 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
704 t += UTF ? UTF8SKIP(t) : 1)
705 {
706 NOOP;
707 }
708 if (t < PL_bufptr && isSPACE(*t)) {
709 has_more = TRUE;
710 sv_catpvf( message,
711 " (Do you need to predeclare \"%" UTF8f "\"?)",
712 UTF8fARG(UTF, t - t_start, t_start));
713 }
714 }
715 }
716 if (!has_more) {
717 const char *t= oldbp;
718 assert(s >= oldbp);
719 while (t < s && isSPACE(*t)) {
720 t += UTF ? UTF8SKIP(t) : 1;
721 }
722
723 sv_catpvf(message,
724 " (Missing operator before \"%" UTF8f "\"?)",
725 UTF8fARG(UTF, s - t, t));
726 }
727 }
728 yywarn(SvPV_nolen(message), UTF ? SVf_UTF8 : 0);
729 PL_bufptr = oldbp;
730 }
731
732 /*
733 * S_missingterm
734 * Complain about missing quote/regexp/heredoc terminator.
735 * If it's called with NULL then it cauterizes the line buffer.
736 * If we're in a delimited string and the delimiter is a control
737 * character, it's reformatted into a two-char sequence like ^C.
738 * This is fatal.
739 */
740
741 STATIC void
S_missingterm(pTHX_ char * s,STRLEN len)742 S_missingterm(pTHX_ char *s, STRLEN len)
743 {
744 char tmpbuf[UTF8_MAXBYTES + 1];
745 char q;
746 bool uni = FALSE;
747 if (s) {
748 char * const nl = (char *) my_memrchr(s, '\n', len);
749 if (nl) {
750 *nl = '\0';
751 len = nl - s;
752 }
753 uni = UTF;
754 }
755 else if (PL_multi_close < 32) {
756 *tmpbuf = '^';
757 tmpbuf[1] = (char)toCTRL(PL_multi_close);
758 tmpbuf[2] = '\0';
759 s = tmpbuf;
760 len = 2;
761 }
762 else {
763 if (! UTF && LIKELY(PL_multi_close < 256)) {
764 *tmpbuf = (char)PL_multi_close;
765 tmpbuf[1] = '\0';
766 len = 1;
767 }
768 else {
769 char *end = (char *)uvchr_to_utf8((U8 *)tmpbuf, PL_multi_close);
770 *end = '\0';
771 len = end - tmpbuf;
772 uni = TRUE;
773 }
774 s = tmpbuf;
775 }
776 q = memchr(s, '"', len) ? '\'' : '"';
777 Perl_croak(aTHX_ "Can't find string terminator %c%" UTF8f "%c"
778 " anywhere before EOF", q, UTF8fARG(uni, len, s), q);
779 }
780
781 #include "feature.h"
782
783 /*
784 * experimental text filters for win32 carriage-returns, utf16-to-utf8 and
785 * utf16-to-utf8-reversed.
786 */
787
788 #ifdef PERL_CR_FILTER
789 static void
strip_return(SV * sv)790 strip_return(SV *sv)
791 {
792 const char *s = SvPVX_const(sv);
793 const char * const e = s + SvCUR(sv);
794
795 PERL_ARGS_ASSERT_STRIP_RETURN;
796
797 /* outer loop optimized to do nothing if there are no CR-LFs */
798 while (s < e) {
799 if (*s++ == '\r' && *s == '\n') {
800 /* hit a CR-LF, need to copy the rest */
801 char *d = s - 1;
802 *d++ = *s++;
803 while (s < e) {
804 if (*s == '\r' && s[1] == '\n')
805 s++;
806 *d++ = *s++;
807 }
808 SvCUR(sv) -= s - d;
809 return;
810 }
811 }
812 }
813
814 STATIC I32
S_cr_textfilter(pTHX_ int idx,SV * sv,int maxlen)815 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen)
816 {
817 const I32 count = FILTER_READ(idx+1, sv, maxlen);
818 if (count > 0 && !maxlen)
819 strip_return(sv);
820 return count;
821 }
822 #endif
823
824 /*
825 =for apidoc lex_start
826
827 Creates and initialises a new lexer/parser state object, supplying
828 a context in which to lex and parse from a new source of Perl code.
829 A pointer to the new state object is placed in L</PL_parser>. An entry
830 is made on the save stack so that upon unwinding, the new state object
831 will be destroyed and the former value of L</PL_parser> will be restored.
832 Nothing else need be done to clean up the parsing context.
833
834 The code to be parsed comes from C<line> and C<rsfp>. C<line>, if
835 non-null, provides a string (in SV form) containing code to be parsed.
836 A copy of the string is made, so subsequent modification of C<line>
837 does not affect parsing. C<rsfp>, if non-null, provides an input stream
838 from which code will be read to be parsed. If both are non-null, the
839 code in C<line> comes first and must consist of complete lines of input,
840 and C<rsfp> supplies the remainder of the source.
841
842 The C<flags> parameter is reserved for future use. Currently it is only
843 used by perl internally, so extensions should always pass zero.
844
845 =cut
846 */
847
848 /* LEX_START_SAME_FILTER indicates that this is not a new file, so it
849 can share filters with the current parser.
850 LEX_START_DONT_CLOSE indicates that the file handle wasn't opened by the
851 caller, hence isn't owned by the parser, so shouldn't be closed on parser
852 destruction. This is used to handle the case of defaulting to reading the
853 script from the standard input because no filename was given on the command
854 line (without getting confused by situation where STDIN has been closed, so
855 the script handle is opened on fd 0) */
856
857 void
Perl_lex_start(pTHX_ SV * line,PerlIO * rsfp,U32 flags)858 Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags)
859 {
860 const char *s = NULL;
861 yy_parser *parser, *oparser;
862
863 if (flags && flags & ~LEX_START_FLAGS)
864 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start");
865
866 /* create and initialise a parser */
867
868 Newxz(parser, 1, yy_parser);
869 parser->old_parser = oparser = PL_parser;
870 PL_parser = parser;
871
872 parser->stack = NULL;
873 parser->stack_max1 = NULL;
874 parser->ps = NULL;
875
876 /* on scope exit, free this parser and restore any outer one */
877 SAVEPARSER(parser);
878 parser->saved_curcop = PL_curcop;
879
880 /* initialise lexer state */
881
882 parser->nexttoke = 0;
883 parser->error_count = oparser ? oparser->error_count : 0;
884 parser->copline = parser->preambling = NOLINE;
885 parser->lex_state = LEX_NORMAL;
886 parser->expect = XSTATE;
887 parser->rsfp = rsfp;
888 parser->recheck_utf8_validity = TRUE;
889 parser->rsfp_filters =
890 !(flags & LEX_START_SAME_FILTER) || !oparser
891 ? NULL
892 : MUTABLE_AV(SvREFCNT_inc(
893 oparser->rsfp_filters
894 ? oparser->rsfp_filters
895 : (oparser->rsfp_filters = newAV())
896 ));
897
898 Newx(parser->lex_brackstack, 120, char);
899 Newx(parser->lex_casestack, 12, char);
900 *parser->lex_casestack = '\0';
901 Newxz(parser->lex_shared, 1, LEXSHARED);
902
903 if (line) {
904 Size_t len;
905 const U8* first_bad_char_loc;
906
907 s = SvPV_const(line, len);
908
909 if ( SvUTF8(line)
910 && UNLIKELY(! is_utf8_string_loc((U8 *) s,
911 SvCUR(line),
912 &first_bad_char_loc)))
913 {
914 _force_out_malformed_utf8_message(first_bad_char_loc,
915 (U8 *) s + SvCUR(line),
916 0,
917 1 /* 1 means die */ );
918 NOT_REACHED; /* NOTREACHED */
919 }
920
921 parser->linestr = flags & LEX_START_COPIED
922 ? SvREFCNT_inc_simple_NN(line)
923 : newSVpvn_flags(s, len, SvUTF8(line));
924 if (!rsfp)
925 sv_catpvs(parser->linestr, "\n;");
926 } else {
927 parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2);
928 }
929
930 parser->oldoldbufptr =
931 parser->oldbufptr =
932 parser->bufptr =
933 parser->linestart = SvPVX(parser->linestr);
934 parser->bufend = parser->bufptr + SvCUR(parser->linestr);
935 parser->last_lop = parser->last_uni = NULL;
936
937 STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
938 |LEX_DONT_CLOSE_RSFP));
939 parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES
940 |LEX_DONT_CLOSE_RSFP));
941
942 parser->in_pod = parser->filtered = 0;
943 }
944
945
946 /* delete a parser object */
947
948 void
Perl_parser_free(pTHX_ const yy_parser * parser)949 Perl_parser_free(pTHX_ const yy_parser *parser)
950 {
951 PERL_ARGS_ASSERT_PARSER_FREE;
952
953 PL_curcop = parser->saved_curcop;
954 SvREFCNT_dec(parser->linestr);
955
956 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
957 PerlIO_clearerr(parser->rsfp);
958 else if (parser->rsfp && (!parser->old_parser
959 || (parser->old_parser && parser->rsfp != parser->old_parser->rsfp)))
960 PerlIO_close(parser->rsfp);
961 SvREFCNT_dec(parser->rsfp_filters);
962 SvREFCNT_dec(parser->lex_stuff);
963 SvREFCNT_dec(parser->lex_sub_repl);
964
965 Safefree(parser->lex_brackstack);
966 Safefree(parser->lex_casestack);
967 Safefree(parser->lex_shared);
968 PL_parser = parser->old_parser;
969 Safefree(parser);
970 }
971
972 void
Perl_parser_free_nexttoke_ops(pTHX_ yy_parser * parser,OPSLAB * slab)973 Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab)
974 {
975 I32 nexttoke = parser->nexttoke;
976 PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS;
977 while (nexttoke--) {
978 if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff)
979 && parser->nextval[nexttoke].opval
980 && parser->nextval[nexttoke].opval->op_slabbed
981 && OpSLAB(parser->nextval[nexttoke].opval) == slab) {
982 op_free(parser->nextval[nexttoke].opval);
983 parser->nextval[nexttoke].opval = NULL;
984 }
985 }
986 }
987
988
989 /*
990 =for apidoc AmnxUN|SV *|PL_parser-E<gt>linestr
991
992 Buffer scalar containing the chunk currently under consideration of the
993 text currently being lexed. This is always a plain string scalar (for
994 which C<SvPOK> is true). It is not intended to be used as a scalar by
995 normal scalar means; instead refer to the buffer directly by the pointer
996 variables described below.
997
998 The lexer maintains various C<char*> pointers to things in the
999 C<PL_parser-E<gt>linestr> buffer. If C<PL_parser-E<gt>linestr> is ever
1000 reallocated, all of these pointers must be updated. Don't attempt to
1001 do this manually, but rather use L</lex_grow_linestr> if you need to
1002 reallocate the buffer.
1003
1004 The content of the text chunk in the buffer is commonly exactly one
1005 complete line of input, up to and including a newline terminator,
1006 but there are situations where it is otherwise. The octets of the
1007 buffer may be intended to be interpreted as either UTF-8 or Latin-1.
1008 The function L</lex_bufutf8> tells you which. Do not use the C<SvUTF8>
1009 flag on this scalar, which may disagree with it.
1010
1011 For direct examination of the buffer, the variable
1012 L</PL_parser-E<gt>bufend> points to the end of the buffer. The current
1013 lexing position is pointed to by L</PL_parser-E<gt>bufptr>. Direct use
1014 of these pointers is usually preferable to examination of the scalar
1015 through normal scalar means.
1016
1017 =for apidoc AmnxUN|char *|PL_parser-E<gt>bufend
1018
1019 Direct pointer to the end of the chunk of text currently being lexed, the
1020 end of the lexer buffer. This is equal to C<SvPVX(PL_parser-E<gt>linestr)
1021 + SvCUR(PL_parser-E<gt>linestr)>. A C<NUL> character (zero octet) is
1022 always located at the end of the buffer, and does not count as part of
1023 the buffer's contents.
1024
1025 =for apidoc AmnxUN|char *|PL_parser-E<gt>bufptr
1026
1027 Points to the current position of lexing inside the lexer buffer.
1028 Characters around this point may be freely examined, within
1029 the range delimited by C<SvPVX(L</PL_parser-E<gt>linestr>)> and
1030 L</PL_parser-E<gt>bufend>. The octets of the buffer may be intended to be
1031 interpreted as either UTF-8 or Latin-1, as indicated by L</lex_bufutf8>.
1032
1033 Lexing code (whether in the Perl core or not) moves this pointer past
1034 the characters that it consumes. It is also expected to perform some
1035 bookkeeping whenever a newline character is consumed. This movement
1036 can be more conveniently performed by the function L</lex_read_to>,
1037 which handles newlines appropriately.
1038
1039 Interpretation of the buffer's octets can be abstracted out by
1040 using the slightly higher-level functions L</lex_peek_unichar> and
1041 L</lex_read_unichar>.
1042
1043 =for apidoc AmnxUN|char *|PL_parser-E<gt>linestart
1044
1045 Points to the start of the current line inside the lexer buffer.
1046 This is useful for indicating at which column an error occurred, and
1047 not much else. This must be updated by any lexing code that consumes
1048 a newline; the function L</lex_read_to> handles this detail.
1049
1050 =cut
1051 */
1052
1053 /*
1054 =for apidoc lex_bufutf8
1055
1056 Indicates whether the octets in the lexer buffer
1057 (L</PL_parser-E<gt>linestr>) should be interpreted as the UTF-8 encoding
1058 of Unicode characters. If not, they should be interpreted as Latin-1
1059 characters. This is analogous to the C<SvUTF8> flag for scalars.
1060
1061 In UTF-8 mode, it is not guaranteed that the lexer buffer actually
1062 contains valid UTF-8. Lexing code must be robust in the face of invalid
1063 encoding.
1064
1065 The actual C<SvUTF8> flag of the L</PL_parser-E<gt>linestr> scalar
1066 is significant, but not the whole story regarding the input character
1067 encoding. Normally, when a file is being read, the scalar contains octets
1068 and its C<SvUTF8> flag is off, but the octets should be interpreted as
1069 UTF-8 if the C<use utf8> pragma is in effect. During a string eval,
1070 however, the scalar may have the C<SvUTF8> flag on, and in this case its
1071 octets should be interpreted as UTF-8 unless the C<use bytes> pragma
1072 is in effect. This logic may change in the future; use this function
1073 instead of implementing the logic yourself.
1074
1075 =cut
1076 */
1077
1078 bool
Perl_lex_bufutf8(pTHX)1079 Perl_lex_bufutf8(pTHX)
1080 {
1081 return UTF;
1082 }
1083
1084 /*
1085 =for apidoc lex_grow_linestr
1086
1087 Reallocates the lexer buffer (L</PL_parser-E<gt>linestr>) to accommodate
1088 at least C<len> octets (including terminating C<NUL>). Returns a
1089 pointer to the reallocated buffer. This is necessary before making
1090 any direct modification of the buffer that would increase its length.
1091 L</lex_stuff_pvn> provides a more convenient way to insert text into
1092 the buffer.
1093
1094 Do not use C<SvGROW> or C<sv_grow> directly on C<PL_parser-E<gt>linestr>;
1095 this function updates all of the lexer's variables that point directly
1096 into the buffer.
1097
1098 =cut
1099 */
1100
1101 char *
Perl_lex_grow_linestr(pTHX_ STRLEN len)1102 Perl_lex_grow_linestr(pTHX_ STRLEN len)
1103 {
1104 SV *linestr;
1105 char *buf;
1106 STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1107 STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos;
1108 bool current;
1109
1110 linestr = PL_parser->linestr;
1111 buf = SvPVX(linestr);
1112 if (len <= SvLEN(linestr))
1113 return buf;
1114
1115 /* Is the lex_shared linestr SV the same as the current linestr SV?
1116 * Only in this case does re_eval_start need adjusting, since it
1117 * points within lex_shared->ls_linestr's buffer */
1118 current = ( !PL_parser->lex_shared->ls_linestr
1119 || linestr == PL_parser->lex_shared->ls_linestr);
1120
1121 bufend_pos = PL_parser->bufend - buf;
1122 bufptr_pos = PL_parser->bufptr - buf;
1123 oldbufptr_pos = PL_parser->oldbufptr - buf;
1124 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1125 linestart_pos = PL_parser->linestart - buf;
1126 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1127 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1128 re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ?
1129 PL_parser->lex_shared->re_eval_start - buf : 0;
1130
1131 buf = sv_grow(linestr, len);
1132
1133 PL_parser->bufend = buf + bufend_pos;
1134 PL_parser->bufptr = buf + bufptr_pos;
1135 PL_parser->oldbufptr = buf + oldbufptr_pos;
1136 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1137 PL_parser->linestart = buf + linestart_pos;
1138 if (PL_parser->last_uni)
1139 PL_parser->last_uni = buf + last_uni_pos;
1140 if (PL_parser->last_lop)
1141 PL_parser->last_lop = buf + last_lop_pos;
1142 if (current && PL_parser->lex_shared->re_eval_start)
1143 PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos;
1144 return buf;
1145 }
1146
1147 /*
1148 =for apidoc lex_stuff_pvn
1149
1150 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1151 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1152 reallocating the buffer if necessary. This means that lexing code that
1153 runs later will see the characters as if they had appeared in the input.
1154 It is not recommended to do this as part of normal parsing, and most
1155 uses of this facility run the risk of the inserted characters being
1156 interpreted in an unintended manner.
1157
1158 The string to be inserted is represented by C<len> octets starting
1159 at C<pv>. These octets are interpreted as either UTF-8 or Latin-1,
1160 according to whether the C<LEX_STUFF_UTF8> flag is set in C<flags>.
1161 The characters are recoded for the lexer buffer, according to how the
1162 buffer is currently being interpreted (L</lex_bufutf8>). If a string
1163 to be inserted is available as a Perl scalar, the L</lex_stuff_sv>
1164 function is more convenient.
1165
1166 =for apidoc Amnh||LEX_STUFF_UTF8
1167
1168 =cut
1169 */
1170
1171 void
Perl_lex_stuff_pvn(pTHX_ const char * pv,STRLEN len,U32 flags)1172 Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags)
1173 {
1174 char *bufptr;
1175 PERL_ARGS_ASSERT_LEX_STUFF_PVN;
1176 if (flags & ~(LEX_STUFF_UTF8))
1177 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_pvn");
1178 if (UTF) {
1179 if (flags & LEX_STUFF_UTF8) {
1180 goto plain_copy;
1181 } else {
1182 STRLEN highhalf = variant_under_utf8_count((U8 *) pv,
1183 (U8 *) pv + len);
1184 const char *p, *e = pv+len;;
1185 if (!highhalf)
1186 goto plain_copy;
1187 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
1188 bufptr = PL_parser->bufptr;
1189 Move(bufptr, bufptr+len+highhalf, PL_parser->bufend+1-bufptr, char);
1190 SvCUR_set(PL_parser->linestr,
1191 SvCUR(PL_parser->linestr) + len+highhalf);
1192 PL_parser->bufend += len+highhalf;
1193 for (p = pv; p != e; p++) {
1194 append_utf8_from_native_byte(*p, (U8 **) &bufptr);
1195 }
1196 }
1197 } else {
1198 if (flags & LEX_STUFF_UTF8) {
1199 STRLEN highhalf = 0;
1200 const char *p, *e = pv+len;
1201 for (p = pv; p != e; p++) {
1202 U8 c = (U8)*p;
1203 if (UTF8_IS_ABOVE_LATIN1(c)) {
1204 Perl_croak(aTHX_ "Lexing code attempted to stuff "
1205 "non-Latin-1 character into Latin-1 input");
1206 } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
1207 p++;
1208 highhalf++;
1209 } else assert(UTF8_IS_INVARIANT(c));
1210 }
1211 if (!highhalf)
1212 goto plain_copy;
1213 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len-highhalf);
1214 bufptr = PL_parser->bufptr;
1215 Move(bufptr, bufptr+len-highhalf, PL_parser->bufend+1-bufptr, char);
1216 SvCUR_set(PL_parser->linestr,
1217 SvCUR(PL_parser->linestr) + len-highhalf);
1218 PL_parser->bufend += len-highhalf;
1219 p = pv;
1220 while (p < e) {
1221 if (UTF8_IS_INVARIANT(*p)) {
1222 *bufptr++ = *p;
1223 p++;
1224 }
1225 else {
1226 assert(p < e -1 );
1227 *bufptr++ = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
1228 p += 2;
1229 }
1230 }
1231 } else {
1232 plain_copy:
1233 lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
1234 bufptr = PL_parser->bufptr;
1235 Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
1236 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len);
1237 PL_parser->bufend += len;
1238 Copy(pv, bufptr, len, char);
1239 }
1240 }
1241 }
1242
1243 /*
1244 =for apidoc lex_stuff_pv
1245
1246 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1247 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1248 reallocating the buffer if necessary. This means that lexing code that
1249 runs later will see the characters as if they had appeared in the input.
1250 It is not recommended to do this as part of normal parsing, and most
1251 uses of this facility run the risk of the inserted characters being
1252 interpreted in an unintended manner.
1253
1254 The string to be inserted is represented by octets starting at C<pv>
1255 and continuing to the first nul. These octets are interpreted as either
1256 UTF-8 or Latin-1, according to whether the C<LEX_STUFF_UTF8> flag is set
1257 in C<flags>. The characters are recoded for the lexer buffer, according
1258 to how the buffer is currently being interpreted (L</lex_bufutf8>).
1259 If it is not convenient to nul-terminate a string to be inserted, the
1260 L</lex_stuff_pvn> function is more appropriate.
1261
1262 =cut
1263 */
1264
1265 void
Perl_lex_stuff_pv(pTHX_ const char * pv,U32 flags)1266 Perl_lex_stuff_pv(pTHX_ const char *pv, U32 flags)
1267 {
1268 PERL_ARGS_ASSERT_LEX_STUFF_PV;
1269 lex_stuff_pvn(pv, strlen(pv), flags);
1270 }
1271
1272 /*
1273 =for apidoc lex_stuff_sv
1274
1275 Insert characters into the lexer buffer (L</PL_parser-E<gt>linestr>),
1276 immediately after the current lexing point (L</PL_parser-E<gt>bufptr>),
1277 reallocating the buffer if necessary. This means that lexing code that
1278 runs later will see the characters as if they had appeared in the input.
1279 It is not recommended to do this as part of normal parsing, and most
1280 uses of this facility run the risk of the inserted characters being
1281 interpreted in an unintended manner.
1282
1283 The string to be inserted is the string value of C<sv>. The characters
1284 are recoded for the lexer buffer, according to how the buffer is currently
1285 being interpreted (L</lex_bufutf8>). If a string to be inserted is
1286 not already a Perl scalar, the L</lex_stuff_pvn> function avoids the
1287 need to construct a scalar.
1288
1289 =cut
1290 */
1291
1292 void
Perl_lex_stuff_sv(pTHX_ SV * sv,U32 flags)1293 Perl_lex_stuff_sv(pTHX_ SV *sv, U32 flags)
1294 {
1295 char *pv;
1296 STRLEN len;
1297 PERL_ARGS_ASSERT_LEX_STUFF_SV;
1298 if (flags)
1299 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_stuff_sv");
1300 pv = SvPV(sv, len);
1301 lex_stuff_pvn(pv, len, flags | (SvUTF8(sv) ? LEX_STUFF_UTF8 : 0));
1302 }
1303
1304 /*
1305 =for apidoc lex_unstuff
1306
1307 Discards text about to be lexed, from L</PL_parser-E<gt>bufptr> up to
1308 C<ptr>. Text following C<ptr> will be moved, and the buffer shortened.
1309 This hides the discarded text from any lexing code that runs later,
1310 as if the text had never appeared.
1311
1312 This is not the normal way to consume lexed text. For that, use
1313 L</lex_read_to>.
1314
1315 =cut
1316 */
1317
1318 void
Perl_lex_unstuff(pTHX_ char * ptr)1319 Perl_lex_unstuff(pTHX_ char *ptr)
1320 {
1321 char *buf, *bufend;
1322 STRLEN unstuff_len;
1323 PERL_ARGS_ASSERT_LEX_UNSTUFF;
1324 buf = PL_parser->bufptr;
1325 if (ptr < buf)
1326 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1327 if (ptr == buf)
1328 return;
1329 bufend = PL_parser->bufend;
1330 if (ptr > bufend)
1331 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_unstuff");
1332 unstuff_len = ptr - buf;
1333 Move(ptr, buf, bufend+1-ptr, char);
1334 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - unstuff_len);
1335 PL_parser->bufend = bufend - unstuff_len;
1336 }
1337
1338 /*
1339 =for apidoc lex_read_to
1340
1341 Consume text in the lexer buffer, from L</PL_parser-E<gt>bufptr> up
1342 to C<ptr>. This advances L</PL_parser-E<gt>bufptr> to match C<ptr>,
1343 performing the correct bookkeeping whenever a newline character is passed.
1344 This is the normal way to consume lexed text.
1345
1346 Interpretation of the buffer's octets can be abstracted out by
1347 using the slightly higher-level functions L</lex_peek_unichar> and
1348 L</lex_read_unichar>.
1349
1350 =cut
1351 */
1352
1353 void
Perl_lex_read_to(pTHX_ char * ptr)1354 Perl_lex_read_to(pTHX_ char *ptr)
1355 {
1356 char *s;
1357 PERL_ARGS_ASSERT_LEX_READ_TO;
1358 s = PL_parser->bufptr;
1359 if (ptr < s || ptr > PL_parser->bufend)
1360 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_to");
1361 for (; s != ptr; s++)
1362 if (*s == '\n') {
1363 COPLINE_INC_WITH_HERELINES;
1364 PL_parser->linestart = s+1;
1365 }
1366 PL_parser->bufptr = ptr;
1367 }
1368
1369 /*
1370 =for apidoc lex_discard_to
1371
1372 Discards the first part of the L</PL_parser-E<gt>linestr> buffer,
1373 up to C<ptr>. The remaining content of the buffer will be moved, and
1374 all pointers into the buffer updated appropriately. C<ptr> must not
1375 be later in the buffer than the position of L</PL_parser-E<gt>bufptr>:
1376 it is not permitted to discard text that has yet to be lexed.
1377
1378 Normally it is not necessarily to do this directly, because it suffices to
1379 use the implicit discarding behaviour of L</lex_next_chunk> and things
1380 based on it. However, if a token stretches across multiple lines,
1381 and the lexing code has kept multiple lines of text in the buffer for
1382 that purpose, then after completion of the token it would be wise to
1383 explicitly discard the now-unneeded earlier lines, to avoid future
1384 multi-line tokens growing the buffer without bound.
1385
1386 =cut
1387 */
1388
1389 void
Perl_lex_discard_to(pTHX_ char * ptr)1390 Perl_lex_discard_to(pTHX_ char *ptr)
1391 {
1392 char *buf;
1393 STRLEN discard_len;
1394 PERL_ARGS_ASSERT_LEX_DISCARD_TO;
1395 buf = SvPVX(PL_parser->linestr);
1396 if (ptr < buf)
1397 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1398 if (ptr == buf)
1399 return;
1400 if (ptr > PL_parser->bufptr)
1401 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_discard_to");
1402 discard_len = ptr - buf;
1403 if (PL_parser->oldbufptr < ptr)
1404 PL_parser->oldbufptr = ptr;
1405 if (PL_parser->oldoldbufptr < ptr)
1406 PL_parser->oldoldbufptr = ptr;
1407 if (PL_parser->last_uni && PL_parser->last_uni < ptr)
1408 PL_parser->last_uni = NULL;
1409 if (PL_parser->last_lop && PL_parser->last_lop < ptr)
1410 PL_parser->last_lop = NULL;
1411 Move(ptr, buf, PL_parser->bufend+1-ptr, char);
1412 SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) - discard_len);
1413 PL_parser->bufend -= discard_len;
1414 PL_parser->bufptr -= discard_len;
1415 PL_parser->oldbufptr -= discard_len;
1416 PL_parser->oldoldbufptr -= discard_len;
1417 if (PL_parser->last_uni)
1418 PL_parser->last_uni -= discard_len;
1419 if (PL_parser->last_lop)
1420 PL_parser->last_lop -= discard_len;
1421 }
1422
1423 void
Perl_notify_parser_that_changed_to_utf8(pTHX)1424 Perl_notify_parser_that_changed_to_utf8(pTHX)
1425 {
1426 /* Called when $^H is changed to indicate that HINT_UTF8 has changed from
1427 * off to on. At compile time, this has the effect of entering a 'use
1428 * utf8' section. This means that any input was not previously checked for
1429 * UTF-8 (because it was off), but now we do need to check it, or our
1430 * assumptions about the input being sane could be wrong, and we could
1431 * segfault. This routine just sets a flag so that the next time we look
1432 * at the input we do the well-formed UTF-8 check. If we aren't in the
1433 * proper phase, there may not be a parser object, but if there is, setting
1434 * the flag is harmless */
1435
1436 if (PL_parser) {
1437 PL_parser->recheck_utf8_validity = TRUE;
1438 }
1439 }
1440
1441 /*
1442 =for apidoc lex_next_chunk
1443
1444 Reads in the next chunk of text to be lexed, appending it to
1445 L</PL_parser-E<gt>linestr>. This should be called when lexing code has
1446 looked to the end of the current chunk and wants to know more. It is
1447 usual, but not necessary, for lexing to have consumed the entirety of
1448 the current chunk at this time.
1449
1450 If L</PL_parser-E<gt>bufptr> is pointing to the very end of the current
1451 chunk (i.e., the current chunk has been entirely consumed), normally the
1452 current chunk will be discarded at the same time that the new chunk is
1453 read in. If C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, the current chunk
1454 will not be discarded. If the current chunk has not been entirely
1455 consumed, then it will not be discarded regardless of the flag.
1456
1457 Returns true if some new text was added to the buffer, or false if the
1458 buffer has reached the end of the input text.
1459
1460 =for apidoc Amnh||LEX_KEEP_PREVIOUS
1461
1462 =cut
1463 */
1464
1465 #define LEX_FAKE_EOF 0x80000000
1466 #define LEX_NO_TERM 0x40000000 /* here-doc */
1467
1468 bool
Perl_lex_next_chunk(pTHX_ U32 flags)1469 Perl_lex_next_chunk(pTHX_ U32 flags)
1470 {
1471 SV *linestr;
1472 char *buf;
1473 STRLEN old_bufend_pos, new_bufend_pos;
1474 STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos;
1475 STRLEN linestart_pos, last_uni_pos, last_lop_pos;
1476 bool got_some_for_debugger = 0;
1477 bool got_some;
1478
1479 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
1480 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
1481 if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
1482 return FALSE;
1483 linestr = PL_parser->linestr;
1484 buf = SvPVX(linestr);
1485 if (!(flags & LEX_KEEP_PREVIOUS)
1486 && PL_parser->bufptr == PL_parser->bufend)
1487 {
1488 old_bufend_pos = bufptr_pos = oldbufptr_pos = oldoldbufptr_pos = 0;
1489 linestart_pos = 0;
1490 if (PL_parser->last_uni != PL_parser->bufend)
1491 PL_parser->last_uni = NULL;
1492 if (PL_parser->last_lop != PL_parser->bufend)
1493 PL_parser->last_lop = NULL;
1494 last_uni_pos = last_lop_pos = 0;
1495 *buf = 0;
1496 SvCUR_set(linestr, 0);
1497 } else {
1498 old_bufend_pos = PL_parser->bufend - buf;
1499 bufptr_pos = PL_parser->bufptr - buf;
1500 oldbufptr_pos = PL_parser->oldbufptr - buf;
1501 oldoldbufptr_pos = PL_parser->oldoldbufptr - buf;
1502 linestart_pos = PL_parser->linestart - buf;
1503 last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
1504 last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
1505 }
1506 if (flags & LEX_FAKE_EOF) {
1507 goto eof;
1508 } else if (!PL_parser->rsfp && !PL_parser->filtered) {
1509 got_some = 0;
1510 } else if (filter_gets(linestr, old_bufend_pos)) {
1511 got_some = 1;
1512 got_some_for_debugger = 1;
1513 } else if (flags & LEX_NO_TERM) {
1514 got_some = 0;
1515 } else {
1516 if (!SvPOK(linestr)) /* can get undefined by filter_gets */
1517 SvPVCLEAR(linestr);
1518 eof:
1519 /* End of real input. Close filehandle (unless it was STDIN),
1520 * then add implicit termination.
1521 */
1522 if (PL_parser->lex_flags & LEX_DONT_CLOSE_RSFP)
1523 PerlIO_clearerr(PL_parser->rsfp);
1524 else if (PL_parser->rsfp)
1525 (void)PerlIO_close(PL_parser->rsfp);
1526 PL_parser->rsfp = NULL;
1527 PL_parser->in_pod = PL_parser->filtered = 0;
1528 if (!PL_in_eval && PL_minus_p) {
1529 sv_catpvs(linestr,
1530 /*{*/";}continue{print or die qq(-p destination: $!\\n);}");
1531 PL_minus_n = PL_minus_p = 0;
1532 } else if (!PL_in_eval && PL_minus_n) {
1533 sv_catpvs(linestr, /*{*/";}");
1534 PL_minus_n = 0;
1535 } else
1536 sv_catpvs(linestr, ";");
1537 got_some = 1;
1538 }
1539 buf = SvPVX(linestr);
1540 new_bufend_pos = SvCUR(linestr);
1541 PL_parser->bufend = buf + new_bufend_pos;
1542 PL_parser->bufptr = buf + bufptr_pos;
1543
1544 if (UTF) {
1545 const U8* first_bad_char_loc;
1546 if (UNLIKELY(! is_utf8_string_loc(
1547 (U8 *) PL_parser->bufptr,
1548 PL_parser->bufend - PL_parser->bufptr,
1549 &first_bad_char_loc)))
1550 {
1551 _force_out_malformed_utf8_message(first_bad_char_loc,
1552 (U8 *) PL_parser->bufend,
1553 0,
1554 1 /* 1 means die */ );
1555 NOT_REACHED; /* NOTREACHED */
1556 }
1557 }
1558
1559 PL_parser->oldbufptr = buf + oldbufptr_pos;
1560 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
1561 PL_parser->linestart = buf + linestart_pos;
1562 if (PL_parser->last_uni)
1563 PL_parser->last_uni = buf + last_uni_pos;
1564 if (PL_parser->last_lop)
1565 PL_parser->last_lop = buf + last_lop_pos;
1566 if (PL_parser->preambling != NOLINE) {
1567 CopLINE_set(PL_curcop, PL_parser->preambling + 1);
1568 PL_parser->preambling = NOLINE;
1569 }
1570 if ( got_some_for_debugger
1571 && PERLDB_LINE_OR_SAVESRC
1572 && PL_curstash != PL_debstash)
1573 {
1574 /* debugger active and we're not compiling the debugger code,
1575 * so store the line into the debugger's array of lines
1576 */
1577 update_debugger_info(NULL, buf+old_bufend_pos,
1578 new_bufend_pos-old_bufend_pos);
1579 }
1580 return got_some;
1581 }
1582
1583 /*
1584 =for apidoc lex_peek_unichar
1585
1586 Looks ahead one (Unicode) character in the text currently being lexed.
1587 Returns the codepoint (unsigned integer value) of the next character,
1588 or -1 if lexing has reached the end of the input text. To consume the
1589 peeked character, use L</lex_read_unichar>.
1590
1591 If the next character is in (or extends into) the next chunk of input
1592 text, the next chunk will be read in. Normally the current chunk will be
1593 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1594 bit set, then the current chunk will not be discarded.
1595
1596 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1597 is encountered, an exception is generated.
1598
1599 =cut
1600 */
1601
1602 I32
Perl_lex_peek_unichar(pTHX_ U32 flags)1603 Perl_lex_peek_unichar(pTHX_ U32 flags)
1604 {
1605 char *s, *bufend;
1606 if (flags & ~(LEX_KEEP_PREVIOUS))
1607 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar");
1608 s = PL_parser->bufptr;
1609 bufend = PL_parser->bufend;
1610 if (UTF) {
1611 U8 head;
1612 I32 unichar;
1613 STRLEN len, retlen;
1614 if (s == bufend) {
1615 if (!lex_next_chunk(flags))
1616 return -1;
1617 s = PL_parser->bufptr;
1618 bufend = PL_parser->bufend;
1619 }
1620 head = (U8)*s;
1621 if (UTF8_IS_INVARIANT(head))
1622 return head;
1623 if (UTF8_IS_START(head)) {
1624 len = UTF8SKIP(&head);
1625 while ((STRLEN)(bufend-s) < len) {
1626 if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
1627 break;
1628 s = PL_parser->bufptr;
1629 bufend = PL_parser->bufend;
1630 }
1631 }
1632 unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY);
1633 if (retlen == (STRLEN)-1) {
1634 _force_out_malformed_utf8_message((U8 *) s,
1635 (U8 *) bufend,
1636 0,
1637 1 /* 1 means die */ );
1638 NOT_REACHED; /* NOTREACHED */
1639 }
1640 return unichar;
1641 } else {
1642 if (s == bufend) {
1643 if (!lex_next_chunk(flags))
1644 return -1;
1645 s = PL_parser->bufptr;
1646 }
1647 return (U8)*s;
1648 }
1649 }
1650
1651 /*
1652 =for apidoc lex_read_unichar
1653
1654 Reads the next (Unicode) character in the text currently being lexed.
1655 Returns the codepoint (unsigned integer value) of the character read,
1656 and moves L</PL_parser-E<gt>bufptr> past the character, or returns -1
1657 if lexing has reached the end of the input text. To non-destructively
1658 examine the next character, use L</lex_peek_unichar> instead.
1659
1660 If the next character is in (or extends into) the next chunk of input
1661 text, the next chunk will be read in. Normally the current chunk will be
1662 discarded at the same time, but if C<flags> has the C<LEX_KEEP_PREVIOUS>
1663 bit set, then the current chunk will not be discarded.
1664
1665 If the input is being interpreted as UTF-8 and a UTF-8 encoding error
1666 is encountered, an exception is generated.
1667
1668 =cut
1669 */
1670
1671 I32
Perl_lex_read_unichar(pTHX_ U32 flags)1672 Perl_lex_read_unichar(pTHX_ U32 flags)
1673 {
1674 I32 c;
1675 if (flags & ~(LEX_KEEP_PREVIOUS))
1676 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_unichar");
1677 c = lex_peek_unichar(flags);
1678 if (c != -1) {
1679 if (c == '\n')
1680 COPLINE_INC_WITH_HERELINES;
1681 if (UTF)
1682 PL_parser->bufptr += UTF8SKIP(PL_parser->bufptr);
1683 else
1684 ++(PL_parser->bufptr);
1685 }
1686 return c;
1687 }
1688
1689 /*
1690 =for apidoc lex_read_space
1691
1692 Reads optional spaces, in Perl style, in the text currently being
1693 lexed. The spaces may include ordinary whitespace characters and
1694 Perl-style comments. C<#line> directives are processed if encountered.
1695 L</PL_parser-E<gt>bufptr> is moved past the spaces, so that it points
1696 at a non-space character (or the end of the input text).
1697
1698 If spaces extend into the next chunk of input text, the next chunk will
1699 be read in. Normally the current chunk will be discarded at the same
1700 time, but if C<flags> has the C<LEX_KEEP_PREVIOUS> bit set, then the current
1701 chunk will not be discarded.
1702
1703 =cut
1704 */
1705
1706 #define LEX_NO_INCLINE 0x40000000
1707 #define LEX_NO_NEXT_CHUNK 0x80000000
1708
1709 void
Perl_lex_read_space(pTHX_ U32 flags)1710 Perl_lex_read_space(pTHX_ U32 flags)
1711 {
1712 char *s, *bufend;
1713 const bool can_incline = !(flags & LEX_NO_INCLINE);
1714 bool need_incline = 0;
1715 if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK|LEX_NO_INCLINE))
1716 Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space");
1717 s = PL_parser->bufptr;
1718 bufend = PL_parser->bufend;
1719 while (1) {
1720 char c = *s;
1721 if (c == '#') {
1722 do {
1723 c = *++s;
1724 } while (!(c == '\n' || (c == 0 && s == bufend)));
1725 } else if (c == '\n') {
1726 s++;
1727 if (can_incline) {
1728 PL_parser->linestart = s;
1729 if (s == bufend)
1730 need_incline = 1;
1731 else
1732 incline(s, bufend);
1733 }
1734 } else if (isSPACE(c)) {
1735 s++;
1736 } else if (c == 0 && s == bufend) {
1737 bool got_more;
1738 line_t l;
1739 if (flags & LEX_NO_NEXT_CHUNK)
1740 break;
1741 PL_parser->bufptr = s;
1742 l = CopLINE(PL_curcop);
1743 CopLINE(PL_curcop) += PL_parser->herelines + 1;
1744 got_more = lex_next_chunk(flags);
1745 CopLINE_set(PL_curcop, l);
1746 s = PL_parser->bufptr;
1747 bufend = PL_parser->bufend;
1748 if (!got_more)
1749 break;
1750 if (can_incline && need_incline && PL_parser->rsfp) {
1751 incline(s, bufend);
1752 need_incline = 0;
1753 }
1754 } else if (!c) {
1755 s++;
1756 } else {
1757 break;
1758 }
1759 }
1760 PL_parser->bufptr = s;
1761 }
1762
1763 /*
1764
1765 =for apidoc validate_proto
1766
1767 This function performs syntax checking on a prototype, C<proto>.
1768 If C<warn> is true, any illegal characters or mismatched brackets
1769 will trigger illegalproto warnings, declaring that they were
1770 detected in the prototype for C<name>.
1771
1772 The return value is C<true> if this is a valid prototype, and
1773 C<false> if it is not, regardless of whether C<warn> was C<true> or
1774 C<false>.
1775
1776 Note that C<NULL> is a valid C<proto> and will always return C<true>.
1777
1778 =cut
1779
1780 */
1781
1782 bool
Perl_validate_proto(pTHX_ SV * name,SV * proto,bool warn,bool curstash)1783 Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash)
1784 {
1785 STRLEN len, origlen;
1786 char *p;
1787 bool bad_proto = FALSE;
1788 bool in_brackets = FALSE;
1789 bool after_slash = FALSE;
1790 char greedy_proto = ' ';
1791 bool proto_after_greedy_proto = FALSE;
1792 bool must_be_last = FALSE;
1793 bool underscore = FALSE;
1794 bool bad_proto_after_underscore = FALSE;
1795
1796 PERL_ARGS_ASSERT_VALIDATE_PROTO;
1797
1798 if (!proto)
1799 return TRUE;
1800
1801 p = SvPV(proto, len);
1802 origlen = len;
1803 for (; len--; p++) {
1804 if (!isSPACE(*p)) {
1805 if (must_be_last)
1806 proto_after_greedy_proto = TRUE;
1807 if (underscore) {
1808 if (!memCHRs(";@%", *p))
1809 bad_proto_after_underscore = TRUE;
1810 underscore = FALSE;
1811 }
1812 if (!memCHRs("$@%*;[]&\\_+", *p) || *p == '\0') {
1813 bad_proto = TRUE;
1814 }
1815 else {
1816 if (*p == '[')
1817 in_brackets = TRUE;
1818 else if (*p == ']')
1819 in_brackets = FALSE;
1820 else if ((*p == '@' || *p == '%')
1821 && !after_slash
1822 && !in_brackets )
1823 {
1824 must_be_last = TRUE;
1825 greedy_proto = *p;
1826 }
1827 else if (*p == '_')
1828 underscore = TRUE;
1829 }
1830 if (*p == '\\')
1831 after_slash = TRUE;
1832 else
1833 after_slash = FALSE;
1834 }
1835 }
1836
1837 if (warn) {
1838 SV *tmpsv = newSVpvs_flags("", SVs_TEMP);
1839 p -= origlen;
1840 p = SvUTF8(proto)
1841 ? sv_uni_display(tmpsv, newSVpvn_flags(p, origlen, SVs_TEMP | SVf_UTF8),
1842 origlen, UNI_DISPLAY_ISPRINT)
1843 : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII);
1844
1845 if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) {
1846 SV *name2 = sv_2mortal(newSVsv(PL_curstname));
1847 sv_catpvs(name2, "::");
1848 sv_catsv(name2, (SV *)name);
1849 name = name2;
1850 }
1851
1852 if (proto_after_greedy_proto)
1853 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1854 "Prototype after '%c' for %" SVf " : %s",
1855 greedy_proto, SVfARG(name), p);
1856 if (in_brackets)
1857 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1858 "Missing ']' in prototype for %" SVf " : %s",
1859 SVfARG(name), p);
1860 if (bad_proto)
1861 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1862 "Illegal character in prototype for %" SVf " : %s",
1863 SVfARG(name), p);
1864 if (bad_proto_after_underscore)
1865 Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO),
1866 "Illegal character after '_' in prototype for %" SVf " : %s",
1867 SVfARG(name), p);
1868 }
1869
1870 return (! (proto_after_greedy_proto || bad_proto) );
1871 }
1872
1873 /*
1874 * S_incline
1875 * This subroutine has nothing to do with tilting, whether at windmills
1876 * or pinball tables. Its name is short for "increment line". It
1877 * increments the current line number in CopLINE(PL_curcop) and checks
1878 * to see whether the line starts with a comment of the form
1879 * # line 500 "foo.pm"
1880 * If so, it sets the current line number and file to the values in the comment.
1881 */
1882
1883 STATIC void
S_incline(pTHX_ const char * s,const char * end)1884 S_incline(pTHX_ const char *s, const char *end)
1885 {
1886 const char *t;
1887 const char *n;
1888 const char *e;
1889 line_t line_num;
1890 UV uv;
1891
1892 PERL_ARGS_ASSERT_INCLINE;
1893
1894 assert(end >= s);
1895
1896 COPLINE_INC_WITH_HERELINES;
1897 if (!PL_rsfp && !PL_parser->filtered && PL_lex_state == LEX_NORMAL
1898 && s+1 == PL_bufend && *s == ';') {
1899 /* fake newline in string eval */
1900 CopLINE_dec(PL_curcop);
1901 return;
1902 }
1903 if (*s++ != '#')
1904 return;
1905 while (SPACE_OR_TAB(*s))
1906 s++;
1907 if (memBEGINs(s, (STRLEN) (end - s), "line"))
1908 s += sizeof("line") - 1;
1909 else
1910 return;
1911 if (SPACE_OR_TAB(*s))
1912 s++;
1913 else
1914 return;
1915 while (SPACE_OR_TAB(*s))
1916 s++;
1917 if (!isDIGIT(*s))
1918 return;
1919
1920 n = s;
1921 while (isDIGIT(*s))
1922 s++;
1923 if (!SPACE_OR_TAB(*s) && *s != '\r' && *s != '\n' && *s != '\0')
1924 return;
1925 while (SPACE_OR_TAB(*s))
1926 s++;
1927 if (*s == '"' && (t = (char *) memchr(s+1, '"', end - s))) {
1928 s++;
1929 e = t + 1;
1930 }
1931 else {
1932 t = s;
1933 while (*t && !isSPACE(*t))
1934 t++;
1935 e = t;
1936 }
1937 while (SPACE_OR_TAB(*e) || *e == '\r' || *e == '\f')
1938 e++;
1939 if (*e != '\n' && *e != '\0')
1940 return; /* false alarm */
1941
1942 if (!grok_atoUV(n, &uv, &e))
1943 return;
1944 line_num = ((line_t)uv) - 1;
1945
1946 if (t - s > 0) {
1947 const STRLEN len = t - s;
1948
1949 if (!PL_rsfp && !PL_parser->filtered) {
1950 /* must copy *{"::_<(eval N)[oldfilename:L]"}
1951 * to *{"::_<newfilename"} */
1952 /* However, the long form of evals is only turned on by the
1953 debugger - usually they're "(eval %lu)" */
1954 GV * const cfgv = CopFILEGV(PL_curcop);
1955 if (cfgv) {
1956 char smallbuf[128];
1957 STRLEN tmplen2 = len;
1958 char *tmpbuf2;
1959 GV *gv2;
1960
1961 if (tmplen2 + 2 <= sizeof smallbuf)
1962 tmpbuf2 = smallbuf;
1963 else
1964 Newx(tmpbuf2, tmplen2 + 2, char);
1965
1966 tmpbuf2[0] = '_';
1967 tmpbuf2[1] = '<';
1968
1969 memcpy(tmpbuf2 + 2, s, tmplen2);
1970 tmplen2 += 2;
1971
1972 gv2 = *(GV**)hv_fetch(PL_defstash, tmpbuf2, tmplen2, TRUE);
1973 if (!isGV(gv2)) {
1974 gv_init(gv2, PL_defstash, tmpbuf2, tmplen2, FALSE);
1975 /* adjust ${"::_<newfilename"} to store the new file name */
1976 GvSV(gv2) = newSVpvn(tmpbuf2 + 2, tmplen2 - 2);
1977 /* The line number may differ. If that is the case,
1978 alias the saved lines that are in the array.
1979 Otherwise alias the whole array. */
1980 if (CopLINE(PL_curcop) == line_num) {
1981 GvHV(gv2) = MUTABLE_HV(SvREFCNT_inc(GvHV(cfgv)));
1982 GvAV(gv2) = MUTABLE_AV(SvREFCNT_inc(GvAV(cfgv)));
1983 }
1984 else if (GvAV(cfgv)) {
1985 AV * const av = GvAV(cfgv);
1986 const line_t start = CopLINE(PL_curcop)+1;
1987 SSize_t items = AvFILLp(av) - start;
1988 if (items > 0) {
1989 AV * const av2 = GvAVn(gv2);
1990 SV **svp = AvARRAY(av) + start;
1991 Size_t l = line_num+1;
1992 while (items-- && l < SSize_t_MAX && l == (line_t)l)
1993 av_store(av2, (SSize_t)l++, SvREFCNT_inc(*svp++));
1994 }
1995 }
1996 }
1997
1998 if (tmpbuf2 != smallbuf) Safefree(tmpbuf2);
1999 }
2000 }
2001 CopFILE_free(PL_curcop);
2002 CopFILE_setn(PL_curcop, s, len);
2003 }
2004 CopLINE_set(PL_curcop, line_num);
2005 }
2006
2007 STATIC void
S_update_debugger_info(pTHX_ SV * orig_sv,const char * const buf,STRLEN len)2008 S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len)
2009 {
2010 AV *av = CopFILEAVx(PL_curcop);
2011 if (av) {
2012 SV * sv;
2013 if (PL_parser->preambling == NOLINE) sv = newSV_type(SVt_PVMG);
2014 else {
2015 sv = *av_fetch(av, 0, 1);
2016 SvUPGRADE(sv, SVt_PVMG);
2017 }
2018 if (!SvPOK(sv)) SvPVCLEAR(sv);
2019 if (orig_sv)
2020 sv_catsv(sv, orig_sv);
2021 else
2022 sv_catpvn(sv, buf, len);
2023 if (!SvIOK(sv)) {
2024 (void)SvIOK_on(sv);
2025 SvIV_set(sv, 0);
2026 }
2027 if (PL_parser->preambling == NOLINE)
2028 av_store(av, CopLINE(PL_curcop), sv);
2029 }
2030 }
2031
2032 /*
2033 * skipspace
2034 * Called to gobble the appropriate amount and type of whitespace.
2035 * Skips comments as well.
2036 * Returns the next character after the whitespace that is skipped.
2037 *
2038 * peekspace
2039 * Same thing, but look ahead without incrementing line numbers or
2040 * adjusting PL_linestart.
2041 */
2042
2043 #define skipspace(s) skipspace_flags(s, 0)
2044 #define peekspace(s) skipspace_flags(s, LEX_NO_INCLINE)
2045
2046 char *
Perl_skipspace_flags(pTHX_ char * s,U32 flags)2047 Perl_skipspace_flags(pTHX_ char *s, U32 flags)
2048 {
2049 PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
2050 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
2051 while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
2052 s++;
2053 } else {
2054 STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
2055 PL_bufptr = s;
2056 lex_read_space(flags | LEX_KEEP_PREVIOUS |
2057 (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
2058 LEX_NO_NEXT_CHUNK : 0));
2059 s = PL_bufptr;
2060 PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
2061 if (PL_linestart > PL_bufptr)
2062 PL_bufptr = PL_linestart;
2063 return s;
2064 }
2065 return s;
2066 }
2067
2068 /*
2069 * S_check_uni
2070 * Check the unary operators to ensure there's no ambiguity in how they're
2071 * used. An ambiguous piece of code would be:
2072 * rand + 5
2073 * This doesn't mean rand() + 5. Because rand() is a unary operator,
2074 * the +5 is its argument.
2075 */
2076
2077 STATIC void
S_check_uni(pTHX)2078 S_check_uni(pTHX)
2079 {
2080 const char *s;
2081
2082 if (PL_oldoldbufptr != PL_last_uni)
2083 return;
2084 while (isSPACE(*PL_last_uni))
2085 PL_last_uni++;
2086 s = PL_last_uni;
2087 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
2088 s += UTF ? UTF8SKIP(s) : 1;
2089 if (s < PL_bufptr && memchr(s, '(', PL_bufptr - s))
2090 return;
2091
2092 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
2093 "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous",
2094 UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
2095 }
2096
2097 /*
2098 * LOP : macro to build a list operator. Its behaviour has been replaced
2099 * with a subroutine, S_lop() for which LOP is just another name.
2100 */
2101
2102 #define LOP(f,x) return lop(f,x,s)
2103
2104 /*
2105 * S_lop
2106 * Build a list operator (or something that might be one). The rules:
2107 * - if we have a next token, then it's a list operator (no parens) for
2108 * which the next token has already been parsed; e.g.,
2109 * sort foo @args
2110 * sort foo (@args)
2111 * - if the next thing is an opening paren, then it's a function
2112 * - else it's a list operator
2113 */
2114
2115 STATIC I32
S_lop(pTHX_ I32 f,U8 x,char * s)2116 S_lop(pTHX_ I32 f, U8 x, char *s)
2117 {
2118 PERL_ARGS_ASSERT_LOP;
2119
2120 pl_yylval.ival = f;
2121 CLINE;
2122 PL_bufptr = s;
2123 PL_last_lop = PL_oldbufptr;
2124 PL_last_lop_op = (OPCODE)f;
2125 if (PL_nexttoke)
2126 goto lstop;
2127 PL_expect = x;
2128 if (*s == '(')
2129 return REPORT(FUNC);
2130 s = skipspace(s);
2131 if (*s == '(')
2132 return REPORT(FUNC);
2133 else {
2134 lstop:
2135 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
2136 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
2137 return REPORT(LSTOP);
2138 }
2139 }
2140
2141 /*
2142 * S_force_next
2143 * When the lexer realizes it knows the next token (for instance,
2144 * it is reordering tokens for the parser) then it can call S_force_next
2145 * to know what token to return the next time the lexer is called. Caller
2146 * will need to set PL_nextval[] and possibly PL_expect to ensure
2147 * the lexer handles the token correctly.
2148 */
2149
2150 STATIC void
S_force_next(pTHX_ I32 type)2151 S_force_next(pTHX_ I32 type)
2152 {
2153 #ifdef DEBUGGING
2154 if (DEBUG_T_TEST) {
2155 PerlIO_printf(Perl_debug_log, "### forced token:\n");
2156 tokereport(type, &NEXTVAL_NEXTTOKE);
2157 }
2158 #endif
2159 assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
2160 PL_nexttype[PL_nexttoke] = type;
2161 PL_nexttoke++;
2162 }
2163
2164 /*
2165 * S_postderef
2166 *
2167 * This subroutine handles postfix deref syntax after the arrow has already
2168 * been emitted. @* $* etc. are emitted as two separate tokens right here.
2169 * @[ @{ %[ %{ *{ are emitted also as two tokens, but this function emits
2170 * only the first, leaving yylex to find the next.
2171 */
2172
2173 static int
S_postderef(pTHX_ int const funny,char const next)2174 S_postderef(pTHX_ int const funny, char const next)
2175 {
2176 assert(funny == DOLSHARP
2177 || funny == PERLY_DOLLAR
2178 || funny == PERLY_SNAIL
2179 || funny == PERLY_PERCENT_SIGN
2180 || funny == PERLY_AMPERSAND
2181 || funny == PERLY_STAR
2182 );
2183 if (next == '*') {
2184 PL_expect = XOPERATOR;
2185 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
2186 assert(PERLY_SNAIL == funny || PERLY_DOLLAR == funny || DOLSHARP == funny);
2187 PL_lex_state = LEX_INTERPEND;
2188 if (PERLY_SNAIL == funny)
2189 force_next(POSTJOIN);
2190 }
2191 force_next(PERLY_STAR);
2192 PL_bufptr+=2;
2193 }
2194 else {
2195 if (PERLY_SNAIL == funny && PL_lex_state == LEX_INTERPNORMAL
2196 && !PL_lex_brackets)
2197 PL_lex_dojoin = 2;
2198 PL_expect = XOPERATOR;
2199 PL_bufptr++;
2200 }
2201 return funny;
2202 }
2203
2204 void
Perl_yyunlex(pTHX)2205 Perl_yyunlex(pTHX)
2206 {
2207 int yyc = PL_parser->yychar;
2208 if (yyc != YYEMPTY) {
2209 if (yyc) {
2210 NEXTVAL_NEXTTOKE = PL_parser->yylval;
2211 if (yyc == PERLY_BRACE_OPEN || yyc == HASHBRACK || yyc == PERLY_BRACKET_OPEN) {
2212 PL_lex_allbrackets--;
2213 PL_lex_brackets--;
2214 yyc |= (3<<24) | (PL_lex_brackstack[PL_lex_brackets] << 16);
2215 } else if (yyc == PERLY_PAREN_OPEN) {
2216 PL_lex_allbrackets--;
2217 yyc |= (2<<24);
2218 }
2219 force_next(yyc);
2220 }
2221 PL_parser->yychar = YYEMPTY;
2222 }
2223 }
2224
2225 STATIC SV *
S_newSV_maybe_utf8(pTHX_ const char * const start,STRLEN len)2226 S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len)
2227 {
2228 SV * const sv = newSVpvn_utf8(start, len,
2229 ! IN_BYTES
2230 && UTF
2231 && len != 0
2232 && is_utf8_non_invariant_string((const U8*)start, len));
2233 return sv;
2234 }
2235
2236 /*
2237 * S_force_word
2238 * When the lexer knows the next thing is a word (for instance, it has
2239 * just seen -> and it knows that the next char is a word char, then
2240 * it calls S_force_word to stick the next word into the PL_nexttoke/val
2241 * lookahead.
2242 *
2243 * Arguments:
2244 * char *start : buffer position (must be within PL_linestr)
2245 * int token : PL_next* will be this type of bare word
2246 * (e.g., METHCALL0,BAREWORD)
2247 * int check_keyword : if true, Perl checks to make sure the word isn't
2248 * a keyword (do this if the word is a label, e.g. goto FOO)
2249 * int allow_pack : if true, : characters will also be allowed (require,
2250 * use, etc. do this)
2251 */
2252
2253 STATIC char *
S_force_word(pTHX_ char * start,int token,int check_keyword,int allow_pack)2254 S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
2255 {
2256 char *s;
2257 STRLEN len;
2258
2259 PERL_ARGS_ASSERT_FORCE_WORD;
2260
2261 start = skipspace(start);
2262 s = start;
2263 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
2264 || (allow_pack && *s == ':' && s[1] == ':') )
2265 {
2266 s = scan_word6(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len, allow_pack);
2267 if (check_keyword) {
2268 char *s2 = PL_tokenbuf;
2269 STRLEN len2 = len;
2270 if (allow_pack && memBEGINPs(s2, len, "CORE::")) {
2271 s2 += sizeof("CORE::") - 1;
2272 len2 -= sizeof("CORE::") - 1;
2273 }
2274 if (keyword(s2, len2, 0))
2275 return start;
2276 }
2277 if (token == METHCALL0) {
2278 s = skipspace(s);
2279 if (*s == '(')
2280 PL_expect = XTERM;
2281 else {
2282 PL_expect = XOPERATOR;
2283 }
2284 }
2285 NEXTVAL_NEXTTOKE.opval
2286 = newSVOP(OP_CONST,0,
2287 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
2288 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
2289 force_next(token);
2290 }
2291 return s;
2292 }
2293
2294 /*
2295 * S_force_ident
2296 * Called when the lexer wants $foo *foo &foo etc, but the program
2297 * text only contains the "foo" portion. The first argument is a pointer
2298 * to the "foo", and the second argument is the type symbol to prefix.
2299 * Forces the next token to be a "BAREWORD".
2300 * Creates the symbol if it didn't already exist (via gv_fetchpv()).
2301 */
2302
2303 STATIC void
S_force_ident(pTHX_ const char * s,int kind)2304 S_force_ident(pTHX_ const char *s, int kind)
2305 {
2306 PERL_ARGS_ASSERT_FORCE_IDENT;
2307
2308 if (s[0]) {
2309 const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */
2310 OP* const o = newSVOP(OP_CONST, 0, newSVpvn_flags(s, len,
2311 UTF ? SVf_UTF8 : 0));
2312 NEXTVAL_NEXTTOKE.opval = o;
2313 force_next(BAREWORD);
2314 if (kind) {
2315 o->op_private = OPpCONST_ENTERED;
2316 /* XXX see note in pp_entereval() for why we forgo typo
2317 warnings if the symbol must be introduced in an eval.
2318 GSAR 96-10-12 */
2319 gv_fetchpvn_flags(s, len,
2320 (PL_in_eval ? GV_ADDMULTI
2321 : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ),
2322 kind == PERLY_DOLLAR ? SVt_PV :
2323 kind == PERLY_SNAIL ? SVt_PVAV :
2324 kind == PERLY_PERCENT_SIGN ? SVt_PVHV :
2325 SVt_PVGV
2326 );
2327 }
2328 }
2329 }
2330
2331 static void
S_force_ident_maybe_lex(pTHX_ char pit)2332 S_force_ident_maybe_lex(pTHX_ char pit)
2333 {
2334 NEXTVAL_NEXTTOKE.ival = pit;
2335 force_next('p');
2336 }
2337
2338 NV
Perl_str_to_version(pTHX_ SV * sv)2339 Perl_str_to_version(pTHX_ SV *sv)
2340 {
2341 NV retval = 0.0;
2342 NV nshift = 1.0;
2343 STRLEN len;
2344 const char *start = SvPV_const(sv,len);
2345 const char * const end = start + len;
2346 const bool utf = cBOOL(SvUTF8(sv));
2347
2348 PERL_ARGS_ASSERT_STR_TO_VERSION;
2349
2350 while (start < end) {
2351 STRLEN skip;
2352 UV n;
2353 if (utf)
2354 n = utf8n_to_uvchr((U8*)start, len, &skip, 0);
2355 else {
2356 n = *(U8*)start;
2357 skip = 1;
2358 }
2359 retval += ((NV)n)/nshift;
2360 start += skip;
2361 nshift *= 1000;
2362 }
2363 return retval;
2364 }
2365
2366 /*
2367 * S_force_version
2368 * Forces the next token to be a version number.
2369 * If the next token appears to be an invalid version number, (e.g. "v2b"),
2370 * and if "guessing" is TRUE, then no new token is created (and the caller
2371 * must use an alternative parsing method).
2372 */
2373
2374 STATIC char *
S_force_version(pTHX_ char * s,int guessing)2375 S_force_version(pTHX_ char *s, int guessing)
2376 {
2377 OP *version = NULL;
2378 char *d;
2379
2380 PERL_ARGS_ASSERT_FORCE_VERSION;
2381
2382 s = skipspace(s);
2383
2384 d = s;
2385 if (*d == 'v')
2386 d++;
2387 if (isDIGIT(*d)) {
2388 while (isDIGIT(*d) || *d == '_' || *d == '.')
2389 d++;
2390 if (*d == ';' || isSPACE(*d) || *d == '{' || *d == '}' || !*d) {
2391 SV *ver;
2392 s = scan_num(s, &pl_yylval);
2393 version = pl_yylval.opval;
2394 ver = cSVOPx(version)->op_sv;
2395 if (SvPOK(ver) && !SvNIOK(ver)) {
2396 SvUPGRADE(ver, SVt_PVNV);
2397 SvNV_set(ver, str_to_version(ver));
2398 SvNOK_on(ver); /* hint that it is a version */
2399 }
2400 }
2401 else if (guessing) {
2402 return s;
2403 }
2404 }
2405
2406 /* NOTE: The parser sees the package name and the VERSION swapped */
2407 NEXTVAL_NEXTTOKE.opval = version;
2408 force_next(BAREWORD);
2409
2410 return s;
2411 }
2412
2413 /*
2414 * S_force_strict_version
2415 * Forces the next token to be a version number using strict syntax rules.
2416 */
2417
2418 STATIC char *
S_force_strict_version(pTHX_ char * s)2419 S_force_strict_version(pTHX_ char *s)
2420 {
2421 OP *version = NULL;
2422 const char *errstr = NULL;
2423
2424 PERL_ARGS_ASSERT_FORCE_STRICT_VERSION;
2425
2426 while (isSPACE(*s)) /* leading whitespace */
2427 s++;
2428
2429 if (is_STRICT_VERSION(s,&errstr)) {
2430 SV *ver = newSV_type(SVt_NULL);
2431 s = (char *)scan_version(s, ver, 0);
2432 version = newSVOP(OP_CONST, 0, ver);
2433 }
2434 else if ((*s != ';' && *s != ':' && *s != '{' && *s != '}' )
2435 && (s = skipspace(s), (*s != ';' && *s != ':' && *s != '{' && *s != '}' )))
2436 {
2437 PL_bufptr = s;
2438 if (errstr)
2439 yyerror(errstr); /* version required */
2440 return s;
2441 }
2442
2443 /* NOTE: The parser sees the package name and the VERSION swapped */
2444 NEXTVAL_NEXTTOKE.opval = version;
2445 force_next(BAREWORD);
2446
2447 return s;
2448 }
2449
2450 /*
2451 * S_tokeq
2452 * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv',
2453 * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is
2454 * unchanged, and a new SV containing the modified input is returned.
2455 */
2456
2457 STATIC SV *
S_tokeq(pTHX_ SV * sv)2458 S_tokeq(pTHX_ SV *sv)
2459 {
2460 char *s;
2461 char *send;
2462 char *d;
2463 SV *pv = sv;
2464
2465 PERL_ARGS_ASSERT_TOKEQ;
2466
2467 assert (SvPOK(sv));
2468 assert (SvLEN(sv));
2469 assert (!SvIsCOW(sv));
2470 if (SvTYPE(sv) >= SVt_PVIV && SvIVX(sv) == -1) /* <<'heredoc' */
2471 goto finish;
2472 s = SvPVX(sv);
2473 send = SvEND(sv);
2474 /* This is relying on the SV being "well formed" with a trailing '\0' */
2475 while (s < send && !(*s == '\\' && s[1] == '\\'))
2476 s++;
2477 if (s == send)
2478 goto finish;
2479 d = s;
2480 if ( PL_hints & HINT_NEW_STRING ) {
2481 pv = newSVpvn_flags(SvPVX_const(pv), SvCUR(sv),
2482 SVs_TEMP | SvUTF8(sv));
2483 }
2484 while (s < send) {
2485 if (*s == '\\') {
2486 if (s + 1 < send && (s[1] == '\\'))
2487 s++; /* all that, just for this */
2488 }
2489 *d++ = *s++;
2490 }
2491 *d = '\0';
2492 SvCUR_set(sv, d - SvPVX_const(sv));
2493 finish:
2494 if ( PL_hints & HINT_NEW_STRING )
2495 return new_constant(NULL, 0, "q", sv, pv, "q", 1, NULL);
2496 return sv;
2497 }
2498
2499 /*
2500 * Now come three functions related to double-quote context,
2501 * S_sublex_start, S_sublex_push, and S_sublex_done. They're used when
2502 * converting things like "\u\Lgnat" into ucfirst(lc("gnat")). They
2503 * interact with PL_lex_state, and create fake ( ... ) argument lists
2504 * to handle functions and concatenation.
2505 * For example,
2506 * "foo\lbar"
2507 * is tokenised as
2508 * stringify ( const[foo] concat lcfirst ( const[bar] ) )
2509 */
2510
2511 /*
2512 * S_sublex_start
2513 * Assumes that pl_yylval.ival is the op we're creating (e.g. OP_LCFIRST).
2514 *
2515 * Pattern matching will set PL_lex_op to the pattern-matching op to
2516 * make (we return THING if pl_yylval.ival is OP_NULL, PMFUNC otherwise).
2517 *
2518 * OP_CONST is easy--just make the new op and return.
2519 *
2520 * Everything else becomes a FUNC.
2521 *
2522 * Sets PL_lex_state to LEX_INTERPPUSH unless ival was OP_NULL or we
2523 * had an OP_CONST. This just sets us up for a
2524 * call to S_sublex_push().
2525 */
2526
2527 STATIC I32
S_sublex_start(pTHX)2528 S_sublex_start(pTHX)
2529 {
2530 const I32 op_type = pl_yylval.ival;
2531
2532 if (op_type == OP_NULL) {
2533 pl_yylval.opval = PL_lex_op;
2534 PL_lex_op = NULL;
2535 return THING;
2536 }
2537 if (op_type == OP_CONST) {
2538 SV *sv = PL_lex_stuff;
2539 PL_lex_stuff = NULL;
2540 sv = tokeq(sv);
2541
2542 if (SvTYPE(sv) == SVt_PVIV) {
2543 /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
2544 STRLEN len;
2545 const char * const p = SvPV_const(sv, len);
2546 SV * const nsv = newSVpvn_flags(p, len, SvUTF8(sv));
2547 SvREFCNT_dec(sv);
2548 sv = nsv;
2549 }
2550 pl_yylval.opval = newSVOP(op_type, 0, sv);
2551 return THING;
2552 }
2553
2554 PL_parser->lex_super_state = PL_lex_state;
2555 PL_parser->lex_sub_inwhat = (U16)op_type;
2556 PL_parser->lex_sub_op = PL_lex_op;
2557 PL_parser->sub_no_recover = FALSE;
2558 PL_parser->sub_error_count = PL_error_count;
2559 PL_lex_state = LEX_INTERPPUSH;
2560
2561 PL_expect = XTERM;
2562 if (PL_lex_op) {
2563 pl_yylval.opval = PL_lex_op;
2564 PL_lex_op = NULL;
2565 return PMFUNC;
2566 }
2567 else
2568 return FUNC;
2569 }
2570
2571 /*
2572 * S_sublex_push
2573 * Create a new scope to save the lexing state. The scope will be
2574 * ended in S_sublex_done. Returns a '(', starting the function arguments
2575 * to the uc, lc, etc. found before.
2576 * Sets PL_lex_state to LEX_INTERPCONCAT.
2577 */
2578
2579 STATIC I32
S_sublex_push(pTHX)2580 S_sublex_push(pTHX)
2581 {
2582 LEXSHARED *shared;
2583 const bool is_heredoc = PL_multi_close == '<';
2584 ENTER;
2585
2586 PL_lex_state = PL_parser->lex_super_state;
2587 SAVEI8(PL_lex_dojoin);
2588 SAVEI32(PL_lex_brackets);
2589 SAVEI32(PL_lex_allbrackets);
2590 SAVEI32(PL_lex_formbrack);
2591 SAVEI8(PL_lex_fakeeof);
2592 SAVEI32(PL_lex_casemods);
2593 SAVEI32(PL_lex_starts);
2594 SAVEI8(PL_lex_state);
2595 SAVESPTR(PL_lex_repl);
2596 SAVEVPTR(PL_lex_inpat);
2597 SAVEI16(PL_lex_inwhat);
2598 if (is_heredoc)
2599 {
2600 SAVECOPLINE(PL_curcop);
2601 SAVEI32(PL_multi_end);
2602 SAVEI32(PL_parser->herelines);
2603 PL_parser->herelines = 0;
2604 }
2605 SAVEIV(PL_multi_close);
2606 SAVEPPTR(PL_bufptr);
2607 SAVEPPTR(PL_bufend);
2608 SAVEPPTR(PL_oldbufptr);
2609 SAVEPPTR(PL_oldoldbufptr);
2610 SAVEPPTR(PL_last_lop);
2611 SAVEPPTR(PL_last_uni);
2612 SAVEPPTR(PL_linestart);
2613 SAVESPTR(PL_linestr);
2614 SAVEGENERICPV(PL_lex_brackstack);
2615 SAVEGENERICPV(PL_lex_casestack);
2616 SAVEGENERICPV(PL_parser->lex_shared);
2617 SAVEBOOL(PL_parser->lex_re_reparsing);
2618 SAVEI32(PL_copline);
2619
2620 /* The here-doc parser needs to be able to peek into outer lexing
2621 scopes to find the body of the here-doc. So we put PL_linestr and
2622 PL_bufptr into lex_shared, to 'share' those values.
2623 */
2624 PL_parser->lex_shared->ls_linestr = PL_linestr;
2625 PL_parser->lex_shared->ls_bufptr = PL_bufptr;
2626
2627 PL_linestr = PL_lex_stuff;
2628 PL_lex_repl = PL_parser->lex_sub_repl;
2629 PL_lex_stuff = NULL;
2630 PL_parser->lex_sub_repl = NULL;
2631
2632 /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
2633 set for an inner quote-like operator and then an error causes scope-
2634 popping. We must not have a PL_lex_stuff value left dangling, as
2635 that breaks assumptions elsewhere. See bug #123617. */
2636 SAVEGENERICSV(PL_lex_stuff);
2637 SAVEGENERICSV(PL_parser->lex_sub_repl);
2638
2639 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
2640 = SvPVX(PL_linestr);
2641 PL_bufend += SvCUR(PL_linestr);
2642 PL_last_lop = PL_last_uni = NULL;
2643 SAVEFREESV(PL_linestr);
2644 if (PL_lex_repl) SAVEFREESV(PL_lex_repl);
2645
2646 PL_lex_dojoin = FALSE;
2647 PL_lex_brackets = PL_lex_formbrack = 0;
2648 PL_lex_allbrackets = 0;
2649 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2650 Newx(PL_lex_brackstack, 120, char);
2651 Newx(PL_lex_casestack, 12, char);
2652 PL_lex_casemods = 0;
2653 *PL_lex_casestack = '\0';
2654 PL_lex_starts = 0;
2655 PL_lex_state = LEX_INTERPCONCAT;
2656 if (is_heredoc)
2657 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
2658 PL_copline = NOLINE;
2659
2660 Newxz(shared, 1, LEXSHARED);
2661 shared->ls_prev = PL_parser->lex_shared;
2662 PL_parser->lex_shared = shared;
2663
2664 PL_lex_inwhat = PL_parser->lex_sub_inwhat;
2665 if (PL_lex_inwhat == OP_TRANSR) PL_lex_inwhat = OP_TRANS;
2666 if (PL_lex_inwhat == OP_MATCH || PL_lex_inwhat == OP_QR || PL_lex_inwhat == OP_SUBST)
2667 PL_lex_inpat = PL_parser->lex_sub_op;
2668 else
2669 PL_lex_inpat = NULL;
2670
2671 PL_parser->lex_re_reparsing = cBOOL(PL_in_eval & EVAL_RE_REPARSING);
2672 PL_in_eval &= ~EVAL_RE_REPARSING;
2673
2674 return SUBLEXSTART;
2675 }
2676
2677 /*
2678 * S_sublex_done
2679 * Restores lexer state after a S_sublex_push.
2680 */
2681
2682 STATIC I32
S_sublex_done(pTHX)2683 S_sublex_done(pTHX)
2684 {
2685 if (!PL_lex_starts++) {
2686 SV * const sv = newSVpvs("");
2687 if (SvUTF8(PL_linestr))
2688 SvUTF8_on(sv);
2689 PL_expect = XOPERATOR;
2690 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
2691 return THING;
2692 }
2693
2694 if (PL_lex_casemods) { /* oops, we've got some unbalanced parens */
2695 PL_lex_state = LEX_INTERPCASEMOD;
2696 return yylex();
2697 }
2698
2699 /* Is there a right-hand side to take care of? (s//RHS/ or tr//RHS/) */
2700 assert(PL_lex_inwhat != OP_TRANSR);
2701 if (PL_lex_repl) {
2702 assert (PL_lex_inwhat == OP_SUBST || PL_lex_inwhat == OP_TRANS);
2703 PL_linestr = PL_lex_repl;
2704 PL_lex_inpat = 0;
2705 PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart = SvPVX(PL_linestr);
2706 PL_bufend += SvCUR(PL_linestr);
2707 PL_last_lop = PL_last_uni = NULL;
2708 PL_lex_dojoin = FALSE;
2709 PL_lex_brackets = 0;
2710 PL_lex_allbrackets = 0;
2711 PL_lex_fakeeof = LEX_FAKEEOF_NEVER;
2712 PL_lex_casemods = 0;
2713 *PL_lex_casestack = '\0';
2714 PL_lex_starts = 0;
2715 if (SvEVALED(PL_lex_repl)) {
2716 PL_lex_state = LEX_INTERPNORMAL;
2717 PL_lex_starts++;
2718 /* we don't clear PL_lex_repl here, so that we can check later
2719 whether this is an evalled subst; that means we rely on the
2720 logic to ensure sublex_done() is called again only via the
2721 branch (in yylex()) that clears PL_lex_repl, else we'll loop */
2722 }
2723 else {
2724 PL_lex_state = LEX_INTERPCONCAT;
2725 PL_lex_repl = NULL;
2726 }
2727 if (SvTYPE(PL_linestr) >= SVt_PVNV) {
2728 CopLINE(PL_curcop) +=
2729 ((XPVNV*)SvANY(PL_linestr))->xnv_u.xnv_lines
2730 + PL_parser->herelines;
2731 PL_parser->herelines = 0;
2732 }
2733 return PERLY_SLASH;
2734 }
2735 else {
2736 const line_t l = CopLINE(PL_curcop);
2737 LEAVE;
2738 if (PL_parser->sub_error_count != PL_error_count) {
2739 if (PL_parser->sub_no_recover) {
2740 yyquit();
2741 NOT_REACHED;
2742 }
2743 }
2744 if (PL_multi_close == '<')
2745 PL_parser->herelines += l - PL_multi_end;
2746 PL_bufend = SvPVX(PL_linestr);
2747 PL_bufend += SvCUR(PL_linestr);
2748 PL_expect = XOPERATOR;
2749 return SUBLEXEND;
2750 }
2751 }
2752
2753 HV *
Perl_load_charnames(pTHX_ SV * char_name,const char * context,const STRLEN context_len,const char ** error_msg)2754 Perl_load_charnames(pTHX_ SV * char_name, const char * context,
2755 const STRLEN context_len, const char ** error_msg)
2756 {
2757 /* Load the official _charnames module if not already there. The
2758 * parameters are just to give info for any error messages generated:
2759 * char_name a name to look up which is the reason for loading this
2760 * context 'char_name' in the context in the input in which it appears
2761 * context_len how many bytes 'context' occupies
2762 * error_msg *error_msg will be set to any error
2763 *
2764 * Returns the ^H table if success; otherwise NULL */
2765
2766 unsigned int i;
2767 HV * table;
2768 SV **cvp;
2769 SV * res;
2770
2771 PERL_ARGS_ASSERT_LOAD_CHARNAMES;
2772
2773 /* This loop is executed 1 1/2 times. On the first time through, if it
2774 * isn't already loaded, try loading it, and iterate just once to see if it
2775 * worked. */
2776 for (i = 0; i < 2; i++) {
2777 table = GvHV(PL_hintgv); /* ^H */
2778
2779 if ( table
2780 && (PL_hints & HINT_LOCALIZE_HH)
2781 && (cvp = hv_fetchs(table, "charnames", FALSE))
2782 && SvOK(*cvp))
2783 {
2784 return table; /* Quit if already loaded */
2785 }
2786
2787 if (i == 0) {
2788 Perl_load_module(aTHX_
2789 0,
2790 newSVpvs("_charnames"),
2791
2792 /* version parameter; no need to specify it, as if we get too early
2793 * a version, will fail anyway, not being able to find 'charnames'
2794 * */
2795 NULL,
2796 newSVpvs(":full"),
2797 newSVpvs(":short"),
2798 NULL);
2799 }
2800 }
2801
2802 /* Here, it failed; new_constant will give appropriate error messages */
2803 *error_msg = NULL;
2804 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2805 context, context_len, error_msg);
2806 SvREFCNT_dec(res);
2807
2808 return NULL;
2809 }
2810
2811 STATIC SV*
S_get_and_check_backslash_N_name_wrapper(pTHX_ const char * s,const char * const e)2812 S_get_and_check_backslash_N_name_wrapper(pTHX_ const char* s, const char* const e)
2813 {
2814 /* This justs wraps get_and_check_backslash_N_name() to output any error
2815 * message it returns. */
2816
2817 const char * error_msg = NULL;
2818 SV * result;
2819
2820 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME_WRAPPER;
2821
2822 /* charnames doesn't work well if there have been errors found */
2823 if (PL_error_count > 0) {
2824 return NULL;
2825 }
2826
2827 result = get_and_check_backslash_N_name(s, e, cBOOL(UTF), &error_msg);
2828
2829 if (error_msg) {
2830 yyerror_pv(error_msg, UTF ? SVf_UTF8 : 0);
2831 }
2832
2833 return result;
2834 }
2835
2836 SV*
Perl_get_and_check_backslash_N_name(pTHX_ const char * s,const char * e,const bool is_utf8,const char ** error_msg)2837 Perl_get_and_check_backslash_N_name(pTHX_ const char* s,
2838 const char* e,
2839 const bool is_utf8,
2840 const char ** error_msg)
2841 {
2842 /* <s> points to first character of interior of \N{}, <e> to one beyond the
2843 * interior, hence to the "}". Finds what the name resolves to, returning
2844 * an SV* containing it; NULL if no valid one found.
2845 *
2846 * 'is_utf8' is TRUE if we know we want the result to be UTF-8 even if it
2847 * doesn't have to be. */
2848
2849 SV* char_name;
2850 SV* res;
2851 HV * table;
2852 SV **cvp;
2853 SV *cv;
2854 SV *rv;
2855 HV *stash;
2856
2857 /* Points to the beginning of the \N{... so that any messages include the
2858 * context of what's failing*/
2859 const char* context = s - 3;
2860 STRLEN context_len = e - context + 1; /* include all of \N{...} */
2861
2862
2863 PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME;
2864
2865 assert(e >= s);
2866 assert(s > (char *) 3);
2867
2868 while (s < e && isBLANK(*s)) {
2869 s++;
2870 }
2871
2872 while (s < e && isBLANK(*(e - 1))) {
2873 e--;
2874 }
2875
2876 char_name = newSVpvn_flags(s, e - s, (is_utf8) ? SVf_UTF8 : 0);
2877
2878 if (!SvCUR(char_name)) {
2879 SvREFCNT_dec_NN(char_name);
2880 /* diag_listed_as: Unknown charname '%s' */
2881 *error_msg = Perl_form(aTHX_ "Unknown charname ''");
2882 return NULL;
2883 }
2884
2885 /* Autoload the charnames module */
2886
2887 table = load_charnames(char_name, context, context_len, error_msg);
2888 if (table == NULL) {
2889 return NULL;
2890 }
2891
2892 *error_msg = NULL;
2893 res = new_constant( NULL, 0, "charnames", char_name, NULL,
2894 context, context_len, error_msg);
2895 if (*error_msg) {
2896 *error_msg = Perl_form(aTHX_ "Unknown charname '%s'", SvPVX(char_name));
2897
2898 SvREFCNT_dec(res);
2899 return NULL;
2900 }
2901
2902 /* See if the charnames handler is the Perl core's, and if so, we can skip
2903 * the validation needed for a user-supplied one, as Perl's does its own
2904 * validation. */
2905 cvp = hv_fetchs(table, "charnames", FALSE);
2906 if (cvp && (cv = *cvp) && SvROK(cv) && (rv = SvRV(cv),
2907 SvTYPE(rv) == SVt_PVCV) && ((stash = CvSTASH(rv)) != NULL))
2908 {
2909 const char * const name = HvNAME(stash);
2910 if (memEQs(name, HvNAMELEN(stash), "_charnames")) {
2911 return res;
2912 }
2913 }
2914
2915 /* Here, it isn't Perl's charname handler. We can't rely on a
2916 * user-supplied handler to validate the input name. For non-ut8 input,
2917 * look to see that the first character is legal. Then loop through the
2918 * rest checking that each is a continuation */
2919
2920 /* This code makes the reasonable assumption that the only Latin1-range
2921 * characters that begin a character name alias are alphabetic, otherwise
2922 * would have to create a isCHARNAME_BEGIN macro */
2923
2924 if (! is_utf8) {
2925 if (! isALPHAU(*s)) {
2926 goto bad_charname;
2927 }
2928 s++;
2929 while (s < e) {
2930 if (! isCHARNAME_CONT(*s)) {
2931 goto bad_charname;
2932 }
2933 if (*s == ' ' && *(s-1) == ' ') {
2934 goto multi_spaces;
2935 }
2936 s++;
2937 }
2938 }
2939 else {
2940 /* Similarly for utf8. For invariants can check directly; for other
2941 * Latin1, can calculate their code point and check; otherwise use an
2942 * inversion list */
2943 if (UTF8_IS_INVARIANT(*s)) {
2944 if (! isALPHAU(*s)) {
2945 goto bad_charname;
2946 }
2947 s++;
2948 } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2949 if (! isALPHAU(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)))) {
2950 goto bad_charname;
2951 }
2952 s += 2;
2953 }
2954 else {
2955 if (! _invlist_contains_cp(PL_utf8_charname_begin,
2956 utf8_to_uvchr_buf((U8 *) s,
2957 (U8 *) e,
2958 NULL)))
2959 {
2960 goto bad_charname;
2961 }
2962 s += UTF8SKIP(s);
2963 }
2964
2965 while (s < e) {
2966 if (UTF8_IS_INVARIANT(*s)) {
2967 if (! isCHARNAME_CONT(*s)) {
2968 goto bad_charname;
2969 }
2970 if (*s == ' ' && *(s-1) == ' ') {
2971 goto multi_spaces;
2972 }
2973 s++;
2974 }
2975 else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
2976 if (! isCHARNAME_CONT(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1))))
2977 {
2978 goto bad_charname;
2979 }
2980 s += 2;
2981 }
2982 else {
2983 if (! _invlist_contains_cp(PL_utf8_charname_continue,
2984 utf8_to_uvchr_buf((U8 *) s,
2985 (U8 *) e,
2986 NULL)))
2987 {
2988 goto bad_charname;
2989 }
2990 s += UTF8SKIP(s);
2991 }
2992 }
2993 }
2994 if (*(s-1) == ' ') {
2995 /* diag_listed_as: charnames alias definitions may not contain
2996 trailing white-space; marked by <-- HERE in %s
2997 */
2998 *error_msg = Perl_form(aTHX_
2999 "charnames alias definitions may not contain trailing "
3000 "white-space; marked by <-- HERE in %.*s<-- HERE %.*s",
3001 (int)(s - context + 1), context,
3002 (int)(e - s + 1), s + 1);
3003 return NULL;
3004 }
3005
3006 if (SvUTF8(res)) { /* Don't accept malformed charname value */
3007 const U8* first_bad_char_loc;
3008 STRLEN len;
3009 const char* const str = SvPV_const(res, len);
3010 if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len,
3011 &first_bad_char_loc)))
3012 {
3013 _force_out_malformed_utf8_message(first_bad_char_loc,
3014 (U8 *) PL_parser->bufend,
3015 0,
3016 0 /* 0 means don't die */ );
3017 /* diag_listed_as: Malformed UTF-8 returned by \N{%s}
3018 immediately after '%s' */
3019 *error_msg = Perl_form(aTHX_
3020 "Malformed UTF-8 returned by %.*s immediately after '%.*s'",
3021 (int) context_len, context,
3022 (int) ((char *) first_bad_char_loc - str), str);
3023 return NULL;
3024 }
3025 }
3026
3027 return res;
3028
3029 bad_charname: {
3030
3031 /* The final %.*s makes sure that should the trailing NUL be missing
3032 * that this print won't run off the end of the string */
3033 /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE
3034 in \N{%s} */
3035 *error_msg = Perl_form(aTHX_
3036 "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s",
3037 (int)(s - context + 1), context,
3038 (int)(e - s + 1), s + 1);
3039 return NULL;
3040 }
3041
3042 multi_spaces:
3043 /* diag_listed_as: charnames alias definitions may not contain a
3044 sequence of multiple spaces; marked by <-- HERE
3045 in %s */
3046 *error_msg = Perl_form(aTHX_
3047 "charnames alias definitions may not contain a sequence of "
3048 "multiple spaces; marked by <-- HERE in %.*s<-- HERE %.*s",
3049 (int)(s - context + 1), context,
3050 (int)(e - s + 1), s + 1);
3051 return NULL;
3052 }
3053
3054 /*
3055 scan_const
3056
3057 Extracts the next constant part of a pattern, double-quoted string,
3058 or transliteration. This is terrifying code.
3059
3060 For example, in parsing the double-quoted string "ab\x63$d", it would
3061 stop at the '$' and return an OP_CONST containing 'abc'.
3062
3063 It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
3064 processing a pattern (PL_lex_inpat is true), a transliteration
3065 (PL_lex_inwhat == OP_TRANS is true), or a double-quoted string.
3066
3067 Returns a pointer to the character scanned up to. If this is
3068 advanced from the start pointer supplied (i.e. if anything was
3069 successfully parsed), will leave an OP_CONST for the substring scanned
3070 in pl_yylval. Caller must intuit reason for not parsing further
3071 by looking at the next characters herself.
3072
3073 In patterns:
3074 expand:
3075 \N{FOO} => \N{U+hex_for_character_FOO}
3076 (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...})
3077
3078 pass through:
3079 all other \-char, including \N and \N{ apart from \N{ABC}
3080
3081 stops on:
3082 @ and $ where it appears to be a var, but not for $ as tail anchor
3083 \l \L \u \U \Q \E
3084 (?{ or (??{ or (*{
3085
3086 In transliterations:
3087 characters are VERY literal, except for - not at the start or end
3088 of the string, which indicates a range. However some backslash sequences
3089 are recognized: \r, \n, and the like
3090 \007 \o{}, \x{}, \N{}
3091 If all elements in the transliteration are below 256,
3092 scan_const expands the range to the full set of intermediate
3093 characters. If the range is in utf8, the hyphen is replaced with
3094 a certain range mark which will be handled by pmtrans() in op.c.
3095
3096 In double-quoted strings:
3097 backslashes:
3098 all those recognized in transliterations
3099 deprecated backrefs: \1 (in substitution replacements)
3100 case and quoting: \U \Q \E
3101 stops on @ and $
3102
3103 scan_const does *not* construct ops to handle interpolated strings.
3104 It stops processing as soon as it finds an embedded $ or @ variable
3105 and leaves it to the caller to work out what's going on.
3106
3107 embedded arrays (whether in pattern or not) could be:
3108 @foo, @::foo, @'foo, @{foo}, @$foo, @+, @-.
3109
3110 $ in double-quoted strings must be the symbol of an embedded scalar.
3111
3112 $ in pattern could be $foo or could be tail anchor. Assumption:
3113 it's a tail anchor if $ is the last thing in the string, or if it's
3114 followed by one of "()| \r\n\t"
3115
3116 \1 (backreferences) are turned into $1 in substitutions
3117
3118 The structure of the code is
3119 while (there's a character to process) {
3120 handle transliteration ranges
3121 skip regexp comments /(?#comment)/ and codes /(?{code})/ ((*{code})/
3122 skip #-initiated comments in //x patterns
3123 check for embedded arrays
3124 check for embedded scalars
3125 if (backslash) {
3126 deprecate \1 in substitution replacements
3127 handle string-changing backslashes \l \U \Q \E, etc.
3128 switch (what was escaped) {
3129 handle \- in a transliteration (becomes a literal -)
3130 if a pattern and not \N{, go treat as regular character
3131 handle \132 (octal characters)
3132 handle \x15 and \x{1234} (hex characters)
3133 handle \N{name} (named characters, also \N{3,5} in a pattern)
3134 handle \cV (control characters)
3135 handle printf-style backslashes (\f, \r, \n, etc)
3136 } (end switch)
3137 continue
3138 } (end if backslash)
3139 handle regular character
3140 } (end while character to read)
3141
3142 */
3143
3144 STATIC char *
S_scan_const(pTHX_ char * start)3145 S_scan_const(pTHX_ char *start)
3146 {
3147 const char * const send = PL_bufend;/* end of the constant */
3148 SV *sv = newSV(send - start); /* sv for the constant. See note below
3149 on sizing. */
3150 char *s = start; /* start of the constant */
3151 char *d = SvPVX(sv); /* destination for copies */
3152 bool dorange = FALSE; /* are we in a translit range? */
3153 bool didrange = FALSE; /* did we just finish a range? */
3154 bool in_charclass = FALSE; /* within /[...]/ */
3155 const bool s_is_utf8 = cBOOL(UTF); /* Is the source string assumed to be
3156 UTF8? But, this can show as true
3157 when the source isn't utf8, as for
3158 example when it is entirely composed
3159 of hex constants */
3160 bool d_is_utf8 = FALSE; /* Output constant is UTF8 */
3161 STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
3162 number of characters found so far
3163 that will expand (into 2 bytes)
3164 should we have to convert to
3165 UTF-8) */
3166 SV *res; /* result from charnames */
3167 STRLEN offset_to_max = 0; /* The offset in the output to where the range
3168 high-end character is temporarily placed */
3169
3170 /* Does something require special handling in tr/// ? This avoids extra
3171 * work in a less likely case. As such, khw didn't feel it was worth
3172 * adding any branches to the more mainline code to handle this, which
3173 * means that this doesn't get set in some circumstances when things like
3174 * \x{100} get expanded out. As a result there needs to be extra testing
3175 * done in the tr code */
3176 bool has_above_latin1 = FALSE;
3177
3178 /* Note on sizing: The scanned constant is placed into sv, which is
3179 * initialized by newSV() assuming one byte of output for every byte of
3180 * input. This routine expects newSV() to allocate an extra byte for a
3181 * trailing NUL, which this routine will append if it gets to the end of
3182 * the input. There may be more bytes of input than output (eg., \N{LATIN
3183 * CAPITAL LETTER A}), or more output than input if the constant ends up
3184 * recoded to utf8, but each time a construct is found that might increase
3185 * the needed size, SvGROW() is called. Its size parameter each time is
3186 * based on the best guess estimate at the time, namely the length used so
3187 * far, plus the length the current construct will occupy, plus room for
3188 * the trailing NUL, plus one byte for every input byte still unscanned */
3189
3190 UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses
3191 before set */
3192 #ifdef EBCDIC
3193 int backslash_N = 0; /* ? was the character from \N{} */
3194 int non_portable_endpoint = 0; /* ? In a range is an endpoint
3195 platform-specific like \x65 */
3196 #endif
3197
3198 PERL_ARGS_ASSERT_SCAN_CONST;
3199
3200 assert(PL_lex_inwhat != OP_TRANSR);
3201
3202 /* Protect sv from errors and fatal warnings. */
3203 ENTER_with_name("scan_const");
3204 SAVEFREESV(sv);
3205
3206 /* A bunch of code in the loop below assumes that if s[n] exists and is not
3207 * NUL, then s[n+1] exists. This assertion makes sure that assumption is
3208 * valid */
3209 assert(*send == '\0');
3210
3211 while (s < send
3212 || dorange /* Handle tr/// range at right edge of input */
3213 ) {
3214
3215 /* get transliterations out of the way (they're most literal) */
3216 if (PL_lex_inwhat == OP_TRANS) {
3217
3218 /* But there isn't any special handling necessary unless there is a
3219 * range, so for most cases we just drop down and handle the value
3220 * as any other. There are two exceptions.
3221 *
3222 * 1. A hyphen indicates that we are actually going to have a
3223 * range. In this case, skip the '-', set a flag, then drop
3224 * down to handle what should be the end range value.
3225 * 2. After we've handled that value, the next time through, that
3226 * flag is set and we fix up the range.
3227 *
3228 * Ranges entirely within Latin1 are expanded out entirely, in
3229 * order to make the transliteration a simple table look-up.
3230 * Ranges that extend above Latin1 have to be done differently, so
3231 * there is no advantage to expanding them here, so they are
3232 * stored here as Min, RANGE_INDICATOR, Max. 'RANGE_INDICATOR' is
3233 * a byte that can't occur in legal UTF-8, and hence can signify a
3234 * hyphen without any possible ambiguity. On EBCDIC machines, if
3235 * the range is expressed as Unicode, the Latin1 portion is
3236 * expanded out even if the range extends above Latin1. This is
3237 * because each code point in it has to be processed here
3238 * individually to get its native translation */
3239
3240 if (! dorange) {
3241
3242 /* Here, we don't think we're in a range. If the new character
3243 * is not a hyphen; or if it is a hyphen, but it's too close to
3244 * either edge to indicate a range, or if we haven't output any
3245 * characters yet then it's a regular character. */
3246 if (*s != '-' || s >= send - 1 || s == start || d == SvPVX(sv))
3247 {
3248
3249 /* A regular character. Process like any other, but first
3250 * clear any flags */
3251 didrange = FALSE;
3252 dorange = FALSE;
3253 #ifdef EBCDIC
3254 non_portable_endpoint = 0;
3255 backslash_N = 0;
3256 #endif
3257 /* The tests here for being above Latin1 and similar ones
3258 * in the following 'else' suffice to find all such
3259 * occurences in the constant, except those added by a
3260 * backslash escape sequence, like \x{100}. Mostly, those
3261 * set 'has_above_latin1' as appropriate */
3262 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3263 has_above_latin1 = TRUE;
3264 }
3265
3266 /* Drops down to generic code to process current byte */
3267 }
3268 else { /* Is a '-' in the context where it means a range */
3269 if (didrange) { /* Something like y/A-C-Z// */
3270 Perl_croak(aTHX_ "Ambiguous range in transliteration"
3271 " operator");
3272 }
3273
3274 dorange = TRUE;
3275
3276 s++; /* Skip past the hyphen */
3277
3278 /* d now points to where the end-range character will be
3279 * placed. Drop down to get that character. We'll finish
3280 * processing the range the next time through the loop */
3281
3282 if (s_is_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) {
3283 has_above_latin1 = TRUE;
3284 }
3285
3286 /* Drops down to generic code to process current byte */
3287 }
3288 } /* End of not a range */
3289 else {
3290 /* Here we have parsed a range. Now must handle it. At this
3291 * point:
3292 * 'sv' is a SV* that contains the output string we are
3293 * constructing. The final two characters in that string
3294 * are the range start and range end, in order.
3295 * 'd' points to just beyond the range end in the 'sv' string,
3296 * where we would next place something
3297 */
3298 char * max_ptr;
3299 char * min_ptr;
3300 IV range_min;
3301 IV range_max; /* last character in range */
3302 STRLEN grow;
3303 Size_t offset_to_min = 0;
3304 Size_t extras = 0;
3305 #ifdef EBCDIC
3306 bool convert_unicode;
3307 IV real_range_max = 0;
3308 #endif
3309 /* Get the code point values of the range ends. */
3310 max_ptr = (d_is_utf8) ? (char *) utf8_hop( (U8*) d, -1) : d - 1;
3311 offset_to_max = max_ptr - SvPVX_const(sv);
3312 if (d_is_utf8) {
3313 /* We know the utf8 is valid, because we just constructed
3314 * it ourselves in previous loop iterations */
3315 min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1);
3316 range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL);
3317 range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL);
3318
3319 /* This compensates for not all code setting
3320 * 'has_above_latin1', so that we don't skip stuff that
3321 * should be executed */
3322 if (range_max > 255) {
3323 has_above_latin1 = TRUE;
3324 }
3325 }
3326 else {
3327 min_ptr = max_ptr - 1;
3328 range_min = * (U8*) min_ptr;
3329 range_max = * (U8*) max_ptr;
3330 }
3331
3332 /* If the range is just a single code point, like tr/a-a/.../,
3333 * that code point is already in the output, twice. We can
3334 * just back up over the second instance and avoid all the rest
3335 * of the work. But if it is a variant character, it's been
3336 * counted twice, so decrement. (This unlikely scenario is
3337 * special cased, like the one for a range of 2 code points
3338 * below, only because the main-line code below needs a range
3339 * of 3 or more to work without special casing. Might as well
3340 * get it out of the way now.) */
3341 if (UNLIKELY(range_max == range_min)) {
3342 d = max_ptr;
3343 if (! d_is_utf8 && ! UVCHR_IS_INVARIANT(range_max)) {
3344 utf8_variant_count--;
3345 }
3346 goto range_done;
3347 }
3348
3349 #ifdef EBCDIC
3350 /* On EBCDIC platforms, we may have to deal with portable
3351 * ranges. These happen if at least one range endpoint is a
3352 * Unicode value (\N{...}), or if the range is a subset of
3353 * [A-Z] or [a-z], and both ends are literal characters,
3354 * like 'A', and not like \x{C1} */
3355 convert_unicode =
3356 cBOOL(backslash_N) /* \N{} forces Unicode,
3357 hence portable range */
3358 || ( ! non_portable_endpoint
3359 && (( isLOWER_A(range_min) && isLOWER_A(range_max))
3360 || (isUPPER_A(range_min) && isUPPER_A(range_max))));
3361 if (convert_unicode) {
3362
3363 /* Special handling is needed for these portable ranges.
3364 * They are defined to be in Unicode terms, which includes
3365 * all the Unicode code points between the end points.
3366 * Convert to Unicode to get the Unicode range. Later we
3367 * will convert each code point in the range back to
3368 * native. */
3369 range_min = NATIVE_TO_UNI(range_min);
3370 range_max = NATIVE_TO_UNI(range_max);
3371 }
3372 #endif
3373
3374 if (range_min > range_max) {
3375 #ifdef EBCDIC
3376 if (convert_unicode) {
3377 /* Need to convert back to native for meaningful
3378 * messages for this platform */
3379 range_min = UNI_TO_NATIVE(range_min);
3380 range_max = UNI_TO_NATIVE(range_max);
3381 }
3382 #endif
3383 /* Use the characters themselves for the error message if
3384 * ASCII printables; otherwise some visible representation
3385 * of them */
3386 if (isPRINT_A(range_min) && isPRINT_A(range_max)) {
3387 Perl_croak(aTHX_
3388 "Invalid range \"%c-%c\" in transliteration operator",
3389 (char)range_min, (char)range_max);
3390 }
3391 #ifdef EBCDIC
3392 else if (convert_unicode) {
3393 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3394 Perl_croak(aTHX_
3395 "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04"
3396 UVXf "}\" in transliteration operator",
3397 range_min, range_max);
3398 }
3399 #endif
3400 else {
3401 /* diag_listed_as: Invalid range "%s" in transliteration operator */
3402 Perl_croak(aTHX_
3403 "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\""
3404 " in transliteration operator",
3405 range_min, range_max);
3406 }
3407 }
3408
3409 /* If the range is exactly two code points long, they are
3410 * already both in the output */
3411 if (UNLIKELY(range_min + 1 == range_max)) {
3412 goto range_done;
3413 }
3414
3415 /* Here the range contains at least 3 code points */
3416
3417 if (d_is_utf8) {
3418
3419 /* If everything in the transliteration is below 256, we
3420 * can avoid special handling later. A translation table
3421 * for each of those bytes is created by op.c. So we
3422 * expand out all ranges to their constituent code points.
3423 * But if we've encountered something above 255, the
3424 * expanding won't help, so skip doing that. But if it's
3425 * EBCDIC, we may have to look at each character below 256
3426 * if we have to convert to/from Unicode values */
3427 if ( has_above_latin1
3428 #ifdef EBCDIC
3429 && (range_min > 255 || ! convert_unicode)
3430 #endif
3431 ) {
3432 const STRLEN off = d - SvPVX(sv);
3433 const STRLEN extra = 1 + (send - s) + 1;
3434 char *e;
3435
3436 /* Move the high character one byte to the right; then
3437 * insert between it and the range begin, an illegal
3438 * byte which serves to indicate this is a range (using
3439 * a '-' would be ambiguous). */
3440
3441 if (off + extra > SvLEN(sv)) {
3442 d = off + SvGROW(sv, off + extra);
3443 max_ptr = d - off + offset_to_max;
3444 }
3445
3446 e = d++;
3447 while (e-- > max_ptr) {
3448 *(e + 1) = *e;
3449 }
3450 *(e + 1) = (char) RANGE_INDICATOR;
3451 goto range_done;
3452 }
3453
3454 /* Here, we're going to expand out the range. For EBCDIC
3455 * the range can extend above 255 (not so in ASCII), so
3456 * for EBCDIC, split it into the parts above and below
3457 * 255/256 */
3458 #ifdef EBCDIC
3459 if (range_max > 255) {
3460 real_range_max = range_max;
3461 range_max = 255;
3462 }
3463 #endif
3464 }
3465
3466 /* Here we need to expand out the string to contain each
3467 * character in the range. Grow the output to handle this.
3468 * For non-UTF8, we need a byte for each code point in the
3469 * range, minus the three that we've already allocated for: the
3470 * hyphen, the min, and the max. For UTF-8, we need this
3471 * plus an extra byte for each code point that occupies two
3472 * bytes (is variant) when in UTF-8 (except we've already
3473 * allocated for the end points, including if they are
3474 * variants). For ASCII platforms and Unicode ranges on EBCDIC
3475 * platforms, it's easy to calculate a precise number. To
3476 * start, we count the variants in the range, which we need
3477 * elsewhere in this function anyway. (For the case where it
3478 * isn't easy to calculate, 'extras' has been initialized to 0,
3479 * and the calculation is done in a loop further down.) */
3480 #ifdef EBCDIC
3481 if (convert_unicode)
3482 #endif
3483 {
3484 /* This is executed unconditionally on ASCII, and for
3485 * Unicode ranges on EBCDIC. Under these conditions, all
3486 * code points above a certain value are variant; and none
3487 * under that value are. We just need to find out how much
3488 * of the range is above that value. We don't count the
3489 * end points here, as they will already have been counted
3490 * as they were parsed. */
3491 if (range_min >= UTF_CONTINUATION_MARK) {
3492
3493 /* The whole range is made up of variants */
3494 extras = (range_max - 1) - (range_min + 1) + 1;
3495 }
3496 else if (range_max >= UTF_CONTINUATION_MARK) {
3497
3498 /* Only the higher portion of the range is variants */
3499 extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1;
3500 }
3501
3502 utf8_variant_count += extras;
3503 }
3504
3505 /* The base growth is the number of code points in the range,
3506 * not including the endpoints, which have already been sized
3507 * for (and output). We don't subtract for the hyphen, as it
3508 * has been parsed but not output, and the SvGROW below is
3509 * based only on what's been output plus what's left to parse.
3510 * */
3511 grow = (range_max - 1) - (range_min + 1) + 1;
3512
3513 if (d_is_utf8) {
3514 #ifdef EBCDIC
3515 /* In some cases in EBCDIC, we haven't yet calculated a
3516 * precise amount needed for the UTF-8 variants. Just
3517 * assume the worst case, that everything will expand by a
3518 * byte */
3519 if (! convert_unicode) {
3520 grow *= 2;
3521 }
3522 else
3523 #endif
3524 {
3525 /* Otherwise we know exactly how many variants there
3526 * are in the range. */
3527 grow += extras;
3528 }
3529 }
3530
3531 /* Grow, but position the output to overwrite the range min end
3532 * point, because in some cases we overwrite that */
3533 SvCUR_set(sv, d - SvPVX_const(sv));
3534 offset_to_min = min_ptr - SvPVX_const(sv);
3535
3536 /* See Note on sizing above. */
3537 d = offset_to_min + SvGROW(sv, SvCUR(sv)
3538 + (send - s)
3539 + grow
3540 + 1 /* Trailing NUL */ );
3541
3542 /* Now, we can expand out the range. */
3543 #ifdef EBCDIC
3544 if (convert_unicode) {
3545 SSize_t i;
3546
3547 /* Recall that the min and max are now in Unicode terms, so
3548 * we have to convert each character to its native
3549 * equivalent */
3550 if (d_is_utf8) {
3551 for (i = range_min; i <= range_max; i++) {
3552 append_utf8_from_native_byte(
3553 LATIN1_TO_NATIVE((U8) i),
3554 (U8 **) &d);
3555 }
3556 }
3557 else {
3558 for (i = range_min; i <= range_max; i++) {
3559 *d++ = (char)LATIN1_TO_NATIVE((U8) i);
3560 }
3561 }
3562 }
3563 else
3564 #endif
3565 /* Always gets run for ASCII, and sometimes for EBCDIC. */
3566 {
3567 /* Here, no conversions are necessary, which means that the
3568 * first character in the range is already in 'd' and
3569 * valid, so we can skip overwriting it */
3570 if (d_is_utf8) {
3571 SSize_t i;
3572 d += UTF8SKIP(d);
3573 for (i = range_min + 1; i <= range_max; i++) {
3574 append_utf8_from_native_byte((U8) i, (U8 **) &d);
3575 }
3576 }
3577 else {
3578 SSize_t i;
3579 d++;
3580 assert(range_min + 1 <= range_max);
3581 for (i = range_min + 1; i < range_max; i++) {
3582 #ifdef EBCDIC
3583 /* In this case on EBCDIC, we haven't calculated
3584 * the variants. Do it here, as we go along */
3585 if (! UVCHR_IS_INVARIANT(i)) {
3586 utf8_variant_count++;
3587 }
3588 #endif
3589 *d++ = (char)i;
3590 }
3591
3592 /* The range_max is done outside the loop so as to
3593 * avoid having to special case not incrementing
3594 * 'utf8_variant_count' on EBCDIC (it's already been
3595 * counted when originally parsed) */
3596 *d++ = (char) range_max;
3597 }
3598 }
3599
3600 #ifdef EBCDIC
3601 /* If the original range extended above 255, add in that
3602 * portion. */
3603 if (real_range_max) {
3604 *d++ = (char) UTF8_TWO_BYTE_HI(0x100);
3605 *d++ = (char) UTF8_TWO_BYTE_LO(0x100);
3606 if (real_range_max > 0x100) {
3607 if (real_range_max > 0x101) {
3608 *d++ = (char) RANGE_INDICATOR;
3609 }
3610 d = (char*)uvchr_to_utf8((U8*)d, real_range_max);
3611 }
3612 }
3613 #endif
3614
3615 range_done:
3616 /* mark the range as done, and continue */
3617 didrange = TRUE;
3618 dorange = FALSE;
3619 #ifdef EBCDIC
3620 non_portable_endpoint = 0;
3621 backslash_N = 0;
3622 #endif
3623 continue;
3624 } /* End of is a range */
3625 } /* End of transliteration. Joins main code after these else's */
3626 else if (*s == '[' && PL_lex_inpat && !in_charclass) {
3627 char *s1 = s-1;
3628 int esc = 0;
3629 while (s1 >= start && *s1-- == '\\')
3630 esc = !esc;
3631 if (!esc)
3632 in_charclass = TRUE;
3633 }
3634 else if (*s == ']' && PL_lex_inpat && in_charclass) {
3635 char *s1 = s-1;
3636 int esc = 0;
3637 while (s1 >= start && *s1-- == '\\')
3638 esc = !esc;
3639 if (!esc)
3640 in_charclass = FALSE;
3641 }
3642 /* skip for regexp comments /(?#comment)/, except for the last
3643 * char, which will be done separately. Stop on (?{..}) and
3644 * friends (??{ ... }) or (*{ ... }) */
3645 else if (*s == '(' && PL_lex_inpat && (s[1] == '?' || s[1] == '*') && !in_charclass) {
3646 if (s[1] == '?' && s[2] == '#') {
3647 if (s_is_utf8) {
3648 PERL_UINT_FAST8_T len = UTF8SKIP(s);
3649
3650 while (s + len < send && *s != ')') {
3651 Copy(s, d, len, U8);
3652 d += len;
3653 s += len;
3654 len = UTF8_SAFE_SKIP(s, send);
3655 }
3656 }
3657 else while (s+1 < send && *s != ')') {
3658 *d++ = *s++;
3659 }
3660 }
3661 else
3662 if (!PL_lex_casemods &&
3663 /* The following should match regcomp.c */
3664 ((s[1] == '?' && (s[2] == '{' /* (?{ ... }) */
3665 || (s[2] == '?' && s[3] == '{'))) || /* (??{ ... }) */
3666 (s[1] == '*' && (s[2] == '{' ))) /* (*{ ... }) */
3667 ){
3668 break;
3669 }
3670 }
3671 /* likewise skip #-initiated comments in //x patterns */
3672 else if (*s == '#'
3673 && PL_lex_inpat
3674 && !in_charclass
3675 && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED)
3676 {
3677 while (s < send && *s != '\n')
3678 *d++ = *s++;
3679 }
3680 /* no further processing of single-quoted regex */
3681 else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
3682 goto default_action;
3683
3684 /* check for embedded arrays
3685 * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
3686 */
3687 else if (*s == '@' && s[1]) {
3688 if (UTF
3689 ? isIDFIRST_utf8_safe(s+1, send)
3690 : isWORDCHAR_A(s[1]))
3691 {
3692 break;
3693 }
3694 if (memCHRs(":'{$", s[1]))
3695 break;
3696 if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
3697 break; /* in regexp, neither @+ nor @- are interpolated */
3698 }
3699 /* check for embedded scalars. only stop if we're sure it's a
3700 * variable. */
3701 else if (*s == '$') {
3702 if (!PL_lex_inpat) /* not a regexp, so $ must be var */
3703 break;
3704 if (s + 1 < send && !memCHRs("()| \r\n\t", s[1])) {
3705 if (s[1] == '\\') {
3706 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
3707 "Possible unintended interpolation of $\\ in regex");
3708 }
3709 break; /* in regexp, $ might be tail anchor */
3710 }
3711 }
3712
3713 /* End of else if chain - OP_TRANS rejoin rest */
3714
3715 if (UNLIKELY(s >= send)) {
3716 assert(s == send);
3717 break;
3718 }
3719
3720 /* backslashes */
3721 if (*s == '\\' && s+1 < send) {
3722 char* bslash = s; /* point to beginning \ */
3723 char* rbrace; /* point to ending '}' */
3724 char* e; /* 1 past the meat (non-blanks) before the
3725 brace */
3726 s++;
3727
3728 /* warn on \1 - \9 in substitution replacements, but note that \11
3729 * is an octal; and \19 is \1 followed by '9' */
3730 if (PL_lex_inwhat == OP_SUBST
3731 && !PL_lex_inpat
3732 && isDIGIT(*s)
3733 && *s != '0'
3734 && !isDIGIT(s[1]))
3735 {
3736 /* diag_listed_as: \%d better written as $%d */
3737 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "\\%c better written as $%c", *s, *s);
3738 s = bslash;
3739 *s = '$';
3740 break;
3741 }
3742
3743 /* string-change backslash escapes */
3744 if (PL_lex_inwhat != OP_TRANS && *s && memCHRs("lLuUEQF", *s)) {
3745 s = bslash;
3746 break;
3747 }
3748 /* In a pattern, process \N, but skip any other backslash escapes.
3749 * This is because we don't want to translate an escape sequence
3750 * into a meta symbol and have the regex compiler use the meta
3751 * symbol meaning, e.g. \x{2E} would be confused with a dot. But
3752 * in spite of this, we do have to process \N here while the proper
3753 * charnames handler is in scope. See bugs #56444 and #62056.
3754 *
3755 * There is a complication because \N in a pattern may also stand
3756 * for 'match a non-nl', and not mean a charname, in which case its
3757 * processing should be deferred to the regex compiler. To be a
3758 * charname it must be followed immediately by a '{', and not look
3759 * like \N followed by a curly quantifier, i.e., not something like
3760 * \N{3,}. regcurly returns a boolean indicating if it is a legal
3761 * quantifier */
3762 else if (PL_lex_inpat
3763 && (*s != 'N'
3764 || s[1] != '{'
3765 || regcurly(s + 1, send, NULL)))
3766 {
3767 *d++ = '\\';
3768 goto default_action;
3769 }
3770
3771 switch (*s) {
3772 default:
3773 {
3774 if ((isALPHANUMERIC(*s)))
3775 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
3776 "Unrecognized escape \\%c passed through",
3777 *s);
3778 /* default action is to copy the quoted character */
3779 goto default_action;
3780 }
3781
3782 /* eg. \132 indicates the octal constant 0132 */
3783 case '0': case '1': case '2': case '3':
3784 case '4': case '5': case '6': case '7':
3785 {
3786 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3787 | PERL_SCAN_NOTIFY_ILLDIGIT;
3788 STRLEN len = 3;
3789 uv = grok_oct(s, &len, &flags, NULL);
3790 s += len;
3791 if ( (flags & PERL_SCAN_NOTIFY_ILLDIGIT)
3792 && s < send
3793 && isDIGIT(*s) /* like \08, \178 */
3794 && ckWARN(WARN_MISC))
3795 {
3796 Perl_warner(aTHX_ packWARN(WARN_MISC), "%s",
3797 form_alien_digit_msg(8, len, s, send, UTF, FALSE));
3798 }
3799 }
3800 goto NUM_ESCAPE_INSERT;
3801
3802 /* eg. \o{24} indicates the octal constant \024 */
3803 case 'o':
3804 {
3805 const char* error;
3806
3807 if (! grok_bslash_o(&s, send,
3808 &uv, &error,
3809 NULL,
3810 FALSE, /* Not strict */
3811 FALSE, /* No illegal cp's */
3812 UTF))
3813 {
3814 yyerror(error);
3815 uv = 0; /* drop through to ensure range ends are set */
3816 }
3817 goto NUM_ESCAPE_INSERT;
3818 }
3819
3820 /* eg. \x24 indicates the hex constant 0x24 */
3821 case 'x':
3822 {
3823 const char* error;
3824
3825 if (! grok_bslash_x(&s, send,
3826 &uv, &error,
3827 NULL,
3828 FALSE, /* Not strict */
3829 FALSE, /* No illegal cp's */
3830 UTF))
3831 {
3832 yyerror(error);
3833 uv = 0; /* drop through to ensure range ends are set */
3834 }
3835 }
3836
3837 NUM_ESCAPE_INSERT:
3838 /* Insert oct or hex escaped character. */
3839
3840 /* Here uv is the ordinal of the next character being added */
3841 if (UVCHR_IS_INVARIANT(uv)) {
3842 *d++ = (char) uv;
3843 }
3844 else {
3845 if (!d_is_utf8 && uv > 255) {
3846
3847 /* Here, 'uv' won't fit unless we convert to UTF-8.
3848 * If we've only seen invariants so far, all we have to
3849 * do is turn on the flag */
3850 if (utf8_variant_count == 0) {
3851 SvUTF8_on(sv);
3852 }
3853 else {
3854 SvCUR_set(sv, d - SvPVX_const(sv));
3855 SvPOK_on(sv);
3856 *d = '\0';
3857
3858 sv_utf8_upgrade_flags_grow(
3859 sv,
3860 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
3861
3862 /* Since we're having to grow here,
3863 * make sure we have enough room for
3864 * this escape and a NUL, so the
3865 * code immediately below won't have
3866 * to actually grow again */
3867 UVCHR_SKIP(uv)
3868 + (STRLEN)(send - s) + 1);
3869 d = SvPVX(sv) + SvCUR(sv);
3870 }
3871
3872 has_above_latin1 = TRUE;
3873 d_is_utf8 = TRUE;
3874 }
3875
3876 if (! d_is_utf8) {
3877 *d++ = (char)uv;
3878 utf8_variant_count++;
3879 }
3880 else {
3881 /* Usually, there will already be enough room in 'sv'
3882 * since such escapes are likely longer than any UTF-8
3883 * sequence they can end up as. This isn't the case on
3884 * EBCDIC where \x{40000000} contains 12 bytes, and the
3885 * UTF-8 for it contains 14. And, we have to allow for
3886 * a trailing NUL. It probably can't happen on ASCII
3887 * platforms, but be safe. See Note on sizing above. */
3888 const STRLEN needed = d - SvPVX(sv)
3889 + UVCHR_SKIP(uv)
3890 + (send - s)
3891 + 1;
3892 if (UNLIKELY(needed > SvLEN(sv))) {
3893 SvCUR_set(sv, d - SvPVX_const(sv));
3894 d = SvCUR(sv) + SvGROW(sv, needed);
3895 }
3896
3897 d = (char*) uvchr_to_utf8_flags((U8*)d, uv,
3898 (ckWARN(WARN_PORTABLE))
3899 ? UNICODE_WARN_PERL_EXTENDED
3900 : 0);
3901 }
3902 }
3903 #ifdef EBCDIC
3904 non_portable_endpoint++;
3905 #endif
3906 continue;
3907
3908 case 'N':
3909 /* In a non-pattern \N must be like \N{U+0041}, or it can be a
3910 * named character, like \N{LATIN SMALL LETTER A}, or a named
3911 * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND
3912 * GRAVE} (except y/// can't handle the latter, croaking). For
3913 * convenience all three forms are referred to as "named
3914 * characters" below.
3915 *
3916 * For patterns, \N also can mean to match a non-newline. Code
3917 * before this 'switch' statement should already have handled
3918 * this situation, and hence this code only has to deal with
3919 * the named character cases.
3920 *
3921 * For non-patterns, the named characters are converted to
3922 * their string equivalents. In patterns, named characters are
3923 * not converted to their ultimate forms for the same reasons
3924 * that other escapes aren't (mainly that the ultimate
3925 * character could be considered a meta-symbol by the regex
3926 * compiler). Instead, they are converted to the \N{U+...}
3927 * form to get the value from the charnames that is in effect
3928 * right now, while preserving the fact that it was a named
3929 * character, so that the regex compiler knows this.
3930 *
3931 * The structure of this section of code (besides checking for
3932 * errors and upgrading to utf8) is:
3933 * If the named character is of the form \N{U+...}, pass it
3934 * through if a pattern; otherwise convert the code point
3935 * to utf8
3936 * Otherwise must be some \N{NAME}: convert to
3937 * \N{U+c1.c2...} if a pattern; otherwise convert to utf8
3938 *
3939 * Transliteration is an exception. The conversion to utf8 is
3940 * only done if the code point requires it to be representable.
3941 *
3942 * Here, 's' points to the 'N'; the test below is guaranteed to
3943 * succeed if we are being called on a pattern, as we already
3944 * know from a test above that the next character is a '{'. A
3945 * non-pattern \N must mean 'named character', which requires
3946 * braces */
3947 s++;
3948 if (*s != '{') {
3949 yyerror("Missing braces on \\N{}");
3950 *d++ = '\0';
3951 continue;
3952 }
3953 s++;
3954
3955 /* If there is no matching '}', it is an error. */
3956 if (! (rbrace = (char *) memchr(s, '}', send - s))) {
3957 if (! PL_lex_inpat) {
3958 yyerror("Missing right brace on \\N{}");
3959 } else {
3960 yyerror("Missing right brace on \\N{} or unescaped left brace after \\N");
3961 }
3962 yyquit(); /* Have exhausted the input. */
3963 }
3964
3965 /* Here it looks like a named character */
3966 while (s < rbrace && isBLANK(*s)) {
3967 s++;
3968 }
3969
3970 e = rbrace;
3971 while (s < e && isBLANK(*(e - 1))) {
3972 e--;
3973 }
3974
3975 if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
3976 s += 2; /* Skip to next char after the 'U+' */
3977 if (PL_lex_inpat) {
3978
3979 /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
3980 /* Check the syntax. */
3981 if (!isXDIGIT(*s)) {
3982 bad_NU:
3983 yyerror(
3984 "Invalid hexadecimal number in \\N{U+...}"
3985 );
3986 s = rbrace + 1;
3987 *d++ = '\0';
3988 continue;
3989 }
3990 while (++s < e) {
3991 if (isXDIGIT(*s))
3992 continue;
3993 else if ((*s == '.' || *s == '_')
3994 && isXDIGIT(s[1]))
3995 continue;
3996 goto bad_NU;
3997 }
3998
3999 /* Pass everything through unchanged.
4000 * +1 is to include the '}' */
4001 Copy(bslash, d, rbrace - bslash + 1, char);
4002 d += rbrace - bslash + 1;
4003 }
4004 else { /* Not a pattern: convert the hex to string */
4005 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
4006 | PERL_SCAN_SILENT_ILLDIGIT
4007 | PERL_SCAN_SILENT_OVERFLOW
4008 | PERL_SCAN_DISALLOW_PREFIX;
4009 STRLEN len = e - s;
4010
4011 uv = grok_hex(s, &len, &flags, NULL);
4012 if (len == 0 || (len != (STRLEN)(e - s)))
4013 goto bad_NU;
4014
4015 if ( uv > MAX_LEGAL_CP
4016 || (flags & PERL_SCAN_GREATER_THAN_UV_MAX))
4017 {
4018 yyerror(form_cp_too_large_msg(16, s, len, 0));
4019 uv = 0; /* drop through to ensure range ends are
4020 set */
4021 }
4022
4023 /* For non-tr///, if the destination is not in utf8,
4024 * unconditionally recode it to be so. This is
4025 * because \N{} implies Unicode semantics, and scalars
4026 * have to be in utf8 to guarantee those semantics.
4027 * tr/// doesn't care about Unicode rules, so no need
4028 * there to upgrade to UTF-8 for small enough code
4029 * points */
4030 if (! d_is_utf8 && ( uv > 0xFF
4031 || PL_lex_inwhat != OP_TRANS))
4032 {
4033 /* See Note on sizing above. */
4034 const STRLEN extra = OFFUNISKIP(uv) + (send - rbrace) + 1;
4035
4036 SvCUR_set(sv, d - SvPVX_const(sv));
4037 SvPOK_on(sv);
4038 *d = '\0';
4039
4040 if (utf8_variant_count == 0) {
4041 SvUTF8_on(sv);
4042 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
4043 }
4044 else {
4045 sv_utf8_upgrade_flags_grow(
4046 sv,
4047 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
4048 extra);
4049 d = SvPVX(sv) + SvCUR(sv);
4050 }
4051
4052 d_is_utf8 = TRUE;
4053 has_above_latin1 = TRUE;
4054 }
4055
4056 /* Add the (Unicode) code point to the output. */
4057 if (! d_is_utf8 || OFFUNI_IS_INVARIANT(uv)) {
4058 *d++ = (char) LATIN1_TO_NATIVE(uv);
4059 }
4060 else {
4061 d = (char*) uvoffuni_to_utf8_flags((U8*)d, uv,
4062 (ckWARN(WARN_PORTABLE))
4063 ? UNICODE_WARN_PERL_EXTENDED
4064 : 0);
4065 }
4066 }
4067 }
4068 else /* Here is \N{NAME} but not \N{U+...}. */
4069 if (! (res = get_and_check_backslash_N_name_wrapper(s, e)))
4070 { /* Failed. We should die eventually, but for now use a NUL
4071 to keep parsing */
4072 *d++ = '\0';
4073 }
4074 else { /* Successfully evaluated the name */
4075 STRLEN len;
4076 const char *str = SvPV_const(res, len);
4077 if (PL_lex_inpat) {
4078
4079 if (! len) { /* The name resolved to an empty string */
4080 const char empty_N[] = "\\N{_}";
4081 Copy(empty_N, d, sizeof(empty_N) - 1, char);
4082 d += sizeof(empty_N) - 1;
4083 }
4084 else {
4085 /* In order to not lose information for the regex
4086 * compiler, pass the result in the specially made
4087 * syntax: \N{U+c1.c2.c3...}, where c1 etc. are
4088 * the code points in hex of each character
4089 * returned by charnames */
4090
4091 const char *str_end = str + len;
4092 const STRLEN off = d - SvPVX_const(sv);
4093
4094 if (! SvUTF8(res)) {
4095 /* For the non-UTF-8 case, we can determine the
4096 * exact length needed without having to parse
4097 * through the string. Each character takes up
4098 * 2 hex digits plus either a trailing dot or
4099 * the "}" */
4100 const char initial_text[] = "\\N{U+";
4101 const STRLEN initial_len = sizeof(initial_text)
4102 - 1;
4103 d = off + SvGROW(sv, off
4104 + 3 * len
4105
4106 /* +1 for trailing NUL */
4107 + initial_len + 1
4108
4109 + (STRLEN)(send - rbrace));
4110 Copy(initial_text, d, initial_len, char);
4111 d += initial_len;
4112 while (str < str_end) {
4113 char hex_string[4];
4114 int len =
4115 my_snprintf(hex_string,
4116 sizeof(hex_string),
4117 "%02X.",
4118
4119 /* The regex compiler is
4120 * expecting Unicode, not
4121 * native */
4122 NATIVE_TO_LATIN1(*str));
4123 PERL_MY_SNPRINTF_POST_GUARD(len,
4124 sizeof(hex_string));
4125 Copy(hex_string, d, 3, char);
4126 d += 3;
4127 str++;
4128 }
4129 d--; /* Below, we will overwrite the final
4130 dot with a right brace */
4131 }
4132 else {
4133 STRLEN char_length; /* cur char's byte length */
4134
4135 /* and the number of bytes after this is
4136 * translated into hex digits */
4137 STRLEN output_length;
4138
4139 /* 2 hex per byte; 2 chars for '\N'; 2 chars
4140 * for max('U+', '.'); and 1 for NUL */
4141 char hex_string[2 * UTF8_MAXBYTES + 5];
4142
4143 /* Get the first character of the result. */
4144 U32 uv = utf8n_to_uvchr((U8 *) str,
4145 len,
4146 &char_length,
4147 UTF8_ALLOW_ANYUV);
4148 /* Convert first code point to Unicode hex,
4149 * including the boiler plate before it. */
4150 output_length =
4151 my_snprintf(hex_string, sizeof(hex_string),
4152 "\\N{U+%X",
4153 (unsigned int) NATIVE_TO_UNI(uv));
4154
4155 /* Make sure there is enough space to hold it */
4156 d = off + SvGROW(sv, off
4157 + output_length
4158 + (STRLEN)(send - rbrace)
4159 + 2); /* '}' + NUL */
4160 /* And output it */
4161 Copy(hex_string, d, output_length, char);
4162 d += output_length;
4163
4164 /* For each subsequent character, append dot and
4165 * its Unicode code point in hex */
4166 while ((str += char_length) < str_end) {
4167 const STRLEN off = d - SvPVX_const(sv);
4168 U32 uv = utf8n_to_uvchr((U8 *) str,
4169 str_end - str,
4170 &char_length,
4171 UTF8_ALLOW_ANYUV);
4172 output_length =
4173 my_snprintf(hex_string,
4174 sizeof(hex_string),
4175 ".%X",
4176 (unsigned int) NATIVE_TO_UNI(uv));
4177
4178 d = off + SvGROW(sv, off
4179 + output_length
4180 + (STRLEN)(send - rbrace)
4181 + 2); /* '}' + NUL */
4182 Copy(hex_string, d, output_length, char);
4183 d += output_length;
4184 }
4185 }
4186
4187 *d++ = '}'; /* Done. Add the trailing brace */
4188 }
4189 }
4190 else { /* Here, not in a pattern. Convert the name to a
4191 * string. */
4192
4193 if (PL_lex_inwhat == OP_TRANS) {
4194 str = SvPV_const(res, len);
4195 if (len > ((SvUTF8(res))
4196 ? UTF8SKIP(str)
4197 : 1U))
4198 {
4199 yyerror(Perl_form(aTHX_
4200 "%.*s must not be a named sequence"
4201 " in transliteration operator",
4202 /* +1 to include the "}" */
4203 (int) (rbrace + 1 - start), start));
4204 *d++ = '\0';
4205 goto end_backslash_N;
4206 }
4207
4208 if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) {
4209 has_above_latin1 = TRUE;
4210 }
4211
4212 }
4213 else if (! SvUTF8(res)) {
4214 /* Make sure \N{} return is UTF-8. This is because
4215 * \N{} implies Unicode semantics, and scalars have
4216 * to be in utf8 to guarantee those semantics; but
4217 * not needed in tr/// */
4218 sv_utf8_upgrade_flags(res, 0);
4219 str = SvPV_const(res, len);
4220 }
4221
4222 /* Upgrade destination to be utf8 if this new
4223 * component is */
4224 if (! d_is_utf8 && SvUTF8(res)) {
4225 /* See Note on sizing above. */
4226 const STRLEN extra = len + (send - s) + 1;
4227
4228 SvCUR_set(sv, d - SvPVX_const(sv));
4229 SvPOK_on(sv);
4230 *d = '\0';
4231
4232 if (utf8_variant_count == 0) {
4233 SvUTF8_on(sv);
4234 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
4235 }
4236 else {
4237 sv_utf8_upgrade_flags_grow(sv,
4238 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
4239 extra);
4240 d = SvPVX(sv) + SvCUR(sv);
4241 }
4242 d_is_utf8 = TRUE;
4243 } else if (len > (STRLEN)(e - s + 4)) { /* +4 is for \N{} */
4244
4245 /* See Note on sizing above. (NOTE: SvCUR() is not
4246 * set correctly here). */
4247 const STRLEN extra = len + (send - rbrace) + 1;
4248 const STRLEN off = d - SvPVX_const(sv);
4249 d = off + SvGROW(sv, off + extra);
4250 }
4251 Copy(str, d, len, char);
4252 d += len;
4253 }
4254
4255 SvREFCNT_dec(res);
4256
4257 } /* End \N{NAME} */
4258
4259 end_backslash_N:
4260 #ifdef EBCDIC
4261 backslash_N++; /* \N{} is defined to be Unicode */
4262 #endif
4263 s = rbrace + 1; /* Point to just after the '}' */
4264 continue;
4265
4266 /* \c is a control character */
4267 case 'c':
4268 s++;
4269 if (s < send) {
4270 const char * message;
4271
4272 if (! grok_bslash_c(*s, (U8 *) d, &message, NULL)) {
4273 yyerror(message);
4274 yyquit(); /* Have always immediately croaked on
4275 errors in this */
4276 }
4277 d++;
4278 }
4279 else {
4280 yyerror("Missing control char name in \\c");
4281 yyquit(); /* Are at end of input, no sense continuing */
4282 }
4283 #ifdef EBCDIC
4284 non_portable_endpoint++;
4285 #endif
4286 break;
4287
4288 /* printf-style backslashes, formfeeds, newlines, etc */
4289 case 'b':
4290 *d++ = '\b';
4291 break;
4292 case 'n':
4293 *d++ = '\n';
4294 break;
4295 case 'r':
4296 *d++ = '\r';
4297 break;
4298 case 'f':
4299 *d++ = '\f';
4300 break;
4301 case 't':
4302 *d++ = '\t';
4303 break;
4304 case 'e':
4305 *d++ = ESC_NATIVE;
4306 break;
4307 case 'a':
4308 *d++ = '\a';
4309 break;
4310 } /* end switch */
4311
4312 s++;
4313 continue;
4314 } /* end if (backslash) */
4315
4316 default_action:
4317 /* Just copy the input to the output, though we may have to convert
4318 * to/from UTF-8.
4319 *
4320 * If the input has the same representation in UTF-8 as not, it will be
4321 * a single byte, and we don't care about UTF8ness; just copy the byte */
4322 if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
4323 *d++ = *s++;
4324 }
4325 else if (! s_is_utf8 && ! d_is_utf8) {
4326 /* If neither source nor output is UTF-8, is also a single byte,
4327 * just copy it; but this byte counts should we later have to
4328 * convert to UTF-8 */
4329 *d++ = *s++;
4330 utf8_variant_count++;
4331 }
4332 else if (s_is_utf8 && d_is_utf8) { /* Both UTF-8, can just copy */
4333 const STRLEN len = UTF8SKIP(s);
4334
4335 /* We expect the source to have already been checked for
4336 * malformedness */
4337 assert(isUTF8_CHAR((U8 *) s, (U8 *) send));
4338
4339 Copy(s, d, len, U8);
4340 d += len;
4341 s += len;
4342 }
4343 else if (s_is_utf8) { /* UTF8ness matters: convert output to utf8 */
4344 STRLEN need = send - s + 1; /* See Note on sizing above. */
4345
4346 SvCUR_set(sv, d - SvPVX_const(sv));
4347 SvPOK_on(sv);
4348 *d = '\0';
4349
4350 if (utf8_variant_count == 0) {
4351 SvUTF8_on(sv);
4352 d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
4353 }
4354 else {
4355 sv_utf8_upgrade_flags_grow(sv,
4356 SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
4357 need);
4358 d = SvPVX(sv) + SvCUR(sv);
4359 }
4360 d_is_utf8 = TRUE;
4361 goto default_action; /* Redo, having upgraded so both are UTF-8 */
4362 }
4363 else { /* UTF8ness matters: convert this non-UTF8 source char to
4364 UTF-8 for output. It will occupy 2 bytes, but don't include
4365 the input byte since we haven't incremented 's' yet. See
4366 Note on sizing above. */
4367 const STRLEN off = d - SvPVX(sv);
4368 const STRLEN extra = 2 + (send - s - 1) + 1;
4369 if (off + extra > SvLEN(sv)) {
4370 d = off + SvGROW(sv, off + extra);
4371 }
4372 *d++ = UTF8_EIGHT_BIT_HI(*s);
4373 *d++ = UTF8_EIGHT_BIT_LO(*s);
4374 s++;
4375 }
4376 } /* while loop to process each character */
4377
4378 {
4379 const STRLEN off = d - SvPVX(sv);
4380
4381 /* See if room for the terminating NUL */
4382 if (UNLIKELY(off >= SvLEN(sv))) {
4383
4384 #ifndef DEBUGGING
4385
4386 if (off > SvLEN(sv))
4387 #endif
4388 Perl_croak(aTHX_ "panic: constant overflowed allocated space,"
4389 " %" UVuf " >= %" UVuf, (UV)off, (UV)SvLEN(sv));
4390
4391 /* Whew! Here we don't have room for the terminating NUL, but
4392 * everything else so far has fit. It's not too late to grow
4393 * to fit the NUL and continue on. But it is a bug, as the code
4394 * above was supposed to have made room for this, so under
4395 * DEBUGGING builds, we panic anyway. */
4396 d = off + SvGROW(sv, off + 1);
4397 }
4398 }
4399
4400 /* terminate the string and set up the sv */
4401 *d = '\0';
4402 SvCUR_set(sv, d - SvPVX_const(sv));
4403
4404 SvPOK_on(sv);
4405 if (d_is_utf8) {
4406 SvUTF8_on(sv);
4407 }
4408
4409 /* shrink the sv if we allocated more than we used */
4410 if (SvCUR(sv) + 5 < SvLEN(sv)) {
4411 SvPV_shrink_to_cur(sv);
4412 }
4413
4414 /* return the substring (via pl_yylval) only if we parsed anything */
4415 if (s > start) {
4416 char *s2 = start;
4417 for (; s2 < s; s2++) {
4418 if (*s2 == '\n')
4419 COPLINE_INC_WITH_HERELINES;
4420 }
4421 SvREFCNT_inc_simple_void_NN(sv);
4422 if ( (PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ))
4423 && ! PL_parser->lex_re_reparsing)
4424 {
4425 const char *const key = PL_lex_inpat ? "qr" : "q";
4426 const STRLEN keylen = PL_lex_inpat ? 2 : 1;
4427 const char *type;
4428 STRLEN typelen;
4429
4430 if (PL_lex_inwhat == OP_TRANS) {
4431 type = "tr";
4432 typelen = 2;
4433 } else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
4434 type = "s";
4435 typelen = 1;
4436 } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
4437 type = "q";
4438 typelen = 1;
4439 } else {
4440 type = "qq";
4441 typelen = 2;
4442 }
4443
4444 sv = S_new_constant(aTHX_ start, s - start, key, keylen, sv, NULL,
4445 type, typelen, NULL);
4446 }
4447 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
4448 }
4449 LEAVE_with_name("scan_const");
4450 return s;
4451 }
4452
4453 /* S_intuit_more
4454 * Returns TRUE if there's more to the expression (e.g., a subscript),
4455 * FALSE otherwise.
4456 *
4457 * It deals with "$foo[3]" and /$foo[3]/ and /$foo[0123456789$]+/
4458 *
4459 * ->[ and ->{ return TRUE
4460 * ->$* ->$#* ->@* ->@[ ->@{ return TRUE if postderef_qq is enabled
4461 * { and [ outside a pattern are always subscripts, so return TRUE
4462 * if we're outside a pattern and it's not { or [, then return FALSE
4463 * if we're in a pattern and the first char is a {
4464 * {4,5} (any digits around the comma) returns FALSE
4465 * if we're in a pattern and the first char is a [
4466 * [] returns FALSE
4467 * [SOMETHING] has a funky algorithm to decide whether it's a
4468 * character class or not. It has to deal with things like
4469 * /$foo[-3]/ and /$foo[$bar]/ as well as /$foo[$\d]+/
4470 * anything else returns TRUE
4471 */
4472
4473 /* This is the one truly awful dwimmer necessary to conflate C and sed. */
4474
4475 STATIC int
S_intuit_more(pTHX_ char * s,char * e)4476 S_intuit_more(pTHX_ char *s, char *e)
4477 {
4478 PERL_ARGS_ASSERT_INTUIT_MORE;
4479
4480 if (PL_lex_brackets)
4481 return TRUE;
4482 if (*s == '-' && s[1] == '>' && (s[2] == '[' || s[2] == '{'))
4483 return TRUE;
4484 if (*s == '-' && s[1] == '>'
4485 && FEATURE_POSTDEREF_QQ_IS_ENABLED
4486 && ( (s[2] == '$' && (s[3] == '*' || (s[3] == '#' && s[4] == '*')))
4487 ||(s[2] == '@' && memCHRs("*[{",s[3])) ))
4488 return TRUE;
4489 if (*s != '{' && *s != '[')
4490 return FALSE;
4491 PL_parser->sub_no_recover = TRUE;
4492 if (!PL_lex_inpat)
4493 return TRUE;
4494
4495 /* In a pattern, so maybe we have {n,m}. */
4496 if (*s == '{') {
4497 if (regcurly(s, e, NULL)) {
4498 return FALSE;
4499 }
4500 return TRUE;
4501 }
4502
4503 /* On the other hand, maybe we have a character class */
4504
4505 s++;
4506 if (*s == ']' || *s == '^')
4507 return FALSE;
4508 else {
4509 /* this is terrifying, and it works */
4510 int weight;
4511 char seen[256];
4512 const char * const send = (char *) memchr(s, ']', e - s);
4513 unsigned char un_char, last_un_char;
4514 char tmpbuf[sizeof PL_tokenbuf * 4];
4515
4516 if (!send) /* has to be an expression */
4517 return TRUE;
4518 weight = 2; /* let's weigh the evidence */
4519
4520 if (*s == '$')
4521 weight -= 3;
4522 else if (isDIGIT(*s)) {
4523 if (s[1] != ']') {
4524 if (isDIGIT(s[1]) && s[2] == ']')
4525 weight -= 10;
4526 }
4527 else
4528 weight -= 100;
4529 }
4530 Zero(seen,256,char);
4531 un_char = 255;
4532 for (; s < send; s++) {
4533 last_un_char = un_char;
4534 un_char = (unsigned char)*s;
4535 switch (*s) {
4536 case '@':
4537 case '&':
4538 case '$':
4539 weight -= seen[un_char] * 10;
4540 if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
4541 int len;
4542 scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE);
4543 len = (int)strlen(tmpbuf);
4544 if (len > 1 && gv_fetchpvn_flags(tmpbuf, len,
4545 UTF ? SVf_UTF8 : 0, SVt_PV))
4546 weight -= 100;
4547 else
4548 weight -= 10;
4549 }
4550 else if (*s == '$'
4551 && s[1]
4552 && memCHRs("[#!%*<>()-=",s[1]))
4553 {
4554 if (/*{*/ memCHRs("])} =",s[2]))
4555 weight -= 10;
4556 else
4557 weight -= 1;
4558 }
4559 break;
4560 case '\\':
4561 un_char = 254;
4562 if (s[1]) {
4563 if (memCHRs("wds]",s[1]))
4564 weight += 100;
4565 else if (seen[(U8)'\''] || seen[(U8)'"'])
4566 weight += 1;
4567 else if (memCHRs("rnftbxcav",s[1]))
4568 weight += 40;
4569 else if (isDIGIT(s[1])) {
4570 weight += 40;
4571 while (s[1] && isDIGIT(s[1]))
4572 s++;
4573 }
4574 }
4575 else
4576 weight += 100;
4577 break;
4578 case '-':
4579 if (s[1] == '\\')
4580 weight += 50;
4581 if (memCHRs("aA01! ",last_un_char))
4582 weight += 30;
4583 if (memCHRs("zZ79~",s[1]))
4584 weight += 30;
4585 if (last_un_char == 255 && (isDIGIT(s[1]) || s[1] == '$'))
4586 weight -= 5; /* cope with negative subscript */
4587 break;
4588 default:
4589 if (!isWORDCHAR(last_un_char)
4590 && !(last_un_char == '$' || last_un_char == '@'
4591 || last_un_char == '&')
4592 && isALPHA(*s) && s[1] && isALPHA(s[1])) {
4593 char *d = s;
4594 while (isALPHA(*s))
4595 s++;
4596 if (keyword(d, s - d, 0))
4597 weight -= 150;
4598 }
4599 if (un_char == last_un_char + 1)
4600 weight += 5;
4601 weight -= seen[un_char];
4602 break;
4603 }
4604 seen[un_char]++;
4605 }
4606 if (weight >= 0) /* probably a character class */
4607 return FALSE;
4608 }
4609
4610 return TRUE;
4611 }
4612
4613 /*
4614 * S_intuit_method
4615 *
4616 * Does all the checking to disambiguate
4617 * foo bar
4618 * between foo(bar) and bar->foo. Returns 0 if not a method, otherwise
4619 * METHCALL (bar->foo(args)) or METHCALL0 (bar->foo args).
4620 *
4621 * First argument is the stuff after the first token, e.g. "bar".
4622 *
4623 * Not a method if foo is a filehandle.
4624 * Not a method if foo is a subroutine prototyped to take a filehandle.
4625 * Not a method if it's really "Foo $bar"
4626 * Method if it's "foo $bar"
4627 * Not a method if it's really "print foo $bar"
4628 * Method if it's really "foo package::" (interpreted as package->foo)
4629 * Not a method if bar is known to be a subroutine ("sub bar; foo bar")
4630 * Not a method if bar is a filehandle or package, but is quoted with
4631 * =>
4632 */
4633
4634 STATIC int
S_intuit_method(pTHX_ char * start,SV * ioname,CV * cv)4635 S_intuit_method(pTHX_ char *start, SV *ioname, CV *cv)
4636 {
4637 char *s = start + (*start == '$');
4638 char tmpbuf[sizeof PL_tokenbuf];
4639 STRLEN len;
4640 GV* indirgv;
4641 /* Mustn't actually add anything to a symbol table.
4642 But also don't want to "initialise" any placeholder
4643 constants that might already be there into full
4644 blown PVGVs with attached PVCV. */
4645 GV * const gv =
4646 ioname ? gv_fetchsv(ioname, GV_NOADD_NOINIT, SVt_PVCV) : NULL;
4647
4648 PERL_ARGS_ASSERT_INTUIT_METHOD;
4649
4650 if (!FEATURE_INDIRECT_IS_ENABLED)
4651 return 0;
4652
4653 if (gv && SvTYPE(gv) == SVt_PVGV && GvIO(gv))
4654 return 0;
4655 if (cv && SvPOK(cv)) {
4656 const char *proto = CvPROTO(cv);
4657 if (proto) {
4658 while (*proto && (isSPACE(*proto) || *proto == ';'))
4659 proto++;
4660 if (*proto == '*')
4661 return 0;
4662 }
4663 }
4664
4665 if (*start == '$') {
4666 SSize_t start_off = start - SvPVX(PL_linestr);
4667 if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY
4668 || isUPPER(*PL_tokenbuf))
4669 return 0;
4670 /* this could be $# */
4671 if (isSPACE(*s))
4672 s = skipspace(s);
4673 PL_bufptr = SvPVX(PL_linestr) + start_off;
4674 PL_expect = XREF;
4675 return *s == '(' ? METHCALL : METHCALL0;
4676 }
4677
4678 s = scan_word6(s, tmpbuf, sizeof tmpbuf, TRUE, &len, FALSE);
4679 /* start is the beginning of the possible filehandle/object,
4680 * and s is the end of it
4681 * tmpbuf is a copy of it (but with single quotes as double colons)
4682 */
4683
4684 if (!keyword(tmpbuf, len, 0)) {
4685 if (len > 2 && tmpbuf[len - 2] == ':' && tmpbuf[len - 1] == ':') {
4686 len -= 2;
4687 tmpbuf[len] = '\0';
4688 goto bare_package;
4689 }
4690 indirgv = gv_fetchpvn_flags(tmpbuf, len,
4691 GV_NOADD_NOINIT|( UTF ? SVf_UTF8 : 0 ),
4692 SVt_PVCV);
4693 if (indirgv && SvTYPE(indirgv) != SVt_NULL
4694 && (!isGV(indirgv) || GvCVu(indirgv)))
4695 return 0;
4696 /* filehandle or package name makes it a method */
4697 if (!cv || GvIO(indirgv) || gv_stashpvn(tmpbuf, len, UTF ? SVf_UTF8 : 0)) {
4698 s = skipspace(s);
4699 if ((PL_bufend - s) >= 2 && *s == '=' && *(s+1) == '>')
4700 return 0; /* no assumptions -- "=>" quotes bareword */
4701 bare_package:
4702 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0,
4703 S_newSV_maybe_utf8(aTHX_ tmpbuf, len));
4704 NEXTVAL_NEXTTOKE.opval->op_private = OPpCONST_BARE;
4705 PL_expect = XTERM;
4706 force_next(BAREWORD);
4707 PL_bufptr = s;
4708 return *s == '(' ? METHCALL : METHCALL0;
4709 }
4710 }
4711 return 0;
4712 }
4713
4714 /* Encoded script support. filter_add() effectively inserts a
4715 * 'pre-processing' function into the current source input stream.
4716 * Note that the filter function only applies to the current source file
4717 * (e.g., it will not affect files 'require'd or 'use'd by this one).
4718 *
4719 * The datasv parameter (which may be NULL) can be used to pass
4720 * private data to this instance of the filter. The filter function
4721 * can recover the SV using the FILTER_DATA macro and use it to
4722 * store private buffers and state information.
4723 *
4724 * The supplied datasv parameter is upgraded to a PVIO type
4725 * and the IoDIRP/IoANY field is used to store the function pointer,
4726 * and IOf_FAKE_DIRP is enabled on datasv to mark this as such.
4727 * Note that IoTOP_NAME, IoFMT_NAME, IoBOTTOM_NAME, if set for
4728 * private use must be set using malloc'd pointers.
4729 */
4730
4731 SV *
Perl_filter_add(pTHX_ filter_t funcp,SV * datasv)4732 Perl_filter_add(pTHX_ filter_t funcp, SV *datasv)
4733 {
4734 if (!funcp)
4735 return NULL;
4736
4737 if (!PL_parser)
4738 return NULL;
4739
4740 if (PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS)
4741 Perl_croak(aTHX_ "Source filters apply only to byte streams");
4742
4743 if (!PL_rsfp_filters)
4744 PL_rsfp_filters = newAV();
4745 if (!datasv)
4746 datasv = newSV(0);
4747 SvUPGRADE(datasv, SVt_PVIO);
4748 IoANY(datasv) = FPTR2DPTR(void *, funcp); /* stash funcp into spare field */
4749 IoFLAGS(datasv) |= IOf_FAKE_DIRP;
4750 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_add func %p (%s)\n",
4751 FPTR2DPTR(void *, IoANY(datasv)),
4752 SvPV_nolen(datasv)));
4753 av_unshift(PL_rsfp_filters, 1);
4754 av_store(PL_rsfp_filters, 0, datasv) ;
4755 if (
4756 !PL_parser->filtered
4757 && PL_parser->lex_flags & LEX_EVALBYTES
4758 && PL_bufptr < PL_bufend
4759 ) {
4760 const char *s = PL_bufptr;
4761 while (s < PL_bufend) {
4762 if (*s == '\n') {
4763 SV *linestr = PL_parser->linestr;
4764 char *buf = SvPVX(linestr);
4765 STRLEN const bufptr_pos = PL_parser->bufptr - buf;
4766 STRLEN const oldbufptr_pos = PL_parser->oldbufptr - buf;
4767 STRLEN const oldoldbufptr_pos=PL_parser->oldoldbufptr-buf;
4768 STRLEN const linestart_pos = PL_parser->linestart - buf;
4769 STRLEN const last_uni_pos =
4770 PL_parser->last_uni ? PL_parser->last_uni - buf : 0;
4771 STRLEN const last_lop_pos =
4772 PL_parser->last_lop ? PL_parser->last_lop - buf : 0;
4773 av_push(PL_rsfp_filters, linestr);
4774 PL_parser->linestr =
4775 newSVpvn(SvPVX(linestr), ++s-SvPVX(linestr));
4776 buf = SvPVX(PL_parser->linestr);
4777 PL_parser->bufend = buf + SvCUR(PL_parser->linestr);
4778 PL_parser->bufptr = buf + bufptr_pos;
4779 PL_parser->oldbufptr = buf + oldbufptr_pos;
4780 PL_parser->oldoldbufptr = buf + oldoldbufptr_pos;
4781 PL_parser->linestart = buf + linestart_pos;
4782 if (PL_parser->last_uni)
4783 PL_parser->last_uni = buf + last_uni_pos;
4784 if (PL_parser->last_lop)
4785 PL_parser->last_lop = buf + last_lop_pos;
4786 SvLEN_set(linestr, SvCUR(linestr));
4787 SvCUR_set(linestr, s - SvPVX(linestr));
4788 PL_parser->filtered = 1;
4789 break;
4790 }
4791 s++;
4792 }
4793 }
4794 return(datasv);
4795 }
4796
4797 /*
4798 =for apidoc_section $filters
4799 =for apidoc filter_del
4800
4801 Delete most recently added instance of the filter function argument
4802
4803 =cut
4804 */
4805
4806 void
Perl_filter_del(pTHX_ filter_t funcp)4807 Perl_filter_del(pTHX_ filter_t funcp)
4808 {
4809 SV *datasv;
4810
4811 PERL_ARGS_ASSERT_FILTER_DEL;
4812
4813 #ifdef DEBUGGING
4814 DEBUG_P(PerlIO_printf(Perl_debug_log, "filter_del func %p",
4815 FPTR2DPTR(void*, funcp)));
4816 #endif
4817 if (!PL_parser || !PL_rsfp_filters || AvFILLp(PL_rsfp_filters)<0)
4818 return;
4819 /* if filter is on top of stack (usual case) just pop it off */
4820 datasv = FILTER_DATA(AvFILLp(PL_rsfp_filters));
4821 if (IoANY(datasv) == FPTR2DPTR(void *, funcp)) {
4822 SvREFCNT_dec(av_pop(PL_rsfp_filters));
4823
4824 return;
4825 }
4826 /* we need to search for the correct entry and clear it */
4827 Perl_die(aTHX_ "filter_del can only delete in reverse order (currently)");
4828 }
4829
4830
4831 /* Invoke the idxth filter function for the current rsfp. */
4832 /* maxlen 0 = read one text line */
4833 I32
Perl_filter_read(pTHX_ int idx,SV * buf_sv,int maxlen)4834 Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen)
4835 {
4836 filter_t funcp;
4837 I32 ret;
4838 SV *datasv = NULL;
4839 /* This API is bad. It should have been using unsigned int for maxlen.
4840 Not sure if we want to change the API, but if not we should sanity
4841 check the value here. */
4842 unsigned int correct_length = maxlen < 0 ? PERL_INT_MAX : maxlen;
4843
4844 PERL_ARGS_ASSERT_FILTER_READ;
4845
4846 if (!PL_parser || !PL_rsfp_filters)
4847 return -1;
4848 if (idx > AvFILLp(PL_rsfp_filters)) { /* Any more filters? */
4849 /* Provide a default input filter to make life easy. */
4850 /* Note that we append to the line. This is handy. */
4851 DEBUG_P(PerlIO_printf(Perl_debug_log,
4852 "filter_read %d: from rsfp\n", idx));
4853 if (correct_length) {
4854 /* Want a block */
4855 int len ;
4856 const int old_len = SvCUR(buf_sv);
4857
4858 /* ensure buf_sv is large enough */
4859 SvGROW(buf_sv, (STRLEN)(old_len + correct_length + 1)) ;
4860 if ((len = PerlIO_read(PL_rsfp, SvPVX(buf_sv) + old_len,
4861 correct_length)) <= 0) {
4862 if (PerlIO_error(PL_rsfp))
4863 return -1; /* error */
4864 else
4865 return 0 ; /* end of file */
4866 }
4867 SvCUR_set(buf_sv, old_len + len) ;
4868 SvPVX(buf_sv)[old_len + len] = '\0';
4869 } else {
4870 /* Want a line */
4871 if (sv_gets(buf_sv, PL_rsfp, SvCUR(buf_sv)) == NULL) {
4872 if (PerlIO_error(PL_rsfp))
4873 return -1; /* error */
4874 else
4875 return 0 ; /* end of file */
4876 }
4877 }
4878 return SvCUR(buf_sv);
4879 }
4880 /* Skip this filter slot if filter has been deleted */
4881 if ( (datasv = FILTER_DATA(idx)) == &PL_sv_undef) {
4882 DEBUG_P(PerlIO_printf(Perl_debug_log,
4883 "filter_read %d: skipped (filter deleted)\n",
4884 idx));
4885 return FILTER_READ(idx+1, buf_sv, correct_length); /* recurse */
4886 }
4887 if (SvTYPE(datasv) != SVt_PVIO) {
4888 if (correct_length) {
4889 /* Want a block */
4890 const STRLEN remainder = SvLEN(datasv) - SvCUR(datasv);
4891 if (!remainder) return 0; /* eof */
4892 if (correct_length > remainder) correct_length = remainder;
4893 sv_catpvn(buf_sv, SvEND(datasv), correct_length);
4894 SvCUR_set(datasv, SvCUR(datasv) + correct_length);
4895 } else {
4896 /* Want a line */
4897 const char *s = SvEND(datasv);
4898 const char *send = SvPVX(datasv) + SvLEN(datasv);
4899 while (s < send) {
4900 if (*s == '\n') {
4901 s++;
4902 break;
4903 }
4904 s++;
4905 }
4906 if (s == send) return 0; /* eof */
4907 sv_catpvn(buf_sv, SvEND(datasv), s-SvEND(datasv));
4908 SvCUR_set(datasv, s-SvPVX(datasv));
4909 }
4910 return SvCUR(buf_sv);
4911 }
4912 /* Get function pointer hidden within datasv */
4913 funcp = DPTR2FPTR(filter_t, IoANY(datasv));
4914 DEBUG_P(PerlIO_printf(Perl_debug_log,
4915 "filter_read %d: via function %p (%s)\n",
4916 idx, (void*)datasv, SvPV_nolen_const(datasv)));
4917 /* Call function. The function is expected to */
4918 /* call "FILTER_READ(idx+1, buf_sv)" first. */
4919 /* Return: <0:error, =0:eof, >0:not eof */
4920 ENTER;
4921 save_scalar(PL_errgv);
4922 ret = (*funcp)(aTHX_ idx, buf_sv, correct_length);
4923 LEAVE;
4924 return ret;
4925 }
4926
4927 STATIC char *
S_filter_gets(pTHX_ SV * sv,STRLEN append)4928 S_filter_gets(pTHX_ SV *sv, STRLEN append)
4929 {
4930 PERL_ARGS_ASSERT_FILTER_GETS;
4931
4932 #ifdef PERL_CR_FILTER
4933 if (!PL_rsfp_filters) {
4934 filter_add(S_cr_textfilter,NULL);
4935 }
4936 #endif
4937 if (PL_rsfp_filters) {
4938 if (!append)
4939 SvCUR_set(sv, 0); /* start with empty line */
4940 if (FILTER_READ(0, sv, 0) > 0)
4941 return ( SvPVX(sv) ) ;
4942 else
4943 return NULL ;
4944 }
4945 else
4946 return (sv_gets(sv, PL_rsfp, append));
4947 }
4948
4949 STATIC HV *
S_find_in_my_stash(pTHX_ const char * pkgname,STRLEN len)4950 S_find_in_my_stash(pTHX_ const char *pkgname, STRLEN len)
4951 {
4952 GV *gv;
4953
4954 PERL_ARGS_ASSERT_FIND_IN_MY_STASH;
4955
4956 if (memEQs(pkgname, len, "__PACKAGE__"))
4957 return PL_curstash;
4958
4959 if (len > 2
4960 && (pkgname[len - 2] == ':' && pkgname[len - 1] == ':')
4961 && (gv = gv_fetchpvn_flags(pkgname,
4962 len,
4963 ( UTF ? SVf_UTF8 : 0 ), SVt_PVHV)))
4964 {
4965 return GvHV(gv); /* Foo:: */
4966 }
4967
4968 /* use constant CLASS => 'MyClass' */
4969 gv = gv_fetchpvn_flags(pkgname, len, UTF ? SVf_UTF8 : 0, SVt_PVCV);
4970 if (gv && GvCV(gv)) {
4971 SV * const sv = cv_const_sv(GvCV(gv));
4972 if (sv)
4973 return gv_stashsv(sv, 0);
4974 }
4975
4976 return gv_stashpvn(pkgname, len, UTF ? SVf_UTF8 : 0);
4977 }
4978
4979
4980 STATIC char *
S_tokenize_use(pTHX_ int is_use,char * s)4981 S_tokenize_use(pTHX_ int is_use, char *s) {
4982 PERL_ARGS_ASSERT_TOKENIZE_USE;
4983
4984 if (PL_expect != XSTATE)
4985 /* diag_listed_as: "use" not allowed in expression */
4986 yyerror(Perl_form(aTHX_ "\"%s\" not allowed in expression",
4987 is_use ? "use" : "no"));
4988 PL_expect = XTERM;
4989 s = skipspace(s);
4990 if (isDIGIT(*s) || (*s == 'v' && isDIGIT(s[1]))) {
4991 s = force_version(s, TRUE);
4992 if (*s == ';' || *s == '}'
4993 || (s = skipspace(s), (*s == ';' || *s == '}'))) {
4994 NEXTVAL_NEXTTOKE.opval = NULL;
4995 force_next(BAREWORD);
4996 }
4997 else if (*s == 'v') {
4998 s = force_word(s,BAREWORD,FALSE,TRUE);
4999 s = force_version(s, FALSE);
5000 }
5001 }
5002 else {
5003 s = force_word(s,BAREWORD,FALSE,TRUE);
5004 s = force_version(s, FALSE);
5005 }
5006 pl_yylval.ival = is_use;
5007 return s;
5008 }
5009 #ifdef DEBUGGING
5010 static const char* const exp_name[] =
5011 { "OPERATOR", "TERM", "REF", "STATE", "BLOCK", "ATTRBLOCK",
5012 "ATTRTERM", "TERMBLOCK", "XBLOCKTERM", "POSTDEREF",
5013 "SIGVAR", "TERMORDORDOR"
5014 };
5015 #endif
5016
5017 #define word_takes_any_delimiter(p,l) S_word_takes_any_delimiter(p,l)
5018 STATIC bool
S_word_takes_any_delimiter(char * p,STRLEN len)5019 S_word_takes_any_delimiter(char *p, STRLEN len)
5020 {
5021 return (len == 1 && memCHRs("msyq", p[0]))
5022 || (len == 2
5023 && ((p[0] == 't' && p[1] == 'r')
5024 || (p[0] == 'q' && memCHRs("qwxr", p[1]))));
5025 }
5026
5027 static void
S_check_scalar_slice(pTHX_ char * s)5028 S_check_scalar_slice(pTHX_ char *s)
5029 {
5030 s++;
5031 while (SPACE_OR_TAB(*s)) s++;
5032 if (*s == 'q' && s[1] == 'w' && !isWORDCHAR_lazy_if_safe(s+2,
5033 PL_bufend,
5034 UTF))
5035 {
5036 return;
5037 }
5038 while ( isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)
5039 || (*s && memCHRs(" \t$#+-'\"", *s)))
5040 {
5041 s += UTF ? UTF8SKIP(s) : 1;
5042 }
5043 if (*s == '}' || *s == ']')
5044 pl_yylval.ival = OPpSLICEWARNING;
5045 }
5046
5047 #define lex_token_boundary() S_lex_token_boundary(aTHX)
5048 static void
S_lex_token_boundary(pTHX)5049 S_lex_token_boundary(pTHX)
5050 {
5051 PL_oldoldbufptr = PL_oldbufptr;
5052 PL_oldbufptr = PL_bufptr;
5053 }
5054
5055 #define vcs_conflict_marker(s) S_vcs_conflict_marker(aTHX_ s)
5056 static char *
S_vcs_conflict_marker(pTHX_ char * s)5057 S_vcs_conflict_marker(pTHX_ char *s)
5058 {
5059 lex_token_boundary();
5060 PL_bufptr = s;
5061 yyerror("Version control conflict marker");
5062 while (s < PL_bufend && *s != '\n')
5063 s++;
5064 return s;
5065 }
5066
5067 static int
yyl_sigvar(pTHX_ char * s)5068 yyl_sigvar(pTHX_ char *s)
5069 {
5070 /* we expect the sigil and optional var name part of a
5071 * signature element here. Since a '$' is not necessarily
5072 * followed by a var name, handle it specially here; the general
5073 * yylex code would otherwise try to interpret whatever follows
5074 * as a var; e.g. ($, ...) would be seen as the var '$,'
5075 */
5076
5077 U8 sigil;
5078
5079 s = skipspace(s);
5080 sigil = *s++;
5081 PL_bufptr = s; /* for error reporting */
5082 switch (sigil) {
5083 case '$':
5084 case '@':
5085 case '%':
5086 /* spot stuff that looks like an prototype */
5087 if (memCHRs("$:@%&*;\\[]", *s)) {
5088 yyerror("Illegal character following sigil in a subroutine signature");
5089 break;
5090 }
5091 /* '$#' is banned, while '$ # comment' isn't */
5092 if (*s == '#') {
5093 yyerror("'#' not allowed immediately following a sigil in a subroutine signature");
5094 break;
5095 }
5096 s = skipspace(s);
5097 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
5098 char *dest = PL_tokenbuf + 1;
5099 /* read var name, including sigil, into PL_tokenbuf */
5100 PL_tokenbuf[0] = sigil;
5101 parse_ident(&s, &dest, dest + sizeof(PL_tokenbuf) - 1,
5102 0, cBOOL(UTF), FALSE, FALSE);
5103 *dest = '\0';
5104 assert(PL_tokenbuf[1]); /* we have a variable name */
5105 }
5106 else {
5107 *PL_tokenbuf = 0;
5108 PL_in_my = 0;
5109 }
5110
5111 s = skipspace(s);
5112 /* parse the = for the default ourselves to avoid '+=' etc being accepted here
5113 * as the ASSIGNOP, and exclude other tokens that start with =
5114 */
5115 if (*s == '=' && (!s[1] || memCHRs("=~>", s[1]) == 0)) {
5116 /* save now to report with the same context as we did when
5117 * all ASSIGNOPS were accepted */
5118 PL_oldbufptr = s;
5119
5120 ++s;
5121 NEXTVAL_NEXTTOKE.ival = OP_SASSIGN;
5122 force_next(ASSIGNOP);
5123 PL_expect = XTERM;
5124 }
5125 else if(*s == '/' && s[1] == '/' && s[2] == '=') {
5126 PL_oldbufptr = s;
5127
5128 s += 3;
5129 NEXTVAL_NEXTTOKE.ival = OP_DORASSIGN;
5130 force_next(ASSIGNOP);
5131 PL_expect = XTERM;
5132 }
5133 else if(*s == '|' && s[1] == '|' && s[2] == '=') {
5134 PL_oldbufptr = s;
5135
5136 s += 3;
5137 NEXTVAL_NEXTTOKE.ival = OP_ORASSIGN;
5138 force_next(ASSIGNOP);
5139 PL_expect = XTERM;
5140 }
5141 else if (*s == ',' || *s == ')') {
5142 PL_expect = XOPERATOR;
5143 }
5144 else {
5145 /* make sure the context shows the unexpected character and
5146 * hopefully a bit more */
5147 if (*s) ++s;
5148 while (*s && *s != '$' && *s != '@' && *s != '%' && *s != ')')
5149 s++;
5150 PL_bufptr = s; /* for error reporting */
5151 yyerror("Illegal operator following parameter in a subroutine signature");
5152 PL_in_my = 0;
5153 }
5154 if (*PL_tokenbuf) {
5155 NEXTVAL_NEXTTOKE.ival = sigil;
5156 force_next('p'); /* force a signature pending identifier */
5157 }
5158 break;
5159
5160 case ')':
5161 PL_expect = XBLOCK;
5162 break;
5163 case ',': /* handle ($a,,$b) */
5164 break;
5165
5166 default:
5167 PL_in_my = 0;
5168 yyerror("A signature parameter must start with '$', '@' or '%'");
5169 /* very crude error recovery: skip to likely next signature
5170 * element */
5171 while (*s && *s != '$' && *s != '@' && *s != '%' && *s != ')')
5172 s++;
5173 break;
5174 }
5175
5176 switch (sigil) {
5177 case ',': TOKEN (PERLY_COMMA);
5178 case '$': TOKEN (PERLY_DOLLAR);
5179 case '@': TOKEN (PERLY_SNAIL);
5180 case '%': TOKEN (PERLY_PERCENT_SIGN);
5181 case ')': TOKEN (PERLY_PAREN_CLOSE);
5182 default: TOKEN (sigil);
5183 }
5184 }
5185
5186 static int
yyl_dollar(pTHX_ char * s)5187 yyl_dollar(pTHX_ char *s)
5188 {
5189 CLINE;
5190
5191 if (PL_expect == XPOSTDEREF) {
5192 if (s[1] == '#') {
5193 s++;
5194 POSTDEREF(DOLSHARP);
5195 }
5196 POSTDEREF(PERLY_DOLLAR);
5197 }
5198
5199 if ( s[1] == '#'
5200 && ( isIDFIRST_lazy_if_safe(s+2, PL_bufend, UTF)
5201 || memCHRs("{$:+-@", s[2])))
5202 {
5203 PL_tokenbuf[0] = '@';
5204 s = scan_ident(s + 1, PL_tokenbuf + 1,
5205 sizeof PL_tokenbuf - 1, FALSE);
5206 if (PL_expect == XOPERATOR) {
5207 char *d = s;
5208 if (PL_bufptr > s) {
5209 d = PL_bufptr-1;
5210 PL_bufptr = PL_oldbufptr;
5211 }
5212 no_op("Array length", d);
5213 }
5214 if (!PL_tokenbuf[1])
5215 PREREF(DOLSHARP);
5216 PL_expect = XOPERATOR;
5217 force_ident_maybe_lex('#');
5218 TOKEN(DOLSHARP);
5219 }
5220
5221 PL_tokenbuf[0] = '$';
5222 s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
5223 if (PL_expect == XOPERATOR) {
5224 char *d = s;
5225 if (PL_bufptr > s) {
5226 d = PL_bufptr-1;
5227 PL_bufptr = PL_oldbufptr;
5228 }
5229 no_op("Scalar", d);
5230 }
5231 if (!PL_tokenbuf[1]) {
5232 if (s == PL_bufend)
5233 yyerror("Final $ should be \\$ or $name");
5234 PREREF(PERLY_DOLLAR);
5235 }
5236
5237 {
5238 const char tmp = *s;
5239 if (PL_lex_state == LEX_NORMAL || PL_lex_brackets)
5240 s = skipspace(s);
5241
5242 if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
5243 && intuit_more(s, PL_bufend)) {
5244 if (*s == '[') {
5245 PL_tokenbuf[0] = '@';
5246 if (ckWARN(WARN_SYNTAX)) {
5247 char *t = s+1;
5248
5249 while ( t < PL_bufend ) {
5250 if (isSPACE(*t)) {
5251 do { t += UTF ? UTF8SKIP(t) : 1; } while (t < PL_bufend && isSPACE(*t));
5252 /* consumed one or more space chars */
5253 } else if (*t == '$' || *t == '@') {
5254 /* could be more than one '$' like $$ref or @$ref */
5255 do { t++; } while (t < PL_bufend && *t == '$');
5256
5257 /* could be an abigail style identifier like $ foo */
5258 while (t < PL_bufend && *t == ' ') t++;
5259
5260 /* strip off the name of the var */
5261 while (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
5262 t += UTF ? UTF8SKIP(t) : 1;
5263 /* consumed a varname */
5264 } else if (isDIGIT(*t)) {
5265 /* deal with hex constants like 0x11 */
5266 if (t[0] == '0' && t[1] == 'x') {
5267 t += 2;
5268 while (t < PL_bufend && isXDIGIT(*t)) t++;
5269 } else {
5270 /* deal with decimal/octal constants like 1 and 0123 */
5271 do { t++; } while (isDIGIT(*t));
5272 if (t<PL_bufend && *t == '.') {
5273 do { t++; } while (isDIGIT(*t));
5274 }
5275 }
5276 /* consumed a number */
5277 } else {
5278 /* not a var nor a space nor a number */
5279 break;
5280 }
5281 }
5282 if (t < PL_bufend && *t++ == ',') {
5283 PL_bufptr = skipspace(PL_bufptr); /* XXX can realloc */
5284 while (t < PL_bufend && *t != ']')
5285 t++;
5286 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
5287 "Multidimensional syntax %" UTF8f " not supported",
5288 UTF8fARG(UTF,(int)((t - PL_bufptr) + 1), PL_bufptr));
5289 }
5290 }
5291 }
5292 else if (*s == '{') {
5293 char *t;
5294 PL_tokenbuf[0] = '%';
5295 if ( strEQ(PL_tokenbuf+1, "SIG")
5296 && ckWARN(WARN_SYNTAX)
5297 && (t = (char *) memchr(s, '}', PL_bufend - s))
5298 && (t = (char *) memchr(t, '=', PL_bufend - t)))
5299 {
5300 char tmpbuf[sizeof PL_tokenbuf];
5301 do {
5302 t++;
5303 } while (isSPACE(*t));
5304 if (isIDFIRST_lazy_if_safe(t, PL_bufend, UTF)) {
5305 STRLEN len;
5306 t = scan_word6(t, tmpbuf, sizeof tmpbuf, TRUE,
5307 &len, TRUE);
5308 while (isSPACE(*t))
5309 t++;
5310 if ( *t == ';'
5311 && get_cvn_flags(tmpbuf, len, UTF
5312 ? SVf_UTF8
5313 : 0))
5314 {
5315 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
5316 "You need to quote \"%" UTF8f "\"",
5317 UTF8fARG(UTF, len, tmpbuf));
5318 }
5319 }
5320 }
5321 }
5322 }
5323
5324 PL_expect = XOPERATOR;
5325 if ((PL_lex_state == LEX_NORMAL || PL_lex_brackets) && isSPACE((char)tmp)) {
5326 const bool islop = (PL_last_lop == PL_oldoldbufptr);
5327 if (!islop || PL_last_lop_op == OP_GREPSTART)
5328 PL_expect = XOPERATOR;
5329 else if (memCHRs("$@\"'`q", *s))
5330 PL_expect = XTERM; /* e.g. print $fh "foo" */
5331 else if ( memCHRs("&*<%", *s)
5332 && isIDFIRST_lazy_if_safe(s+1, PL_bufend, UTF))
5333 {
5334 PL_expect = XTERM; /* e.g. print $fh &sub */
5335 }
5336 else if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
5337 char tmpbuf[sizeof PL_tokenbuf];
5338 int t2;
5339 STRLEN len;
5340 scan_word6(s, tmpbuf, sizeof tmpbuf, TRUE, &len, FALSE);
5341 if ((t2 = keyword(tmpbuf, len, 0))) {
5342 /* binary operators exclude handle interpretations */
5343 switch (t2) {
5344 case -KEY_x:
5345 case -KEY_eq:
5346 case -KEY_ne:
5347 case -KEY_gt:
5348 case -KEY_lt:
5349 case -KEY_ge:
5350 case -KEY_le:
5351 case -KEY_cmp:
5352 break;
5353 default:
5354 PL_expect = XTERM; /* e.g. print $fh length() */
5355 break;
5356 }
5357 }
5358 else {
5359 PL_expect = XTERM; /* e.g. print $fh subr() */
5360 }
5361 }
5362 else if (isDIGIT(*s))
5363 PL_expect = XTERM; /* e.g. print $fh 3 */
5364 else if (*s == '.' && isDIGIT(s[1]))
5365 PL_expect = XTERM; /* e.g. print $fh .3 */
5366 else if ((*s == '?' || *s == '-' || *s == '+')
5367 && !isSPACE(s[1]) && s[1] != '=')
5368 PL_expect = XTERM; /* e.g. print $fh -1 */
5369 else if (*s == '/' && !isSPACE(s[1]) && s[1] != '='
5370 && s[1] != '/')
5371 PL_expect = XTERM; /* e.g. print $fh /.../
5372 XXX except DORDOR operator
5373 */
5374 else if (*s == '<' && s[1] == '<' && !isSPACE(s[2])
5375 && s[2] != '=')
5376 PL_expect = XTERM; /* print $fh <<"EOF" */
5377 }
5378 }
5379 force_ident_maybe_lex('$');
5380 TOKEN(PERLY_DOLLAR);
5381 }
5382
5383 static int
yyl_sub(pTHX_ char * s,const int key)5384 yyl_sub(pTHX_ char *s, const int key)
5385 {
5386 char * const tmpbuf = PL_tokenbuf + 1;
5387 bool have_name, have_proto;
5388 STRLEN len;
5389 SV *format_name = NULL;
5390 bool is_method = (key == KEY_method);
5391
5392 /* method always implies signatures */
5393 bool is_sigsub = is_method || FEATURE_SIGNATURES_IS_ENABLED;
5394
5395 SSize_t off = s-SvPVX(PL_linestr);
5396 char *d;
5397
5398 s = skipspace(s); /* can move PL_linestr */
5399
5400 d = SvPVX(PL_linestr)+off;
5401
5402 SAVEBOOL(PL_parser->sig_seen);
5403 PL_parser->sig_seen = FALSE;
5404
5405 if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
5406 || *s == '\''
5407 || (*s == ':' && s[1] == ':'))
5408 {
5409
5410 PL_expect = XATTRBLOCK;
5411 d = scan_word6(s, tmpbuf, sizeof PL_tokenbuf - 1, TRUE,
5412 &len, TRUE);
5413 if (key == KEY_format)
5414 format_name = S_newSV_maybe_utf8(aTHX_ s, d - s);
5415 *PL_tokenbuf = '&';
5416 if (memchr(tmpbuf, ':', len) || key != KEY_sub
5417 || pad_findmy_pvn(
5418 PL_tokenbuf, len + 1, 0
5419 ) != NOT_IN_PAD)
5420 sv_setpvn(PL_subname, tmpbuf, len);
5421 else {
5422 sv_setsv(PL_subname,PL_curstname);
5423 sv_catpvs(PL_subname,"::");
5424 sv_catpvn(PL_subname,tmpbuf,len);
5425 }
5426 if (SvUTF8(PL_linestr))
5427 SvUTF8_on(PL_subname);
5428 have_name = TRUE;
5429
5430 s = skipspace(d);
5431 }
5432 else {
5433 if (key == KEY_my || key == KEY_our || key==KEY_state) {
5434 *d = '\0';
5435 /* diag_listed_as: Missing name in "%s sub" */
5436 Perl_croak(aTHX_
5437 "Missing name in \"%s\"", PL_bufptr);
5438 }
5439 PL_expect = XATTRTERM;
5440 sv_setpvs(PL_subname,"?");
5441 have_name = FALSE;
5442 }
5443
5444 if (key == KEY_format) {
5445 if (format_name) {
5446 NEXTVAL_NEXTTOKE.opval
5447 = newSVOP(OP_CONST,0, format_name);
5448 NEXTVAL_NEXTTOKE.opval->op_private |= OPpCONST_BARE;
5449 force_next(BAREWORD);
5450 }
5451 PREBLOCK(KW_FORMAT);
5452 }
5453
5454 /* Look for a prototype */
5455 if (*s == '(' && !is_sigsub) {
5456 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
5457 if (!s)
5458 Perl_croak(aTHX_ "Prototype not terminated");
5459 COPLINE_SET_FROM_MULTI_END;
5460 (void)validate_proto(PL_subname, PL_lex_stuff,
5461 ckWARN(WARN_ILLEGALPROTO), 0);
5462 have_proto = TRUE;
5463
5464 s = skipspace(s);
5465 }
5466 else
5467 have_proto = FALSE;
5468
5469 if ( !(*s == ':' && s[1] != ':')
5470 && (*s != '{' && *s != '(') && key != KEY_format)
5471 {
5472 assert(key == KEY_sub || key == KEY_method ||
5473 key == KEY_AUTOLOAD || key == KEY_DESTROY ||
5474 key == KEY_BEGIN || key == KEY_UNITCHECK || key == KEY_CHECK ||
5475 key == KEY_INIT || key == KEY_END ||
5476 key == KEY_my || key == KEY_state ||
5477 key == KEY_our);
5478 if (!have_name)
5479 Perl_croak(aTHX_ "Illegal declaration of anonymous subroutine");
5480 else if (*s != ';' && *s != '}')
5481 Perl_croak(aTHX_ "Illegal declaration of subroutine %" SVf, SVfARG(PL_subname));
5482 }
5483
5484 if (have_proto) {
5485 NEXTVAL_NEXTTOKE.opval =
5486 newSVOP(OP_CONST, 0, PL_lex_stuff);
5487 PL_lex_stuff = NULL;
5488 force_next(THING);
5489 }
5490
5491 if (!have_name) {
5492 if (PL_curstash)
5493 sv_setpvs(PL_subname, "__ANON__");
5494 else
5495 sv_setpvs(PL_subname, "__ANON__::__ANON__");
5496 if (is_method)
5497 TOKEN(KW_METHOD_anon);
5498 else if (is_sigsub)
5499 TOKEN(KW_SUB_anon_sig);
5500 else
5501 TOKEN(KW_SUB_anon);
5502 }
5503 force_ident_maybe_lex('&');
5504 if (is_method)
5505 TOKEN(KW_METHOD_named);
5506 else if (is_sigsub)
5507 TOKEN(KW_SUB_named_sig);
5508 else
5509 TOKEN(KW_SUB_named);
5510 }
5511
5512 static int
yyl_interpcasemod(pTHX_ char * s)5513 yyl_interpcasemod(pTHX_ char *s)
5514 {
5515 #ifdef DEBUGGING
5516 if (PL_bufptr != PL_bufend && *PL_bufptr != '\\')
5517 Perl_croak(aTHX_
5518 "panic: INTERPCASEMOD bufptr=%p, bufend=%p, *bufptr=%u",
5519 PL_bufptr, PL_bufend, *PL_bufptr);
5520 #endif
5521
5522 if (PL_bufptr == PL_bufend || PL_bufptr[1] == 'E') {
5523 /* if at a \E */
5524 if (PL_lex_casemods) {
5525 const char oldmod = PL_lex_casestack[--PL_lex_casemods];
5526 PL_lex_casestack[PL_lex_casemods] = '\0';
5527
5528 if (PL_bufptr != PL_bufend
5529 && (oldmod == 'L' || oldmod == 'U' || oldmod == 'Q'
5530 || oldmod == 'F')) {
5531 PL_bufptr += 2;
5532 PL_lex_state = LEX_INTERPCONCAT;
5533 }
5534 PL_lex_allbrackets--;
5535 return REPORT(PERLY_PAREN_CLOSE);
5536 }
5537 else if ( PL_bufptr != PL_bufend && PL_bufptr[1] == 'E' ) {
5538 /* Got an unpaired \E */
5539 Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
5540 "Useless use of \\E");
5541 }
5542 if (PL_bufptr != PL_bufend)
5543 PL_bufptr += 2;
5544 PL_lex_state = LEX_INTERPCONCAT;
5545 return yylex();
5546 }
5547 else {
5548 DEBUG_T({
5549 PerlIO_printf(Perl_debug_log, "### Saw case modifier\n");
5550 });
5551 s = PL_bufptr + 1;
5552 if (s[1] == '\\' && s[2] == 'E') {
5553 PL_bufptr = s + 3;
5554 PL_lex_state = LEX_INTERPCONCAT;
5555 return yylex();
5556 }
5557 else {
5558 I32 tmp;
5559 if ( memBEGINs(s, (STRLEN) (PL_bufend - s), "L\\u")
5560 || memBEGINs(s, (STRLEN) (PL_bufend - s), "U\\l"))
5561 {
5562 tmp = *s, *s = s[2], s[2] = (char)tmp; /* misordered... */
5563 }
5564 if ((*s == 'L' || *s == 'U' || *s == 'F')
5565 && (strpbrk(PL_lex_casestack, "LUF")))
5566 {
5567 PL_lex_casestack[--PL_lex_casemods] = '\0';
5568 PL_lex_allbrackets--;
5569 return REPORT(PERLY_PAREN_CLOSE);
5570 }
5571 if (PL_lex_casemods > 10)
5572 Renew(PL_lex_casestack, PL_lex_casemods + 2, char);
5573 PL_lex_casestack[PL_lex_casemods++] = *s;
5574 PL_lex_casestack[PL_lex_casemods] = '\0';
5575 PL_lex_state = LEX_INTERPCONCAT;
5576 NEXTVAL_NEXTTOKE.ival = 0;
5577 force_next((2<<24)|PERLY_PAREN_OPEN);
5578 if (*s == 'l')
5579 NEXTVAL_NEXTTOKE.ival = OP_LCFIRST;
5580 else if (*s == 'u')
5581 NEXTVAL_NEXTTOKE.ival = OP_UCFIRST;
5582 else if (*s == 'L')
5583 NEXTVAL_NEXTTOKE.ival = OP_LC;
5584 else if (*s == 'U')
5585 NEXTVAL_NEXTTOKE.ival = OP_UC;
5586 else if (*s == 'Q')
5587 NEXTVAL_NEXTTOKE.ival = OP_QUOTEMETA;
5588 else if (*s == 'F')
5589 NEXTVAL_NEXTTOKE.ival = OP_FC;
5590 else
5591 Perl_croak(aTHX_ "panic: yylex, *s=%u", *s);
5592 PL_bufptr = s + 1;
5593 }
5594 force_next(FUNC);
5595 if (PL_lex_starts) {
5596 s = PL_bufptr;
5597 PL_lex_starts = 0;
5598 /* commas only at base level: /$a\Ub$c/ => ($a,uc(b.$c)) */
5599 if (PL_lex_casemods == 1 && PL_lex_inpat)
5600 TOKEN(PERLY_COMMA);
5601 else
5602 AopNOASSIGN(OP_CONCAT);
5603 }
5604 else
5605 return yylex();
5606 }
5607 }
5608
5609 static int
yyl_secondclass_keyword(pTHX_ char * s,STRLEN len,int key,I32 * orig_keyword,GV ** pgv,GV *** pgvp)5610 yyl_secondclass_keyword(pTHX_ char *s, STRLEN len, int key, I32 *orig_keyword,
5611 GV **pgv, GV ***pgvp)
5612 {
5613 GV *ogv = NULL; /* override (winner) */
5614 GV *hgv = NULL; /* hidden (loser) */
5615 GV *gv = *pgv;
5616
5617 if (PL_expect != XOPERATOR && (*s != ':' || s[1] != ':')) {
5618 CV *cv;
5619 if ((gv = gv_fetchpvn_flags(PL_tokenbuf, len,
5620 (UTF ? SVf_UTF8 : 0)|GV_NOTQUAL,
5621 SVt_PVCV))
5622 && (cv = GvCVu(gv)))
5623 {
5624 if (GvIMPORTED_CV(gv))
5625 ogv = gv;
5626 else if (! CvNOWARN_AMBIGUOUS(cv))
5627 hgv = gv;
5628 }
5629 if (!ogv
5630 && (*pgvp = (GV**)hv_fetch(PL_globalstash, PL_tokenbuf, len, FALSE))
5631 && (gv = **pgvp)
5632 && (isGV_with_GP(gv)
5633 ? GvCVu(gv) && GvIMPORTED_CV(gv)
5634 : SvPCS_IMPORTED(gv)
5635 && (gv_init(gv, PL_globalstash, PL_tokenbuf,
5636 len, 0), 1)))
5637 {
5638 ogv = gv;
5639 }
5640 }
5641
5642 *pgv = gv;
5643
5644 if (ogv) {
5645 *orig_keyword = key;
5646 return 0; /* overridden by import or by GLOBAL */
5647 }
5648 else if (gv && !*pgvp
5649 && -key==KEY_lock /* XXX generalizable kludge */
5650 && GvCVu(gv))
5651 {
5652 return 0; /* any sub overrides "weak" keyword */
5653 }
5654 else { /* no override */
5655 key = -key;
5656 if (key == KEY_dump) {
5657 Perl_croak(aTHX_ "dump() must be written as CORE::dump() as of Perl 5.30");
5658 }
5659 *pgv = NULL;
5660 *pgvp = 0;
5661 if (hgv && key != KEY_x) /* never ambiguous */
5662 Perl_ck_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
5663 "Ambiguous call resolved as CORE::%s(), "
5664 "qualify as such or use &",
5665 GvENAME(hgv));
5666 return key;
5667 }
5668 }
5669
5670 static int
yyl_qw(pTHX_ char * s,STRLEN len)5671 yyl_qw(pTHX_ char *s, STRLEN len)
5672 {
5673 OP *words = NULL;
5674
5675 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
5676 if (!s)
5677 missingterm(NULL, 0);
5678
5679 COPLINE_SET_FROM_MULTI_END;
5680 PL_expect = XOPERATOR;
5681 if (SvCUR(PL_lex_stuff)) {
5682 int warned_comma = !ckWARN(WARN_QW);
5683 int warned_comment = warned_comma;
5684 char *d = SvPV_force(PL_lex_stuff, len);
5685 while (len) {
5686 for (; isSPACE(*d) && len; --len, ++d)
5687 /**/;
5688 if (len) {
5689 SV *sv;
5690 const char *b = d;
5691 if (!warned_comma || !warned_comment) {
5692 for (; !isSPACE(*d) && len; --len, ++d) {
5693 if (!warned_comma && *d == ',') {
5694 Perl_warner(aTHX_ packWARN(WARN_QW),
5695 "Possible attempt to separate words with commas");
5696 ++warned_comma;
5697 }
5698 else if (!warned_comment && *d == '#') {
5699 Perl_warner(aTHX_ packWARN(WARN_QW),
5700 "Possible attempt to put comments in qw() list");
5701 ++warned_comment;
5702 }
5703 }
5704 }
5705 else {
5706 for (; !isSPACE(*d) && len; --len, ++d)
5707 /**/;
5708 }
5709 sv = newSVpvn_utf8(b, d-b, DO_UTF8(PL_lex_stuff));
5710 words = op_append_elem(OP_LIST, words,
5711 newSVOP(OP_CONST, 0, tokeq(sv)));
5712 }
5713 }
5714 }
5715 if (!words)
5716 words = newNULLLIST();
5717 SvREFCNT_dec_NN(PL_lex_stuff);
5718 PL_lex_stuff = NULL;
5719 PL_expect = XOPERATOR;
5720 pl_yylval.opval = sawparens(words);
5721 TOKEN(QWLIST);
5722 }
5723
5724 static int
yyl_hyphen(pTHX_ char * s)5725 yyl_hyphen(pTHX_ char *s)
5726 {
5727 if (s[1] && isALPHA(s[1]) && !isWORDCHAR(s[2])) {
5728 I32 ftst = 0;
5729 char tmp;
5730
5731 s++;
5732 PL_bufptr = s;
5733 tmp = *s++;
5734
5735 while (s < PL_bufend && SPACE_OR_TAB(*s))
5736 s++;
5737
5738 if (memBEGINs(s, (STRLEN) (PL_bufend - s), "=>")) {
5739 s = force_word(PL_bufptr,BAREWORD,FALSE,FALSE);
5740 DEBUG_T( { printbuf("### Saw unary minus before =>, forcing word %s\n", s); } );
5741 OPERATOR(PERLY_MINUS); /* unary minus */
5742 }
5743 switch (tmp) {
5744 case 'r': ftst = OP_FTEREAD; break;
5745 case 'w': ftst = OP_FTEWRITE; break;
5746 case 'x': ftst = OP_FTEEXEC; break;
5747 case 'o': ftst = OP_FTEOWNED; break;
5748 case 'R': ftst = OP_FTRREAD; break;
5749 case 'W': ftst = OP_FTRWRITE; break;
5750 case 'X': ftst = OP_FTREXEC; break;
5751 case 'O': ftst = OP_FTROWNED; break;
5752 case 'e': ftst = OP_FTIS; break;
5753 case 'z': ftst = OP_FTZERO; break;
5754 case 's': ftst = OP_FTSIZE; break;
5755 case 'f': ftst = OP_FTFILE; break;
5756 case 'd': ftst = OP_FTDIR; break;
5757 case 'l': ftst = OP_FTLINK; break;
5758 case 'p': ftst = OP_FTPIPE; break;
5759 case 'S': ftst = OP_FTSOCK; break;
5760 case 'u': ftst = OP_FTSUID; break;
5761 case 'g': ftst = OP_FTSGID; break;
5762 case 'k': ftst = OP_FTSVTX; break;
5763 case 'b': ftst = OP_FTBLK; break;
5764 case 'c': ftst = OP_FTCHR; break;
5765 case 't': ftst = OP_FTTTY; break;
5766 case 'T': ftst = OP_FTTEXT; break;
5767 case 'B': ftst = OP_FTBINARY; break;
5768 case 'M': case 'A': case 'C':
5769 gv_fetchpvs("\024", GV_ADD|GV_NOTQUAL, SVt_PV);
5770 switch (tmp) {
5771 case 'M': ftst = OP_FTMTIME; break;
5772 case 'A': ftst = OP_FTATIME; break;
5773 case 'C': ftst = OP_FTCTIME; break;
5774 default: break;
5775 }
5776 break;
5777 default:
5778 break;
5779 }
5780 if (ftst) {
5781 PL_last_uni = PL_oldbufptr;
5782 PL_last_lop_op = (OPCODE)ftst;
5783 DEBUG_T( {
5784 PerlIO_printf(Perl_debug_log, "### Saw file test %c\n", (int)tmp);
5785 } );
5786 FTST(ftst);
5787 }
5788 else {
5789 /* Assume it was a minus followed by a one-letter named
5790 * subroutine call (or a -bareword), then. */
5791 DEBUG_T( {
5792 PerlIO_printf(Perl_debug_log,
5793 "### '-%c' looked like a file test but was not\n",
5794 (int) tmp);
5795 } );
5796 s = --PL_bufptr;
5797 }
5798 }
5799 {
5800 const char tmp = *s++;
5801 if (*s == tmp) {
5802 s++;
5803 if (PL_expect == XOPERATOR)
5804 TERM(POSTDEC);
5805 else
5806 OPERATOR(PREDEC);
5807 }
5808 else if (*s == '>') {
5809 s++;
5810 s = skipspace(s);
5811 if (((*s == '$' || *s == '&') && s[1] == '*')
5812 ||(*s == '$' && s[1] == '#' && s[2] == '*')
5813 ||((*s == '@' || *s == '%') && memCHRs("*[{", s[1]))
5814 ||(*s == '*' && (s[1] == '*' || s[1] == '{'))
5815 )
5816 {
5817 PL_expect = XPOSTDEREF;
5818 TOKEN(ARROW);
5819 }
5820 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
5821 s = force_word(s,METHCALL0,FALSE,TRUE);
5822 TOKEN(ARROW);
5823 }
5824 else if (*s == '$')
5825 OPERATOR(ARROW);
5826 else
5827 TERM(ARROW);
5828 }
5829 if (PL_expect == XOPERATOR) {
5830 if (*s == '='
5831 && !PL_lex_allbrackets
5832 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5833 {
5834 s--;
5835 TOKEN(0);
5836 }
5837 Aop(OP_SUBTRACT);
5838 }
5839 else {
5840 if (isSPACE(*s) || !isSPACE(*PL_bufptr))
5841 check_uni();
5842 OPERATOR(PERLY_MINUS); /* unary minus */
5843 }
5844 }
5845 }
5846
5847 static int
yyl_plus(pTHX_ char * s)5848 yyl_plus(pTHX_ char *s)
5849 {
5850 const char tmp = *s++;
5851 if (*s == tmp) {
5852 s++;
5853 if (PL_expect == XOPERATOR)
5854 TERM(POSTINC);
5855 else
5856 OPERATOR(PREINC);
5857 }
5858 if (PL_expect == XOPERATOR) {
5859 if (*s == '='
5860 && !PL_lex_allbrackets
5861 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5862 {
5863 s--;
5864 TOKEN(0);
5865 }
5866 Aop(OP_ADD);
5867 }
5868 else {
5869 if (isSPACE(*s) || !isSPACE(*PL_bufptr))
5870 check_uni();
5871 OPERATOR(PERLY_PLUS);
5872 }
5873 }
5874
5875 static int
yyl_star(pTHX_ char * s)5876 yyl_star(pTHX_ char *s)
5877 {
5878 if (PL_expect == XPOSTDEREF)
5879 POSTDEREF(PERLY_STAR);
5880
5881 if (PL_expect != XOPERATOR) {
5882 s = scan_ident(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE);
5883 PL_expect = XOPERATOR;
5884 force_ident(PL_tokenbuf, PERLY_STAR);
5885 if (!*PL_tokenbuf)
5886 PREREF(PERLY_STAR);
5887 TERM(PERLY_STAR);
5888 }
5889
5890 s++;
5891 if (*s == '*') {
5892 s++;
5893 if (*s == '=' && !PL_lex_allbrackets
5894 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5895 {
5896 s -= 2;
5897 TOKEN(0);
5898 }
5899 PWop(OP_POW);
5900 }
5901
5902 if (*s == '='
5903 && !PL_lex_allbrackets
5904 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5905 {
5906 s--;
5907 TOKEN(0);
5908 }
5909
5910 Mop(OP_MULTIPLY);
5911 }
5912
5913 static int
yyl_percent(pTHX_ char * s)5914 yyl_percent(pTHX_ char *s)
5915 {
5916 if (PL_expect == XOPERATOR) {
5917 if (s[1] == '='
5918 && !PL_lex_allbrackets
5919 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
5920 {
5921 TOKEN(0);
5922 }
5923 ++s;
5924 Mop(OP_MODULO);
5925 }
5926 else if (PL_expect == XPOSTDEREF)
5927 POSTDEREF(PERLY_PERCENT_SIGN);
5928
5929 PL_tokenbuf[0] = '%';
5930 s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
5931 pl_yylval.ival = 0;
5932 if (!PL_tokenbuf[1]) {
5933 PREREF(PERLY_PERCENT_SIGN);
5934 }
5935 if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
5936 && intuit_more(s, PL_bufend)) {
5937 if (*s == '[')
5938 PL_tokenbuf[0] = '@';
5939 }
5940 PL_expect = XOPERATOR;
5941 force_ident_maybe_lex('%');
5942 TERM(PERLY_PERCENT_SIGN);
5943 }
5944
5945 static int
yyl_caret(pTHX_ char * s)5946 yyl_caret(pTHX_ char *s)
5947 {
5948 char *d = s;
5949 const bool bof = cBOOL(FEATURE_BITWISE_IS_ENABLED);
5950 if (bof && s[1] == '.')
5951 s++;
5952 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
5953 (s[1] == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_BITWISE))
5954 {
5955 s = d;
5956 TOKEN(0);
5957 }
5958 s++;
5959 BOop(bof ? d == s-2 ? OP_SBIT_XOR : OP_NBIT_XOR : OP_BIT_XOR);
5960 }
5961
5962 static int
yyl_colon(pTHX_ char * s)5963 yyl_colon(pTHX_ char *s)
5964 {
5965 OP *attrs;
5966
5967 switch (PL_expect) {
5968 case XOPERATOR:
5969 if (!PL_in_my || (PL_lex_state != LEX_NORMAL && !PL_lex_brackets))
5970 break;
5971 PL_bufptr = s; /* update in case we back off */
5972 if (*s == '=') {
5973 Perl_croak(aTHX_
5974 "Use of := for an empty attribute list is not allowed");
5975 }
5976 goto grabattrs;
5977 case XATTRBLOCK:
5978 PL_expect = XBLOCK;
5979 goto grabattrs;
5980 case XATTRTERM:
5981 PL_expect = XTERMBLOCK;
5982 grabattrs:
5983 /* NB: as well as parsing normal attributes, we also end up
5984 * here if there is something looking like attributes
5985 * following a signature (which is illegal, but used to be
5986 * legal in 5.20..5.26). If the latter, we still parse the
5987 * attributes so that error messages(s) are less confusing,
5988 * but ignore them (parser->sig_seen).
5989 */
5990 s = skipspace(s);
5991 attrs = NULL;
5992 while (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
5993 I32 tmp;
5994 SV *sv;
5995 STRLEN len;
5996 char *d = scan_word6(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len, FALSE);
5997 if (isLOWER(*s) && (tmp = keyword(PL_tokenbuf, len, 0))) {
5998 if (tmp < 0) tmp = -tmp;
5999 switch (tmp) {
6000 case KEY_or:
6001 case KEY_and:
6002 case KEY_for:
6003 case KEY_foreach:
6004 case KEY_unless:
6005 case KEY_if:
6006 case KEY_while:
6007 case KEY_until:
6008 goto got_attrs;
6009 default:
6010 break;
6011 }
6012 }
6013 sv = newSVpvn_flags(s, len, UTF ? SVf_UTF8 : 0);
6014 if (*d == '(') {
6015 d = scan_str(d,TRUE,TRUE,FALSE,NULL);
6016 if (!d) {
6017 if (attrs)
6018 op_free(attrs);
6019 ASSUME(sv && SvREFCNT(sv) == 1);
6020 SvREFCNT_dec(sv);
6021 Perl_croak(aTHX_ "Unterminated attribute parameter in attribute list");
6022 }
6023 COPLINE_SET_FROM_MULTI_END;
6024 }
6025 if (PL_lex_stuff) {
6026 sv_catsv(sv, PL_lex_stuff);
6027 attrs = op_append_elem(OP_LIST, attrs,
6028 newSVOP(OP_CONST, 0, sv));
6029 SvREFCNT_dec_NN(PL_lex_stuff);
6030 PL_lex_stuff = NULL;
6031 }
6032 else {
6033 attrs = op_append_elem(OP_LIST, attrs,
6034 newSVOP(OP_CONST, 0, sv));
6035 }
6036 s = skipspace(d);
6037 if (*s == ':' && s[1] != ':')
6038 s = skipspace(s+1);
6039 else if (s == d)
6040 break; /* require real whitespace or :'s */
6041 /* XXX losing whitespace on sequential attributes here */
6042 }
6043
6044 if (*s != ';'
6045 && *s != '}'
6046 && !(PL_expect == XOPERATOR
6047 /* if an operator is expected, permit =, //= and ||= or ) to end */
6048 ? (*s == '=' || *s == ')' || *s == '/' || *s == '|')
6049 : (*s == '{' || *s == '(')))
6050 {
6051 const char q = ((*s == '\'') ? '"' : '\'');
6052 /* If here for an expression, and parsed no attrs, back off. */
6053 if (PL_expect == XOPERATOR && !attrs) {
6054 s = PL_bufptr;
6055 break;
6056 }
6057 /* MUST advance bufptr here to avoid bogus "at end of line"
6058 context messages from yyerror().
6059 */
6060 PL_bufptr = s;
6061 yyerror( (const char *)
6062 (*s
6063 ? Perl_form(aTHX_ "Invalid separator character "
6064 "%c%c%c in attribute list", q, *s, q)
6065 : "Unterminated attribute list" ) );
6066 if (attrs)
6067 op_free(attrs);
6068 OPERATOR(PERLY_COLON);
6069 }
6070
6071 got_attrs:
6072 if (PL_parser->sig_seen) {
6073 /* see comment about about sig_seen and parser error
6074 * handling */
6075 if (attrs)
6076 op_free(attrs);
6077 Perl_croak(aTHX_ "Subroutine attributes must come "
6078 "before the signature");
6079 }
6080 if (attrs) {
6081 NEXTVAL_NEXTTOKE.opval = attrs;
6082 force_next(THING);
6083 }
6084 TOKEN(COLONATTR);
6085 }
6086
6087 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_CLOSING) {
6088 s--;
6089 TOKEN(0);
6090 }
6091
6092 PL_lex_allbrackets--;
6093 OPERATOR(PERLY_COLON);
6094 }
6095
6096 static int
yyl_subproto(pTHX_ char * s,CV * cv)6097 yyl_subproto(pTHX_ char *s, CV *cv)
6098 {
6099 STRLEN protolen = CvPROTOLEN(cv);
6100 const char *proto = CvPROTO(cv);
6101 bool optional;
6102
6103 proto = S_strip_spaces(aTHX_ proto, &protolen);
6104 if (!protolen)
6105 TERM(FUNC0SUB);
6106 if ((optional = *proto == ';')) {
6107 do {
6108 proto++;
6109 } while (*proto == ';');
6110 }
6111
6112 if (
6113 (
6114 (
6115 *proto == '$' || *proto == '_'
6116 || *proto == '*' || *proto == '+'
6117 )
6118 && proto[1] == '\0'
6119 )
6120 || (
6121 *proto == '\\' && proto[1] && proto[2] == '\0'
6122 )
6123 ) {
6124 UNIPROTO(UNIOPSUB,optional);
6125 }
6126
6127 if (*proto == '\\' && proto[1] == '[') {
6128 const char *p = proto + 2;
6129 while(*p && *p != ']')
6130 ++p;
6131 if(*p == ']' && !p[1])
6132 UNIPROTO(UNIOPSUB,optional);
6133 }
6134
6135 if (*proto == '&' && *s == '{') {
6136 if (PL_curstash)
6137 sv_setpvs(PL_subname, "__ANON__");
6138 else
6139 sv_setpvs(PL_subname, "__ANON__::__ANON__");
6140 if (!PL_lex_allbrackets
6141 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
6142 {
6143 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
6144 }
6145 PREBLOCK(LSTOPSUB);
6146 }
6147
6148 return KEY_NULL;
6149 }
6150
6151 static int
yyl_leftcurly(pTHX_ char * s,const U8 formbrack)6152 yyl_leftcurly(pTHX_ char *s, const U8 formbrack)
6153 {
6154 char *d;
6155 if (PL_lex_brackets > 100) {
6156 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
6157 }
6158
6159 switch (PL_expect) {
6160 case XTERM:
6161 case XTERMORDORDOR:
6162 PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
6163 PL_lex_allbrackets++;
6164 OPERATOR(HASHBRACK);
6165 case XOPERATOR:
6166 while (s < PL_bufend && SPACE_OR_TAB(*s))
6167 s++;
6168 d = s;
6169 PL_tokenbuf[0] = '\0';
6170 if (d < PL_bufend && *d == '-') {
6171 PL_tokenbuf[0] = '-';
6172 d++;
6173 while (d < PL_bufend && SPACE_OR_TAB(*d))
6174 d++;
6175 }
6176 if (d < PL_bufend && isIDFIRST_lazy_if_safe(d, PL_bufend, UTF)) {
6177 STRLEN len;
6178 d = scan_word6(d, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
6179 FALSE, &len, FALSE);
6180 while (d < PL_bufend && SPACE_OR_TAB(*d))
6181 d++;
6182 if (*d == '}') {
6183 const char minus = (PL_tokenbuf[0] == '-');
6184 s = force_word(s + minus, BAREWORD, FALSE, TRUE);
6185 if (minus)
6186 force_next(PERLY_MINUS);
6187 }
6188 }
6189 /* FALLTHROUGH */
6190 case XATTRTERM:
6191 case XTERMBLOCK:
6192 PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
6193 PL_lex_allbrackets++;
6194 PL_expect = XSTATE;
6195 break;
6196 case XATTRBLOCK:
6197 case XBLOCK:
6198 PL_lex_brackstack[PL_lex_brackets++] = XSTATE;
6199 PL_lex_allbrackets++;
6200 PL_expect = XSTATE;
6201 break;
6202 case XBLOCKTERM:
6203 PL_lex_brackstack[PL_lex_brackets++] = XTERM;
6204 PL_lex_allbrackets++;
6205 PL_expect = XSTATE;
6206 break;
6207 default: {
6208 const char *t;
6209 if (PL_oldoldbufptr == PL_last_lop)
6210 PL_lex_brackstack[PL_lex_brackets++] = XTERM;
6211 else
6212 PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
6213 PL_lex_allbrackets++;
6214 s = skipspace(s);
6215 if (*s == '}') {
6216 if (PL_expect == XREF && PL_lex_state == LEX_INTERPNORMAL) {
6217 PL_expect = XTERM;
6218 /* This hack is to get the ${} in the message. */
6219 PL_bufptr = s+1;
6220 yyerror("syntax error");
6221 yyquit();
6222 break;
6223 }
6224 OPERATOR(HASHBRACK);
6225 }
6226 if (PL_expect == XREF && PL_oldoldbufptr != PL_last_lop) {
6227 /* ${...} or @{...} etc., but not print {...}
6228 * Skip the disambiguation and treat this as a block.
6229 */
6230 goto block_expectation;
6231 }
6232 /* This hack serves to disambiguate a pair of curlies
6233 * as being a block or an anon hash. Normally, expectation
6234 * determines that, but in cases where we're not in a
6235 * position to expect anything in particular (like inside
6236 * eval"") we have to resolve the ambiguity. This code
6237 * covers the case where the first term in the curlies is a
6238 * quoted string. Most other cases need to be explicitly
6239 * disambiguated by prepending a "+" before the opening
6240 * curly in order to force resolution as an anon hash.
6241 *
6242 * XXX should probably propagate the outer expectation
6243 * into eval"" to rely less on this hack, but that could
6244 * potentially break current behavior of eval"".
6245 * GSAR 97-07-21
6246 */
6247 t = s;
6248 if (*s == '\'' || *s == '"' || *s == '`') {
6249 /* common case: get past first string, handling escapes */
6250 for (t++; t < PL_bufend && *t != *s;)
6251 if (*t++ == '\\')
6252 t++;
6253 t++;
6254 }
6255 else if (*s == 'q') {
6256 if (++t < PL_bufend
6257 && (!isWORDCHAR(*t)
6258 || ((*t == 'q' || *t == 'x') && ++t < PL_bufend
6259 && !isWORDCHAR(*t))))
6260 {
6261 /* skip q//-like construct */
6262 const char *tmps;
6263 char open, close, term;
6264 I32 brackets = 1;
6265
6266 while (t < PL_bufend && isSPACE(*t))
6267 t++;
6268 /* check for q => */
6269 if (t+1 < PL_bufend && t[0] == '=' && t[1] == '>') {
6270 OPERATOR(HASHBRACK);
6271 }
6272 term = *t;
6273 open = term;
6274 if (term && (tmps = memCHRs("([{< )]}> )]}>",term)))
6275 term = tmps[5];
6276 close = term;
6277 if (open == close)
6278 for (t++; t < PL_bufend; t++) {
6279 if (*t == '\\' && t+1 < PL_bufend && open != '\\')
6280 t++;
6281 else if (*t == open)
6282 break;
6283 }
6284 else {
6285 for (t++; t < PL_bufend; t++) {
6286 if (*t == '\\' && t+1 < PL_bufend)
6287 t++;
6288 else if (*t == close && --brackets <= 0)
6289 break;
6290 else if (*t == open)
6291 brackets++;
6292 }
6293 }
6294 t++;
6295 }
6296 else
6297 /* skip plain q word */
6298 while ( t < PL_bufend
6299 && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
6300 {
6301 t += UTF ? UTF8SKIP(t) : 1;
6302 }
6303 }
6304 else if (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) {
6305 t += UTF ? UTF8SKIP(t) : 1;
6306 while ( t < PL_bufend
6307 && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
6308 {
6309 t += UTF ? UTF8SKIP(t) : 1;
6310 }
6311 }
6312 while (t < PL_bufend && isSPACE(*t))
6313 t++;
6314 /* if comma follows first term, call it an anon hash */
6315 /* XXX it could be a comma expression with loop modifiers */
6316 if (t < PL_bufend && ((*t == ',' && (*s == 'q' || !isLOWER(*s)))
6317 || (*t == '=' && t[1] == '>')))
6318 OPERATOR(HASHBRACK);
6319 if (PL_expect == XREF) {
6320 block_expectation:
6321 /* If there is an opening brace or 'sub:', treat it
6322 as a term to make ${{...}}{k} and &{sub:attr...}
6323 dwim. Otherwise, treat it as a statement, so
6324 map {no strict; ...} works.
6325 */
6326 s = skipspace(s);
6327 if (*s == '{') {
6328 PL_expect = XTERM;
6329 break;
6330 }
6331 if (memBEGINs(s, (STRLEN) (PL_bufend - s), "sub")) {
6332 PL_bufptr = s;
6333 d = s + 3;
6334 d = skipspace(d);
6335 s = PL_bufptr;
6336 if (*d == ':') {
6337 PL_expect = XTERM;
6338 break;
6339 }
6340 }
6341 PL_expect = XSTATE;
6342 }
6343 else {
6344 PL_lex_brackstack[PL_lex_brackets-1] = XSTATE;
6345 PL_expect = XSTATE;
6346 }
6347 }
6348 break;
6349 }
6350
6351 pl_yylval.ival = CopLINE(PL_curcop);
6352 PL_copline = NOLINE; /* invalidate current command line number */
6353 TOKEN(formbrack ? PERLY_EQUAL_SIGN : PERLY_BRACE_OPEN);
6354 }
6355
6356 static int
yyl_rightcurly(pTHX_ char * s,const U8 formbrack)6357 yyl_rightcurly(pTHX_ char *s, const U8 formbrack)
6358 {
6359 assert(s != PL_bufend);
6360 s++;
6361
6362 if (PL_lex_brackets <= 0)
6363 /* diag_listed_as: Unmatched right %s bracket */
6364 yyerror("Unmatched right curly bracket");
6365 else
6366 PL_expect = (expectation)PL_lex_brackstack[--PL_lex_brackets];
6367
6368 PL_lex_allbrackets--;
6369
6370 if (PL_lex_state == LEX_INTERPNORMAL) {
6371 if (PL_lex_brackets == 0) {
6372 if (PL_expect & XFAKEBRACK) {
6373 PL_expect &= XENUMMASK;
6374 PL_lex_state = LEX_INTERPEND;
6375 PL_bufptr = s;
6376 return yylex(); /* ignore fake brackets */
6377 }
6378 if (PL_lex_inwhat == OP_SUBST && PL_lex_repl == PL_linestr
6379 && SvEVALED(PL_lex_repl))
6380 PL_lex_state = LEX_INTERPEND;
6381 else if (*s == '-' && s[1] == '>')
6382 PL_lex_state = LEX_INTERPENDMAYBE;
6383 else if (*s != '[' && *s != '{')
6384 PL_lex_state = LEX_INTERPEND;
6385 }
6386 }
6387
6388 if (PL_expect & XFAKEBRACK) {
6389 PL_expect &= XENUMMASK;
6390 PL_bufptr = s;
6391 return yylex(); /* ignore fake brackets */
6392 }
6393
6394 force_next(formbrack ? PERLY_DOT : PERLY_BRACE_CLOSE);
6395 if (formbrack) LEAVE_with_name("lex_format");
6396 if (formbrack == 2) { /* means . where arguments were expected */
6397 force_next(PERLY_SEMICOLON);
6398 TOKEN(FORMRBRACK);
6399 }
6400
6401 TOKEN(PERLY_SEMICOLON);
6402 }
6403
6404 static int
yyl_ampersand(pTHX_ char * s)6405 yyl_ampersand(pTHX_ char *s)
6406 {
6407 if (PL_expect == XPOSTDEREF)
6408 POSTDEREF(PERLY_AMPERSAND);
6409
6410 s++;
6411 if (*s++ == '&') {
6412 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6413 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_LOGIC)) {
6414 s -= 2;
6415 TOKEN(0);
6416 }
6417 AOPERATOR(ANDAND);
6418 }
6419 s--;
6420
6421 if (PL_expect == XOPERATOR) {
6422 char *d;
6423 bool bof;
6424 if ( PL_bufptr == PL_linestart
6425 && ckWARN(WARN_SEMICOLON)
6426 && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
6427 {
6428 CopLINE_dec(PL_curcop);
6429 Perl_warner(aTHX_ packWARN(WARN_SEMICOLON), "%s", PL_warn_nosemi);
6430 CopLINE_inc(PL_curcop);
6431 }
6432 d = s;
6433 if ((bof = FEATURE_BITWISE_IS_ENABLED) && *s == '.')
6434 s++;
6435 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6436 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_BITWISE)) {
6437 s = d;
6438 s--;
6439 TOKEN(0);
6440 }
6441 if (d == s)
6442 BAop(bof ? OP_NBIT_AND : OP_BIT_AND);
6443 else
6444 BAop(OP_SBIT_AND);
6445 }
6446
6447 PL_tokenbuf[0] = '&';
6448 s = scan_ident(s - 1, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, TRUE);
6449 pl_yylval.ival = (OPpENTERSUB_AMPER<<8);
6450
6451 if (PL_tokenbuf[1])
6452 force_ident_maybe_lex('&');
6453 else
6454 PREREF(PERLY_AMPERSAND);
6455
6456 TERM(PERLY_AMPERSAND);
6457 }
6458
6459 static int
yyl_verticalbar(pTHX_ char * s)6460 yyl_verticalbar(pTHX_ char *s)
6461 {
6462 char *d;
6463 bool bof;
6464
6465 s++;
6466 if (*s++ == '|') {
6467 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6468 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_LOGIC)) {
6469 s -= 2;
6470 TOKEN(0);
6471 }
6472 AOPERATOR(OROR);
6473 }
6474
6475 s--;
6476 d = s;
6477 if ((bof = FEATURE_BITWISE_IS_ENABLED) && *s == '.')
6478 s++;
6479
6480 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6481 (*s == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_BITWISE)) {
6482 s = d - 1;
6483 TOKEN(0);
6484 }
6485
6486 BOop(bof ? s == d ? OP_NBIT_OR : OP_SBIT_OR : OP_BIT_OR);
6487 }
6488
6489 static int
yyl_bang(pTHX_ char * s)6490 yyl_bang(pTHX_ char *s)
6491 {
6492 const char tmp = *s++;
6493 if (tmp == '=') {
6494 /* was this !=~ where !~ was meant?
6495 * warn on m:!=~\s+([/?]|[msy]\W|tr\W): */
6496
6497 if (*s == '~' && ckWARN(WARN_SYNTAX)) {
6498 const char *t = s+1;
6499
6500 while (t < PL_bufend && isSPACE(*t))
6501 ++t;
6502
6503 if (*t == '/' || *t == '?'
6504 || ((*t == 'm' || *t == 's' || *t == 'y')
6505 && !isWORDCHAR(t[1]))
6506 || (*t == 't' && t[1] == 'r' && !isWORDCHAR(t[2])))
6507 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
6508 "!=~ should be !~");
6509 }
6510
6511 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE) {
6512 s -= 2;
6513 TOKEN(0);
6514 }
6515
6516 ChEop(OP_NE);
6517 }
6518
6519 if (tmp == '~')
6520 PMop(OP_NOT);
6521
6522 s--;
6523 OPERATOR(PERLY_EXCLAMATION_MARK);
6524 }
6525
6526 static int
yyl_snail(pTHX_ char * s)6527 yyl_snail(pTHX_ char *s)
6528 {
6529 if (PL_expect == XPOSTDEREF)
6530 POSTDEREF(PERLY_SNAIL);
6531 PL_tokenbuf[0] = '@';
6532 s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE);
6533 if (PL_expect == XOPERATOR) {
6534 char *d = s;
6535 if (PL_bufptr > s) {
6536 d = PL_bufptr-1;
6537 PL_bufptr = PL_oldbufptr;
6538 }
6539 no_op("Array", d);
6540 }
6541 pl_yylval.ival = 0;
6542 if (!PL_tokenbuf[1]) {
6543 PREREF(PERLY_SNAIL);
6544 }
6545 if (PL_lex_state == LEX_NORMAL || PL_lex_brackets)
6546 s = skipspace(s);
6547 if ( (PL_expect != XREF || PL_oldoldbufptr == PL_last_lop)
6548 && intuit_more(s, PL_bufend))
6549 {
6550 if (*s == '{')
6551 PL_tokenbuf[0] = '%';
6552
6553 /* Warn about @ where they meant $. */
6554 if (*s == '[' || *s == '{') {
6555 if (ckWARN(WARN_SYNTAX)) {
6556 S_check_scalar_slice(aTHX_ s);
6557 }
6558 }
6559 }
6560 PL_expect = XOPERATOR;
6561 force_ident_maybe_lex('@');
6562 TERM(PERLY_SNAIL);
6563 }
6564
6565 static int
yyl_slash(pTHX_ char * s)6566 yyl_slash(pTHX_ char *s)
6567 {
6568 if ((PL_expect == XOPERATOR || PL_expect == XTERMORDORDOR) && s[1] == '/') {
6569 if (!PL_lex_allbrackets && PL_lex_fakeeof >=
6570 (s[2] == '=' ? LEX_FAKEEOF_ASSIGN : LEX_FAKEEOF_LOGIC))
6571 TOKEN(0);
6572 s += 2;
6573 AOPERATOR(DORDOR);
6574 }
6575 else if (PL_expect == XOPERATOR) {
6576 s++;
6577 if (*s == '=' && !PL_lex_allbrackets
6578 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
6579 {
6580 s--;
6581 TOKEN(0);
6582 }
6583 Mop(OP_DIVIDE);
6584 }
6585 else {
6586 /* Disable warning on "study /blah/" */
6587 if ( PL_oldoldbufptr == PL_last_uni
6588 && ( *PL_last_uni != 's' || s - PL_last_uni < 5
6589 || memNE(PL_last_uni, "study", 5)
6590 || isWORDCHAR_lazy_if_safe(PL_last_uni+5, PL_bufend, UTF)
6591 ))
6592 check_uni();
6593 s = scan_pat(s,OP_MATCH);
6594 TERM(sublex_start());
6595 }
6596 }
6597
6598 static int
yyl_leftsquare(pTHX_ char * s)6599 yyl_leftsquare(pTHX_ char *s)
6600 {
6601 if (PL_lex_brackets > 100)
6602 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
6603 PL_lex_brackstack[PL_lex_brackets++] = 0;
6604 PL_lex_allbrackets++;
6605 s++;
6606 OPERATOR(PERLY_BRACKET_OPEN);
6607 }
6608
6609 static int
yyl_rightsquare(pTHX_ char * s)6610 yyl_rightsquare(pTHX_ char *s)
6611 {
6612 if (PL_lex_brackets && PL_lex_brackstack[PL_lex_brackets-1] == XFAKEEOF)
6613 TOKEN(0);
6614 s++;
6615 if (PL_lex_brackets <= 0)
6616 /* diag_listed_as: Unmatched right %s bracket */
6617 yyerror("Unmatched right square bracket");
6618 else
6619 --PL_lex_brackets;
6620 PL_lex_allbrackets--;
6621 if (PL_lex_state == LEX_INTERPNORMAL) {
6622 if (PL_lex_brackets == 0) {
6623 if (*s == '-' && s[1] == '>')
6624 PL_lex_state = LEX_INTERPENDMAYBE;
6625 else if (*s != '[' && *s != '{')
6626 PL_lex_state = LEX_INTERPEND;
6627 }
6628 }
6629 TERM(PERLY_BRACKET_CLOSE);
6630 }
6631
6632 static int
yyl_tilde(pTHX_ char * s)6633 yyl_tilde(pTHX_ char *s)
6634 {
6635 bool bof;
6636 if (s[1] == '~' && (PL_expect == XOPERATOR || PL_expect == XTERMORDORDOR)) {
6637 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
6638 TOKEN(0);
6639 s += 2;
6640 Perl_ck_warner_d(aTHX_
6641 packWARN(WARN_DEPRECATED__SMARTMATCH),
6642 "Smartmatch is deprecated");
6643 NCEop(OP_SMARTMATCH);
6644 }
6645 s++;
6646 if ((bof = FEATURE_BITWISE_IS_ENABLED) && *s == '.') {
6647 s++;
6648 BCop(OP_SCOMPLEMENT);
6649 }
6650 BCop(bof ? OP_NCOMPLEMENT : OP_COMPLEMENT);
6651 }
6652
6653 static int
yyl_leftparen(pTHX_ char * s)6654 yyl_leftparen(pTHX_ char *s)
6655 {
6656 if (PL_last_lop == PL_oldoldbufptr || PL_last_uni == PL_oldoldbufptr)
6657 PL_oldbufptr = PL_oldoldbufptr; /* allow print(STDOUT 123) */
6658 else
6659 PL_expect = XTERM;
6660 s = skipspace(s);
6661 PL_lex_allbrackets++;
6662 TOKEN(PERLY_PAREN_OPEN);
6663 }
6664
6665 static int
yyl_rightparen(pTHX_ char * s)6666 yyl_rightparen(pTHX_ char *s)
6667 {
6668 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_CLOSING)
6669 TOKEN(0);
6670 s++;
6671 PL_lex_allbrackets--;
6672 s = skipspace(s);
6673 if (*s == '{')
6674 PREBLOCK(PERLY_PAREN_CLOSE);
6675 TERM(PERLY_PAREN_CLOSE);
6676 }
6677
6678 static int
yyl_leftpointy(pTHX_ char * s)6679 yyl_leftpointy(pTHX_ char *s)
6680 {
6681 char tmp;
6682
6683 if (PL_expect != XOPERATOR) {
6684 if (s[1] != '<' && !memchr(s,'>', PL_bufend - s))
6685 check_uni();
6686 if (s[1] == '<' && s[2] != '>')
6687 s = scan_heredoc(s);
6688 else
6689 s = scan_inputsymbol(s);
6690 PL_expect = XOPERATOR;
6691 TOKEN(sublex_start());
6692 }
6693
6694 s++;
6695
6696 tmp = *s++;
6697 if (tmp == '<') {
6698 if (*s == '=' && !PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN) {
6699 s -= 2;
6700 TOKEN(0);
6701 }
6702 SHop(OP_LEFT_SHIFT);
6703 }
6704 if (tmp == '=') {
6705 tmp = *s++;
6706 if (tmp == '>') {
6707 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE) {
6708 s -= 3;
6709 TOKEN(0);
6710 }
6711 NCEop(OP_NCMP);
6712 }
6713 s--;
6714 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE) {
6715 s -= 2;
6716 TOKEN(0);
6717 }
6718 ChRop(OP_LE);
6719 }
6720
6721 s--;
6722 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE) {
6723 s--;
6724 TOKEN(0);
6725 }
6726
6727 ChRop(OP_LT);
6728 }
6729
6730 static int
yyl_rightpointy(pTHX_ char * s)6731 yyl_rightpointy(pTHX_ char *s)
6732 {
6733 const char tmp = *s++;
6734
6735 if (tmp == '>') {
6736 if (*s == '=' && !PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN) {
6737 s -= 2;
6738 TOKEN(0);
6739 }
6740 SHop(OP_RIGHT_SHIFT);
6741 }
6742 else if (tmp == '=') {
6743 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE) {
6744 s -= 2;
6745 TOKEN(0);
6746 }
6747 ChRop(OP_GE);
6748 }
6749
6750 s--;
6751 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE) {
6752 s--;
6753 TOKEN(0);
6754 }
6755
6756 ChRop(OP_GT);
6757 }
6758
6759 static int
yyl_sglquote(pTHX_ char * s)6760 yyl_sglquote(pTHX_ char *s)
6761 {
6762 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
6763 if (!s)
6764 missingterm(NULL, 0);
6765 COPLINE_SET_FROM_MULTI_END;
6766 DEBUG_T( { printbuf("### Saw string before %s\n", s); } );
6767 if (PL_expect == XOPERATOR) {
6768 no_op("String",s);
6769 }
6770 pl_yylval.ival = OP_CONST;
6771 TERM(sublex_start());
6772 }
6773
6774 static int
yyl_dblquote(pTHX_ char * s)6775 yyl_dblquote(pTHX_ char *s)
6776 {
6777 char *d;
6778 STRLEN len;
6779 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
6780 DEBUG_T( {
6781 if (s)
6782 printbuf("### Saw string before %s\n", s);
6783 else
6784 PerlIO_printf(Perl_debug_log,
6785 "### Saw unterminated string\n");
6786 } );
6787 if (PL_expect == XOPERATOR) {
6788 no_op("String",s);
6789 }
6790 if (!s)
6791 missingterm(NULL, 0);
6792 pl_yylval.ival = OP_CONST;
6793 /* FIXME. I think that this can be const if char *d is replaced by
6794 more localised variables. */
6795 for (d = SvPV(PL_lex_stuff, len); len; len--, d++) {
6796 if (*d == '$' || *d == '@' || *d == '\\' || !UTF8_IS_INVARIANT((U8)*d)) {
6797 pl_yylval.ival = OP_STRINGIFY;
6798 break;
6799 }
6800 }
6801 if (pl_yylval.ival == OP_CONST)
6802 COPLINE_SET_FROM_MULTI_END;
6803 TERM(sublex_start());
6804 }
6805
6806 static int
yyl_backtick(pTHX_ char * s)6807 yyl_backtick(pTHX_ char *s)
6808 {
6809 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
6810 DEBUG_T( {
6811 if (s)
6812 printbuf("### Saw backtick string before %s\n", s);
6813 else
6814 PerlIO_printf(Perl_debug_log,
6815 "### Saw unterminated backtick string\n");
6816 } );
6817 if (PL_expect == XOPERATOR)
6818 no_op("Backticks",s);
6819 if (!s)
6820 missingterm(NULL, 0);
6821 pl_yylval.ival = OP_BACKTICK;
6822 TERM(sublex_start());
6823 }
6824
6825 static int
yyl_backslash(pTHX_ char * s)6826 yyl_backslash(pTHX_ char *s)
6827 {
6828 if (PL_lex_inwhat == OP_SUBST && PL_lex_repl == PL_linestr && isDIGIT(*s))
6829 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),"Can't use \\%c to mean $%c in expression",
6830 *s, *s);
6831 if (PL_expect == XOPERATOR)
6832 no_op("Backslash",s);
6833 OPERATOR(REFGEN);
6834 }
6835
6836 static void
yyl_data_handle(pTHX)6837 yyl_data_handle(pTHX)
6838 {
6839 HV * const stash = PL_tokenbuf[2] == 'D' && PL_curstash
6840 ? PL_curstash
6841 : PL_defstash;
6842 GV *gv = (GV *)*hv_fetchs(stash, "DATA", 1);
6843
6844 if (!isGV(gv))
6845 gv_init(gv,stash,"DATA",4,0);
6846
6847 GvMULTI_on(gv);
6848 if (!GvIO(gv))
6849 GvIOp(gv) = newIO();
6850 IoIFP(GvIOp(gv)) = PL_rsfp;
6851
6852 /* Mark this internal pseudo-handle as clean */
6853 IoFLAGS(GvIOp(gv)) |= IOf_UNTAINT;
6854 if ((PerlIO*)PL_rsfp == PerlIO_stdin())
6855 IoTYPE(GvIOp(gv)) = IoTYPE_STD;
6856 else
6857 IoTYPE(GvIOp(gv)) = IoTYPE_RDONLY;
6858
6859 #if defined(WIN32) && !defined(PERL_TEXTMODE_SCRIPTS)
6860 /* if the script was opened in binmode, we need to revert
6861 * it to text mode for compatibility; but only iff it has CRs
6862 * XXX this is a questionable hack at best. */
6863 if (PL_bufend-PL_bufptr > 2
6864 && PL_bufend[-1] == '\n' && PL_bufend[-2] == '\r')
6865 {
6866 Off_t loc = 0;
6867 if (IoTYPE(GvIOp(gv)) == IoTYPE_RDONLY) {
6868 loc = PerlIO_tell(PL_rsfp);
6869 (void)PerlIO_seek(PL_rsfp, 0L, 0);
6870 }
6871 if (PerlLIO_setmode(PerlIO_fileno(PL_rsfp), O_TEXT) != -1) {
6872 if (loc > 0)
6873 PerlIO_seek(PL_rsfp, loc, 0);
6874 }
6875 }
6876 #endif
6877
6878 #ifdef PERLIO_LAYERS
6879 if (!IN_BYTES) {
6880 if (UTF)
6881 PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, ":utf8");
6882 }
6883 #endif
6884
6885 PL_rsfp = NULL;
6886 }
6887
6888 PERL_STATIC_NO_RET void yyl_croak_unrecognised(pTHX_ char*)
6889 __attribute__noreturn__;
6890
6891 PERL_STATIC_NO_RET void
yyl_croak_unrecognised(pTHX_ char * s)6892 yyl_croak_unrecognised(pTHX_ char *s)
6893 {
6894 SV *dsv = newSVpvs_flags("", SVs_TEMP);
6895 const char *c;
6896 char *d;
6897 STRLEN len;
6898
6899 if (UTF) {
6900 STRLEN skiplen = UTF8SKIP(s);
6901 STRLEN stravail = PL_bufend - s;
6902 c = sv_uni_display(dsv, newSVpvn_flags(s,
6903 skiplen > stravail ? stravail : skiplen,
6904 SVs_TEMP | SVf_UTF8),
6905 10, UNI_DISPLAY_ISPRINT);
6906 }
6907 else {
6908 c = Perl_form(aTHX_ "\\x%02X", (unsigned char)*s);
6909 }
6910
6911 if (s >= PL_linestart) {
6912 d = PL_linestart;
6913 }
6914 else {
6915 /* somehow (probably due to a parse failure), PL_linestart has advanced
6916 * pass PL_bufptr, get a reasonable beginning of line
6917 */
6918 d = s;
6919 while (d > SvPVX(PL_linestr) && d[-1] && d[-1] != '\n')
6920 --d;
6921 }
6922 len = UTF ? Perl_utf8_length(aTHX_ (U8 *) d, (U8 *) s) : (STRLEN) (s - d);
6923 if (len > UNRECOGNIZED_PRECEDE_COUNT) {
6924 d = UTF ? (char *) utf8_hop_back((U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT, (U8 *)d) : s - UNRECOGNIZED_PRECEDE_COUNT;
6925 }
6926
6927 Perl_croak(aTHX_ "Unrecognized character %s; marked by <-- HERE after %" UTF8f "<-- HERE near column %d", c,
6928 UTF8fARG(UTF, (s - d), d),
6929 (int) len + 1);
6930 }
6931
6932 static int
yyl_require(pTHX_ char * s,I32 orig_keyword)6933 yyl_require(pTHX_ char *s, I32 orig_keyword)
6934 {
6935 s = skipspace(s);
6936 if (isDIGIT(*s)) {
6937 s = force_version(s, FALSE);
6938 }
6939 else if (*s != 'v' || !isDIGIT(s[1])
6940 || (s = force_version(s, TRUE), *s == 'v'))
6941 {
6942 *PL_tokenbuf = '\0';
6943 s = force_word(s,BAREWORD,TRUE,TRUE);
6944 if (isIDFIRST_lazy_if_safe(PL_tokenbuf,
6945 PL_tokenbuf + sizeof(PL_tokenbuf),
6946 UTF))
6947 {
6948 gv_stashpvn(PL_tokenbuf, strlen(PL_tokenbuf),
6949 GV_ADD | (UTF ? SVf_UTF8 : 0));
6950 }
6951 else if (*s == '<')
6952 yyerror("<> at require-statement should be quotes");
6953 }
6954
6955 if (orig_keyword == KEY_require)
6956 pl_yylval.ival = 1;
6957 else
6958 pl_yylval.ival = 0;
6959
6960 PL_expect = PL_nexttoke ? XOPERATOR : XTERM;
6961 PL_bufptr = s;
6962 PL_last_uni = PL_oldbufptr;
6963 PL_last_lop_op = OP_REQUIRE;
6964 s = skipspace(s);
6965 return REPORT( (int)KW_REQUIRE );
6966 }
6967
6968 static int
yyl_foreach(pTHX_ char * s)6969 yyl_foreach(pTHX_ char *s)
6970 {
6971 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
6972 return REPORT(0);
6973 pl_yylval.ival = CopLINE(PL_curcop);
6974 s = skipspace(s);
6975 if (PL_expect == XSTATE && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
6976 char *p = s;
6977 SSize_t s_off = s - SvPVX(PL_linestr);
6978 bool paren_is_valid = FALSE;
6979 bool maybe_package = FALSE;
6980 bool saw_core = FALSE;
6981 bool core_valid = FALSE;
6982
6983 if (UNLIKELY(memBEGINPs(p, (STRLEN) (PL_bufend - p), "CORE::"))) {
6984 saw_core = TRUE;
6985 p += 6;
6986 }
6987 if (LIKELY(memBEGINPs(p, (STRLEN) (PL_bufend - p), "my"))) {
6988 core_valid = TRUE;
6989 paren_is_valid = TRUE;
6990 if (isSPACE(p[2])) {
6991 p = skipspace(p + 3);
6992 maybe_package = TRUE;
6993 }
6994 else {
6995 p += 2;
6996 }
6997 }
6998 else if (memBEGINPs(p, (STRLEN) (PL_bufend - p), "our")) {
6999 core_valid = TRUE;
7000 if (isSPACE(p[3])) {
7001 p = skipspace(p + 4);
7002 maybe_package = TRUE;
7003 }
7004 else {
7005 p += 3;
7006 }
7007 }
7008 else if (memBEGINPs(p, (STRLEN) (PL_bufend - p), "state")) {
7009 core_valid = TRUE;
7010 if (isSPACE(p[5])) {
7011 p = skipspace(p + 6);
7012 }
7013 else {
7014 p += 5;
7015 }
7016 }
7017 if (saw_core && !core_valid) {
7018 Perl_croak(aTHX_ "Missing $ on loop variable");
7019 }
7020
7021 if (maybe_package && !saw_core) {
7022 /* skip optional package name, as in "for my abc $x (..)" */
7023 if (UNLIKELY(isIDFIRST_lazy_if_safe(p, PL_bufend, UTF))) {
7024 STRLEN len;
7025 p = scan_word6(p, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len, TRUE);
7026 p = skipspace(p);
7027 paren_is_valid = FALSE;
7028 }
7029 }
7030
7031 if (UNLIKELY(paren_is_valid && *p == '(')) {
7032 Perl_ck_warner_d(aTHX_
7033 packWARN(WARN_EXPERIMENTAL__FOR_LIST),
7034 "for my (...) is experimental");
7035 }
7036 else if (UNLIKELY(*p != '$' && *p != '\\')) {
7037 /* "for myfoo (" will end up here, but with p pointing at the 'f' */
7038 Perl_croak(aTHX_ "Missing $ on loop variable");
7039 }
7040 /* The buffer may have been reallocated, update s */
7041 s = SvPVX(PL_linestr) + s_off;
7042 }
7043 OPERATOR(KW_FOR);
7044 }
7045
7046 static int
yyl_do(pTHX_ char * s,I32 orig_keyword)7047 yyl_do(pTHX_ char *s, I32 orig_keyword)
7048 {
7049 s = skipspace(s);
7050 if (*s == '{')
7051 PRETERMBLOCK(KW_DO);
7052 if (*s != '\'') {
7053 char *d;
7054 STRLEN len;
7055 *PL_tokenbuf = '&';
7056 d = scan_word6(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
7057 1, &len, TRUE);
7058 if (len && memNEs(PL_tokenbuf+1, len, "CORE")
7059 && !keyword(PL_tokenbuf + 1, len, 0)) {
7060 SSize_t off = s-SvPVX(PL_linestr);
7061 d = skipspace(d);
7062 s = SvPVX(PL_linestr)+off;
7063 if (*d == '(') {
7064 force_ident_maybe_lex('&');
7065 s = d;
7066 }
7067 }
7068 }
7069 if (orig_keyword == KEY_do)
7070 pl_yylval.ival = 1;
7071 else
7072 pl_yylval.ival = 0;
7073 OPERATOR(KW_DO);
7074 }
7075
7076 static int
yyl_my(pTHX_ char * s,I32 my)7077 yyl_my(pTHX_ char *s, I32 my)
7078 {
7079 if (PL_in_my) {
7080 PL_bufptr = s;
7081 yyerror(Perl_form(aTHX_
7082 "Can't redeclare \"%s\" in \"%s\"",
7083 my == KEY_my ? "my" :
7084 my == KEY_state ? "state" : "our",
7085 PL_in_my == KEY_my ? "my" :
7086 PL_in_my == KEY_state ? "state" : "our"));
7087 }
7088 PL_in_my = (U16)my;
7089 s = skipspace(s);
7090 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
7091 STRLEN len;
7092 s = scan_word6(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len, TRUE);
7093 if (memEQs(PL_tokenbuf, len, "sub"))
7094 return yyl_sub(aTHX_ s, my);
7095 PL_in_my_stash = find_in_my_stash(PL_tokenbuf, len);
7096 if (!PL_in_my_stash) {
7097 char tmpbuf[1024];
7098 int i;
7099 PL_bufptr = s;
7100 i = my_snprintf(tmpbuf, sizeof(tmpbuf), "No such class %.1000s", PL_tokenbuf);
7101 PERL_MY_SNPRINTF_POST_GUARD(i, sizeof(tmpbuf));
7102 yyerror_pv(tmpbuf, UTF ? SVf_UTF8 : 0);
7103 }
7104 }
7105 else if (*s == '\\') {
7106 if (!FEATURE_MYREF_IS_ENABLED)
7107 Perl_croak(aTHX_ "The experimental declared_refs "
7108 "feature is not enabled");
7109 Perl_ck_warner_d(aTHX_
7110 packWARN(WARN_EXPERIMENTAL__DECLARED_REFS),
7111 "Declaring references is experimental");
7112 }
7113 OPERATOR(KW_MY);
7114 }
7115
7116 static int yyl_try(pTHX_ char*);
7117
7118 static bool
yyl_eol_needs_semicolon(pTHX_ char ** ps)7119 yyl_eol_needs_semicolon(pTHX_ char **ps)
7120 {
7121 char *s = *ps;
7122 if (PL_lex_state != LEX_NORMAL
7123 || (PL_in_eval && !PL_rsfp && !PL_parser->filtered))
7124 {
7125 const bool in_comment = *s == '#';
7126 char *d;
7127 if (*s == '#' && s == PL_linestart && PL_in_eval
7128 && !PL_rsfp && !PL_parser->filtered) {
7129 /* handle eval qq[#line 1 "foo"\n ...] */
7130 CopLINE_dec(PL_curcop);
7131 incline(s, PL_bufend);
7132 }
7133 d = s;
7134 while (d < PL_bufend && *d != '\n')
7135 d++;
7136 if (d < PL_bufend)
7137 d++;
7138 s = d;
7139 if (in_comment && d == PL_bufend
7140 && PL_lex_state == LEX_INTERPNORMAL
7141 && PL_lex_inwhat == OP_SUBST && PL_lex_repl == PL_linestr
7142 && SvEVALED(PL_lex_repl) && d[-1] == '}') s--;
7143 else
7144 incline(s, PL_bufend);
7145 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
7146 PL_lex_state = LEX_FORMLINE;
7147 force_next(FORMRBRACK);
7148 *ps = s;
7149 return TRUE;
7150 }
7151 }
7152 else {
7153 while (s < PL_bufend && *s != '\n')
7154 s++;
7155 if (s < PL_bufend) {
7156 s++;
7157 if (s < PL_bufend)
7158 incline(s, PL_bufend);
7159 }
7160 }
7161 *ps = s;
7162 return FALSE;
7163 }
7164
7165 static int
yyl_fake_eof(pTHX_ U32 fake_eof,bool bof,char * s)7166 yyl_fake_eof(pTHX_ U32 fake_eof, bool bof, char *s)
7167 {
7168 char *d;
7169
7170 goto start;
7171
7172 do {
7173 fake_eof = 0;
7174 bof = cBOOL(PL_rsfp);
7175 start:
7176
7177 PL_bufptr = PL_bufend;
7178 COPLINE_INC_WITH_HERELINES;
7179 if (!lex_next_chunk(fake_eof)) {
7180 CopLINE_dec(PL_curcop);
7181 s = PL_bufptr;
7182 TOKEN(PERLY_SEMICOLON); /* not infinite loop because rsfp is NULL now */
7183 }
7184 CopLINE_dec(PL_curcop);
7185 s = PL_bufptr;
7186 /* If it looks like the start of a BOM or raw UTF-16,
7187 * check if it in fact is. */
7188 if (bof && PL_rsfp
7189 && ( *s == 0
7190 || *(U8*)s == BOM_UTF8_FIRST_BYTE
7191 || *(U8*)s >= 0xFE
7192 || s[1] == 0))
7193 {
7194 Off_t offset = (IV)PerlIO_tell(PL_rsfp);
7195 bof = (offset == (Off_t)SvCUR(PL_linestr));
7196 #if defined(PERLIO_USING_CRLF) && defined(PERL_TEXTMODE_SCRIPTS)
7197 /* offset may include swallowed CR */
7198 if (!bof)
7199 bof = (offset == (Off_t)SvCUR(PL_linestr)+1);
7200 #endif
7201 if (bof) {
7202 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
7203 s = swallow_bom((U8*)s);
7204 }
7205 }
7206 if (PL_parser->in_pod) {
7207 /* Incest with pod. */
7208 if ( memBEGINPs(s, (STRLEN) (PL_bufend - s), "=cut")
7209 && !isALPHA(s[4]))
7210 {
7211 SvPVCLEAR(PL_linestr);
7212 PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
7213 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
7214 PL_last_lop = PL_last_uni = NULL;
7215 PL_parser->in_pod = 0;
7216 }
7217 }
7218 if (PL_rsfp || PL_parser->filtered)
7219 incline(s, PL_bufend);
7220 } while (PL_parser->in_pod);
7221
7222 PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = s;
7223 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
7224 PL_last_lop = PL_last_uni = NULL;
7225 if (CopLINE(PL_curcop) == 1) {
7226 while (s < PL_bufend && isSPACE(*s))
7227 s++;
7228 if (*s == ':' && s[1] != ':') /* for csh execing sh scripts */
7229 s++;
7230 d = NULL;
7231 if (!PL_in_eval) {
7232 if (*s == '#' && *(s+1) == '!')
7233 d = s + 2;
7234 #ifdef ALTERNATE_SHEBANG
7235 else {
7236 static char const as[] = ALTERNATE_SHEBANG;
7237 if (*s == as[0] && strnEQ(s, as, sizeof(as) - 1))
7238 d = s + (sizeof(as) - 1);
7239 }
7240 #endif /* ALTERNATE_SHEBANG */
7241 }
7242 if (d) {
7243 char *ipath;
7244 char *ipathend;
7245
7246 while (isSPACE(*d))
7247 d++;
7248 ipath = d;
7249 while (*d && !isSPACE(*d))
7250 d++;
7251 ipathend = d;
7252
7253 #ifdef ARG_ZERO_IS_SCRIPT
7254 if (ipathend > ipath) {
7255 /*
7256 * HP-UX (at least) sets argv[0] to the script name,
7257 * which makes $^X incorrect. And Digital UNIX and Linux,
7258 * at least, set argv[0] to the basename of the Perl
7259 * interpreter. So, having found "#!", we'll set it right.
7260 */
7261 SV* copfilesv = CopFILESV(PL_curcop);
7262 if (copfilesv) {
7263 SV * const x =
7264 GvSV(gv_fetchpvs("\030", GV_ADD|GV_NOTQUAL,
7265 SVt_PV)); /* $^X */
7266 assert(SvPOK(x) || SvGMAGICAL(x));
7267 if (sv_eq(x, copfilesv)) {
7268 sv_setpvn(x, ipath, ipathend - ipath);
7269 SvSETMAGIC(x);
7270 }
7271 else {
7272 STRLEN blen;
7273 STRLEN llen;
7274 const char *bstart = SvPV_const(copfilesv, blen);
7275 const char * const lstart = SvPV_const(x, llen);
7276 if (llen < blen) {
7277 bstart += blen - llen;
7278 if (strnEQ(bstart, lstart, llen) && bstart[-1] == '/') {
7279 sv_setpvn(x, ipath, ipathend - ipath);
7280 SvSETMAGIC(x);
7281 }
7282 }
7283 }
7284 }
7285 else {
7286 /* Anything to do if no copfilesv? */
7287 }
7288 TAINT_NOT; /* $^X is always tainted, but that's OK */
7289 }
7290 #endif /* ARG_ZERO_IS_SCRIPT */
7291
7292 /*
7293 * Look for options.
7294 */
7295 d = instr(s,"perl -");
7296 if (!d) {
7297 d = instr(s,"perl");
7298 #if defined(DOSISH)
7299 /* avoid getting into infinite loops when shebang
7300 * line contains "Perl" rather than "perl" */
7301 if (!d) {
7302 for (d = ipathend-4; d >= ipath; --d) {
7303 if (isALPHA_FOLD_EQ(*d, 'p')
7304 && !ibcmp(d, "perl", 4))
7305 {
7306 break;
7307 }
7308 }
7309 if (d < ipath)
7310 d = NULL;
7311 }
7312 #endif
7313 }
7314 #ifdef ALTERNATE_SHEBANG
7315 /*
7316 * If the ALTERNATE_SHEBANG on this system starts with a
7317 * character that can be part of a Perl expression, then if
7318 * we see it but not "perl", we're probably looking at the
7319 * start of Perl code, not a request to hand off to some
7320 * other interpreter. Similarly, if "perl" is there, but
7321 * not in the first 'word' of the line, we assume the line
7322 * contains the start of the Perl program.
7323 */
7324 if (d && *s != '#') {
7325 const char *c = ipath;
7326 while (*c && !memCHRs("; \t\r\n\f\v#", *c))
7327 c++;
7328 if (c < d)
7329 d = NULL; /* "perl" not in first word; ignore */
7330 else
7331 *s = '#'; /* Don't try to parse shebang line */
7332 }
7333 #endif /* ALTERNATE_SHEBANG */
7334 if (!d
7335 && *s == '#'
7336 && ipathend > ipath
7337 && !PL_minus_c
7338 && !instr(s,"indir")
7339 && instr(PL_origargv[0],"perl"))
7340 {
7341 char **newargv;
7342
7343 *ipathend = '\0';
7344 s = ipathend + 1;
7345 while (s < PL_bufend && isSPACE(*s))
7346 s++;
7347 if (s < PL_bufend) {
7348 Newx(newargv,PL_origargc+3,char*);
7349 newargv[1] = s;
7350 while (s < PL_bufend && !isSPACE(*s))
7351 s++;
7352 *s = '\0';
7353 Copy(PL_origargv+1, newargv+2, PL_origargc+1, char*);
7354 }
7355 else
7356 newargv = PL_origargv;
7357 newargv[0] = ipath;
7358 PERL_FPU_PRE_EXEC
7359 PerlProc_execv(ipath, EXEC_ARGV_CAST(newargv));
7360 PERL_FPU_POST_EXEC
7361 Perl_croak(aTHX_ "Can't exec %s", ipath);
7362 }
7363 if (d) {
7364 while (*d && !isSPACE(*d))
7365 d++;
7366 while (SPACE_OR_TAB(*d))
7367 d++;
7368
7369 if (*d++ == '-') {
7370 const bool switches_done = PL_doswitches;
7371 const U32 oldpdb = PL_perldb;
7372 const bool oldn = PL_minus_n;
7373 const bool oldp = PL_minus_p;
7374 const char *d1 = d;
7375
7376 do {
7377 bool baduni = FALSE;
7378 if (*d1 == 'C') {
7379 const char *d2 = d1 + 1;
7380 if (parse_unicode_opts((const char **)&d2)
7381 != PL_unicode)
7382 baduni = TRUE;
7383 }
7384 if (baduni || isALPHA_FOLD_EQ(*d1, 'M')) {
7385 const char * const m = d1;
7386 while (*d1 && !isSPACE(*d1))
7387 d1++;
7388 Perl_croak(aTHX_ "Too late for \"-%.*s\" option",
7389 (int)(d1 - m), m);
7390 }
7391 d1 = moreswitches(d1);
7392 } while (d1);
7393 if (PL_doswitches && !switches_done) {
7394 int argc = PL_origargc;
7395 char **argv = PL_origargv;
7396 do {
7397 argc--,argv++;
7398 } while (argc && argv[0][0] == '-' && argv[0][1]);
7399 init_argv_symbols(argc,argv);
7400 }
7401 if ( (PERLDB_LINE_OR_SAVESRC && !oldpdb)
7402 || ((PL_minus_n || PL_minus_p) && !(oldn || oldp)))
7403 /* if we have already added "LINE: while (<>) {",
7404 we must not do it again */
7405 {
7406 SvPVCLEAR(PL_linestr);
7407 PL_bufptr = PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
7408 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
7409 PL_last_lop = PL_last_uni = NULL;
7410 PL_preambled = FALSE;
7411 if (PERLDB_LINE_OR_SAVESRC)
7412 (void)gv_fetchfile(PL_origfilename);
7413 return YYL_RETRY;
7414 }
7415 }
7416 }
7417 }
7418 }
7419
7420 if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
7421 PL_lex_state = LEX_FORMLINE;
7422 force_next(FORMRBRACK);
7423 TOKEN(PERLY_SEMICOLON);
7424 }
7425
7426 PL_bufptr = s;
7427 return YYL_RETRY;
7428 }
7429
7430 static int
yyl_fatcomma(pTHX_ char * s,STRLEN len)7431 yyl_fatcomma(pTHX_ char *s, STRLEN len)
7432 {
7433 CLINE;
7434 pl_yylval.opval
7435 = newSVOP(OP_CONST, 0,
7436 S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len));
7437 pl_yylval.opval->op_private = OPpCONST_BARE;
7438 TERM(BAREWORD);
7439 }
7440
7441 static int
yyl_safe_bareword(pTHX_ char * s,const char lastchar)7442 yyl_safe_bareword(pTHX_ char *s, const char lastchar)
7443 {
7444 if ((lastchar == '*' || lastchar == '%' || lastchar == '&')
7445 && PL_parser->saw_infix_sigil)
7446 {
7447 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
7448 "Operator or semicolon missing before %c%" UTF8f,
7449 lastchar,
7450 UTF8fARG(UTF, strlen(PL_tokenbuf),
7451 PL_tokenbuf));
7452 Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
7453 "Ambiguous use of %c resolved as operator %c",
7454 lastchar, lastchar);
7455 }
7456 TOKEN(BAREWORD);
7457 }
7458
7459 static int
yyl_constant_op(pTHX_ char * s,SV * sv,CV * cv,OP * rv2cv_op,PADOFFSET off)7460 yyl_constant_op(pTHX_ char *s, SV *sv, CV *cv, OP *rv2cv_op, PADOFFSET off)
7461 {
7462 if (sv) {
7463 op_free(rv2cv_op);
7464 SvREFCNT_dec(((SVOP*)pl_yylval.opval)->op_sv);
7465 ((SVOP*)pl_yylval.opval)->op_sv = SvREFCNT_inc_simple(sv);
7466 if (SvTYPE(sv) == SVt_PVAV)
7467 pl_yylval.opval = newUNOP(OP_RV2AV, OPf_PARENS,
7468 pl_yylval.opval);
7469 else {
7470 pl_yylval.opval->op_private = 0;
7471 pl_yylval.opval->op_folded = 1;
7472 pl_yylval.opval->op_flags |= OPf_SPECIAL;
7473 }
7474 TOKEN(BAREWORD);
7475 }
7476
7477 op_free(pl_yylval.opval);
7478 pl_yylval.opval =
7479 off ? newCVREF(0, rv2cv_op) : rv2cv_op;
7480 pl_yylval.opval->op_private |= OPpENTERSUB_NOPAREN;
7481 PL_last_lop = PL_oldbufptr;
7482 PL_last_lop_op = OP_ENTERSUB;
7483
7484 /* Is there a prototype? */
7485 if (SvPOK(cv)) {
7486 int k = yyl_subproto(aTHX_ s, cv);
7487 if (k != KEY_NULL)
7488 return k;
7489 }
7490
7491 NEXTVAL_NEXTTOKE.opval = pl_yylval.opval;
7492 PL_expect = XTERM;
7493 force_next(off ? PRIVATEREF : BAREWORD);
7494 if (!PL_lex_allbrackets
7495 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7496 {
7497 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7498 }
7499
7500 TOKEN(NOAMP);
7501 }
7502
7503 /* Honour "reserved word" warnings, and enforce strict subs */
7504 static void
yyl_strictwarn_bareword(pTHX_ const char lastchar)7505 yyl_strictwarn_bareword(pTHX_ const char lastchar)
7506 {
7507 /* after "print" and similar functions (corresponding to
7508 * "F? L" in opcode.pl), whatever wasn't already parsed as
7509 * a filehandle should be subject to "strict subs".
7510 * Likewise for the optional indirect-object argument to system
7511 * or exec, which can't be a bareword */
7512 if ((PL_last_lop_op == OP_PRINT
7513 || PL_last_lop_op == OP_PRTF
7514 || PL_last_lop_op == OP_SAY
7515 || PL_last_lop_op == OP_SYSTEM
7516 || PL_last_lop_op == OP_EXEC)
7517 && (PL_hints & HINT_STRICT_SUBS))
7518 {
7519 pl_yylval.opval->op_private |= OPpCONST_STRICT;
7520 }
7521
7522 if (lastchar != '-' && ckWARN(WARN_RESERVED)) {
7523 char *d = PL_tokenbuf;
7524 while (isLOWER(*d))
7525 d++;
7526 if (!*d && !gv_stashpv(PL_tokenbuf, UTF ? SVf_UTF8 : 0)) {
7527 /* PL_warn_reserved is constant */
7528 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
7529 Perl_warner(aTHX_ packWARN(WARN_RESERVED), PL_warn_reserved,
7530 PL_tokenbuf);
7531 GCC_DIAG_RESTORE_STMT;
7532 }
7533 }
7534 }
7535
7536 static int
yyl_just_a_word(pTHX_ char * s,STRLEN len,I32 orig_keyword,struct code c)7537 yyl_just_a_word(pTHX_ char *s, STRLEN len, I32 orig_keyword, struct code c)
7538 {
7539 int pkgname = 0;
7540 const char lastchar = (PL_bufptr == PL_oldoldbufptr ? 0 : PL_bufptr[-1]);
7541 bool safebw;
7542 bool no_op_error = FALSE;
7543 /* Use this var to track whether intuit_method has been
7544 called. intuit_method returns 0 or > 255. */
7545 int key = 1;
7546
7547 if (PL_expect == XOPERATOR) {
7548 if (PL_bufptr == PL_linestart) {
7549 CopLINE_dec(PL_curcop);
7550 Perl_warner(aTHX_ packWARN(WARN_SEMICOLON), "%s", PL_warn_nosemi);
7551 CopLINE_inc(PL_curcop);
7552 }
7553 else
7554 /* We want to call no_op with s pointing after the
7555 bareword, so defer it. But we want it to come
7556 before the Bad name croak. */
7557 no_op_error = TRUE;
7558 }
7559
7560 /* Get the rest if it looks like a package qualifier */
7561
7562 if (*s == '\'' || (*s == ':' && s[1] == ':')) {
7563 STRLEN morelen;
7564 s = scan_word6(s, PL_tokenbuf + len, sizeof PL_tokenbuf - len,
7565 TRUE, &morelen, TRUE);
7566 if (no_op_error) {
7567 no_op("Bareword",s);
7568 no_op_error = FALSE;
7569 }
7570 if (!morelen)
7571 Perl_croak(aTHX_ "Bad name after %" UTF8f "%s",
7572 UTF8fARG(UTF, len, PL_tokenbuf),
7573 *s == '\'' ? "'" : "::");
7574 len += morelen;
7575 pkgname = 1;
7576 }
7577
7578 if (no_op_error)
7579 no_op("Bareword",s);
7580
7581 /* See if the name is "Foo::",
7582 in which case Foo is a bareword
7583 (and a package name). */
7584
7585 if (len > 2 && PL_tokenbuf[len - 2] == ':' && PL_tokenbuf[len - 1] == ':') {
7586 if (ckWARN(WARN_BAREWORD)
7587 && ! gv_fetchpvn_flags(PL_tokenbuf, len, UTF ? SVf_UTF8 : 0, SVt_PVHV))
7588 Perl_warner(aTHX_ packWARN(WARN_BAREWORD),
7589 "Bareword \"%" UTF8f
7590 "\" refers to nonexistent package",
7591 UTF8fARG(UTF, len, PL_tokenbuf));
7592 len -= 2;
7593 PL_tokenbuf[len] = '\0';
7594 c.gv = NULL;
7595 c.gvp = 0;
7596 safebw = TRUE;
7597 }
7598 else {
7599 safebw = FALSE;
7600 }
7601
7602 /* if we saw a global override before, get the right name */
7603
7604 if (!c.sv)
7605 c.sv = S_newSV_maybe_utf8(aTHX_ PL_tokenbuf, len);
7606 if (c.gvp) {
7607 SV *sv = newSVpvs("CORE::GLOBAL::");
7608 sv_catsv(sv, c.sv);
7609 SvREFCNT_dec(c.sv);
7610 c.sv = sv;
7611 }
7612
7613 /* Presume this is going to be a bareword of some sort. */
7614 CLINE;
7615 pl_yylval.opval = newSVOP(OP_CONST, 0, c.sv);
7616 pl_yylval.opval->op_private = OPpCONST_BARE;
7617
7618 /* And if "Foo::", then that's what it certainly is. */
7619 if (safebw)
7620 return yyl_safe_bareword(aTHX_ s, lastchar);
7621
7622 if (!c.off) {
7623 OP *const_op = newSVOP(OP_CONST, 0, SvREFCNT_inc_NN(c.sv));
7624 const_op->op_private = OPpCONST_BARE;
7625 c.rv2cv_op = newCVREF(OPpMAY_RETURN_CONSTANT<<8, const_op);
7626 c.cv = c.lex
7627 ? isGV(c.gv)
7628 ? GvCV(c.gv)
7629 : SvROK(c.gv) && SvTYPE(SvRV(c.gv)) == SVt_PVCV
7630 ? (CV *)SvRV(c.gv)
7631 : ((CV *)c.gv)
7632 : rv2cv_op_cv(c.rv2cv_op, RV2CVOPCV_RETURN_STUB);
7633 }
7634
7635 /* See if it's the indirect object for a list operator. */
7636
7637 if (PL_oldoldbufptr
7638 && PL_oldoldbufptr < PL_bufptr
7639 && (PL_oldoldbufptr == PL_last_lop
7640 || PL_oldoldbufptr == PL_last_uni)
7641 && /* NO SKIPSPACE BEFORE HERE! */
7642 (PL_expect == XREF
7643 || ((PL_opargs[PL_last_lop_op] >> OASHIFT)& 7)
7644 == OA_FILEREF))
7645 {
7646 bool immediate_paren = *s == '(';
7647 SSize_t s_off;
7648
7649 /* (Now we can afford to cross potential line boundary.) */
7650 s = skipspace(s);
7651
7652 /* intuit_method() can indirectly call lex_next_chunk(),
7653 * invalidating s
7654 */
7655 s_off = s - SvPVX(PL_linestr);
7656 /* Two barewords in a row may indicate method call. */
7657 if ( ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
7658 || *s == '$')
7659 && (key = intuit_method(s, c.lex ? NULL : c.sv, c.cv)))
7660 {
7661 /* the code at method: doesn't use s */
7662 goto method;
7663 }
7664 s = SvPVX(PL_linestr) + s_off;
7665
7666 /* If not a declared subroutine, it's an indirect object. */
7667 /* (But it's an indir obj regardless for sort.) */
7668 /* Also, if "_" follows a filetest operator, it's a bareword */
7669
7670 if (
7671 ( !immediate_paren && (PL_last_lop_op == OP_SORT
7672 || (!c.cv
7673 && (PL_last_lop_op != OP_MAPSTART
7674 && PL_last_lop_op != OP_GREPSTART))))
7675 || (PL_tokenbuf[0] == '_' && PL_tokenbuf[1] == '\0'
7676 && ((PL_opargs[PL_last_lop_op] & OA_CLASS_MASK)
7677 == OA_FILESTATOP))
7678 )
7679 {
7680 PL_expect = (PL_last_lop == PL_oldoldbufptr) ? XTERM : XOPERATOR;
7681 yyl_strictwarn_bareword(aTHX_ lastchar);
7682 op_free(c.rv2cv_op);
7683 return yyl_safe_bareword(aTHX_ s, lastchar);
7684 }
7685 }
7686
7687 PL_expect = XOPERATOR;
7688 s = skipspace(s);
7689
7690 /* Is this a word before a => operator? */
7691 if (*s == '=' && s[1] == '>' && !pkgname) {
7692 op_free(c.rv2cv_op);
7693 CLINE;
7694 if (c.gvp || (c.lex && !c.off)) {
7695 assert (cSVOPx(pl_yylval.opval)->op_sv == c.sv);
7696 /* This is our own scalar, created a few lines
7697 above, so this is safe. */
7698 SvREADONLY_off(c.sv);
7699 sv_setpv(c.sv, PL_tokenbuf);
7700 if (UTF && !IN_BYTES
7701 && is_utf8_string((U8*)PL_tokenbuf, len))
7702 SvUTF8_on(c.sv);
7703 SvREADONLY_on(c.sv);
7704 }
7705 TERM(BAREWORD);
7706 }
7707
7708 /* If followed by a paren, it's certainly a subroutine. */
7709 if (*s == '(') {
7710 CLINE;
7711 if (c.cv) {
7712 char *d = s + 1;
7713 while (SPACE_OR_TAB(*d))
7714 d++;
7715 if (*d == ')' && (c.sv = cv_const_sv_or_av(c.cv)))
7716 return yyl_constant_op(aTHX_ d + 1, c.sv, c.cv, c.rv2cv_op, c.off);
7717 }
7718 NEXTVAL_NEXTTOKE.opval =
7719 c.off ? c.rv2cv_op : pl_yylval.opval;
7720 if (c.off)
7721 op_free(pl_yylval.opval), force_next(PRIVATEREF);
7722 else op_free(c.rv2cv_op), force_next(BAREWORD);
7723 pl_yylval.ival = 0;
7724 TOKEN(PERLY_AMPERSAND);
7725 }
7726
7727 /* If followed by var or block, call it a method (unless sub) */
7728
7729 if ((*s == '$' || *s == '{') && !c.cv && FEATURE_INDIRECT_IS_ENABLED) {
7730 op_free(c.rv2cv_op);
7731 PL_last_lop = PL_oldbufptr;
7732 PL_last_lop_op = OP_METHOD;
7733 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7734 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7735 PL_expect = XBLOCKTERM;
7736 PL_bufptr = s;
7737 return REPORT(METHCALL0);
7738 }
7739
7740 /* If followed by a bareword, see if it looks like indir obj. */
7741
7742 if ( key == 1
7743 && !orig_keyword
7744 && (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF) || *s == '$')
7745 && (key = intuit_method(s, c.lex ? NULL : c.sv, c.cv)))
7746 {
7747 method:
7748 if (c.lex && !c.off) {
7749 assert(cSVOPx(pl_yylval.opval)->op_sv == c.sv);
7750 SvREADONLY_off(c.sv);
7751 sv_setpvn(c.sv, PL_tokenbuf, len);
7752 if (UTF && !IN_BYTES
7753 && is_utf8_string((U8*)PL_tokenbuf, len))
7754 SvUTF8_on(c.sv);
7755 else SvUTF8_off(c.sv);
7756 }
7757 op_free(c.rv2cv_op);
7758 if (key == METHCALL0 && !PL_lex_allbrackets
7759 && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
7760 {
7761 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
7762 }
7763 return REPORT(key);
7764 }
7765
7766 /* Not a method, so call it a subroutine (if defined) */
7767
7768 if (c.cv) {
7769 /* Check for a constant sub */
7770 c.sv = cv_const_sv_or_av(c.cv);
7771 return yyl_constant_op(aTHX_ s, c.sv, c.cv, c.rv2cv_op, c.off);
7772 }
7773
7774 /* Call it a bare word */
7775
7776 if (PL_hints & HINT_STRICT_SUBS)
7777 pl_yylval.opval->op_private |= OPpCONST_STRICT;
7778 else
7779 yyl_strictwarn_bareword(aTHX_ lastchar);
7780
7781 op_free(c.rv2cv_op);
7782
7783 return yyl_safe_bareword(aTHX_ s, lastchar);
7784 }
7785
7786 static int
yyl_word_or_keyword(pTHX_ char * s,STRLEN len,I32 key,I32 orig_keyword,struct code c)7787 yyl_word_or_keyword(pTHX_ char *s, STRLEN len, I32 key, I32 orig_keyword, struct code c)
7788 {
7789 switch (key) {
7790 default: /* not a keyword */
7791 return yyl_just_a_word(aTHX_ s, len, orig_keyword, c);
7792
7793 case KEY___FILE__:
7794 FUN0OP( newSVOP(OP_CONST, 0, newSVpv(CopFILE(PL_curcop),0)) );
7795
7796 case KEY___LINE__:
7797 FUN0OP(
7798 newSVOP(OP_CONST, 0,
7799 Perl_newSVpvf(aTHX_ "%" LINE_Tf, CopLINE(PL_curcop)))
7800 );
7801
7802 case KEY___PACKAGE__:
7803 FUN0OP(
7804 newSVOP(OP_CONST, 0, (PL_curstash
7805 ? newSVhek(HvNAME_HEK(PL_curstash))
7806 : &PL_sv_undef))
7807 );
7808
7809 case KEY___DATA__:
7810 case KEY___END__:
7811 if (PL_rsfp && (!PL_in_eval || PL_tokenbuf[2] == 'D'))
7812 yyl_data_handle(aTHX);
7813 return yyl_fake_eof(aTHX_ LEX_FAKE_EOF, FALSE, s);
7814
7815 case KEY___SUB__:
7816 /* If !CvCLONE(PL_compcv) then rpeep will probably turn this into an
7817 * OP_CONST. We need to make it big enough to allow room for that if
7818 * so */
7819 FUN0OP(CvCLONE(PL_compcv)
7820 ? newOP(OP_RUNCV, 0)
7821 : newSVOP(OP_RUNCV, 0, &PL_sv_undef));
7822
7823 case KEY_AUTOLOAD:
7824 case KEY_DESTROY:
7825 case KEY_BEGIN:
7826 case KEY_UNITCHECK:
7827 case KEY_CHECK:
7828 case KEY_INIT:
7829 case KEY_END:
7830 if (PL_expect == XSTATE)
7831 return yyl_sub(aTHX_ PL_bufptr, key);
7832 return yyl_just_a_word(aTHX_ s, len, orig_keyword, c);
7833
7834 case KEY_ADJUST:
7835 Perl_ck_warner_d(aTHX_
7836 packWARN(WARN_EXPERIMENTAL__CLASS), "ADJUST is experimental");
7837
7838 /* The way that KEY_CHECK et.al. are handled currently are nothing
7839 * short of crazy. We won't copy that model for new phasers, but use
7840 * this as an experiment to test if this will work
7841 */
7842 PHASERBLOCK(KEY_ADJUST);
7843
7844 case KEY_abs:
7845 UNI(OP_ABS);
7846
7847 case KEY_alarm:
7848 UNI(OP_ALARM);
7849
7850 case KEY_accept:
7851 LOP(OP_ACCEPT,XTERM);
7852
7853 case KEY_and:
7854 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_LOWLOGIC)
7855 return REPORT(0);
7856 OPERATOR(ANDOP);
7857
7858 case KEY_atan2:
7859 LOP(OP_ATAN2,XTERM);
7860
7861 case KEY_bind:
7862 LOP(OP_BIND,XTERM);
7863
7864 case KEY_binmode:
7865 LOP(OP_BINMODE,XTERM);
7866
7867 case KEY_bless:
7868 LOP(OP_BLESS,XTERM);
7869
7870 case KEY_break:
7871 FUN0(OP_BREAK);
7872
7873 case KEY_catch:
7874 Perl_ck_warner_d(aTHX_
7875 packWARN(WARN_EXPERIMENTAL__TRY), "try/catch is experimental");
7876 PREBLOCK(KW_CATCH);
7877
7878 case KEY_chop:
7879 UNI(OP_CHOP);
7880
7881 case KEY_class:
7882 Perl_ck_warner_d(aTHX_
7883 packWARN(WARN_EXPERIMENTAL__CLASS), "class is experimental");
7884
7885 s = force_word(s,BAREWORD,FALSE,TRUE);
7886 s = skipspace(s);
7887 s = force_strict_version(s);
7888 PL_expect = XATTRBLOCK;
7889 TOKEN(KW_CLASS);
7890
7891 case KEY_continue:
7892 /* We have to disambiguate the two senses of
7893 "continue". If the next token is a '{' then
7894 treat it as the start of a continue block;
7895 otherwise treat it as a control operator.
7896 */
7897 s = skipspace(s);
7898 if (*s == '{')
7899 PREBLOCK(KW_CONTINUE);
7900 else
7901 FUN0(OP_CONTINUE);
7902
7903 case KEY_chdir:
7904 /* may use HOME */
7905 (void)gv_fetchpvs("ENV", GV_ADD|GV_NOTQUAL, SVt_PVHV);
7906 UNI(OP_CHDIR);
7907
7908 case KEY_close:
7909 UNI(OP_CLOSE);
7910
7911 case KEY_closedir:
7912 UNI(OP_CLOSEDIR);
7913
7914 case KEY_cmp:
7915 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
7916 return REPORT(0);
7917 NCEop(OP_SCMP);
7918
7919 case KEY_caller:
7920 UNI(OP_CALLER);
7921
7922 case KEY_crypt:
7923
7924 LOP(OP_CRYPT,XTERM);
7925
7926 case KEY_chmod:
7927 LOP(OP_CHMOD,XTERM);
7928
7929 case KEY_chown:
7930 LOP(OP_CHOWN,XTERM);
7931
7932 case KEY_connect:
7933 LOP(OP_CONNECT,XTERM);
7934
7935 case KEY_chr:
7936 UNI(OP_CHR);
7937
7938 case KEY_cos:
7939 UNI(OP_COS);
7940
7941 case KEY_chroot:
7942 UNI(OP_CHROOT);
7943
7944 case KEY_default:
7945 PREBLOCK(KW_DEFAULT);
7946
7947 case KEY_defer:
7948 Perl_ck_warner_d(aTHX_
7949 packWARN(WARN_EXPERIMENTAL__DEFER), "defer is experimental");
7950 PREBLOCK(KW_DEFER);
7951
7952 case KEY_do:
7953 return yyl_do(aTHX_ s, orig_keyword);
7954
7955 case KEY_die:
7956 PL_hints |= HINT_BLOCK_SCOPE;
7957 LOP(OP_DIE,XTERM);
7958
7959 case KEY_defined:
7960 UNI(OP_DEFINED);
7961
7962 case KEY_delete:
7963 UNI(OP_DELETE);
7964
7965 case KEY_dbmopen:
7966 Perl_populate_isa(aTHX_ STR_WITH_LEN("AnyDBM_File::ISA"),
7967 STR_WITH_LEN("NDBM_File::"),
7968 STR_WITH_LEN("DB_File::"),
7969 STR_WITH_LEN("GDBM_File::"),
7970 STR_WITH_LEN("SDBM_File::"),
7971 STR_WITH_LEN("ODBM_File::"),
7972 NULL);
7973 LOP(OP_DBMOPEN,XTERM);
7974
7975 case KEY_dbmclose:
7976 UNI(OP_DBMCLOSE);
7977
7978 case KEY_dump:
7979 LOOPX(OP_DUMP);
7980
7981 case KEY_else:
7982 PREBLOCK(KW_ELSE);
7983
7984 case KEY_elsif:
7985 pl_yylval.ival = CopLINE(PL_curcop);
7986 OPERATOR(KW_ELSIF);
7987
7988 case KEY_eq:
7989 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
7990 return REPORT(0);
7991 ChEop(OP_SEQ);
7992
7993 case KEY_exists:
7994 UNI(OP_EXISTS);
7995
7996 case KEY_exit:
7997 UNI(OP_EXIT);
7998
7999 case KEY_eval:
8000 s = skipspace(s);
8001 if (*s == '{') { /* block eval */
8002 PL_expect = XTERMBLOCK;
8003 UNIBRACK(OP_ENTERTRY);
8004 }
8005 else { /* string eval */
8006 PL_expect = XTERM;
8007 UNIBRACK(OP_ENTEREVAL);
8008 }
8009
8010 case KEY_evalbytes:
8011 PL_expect = XTERM;
8012 UNIBRACK(-OP_ENTEREVAL);
8013
8014 case KEY_eof:
8015 UNI(OP_EOF);
8016
8017 case KEY_exp:
8018 UNI(OP_EXP);
8019
8020 case KEY_each:
8021 UNI(OP_EACH);
8022
8023 case KEY_exec:
8024 LOP(OP_EXEC,XREF);
8025
8026 case KEY_endhostent:
8027 FUN0(OP_EHOSTENT);
8028
8029 case KEY_endnetent:
8030 FUN0(OP_ENETENT);
8031
8032 case KEY_endservent:
8033 FUN0(OP_ESERVENT);
8034
8035 case KEY_endprotoent:
8036 FUN0(OP_EPROTOENT);
8037
8038 case KEY_endpwent:
8039 FUN0(OP_EPWENT);
8040
8041 case KEY_endgrent:
8042 FUN0(OP_EGRENT);
8043
8044 case KEY_field:
8045 /* TODO: maybe this should use the same parser/grammar structures as
8046 * `my`, but it's also rather messy because of the `our` conflation
8047 */
8048 Perl_ck_warner_d(aTHX_
8049 packWARN(WARN_EXPERIMENTAL__CLASS), "field is experimental");
8050
8051 croak_kw_unless_class("field");
8052
8053 PL_parser->in_my = KEY_field;
8054 OPERATOR(KW_FIELD);
8055
8056 case KEY_finally:
8057 Perl_ck_warner_d(aTHX_
8058 packWARN(WARN_EXPERIMENTAL__TRY), "try/catch/finally is experimental");
8059 PREBLOCK(KW_FINALLY);
8060
8061 case KEY_for:
8062 case KEY_foreach:
8063 return yyl_foreach(aTHX_ s);
8064
8065 case KEY_formline:
8066 LOP(OP_FORMLINE,XTERM);
8067
8068 case KEY_fork:
8069 FUN0(OP_FORK);
8070
8071 case KEY_fc:
8072 UNI(OP_FC);
8073
8074 case KEY_fcntl:
8075 LOP(OP_FCNTL,XTERM);
8076
8077 case KEY_fileno:
8078 UNI(OP_FILENO);
8079
8080 case KEY_flock:
8081 LOP(OP_FLOCK,XTERM);
8082
8083 case KEY_gt:
8084 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8085 return REPORT(0);
8086 ChRop(OP_SGT);
8087
8088 case KEY_ge:
8089 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8090 return REPORT(0);
8091 ChRop(OP_SGE);
8092
8093 case KEY_grep:
8094 LOP(OP_GREPSTART, XREF);
8095
8096 case KEY_goto:
8097 LOOPX(OP_GOTO);
8098
8099 case KEY_gmtime:
8100 UNI(OP_GMTIME);
8101
8102 case KEY_getc:
8103 UNIDOR(OP_GETC);
8104
8105 case KEY_getppid:
8106 FUN0(OP_GETPPID);
8107
8108 case KEY_getpgrp:
8109 UNI(OP_GETPGRP);
8110
8111 case KEY_getpriority:
8112 LOP(OP_GETPRIORITY,XTERM);
8113
8114 case KEY_getprotobyname:
8115 UNI(OP_GPBYNAME);
8116
8117 case KEY_getprotobynumber:
8118 LOP(OP_GPBYNUMBER,XTERM);
8119
8120 case KEY_getprotoent:
8121 FUN0(OP_GPROTOENT);
8122
8123 case KEY_getpwent:
8124 FUN0(OP_GPWENT);
8125
8126 case KEY_getpwnam:
8127 UNI(OP_GPWNAM);
8128
8129 case KEY_getpwuid:
8130 UNI(OP_GPWUID);
8131
8132 case KEY_getpeername:
8133 UNI(OP_GETPEERNAME);
8134
8135 case KEY_gethostbyname:
8136 UNI(OP_GHBYNAME);
8137
8138 case KEY_gethostbyaddr:
8139 LOP(OP_GHBYADDR,XTERM);
8140
8141 case KEY_gethostent:
8142 FUN0(OP_GHOSTENT);
8143
8144 case KEY_getnetbyname:
8145 UNI(OP_GNBYNAME);
8146
8147 case KEY_getnetbyaddr:
8148 LOP(OP_GNBYADDR,XTERM);
8149
8150 case KEY_getnetent:
8151 FUN0(OP_GNETENT);
8152
8153 case KEY_getservbyname:
8154 LOP(OP_GSBYNAME,XTERM);
8155
8156 case KEY_getservbyport:
8157 LOP(OP_GSBYPORT,XTERM);
8158
8159 case KEY_getservent:
8160 FUN0(OP_GSERVENT);
8161
8162 case KEY_getsockname:
8163 UNI(OP_GETSOCKNAME);
8164
8165 case KEY_getsockopt:
8166 LOP(OP_GSOCKOPT,XTERM);
8167
8168 case KEY_getgrent:
8169 FUN0(OP_GGRENT);
8170
8171 case KEY_getgrnam:
8172 UNI(OP_GGRNAM);
8173
8174 case KEY_getgrgid:
8175 UNI(OP_GGRGID);
8176
8177 case KEY_getlogin:
8178 FUN0(OP_GETLOGIN);
8179
8180 case KEY_given:
8181 pl_yylval.ival = CopLINE(PL_curcop);
8182 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__SMARTMATCH),
8183 "given is deprecated");
8184 OPERATOR(KW_GIVEN);
8185
8186 case KEY_glob:
8187 LOP( orig_keyword==KEY_glob ? -OP_GLOB : OP_GLOB, XTERM );
8188
8189 case KEY_hex:
8190 UNI(OP_HEX);
8191
8192 case KEY_if:
8193 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8194 return REPORT(0);
8195 pl_yylval.ival = CopLINE(PL_curcop);
8196 OPERATOR(KW_IF);
8197
8198 case KEY_index:
8199 LOP(OP_INDEX,XTERM);
8200
8201 case KEY_int:
8202 UNI(OP_INT);
8203
8204 case KEY_ioctl:
8205 LOP(OP_IOCTL,XTERM);
8206
8207 case KEY_isa:
8208 NCRop(OP_ISA);
8209
8210 case KEY_join:
8211 LOP(OP_JOIN,XTERM);
8212
8213 case KEY_keys:
8214 UNI(OP_KEYS);
8215
8216 case KEY_kill:
8217 LOP(OP_KILL,XTERM);
8218
8219 case KEY_last:
8220 LOOPX(OP_LAST);
8221
8222 case KEY_lc:
8223 UNI(OP_LC);
8224
8225 case KEY_lcfirst:
8226 UNI(OP_LCFIRST);
8227
8228 case KEY_local:
8229 OPERATOR(KW_LOCAL);
8230
8231 case KEY_length:
8232 UNI(OP_LENGTH);
8233
8234 case KEY_lt:
8235 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8236 return REPORT(0);
8237 ChRop(OP_SLT);
8238
8239 case KEY_le:
8240 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8241 return REPORT(0);
8242 ChRop(OP_SLE);
8243
8244 case KEY_localtime:
8245 UNI(OP_LOCALTIME);
8246
8247 case KEY_log:
8248 UNI(OP_LOG);
8249
8250 case KEY_link:
8251 LOP(OP_LINK,XTERM);
8252
8253 case KEY_listen:
8254 LOP(OP_LISTEN,XTERM);
8255
8256 case KEY_lock:
8257 UNI(OP_LOCK);
8258
8259 case KEY_lstat:
8260 UNI(OP_LSTAT);
8261
8262 case KEY_m:
8263 s = scan_pat(s,OP_MATCH);
8264 TERM(sublex_start());
8265
8266 case KEY_map:
8267 LOP(OP_MAPSTART, XREF);
8268
8269 case KEY_mkdir:
8270 LOP(OP_MKDIR,XTERM);
8271
8272 case KEY_msgctl:
8273 LOP(OP_MSGCTL,XTERM);
8274
8275 case KEY_msgget:
8276 LOP(OP_MSGGET,XTERM);
8277
8278 case KEY_msgrcv:
8279 LOP(OP_MSGRCV,XTERM);
8280
8281 case KEY_msgsnd:
8282 LOP(OP_MSGSND,XTERM);
8283
8284 case KEY_our:
8285 case KEY_my:
8286 case KEY_state:
8287 return yyl_my(aTHX_ s, key);
8288
8289 case KEY_next:
8290 LOOPX(OP_NEXT);
8291
8292 case KEY_ne:
8293 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
8294 return REPORT(0);
8295 ChEop(OP_SNE);
8296
8297 case KEY_no:
8298 s = tokenize_use(0, s);
8299 TOKEN(KW_USE_or_NO);
8300
8301 case KEY_not:
8302 if (*s == '(' || (s = skipspace(s), *s == '('))
8303 FUN1(OP_NOT);
8304 else {
8305 if (!PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC)
8306 PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC;
8307 OPERATOR(NOTOP);
8308 }
8309
8310 case KEY_open:
8311 s = skipspace(s);
8312 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
8313 const char *t;
8314 char *d = scan_word6(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len, FALSE);
8315 for (t=d; isSPACE(*t);)
8316 t++;
8317 if ( *t && memCHRs("|&*+-=!?:.", *t) && ckWARN_d(WARN_PRECEDENCE)
8318 /* [perl #16184] */
8319 && !(t[0] == '=' && t[1] == '>')
8320 && !(t[0] == ':' && t[1] == ':')
8321 && !keyword(s, d-s, 0)
8322 ) {
8323 Perl_warner(aTHX_ packWARN(WARN_PRECEDENCE),
8324 "Precedence problem: open %" UTF8f " should be open(%" UTF8f ")",
8325 UTF8fARG(UTF, d-s, s), UTF8fARG(UTF, d-s, s));
8326 }
8327 }
8328 LOP(OP_OPEN,XTERM);
8329
8330 case KEY_or:
8331 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_LOWLOGIC)
8332 return REPORT(0);
8333 pl_yylval.ival = OP_OR;
8334 OPERATOR(OROP);
8335
8336 case KEY_ord:
8337 UNI(OP_ORD);
8338
8339 case KEY_oct:
8340 UNI(OP_OCT);
8341
8342 case KEY_opendir:
8343 LOP(OP_OPEN_DIR,XTERM);
8344
8345 case KEY_print:
8346 checkcomma(s,PL_tokenbuf,"filehandle");
8347 LOP(OP_PRINT,XREF);
8348
8349 case KEY_printf:
8350 checkcomma(s,PL_tokenbuf,"filehandle");
8351 LOP(OP_PRTF,XREF);
8352
8353 case KEY_prototype:
8354 UNI(OP_PROTOTYPE);
8355
8356 case KEY_push:
8357 LOP(OP_PUSH,XTERM);
8358
8359 case KEY_pop:
8360 UNIDOR(OP_POP);
8361
8362 case KEY_pos:
8363 UNIDOR(OP_POS);
8364
8365 case KEY_pack:
8366 LOP(OP_PACK,XTERM);
8367
8368 case KEY_package:
8369 s = force_word(s,BAREWORD,FALSE,TRUE);
8370 s = skipspace(s);
8371 s = force_strict_version(s);
8372 PREBLOCK(KW_PACKAGE);
8373
8374 case KEY_pipe:
8375 LOP(OP_PIPE_OP,XTERM);
8376
8377 case KEY_q:
8378 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8379 if (!s)
8380 missingterm(NULL, 0);
8381 COPLINE_SET_FROM_MULTI_END;
8382 pl_yylval.ival = OP_CONST;
8383 TERM(sublex_start());
8384
8385 case KEY_quotemeta:
8386 UNI(OP_QUOTEMETA);
8387
8388 case KEY_qw:
8389 return yyl_qw(aTHX_ s, len);
8390
8391 case KEY_qq:
8392 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8393 if (!s)
8394 missingterm(NULL, 0);
8395 pl_yylval.ival = OP_STRINGIFY;
8396 if (SvIVX(PL_lex_stuff) == '\'')
8397 SvIV_set(PL_lex_stuff, 0); /* qq'$foo' should interpolate */
8398 TERM(sublex_start());
8399
8400 case KEY_qr:
8401 s = scan_pat(s,OP_QR);
8402 TERM(sublex_start());
8403
8404 case KEY_qx:
8405 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
8406 if (!s)
8407 missingterm(NULL, 0);
8408 pl_yylval.ival = OP_BACKTICK;
8409 TERM(sublex_start());
8410
8411 case KEY_return:
8412 OLDLOP(OP_RETURN);
8413
8414 case KEY_require:
8415 return yyl_require(aTHX_ s, orig_keyword);
8416
8417 case KEY_reset:
8418 UNI(OP_RESET);
8419
8420 case KEY_redo:
8421 LOOPX(OP_REDO);
8422
8423 case KEY_rename:
8424 LOP(OP_RENAME,XTERM);
8425
8426 case KEY_rand:
8427 UNI(OP_RAND);
8428
8429 case KEY_rmdir:
8430 UNI(OP_RMDIR);
8431
8432 case KEY_rindex:
8433 LOP(OP_RINDEX,XTERM);
8434
8435 case KEY_read:
8436 LOP(OP_READ,XTERM);
8437
8438 case KEY_readdir:
8439 UNI(OP_READDIR);
8440
8441 case KEY_readline:
8442 UNIDOR(OP_READLINE);
8443
8444 case KEY_readpipe:
8445 UNIDOR(OP_BACKTICK);
8446
8447 case KEY_rewinddir:
8448 UNI(OP_REWINDDIR);
8449
8450 case KEY_recv:
8451 LOP(OP_RECV,XTERM);
8452
8453 case KEY_reverse:
8454 LOP(OP_REVERSE,XTERM);
8455
8456 case KEY_readlink:
8457 UNIDOR(OP_READLINK);
8458
8459 case KEY_ref:
8460 UNI(OP_REF);
8461
8462 case KEY_s:
8463 s = scan_subst(s);
8464 if (pl_yylval.opval)
8465 TERM(sublex_start());
8466 else
8467 TOKEN(1); /* force error */
8468
8469 case KEY_say:
8470 checkcomma(s,PL_tokenbuf,"filehandle");
8471 LOP(OP_SAY,XREF);
8472
8473 case KEY_chomp:
8474 UNI(OP_CHOMP);
8475
8476 case KEY_scalar:
8477 UNI(OP_SCALAR);
8478
8479 case KEY_select:
8480 LOP(OP_SELECT,XTERM);
8481
8482 case KEY_seek:
8483 LOP(OP_SEEK,XTERM);
8484
8485 case KEY_semctl:
8486 LOP(OP_SEMCTL,XTERM);
8487
8488 case KEY_semget:
8489 LOP(OP_SEMGET,XTERM);
8490
8491 case KEY_semop:
8492 LOP(OP_SEMOP,XTERM);
8493
8494 case KEY_send:
8495 LOP(OP_SEND,XTERM);
8496
8497 case KEY_setpgrp:
8498 LOP(OP_SETPGRP,XTERM);
8499
8500 case KEY_setpriority:
8501 LOP(OP_SETPRIORITY,XTERM);
8502
8503 case KEY_sethostent:
8504 UNI(OP_SHOSTENT);
8505
8506 case KEY_setnetent:
8507 UNI(OP_SNETENT);
8508
8509 case KEY_setservent:
8510 UNI(OP_SSERVENT);
8511
8512 case KEY_setprotoent:
8513 UNI(OP_SPROTOENT);
8514
8515 case KEY_setpwent:
8516 FUN0(OP_SPWENT);
8517
8518 case KEY_setgrent:
8519 FUN0(OP_SGRENT);
8520
8521 case KEY_seekdir:
8522 LOP(OP_SEEKDIR,XTERM);
8523
8524 case KEY_setsockopt:
8525 LOP(OP_SSOCKOPT,XTERM);
8526
8527 case KEY_shift:
8528 UNIDOR(OP_SHIFT);
8529
8530 case KEY_shmctl:
8531 LOP(OP_SHMCTL,XTERM);
8532
8533 case KEY_shmget:
8534 LOP(OP_SHMGET,XTERM);
8535
8536 case KEY_shmread:
8537 LOP(OP_SHMREAD,XTERM);
8538
8539 case KEY_shmwrite:
8540 LOP(OP_SHMWRITE,XTERM);
8541
8542 case KEY_shutdown:
8543 LOP(OP_SHUTDOWN,XTERM);
8544
8545 case KEY_sin:
8546 UNI(OP_SIN);
8547
8548 case KEY_sleep:
8549 UNI(OP_SLEEP);
8550
8551 case KEY_socket:
8552 LOP(OP_SOCKET,XTERM);
8553
8554 case KEY_socketpair:
8555 LOP(OP_SOCKPAIR,XTERM);
8556
8557 case KEY_sort:
8558 checkcomma(s,PL_tokenbuf,"subroutine name");
8559 s = skipspace(s);
8560 PL_expect = XTERM;
8561 s = force_word(s,BAREWORD,TRUE,TRUE);
8562 LOP(OP_SORT,XREF);
8563
8564 case KEY_split:
8565 LOP(OP_SPLIT,XTERM);
8566
8567 case KEY_sprintf:
8568 LOP(OP_SPRINTF,XTERM);
8569
8570 case KEY_splice:
8571 LOP(OP_SPLICE,XTERM);
8572
8573 case KEY_sqrt:
8574 UNI(OP_SQRT);
8575
8576 case KEY_srand:
8577 UNI(OP_SRAND);
8578
8579 case KEY_stat:
8580 UNI(OP_STAT);
8581
8582 case KEY_study:
8583 UNI(OP_STUDY);
8584
8585 case KEY_substr:
8586 LOP(OP_SUBSTR,XTERM);
8587
8588 case KEY_method:
8589 /* For now we just treat 'method' identical to 'sub' plus a warning */
8590 Perl_ck_warner_d(aTHX_
8591 packWARN(WARN_EXPERIMENTAL__CLASS), "method is experimental");
8592 return yyl_sub(aTHX_ s, KEY_method);
8593
8594 case KEY_format:
8595 case KEY_sub:
8596 return yyl_sub(aTHX_ s, key);
8597
8598 case KEY_system:
8599 LOP(OP_SYSTEM,XREF);
8600
8601 case KEY_symlink:
8602 LOP(OP_SYMLINK,XTERM);
8603
8604 case KEY_syscall:
8605 LOP(OP_SYSCALL,XTERM);
8606
8607 case KEY_sysopen:
8608 LOP(OP_SYSOPEN,XTERM);
8609
8610 case KEY_sysseek:
8611 LOP(OP_SYSSEEK,XTERM);
8612
8613 case KEY_sysread:
8614 LOP(OP_SYSREAD,XTERM);
8615
8616 case KEY_syswrite:
8617 LOP(OP_SYSWRITE,XTERM);
8618
8619 case KEY_tr:
8620 case KEY_y:
8621 s = scan_trans(s);
8622 TERM(sublex_start());
8623
8624 case KEY_tell:
8625 UNI(OP_TELL);
8626
8627 case KEY_telldir:
8628 UNI(OP_TELLDIR);
8629
8630 case KEY_tie:
8631 LOP(OP_TIE,XTERM);
8632
8633 case KEY_tied:
8634 UNI(OP_TIED);
8635
8636 case KEY_time:
8637 FUN0(OP_TIME);
8638
8639 case KEY_times:
8640 FUN0(OP_TMS);
8641
8642 case KEY_truncate:
8643 LOP(OP_TRUNCATE,XTERM);
8644
8645 case KEY_try:
8646 pl_yylval.ival = CopLINE(PL_curcop);
8647 Perl_ck_warner_d(aTHX_
8648 packWARN(WARN_EXPERIMENTAL__TRY), "try/catch is experimental");
8649 PREBLOCK(KW_TRY);
8650
8651 case KEY_uc:
8652 UNI(OP_UC);
8653
8654 case KEY_ucfirst:
8655 UNI(OP_UCFIRST);
8656
8657 case KEY_untie:
8658 UNI(OP_UNTIE);
8659
8660 case KEY_until:
8661 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8662 return REPORT(0);
8663 pl_yylval.ival = CopLINE(PL_curcop);
8664 OPERATOR(KW_UNTIL);
8665
8666 case KEY_unless:
8667 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8668 return REPORT(0);
8669 pl_yylval.ival = CopLINE(PL_curcop);
8670 OPERATOR(KW_UNLESS);
8671
8672 case KEY_unlink:
8673 LOP(OP_UNLINK,XTERM);
8674
8675 case KEY_undef:
8676 UNIDOR(OP_UNDEF);
8677
8678 case KEY_unpack:
8679 LOP(OP_UNPACK,XTERM);
8680
8681 case KEY_utime:
8682 LOP(OP_UTIME,XTERM);
8683
8684 case KEY_umask:
8685 UNIDOR(OP_UMASK);
8686
8687 case KEY_unshift:
8688 LOP(OP_UNSHIFT,XTERM);
8689
8690 case KEY_use:
8691 s = tokenize_use(1, s);
8692 TOKEN(KW_USE_or_NO);
8693
8694 case KEY_values:
8695 UNI(OP_VALUES);
8696
8697 case KEY_vec:
8698 LOP(OP_VEC,XTERM);
8699
8700 case KEY_when:
8701 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8702 return REPORT(0);
8703 pl_yylval.ival = CopLINE(PL_curcop);
8704 Perl_ck_warner_d(aTHX_
8705 packWARN(WARN_DEPRECATED__SMARTMATCH),
8706 "when is deprecated");
8707 OPERATOR(KW_WHEN);
8708
8709 case KEY_while:
8710 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
8711 return REPORT(0);
8712 pl_yylval.ival = CopLINE(PL_curcop);
8713 OPERATOR(KW_WHILE);
8714
8715 case KEY_warn:
8716 PL_hints |= HINT_BLOCK_SCOPE;
8717 LOP(OP_WARN,XTERM);
8718
8719 case KEY_wait:
8720 FUN0(OP_WAIT);
8721
8722 case KEY_waitpid:
8723 LOP(OP_WAITPID,XTERM);
8724
8725 case KEY_wantarray:
8726 FUN0(OP_WANTARRAY);
8727
8728 case KEY_write:
8729 /* Make sure $^L is defined. 0x0C is CTRL-L on ASCII platforms, and
8730 * we use the same number on EBCDIC */
8731 gv_fetchpvs("\x0C", GV_ADD|GV_NOTQUAL, SVt_PV);
8732 UNI(OP_ENTERWRITE);
8733
8734 case KEY_x:
8735 if (PL_expect == XOPERATOR) {
8736 if (*s == '=' && !PL_lex_allbrackets
8737 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
8738 {
8739 return REPORT(0);
8740 }
8741 Mop(OP_REPEAT);
8742 }
8743 check_uni();
8744 return yyl_just_a_word(aTHX_ s, len, orig_keyword, c);
8745
8746 case KEY_xor:
8747 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_LOWLOGIC)
8748 return REPORT(0);
8749 pl_yylval.ival = OP_XOR;
8750 OPERATOR(OROP);
8751 }
8752 }
8753
8754 static int
yyl_key_core(pTHX_ char * s,STRLEN len,struct code c)8755 yyl_key_core(pTHX_ char *s, STRLEN len, struct code c)
8756 {
8757 I32 key = 0;
8758 I32 orig_keyword = 0;
8759 STRLEN olen = len;
8760 char *d = s;
8761 s += 2;
8762 s = scan_word6(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len, FALSE);
8763 if ((*s == ':' && s[1] == ':')
8764 || (!(key = keyword(PL_tokenbuf, len, 1)) && *s == '\''))
8765 {
8766 Copy(PL_bufptr, PL_tokenbuf, olen, char);
8767 return yyl_just_a_word(aTHX_ d, olen, 0, c);
8768 }
8769 if (!key)
8770 Perl_croak(aTHX_ "CORE::%" UTF8f " is not a keyword",
8771 UTF8fARG(UTF, len, PL_tokenbuf));
8772 if (key < 0)
8773 key = -key;
8774 else if (key == KEY_require || key == KEY_do
8775 || key == KEY_glob)
8776 /* that's a way to remember we saw "CORE::" */
8777 orig_keyword = key;
8778
8779 /* Known to be a reserved word at this point */
8780 return yyl_word_or_keyword(aTHX_ s, len, key, orig_keyword, c);
8781 }
8782
8783 struct Perl_custom_infix_result {
8784 struct Perl_custom_infix *def;
8785 SV *parsedata;
8786 };
8787
tokentype_for_plugop(struct Perl_custom_infix * def)8788 static enum yytokentype tokentype_for_plugop(struct Perl_custom_infix *def)
8789 {
8790 enum Perl_custom_infix_precedence prec = def->prec;
8791 if(prec <= INFIX_PREC_LOW)
8792 return PLUGIN_LOW_OP;
8793 if(prec <= INFIX_PREC_LOGICAL_OR_LOW)
8794 return PLUGIN_LOGICAL_OR_LOW_OP;
8795 if(prec <= INFIX_PREC_LOGICAL_AND_LOW)
8796 return PLUGIN_LOGICAL_AND_LOW_OP;
8797 if(prec <= INFIX_PREC_ASSIGN)
8798 return PLUGIN_ASSIGN_OP;
8799 if(prec <= INFIX_PREC_LOGICAL_OR)
8800 return PLUGIN_LOGICAL_OR_OP;
8801 if(prec <= INFIX_PREC_LOGICAL_AND)
8802 return PLUGIN_LOGICAL_AND_OP;
8803 if(prec <= INFIX_PREC_REL)
8804 return PLUGIN_REL_OP;
8805 if(prec <= INFIX_PREC_ADD)
8806 return PLUGIN_ADD_OP;
8807 if(prec <= INFIX_PREC_MUL)
8808 return PLUGIN_MUL_OP;
8809 if(prec <= INFIX_PREC_POW)
8810 return PLUGIN_POW_OP;
8811 return PLUGIN_HIGH_OP;
8812 }
8813
8814 OP *
Perl_build_infix_plugin(pTHX_ OP * lhs,OP * rhs,void * tokendata)8815 Perl_build_infix_plugin(pTHX_ OP *lhs, OP *rhs, void *tokendata)
8816 {
8817 PERL_ARGS_ASSERT_BUILD_INFIX_PLUGIN;
8818
8819 struct Perl_custom_infix_result *result = (struct Perl_custom_infix_result *)tokendata;
8820 SAVEFREEPV(result);
8821 if(result->parsedata)
8822 SAVEFREESV(result->parsedata);
8823
8824 return (*result->def->build_op)(aTHX_
8825 &result->parsedata, lhs, rhs, result->def);
8826 }
8827
8828 static int
yyl_keylookup(pTHX_ char * s,GV * gv)8829 yyl_keylookup(pTHX_ char *s, GV *gv)
8830 {
8831 STRLEN len;
8832 bool anydelim;
8833 I32 key;
8834 struct code c = no_code;
8835 I32 orig_keyword = 0;
8836 char *d;
8837
8838 c.gv = gv;
8839
8840 PL_bufptr = s;
8841 s = scan_word6(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len, FALSE);
8842
8843 /* Some keywords can be followed by any delimiter, including ':' */
8844 anydelim = word_takes_any_delimiter(PL_tokenbuf, len);
8845
8846 /* x::* is just a word, unless x is "CORE" */
8847 if (!anydelim && *s == ':' && s[1] == ':') {
8848 if (memEQs(PL_tokenbuf, len, "CORE"))
8849 return yyl_key_core(aTHX_ s, len, c);
8850 return yyl_just_a_word(aTHX_ s, len, 0, c);
8851 }
8852
8853 d = s;
8854 while (d < PL_bufend && isSPACE(*d))
8855 d++; /* no comments skipped here, or s### is misparsed */
8856
8857 /* Is this a word before a => operator? */
8858 if (*d == '=' && d[1] == '>') {
8859 return yyl_fatcomma(aTHX_ s, len);
8860 }
8861
8862 /* Check for plugged-in keyword */
8863 {
8864 OP *o;
8865 int result;
8866 char *saved_bufptr = PL_bufptr;
8867 PL_bufptr = s;
8868 result = PL_keyword_plugin(aTHX_ PL_tokenbuf, len, &o);
8869 s = PL_bufptr;
8870 if (result == KEYWORD_PLUGIN_DECLINE) {
8871 /* not a plugged-in keyword */
8872 PL_bufptr = saved_bufptr;
8873 } else if (result == KEYWORD_PLUGIN_STMT) {
8874 pl_yylval.opval = o;
8875 CLINE;
8876 if (!PL_nexttoke) PL_expect = XSTATE;
8877 return REPORT(PLUGSTMT);
8878 } else if (result == KEYWORD_PLUGIN_EXPR) {
8879 pl_yylval.opval = o;
8880 CLINE;
8881 if (!PL_nexttoke) PL_expect = XOPERATOR;
8882 return REPORT(PLUGEXPR);
8883 } else {
8884 Perl_croak(aTHX_ "Bad plugin affecting keyword '%s'", PL_tokenbuf);
8885 }
8886 }
8887
8888 /* Check for plugged-in named operator */
8889 if(PLUGINFIX_IS_ENABLED) {
8890 struct Perl_custom_infix *def;
8891 STRLEN result;
8892 result = PL_infix_plugin(aTHX_ PL_tokenbuf, len, &def);
8893 if(result) {
8894 if(result != len)
8895 Perl_croak(aTHX_ "Bad infix plugin result (%zd) - did not consume entire identifier <%s>\n",
8896 result, PL_tokenbuf);
8897 PL_bufptr = s = d;
8898 struct Perl_custom_infix_result *result;
8899 Newx(result, 1, struct Perl_custom_infix_result);
8900 result->def = def;
8901 result->parsedata = NULL;
8902 if(def->parse) {
8903 (*def->parse)(aTHX_ &result->parsedata, def);
8904 s = PL_bufptr; /* restore local s variable */
8905 }
8906 pl_yylval.pval = result;
8907 CLINE;
8908 OPERATOR(tokentype_for_plugop(def));
8909 }
8910 }
8911
8912 /* Is this a label? */
8913 if (!anydelim && PL_expect == XSTATE
8914 && d < PL_bufend && *d == ':' && *(d + 1) != ':') {
8915 s = d + 1;
8916 pl_yylval.opval =
8917 newSVOP(OP_CONST, 0,
8918 newSVpvn_flags(PL_tokenbuf, len, UTF ? SVf_UTF8 : 0));
8919 CLINE;
8920 TOKEN(LABEL);
8921 }
8922
8923 /* Check for lexical sub */
8924 if (PL_expect != XOPERATOR) {
8925 char tmpbuf[sizeof PL_tokenbuf + 1];
8926 *tmpbuf = '&';
8927 Copy(PL_tokenbuf, tmpbuf+1, len, char);
8928 c.off = pad_findmy_pvn(tmpbuf, len+1, 0);
8929 if (c.off != NOT_IN_PAD) {
8930 assert(c.off); /* we assume this is boolean-true below */
8931 if (PAD_COMPNAME_FLAGS_isOUR(c.off)) {
8932 HV * const stash = PAD_COMPNAME_OURSTASH(c.off);
8933 HEK * const stashname = HvNAME_HEK(stash);
8934 c.sv = newSVhek(stashname);
8935 sv_catpvs(c.sv, "::");
8936 sv_catpvn_flags(c.sv, PL_tokenbuf, len,
8937 (UTF ? SV_CATUTF8 : SV_CATBYTES));
8938 c.gv = gv_fetchsv(c.sv, GV_NOADD_NOINIT | SvUTF8(c.sv),
8939 SVt_PVCV);
8940 c.off = 0;
8941 if (!c.gv) {
8942 ASSUME(c.sv && SvREFCNT(c.sv) == 1);
8943 SvREFCNT_dec(c.sv);
8944 c.sv = NULL;
8945 return yyl_just_a_word(aTHX_ s, len, 0, c);
8946 }
8947 }
8948 else {
8949 c.rv2cv_op = newOP(OP_PADANY, 0);
8950 c.rv2cv_op->op_targ = c.off;
8951 c.cv = find_lexical_cv(c.off);
8952 }
8953 c.lex = TRUE;
8954 return yyl_just_a_word(aTHX_ s, len, 0, c);
8955 }
8956 c.off = 0;
8957 }
8958
8959 /* Check for built-in keyword */
8960 key = keyword(PL_tokenbuf, len, 0);
8961
8962 if (key < 0)
8963 key = yyl_secondclass_keyword(aTHX_ s, len, key, &orig_keyword, &c.gv, &c.gvp);
8964
8965 if (key && key != KEY___DATA__ && key != KEY___END__
8966 && (!anydelim || *s != '#')) {
8967 /* no override, and not s### either; skipspace is safe here
8968 * check for => on following line */
8969 bool arrow;
8970 STRLEN bufoff = PL_bufptr - SvPVX(PL_linestr);
8971 STRLEN soff = s - SvPVX(PL_linestr);
8972 s = peekspace(s);
8973 arrow = *s == '=' && s[1] == '>';
8974 PL_bufptr = SvPVX(PL_linestr) + bufoff;
8975 s = SvPVX(PL_linestr) + soff;
8976 if (arrow)
8977 return yyl_fatcomma(aTHX_ s, len);
8978 }
8979
8980 return yyl_word_or_keyword(aTHX_ s, len, key, orig_keyword, c);
8981 }
8982
8983 static int
yyl_try(pTHX_ char * s)8984 yyl_try(pTHX_ char *s)
8985 {
8986 char *d;
8987 GV *gv = NULL;
8988 int tok;
8989
8990 retry:
8991 /* Check for plugged-in symbolic operator */
8992 if(PLUGINFIX_IS_ENABLED && isPLUGINFIX_FIRST(*s)) {
8993 struct Perl_custom_infix *def;
8994 char *s_end = s, *d = PL_tokenbuf;
8995 STRLEN len;
8996
8997 /* Copy the longest sequence of isPLUGINFIX() chars into PL_tokenbuf */
8998 while(s_end < PL_bufend && d < PL_tokenbuf+sizeof(PL_tokenbuf)-1 && isPLUGINFIX(*s_end))
8999 *d++ = *s_end++;
9000 *d = '\0';
9001
9002 if((len = (*PL_infix_plugin)(aTHX_ PL_tokenbuf, s_end - s, &def))) {
9003 s += len;
9004 struct Perl_custom_infix_result *result;
9005 Newx(result, 1, struct Perl_custom_infix_result);
9006 result->def = def;
9007 result->parsedata = NULL;
9008 if(def->parse) {
9009 PL_bufptr = s;
9010 (*def->parse)(aTHX_ &result->parsedata, def);
9011 s = PL_bufptr; /* restore local s variable */
9012 }
9013 pl_yylval.pval = result;
9014 CLINE;
9015 OPERATOR(tokentype_for_plugop(def));
9016 }
9017 }
9018
9019 switch (*s) {
9020 default:
9021 if (UTF ? isIDFIRST_utf8_safe(s, PL_bufend) : isALNUMC(*s)) {
9022 if ((tok = yyl_keylookup(aTHX_ s, gv)) != YYL_RETRY)
9023 return tok;
9024 goto retry_bufptr;
9025 }
9026 yyl_croak_unrecognised(aTHX_ s);
9027
9028 case 4:
9029 case 26:
9030 /* emulate EOF on ^D or ^Z */
9031 if ((tok = yyl_fake_eof(aTHX_ LEX_FAKE_EOF, FALSE, s)) != YYL_RETRY)
9032 return tok;
9033 retry_bufptr:
9034 s = PL_bufptr;
9035 goto retry;
9036
9037 case 0:
9038 if ((!PL_rsfp || PL_lex_inwhat)
9039 && (!PL_parser->filtered || s+1 < PL_bufend)) {
9040 PL_last_uni = 0;
9041 PL_last_lop = 0;
9042 if (PL_lex_brackets
9043 && PL_lex_brackstack[PL_lex_brackets-1] != XFAKEEOF)
9044 {
9045 yyerror((const char *)
9046 (PL_lex_formbrack
9047 ? "Format not terminated"
9048 : "Missing right curly or square bracket"));
9049 }
9050 DEBUG_T({
9051 PerlIO_printf(Perl_debug_log, "### Tokener got EOF\n");
9052 });
9053 TOKEN(0);
9054 }
9055 if (s++ < PL_bufend)
9056 goto retry; /* ignore stray nulls */
9057 PL_last_uni = 0;
9058 PL_last_lop = 0;
9059 if (!PL_in_eval && !PL_preambled) {
9060 PL_preambled = TRUE;
9061 if (PL_perldb) {
9062 /* Generate a string of Perl code to load the debugger.
9063 * If PERL5DB is set, it will return the contents of that,
9064 * otherwise a compile-time require of perl5db.pl. */
9065
9066 const char * const pdb = PerlEnv_getenv("PERL5DB");
9067
9068 if (pdb) {
9069 sv_setpv(PL_linestr, pdb);
9070 sv_catpvs(PL_linestr,";");
9071 } else {
9072 SETERRNO(0,SS_NORMAL);
9073 sv_setpvs(PL_linestr, "BEGIN { require 'perl5db.pl' };");
9074 }
9075 PL_parser->preambling = CopLINE(PL_curcop);
9076 } else
9077 SvPVCLEAR(PL_linestr);
9078 if (PL_preambleav) {
9079 SV **svp = AvARRAY(PL_preambleav);
9080 SV **const end = svp + AvFILLp(PL_preambleav);
9081 while(svp <= end) {
9082 sv_catsv(PL_linestr, *svp);
9083 ++svp;
9084 sv_catpvs(PL_linestr, ";");
9085 }
9086 SvREFCNT_dec(MUTABLE_SV(PL_preambleav));
9087 PL_preambleav = NULL;
9088 }
9089 if (PL_minus_E)
9090 sv_catpvs(PL_linestr,
9091 "use feature ':" STRINGIFY(PERL_REVISION) "." STRINGIFY(PERL_VERSION) "';");
9092 if (PL_minus_n || PL_minus_p) {
9093 sv_catpvs(PL_linestr, "LINE: while (<>) {"/*}*/);
9094 if (PL_minus_l)
9095 sv_catpvs(PL_linestr,"chomp;");
9096 if (PL_minus_a) {
9097 if (PL_minus_F) {
9098 if ( ( *PL_splitstr == '/'
9099 || *PL_splitstr == '\''
9100 || *PL_splitstr == '"')
9101 && strchr(PL_splitstr + 1, *PL_splitstr))
9102 {
9103 /* strchr is ok, because -F pattern can't contain
9104 * embedded NULs */
9105 Perl_sv_catpvf(aTHX_ PL_linestr, "our @F=split(%s);", PL_splitstr);
9106 }
9107 else {
9108 /* "q\0${splitstr}\0" is legal perl. Yes, even NUL
9109 bytes can be used as quoting characters. :-) */
9110 const char *splits = PL_splitstr;
9111 sv_catpvs(PL_linestr, "our @F=split(q\0");
9112 do {
9113 /* Need to \ \s */
9114 if (*splits == '\\')
9115 sv_catpvn(PL_linestr, splits, 1);
9116 sv_catpvn(PL_linestr, splits, 1);
9117 } while (*splits++);
9118 /* This loop will embed the trailing NUL of
9119 PL_linestr as the last thing it does before
9120 terminating. */
9121 sv_catpvs(PL_linestr, ");");
9122 }
9123 }
9124 else
9125 sv_catpvs(PL_linestr,"our @F=split(' ');");
9126 }
9127 }
9128 sv_catpvs(PL_linestr, "\n");
9129 PL_oldoldbufptr = PL_oldbufptr = s = PL_linestart = SvPVX(PL_linestr);
9130 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
9131 PL_last_lop = PL_last_uni = NULL;
9132 if (PERLDB_LINE_OR_SAVESRC && PL_curstash != PL_debstash)
9133 update_debugger_info(PL_linestr, NULL, 0);
9134 goto retry;
9135 }
9136 if ((tok = yyl_fake_eof(aTHX_ 0, cBOOL(PL_rsfp), s)) != YYL_RETRY)
9137 return tok;
9138 goto retry_bufptr;
9139
9140 case '\r':
9141 #ifdef PERL_STRICT_CR
9142 Perl_warn(aTHX_ "Illegal character \\%03o (carriage return)", '\r');
9143 Perl_croak(aTHX_
9144 "\t(Maybe you didn't strip carriage returns after a network transfer?)\n");
9145 #endif
9146 case ' ': case '\t': case '\f': case '\v':
9147 s++;
9148 goto retry;
9149
9150 case '#':
9151 case '\n': {
9152 const bool needs_semicolon = yyl_eol_needs_semicolon(aTHX_ &s);
9153 if (needs_semicolon)
9154 TOKEN(PERLY_SEMICOLON);
9155 else
9156 goto retry;
9157 }
9158
9159 case '-':
9160 return yyl_hyphen(aTHX_ s);
9161
9162 case '+':
9163 return yyl_plus(aTHX_ s);
9164
9165 case '*':
9166 return yyl_star(aTHX_ s);
9167
9168 case '%':
9169 return yyl_percent(aTHX_ s);
9170
9171 case '^':
9172 return yyl_caret(aTHX_ s);
9173
9174 case '[':
9175 return yyl_leftsquare(aTHX_ s);
9176
9177 case '~':
9178 return yyl_tilde(aTHX_ s);
9179
9180 case ',':
9181 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_COMMA)
9182 TOKEN(0);
9183 s++;
9184 OPERATOR(PERLY_COMMA);
9185 case ':':
9186 if (s[1] == ':')
9187 return yyl_just_a_word(aTHX_ s, 0, 0, no_code);
9188 return yyl_colon(aTHX_ s + 1);
9189
9190 case '(':
9191 return yyl_leftparen(aTHX_ s + 1);
9192
9193 case ';':
9194 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_NONEXPR)
9195 TOKEN(0);
9196 CLINE;
9197 s++;
9198 PL_expect = XSTATE;
9199 TOKEN(PERLY_SEMICOLON);
9200
9201 case ')':
9202 return yyl_rightparen(aTHX_ s);
9203
9204 case ']':
9205 return yyl_rightsquare(aTHX_ s);
9206
9207 case '{':
9208 return yyl_leftcurly(aTHX_ s + 1, 0);
9209
9210 case '}':
9211 if (PL_lex_brackets && PL_lex_brackstack[PL_lex_brackets-1] == XFAKEEOF)
9212 TOKEN(0);
9213 return yyl_rightcurly(aTHX_ s, 0);
9214
9215 case '&':
9216 return yyl_ampersand(aTHX_ s);
9217
9218 case '|':
9219 return yyl_verticalbar(aTHX_ s);
9220
9221 case '=':
9222 if (s[1] == '=' && (s == PL_linestart || s[-1] == '\n')
9223 && memBEGINs(s + 2, (STRLEN) (PL_bufend - (s + 2)), "====="))
9224 {
9225 s = vcs_conflict_marker(s + 7);
9226 goto retry;
9227 }
9228
9229 s++;
9230 {
9231 const char tmp = *s++;
9232 if (tmp == '=') {
9233 if (!PL_lex_allbrackets
9234 && PL_lex_fakeeof >= LEX_FAKEEOF_COMPARE)
9235 {
9236 s -= 2;
9237 TOKEN(0);
9238 }
9239 ChEop(OP_EQ);
9240 }
9241 if (tmp == '>') {
9242 if (!PL_lex_allbrackets
9243 && PL_lex_fakeeof >= LEX_FAKEEOF_COMMA)
9244 {
9245 s -= 2;
9246 TOKEN(0);
9247 }
9248 OPERATOR(PERLY_COMMA);
9249 }
9250 if (tmp == '~')
9251 PMop(OP_MATCH);
9252 if (tmp && isSPACE(*s) && ckWARN(WARN_SYNTAX)
9253 && memCHRs("+-*/%.^&|<",tmp))
9254 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
9255 "Reversed %c= operator",(int)tmp);
9256 s--;
9257 if (PL_expect == XSTATE
9258 && isALPHA(tmp)
9259 && (s == PL_linestart+1 || s[-2] == '\n') )
9260 {
9261 if ( (PL_in_eval && !PL_rsfp && !PL_parser->filtered)
9262 || PL_lex_state != LEX_NORMAL)
9263 {
9264 d = PL_bufend;
9265 while (s < d) {
9266 if (*s++ == '\n') {
9267 incline(s, PL_bufend);
9268 if (memBEGINs(s, (STRLEN) (PL_bufend - s), "=cut"))
9269 {
9270 s = (char *) memchr(s,'\n', d - s);
9271 if (s)
9272 s++;
9273 else
9274 s = d;
9275 incline(s, PL_bufend);
9276 goto retry;
9277 }
9278 }
9279 }
9280 goto retry;
9281 }
9282 s = PL_bufend;
9283 PL_parser->in_pod = 1;
9284 goto retry;
9285 }
9286 }
9287 if (PL_expect == XBLOCK) {
9288 const char *t = s;
9289 #ifdef PERL_STRICT_CR
9290 while (SPACE_OR_TAB(*t))
9291 #else
9292 while (SPACE_OR_TAB(*t) || *t == '\r')
9293 #endif
9294 t++;
9295 if (*t == '\n' || *t == '#') {
9296 ENTER_with_name("lex_format");
9297 SAVEI8(PL_parser->form_lex_state);
9298 SAVEI32(PL_lex_formbrack);
9299 PL_parser->form_lex_state = PL_lex_state;
9300 PL_lex_formbrack = PL_lex_brackets + 1;
9301 PL_parser->sub_error_count = PL_error_count;
9302 return yyl_leftcurly(aTHX_ s, 1);
9303 }
9304 }
9305 if (!PL_lex_allbrackets && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN) {
9306 s--;
9307 TOKEN(0);
9308 }
9309 pl_yylval.ival = 0;
9310 OPERATOR(ASSIGNOP);
9311
9312 case '!':
9313 return yyl_bang(aTHX_ s + 1);
9314
9315 case '<':
9316 if (s[1] == '<' && (s == PL_linestart || s[-1] == '\n')
9317 && memBEGINs(s + 2, (STRLEN) (PL_bufend - (s + 2)), "<<<<<"))
9318 {
9319 s = vcs_conflict_marker(s + 7);
9320 goto retry;
9321 }
9322 return yyl_leftpointy(aTHX_ s);
9323
9324 case '>':
9325 if (s[1] == '>' && (s == PL_linestart || s[-1] == '\n')
9326 && memBEGINs(s + 2, (STRLEN) (PL_bufend - (s + 2)), ">>>>>"))
9327 {
9328 s = vcs_conflict_marker(s + 7);
9329 goto retry;
9330 }
9331 return yyl_rightpointy(aTHX_ s + 1);
9332
9333 case '$':
9334 return yyl_dollar(aTHX_ s);
9335
9336 case '@':
9337 return yyl_snail(aTHX_ s);
9338
9339 case '/': /* may be division, defined-or, or pattern */
9340 return yyl_slash(aTHX_ s);
9341
9342 case '?': /* conditional */
9343 s++;
9344 if (!PL_lex_allbrackets
9345 && PL_lex_fakeeof >= LEX_FAKEEOF_IFELSE)
9346 {
9347 s--;
9348 TOKEN(0);
9349 }
9350 PL_lex_allbrackets++;
9351 OPERATOR(PERLY_QUESTION_MARK);
9352
9353 case '.':
9354 if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack
9355 #ifdef PERL_STRICT_CR
9356 && s[1] == '\n'
9357 #else
9358 && (s[1] == '\n' || (s[1] == '\r' && s[2] == '\n'))
9359 #endif
9360 && (s == PL_linestart || s[-1] == '\n') )
9361 {
9362 PL_expect = XSTATE;
9363 /* formbrack==2 means dot seen where arguments expected */
9364 return yyl_rightcurly(aTHX_ s, 2);
9365 }
9366 if (PL_expect == XSTATE && s[1] == '.' && s[2] == '.') {
9367 s += 3;
9368 OPERATOR(YADAYADA);
9369 }
9370 if (PL_expect == XOPERATOR || !isDIGIT(s[1])) {
9371 char tmp = *s++;
9372 if (*s == tmp) {
9373 if (!PL_lex_allbrackets
9374 && PL_lex_fakeeof >= LEX_FAKEEOF_RANGE)
9375 {
9376 s--;
9377 TOKEN(0);
9378 }
9379 s++;
9380 if (*s == tmp) {
9381 s++;
9382 pl_yylval.ival = OPf_SPECIAL;
9383 }
9384 else
9385 pl_yylval.ival = 0;
9386 OPERATOR(DOTDOT);
9387 }
9388 if (*s == '=' && !PL_lex_allbrackets
9389 && PL_lex_fakeeof >= LEX_FAKEEOF_ASSIGN)
9390 {
9391 s--;
9392 TOKEN(0);
9393 }
9394 Aop(OP_CONCAT);
9395 }
9396 /* FALLTHROUGH */
9397 case '0': case '1': case '2': case '3': case '4':
9398 case '5': case '6': case '7': case '8': case '9':
9399 s = scan_num(s, &pl_yylval);
9400 DEBUG_T( { printbuf("### Saw number in %s\n", s); } );
9401 if (PL_expect == XOPERATOR)
9402 no_op("Number",s);
9403 TERM(THING);
9404
9405 case '\'':
9406 return yyl_sglquote(aTHX_ s);
9407
9408 case '"':
9409 return yyl_dblquote(aTHX_ s);
9410
9411 case '`':
9412 return yyl_backtick(aTHX_ s);
9413
9414 case '\\':
9415 return yyl_backslash(aTHX_ s + 1);
9416
9417 case 'v':
9418 if (isDIGIT(s[1]) && PL_expect != XOPERATOR) {
9419 char *start = s + 2;
9420 while (isDIGIT(*start) || *start == '_')
9421 start++;
9422 if (*start == '.' && isDIGIT(start[1])) {
9423 s = scan_num(s, &pl_yylval);
9424 TERM(THING);
9425 }
9426 else if ((*start == ':' && start[1] == ':')
9427 || (PL_expect == XSTATE && *start == ':')) {
9428 if ((tok = yyl_keylookup(aTHX_ s, gv)) != YYL_RETRY)
9429 return tok;
9430 goto retry_bufptr;
9431 }
9432 else if (PL_expect == XSTATE) {
9433 d = start;
9434 while (d < PL_bufend && isSPACE(*d)) d++;
9435 if (*d == ':') {
9436 if ((tok = yyl_keylookup(aTHX_ s, gv)) != YYL_RETRY)
9437 return tok;
9438 goto retry_bufptr;
9439 }
9440 }
9441 /* avoid v123abc() or $h{v1}, allow C<print v10;> */
9442 if (!isALPHA(*start) && (PL_expect == XTERM
9443 || PL_expect == XREF || PL_expect == XSTATE
9444 || PL_expect == XTERMORDORDOR)) {
9445 GV *const gv = gv_fetchpvn_flags(s, start - s,
9446 UTF ? SVf_UTF8 : 0, SVt_PVCV);
9447 if (!gv) {
9448 s = scan_num(s, &pl_yylval);
9449 TERM(THING);
9450 }
9451 }
9452 }
9453 if ((tok = yyl_keylookup(aTHX_ s, gv)) != YYL_RETRY)
9454 return tok;
9455 goto retry_bufptr;
9456
9457 case 'x':
9458 if (isDIGIT(s[1]) && PL_expect == XOPERATOR) {
9459 s++;
9460 Mop(OP_REPEAT);
9461 }
9462 if ((tok = yyl_keylookup(aTHX_ s, gv)) != YYL_RETRY)
9463 return tok;
9464 goto retry_bufptr;
9465
9466 case '_':
9467 case 'a': case 'A':
9468 case 'b': case 'B':
9469 case 'c': case 'C':
9470 case 'd': case 'D':
9471 case 'e': case 'E':
9472 case 'f': case 'F':
9473 case 'g': case 'G':
9474 case 'h': case 'H':
9475 case 'i': case 'I':
9476 case 'j': case 'J':
9477 case 'k': case 'K':
9478 case 'l': case 'L':
9479 case 'm': case 'M':
9480 case 'n': case 'N':
9481 case 'o': case 'O':
9482 case 'p': case 'P':
9483 case 'q': case 'Q':
9484 case 'r': case 'R':
9485 case 's': case 'S':
9486 case 't': case 'T':
9487 case 'u': case 'U':
9488 case 'V':
9489 case 'w': case 'W':
9490 case 'X':
9491 case 'y': case 'Y':
9492 case 'z': case 'Z':
9493 if ((tok = yyl_keylookup(aTHX_ s, gv)) != YYL_RETRY)
9494 return tok;
9495 goto retry_bufptr;
9496 }
9497 }
9498
9499
9500 /*
9501 yylex
9502
9503 Works out what to call the token just pulled out of the input
9504 stream. The yacc parser takes care of taking the ops we return and
9505 stitching them into a tree.
9506
9507 Returns:
9508 The type of the next token
9509
9510 Structure:
9511 Check if we have already built the token; if so, use it.
9512 Switch based on the current state:
9513 - if we have a case modifier in a string, deal with that
9514 - handle other cases of interpolation inside a string
9515 - scan the next line if we are inside a format
9516 In the normal state, switch on the next character:
9517 - default:
9518 if alphabetic, go to key lookup
9519 unrecognized character - croak
9520 - 0/4/26: handle end-of-line or EOF
9521 - cases for whitespace
9522 - \n and #: handle comments and line numbers
9523 - various operators, brackets and sigils
9524 - numbers
9525 - quotes
9526 - 'v': vstrings (or go to key lookup)
9527 - 'x' repetition operator (or go to key lookup)
9528 - other ASCII alphanumerics (key lookup begins here):
9529 word before => ?
9530 keyword plugin
9531 scan built-in keyword (but do nothing with it yet)
9532 check for statement label
9533 check for lexical subs
9534 return yyl_just_a_word if there is one
9535 see whether built-in keyword is overridden
9536 switch on keyword number:
9537 - default: return yyl_just_a_word:
9538 not a built-in keyword; handle bareword lookup
9539 disambiguate between method and sub call
9540 fall back to bareword
9541 - cases for built-in keywords
9542 */
9543
9544 int
Perl_yylex(pTHX)9545 Perl_yylex(pTHX)
9546 {
9547 char *s = PL_bufptr;
9548
9549 if (UNLIKELY(PL_parser->recheck_utf8_validity)) {
9550 const U8* first_bad_char_loc;
9551 if (UTF && UNLIKELY(! is_utf8_string_loc((U8 *) PL_bufptr,
9552 PL_bufend - PL_bufptr,
9553 &first_bad_char_loc)))
9554 {
9555 _force_out_malformed_utf8_message(first_bad_char_loc,
9556 (U8 *) PL_bufend,
9557 0,
9558 1 /* 1 means die */ );
9559 NOT_REACHED; /* NOTREACHED */
9560 }
9561 PL_parser->recheck_utf8_validity = FALSE;
9562 }
9563 DEBUG_T( {
9564 SV* tmp = newSVpvs("");
9565 PerlIO_printf(Perl_debug_log, "### %" LINE_Tf ":LEX_%s/X%s %s\n",
9566 CopLINE(PL_curcop),
9567 lex_state_names[PL_lex_state],
9568 exp_name[PL_expect],
9569 pv_display(tmp, s, strlen(s), 0, 60));
9570 SvREFCNT_dec(tmp);
9571 } );
9572
9573 /* when we've already built the next token, just pull it out of the queue */
9574 if (PL_nexttoke) {
9575 PL_nexttoke--;
9576 pl_yylval = PL_nextval[PL_nexttoke];
9577 {
9578 I32 next_type;
9579 next_type = PL_nexttype[PL_nexttoke];
9580 if (next_type & (7<<24)) {
9581 if (next_type & (1<<24)) {
9582 if (PL_lex_brackets > 100)
9583 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
9584 PL_lex_brackstack[PL_lex_brackets++] =
9585 (char) ((U8) (next_type >> 16));
9586 }
9587 if (next_type & (2<<24))
9588 PL_lex_allbrackets++;
9589 if (next_type & (4<<24))
9590 PL_lex_allbrackets--;
9591 next_type &= 0xffff;
9592 }
9593 return REPORT(next_type == 'p' ? pending_ident() : next_type);
9594 }
9595 }
9596
9597 switch (PL_lex_state) {
9598 case LEX_NORMAL:
9599 case LEX_INTERPNORMAL:
9600 break;
9601
9602 /* interpolated case modifiers like \L \U, including \Q and \E.
9603 when we get here, PL_bufptr is at the \
9604 */
9605 case LEX_INTERPCASEMOD:
9606 /* handle \E or end of string */
9607 return yyl_interpcasemod(aTHX_ s);
9608
9609 case LEX_INTERPPUSH:
9610 return REPORT(sublex_push());
9611
9612 case LEX_INTERPSTART:
9613 if (PL_bufptr == PL_bufend)
9614 return REPORT(sublex_done());
9615 DEBUG_T({
9616 if(*PL_bufptr != '(')
9617 PerlIO_printf(Perl_debug_log, "### Interpolated variable\n");
9618 });
9619 PL_expect = XTERM;
9620 /* for /@a/, we leave the joining for the regex engine to do
9621 * (unless we're within \Q etc) */
9622 PL_lex_dojoin = (*PL_bufptr == '@'
9623 && (!PL_lex_inpat || PL_lex_casemods));
9624 PL_lex_state = LEX_INTERPNORMAL;
9625 if (PL_lex_dojoin) {
9626 NEXTVAL_NEXTTOKE.ival = 0;
9627 force_next(PERLY_COMMA);
9628 force_ident("\"", PERLY_DOLLAR);
9629 NEXTVAL_NEXTTOKE.ival = 0;
9630 force_next(PERLY_DOLLAR);
9631 NEXTVAL_NEXTTOKE.ival = 0;
9632 force_next((2<<24)|PERLY_PAREN_OPEN);
9633 NEXTVAL_NEXTTOKE.ival = OP_JOIN; /* emulate join($", ...) */
9634 force_next(FUNC);
9635 }
9636 /* Convert (?{...}) or (*{...}) and friends to 'do {...}' */
9637 if (PL_lex_inpat && *PL_bufptr == '(') {
9638 PL_parser->lex_shared->re_eval_start = PL_bufptr;
9639 PL_bufptr += 2;
9640 if (*PL_bufptr != '{')
9641 PL_bufptr++;
9642 PL_expect = XTERMBLOCK;
9643 force_next(KW_DO);
9644 }
9645
9646 if (PL_lex_starts++) {
9647 s = PL_bufptr;
9648 /* commas only at base level: /$a\Ub$c/ => ($a,uc(b.$c)) */
9649 if (!PL_lex_casemods && PL_lex_inpat)
9650 TOKEN(PERLY_COMMA);
9651 else
9652 AopNOASSIGN(OP_CONCAT);
9653 }
9654 return yylex();
9655
9656 case LEX_INTERPENDMAYBE:
9657 if (intuit_more(PL_bufptr, PL_bufend)) {
9658 PL_lex_state = LEX_INTERPNORMAL; /* false alarm, more expr */
9659 break;
9660 }
9661 /* FALLTHROUGH */
9662
9663 case LEX_INTERPEND:
9664 if (PL_lex_dojoin) {
9665 const U8 dojoin_was = PL_lex_dojoin;
9666 PL_lex_dojoin = FALSE;
9667 PL_lex_state = LEX_INTERPCONCAT;
9668 PL_lex_allbrackets--;
9669 return REPORT(dojoin_was == 1 ? (int)PERLY_PAREN_CLOSE : (int)POSTJOIN);
9670 }
9671 if (PL_lex_inwhat == OP_SUBST && PL_linestr == PL_lex_repl
9672 && SvEVALED(PL_lex_repl))
9673 {
9674 if (PL_bufptr != PL_bufend)
9675 Perl_croak(aTHX_ "Bad evalled substitution pattern");
9676 PL_lex_repl = NULL;
9677 }
9678 /* Paranoia. re_eval_start is adjusted when S_scan_heredoc sets
9679 re_eval_str. If the here-doc body's length equals the previous
9680 value of re_eval_start, re_eval_start will now be null. So
9681 check re_eval_str as well. */
9682 if (PL_parser->lex_shared->re_eval_start
9683 || PL_parser->lex_shared->re_eval_str) {
9684 SV *sv;
9685 if (*PL_bufptr != ')')
9686 Perl_croak(aTHX_ "Sequence (?{...}) not terminated with ')'");
9687 PL_bufptr++;
9688 /* having compiled a (?{..}) expression, return the original
9689 * text too, as a const */
9690 if (PL_parser->lex_shared->re_eval_str) {
9691 sv = PL_parser->lex_shared->re_eval_str;
9692 PL_parser->lex_shared->re_eval_str = NULL;
9693 SvCUR_set(sv,
9694 PL_bufptr - PL_parser->lex_shared->re_eval_start);
9695 SvPV_shrink_to_cur(sv);
9696 }
9697 else sv = newSVpvn(PL_parser->lex_shared->re_eval_start,
9698 PL_bufptr - PL_parser->lex_shared->re_eval_start);
9699 NEXTVAL_NEXTTOKE.opval =
9700 newSVOP(OP_CONST, 0,
9701 sv);
9702 force_next(THING);
9703 PL_parser->lex_shared->re_eval_start = NULL;
9704 PL_expect = XTERM;
9705 return REPORT(PERLY_COMMA);
9706 }
9707
9708 /* FALLTHROUGH */
9709 case LEX_INTERPCONCAT:
9710 #ifdef DEBUGGING
9711 if (PL_lex_brackets)
9712 Perl_croak(aTHX_ "panic: INTERPCONCAT, lex_brackets=%ld",
9713 (long) PL_lex_brackets);
9714 #endif
9715 if (PL_bufptr == PL_bufend)
9716 return REPORT(sublex_done());
9717
9718 /* m'foo' still needs to be parsed for possible (?{...}) */
9719 if (SvIVX(PL_linestr) == '\'' && !PL_lex_inpat) {
9720 SV *sv = newSVsv(PL_linestr);
9721 sv = tokeq(sv);
9722 pl_yylval.opval = newSVOP(OP_CONST, 0, sv);
9723 s = PL_bufend;
9724 }
9725 else {
9726 int save_error_count = PL_error_count;
9727
9728 s = scan_const(PL_bufptr);
9729
9730 /* Set flag if this was a pattern and there were errors. op.c will
9731 * refuse to compile a pattern with this flag set. Otherwise, we
9732 * could get segfaults, etc. */
9733 if (PL_lex_inpat && PL_error_count > save_error_count) {
9734 ((PMOP*)PL_lex_inpat)->op_pmflags |= PMf_HAS_ERROR;
9735 }
9736 if (*s == '\\')
9737 PL_lex_state = LEX_INTERPCASEMOD;
9738 else
9739 PL_lex_state = LEX_INTERPSTART;
9740 }
9741
9742 if (s != PL_bufptr) {
9743 NEXTVAL_NEXTTOKE = pl_yylval;
9744 PL_expect = XTERM;
9745 force_next(THING);
9746 if (PL_lex_starts++) {
9747 /* commas only at base level: /$a\Ub$c/ => ($a,uc(b.$c)) */
9748 if (!PL_lex_casemods && PL_lex_inpat)
9749 TOKEN(PERLY_COMMA);
9750 else
9751 AopNOASSIGN(OP_CONCAT);
9752 }
9753 else {
9754 PL_bufptr = s;
9755 return yylex();
9756 }
9757 }
9758
9759 return yylex();
9760 case LEX_FORMLINE:
9761 if (PL_parser->sub_error_count != PL_error_count) {
9762 /* There was an error parsing a formline, which tends to
9763 mess up the parser.
9764 Unlike interpolated sub-parsing, we can't treat any of
9765 these as recoverable, so no need to check sub_no_recover.
9766 */
9767 yyquit();
9768 }
9769 assert(PL_lex_formbrack);
9770 s = scan_formline(PL_bufptr);
9771 if (!PL_lex_formbrack)
9772 return yyl_rightcurly(aTHX_ s, 1);
9773 PL_bufptr = s;
9774 return yylex();
9775 }
9776
9777 /* We really do *not* want PL_linestr ever becoming a COW. */
9778 assert (!SvIsCOW(PL_linestr));
9779 s = PL_bufptr;
9780 PL_oldoldbufptr = PL_oldbufptr;
9781 PL_oldbufptr = s;
9782
9783 if (PL_in_my == KEY_sigvar) {
9784 PL_parser->saw_infix_sigil = 0;
9785 return yyl_sigvar(aTHX_ s);
9786 }
9787
9788 {
9789 /* yyl_try() and its callees might consult PL_parser->saw_infix_sigil.
9790 On its return, we then need to set it to indicate whether the token
9791 we just encountered was an infix operator that (if we hadn't been
9792 expecting an operator) have been a sigil.
9793 */
9794 bool expected_operator = (PL_expect == XOPERATOR);
9795 int ret = yyl_try(aTHX_ s);
9796 switch (pl_yylval.ival) {
9797 case OP_BIT_AND:
9798 case OP_MODULO:
9799 case OP_MULTIPLY:
9800 case OP_NBIT_AND:
9801 if (expected_operator) {
9802 PL_parser->saw_infix_sigil = 1;
9803 break;
9804 }
9805 /* FALLTHROUGH */
9806 default:
9807 PL_parser->saw_infix_sigil = 0;
9808 }
9809 return ret;
9810 }
9811 }
9812
9813
9814 /*
9815 S_pending_ident
9816
9817 Looks up an identifier in the pad or in a package
9818
9819 PL_in_my == KEY_sigvar indicates that this is a subroutine signature variable
9820 rather than a plain pad var.
9821
9822 Returns:
9823 PRIVATEREF if this is a lexical name.
9824 BAREWORD if this belongs to a package.
9825
9826 Structure:
9827 if we're in a my declaration
9828 croak if they tried to say my($foo::bar)
9829 build the ops for a my() declaration
9830 if it's an access to a my() variable
9831 build ops for access to a my() variable
9832 if in a dq string, and they've said @foo and we can't find @foo
9833 warn
9834 build ops for a bareword
9835 */
9836
9837 static int
S_pending_ident(pTHX)9838 S_pending_ident(pTHX)
9839 {
9840 PADOFFSET tmp = 0;
9841 const char pit = (char)pl_yylval.ival;
9842 const STRLEN tokenbuf_len = strlen(PL_tokenbuf);
9843 /* All routes through this function want to know if there is a colon. */
9844 const char *const has_colon = (const char*) memchr (PL_tokenbuf, ':', tokenbuf_len);
9845
9846 DEBUG_T({ PerlIO_printf(Perl_debug_log,
9847 "### Pending identifier '%s'\n", PL_tokenbuf); });
9848 assert(tokenbuf_len >= 2);
9849
9850 /* if we're in a my(), we can't allow dynamics here.
9851 $foo'bar has already been turned into $foo::bar, so
9852 just check for colons.
9853
9854 if it's a legal name, the OP is a PADANY.
9855 */
9856 if (PL_in_my) {
9857 if (PL_in_my == KEY_our) { /* "our" is merely analogous to "my" */
9858 if (has_colon)
9859 /* diag_listed_as: No package name allowed for variable %s
9860 in "our" */
9861 yyerror_pv(Perl_form(aTHX_ "No package name allowed for "
9862 "%s %s in \"our\"",
9863 *PL_tokenbuf=='&' ? "subroutine" : "variable",
9864 PL_tokenbuf), UTF ? SVf_UTF8 : 0);
9865 tmp = allocmy(PL_tokenbuf, tokenbuf_len, UTF ? SVf_UTF8 : 0);
9866 }
9867 else {
9868 OP *o;
9869 if (has_colon) {
9870 /* "my" variable %s can't be in a package */
9871 /* PL_no_myglob is constant */
9872 GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
9873 yyerror_pv(Perl_form(aTHX_ PL_no_myglob,
9874 PL_in_my == KEY_my ? "my" :
9875 PL_in_my == KEY_field ? "field" : "state",
9876 *PL_tokenbuf == '&' ? "subroutine" : "variable",
9877 PL_tokenbuf),
9878 UTF ? SVf_UTF8 : 0);
9879 GCC_DIAG_RESTORE_STMT;
9880 }
9881
9882 if (PL_in_my == KEY_sigvar) {
9883 /* A signature 'padop' needs in addition, an op_first to
9884 * point to a child sigdefelem, and an extra field to hold
9885 * the signature index. We can achieve both by using an
9886 * UNOP_AUX and (ab)using the op_aux field to hold the
9887 * index. If we ever need more fields, use a real malloced
9888 * aux strut instead.
9889 */
9890 o = newUNOP_AUX(OP_ARGELEM, 0, NULL,
9891 INT2PTR(UNOP_AUX_item *,
9892 (PL_parser->sig_elems)));
9893 o->op_private |= ( PL_tokenbuf[0] == '$' ? OPpARGELEM_SV
9894 : PL_tokenbuf[0] == '@' ? OPpARGELEM_AV
9895 : OPpARGELEM_HV);
9896 }
9897 else
9898 o = newOP(OP_PADANY, 0);
9899 o->op_targ = allocmy(PL_tokenbuf, tokenbuf_len,
9900 UTF ? SVf_UTF8 : 0);
9901 if (PL_in_my == KEY_sigvar)
9902 PL_in_my = 0;
9903
9904 pl_yylval.opval = o;
9905 return PRIVATEREF;
9906 }
9907 }
9908
9909 /*
9910 build the ops for accesses to a my() variable.
9911 */
9912
9913 if (!has_colon) {
9914 if (!PL_in_my)
9915 tmp = pad_findmy_pvn(PL_tokenbuf, tokenbuf_len,
9916 0);
9917 if (tmp != NOT_IN_PAD) {
9918 /* might be an "our" variable" */
9919 if (PAD_COMPNAME_FLAGS_isOUR(tmp)) {
9920 /* build ops for a bareword */
9921 HV * const stash = PAD_COMPNAME_OURSTASH(tmp);
9922 HEK * const stashname = HvNAME_HEK(stash);
9923 SV * const sym = newSVhek(stashname);
9924 sv_catpvs(sym, "::");
9925 sv_catpvn_flags(sym, PL_tokenbuf+1, tokenbuf_len > 0 ? tokenbuf_len - 1 : 0, (UTF ? SV_CATUTF8 : SV_CATBYTES ));
9926 pl_yylval.opval = newSVOP(OP_CONST, 0, sym);
9927 pl_yylval.opval->op_private = OPpCONST_ENTERED;
9928 if (pit != '&')
9929 gv_fetchsv(sym,
9930 GV_ADDMULTI,
9931 ((PL_tokenbuf[0] == '$') ? SVt_PV
9932 : (PL_tokenbuf[0] == '@') ? SVt_PVAV
9933 : SVt_PVHV));
9934 return BAREWORD;
9935 }
9936
9937 pl_yylval.opval = newOP(OP_PADANY, 0);
9938 pl_yylval.opval->op_targ = tmp;
9939 return PRIVATEREF;
9940 }
9941 }
9942
9943 /*
9944 Whine if they've said @foo or @foo{key} in a doublequoted string,
9945 and @foo (or %foo) isn't a variable we can find in the symbol
9946 table.
9947 */
9948 if (ckWARN(WARN_AMBIGUOUS)
9949 && pit == '@'
9950 && PL_lex_state != LEX_NORMAL
9951 && !PL_lex_brackets)
9952 {
9953 GV *const gv = gv_fetchpvn_flags(PL_tokenbuf + 1, tokenbuf_len > 0 ? tokenbuf_len - 1 : 0,
9954 ( UTF ? SVf_UTF8 : 0 ) | GV_ADDMG,
9955 SVt_PVAV);
9956 if ((!gv || ((PL_tokenbuf[0] == '@') ? !GvAV(gv) : !GvHV(gv)))
9957 )
9958 {
9959 /* Downgraded from fatal to warning 20000522 mjd */
9960 Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
9961 "Possible unintended interpolation of %" UTF8f
9962 " in string",
9963 UTF8fARG(UTF, tokenbuf_len, PL_tokenbuf));
9964 }
9965 }
9966
9967 /* build ops for a bareword */
9968 pl_yylval.opval = newSVOP(OP_CONST, 0,
9969 newSVpvn_flags(PL_tokenbuf + 1,
9970 tokenbuf_len > 0 ? tokenbuf_len - 1 : 0,
9971 UTF ? SVf_UTF8 : 0 ));
9972 pl_yylval.opval->op_private = OPpCONST_ENTERED;
9973 if (pit != '&')
9974 gv_fetchpvn_flags(PL_tokenbuf+1, tokenbuf_len > 0 ? tokenbuf_len - 1 : 0,
9975 (PL_in_eval ? GV_ADDMULTI : GV_ADD)
9976 | ( UTF ? SVf_UTF8 : 0 ),
9977 ((PL_tokenbuf[0] == '$') ? SVt_PV
9978 : (PL_tokenbuf[0] == '@') ? SVt_PVAV
9979 : SVt_PVHV));
9980 return BAREWORD;
9981 }
9982
9983 STATIC void
S_checkcomma(pTHX_ const char * s,const char * name,const char * what)9984 S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
9985 {
9986 PERL_ARGS_ASSERT_CHECKCOMMA;
9987
9988 if (*s == ' ' && s[1] == '(') { /* XXX gotta be a better way */
9989 if (ckWARN(WARN_SYNTAX)) {
9990 int level = 1;
9991 const char *w;
9992 for (w = s+2; *w && level; w++) {
9993 if (*w == '(')
9994 ++level;
9995 else if (*w == ')')
9996 --level;
9997 }
9998 while (isSPACE(*w))
9999 ++w;
10000 /* the list of chars below is for end of statements or
10001 * block / parens, boolean operators (&&, ||, //) and branch
10002 * constructs (or, and, if, until, unless, while, err, for).
10003 * Not a very solid hack... */
10004 if (!*w || !memCHRs(";&/|})]oaiuwef!=", *w))
10005 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
10006 "%s (...) interpreted as function",name);
10007 }
10008 }
10009 while (s < PL_bufend && isSPACE(*s))
10010 s++;
10011 if (*s == '(')
10012 s++;
10013 while (s < PL_bufend && isSPACE(*s))
10014 s++;
10015 if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
10016 const char * const w = s;
10017 s += UTF ? UTF8SKIP(s) : 1;
10018 while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
10019 s += UTF ? UTF8SKIP(s) : 1;
10020 while (s < PL_bufend && isSPACE(*s))
10021 s++;
10022 if (*s == ',') {
10023 GV* gv;
10024 if (keyword(w, s - w, 0))
10025 return;
10026
10027 gv = gv_fetchpvn_flags(w, s - w, ( UTF ? SVf_UTF8 : 0 ), SVt_PVCV);
10028 if (gv && GvCVu(gv))
10029 return;
10030 if (s - w <= 254) {
10031 PADOFFSET off;
10032 char tmpbuf[256];
10033 Copy(w, tmpbuf+1, s - w, char);
10034 *tmpbuf = '&';
10035 off = pad_findmy_pvn(tmpbuf, s-w+1, 0);
10036 if (off != NOT_IN_PAD) return;
10037 }
10038 Perl_croak(aTHX_ "No comma allowed after %s", what);
10039 }
10040 }
10041 }
10042
10043 /* S_new_constant(): do any overload::constant lookup.
10044
10045 Either returns sv, or mortalizes/frees sv and returns a new SV*.
10046 Best used as sv=new_constant(..., sv, ...).
10047 If s, pv are NULL, calls subroutine with one argument,
10048 and <type> is used with error messages only.
10049 <type> is assumed to be well formed UTF-8.
10050
10051 If error_msg is not NULL, *error_msg will be set to any error encountered.
10052 Otherwise yyerror() will be used to output it */
10053
10054 STATIC SV *
S_new_constant(pTHX_ const char * s,STRLEN len,const char * key,STRLEN keylen,SV * sv,SV * pv,const char * type,STRLEN typelen,const char ** error_msg)10055 S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen,
10056 SV *sv, SV *pv, const char *type, STRLEN typelen,
10057 const char ** error_msg)
10058 {
10059 dSP;
10060 HV * table = GvHV(PL_hintgv); /* ^H */
10061 SV *res;
10062 SV *errsv = NULL;
10063 SV **cvp;
10064 SV *cv, *typesv;
10065 const char *why1 = "", *why2 = "", *why3 = "";
10066 const char * optional_colon = ":"; /* Only some messages have a colon */
10067 char *msg;
10068
10069 PERL_ARGS_ASSERT_NEW_CONSTANT;
10070 /* We assume that this is true: */
10071 assert(type || s);
10072
10073 sv_2mortal(sv); /* Parent created it permanently */
10074
10075 if ( ! table
10076 || ! (PL_hints & HINT_LOCALIZE_HH))
10077 {
10078 why1 = "unknown";
10079 optional_colon = "";
10080 goto report;
10081 }
10082
10083 cvp = hv_fetch(table, key, keylen, FALSE);
10084 if (!cvp || !SvOK(*cvp)) {
10085 why1 = "$^H{";
10086 why2 = key;
10087 why3 = "} is not defined";
10088 goto report;
10089 }
10090
10091 cv = *cvp;
10092 if (!pv && s)
10093 pv = newSVpvn_flags(s, len, SVs_TEMP);
10094 if (type && pv)
10095 typesv = newSVpvn_flags(type, typelen, SVs_TEMP);
10096 else
10097 typesv = &PL_sv_undef;
10098
10099 PUSHSTACKi(PERLSI_OVERLOAD);
10100 ENTER ;
10101 SAVETMPS;
10102
10103 PUSHMARK(SP) ;
10104 EXTEND(sp, 3);
10105 if (pv)
10106 PUSHs(pv);
10107 PUSHs(sv);
10108 if (pv)
10109 PUSHs(typesv);
10110 PUTBACK;
10111 call_sv(cv, G_SCALAR | ( PL_in_eval ? 0 : G_EVAL));
10112
10113 SPAGAIN ;
10114
10115 /* Check the eval first */
10116 if (!PL_in_eval && ((errsv = ERRSV), SvTRUE_NN(errsv))) {
10117 STRLEN errlen;
10118 const char * errstr;
10119 sv_catpvs(errsv, "Propagated");
10120 errstr = SvPV_const(errsv, errlen);
10121 yyerror_pvn(errstr, errlen, 0); /* Duplicates the message inside eval */
10122 (void)POPs;
10123 res = SvREFCNT_inc_simple_NN(sv);
10124 }
10125 else {
10126 res = POPs;
10127 SvREFCNT_inc_simple_void_NN(res);
10128 }
10129
10130 PUTBACK ;
10131 FREETMPS ;
10132 LEAVE ;
10133 POPSTACK;
10134
10135 if (SvOK(res)) {
10136 return res;
10137 }
10138
10139 sv = res;
10140 (void)sv_2mortal(sv);
10141
10142 why1 = "Call to &{$^H{";
10143 why2 = key;
10144 why3 = "}} did not return a defined value";
10145
10146 report:
10147
10148 msg = Perl_form(aTHX_ "Constant(%.*s)%s %s%s%s",
10149 (int)(type ? typelen : len),
10150 (type ? type: s),
10151 optional_colon,
10152 why1, why2, why3);
10153 if (error_msg) {
10154 *error_msg = msg;
10155 }
10156 else {
10157 yyerror_pv(msg, UTF ? SVf_UTF8 : 0);
10158 }
10159 return SvREFCNT_inc_simple_NN(sv);
10160 }
10161
10162 PERL_STATIC_INLINE void
S_parse_ident(pTHX_ char ** s,char ** d,char * const e,int allow_package,bool is_utf8,bool check_dollar,bool tick_warn)10163 S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
10164 bool is_utf8, bool check_dollar, bool tick_warn)
10165 {
10166 int saw_tick = 0;
10167 const char *olds = *s;
10168 PERL_ARGS_ASSERT_PARSE_IDENT;
10169
10170 while (*s < PL_bufend) {
10171 if (*d >= e)
10172 Perl_croak(aTHX_ "%s", ident_too_long);
10173 if (is_utf8 && isIDFIRST_utf8_safe(*s, PL_bufend)) {
10174 /* The UTF-8 case must come first, otherwise things
10175 * like c\N{COMBINING TILDE} would start failing, as the
10176 * isWORDCHAR_A case below would gobble the 'c' up.
10177 */
10178
10179 char *t = *s + UTF8SKIP(*s);
10180 while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) {
10181 t += UTF8SKIP(t);
10182 }
10183 if (*d + (t - *s) > e)
10184 Perl_croak(aTHX_ "%s", ident_too_long);
10185 Copy(*s, *d, t - *s, char);
10186 *d += t - *s;
10187 *s = t;
10188 }
10189 else if ( isWORDCHAR_A(**s) ) {
10190 do {
10191 *(*d)++ = *(*s)++;
10192 } while (isWORDCHAR_A(**s) && *d < e);
10193 }
10194 else if ( allow_package
10195 && **s == '\''
10196 && isIDFIRST_lazy_if_safe((*s)+1, PL_bufend, is_utf8))
10197 {
10198 *(*d)++ = ':';
10199 *(*d)++ = ':';
10200 (*s)++;
10201 saw_tick++;
10202 }
10203 else if (allow_package && **s == ':' && (*s)[1] == ':'
10204 /* Disallow things like Foo::$bar. For the curious, this is
10205 * the code path that triggers the "Bad name after" warning
10206 * when looking for barewords.
10207 */
10208 && !(check_dollar && (*s)[2] == '$')) {
10209 *(*d)++ = *(*s)++;
10210 *(*d)++ = *(*s)++;
10211 }
10212 else
10213 break;
10214 }
10215 if (UNLIKELY(saw_tick && tick_warn && ckWARN2_d(WARN_SYNTAX, WARN_DEPRECATED__APOSTROPHE_AS_PACKAGE_SEPARATOR))) {
10216 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
10217 char *this_d;
10218 char *d2;
10219 Newx(this_d, *s - olds + saw_tick + 2, char); /* +2 for $# */
10220 d2 = this_d;
10221 SAVEFREEPV(this_d);
10222
10223 Perl_warner(aTHX_ packWARN2(WARN_SYNTAX, WARN_DEPRECATED__APOSTROPHE_AS_PACKAGE_SEPARATOR),
10224 "Old package separator used in string");
10225 if (olds[-1] == '#')
10226 *d2++ = olds[-2];
10227 *d2++ = olds[-1];
10228 while (olds < *s) {
10229 if (*olds == '\'') {
10230 *d2++ = '\\';
10231 *d2++ = *olds++;
10232 }
10233 else
10234 *d2++ = *olds++;
10235 }
10236 Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
10237 "\t(Did you mean \"%" UTF8f "\" instead?)\n",
10238 UTF8fARG(is_utf8, d2-this_d, this_d));
10239 }
10240 else {
10241 Perl_warner(aTHX_ packWARN2(WARN_SYNTAX, WARN_DEPRECATED__APOSTROPHE_AS_PACKAGE_SEPARATOR),
10242 "Old package separator \"'\" deprecated");
10243 }
10244 }
10245 return;
10246 }
10247
10248 /* Returns a NUL terminated string, with the length of the string written to
10249 *slp
10250
10251 scan_word6() may be removed once ' in names is removed.
10252 */
10253 char *
Perl_scan_word6(pTHX_ char * s,char * dest,STRLEN destlen,int allow_package,STRLEN * slp,bool warn_tick)10254 Perl_scan_word6(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp, bool warn_tick)
10255 {
10256 char *d = dest;
10257 char * const e = d + destlen - 3; /* two-character token, ending NUL */
10258 bool is_utf8 = cBOOL(UTF);
10259
10260 PERL_ARGS_ASSERT_SCAN_WORD6;
10261
10262 parse_ident(&s, &d, e, allow_package, is_utf8, TRUE, warn_tick);
10263 *d = '\0';
10264 *slp = d - dest;
10265 return s;
10266 }
10267
10268 char *
Perl_scan_word(pTHX_ char * s,char * dest,STRLEN destlen,int allow_package,STRLEN * slp)10269 Perl_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp)
10270 {
10271 PERL_ARGS_ASSERT_SCAN_WORD;
10272 return scan_word6(s, dest, destlen, allow_package, slp, FALSE);
10273 }
10274
10275 /* scan s and extract an identifier ($var) from it if possible
10276 * into dest.
10277 * XXX: This function has subtle implications on parsing, and
10278 * changing how it behaves can cause a variable to change from
10279 * being a run time rv2sv call or a compile time binding to a
10280 * specific variable name.
10281 */
10282 STATIC char *
S_scan_ident(pTHX_ char * s,char * dest,STRLEN destlen,I32 ck_uni)10283 S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
10284 {
10285 I32 herelines = PL_parser->herelines;
10286 SSize_t bracket = -1;
10287 char funny = *s++;
10288 char *d = dest;
10289 char * const e = d + destlen - 3; /* two-character token, ending NUL */
10290 bool is_utf8 = cBOOL(UTF);
10291 line_t orig_copline = 0, tmp_copline = 0;
10292
10293 PERL_ARGS_ASSERT_SCAN_IDENT;
10294
10295 if (isSPACE(*s) || !*s)
10296 s = skipspace(s);
10297 if (isDIGIT(*s)) { /* handle $0 and $1 $2 and $10 and etc */
10298 bool is_zero= *s == '0' ? TRUE : FALSE;
10299 char *digit_start= d;
10300 *d++ = *s++;
10301 while (s < PL_bufend && isDIGIT(*s)) {
10302 if (d >= e)
10303 Perl_croak(aTHX_ "%s", ident_too_long);
10304 *d++ = *s++;
10305 }
10306 if (is_zero && d - digit_start > 1)
10307 Perl_croak(aTHX_ ident_var_zero_multi_digit);
10308 }
10309 else { /* See if it is a "normal" identifier */
10310 parse_ident(&s, &d, e, 1, is_utf8, FALSE, TRUE);
10311 }
10312 *d = '\0';
10313 d = dest;
10314 if (*d) {
10315 /* Either a digit variable, or parse_ident() found an identifier
10316 (anything valid as a bareword), so job done and return. */
10317 if (PL_lex_state != LEX_NORMAL)
10318 PL_lex_state = LEX_INTERPENDMAYBE;
10319 return s;
10320 }
10321
10322 /* Here, it is not a run-of-the-mill identifier name */
10323
10324 if (*s == '$' && s[1]
10325 && ( isIDFIRST_lazy_if_safe(s+1, PL_bufend, is_utf8)
10326 || isDIGIT_A((U8)s[1])
10327 || s[1] == '$'
10328 || s[1] == '{'
10329 || memBEGINs(s+1, (STRLEN) (PL_bufend - (s+1)), "::")) )
10330 {
10331 /* Dereferencing a value in a scalar variable.
10332 The alternatives are different syntaxes for a scalar variable.
10333 Using ' as a leading package separator isn't allowed. :: is. */
10334 return s;
10335 }
10336 /* Handle the opening { of @{...}, &{...}, *{...}, %{...}, ${...} */
10337 if (*s == '{') {
10338 bracket = s - SvPVX(PL_linestr);
10339 s++;
10340 orig_copline = CopLINE(PL_curcop);
10341 if (s < PL_bufend && isSPACE(*s)) {
10342 s = skipspace(s);
10343 }
10344 }
10345
10346
10347 /* Extract the first character of the variable name from 's' and
10348 * copy it, null terminated into 'd'. Note that this does not
10349 * involve checking for just IDFIRST characters, as it allows the
10350 * '^' for ${^FOO} type variable names, and it allows all the
10351 * characters that are legal in a single character variable name.
10352 *
10353 * The legal ones are any of:
10354 * a) all ASCII characters except:
10355 * 1) control and space-type ones, like NUL, SOH, \t, and SPACE;
10356 * 2) '{'
10357 * The final case currently doesn't get this far in the program, so we
10358 * don't test for it. If that were to change, it would be ok to allow it.
10359 * b) When not under Unicode rules, any upper Latin1 character
10360 * c) Otherwise, when unicode rules are used, all XIDS characters.
10361 *
10362 * Because all ASCII characters have the same representation whether
10363 * encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
10364 * '{' without knowing if is UTF-8 or not. */
10365
10366 if ((s <= PL_bufend - ((is_utf8)
10367 ? UTF8SKIP(s)
10368 : 1))
10369 && (
10370 isGRAPH_A(*s)
10371 ||
10372 ( is_utf8
10373 ? isIDFIRST_utf8_safe(s, PL_bufend)
10374 : (isGRAPH_L1(*s)
10375 && LIKELY((U8) *s != LATIN1_TO_NATIVE(0xAD))
10376 )
10377 )
10378 )
10379 ){
10380 if (is_utf8) {
10381 const STRLEN skip = UTF8SKIP(s);
10382 STRLEN i;
10383 d[skip] = '\0';
10384 for ( i = 0; i < skip; i++ )
10385 d[i] = *s++;
10386 }
10387 else {
10388 *d = *s++;
10389 d[1] = '\0';
10390 }
10391 }
10392
10393 /* special case to handle ${10}, ${11} the same way we handle ${1} etc */
10394 if (isDIGIT(*d)) {
10395 bool is_zero= *d == '0' ? TRUE : FALSE;
10396 char *digit_start= d;
10397 while (s < PL_bufend && isDIGIT(*s)) {
10398 d++;
10399 if (d >= e)
10400 Perl_croak(aTHX_ "%s", ident_too_long);
10401 *d= *s++;
10402 }
10403 if (is_zero && d - digit_start >= 1) /* d points at the last digit */
10404 Perl_croak(aTHX_ ident_var_zero_multi_digit);
10405 d[1] = '\0';
10406 }
10407
10408 /* Convert $^F, ${^F} and the ^F of ${^FOO} to control characters */
10409 else if (*d == '^' && *s && isCONTROLVAR(*s)) {
10410 *d = toCTRL(*s);
10411 s++;
10412 }
10413 /* Warn about ambiguous code after unary operators if {...} notation isn't
10414 used. There's no difference in ambiguity; it's merely a heuristic
10415 about when not to warn. */
10416 else if (ck_uni && bracket == -1)
10417 check_uni();
10418
10419 if (bracket != -1) {
10420 bool skip;
10421 char *s2;
10422 /* If we were processing {...} notation then... */
10423 if (isIDFIRST_lazy_if_safe(d, e, is_utf8)
10424 || (!isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */
10425 && isWORDCHAR(*s))
10426 ) {
10427 /* note we have to check for a normal identifier first,
10428 * as it handles utf8 symbols, and only after that has
10429 * been ruled out can we look at the caret words */
10430 if (isIDFIRST_lazy_if_safe(d, e, is_utf8) ) {
10431 /* if it starts as a valid identifier, assume that it is one.
10432 (the later check for } being at the expected point will trap
10433 cases where this doesn't pan out.) */
10434 d += is_utf8 ? UTF8SKIP(d) : 1;
10435 parse_ident(&s, &d, e, 1, is_utf8, TRUE, TRUE);
10436 *d = '\0';
10437 }
10438 else { /* caret word: ${^Foo} ${^CAPTURE[0]} */
10439 d++;
10440 while (isWORDCHAR(*s) && d < e) {
10441 *d++ = *s++;
10442 }
10443 if (d >= e)
10444 Perl_croak(aTHX_ "%s", ident_too_long);
10445 *d = '\0';
10446 }
10447 tmp_copline = CopLINE(PL_curcop);
10448 if (s < PL_bufend && isSPACE(*s)) {
10449 s = skipspace(s);
10450 }
10451 if ((*s == '[' || (*s == '{' && strNE(dest, "sub")))) {
10452 /* ${foo[0]} and ${foo{bar}} and ${^CAPTURE[0]} notation. */
10453 if (ckWARN(WARN_AMBIGUOUS) && keyword(dest, d - dest, 0)) {
10454 const char * const brack =
10455 (const char *)
10456 ((*s == '[') ? "[...]" : "{...}");
10457 orig_copline = CopLINE(PL_curcop);
10458 CopLINE_set(PL_curcop, tmp_copline);
10459 /* diag_listed_as: Ambiguous use of %c{%s[...]} resolved to %c%s[...] */
10460 Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
10461 "Ambiguous use of %c{%s%s} resolved to %c%s%s",
10462 funny, dest, brack, funny, dest, brack);
10463 CopLINE_set(PL_curcop, orig_copline);
10464 }
10465 bracket++;
10466 PL_lex_brackstack[PL_lex_brackets++] = (char)(XOPERATOR | XFAKEBRACK);
10467 PL_lex_allbrackets++;
10468 return s;
10469 }
10470 }
10471
10472 if ( !tmp_copline )
10473 tmp_copline = CopLINE(PL_curcop);
10474 if ((skip = s < PL_bufend && isSPACE(*s))) {
10475 /* Avoid incrementing line numbers or resetting PL_linestart,
10476 in case we have to back up. */
10477 STRLEN s_off = s - SvPVX(PL_linestr);
10478 s2 = peekspace(s);
10479 s = SvPVX(PL_linestr) + s_off;
10480 }
10481 else
10482 s2 = s;
10483
10484 /* Expect to find a closing } after consuming any trailing whitespace.
10485 */
10486 if (*s2 == '}') {
10487 /* Now increment line numbers if applicable. */
10488 if (skip)
10489 s = skipspace(s);
10490 s++;
10491 if (PL_lex_state == LEX_INTERPNORMAL && !PL_lex_brackets) {
10492 PL_lex_state = LEX_INTERPEND;
10493 PL_expect = XREF;
10494 }
10495 if (PL_lex_state == LEX_NORMAL || PL_lex_brackets) {
10496 if (ckWARN(WARN_AMBIGUOUS)
10497 && (keyword(dest, d - dest, 0)
10498 || get_cvn_flags(dest, d - dest, is_utf8
10499 ? SVf_UTF8
10500 : 0)))
10501 {
10502 SV *tmp = newSVpvn_flags( dest, d - dest,
10503 SVs_TEMP | (is_utf8 ? SVf_UTF8 : 0) );
10504 if (funny == '#')
10505 funny = '@';
10506 orig_copline = CopLINE(PL_curcop);
10507 CopLINE_set(PL_curcop, tmp_copline);
10508 Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS),
10509 "Ambiguous use of %c{%" SVf "} resolved to %c%" SVf,
10510 funny, SVfARG(tmp), funny, SVfARG(tmp));
10511 CopLINE_set(PL_curcop, orig_copline);
10512 }
10513 }
10514 }
10515 else {
10516 /* Didn't find the closing } at the point we expected, so restore
10517 state such that the next thing to process is the opening { and */
10518 s = SvPVX(PL_linestr) + bracket; /* let the parser handle it */
10519 CopLINE_set(PL_curcop, orig_copline);
10520 PL_parser->herelines = herelines;
10521 *dest = '\0';
10522 PL_parser->sub_no_recover = TRUE;
10523 }
10524 }
10525 else if ( PL_lex_state == LEX_INTERPNORMAL
10526 && !PL_lex_brackets
10527 && !intuit_more(s, PL_bufend))
10528 PL_lex_state = LEX_INTERPEND;
10529 return s;
10530 }
10531
10532 static bool
S_pmflag(pTHX_ const char * const valid_flags,U32 * pmfl,char ** s,char * charset,unsigned int * x_mod_count)10533 S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charset, unsigned int * x_mod_count) {
10534
10535 /* Adds, subtracts to/from 'pmfl' based on the next regex modifier flag
10536 * found in the parse starting at 's', based on the subset that are valid
10537 * in this context input to this routine in 'valid_flags'. Advances s.
10538 * Returns TRUE if the input should be treated as a valid flag, so the next
10539 * char may be as well; otherwise FALSE. 'charset' should point to a NUL
10540 * upon first call on the current regex. This routine will set it to any
10541 * charset modifier found. The caller shouldn't change it. This way,
10542 * another charset modifier encountered in the parse can be detected as an
10543 * error, as we have decided to allow only one */
10544
10545 const char c = **s;
10546 STRLEN charlen = UTF ? UTF8SKIP(*s) : 1;
10547
10548 if ( charlen != 1 || ! strchr(valid_flags, c) ) {
10549 if (isWORDCHAR_lazy_if_safe( *s, PL_bufend, UTF)) {
10550 yyerror_pv(Perl_form(aTHX_ "Unknown regexp modifier \"/%.*s\"", (int)charlen, *s),
10551 UTF ? SVf_UTF8 : 0);
10552 (*s) += charlen;
10553 /* Pretend that it worked, so will continue processing before
10554 * dieing */
10555 return TRUE;
10556 }
10557 return FALSE;
10558 }
10559
10560 switch (c) {
10561
10562 CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, *x_mod_count);
10563 case GLOBAL_PAT_MOD: *pmfl |= PMf_GLOBAL; break;
10564 case CONTINUE_PAT_MOD: *pmfl |= PMf_CONTINUE; break;
10565 case ONCE_PAT_MOD: *pmfl |= PMf_KEEP; break;
10566 case KEEPCOPY_PAT_MOD: *pmfl |= RXf_PMf_KEEPCOPY; break;
10567 case NONDESTRUCT_PAT_MOD: *pmfl |= PMf_NONDESTRUCT; break;
10568 case LOCALE_PAT_MOD:
10569 if (*charset) {
10570 goto multiple_charsets;
10571 }
10572 set_regex_charset(pmfl, REGEX_LOCALE_CHARSET);
10573 *charset = c;
10574 break;
10575 case UNICODE_PAT_MOD:
10576 if (*charset) {
10577 goto multiple_charsets;
10578 }
10579 set_regex_charset(pmfl, REGEX_UNICODE_CHARSET);
10580 *charset = c;
10581 break;
10582 case ASCII_RESTRICT_PAT_MOD:
10583 if (! *charset) {
10584 set_regex_charset(pmfl, REGEX_ASCII_RESTRICTED_CHARSET);
10585 }
10586 else {
10587
10588 /* Error if previous modifier wasn't an 'a', but if it was, see
10589 * if, and accept, a second occurrence (only) */
10590 if (*charset != 'a'
10591 || get_regex_charset(*pmfl)
10592 != REGEX_ASCII_RESTRICTED_CHARSET)
10593 {
10594 goto multiple_charsets;
10595 }
10596 set_regex_charset(pmfl, REGEX_ASCII_MORE_RESTRICTED_CHARSET);
10597 }
10598 *charset = c;
10599 break;
10600 case DEPENDS_PAT_MOD:
10601 if (*charset) {
10602 goto multiple_charsets;
10603 }
10604 set_regex_charset(pmfl, REGEX_DEPENDS_CHARSET);
10605 *charset = c;
10606 break;
10607 }
10608
10609 (*s)++;
10610 return TRUE;
10611
10612 multiple_charsets:
10613 if (*charset != c) {
10614 yyerror(Perl_form(aTHX_ "Regexp modifiers \"/%c\" and \"/%c\" are mutually exclusive", *charset, c));
10615 }
10616 else if (c == 'a') {
10617 /* diag_listed_as: Regexp modifier "/%c" may appear a maximum of twice */
10618 yyerror("Regexp modifier \"/a\" may appear a maximum of twice");
10619 }
10620 else {
10621 yyerror(Perl_form(aTHX_ "Regexp modifier \"/%c\" may not appear twice", c));
10622 }
10623
10624 /* Pretend that it worked, so will continue processing before dieing */
10625 (*s)++;
10626 return TRUE;
10627 }
10628
10629 STATIC char *
S_scan_pat(pTHX_ char * start,I32 type)10630 S_scan_pat(pTHX_ char *start, I32 type)
10631 {
10632 PMOP *pm;
10633 char *s;
10634 const char * const valid_flags =
10635 (const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS);
10636 char charset = '\0'; /* character set modifier */
10637 unsigned int x_mod_count = 0;
10638
10639 PERL_ARGS_ASSERT_SCAN_PAT;
10640
10641 s = scan_str(start,TRUE,FALSE, (PL_in_eval & EVAL_RE_REPARSING), NULL);
10642 if (!s)
10643 Perl_croak(aTHX_ "Search pattern not terminated");
10644
10645 pm = (PMOP*)newPMOP(type, 0);
10646 if (PL_multi_open == '?') {
10647 /* This is the only point in the code that sets PMf_ONCE: */
10648 pm->op_pmflags |= PMf_ONCE;
10649
10650 /* Hence it's safe to do this bit of PMOP book-keeping here, which
10651 allows us to restrict the list needed by reset to just the ??
10652 matches. */
10653 assert(type != OP_TRANS);
10654 if (PL_curstash) {
10655 MAGIC *mg = mg_find((const SV *)PL_curstash, PERL_MAGIC_symtab);
10656 U32 elements;
10657 if (!mg) {
10658 mg = sv_magicext(MUTABLE_SV(PL_curstash), 0, PERL_MAGIC_symtab, 0, 0,
10659 0);
10660 }
10661 elements = mg->mg_len / sizeof(PMOP**);
10662 Renewc(mg->mg_ptr, elements + 1, PMOP*, char);
10663 ((PMOP**)mg->mg_ptr) [elements++] = pm;
10664 mg->mg_len = elements * sizeof(PMOP**);
10665 PmopSTASH_set(pm,PL_curstash);
10666 }
10667 }
10668
10669 /* if qr/...(?{..}).../, then need to parse the pattern within a new
10670 * anon CV. False positives like qr/[(?{]/ are harmless */
10671
10672 if (type == OP_QR) {
10673 STRLEN len;
10674 char *e, *p = SvPV(PL_lex_stuff, len);
10675 e = p + len;
10676 for (; p < e; p++) {
10677 if (p[0] == '(' && (
10678 (p[1] == '?' && (p[2] == '{' ||
10679 (p[2] == '?' && p[3] == '{'))) ||
10680 (p[1] == '*' && (p[2] == '{' ||
10681 (p[2] == '*' && p[3] == '{')))
10682 )){
10683 pm->op_pmflags |= PMf_HAS_CV;
10684 break;
10685 }
10686 }
10687 pm->op_pmflags |= PMf_IS_QR;
10688 }
10689
10690 while (*s && S_pmflag(aTHX_ valid_flags, &(pm->op_pmflags),
10691 &s, &charset, &x_mod_count))
10692 {};
10693 /* issue a warning if /c is specified,but /g is not */
10694 if ((pm->op_pmflags & PMf_CONTINUE) && !(pm->op_pmflags & PMf_GLOBAL))
10695 {
10696 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
10697 "Use of /c modifier is meaningless without /g" );
10698 }
10699
10700 PL_lex_op = (OP*)pm;
10701 pl_yylval.ival = OP_MATCH;
10702 return s;
10703 }
10704
10705 STATIC char *
S_scan_subst(pTHX_ char * start)10706 S_scan_subst(pTHX_ char *start)
10707 {
10708 char *s;
10709 PMOP *pm;
10710 I32 first_start;
10711 line_t first_line;
10712 line_t linediff = 0;
10713 I32 es = 0;
10714 char charset = '\0'; /* character set modifier */
10715 unsigned int x_mod_count = 0;
10716 char *t;
10717
10718 PERL_ARGS_ASSERT_SCAN_SUBST;
10719
10720 pl_yylval.ival = OP_NULL;
10721
10722 s = scan_str(start, TRUE, FALSE, FALSE, &t);
10723
10724 if (!s)
10725 Perl_croak(aTHX_ "Substitution pattern not terminated");
10726
10727 s = t;
10728
10729 first_start = PL_multi_start;
10730 first_line = CopLINE(PL_curcop);
10731 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
10732 if (!s) {
10733 SvREFCNT_dec_NN(PL_lex_stuff);
10734 PL_lex_stuff = NULL;
10735 Perl_croak(aTHX_ "Substitution replacement not terminated");
10736 }
10737 PL_multi_start = first_start; /* so whole substitution is taken together */
10738
10739 pm = (PMOP*)newPMOP(OP_SUBST, 0);
10740
10741
10742 while (*s) {
10743 if (*s == EXEC_PAT_MOD) {
10744 s++;
10745 es++;
10746 }
10747 else if (! S_pmflag(aTHX_ S_PAT_MODS, &(pm->op_pmflags),
10748 &s, &charset, &x_mod_count))
10749 {
10750 break;
10751 }
10752 }
10753
10754 if ((pm->op_pmflags & PMf_CONTINUE)) {
10755 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" );
10756 }
10757
10758 if (es) {
10759 SV * const repl = newSVpvs("");
10760
10761 PL_multi_end = 0;
10762 pm->op_pmflags |= PMf_EVAL;
10763 for (; es > 1; es--) {
10764 sv_catpvs(repl, "eval ");
10765 }
10766 sv_catpvs(repl, "do {");
10767 sv_catsv(repl, PL_parser->lex_sub_repl);
10768 sv_catpvs(repl, "}");
10769 SvREFCNT_dec(PL_parser->lex_sub_repl);
10770 PL_parser->lex_sub_repl = repl;
10771 }
10772
10773
10774 linediff = CopLINE(PL_curcop) - first_line;
10775 if (linediff)
10776 CopLINE_set(PL_curcop, first_line);
10777
10778 if (linediff || es) {
10779 /* the IVX field indicates that the replacement string is a s///e;
10780 * the NVX field indicates how many src code lines the replacement
10781 * spreads over */
10782 sv_upgrade(PL_parser->lex_sub_repl, SVt_PVNV);
10783 ((XPVNV*)SvANY(PL_parser->lex_sub_repl))->xnv_u.xnv_lines = linediff;
10784 ((XPVIV*)SvANY(PL_parser->lex_sub_repl))->xiv_u.xivu_eval_seen =
10785 cBOOL(es);
10786 }
10787
10788 PL_lex_op = (OP*)pm;
10789 pl_yylval.ival = OP_SUBST;
10790 return s;
10791 }
10792
10793 STATIC char *
S_scan_trans(pTHX_ char * start)10794 S_scan_trans(pTHX_ char *start)
10795 {
10796 char* s;
10797 OP *o;
10798 U8 squash;
10799 U8 del;
10800 U8 complement;
10801 bool nondestruct = 0;
10802 char *t;
10803
10804 PERL_ARGS_ASSERT_SCAN_TRANS;
10805
10806 pl_yylval.ival = OP_NULL;
10807
10808 s = scan_str(start,FALSE,FALSE,FALSE,&t);
10809 if (!s)
10810 Perl_croak(aTHX_ "Transliteration pattern not terminated");
10811
10812 s = t;
10813
10814 s = scan_str(s,FALSE,FALSE,FALSE,NULL);
10815 if (!s) {
10816 SvREFCNT_dec_NN(PL_lex_stuff);
10817 PL_lex_stuff = NULL;
10818 Perl_croak(aTHX_ "Transliteration replacement not terminated");
10819 }
10820
10821 complement = del = squash = 0;
10822 while (1) {
10823 switch (*s) {
10824 case 'c':
10825 complement = OPpTRANS_COMPLEMENT;
10826 break;
10827 case 'd':
10828 del = OPpTRANS_DELETE;
10829 break;
10830 case 's':
10831 squash = OPpTRANS_SQUASH;
10832 break;
10833 case 'r':
10834 nondestruct = 1;
10835 break;
10836 default:
10837 goto no_more;
10838 }
10839 s++;
10840 }
10841 no_more:
10842
10843 o = newPVOP(nondestruct ? OP_TRANSR : OP_TRANS, 0, (char*)NULL);
10844 o->op_private &= ~OPpTRANS_ALL;
10845 o->op_private |= del|squash|complement;
10846
10847 PL_lex_op = o;
10848 pl_yylval.ival = nondestruct ? OP_TRANSR : OP_TRANS;
10849
10850
10851 return s;
10852 }
10853
10854 /* scan_heredoc
10855 Takes a pointer to the first < in <<FOO.
10856 Returns a pointer to the byte following <<FOO.
10857
10858 This function scans a heredoc, which involves different methods
10859 depending on whether we are in a string eval, quoted construct, etc.
10860 This is because PL_linestr could containing a single line of input, or
10861 a whole string being evalled, or the contents of the current quote-
10862 like operator.
10863
10864 The two basic methods are:
10865 - Steal lines from the input stream
10866 - Scan the heredoc in PL_linestr and remove it therefrom
10867
10868 In a file scope or filtered eval, the first method is used; in a
10869 string eval, the second.
10870
10871 In a quote-like operator, we have to choose between the two,
10872 depending on where we can find a newline. We peek into outer lex-
10873 ing scopes until we find one with a newline in it. If we reach the
10874 outermost lexing scope and it is a file, we use the stream method.
10875 Otherwise it is treated as an eval.
10876 */
10877
10878 STATIC char *
S_scan_heredoc(pTHX_ char * s)10879 S_scan_heredoc(pTHX_ char *s)
10880 {
10881 I32 op_type = OP_SCALAR;
10882 I32 len;
10883 SV *tmpstr;
10884 char term;
10885 char *d;
10886 char *e;
10887 char *peek;
10888 char *indent = 0;
10889 I32 indent_len = 0;
10890 bool indented = FALSE;
10891 const bool infile = PL_rsfp || PL_parser->filtered;
10892 const line_t origline = CopLINE(PL_curcop);
10893 LEXSHARED *shared = PL_parser->lex_shared;
10894
10895 PERL_ARGS_ASSERT_SCAN_HEREDOC;
10896
10897 s += 2;
10898 d = PL_tokenbuf + 1;
10899 e = PL_tokenbuf + sizeof PL_tokenbuf - 1;
10900 *PL_tokenbuf = '\n';
10901 peek = s;
10902
10903 if (*peek == '~') {
10904 indented = TRUE;
10905 peek++; s++;
10906 }
10907
10908 while (SPACE_OR_TAB(*peek))
10909 peek++;
10910
10911 if (*peek == '`' || *peek == '\'' || *peek =='"') {
10912 s = peek;
10913 term = *s++;
10914 s = delimcpy(d, e, s, PL_bufend, term, &len);
10915 if (s == PL_bufend)
10916 Perl_croak(aTHX_ "Unterminated delimiter for here document");
10917 d += len;
10918 s++;
10919 }
10920 else {
10921 if (*s == '\\')
10922 /* <<\FOO is equivalent to <<'FOO' */
10923 s++, term = '\'';
10924 else
10925 term = '"';
10926
10927 if (! isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
10928 Perl_croak(aTHX_ "Use of bare << to mean <<\"\" is forbidden");
10929
10930 peek = s;
10931
10932 while (isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF)) {
10933 peek += UTF ? UTF8SKIP(peek) : 1;
10934 }
10935
10936 len = (peek - s >= e - d) ? (e - d) : (peek - s);
10937 Copy(s, d, len, char);
10938 s += len;
10939 d += len;
10940 }
10941
10942 if (d >= PL_tokenbuf + sizeof PL_tokenbuf - 1)
10943 Perl_croak(aTHX_ "Delimiter for here document is too long");
10944
10945 *d++ = '\n';
10946 *d = '\0';
10947 len = d - PL_tokenbuf;
10948
10949 #ifndef PERL_STRICT_CR
10950 d = (char *) memchr(s, '\r', PL_bufend - s);
10951 if (d) {
10952 char * const olds = s;
10953 s = d;
10954 while (s < PL_bufend) {
10955 if (*s == '\r') {
10956 *d++ = '\n';
10957 if (*++s == '\n')
10958 s++;
10959 }
10960 else if (*s == '\n' && s[1] == '\r') { /* \015\013 on a mac? */
10961 *d++ = *s++;
10962 s++;
10963 }
10964 else
10965 *d++ = *s++;
10966 }
10967 *d = '\0';
10968 PL_bufend = d;
10969 SvCUR_set(PL_linestr, PL_bufend - SvPVX_const(PL_linestr));
10970 s = olds;
10971 }
10972 #endif
10973
10974 tmpstr = newSV_type(SVt_PVIV);
10975 if (term == '\'') {
10976 op_type = OP_CONST;
10977 SvIV_set(tmpstr, -1);
10978 }
10979 else if (term == '`') {
10980 op_type = OP_BACKTICK;
10981 SvIV_set(tmpstr, '\\');
10982 }
10983
10984 PL_multi_start = origline + 1 + PL_parser->herelines;
10985 PL_multi_open = PL_multi_close = '<';
10986
10987 /* inside a string eval or quote-like operator */
10988 if (!infile || PL_lex_inwhat) {
10989 SV *linestr;
10990 char *bufend;
10991 char * const olds = s;
10992 PERL_CONTEXT * const cx = CX_CUR();
10993 /* These two fields are not set until an inner lexing scope is
10994 entered. But we need them set here. */
10995 shared->ls_bufptr = s;
10996 shared->ls_linestr = PL_linestr;
10997
10998 if (PL_lex_inwhat) {
10999 /* Look for a newline. If the current buffer does not have one,
11000 peek into the line buffer of the parent lexing scope, going
11001 up as many levels as necessary to find one with a newline
11002 after bufptr.
11003 */
11004 while (!(s = (char *)memchr(
11005 (void *)shared->ls_bufptr, '\n',
11006 SvEND(shared->ls_linestr)-shared->ls_bufptr
11007 )))
11008 {
11009 shared = shared->ls_prev;
11010 /* shared is only null if we have gone beyond the outermost
11011 lexing scope. In a file, we will have broken out of the
11012 loop in the previous iteration. In an eval, the string buf-
11013 fer ends with "\n;", so the while condition above will have
11014 evaluated to false. So shared can never be null. Or so you
11015 might think. Odd syntax errors like s;@{<<; can gobble up
11016 the implicit semicolon at the end of a flie, causing the
11017 file handle to be closed even when we are not in a string
11018 eval. So shared may be null in that case.
11019 (Closing '>>}' here to balance the earlier open brace for
11020 editors that look for matched pairs.) */
11021 if (UNLIKELY(!shared))
11022 goto interminable;
11023 /* A LEXSHARED struct with a null ls_prev pointer is the outer-
11024 most lexing scope. In a file, shared->ls_linestr at that
11025 level is just one line, so there is no body to steal. */
11026 if (infile && !shared->ls_prev) {
11027 s = olds;
11028 goto streaming;
11029 }
11030 }
11031 }
11032 else { /* eval or we've already hit EOF */
11033 s = (char*)memchr((void*)s, '\n', PL_bufend - s);
11034 if (!s)
11035 goto interminable;
11036 }
11037
11038 linestr = shared->ls_linestr;
11039 bufend = SvEND(linestr);
11040 d = s;
11041 if (indented) {
11042 char *myolds = s;
11043
11044 while (s < bufend - len + 1) {
11045 if (*s++ == '\n')
11046 ++PL_parser->herelines;
11047
11048 if (memEQ(s, PL_tokenbuf + 1, len - 1)) {
11049 char *backup = s;
11050 indent_len = 0;
11051
11052 /* Only valid if it's preceded by whitespace only */
11053 while (backup != myolds && --backup >= myolds) {
11054 if (! SPACE_OR_TAB(*backup)) {
11055 break;
11056 }
11057 indent_len++;
11058 }
11059
11060 /* No whitespace or all! */
11061 if (backup == s || *backup == '\n') {
11062 Newx(indent, indent_len + 1, char);
11063 memcpy(indent, backup + 1, indent_len);
11064 indent[indent_len] = 0;
11065 s--; /* before our delimiter */
11066 PL_parser->herelines--; /* this line doesn't count */
11067 break;
11068 }
11069 }
11070 }
11071 }
11072 else {
11073 while (s < bufend - len + 1
11074 && memNE(s,PL_tokenbuf,len) )
11075 {
11076 if (*s++ == '\n')
11077 ++PL_parser->herelines;
11078 }
11079 }
11080
11081 if (s >= bufend - len + 1) {
11082 goto interminable;
11083 }
11084
11085 sv_setpvn_fresh(tmpstr,d+1,s-d);
11086 s += len - 1;
11087 /* the preceding stmt passes a newline */
11088 PL_parser->herelines++;
11089
11090 /* s now points to the newline after the heredoc terminator.
11091 d points to the newline before the body of the heredoc.
11092 */
11093
11094 /* We are going to modify linestr in place here, so set
11095 aside copies of the string if necessary for re-evals or
11096 (caller $n)[6]. */
11097 /* See the Paranoia note in case LEX_INTERPEND in yylex, for why we
11098 check shared->re_eval_str. */
11099 if (shared->re_eval_start || shared->re_eval_str) {
11100 /* Set aside the rest of the regexp */
11101 if (!shared->re_eval_str)
11102 shared->re_eval_str =
11103 newSVpvn(shared->re_eval_start,
11104 bufend - shared->re_eval_start);
11105 shared->re_eval_start -= s-d;
11106 }
11107
11108 if (cxstack_ix >= 0
11109 && CxTYPE(cx) == CXt_EVAL
11110 && CxOLD_OP_TYPE(cx) == OP_ENTEREVAL
11111 && cx->blk_eval.cur_text == linestr)
11112 {
11113 cx->blk_eval.cur_text = newSVsv(linestr);
11114 cx->blk_u16 |= 0x40; /* indicate cur_text is ref counted */
11115 }
11116
11117 /* Copy everything from s onwards back to d. */
11118 Move(s,d,bufend-s + 1,char);
11119 SvCUR_set(linestr, SvCUR(linestr) - (s-d));
11120 /* Setting PL_bufend only applies when we have not dug deeper
11121 into other scopes, because sublex_done sets PL_bufend to
11122 SvEND(PL_linestr). */
11123 if (shared == PL_parser->lex_shared)
11124 PL_bufend = SvEND(linestr);
11125 s = olds;
11126 }
11127 else {
11128 SV *linestr_save;
11129 char *oldbufptr_save;
11130 char *oldoldbufptr_save;
11131 streaming:
11132 sv_grow_fresh(tmpstr, 80);
11133 SvPVCLEAR_FRESH(tmpstr); /* avoid "uninitialized" warning */
11134 term = PL_tokenbuf[1];
11135 len--;
11136 linestr_save = PL_linestr; /* must restore this afterwards */
11137 d = s; /* and this */
11138 oldbufptr_save = PL_oldbufptr;
11139 oldoldbufptr_save = PL_oldoldbufptr;
11140 PL_linestr = newSVpvs("");
11141 PL_bufend = SvPVX(PL_linestr);
11142
11143 while (1) {
11144 PL_bufptr = PL_bufend;
11145 CopLINE_set(PL_curcop,
11146 origline + 1 + PL_parser->herelines);
11147
11148 if ( !lex_next_chunk(LEX_NO_TERM)
11149 && (!SvCUR(tmpstr) || SvEND(tmpstr)[-1] != '\n'))
11150 {
11151 /* Simply freeing linestr_save might seem simpler here, as it
11152 does not matter what PL_linestr points to, since we are
11153 about to croak; but in a quote-like op, linestr_save
11154 will have been prospectively freed already, via
11155 SAVEFREESV(PL_linestr) in sublex_push, so it's easier to
11156 restore PL_linestr. */
11157 SvREFCNT_dec_NN(PL_linestr);
11158 PL_linestr = linestr_save;
11159 PL_oldbufptr = oldbufptr_save;
11160 PL_oldoldbufptr = oldoldbufptr_save;
11161 goto interminable;
11162 }
11163
11164 CopLINE_set(PL_curcop, origline);
11165
11166 if (!SvCUR(PL_linestr) || PL_bufend[-1] != '\n') {
11167 s = lex_grow_linestr(SvLEN(PL_linestr) + 3);
11168 /* ^That should be enough to avoid this needing to grow: */
11169 sv_catpvs(PL_linestr, "\n\0");
11170 assert(s == SvPVX(PL_linestr));
11171 PL_bufend = SvEND(PL_linestr);
11172 }
11173
11174 s = PL_bufptr;
11175 PL_parser->herelines++;
11176 PL_last_lop = PL_last_uni = NULL;
11177
11178 #ifndef PERL_STRICT_CR
11179 if (PL_bufend - PL_linestart >= 2) {
11180 if ( (PL_bufend[-2] == '\r' && PL_bufend[-1] == '\n')
11181 || (PL_bufend[-2] == '\n' && PL_bufend[-1] == '\r'))
11182 {
11183 PL_bufend[-2] = '\n';
11184 PL_bufend--;
11185 SvCUR_set(PL_linestr, PL_bufend - SvPVX_const(PL_linestr));
11186 }
11187 else if (PL_bufend[-1] == '\r')
11188 PL_bufend[-1] = '\n';
11189 }
11190 else if (PL_bufend - PL_linestart == 1 && PL_bufend[-1] == '\r')
11191 PL_bufend[-1] = '\n';
11192 #endif
11193
11194 if (indented && (PL_bufend-s) >= len) {
11195 char * found = ninstr(s, PL_bufend, (PL_tokenbuf + 1), (PL_tokenbuf +1 + len));
11196
11197 if (found) {
11198 char *backup = found;
11199 indent_len = 0;
11200
11201 /* Only valid if it's preceded by whitespace only */
11202 while (backup != s && --backup >= s) {
11203 if (! SPACE_OR_TAB(*backup)) {
11204 break;
11205 }
11206 indent_len++;
11207 }
11208
11209 /* All whitespace or none! */
11210 if (backup == found || SPACE_OR_TAB(*backup)) {
11211 Newx(indent, indent_len + 1, char);
11212 memcpy(indent, backup, indent_len);
11213 indent[indent_len] = 0;
11214 SvREFCNT_dec(PL_linestr);
11215 PL_linestr = linestr_save;
11216 PL_linestart = SvPVX(linestr_save);
11217 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
11218 PL_oldbufptr = oldbufptr_save;
11219 PL_oldoldbufptr = oldoldbufptr_save;
11220 s = d;
11221 break;
11222 }
11223 }
11224
11225 /* Didn't find it */
11226 sv_catsv(tmpstr,PL_linestr);
11227 }
11228 else {
11229 if (*s == term && PL_bufend-s >= len
11230 && memEQ(s,PL_tokenbuf + 1,len))
11231 {
11232 SvREFCNT_dec(PL_linestr);
11233 PL_linestr = linestr_save;
11234 PL_linestart = SvPVX(linestr_save);
11235 PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr);
11236 PL_oldbufptr = oldbufptr_save;
11237 PL_oldoldbufptr = oldoldbufptr_save;
11238 s = d;
11239 break;
11240 }
11241 else {
11242 sv_catsv(tmpstr,PL_linestr);
11243 }
11244 }
11245 } /* while (1) */
11246 }
11247
11248 PL_multi_end = origline + PL_parser->herelines;
11249
11250 if (indented && indent) {
11251 STRLEN linecount = 1;
11252 STRLEN herelen = SvCUR(tmpstr);
11253 char *ss = SvPVX(tmpstr);
11254 char *se = ss + herelen;
11255 SV *newstr = newSV(herelen+1);
11256 SvPOK_on(newstr);
11257
11258 /* Trim leading whitespace */
11259 while (ss < se) {
11260 /* newline only? Copy and move on */
11261 if (*ss == '\n') {
11262 sv_catpvs(newstr,"\n");
11263 ss++;
11264 linecount++;
11265
11266 /* Found our indentation? Strip it */
11267 }
11268 else if (se - ss >= indent_len
11269 && memEQ(ss, indent, indent_len))
11270 {
11271 STRLEN le = 0;
11272 ss += indent_len;
11273
11274 while ((ss + le) < se && *(ss + le) != '\n')
11275 le++;
11276
11277 sv_catpvn(newstr, ss, le);
11278 ss += le;
11279
11280 /* Line doesn't begin with our indentation? Croak */
11281 }
11282 else {
11283 Safefree(indent);
11284 Perl_croak(aTHX_
11285 "Indentation on line %d of here-doc doesn't match delimiter",
11286 (int)linecount
11287 );
11288 }
11289 } /* while */
11290
11291 /* avoid sv_setsv() as we don't want to COW here */
11292 sv_setpvn(tmpstr,SvPVX(newstr),SvCUR(newstr));
11293 Safefree(indent);
11294 SvREFCNT_dec_NN(newstr);
11295 }
11296
11297 if (SvCUR(tmpstr) + 5 < SvLEN(tmpstr)) {
11298 SvPV_shrink_to_cur(tmpstr);
11299 }
11300
11301 if (!IN_BYTES) {
11302 if (UTF && is_utf8_string((U8*)SvPVX_const(tmpstr), SvCUR(tmpstr)))
11303 SvUTF8_on(tmpstr);
11304 }
11305
11306 PL_lex_stuff = tmpstr;
11307 pl_yylval.ival = op_type;
11308 return s;
11309
11310 interminable:
11311 if (indent)
11312 Safefree(indent);
11313 SvREFCNT_dec(tmpstr);
11314 CopLINE_set(PL_curcop, origline);
11315 missingterm(PL_tokenbuf + 1, sizeof(PL_tokenbuf) - 1);
11316 }
11317
11318
11319 /* scan_inputsymbol
11320 takes: position of first '<' in input buffer
11321 returns: position of first char following the matching '>' in
11322 input buffer
11323 side-effects: pl_yylval and lex_op are set.
11324
11325 This code handles:
11326
11327 <> read from ARGV
11328 <<>> read from ARGV without magic open
11329 <FH> read from filehandle
11330 <pkg::FH> read from package qualified filehandle
11331 <pkg'FH> read from package qualified filehandle
11332 <$fh> read from filehandle in $fh
11333 <*.h> filename glob
11334
11335 */
11336
11337 STATIC char *
S_scan_inputsymbol(pTHX_ char * start)11338 S_scan_inputsymbol(pTHX_ char *start)
11339 {
11340 char *s = start; /* current position in buffer */
11341 char *end;
11342 I32 len;
11343 bool nomagicopen = FALSE;
11344 char *d = PL_tokenbuf; /* start of temp holding space */
11345 const char * const e = PL_tokenbuf + sizeof PL_tokenbuf; /* end of temp holding space */
11346
11347 PERL_ARGS_ASSERT_SCAN_INPUTSYMBOL;
11348
11349 end = (char *) memchr(s, '\n', PL_bufend - s);
11350 if (!end)
11351 end = PL_bufend;
11352 if (s[1] == '<' && s[2] == '>' && s[3] == '>') {
11353 nomagicopen = TRUE;
11354 *d = '\0';
11355 len = 0;
11356 s += 3;
11357 }
11358 else
11359 s = delimcpy(d, e, s + 1, end, '>', &len); /* extract until > */
11360
11361 /* die if we didn't have space for the contents of the <>,
11362 or if it didn't end, or if we see a newline
11363 */
11364
11365 if (len >= (I32)sizeof PL_tokenbuf)
11366 Perl_croak(aTHX_ "Excessively long <> operator");
11367 if (s >= end)
11368 Perl_croak(aTHX_ "Unterminated <> operator");
11369
11370 s++;
11371
11372 /* check for <$fh>
11373 Remember, only scalar variables are interpreted as filehandles by
11374 this code. Anything more complex (e.g., <$fh{$num}>) will be
11375 treated as a glob() call.
11376 This code makes use of the fact that except for the $ at the front,
11377 a scalar variable and a filehandle look the same.
11378 */
11379 if (*d == '$' && d[1]) d++;
11380
11381 /* allow <Pkg'VALUE> or <Pkg::VALUE> */
11382 while (isWORDCHAR_lazy_if_safe(d, e, UTF) || *d == '\'' || *d == ':') {
11383 d += UTF ? UTF8SKIP(d) : 1;
11384 }
11385
11386 /* If we've tried to read what we allow filehandles to look like, and
11387 there's still text left, then it must be a glob() and not a getline.
11388 Use scan_str to pull out the stuff between the <> and treat it
11389 as nothing more than a string.
11390 */
11391
11392 if (d - PL_tokenbuf != len) {
11393 pl_yylval.ival = OP_GLOB;
11394 s = scan_str(start,FALSE,FALSE,FALSE,NULL);
11395 if (!s)
11396 Perl_croak(aTHX_ "Glob not terminated");
11397 return s;
11398 }
11399 else {
11400 bool readline_overridden = FALSE;
11401 GV *gv_readline;
11402 /* we're in a filehandle read situation */
11403 d = PL_tokenbuf;
11404
11405 /* turn <> into <ARGV> */
11406 if (!len)
11407 Copy("ARGV",d,5,char);
11408
11409 /* Check whether readline() is overridden */
11410 if ((gv_readline = gv_override("readline",8)))
11411 readline_overridden = TRUE;
11412
11413 /* if <$fh>, create the ops to turn the variable into a
11414 filehandle
11415 */
11416 if (*d == '$') {
11417 /* try to find it in the pad for this block, otherwise find
11418 add symbol table ops
11419 */
11420 const PADOFFSET tmp = pad_findmy_pvn(d, len, 0);
11421 if (tmp != NOT_IN_PAD) {
11422 if (PAD_COMPNAME_FLAGS_isOUR(tmp)) {
11423 HV * const stash = PAD_COMPNAME_OURSTASH(tmp);
11424 HEK * const stashname = HvNAME_HEK(stash);
11425 SV * const sym = newSVhek_mortal(stashname);
11426 sv_catpvs(sym, "::");
11427 sv_catpv(sym, d+1);
11428 d = SvPVX(sym);
11429 goto intro_sym;
11430 }
11431 else {
11432 OP * const o = newPADxVOP(OP_PADSV, 0, tmp);
11433 PL_lex_op = readline_overridden
11434 ? newUNOP(OP_ENTERSUB, OPf_STACKED,
11435 op_append_elem(OP_LIST, o,
11436 newCVREF(0, newGVOP(OP_GV,0,gv_readline))))
11437 : newUNOP(OP_READLINE, 0, o);
11438 }
11439 }
11440 else {
11441 GV *gv;
11442 ++d;
11443 intro_sym:
11444 gv = gv_fetchpv(d,
11445 GV_ADDMULTI | ( UTF ? SVf_UTF8 : 0 ),
11446 SVt_PV);
11447 PL_lex_op = readline_overridden
11448 ? newUNOP(OP_ENTERSUB, OPf_STACKED,
11449 op_append_elem(OP_LIST,
11450 newUNOP(OP_RV2SV, 0, newGVOP(OP_GV, 0, gv)),
11451 newCVREF(0, newGVOP(OP_GV, 0, gv_readline))))
11452 : newUNOP(OP_READLINE, 0,
11453 newUNOP(OP_RV2SV, 0,
11454 newGVOP(OP_GV, 0, gv)));
11455 }
11456 /* we created the ops in PL_lex_op, so make pl_yylval.ival a null op */
11457 pl_yylval.ival = OP_NULL;
11458 }
11459
11460 /* If it's none of the above, it must be a literal filehandle
11461 (<Foo::BAR> or <FOO>) so build a simple readline OP */
11462 else {
11463 GV * const gv = gv_fetchpv(d, GV_ADD | ( UTF ? SVf_UTF8 : 0 ), SVt_PVIO);
11464 PL_lex_op = readline_overridden
11465 ? newUNOP(OP_ENTERSUB, OPf_STACKED,
11466 op_append_elem(OP_LIST,
11467 newGVOP(OP_GV, 0, gv),
11468 newCVREF(0, newGVOP(OP_GV, 0, gv_readline))))
11469 : newUNOP(OP_READLINE, nomagicopen ? OPf_SPECIAL : 0, newGVOP(OP_GV, 0, gv));
11470 pl_yylval.ival = OP_NULL;
11471
11472 /* leave the token generation above to avoid confusing the parser */
11473 if (!FEATURE_BAREWORD_FILEHANDLES_IS_ENABLED) {
11474 no_bareword_filehandle(d);
11475 }
11476 }
11477 }
11478
11479 return s;
11480 }
11481
11482
11483 /* scan_str
11484 takes:
11485 start position in buffer
11486 keep_bracketed_quoted preserve \ quoting of embedded delimiters, but
11487 only if they are of the open/close form
11488 keep_delims preserve the delimiters around the string
11489 re_reparse compiling a run-time /(?{})/:
11490 collapse // to /, and skip encoding src
11491 delimp if non-null, this is set to the position of
11492 the closing delimiter, or just after it if
11493 the closing and opening delimiters differ
11494 (i.e., the opening delimiter of a substitu-
11495 tion replacement)
11496 returns: position to continue reading from buffer
11497 side-effects: multi_start, multi_close, lex_repl or lex_stuff, and
11498 updates the read buffer.
11499
11500 This subroutine pulls a string out of the input. It is called for:
11501 q single quotes q(literal text)
11502 ' single quotes 'literal text'
11503 qq double quotes qq(interpolate $here please)
11504 " double quotes "interpolate $here please"
11505 qx backticks qx(/bin/ls -l)
11506 ` backticks `/bin/ls -l`
11507 qw quote words @EXPORT_OK = qw( func() $spam )
11508 m// regexp match m/this/
11509 s/// regexp substitute s/this/that/
11510 tr/// string transliterate tr/this/that/
11511 y/// string transliterate y/this/that/
11512 ($*@) sub prototypes sub foo ($)
11513 (stuff) sub attr parameters sub foo : attr(stuff)
11514 <> readline or globs <FOO>, <>, <$fh>, or <*.c>
11515
11516 In most of these cases (all but <>, patterns and transliterate)
11517 yylex() calls scan_str(). m// makes yylex() call scan_pat() which
11518 calls scan_str(). s/// makes yylex() call scan_subst() which calls
11519 scan_str(). tr/// and y/// make yylex() call scan_trans() which
11520 calls scan_str().
11521
11522 It skips whitespace before the string starts, and treats the first
11523 character as the delimiter. If the delimiter is one of ([{< then
11524 the corresponding "close" character )]}> is used as the closing
11525 delimiter. It allows quoting of delimiters, and if the string has
11526 balanced delimiters ([{<>}]) it allows nesting.
11527
11528 On success, the SV with the resulting string is put into lex_stuff or,
11529 if that is already non-NULL, into lex_repl. The second case occurs only
11530 when parsing the RHS of the special constructs s/// and tr/// (y///).
11531 For convenience, the terminating delimiter character is stuffed into
11532 SvIVX of the SV.
11533 */
11534
11535 char *
Perl_scan_str(pTHX_ char * start,int keep_bracketed_quoted,int keep_delims,int re_reparse,char ** delimp)11536 Perl_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re_reparse,
11537 char **delimp
11538 )
11539 {
11540 SV *sv; /* scalar value: string */
11541 char *s = start; /* current position in the buffer */
11542 char *to; /* current position in the sv's data */
11543 int brackets = 1; /* bracket nesting level */
11544 bool d_is_utf8 = FALSE; /* is there any utf8 content? */
11545 UV open_delim_code; /* code point */
11546 char open_delim_str[UTF8_MAXBYTES+1];
11547 STRLEN delim_byte_len; /* each delimiter currently is the same number
11548 of bytes */
11549 line_t herelines;
11550
11551 /* The only non-UTF character that isn't a stand alone grapheme is
11552 * white-space, hence can't be a delimiter. */
11553 const char * non_grapheme_msg = "Use of unassigned code point or"
11554 " non-standalone grapheme for a delimiter"
11555 " is not allowed";
11556 PERL_ARGS_ASSERT_SCAN_STR;
11557
11558 /* skip space before the delimiter */
11559 if (isSPACE(*s)) { /* skipspace can change the buffer 's' is in, so
11560 'start' also has to change */
11561 s = start = skipspace(s);
11562 }
11563
11564 /* mark where we are, in case we need to report errors */
11565 CLINE;
11566
11567 /* after skipping whitespace, the next character is the delimiter */
11568 if (! UTF || UTF8_IS_INVARIANT(*s)) {
11569 open_delim_code = (U8) *s;
11570 open_delim_str[0] = *s;
11571 delim_byte_len = 1;
11572 }
11573 else {
11574 open_delim_code = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend,
11575 &delim_byte_len);
11576 if (UNLIKELY(! is_grapheme((U8 *) start,
11577 (U8 *) s,
11578 (U8 *) PL_bufend,
11579 open_delim_code)))
11580 {
11581 yyerror(non_grapheme_msg);
11582 }
11583
11584 Copy(s, open_delim_str, delim_byte_len, char);
11585 }
11586 open_delim_str[delim_byte_len] = '\0'; /* Only for safety */
11587
11588
11589 /* mark where we are */
11590 PL_multi_start = CopLINE(PL_curcop);
11591 PL_multi_open = open_delim_code;
11592 herelines = PL_parser->herelines;
11593
11594 const char * legal_paired_opening_delims;
11595 const char * legal_paired_closing_delims;
11596 const char * deprecated_opening_delims;
11597 if (FEATURE_MORE_DELIMS_IS_ENABLED) {
11598 if (UTF) {
11599 legal_paired_opening_delims = EXTRA_OPENING_UTF8_BRACKETS;
11600 legal_paired_closing_delims = EXTRA_CLOSING_UTF8_BRACKETS;
11601
11602 /* We are deprecating using a closing delimiter as the opening, in
11603 * case we want in the future to accept them reversed. The string
11604 * may include ones that are legal, but the code below won't look
11605 * at this string unless it didn't find a legal opening one */
11606 deprecated_opening_delims = DEPRECATED_OPENING_UTF8_BRACKETS;
11607 }
11608 else {
11609 legal_paired_opening_delims = EXTRA_OPENING_NON_UTF8_BRACKETS;
11610 legal_paired_closing_delims = EXTRA_CLOSING_NON_UTF8_BRACKETS;
11611 deprecated_opening_delims = DEPRECATED_OPENING_NON_UTF8_BRACKETS;
11612 }
11613 }
11614 else {
11615 legal_paired_opening_delims = "([{<";
11616 legal_paired_closing_delims = ")]}>";
11617 deprecated_opening_delims = (UTF)
11618 ? DEPRECATED_OPENING_UTF8_BRACKETS
11619 : DEPRECATED_OPENING_NON_UTF8_BRACKETS;
11620 }
11621
11622 const char * legal_paired_opening_delims_end = legal_paired_opening_delims
11623 + strlen(legal_paired_opening_delims);
11624 const char * deprecated_delims_end = deprecated_opening_delims
11625 + strlen(deprecated_opening_delims);
11626
11627 const char * close_delim_str = open_delim_str;
11628 UV close_delim_code = open_delim_code;
11629
11630 /* If the delimiter has a mirror-image closing one, get it */
11631 const char *tmps = ninstr(legal_paired_opening_delims,
11632 legal_paired_opening_delims_end,
11633 open_delim_str, open_delim_str + delim_byte_len);
11634 if (tmps) {
11635 /* Here, there is a paired delimiter, and tmps points to its position
11636 in the string of the accepted opening paired delimiters. The
11637 corresponding position in the string of closing ones is the
11638 beginning of the paired mate. Both contain the same number of
11639 bytes. */
11640 close_delim_str = legal_paired_closing_delims
11641 + (tmps - legal_paired_opening_delims);
11642
11643 /* The list of paired delimiters contains all the ASCII ones that have
11644 * always been legal, and no other ASCIIs. Don't raise a message if
11645 * using one of these */
11646 if (! isASCII(open_delim_code)) {
11647 Perl_ck_warner_d(aTHX_
11648 packWARN(WARN_EXPERIMENTAL__EXTRA_PAIRED_DELIMITERS),
11649 "Use of '%" UTF8f "' is experimental as a string delimiter",
11650 UTF8fARG(UTF, delim_byte_len, open_delim_str));
11651 }
11652
11653 close_delim_code = (UTF)
11654 ? valid_utf8_to_uvchr((U8 *) close_delim_str, NULL)
11655 : * (U8 *) close_delim_str;
11656 }
11657 else { /* Here, the delimiter isn't paired, hence the close is the same as
11658 the open; and has already been set up. But make sure it isn't
11659 deprecated to use this particular delimiter, as we plan
11660 eventually to make it paired. */
11661 if (ninstr(deprecated_opening_delims, deprecated_delims_end,
11662 open_delim_str, open_delim_str + delim_byte_len))
11663 {
11664 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED__DELIMITER_WILL_BE_PAIRED),
11665 "Use of '%" UTF8f "' is deprecated as a string delimiter",
11666 UTF8fARG(UTF, delim_byte_len, open_delim_str));
11667 }
11668
11669 /* Note that a NUL may be used as a delimiter, and this happens when
11670 * delimiting an empty string, and no special handling for it is
11671 * needed, as ninstr() calls are used */
11672 }
11673
11674 PL_multi_close = close_delim_code;
11675
11676 if (PL_multi_open == PL_multi_close) {
11677 keep_bracketed_quoted = FALSE;
11678 }
11679
11680 /* create a new SV to hold the contents. 79 is the SV's initial length.
11681 What a random number. */
11682 sv = newSV_type(SVt_PVIV);
11683 sv_grow_fresh(sv, 79);
11684 SvIV_set(sv, close_delim_code);
11685 (void)SvPOK_only(sv); /* validate pointer */
11686
11687 /* move past delimiter and try to read a complete string */
11688 if (keep_delims)
11689 sv_catpvn(sv, s, delim_byte_len);
11690 s += delim_byte_len;
11691 for (;;) {
11692 /* extend sv if need be */
11693 SvGROW(sv, SvCUR(sv) + (PL_bufend - s) + 1);
11694 /* set 'to' to the next character in the sv's string */
11695 to = SvPVX(sv)+SvCUR(sv);
11696
11697 /* read until we run out of string, or we find the closing delimiter */
11698 while (s < PL_bufend) {
11699 /* embedded newlines increment the line count */
11700 if (*s == '\n' && !PL_rsfp && !PL_parser->filtered)
11701 COPLINE_INC_WITH_HERELINES;
11702
11703 /* backslashes can escape the closing delimiter */
11704 if ( *s == '\\' && s < PL_bufend - delim_byte_len
11705
11706 /* ... but not if the delimiter itself is a backslash */
11707 && close_delim_code != '\\')
11708 {
11709 /* Here, we have an escaping backslash. If we're supposed to
11710 * discard those that escape the closing delimiter, just
11711 * discard this one */
11712 if ( ! keep_bracketed_quoted
11713 && ( memEQ(s + 1, open_delim_str, delim_byte_len)
11714 || ( PL_multi_open == PL_multi_close
11715 && re_reparse && s[1] == '\\')
11716 || memEQ(s + 1, close_delim_str, delim_byte_len)))
11717 {
11718 s++;
11719 }
11720 else /* any other escapes are simply copied straight through */
11721 *to++ = *s++;
11722 }
11723 else if ( s < PL_bufend - (delim_byte_len - 1)
11724 && memEQ(s, close_delim_str, delim_byte_len)
11725 && --brackets <= 0)
11726 {
11727 /* Found unescaped closing delimiter, unnested if we care about
11728 * that; so are done.
11729 *
11730 * In the case of the opening and closing delimiters being
11731 * different, we have to deal with nesting; the conditional
11732 * above makes sure we don't get here until the nesting level,
11733 * 'brackets', is back down to zero. In the other case,
11734 * nesting isn't an issue, and 'brackets' never can get
11735 * incremented above 0, so will come here at the first closing
11736 * delimiter.
11737 *
11738 * Only grapheme delimiters are legal. */
11739 if ( UTF /* All Non-UTF-8's are graphemes */
11740 && UNLIKELY(! is_grapheme((U8 *) start,
11741 (U8 *) s,
11742 (U8 *) PL_bufend,
11743 close_delim_code)))
11744 {
11745 yyerror(non_grapheme_msg);
11746 }
11747
11748 break;
11749 }
11750 /* No nesting if open eq close */
11751 else if ( PL_multi_open != PL_multi_close
11752 && s < PL_bufend - (delim_byte_len - 1)
11753 && memEQ(s, open_delim_str, delim_byte_len))
11754 {
11755 brackets++;
11756 }
11757
11758 /* Here, still in the middle of the string; copy this character */
11759 if (! UTF || UTF8_IS_INVARIANT((U8) *s)) {
11760 *to++ = *s++;
11761 }
11762 else {
11763 size_t this_char_len = UTF8SKIP(s);
11764 Copy(s, to, this_char_len, char);
11765 s += this_char_len;
11766 to += this_char_len;
11767
11768 d_is_utf8 = TRUE;
11769 }
11770 } /* End of loop through buffer */
11771
11772 /* Here, found end of the string, OR ran out of buffer: terminate the
11773 * copied string and update the sv's end-of-string */
11774 *to = '\0';
11775 SvCUR_set(sv, to - SvPVX_const(sv));
11776
11777 /*
11778 * this next chunk reads more into the buffer if we're not done yet
11779 */
11780
11781 if (s < PL_bufend)
11782 break; /* handle case where we are done yet :-) */
11783
11784 #ifndef PERL_STRICT_CR
11785 if (to - SvPVX_const(sv) >= 2) {
11786 if ( (to[-2] == '\r' && to[-1] == '\n')
11787 || (to[-2] == '\n' && to[-1] == '\r'))
11788 {
11789 to[-2] = '\n';
11790 to--;
11791 SvCUR_set(sv, to - SvPVX_const(sv));
11792 }
11793 else if (to[-1] == '\r')
11794 to[-1] = '\n';
11795 }
11796 else if (to - SvPVX_const(sv) == 1 && to[-1] == '\r')
11797 to[-1] = '\n';
11798 #endif
11799
11800 /* if we're out of file, or a read fails, bail and reset the current
11801 line marker so we can report where the unterminated string began
11802 */
11803 COPLINE_INC_WITH_HERELINES;
11804 PL_bufptr = PL_bufend;
11805 if (!lex_next_chunk(0)) {
11806 ASSUME(sv);
11807 SvREFCNT_dec(sv);
11808 CopLINE_set(PL_curcop, (line_t)PL_multi_start);
11809 return NULL;
11810 }
11811 s = start = PL_bufptr;
11812 } /* End of infinite loop */
11813
11814 /* at this point, we have successfully read the delimited string */
11815
11816 if (keep_delims)
11817 sv_catpvn(sv, s, delim_byte_len);
11818 s += delim_byte_len;
11819
11820 if (d_is_utf8)
11821 SvUTF8_on(sv);
11822
11823 PL_multi_end = CopLINE(PL_curcop);
11824 CopLINE_set(PL_curcop, PL_multi_start);
11825 PL_parser->herelines = herelines;
11826
11827 /* if we allocated too much space, give some back */
11828 if (SvCUR(sv) + 5 < SvLEN(sv)) {
11829 SvLEN_set(sv, SvCUR(sv) + 1);
11830 SvPV_shrink_to_cur(sv);
11831 }
11832
11833 /* decide whether this is the first or second quoted string we've read
11834 for this op
11835 */
11836
11837 if (PL_lex_stuff)
11838 PL_parser->lex_sub_repl = sv;
11839 else
11840 PL_lex_stuff = sv;
11841 if (delimp) *delimp = PL_multi_open == PL_multi_close ? s-delim_byte_len : s;
11842 return s;
11843 }
11844
11845 /*
11846 scan_num
11847 takes: pointer to position in buffer
11848 returns: pointer to new position in buffer
11849 side-effects: builds ops for the constant in pl_yylval.op
11850
11851 Read a number in any of the formats that Perl accepts:
11852
11853 \d(_?\d)*(\.(\d(_?\d)*)?)?[Ee][\+\-]?(\d(_?\d)*) 12 12.34 12.
11854 \.\d(_?\d)*[Ee][\+\-]?(\d(_?\d)*) .34
11855 0b[01](_?[01])* binary integers
11856 0o?[0-7](_?[0-7])* octal integers
11857 0x[0-9A-Fa-f](_?[0-9A-Fa-f])* hexadecimal integers
11858 0x[0-9A-Fa-f](_?[0-9A-Fa-f])*(?:\.\d*)?p[+-]?[0-9]+ hexadecimal floats
11859
11860 Like most scan_ routines, it uses the PL_tokenbuf buffer to hold the
11861 thing it reads.
11862
11863 If it reads a number without a decimal point or an exponent, it will
11864 try converting the number to an integer and see if it can do so
11865 without loss of precision.
11866 */
11867
11868 char *
Perl_scan_num(pTHX_ const char * start,YYSTYPE * lvalp)11869 Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp)
11870 {
11871 const char *s = start; /* current position in buffer */
11872 char *d; /* destination in temp buffer */
11873 char *e; /* end of temp buffer */
11874 NV nv; /* number read, as a double */
11875 SV *sv = NULL; /* place to put the converted number */
11876 bool floatit; /* boolean: int or float? */
11877 const char *lastub = NULL; /* position of last underbar */
11878 static const char* const number_too_long = "Number too long";
11879 bool warned_about_underscore = 0;
11880 I32 shift; /* shift per digit for hex/oct/bin, hoisted here for fp */
11881 #define WARN_ABOUT_UNDERSCORE() \
11882 do { \
11883 if (!warned_about_underscore) { \
11884 warned_about_underscore = 1; \
11885 Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), \
11886 "Misplaced _ in number"); \
11887 } \
11888 } while(0)
11889 /* Hexadecimal floating point.
11890 *
11891 * In many places (where we have quads and NV is IEEE 754 double)
11892 * we can fit the mantissa bits of a NV into an unsigned quad.
11893 * (Note that UVs might not be quads even when we have quads.)
11894 * This will not work everywhere, though (either no quads, or
11895 * using long doubles), in which case we have to resort to NV,
11896 * which will probably mean horrible loss of precision due to
11897 * multiple fp operations. */
11898 bool hexfp = FALSE;
11899 int total_bits = 0;
11900 int significant_bits = 0;
11901 #if NVSIZE == 8 && defined(HAS_QUAD) && defined(Uquad_t)
11902 # define HEXFP_UQUAD
11903 Uquad_t hexfp_uquad = 0;
11904 int hexfp_frac_bits = 0;
11905 #else
11906 # define HEXFP_NV
11907 NV hexfp_nv = 0.0;
11908 #endif
11909 NV hexfp_mult = 1.0;
11910 UV high_non_zero = 0; /* highest digit */
11911 int non_zero_integer_digits = 0;
11912 bool new_octal = FALSE; /* octal with "0o" prefix */
11913
11914 PERL_ARGS_ASSERT_SCAN_NUM;
11915
11916 /* We use the first character to decide what type of number this is */
11917
11918 switch (*s) {
11919 default:
11920 Perl_croak(aTHX_ "panic: scan_num, *s=%d", *s);
11921
11922 /* if it starts with a 0, it could be an octal number, a decimal in
11923 0.13 disguise, or a hexadecimal number, or a binary number. */
11924 case '0':
11925 {
11926 /* variables:
11927 u holds the "number so far"
11928 overflowed was the number more than we can hold?
11929
11930 Shift is used when we add a digit. It also serves as an "are
11931 we in octal/hex/binary?" indicator to disallow hex characters
11932 when in octal mode.
11933 */
11934 NV n = 0.0;
11935 UV u = 0;
11936 bool overflowed = FALSE;
11937 bool just_zero = TRUE; /* just plain 0 or binary number? */
11938 bool has_digs = FALSE;
11939 static const NV nvshift[5] = { 1.0, 2.0, 4.0, 8.0, 16.0 };
11940 static const char* const bases[5] =
11941 { "", "binary", "", "octal", "hexadecimal" };
11942 static const char* const Bases[5] =
11943 { "", "Binary", "", "Octal", "Hexadecimal" };
11944 static const char* const maxima[5] =
11945 { "",
11946 "0b11111111111111111111111111111111",
11947 "",
11948 "037777777777",
11949 "0xffffffff" };
11950
11951 /* check for hex */
11952 if (isALPHA_FOLD_EQ(s[1], 'x')) {
11953 shift = 4;
11954 s += 2;
11955 just_zero = FALSE;
11956 } else if (isALPHA_FOLD_EQ(s[1], 'b')) {
11957 shift = 1;
11958 s += 2;
11959 just_zero = FALSE;
11960 }
11961 /* check for a decimal in disguise */
11962 else if (s[1] == '.' || isALPHA_FOLD_EQ(s[1], 'e'))
11963 goto decimal;
11964 /* so it must be octal */
11965 else {
11966 shift = 3;
11967 s++;
11968 if (isALPHA_FOLD_EQ(*s, 'o')) {
11969 s++;
11970 just_zero = FALSE;
11971 new_octal = TRUE;
11972 }
11973 }
11974
11975 if (*s == '_') {
11976 WARN_ABOUT_UNDERSCORE();
11977 lastub = s++;
11978 }
11979
11980 /* read the rest of the number */
11981 for (;;) {
11982 /* x is used in the overflow test,
11983 b is the digit we're adding on. */
11984 UV x, b;
11985
11986 switch (*s) {
11987
11988 /* if we don't mention it, we're done */
11989 default:
11990 goto out;
11991
11992 /* _ are ignored -- but warned about if consecutive */
11993 case '_':
11994 if (lastub && s == lastub + 1)
11995 WARN_ABOUT_UNDERSCORE();
11996 lastub = s++;
11997 break;
11998
11999 /* 8 and 9 are not octal */
12000 case '8': case '9':
12001 if (shift == 3)
12002 yyerror(Perl_form(aTHX_ "Illegal octal digit '%c'", *s));
12003 /* FALLTHROUGH */
12004
12005 /* octal digits */
12006 case '2': case '3': case '4':
12007 case '5': case '6': case '7':
12008 if (shift == 1)
12009 yyerror(Perl_form(aTHX_ "Illegal binary digit '%c'", *s));
12010 /* FALLTHROUGH */
12011
12012 case '0': case '1':
12013 b = *s++ & 15; /* ASCII digit -> value of digit */
12014 goto digit;
12015
12016 /* hex digits */
12017 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
12018 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
12019 /* make sure they said 0x */
12020 if (shift != 4)
12021 goto out;
12022 b = (*s++ & 7) + 9;
12023
12024 /* Prepare to put the digit we have onto the end
12025 of the number so far. We check for overflows.
12026 */
12027
12028 digit:
12029 just_zero = FALSE;
12030 has_digs = TRUE;
12031 if (!overflowed) {
12032 assert(shift >= 0);
12033 x = u << shift; /* make room for the digit */
12034
12035 total_bits += shift;
12036
12037 if ((x >> shift) != u
12038 && !(PL_hints & HINT_NEW_BINARY)) {
12039 overflowed = TRUE;
12040 n = (NV) u;
12041 Perl_ck_warner_d(aTHX_ packWARN(WARN_OVERFLOW),
12042 "Integer overflow in %s number",
12043 bases[shift]);
12044 } else
12045 u = x | b; /* add the digit to the end */
12046 }
12047 if (overflowed) {
12048 n *= nvshift[shift];
12049 /* If an NV has not enough bits in its
12050 * mantissa to represent an UV this summing of
12051 * small low-order numbers is a waste of time
12052 * (because the NV cannot preserve the
12053 * low-order bits anyway): we could just
12054 * remember when did we overflow and in the
12055 * end just multiply n by the right
12056 * amount. */
12057 n += (NV) b;
12058 }
12059
12060 if (high_non_zero == 0 && b > 0)
12061 high_non_zero = b;
12062
12063 if (high_non_zero)
12064 non_zero_integer_digits++;
12065
12066 /* this could be hexfp, but peek ahead
12067 * to avoid matching ".." */
12068 if (UNLIKELY(HEXFP_PEEK(s))) {
12069 goto out;
12070 }
12071
12072 break;
12073 }
12074 }
12075
12076 /* if we get here, we had success: make a scalar value from
12077 the number.
12078 */
12079 out:
12080
12081 /* final misplaced underbar check */
12082 if (s[-1] == '_')
12083 WARN_ABOUT_UNDERSCORE();
12084
12085 if (UNLIKELY(HEXFP_PEEK(s))) {
12086 /* Do sloppy (on the underbars) but quick detection
12087 * (and value construction) for hexfp, the decimal
12088 * detection will shortly be more thorough with the
12089 * underbar checks. */
12090 const char* h = s;
12091 significant_bits = non_zero_integer_digits * shift;
12092 #ifdef HEXFP_UQUAD
12093 hexfp_uquad = u;
12094 #else /* HEXFP_NV */
12095 hexfp_nv = u;
12096 #endif
12097 /* Ignore the leading zero bits of
12098 * the high (first) non-zero digit. */
12099 if (high_non_zero) {
12100 if (high_non_zero < 0x8)
12101 significant_bits--;
12102 if (high_non_zero < 0x4)
12103 significant_bits--;
12104 if (high_non_zero < 0x2)
12105 significant_bits--;
12106 }
12107
12108 if (*h == '.') {
12109 #ifdef HEXFP_NV
12110 NV nv_mult = 1.0;
12111 #endif
12112 bool accumulate = TRUE;
12113 U8 b = 0; /* silence compiler warning */
12114 int lim = 1 << shift;
12115 for (h++; ((isXDIGIT(*h) && (b = XDIGIT_VALUE(*h)) < lim) ||
12116 *h == '_'); h++) {
12117 if (isXDIGIT(*h)) {
12118 significant_bits += shift;
12119 #ifdef HEXFP_UQUAD
12120 if (accumulate) {
12121 if (significant_bits < NV_MANT_DIG) {
12122 /* We are in the long "run" of xdigits,
12123 * accumulate the full four bits. */
12124 assert(shift >= 0);
12125 hexfp_uquad <<= shift;
12126 hexfp_uquad |= b;
12127 hexfp_frac_bits += shift;
12128 } else if (significant_bits - shift < NV_MANT_DIG) {
12129 /* We are at a hexdigit either at,
12130 * or straddling, the edge of mantissa.
12131 * We will try grabbing as many as
12132 * possible bits. */
12133 int tail =
12134 significant_bits - NV_MANT_DIG;
12135 if (tail <= 0)
12136 tail += shift;
12137 assert(tail >= 0);
12138 hexfp_uquad <<= tail;
12139 assert((shift - tail) >= 0);
12140 hexfp_uquad |= b >> (shift - tail);
12141 hexfp_frac_bits += tail;
12142
12143 /* Ignore the trailing zero bits
12144 * of the last non-zero xdigit.
12145 *
12146 * The assumption here is that if
12147 * one has input of e.g. the xdigit
12148 * eight (0x8), there is only one
12149 * bit being input, not the full
12150 * four bits. Conversely, if one
12151 * specifies a zero xdigit, the
12152 * assumption is that one really
12153 * wants all those bits to be zero. */
12154 if (b) {
12155 if ((b & 0x1) == 0x0) {
12156 significant_bits--;
12157 if ((b & 0x2) == 0x0) {
12158 significant_bits--;
12159 if ((b & 0x4) == 0x0) {
12160 significant_bits--;
12161 }
12162 }
12163 }
12164 }
12165
12166 accumulate = FALSE;
12167 }
12168 } else {
12169 /* Keep skipping the xdigits, and
12170 * accumulating the significant bits,
12171 * but do not shift the uquad
12172 * (which would catastrophically drop
12173 * high-order bits) or accumulate the
12174 * xdigits anymore. */
12175 }
12176 #else /* HEXFP_NV */
12177 if (accumulate) {
12178 nv_mult /= nvshift[shift];
12179 if (nv_mult > 0.0)
12180 hexfp_nv += b * nv_mult;
12181 else
12182 accumulate = FALSE;
12183 }
12184 #endif
12185 }
12186 if (significant_bits >= NV_MANT_DIG)
12187 accumulate = FALSE;
12188 }
12189 }
12190
12191 if ((total_bits > 0 || significant_bits > 0) &&
12192 isALPHA_FOLD_EQ(*h, 'p')) {
12193 bool negexp = FALSE;
12194 h++;
12195 if (*h == '+')
12196 h++;
12197 else if (*h == '-') {
12198 negexp = TRUE;
12199 h++;
12200 }
12201 if (isDIGIT(*h)) {
12202 I32 hexfp_exp = 0;
12203 while (isDIGIT(*h) || *h == '_') {
12204 if (isDIGIT(*h)) {
12205 hexfp_exp *= 10;
12206 hexfp_exp += *h - '0';
12207 #ifdef NV_MIN_EXP
12208 if (negexp
12209 && -hexfp_exp < NV_MIN_EXP - 1) {
12210 /* NOTE: this means that the exponent
12211 * underflow warning happens for
12212 * the IEEE 754 subnormals (denormals),
12213 * because DBL_MIN_EXP etc are the lowest
12214 * possible binary (or, rather, DBL_RADIX-base)
12215 * exponent for normals, not subnormals.
12216 *
12217 * This may or may not be a good thing. */
12218 Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
12219 "Hexadecimal float: exponent underflow");
12220 break;
12221 }
12222 #endif
12223 #ifdef NV_MAX_EXP
12224 if (!negexp
12225 && hexfp_exp > NV_MAX_EXP - 1) {
12226 Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
12227 "Hexadecimal float: exponent overflow");
12228 break;
12229 }
12230 #endif
12231 }
12232 h++;
12233 }
12234 if (negexp)
12235 hexfp_exp = -hexfp_exp;
12236 #ifdef HEXFP_UQUAD
12237 hexfp_exp -= hexfp_frac_bits;
12238 #endif
12239 hexfp_mult = Perl_pow(2.0, hexfp_exp);
12240 hexfp = TRUE;
12241 goto decimal;
12242 }
12243 }
12244 }
12245
12246 if (!just_zero && !has_digs) {
12247 /* 0x, 0o or 0b with no digits, treat it as an error.
12248 Originally this backed up the parse before the b or
12249 x, but that has the potential for silent changes in
12250 behaviour, like for: "0x.3" and "0x+$foo".
12251 */
12252 const char *d = s;
12253 char *oldbp = PL_bufptr;
12254 if (*d) ++d; /* so the user sees the bad non-digit */
12255 PL_bufptr = (char *)d; /* so yyerror reports the context */
12256 yyerror(Perl_form(aTHX_ "No digits found for %s literal",
12257 bases[shift]));
12258 PL_bufptr = oldbp;
12259 }
12260
12261 if (overflowed) {
12262 if (n > 4294967295.0)
12263 Perl_ck_warner(aTHX_ packWARN(WARN_PORTABLE),
12264 "%s number > %s non-portable",
12265 Bases[shift],
12266 new_octal ? "0o37777777777" : maxima[shift]);
12267 sv = newSVnv(n);
12268 }
12269 else {
12270 #if UVSIZE > 4
12271 if (u > 0xffffffff)
12272 Perl_ck_warner(aTHX_ packWARN(WARN_PORTABLE),
12273 "%s number > %s non-portable",
12274 Bases[shift],
12275 new_octal ? "0o37777777777" : maxima[shift]);
12276 #endif
12277 sv = newSVuv(u);
12278 }
12279 if (just_zero && (PL_hints & HINT_NEW_INTEGER))
12280 sv = new_constant(start, s - start, "integer",
12281 sv, NULL, NULL, 0, NULL);
12282 else if (PL_hints & HINT_NEW_BINARY)
12283 sv = new_constant(start, s - start, "binary",
12284 sv, NULL, NULL, 0, NULL);
12285 }
12286 break;
12287
12288 /*
12289 handle decimal numbers.
12290 we're also sent here when we read a 0 as the first digit
12291 */
12292 case '1': case '2': case '3': case '4': case '5':
12293 case '6': case '7': case '8': case '9': case '.':
12294 decimal:
12295 d = PL_tokenbuf;
12296 e = PL_tokenbuf + sizeof PL_tokenbuf - 6; /* room for various punctuation */
12297 floatit = FALSE;
12298 if (hexfp) {
12299 floatit = TRUE;
12300 *d++ = '0';
12301 switch (shift) {
12302 case 4:
12303 *d++ = 'x';
12304 s = start + 2;
12305 break;
12306 case 3:
12307 if (new_octal) {
12308 *d++ = 'o';
12309 s = start + 2;
12310 break;
12311 }
12312 s = start + 1;
12313 break;
12314 case 1:
12315 *d++ = 'b';
12316 s = start + 2;
12317 break;
12318 default:
12319 NOT_REACHED; /* NOTREACHED */
12320 }
12321 }
12322
12323 /* read next group of digits and _ and copy into d */
12324 while (isDIGIT(*s)
12325 || *s == '_'
12326 || UNLIKELY(hexfp && isXDIGIT(*s)))
12327 {
12328 /* skip underscores, checking for misplaced ones
12329 if -w is on
12330 */
12331 if (*s == '_') {
12332 if (lastub && s == lastub + 1)
12333 WARN_ABOUT_UNDERSCORE();
12334 lastub = s++;
12335 }
12336 else {
12337 /* check for end of fixed-length buffer */
12338 if (d >= e)
12339 Perl_croak(aTHX_ "%s", number_too_long);
12340 /* if we're ok, copy the character */
12341 *d++ = *s++;
12342 }
12343 }
12344
12345 /* final misplaced underbar check */
12346 if (lastub && s == lastub + 1)
12347 WARN_ABOUT_UNDERSCORE();
12348
12349 /* read a decimal portion if there is one. avoid
12350 3..5 being interpreted as the number 3. followed
12351 by .5
12352 */
12353 if (*s == '.' && s[1] != '.') {
12354 floatit = TRUE;
12355 *d++ = *s++;
12356
12357 if (*s == '_') {
12358 WARN_ABOUT_UNDERSCORE();
12359 lastub = s;
12360 }
12361
12362 /* copy, ignoring underbars, until we run out of digits.
12363 */
12364 for (; isDIGIT(*s)
12365 || *s == '_'
12366 || UNLIKELY(hexfp && isXDIGIT(*s));
12367 s++)
12368 {
12369 /* fixed length buffer check */
12370 if (d >= e)
12371 Perl_croak(aTHX_ "%s", number_too_long);
12372 if (*s == '_') {
12373 if (lastub && s == lastub + 1)
12374 WARN_ABOUT_UNDERSCORE();
12375 lastub = s;
12376 }
12377 else
12378 *d++ = *s;
12379 }
12380 /* fractional part ending in underbar? */
12381 if (s[-1] == '_')
12382 WARN_ABOUT_UNDERSCORE();
12383 if (*s == '.' && isDIGIT(s[1])) {
12384 /* oops, it's really a v-string, but without the "v" */
12385 s = start;
12386 goto vstring;
12387 }
12388 }
12389
12390 /* read exponent part, if present */
12391 if ((isALPHA_FOLD_EQ(*s, 'e')
12392 || UNLIKELY(hexfp && isALPHA_FOLD_EQ(*s, 'p')))
12393 && memCHRs("+-0123456789_", s[1]))
12394 {
12395 int exp_digits = 0;
12396 const char *save_s = s;
12397 char * save_d = d;
12398
12399 /* regardless of whether user said 3E5 or 3e5, use lower 'e',
12400 ditto for p (hexfloats) */
12401 if ((isALPHA_FOLD_EQ(*s, 'e'))) {
12402 /* At least some Mach atof()s don't grok 'E' */
12403 *d++ = 'e';
12404 }
12405 else if (UNLIKELY(hexfp && (isALPHA_FOLD_EQ(*s, 'p')))) {
12406 *d++ = 'p';
12407 }
12408
12409 s++;
12410
12411
12412 /* stray preinitial _ */
12413 if (*s == '_') {
12414 WARN_ABOUT_UNDERSCORE();
12415 lastub = s++;
12416 }
12417
12418 /* allow positive or negative exponent */
12419 if (*s == '+' || *s == '-')
12420 *d++ = *s++;
12421
12422 /* stray initial _ */
12423 if (*s == '_') {
12424 WARN_ABOUT_UNDERSCORE();
12425 lastub = s++;
12426 }
12427
12428 /* read digits of exponent */
12429 while (isDIGIT(*s) || *s == '_') {
12430 if (isDIGIT(*s)) {
12431 ++exp_digits;
12432 if (d >= e)
12433 Perl_croak(aTHX_ "%s", number_too_long);
12434 *d++ = *s++;
12435 }
12436 else {
12437 if (((lastub && s == lastub + 1)
12438 || (!isDIGIT(s[1]) && s[1] != '_')))
12439 WARN_ABOUT_UNDERSCORE();
12440 lastub = s++;
12441 }
12442 }
12443
12444 if (!exp_digits) {
12445 /* no exponent digits, the [eEpP] could be for something else,
12446 * though in practice we don't get here for p since that's preparsed
12447 * earlier, and results in only the 0xX being consumed, so behave similarly
12448 * for decimal floats and consume only the D.DD, leaving the [eE] to the
12449 * next token.
12450 */
12451 s = save_s;
12452 d = save_d;
12453 }
12454 else {
12455 floatit = TRUE;
12456 }
12457 }
12458
12459
12460 /*
12461 We try to do an integer conversion first if no characters
12462 indicating "float" have been found.
12463 */
12464
12465 if (!floatit) {
12466 UV uv;
12467 const int flags = grok_number (PL_tokenbuf, d - PL_tokenbuf, &uv);
12468
12469 if (flags == IS_NUMBER_IN_UV) {
12470 if (uv <= IV_MAX)
12471 sv = newSViv(uv); /* Prefer IVs over UVs. */
12472 else
12473 sv = newSVuv(uv);
12474 } else if (flags == (IS_NUMBER_IN_UV | IS_NUMBER_NEG)) {
12475 if (uv <= (UV) IV_MIN)
12476 sv = newSViv(-(IV)uv);
12477 else
12478 floatit = TRUE;
12479 } else
12480 floatit = TRUE;
12481 }
12482 if (floatit) {
12483 /* terminate the string */
12484 *d = '\0';
12485 if (UNLIKELY(hexfp)) {
12486 # ifdef NV_MANT_DIG
12487 if (significant_bits > NV_MANT_DIG)
12488 Perl_ck_warner(aTHX_ packWARN(WARN_OVERFLOW),
12489 "Hexadecimal float: mantissa overflow");
12490 # endif
12491 #ifdef HEXFP_UQUAD
12492 nv = hexfp_uquad * hexfp_mult;
12493 #else /* HEXFP_NV */
12494 nv = hexfp_nv * hexfp_mult;
12495 #endif
12496 } else {
12497 nv = Atof(PL_tokenbuf);
12498 }
12499 sv = newSVnv(nv);
12500 }
12501
12502 if ( floatit
12503 ? (PL_hints & HINT_NEW_FLOAT) : (PL_hints & HINT_NEW_INTEGER) ) {
12504 const char *const key = floatit ? "float" : "integer";
12505 const STRLEN keylen = floatit ? 5 : 7;
12506 sv = S_new_constant(aTHX_ PL_tokenbuf, d - PL_tokenbuf,
12507 key, keylen, sv, NULL, NULL, 0, NULL);
12508 }
12509 break;
12510
12511 /* if it starts with a v, it could be a v-string */
12512 case 'v':
12513 vstring:
12514 sv = newSV(5); /* preallocate storage space */
12515 ENTER_with_name("scan_vstring");
12516 SAVEFREESV(sv);
12517 s = scan_vstring(s, PL_bufend, sv);
12518 SvREFCNT_inc_simple_void_NN(sv);
12519 LEAVE_with_name("scan_vstring");
12520 break;
12521 }
12522
12523 /* make the op for the constant and return */
12524
12525 if (sv)
12526 lvalp->opval = newSVOP(OP_CONST, 0, sv);
12527 else
12528 lvalp->opval = NULL;
12529
12530 return (char *)s;
12531 }
12532
12533 STATIC char *
S_scan_formline(pTHX_ char * s)12534 S_scan_formline(pTHX_ char *s)
12535 {
12536 SV * const stuff = newSVpvs("");
12537 bool needargs = FALSE;
12538 bool eofmt = FALSE;
12539
12540 PERL_ARGS_ASSERT_SCAN_FORMLINE;
12541
12542 while (!needargs) {
12543 char *eol;
12544 if (*s == '.') {
12545 char *t = s+1;
12546 #ifdef PERL_STRICT_CR
12547 while (SPACE_OR_TAB(*t))
12548 t++;
12549 #else
12550 while (SPACE_OR_TAB(*t) || *t == '\r')
12551 t++;
12552 #endif
12553 if (*t == '\n' || t == PL_bufend) {
12554 eofmt = TRUE;
12555 break;
12556 }
12557 }
12558 eol = (char *) memchr(s,'\n',PL_bufend-s);
12559 if (! eol) {
12560 eol = PL_bufend;
12561 }
12562 else {
12563 eol++;
12564 }
12565 if (*s != '#') {
12566 char *t;
12567 for (t = s; t < eol; t++) {
12568 if (*t == '~' && t[1] == '~' && SvCUR(stuff)) {
12569 needargs = FALSE;
12570 goto enough; /* ~~ must be first line in formline */
12571 }
12572 if (*t == '@' || *t == '^')
12573 needargs = TRUE;
12574 }
12575 if (eol > s) {
12576 sv_catpvn(stuff, s, eol-s);
12577 #ifndef PERL_STRICT_CR
12578 if (eol-s > 1 && eol[-2] == '\r' && eol[-1] == '\n') {
12579 char *end = SvPVX(stuff) + SvCUR(stuff);
12580 end[-2] = '\n';
12581 end[-1] = '\0';
12582 SvCUR_set(stuff, SvCUR(stuff) - 1);
12583 }
12584 #endif
12585 }
12586 else
12587 break;
12588 }
12589 s = (char*)eol;
12590 if ((PL_rsfp || PL_parser->filtered)
12591 && PL_parser->form_lex_state == LEX_NORMAL) {
12592 bool got_some;
12593 PL_bufptr = PL_bufend;
12594 COPLINE_INC_WITH_HERELINES;
12595 got_some = lex_next_chunk(0);
12596 CopLINE_dec(PL_curcop);
12597 s = PL_bufptr;
12598 if (!got_some)
12599 break;
12600 }
12601 incline(s, PL_bufend);
12602 }
12603 enough:
12604 if (!SvCUR(stuff) || needargs)
12605 PL_lex_state = PL_parser->form_lex_state;
12606 if (SvCUR(stuff)) {
12607 PL_expect = XSTATE;
12608 if (needargs) {
12609 const char *s2 = s;
12610 while (isSPACE(*s2) && *s2 != '\n')
12611 s2++;
12612 if (*s2 == '{') {
12613 PL_expect = XTERMBLOCK;
12614 NEXTVAL_NEXTTOKE.ival = 0;
12615 force_next(KW_DO);
12616 }
12617 NEXTVAL_NEXTTOKE.ival = 0;
12618 force_next(FORMLBRACK);
12619 }
12620 if (!IN_BYTES) {
12621 if (UTF && is_utf8_string((U8*)SvPVX_const(stuff), SvCUR(stuff)))
12622 SvUTF8_on(stuff);
12623 }
12624 NEXTVAL_NEXTTOKE.opval = newSVOP(OP_CONST, 0, stuff);
12625 force_next(THING);
12626 }
12627 else {
12628 SvREFCNT_dec(stuff);
12629 if (eofmt)
12630 PL_lex_formbrack = 0;
12631 }
12632 return s;
12633 }
12634
12635 /*
12636 =for apidoc start_subparse
12637
12638 Set things up for parsing a subroutine.
12639
12640 If C<is_format> is non-zero, the input is to be considered a format sub
12641 (a specialised sub used to implement perl's C<format> feature); else a
12642 normal C<sub>.
12643
12644 C<flags> are added to the flags for C<PL_compcv>. C<flags> may include the
12645 C<CVf_IsMETHOD> bit, which causes the new subroutine to be a method.
12646
12647 This returns the value of C<PL_savestack_ix> that was in effect upon entry to
12648 the function;
12649
12650 =cut
12651 */
12652
12653 I32
Perl_start_subparse(pTHX_ I32 is_format,U32 flags)12654 Perl_start_subparse(pTHX_ I32 is_format, U32 flags)
12655 {
12656 const I32 oldsavestack_ix = PL_savestack_ix;
12657 CV* const outsidecv = PL_compcv;
12658 bool is_method = flags & CVf_IsMETHOD;
12659
12660 if (is_method)
12661 croak_kw_unless_class("method");
12662
12663 SAVEI32(PL_subline);
12664 save_item(PL_subname);
12665 SAVESPTR(PL_compcv);
12666
12667 PL_compcv = MUTABLE_CV(newSV_type(is_format ? SVt_PVFM : SVt_PVCV));
12668 CvFLAGS(PL_compcv) |= flags;
12669
12670 PL_subline = CopLINE(PL_curcop);
12671 CvPADLIST(PL_compcv) = pad_new(padnew_SAVE|padnew_SAVESUB);
12672 CvOUTSIDE(PL_compcv) = MUTABLE_CV(SvREFCNT_inc_simple(outsidecv));
12673 CvOUTSIDE_SEQ(PL_compcv) = PL_cop_seqmax;
12674 if (outsidecv && CvPADLIST(outsidecv))
12675 CvPADLIST(PL_compcv)->xpadl_outid = CvPADLIST(outsidecv)->xpadl_id;
12676 if (is_method)
12677 class_prepare_method_parse(PL_compcv);
12678
12679 return oldsavestack_ix;
12680 }
12681
12682 /* If o represents a builtin attribute, apply it to cv and returns true.
12683 * Otherwise does nothing and returns false
12684 */
12685
12686 STATIC bool
S_apply_builtin_cv_attribute(pTHX_ CV * cv,OP * o)12687 S_apply_builtin_cv_attribute(pTHX_ CV *cv, OP *o)
12688 {
12689 assert(o->op_type == OP_CONST);
12690 SV *sv = cSVOPo_sv;
12691 STRLEN len = SvCUR(sv);
12692
12693 /* NOTE: any CV attrs applied here need to be part of
12694 the CVf_BUILTIN_ATTRS define in cv.h! */
12695
12696 if(memEQs(SvPVX(sv), len, "lvalue"))
12697 CvLVALUE_on(cv);
12698 else if(memEQs(SvPVX(sv), len, "method"))
12699 CvNOWARN_AMBIGUOUS_on(cv);
12700 else if(memEQs(SvPVX(sv), len, "const")) {
12701 Perl_ck_warner_d(aTHX_
12702 packWARN(WARN_EXPERIMENTAL__CONST_ATTR),
12703 ":const is experimental"
12704 );
12705 CvANONCONST_on(cv);
12706 if (!CvANON(cv))
12707 yyerror(":const is not permitted on named subroutines");
12708 }
12709 else
12710 return false;
12711
12712 return true;
12713 }
12714
12715 /*
12716 =for apidoc apply_builtin_cv_attributes
12717
12718 Given an OP_LIST containing attribute definitions, filter it for known builtin
12719 attributes to apply to the cv, returning a possibly-smaller list containing
12720 just the remaining ones.
12721
12722 =cut
12723 */
12724
12725 OP *
Perl_apply_builtin_cv_attributes(pTHX_ CV * cv,OP * attrlist)12726 Perl_apply_builtin_cv_attributes(pTHX_ CV *cv, OP *attrlist)
12727 {
12728 PERL_ARGS_ASSERT_APPLY_BUILTIN_CV_ATTRIBUTES;
12729
12730 if(!attrlist)
12731 return attrlist;
12732
12733 if(attrlist->op_type != OP_LIST) {
12734 /* Not in fact a list but just a single attribute */
12735 if(S_apply_builtin_cv_attribute(aTHX_ cv, attrlist)) {
12736 op_free(attrlist);
12737 return NULL;
12738 }
12739
12740 return attrlist;
12741 }
12742
12743 OP *prev = cLISTOPx(attrlist)->op_first;
12744 assert(prev->op_type == OP_PUSHMARK);
12745 OP *o = OpSIBLING(prev);
12746
12747 OP *next;
12748 for(; o; o = next) {
12749 next = OpSIBLING(o);
12750
12751 if(S_apply_builtin_cv_attribute(aTHX_ cv, o)) {
12752 op_sibling_splice(attrlist, prev, 1, NULL);
12753 op_free(o);
12754 }
12755 else {
12756 prev = o;
12757 }
12758 }
12759
12760 if(OpHAS_SIBLING(cLISTOPx(attrlist)->op_first))
12761 return attrlist;
12762
12763 /* The list is now entirely empty, we might as well discard it */
12764 op_free(attrlist);
12765 return NULL;
12766 }
12767
12768
12769 /* Do extra initialisation of a CV (typically one just created by
12770 * start_subparse()) if that CV is for a named sub
12771 */
12772
12773 void
Perl_init_named_cv(pTHX_ CV * cv,OP * nameop)12774 Perl_init_named_cv(pTHX_ CV *cv, OP *nameop)
12775 {
12776 PERL_ARGS_ASSERT_INIT_NAMED_CV;
12777
12778 if (nameop->op_type == OP_CONST) {
12779 const char *const name = SvPV_nolen_const(((SVOP*)nameop)->op_sv);
12780 if ( strEQ(name, "BEGIN")
12781 || strEQ(name, "END")
12782 || strEQ(name, "INIT")
12783 || strEQ(name, "CHECK")
12784 || strEQ(name, "UNITCHECK")
12785 )
12786 CvSPECIAL_on(cv);
12787 }
12788 else
12789 /* State subs inside anonymous subs need to be
12790 clonable themselves. */
12791 if ( CvANON(CvOUTSIDE(cv))
12792 || CvCLONE(CvOUTSIDE(cv))
12793 || !PadnameIsSTATE(PadlistNAMESARRAY(CvPADLIST(
12794 CvOUTSIDE(cv)
12795 ))[nameop->op_targ])
12796 )
12797 CvCLONE_on(cv);
12798 }
12799
12800
12801 static int
S_yywarn(pTHX_ const char * const s,U32 flags)12802 S_yywarn(pTHX_ const char *const s, U32 flags)
12803 {
12804 PERL_ARGS_ASSERT_YYWARN;
12805
12806 PL_in_eval |= EVAL_WARNONLY;
12807 yyerror_pv(s, flags);
12808 return 0;
12809 }
12810
12811 void
Perl_abort_execution(pTHX_ SV * msg_sv,const char * const name)12812 Perl_abort_execution(pTHX_ SV* msg_sv, const char * const name)
12813 {
12814 PERL_ARGS_ASSERT_ABORT_EXECUTION;
12815
12816 if (msg_sv) {
12817 if (PL_minus_c)
12818 Perl_croak(aTHX_ "%" SVf "%s had compilation errors.\n", SVfARG(msg_sv), name);
12819 else {
12820 Perl_croak(aTHX_
12821 "%" SVf "Execution of %s aborted due to compilation errors.\n", SVfARG(msg_sv), name);
12822 }
12823 } else {
12824 if (PL_minus_c)
12825 Perl_croak(aTHX_ "%s had compilation errors.\n", name);
12826 else {
12827 Perl_croak(aTHX_
12828 "Execution of %s aborted due to compilation errors.\n", name);
12829 }
12830 }
12831
12832 NOT_REACHED; /* NOTREACHED */
12833 }
12834
12835 void
Perl_yyquit(pTHX)12836 Perl_yyquit(pTHX)
12837 {
12838 /* Called, after at least one error has been found, to abort the parse now,
12839 * instead of trying to forge ahead */
12840
12841 yyerror_pvn(NULL, 0, 0);
12842 }
12843
12844 int
Perl_yyerror(pTHX_ const char * const s)12845 Perl_yyerror(pTHX_ const char *const s)
12846 {
12847 PERL_ARGS_ASSERT_YYERROR;
12848 int r = yyerror_pvn(s, strlen(s), 0);
12849 return r;
12850 }
12851
12852 int
Perl_yyerror_pv(pTHX_ const char * const s,U32 flags)12853 Perl_yyerror_pv(pTHX_ const char *const s, U32 flags)
12854 {
12855 PERL_ARGS_ASSERT_YYERROR_PV;
12856 int r = yyerror_pvn(s, strlen(s), flags);
12857 return r;
12858 }
12859
12860 int
Perl_yyerror_pvn(pTHX_ const char * const s,STRLEN len,U32 flags)12861 Perl_yyerror_pvn(pTHX_ const char *const s, STRLEN len, U32 flags)
12862 {
12863 const char *context = NULL;
12864 int contlen = -1;
12865 SV *msg;
12866 SV * const where_sv = newSVpvs_flags("", SVs_TEMP);
12867 int yychar = PL_parser->yychar;
12868
12869 /* Output error message 's' with length 'len'. 'flags' are SV flags that
12870 * apply. If the number of errors found is large enough, it abandons
12871 * parsing. If 's' is NULL, there is no message, and it abandons
12872 * processing unconditionally */
12873
12874 if (s != NULL) {
12875 if (!yychar || (yychar == PERLY_SEMICOLON && !PL_rsfp))
12876 sv_catpvs(where_sv, "at EOF");
12877 else if ( PL_oldoldbufptr
12878 && PL_bufptr > PL_oldoldbufptr
12879 && PL_bufptr - PL_oldoldbufptr < 200
12880 && PL_oldoldbufptr != PL_oldbufptr
12881 && PL_oldbufptr != PL_bufptr)
12882 {
12883 while (isSPACE(*PL_oldoldbufptr))
12884 PL_oldoldbufptr++;
12885 context = PL_oldoldbufptr;
12886 contlen = PL_bufptr - PL_oldoldbufptr;
12887 }
12888 else if ( PL_oldbufptr
12889 && PL_bufptr > PL_oldbufptr
12890 && PL_bufptr - PL_oldbufptr < 200
12891 && PL_oldbufptr != PL_bufptr)
12892 {
12893 while (isSPACE(*PL_oldbufptr))
12894 PL_oldbufptr++;
12895 context = PL_oldbufptr;
12896 contlen = PL_bufptr - PL_oldbufptr;
12897 }
12898 else if (yychar > 255)
12899 sv_catpvs(where_sv, "next token ???");
12900 else if (yychar == YYEMPTY) {
12901 if (PL_lex_state == LEX_NORMAL)
12902 sv_catpvs(where_sv, "at end of line");
12903 else if (PL_lex_inpat)
12904 sv_catpvs(where_sv, "within pattern");
12905 else
12906 sv_catpvs(where_sv, "within string");
12907 }
12908 else {
12909 sv_catpvs(where_sv, "next char ");
12910 if (yychar < 32)
12911 Perl_sv_catpvf(aTHX_ where_sv, "^%c", toCTRL(yychar));
12912 else if (isPRINT_LC(yychar)) {
12913 const char string = yychar;
12914 sv_catpvn(where_sv, &string, 1);
12915 }
12916 else
12917 Perl_sv_catpvf(aTHX_ where_sv, "\\%03o", yychar & 255);
12918 }
12919 msg = newSVpvn_flags(s, len, (flags & SVf_UTF8) | SVs_TEMP);
12920 Perl_sv_catpvf(aTHX_ msg, " at %s line %" LINE_Tf ", ",
12921 OutCopFILE(PL_curcop),
12922 (PL_parser->preambling == NOLINE
12923 ? CopLINE(PL_curcop)
12924 : PL_parser->preambling));
12925 if (context)
12926 Perl_sv_catpvf(aTHX_ msg, "near \"%" UTF8f "\"\n",
12927 UTF8fARG(UTF, contlen, context));
12928 else
12929 Perl_sv_catpvf(aTHX_ msg, "%" SVf "\n", SVfARG(where_sv));
12930 if ( PL_multi_start < PL_multi_end
12931 && (U32)(CopLINE(PL_curcop) - PL_multi_end) <= 1)
12932 {
12933 Perl_sv_catpvf(aTHX_ msg,
12934 " (Might be a runaway multi-line %c%c string starting on"
12935 " line %" LINE_Tf ")\n",
12936 (int)PL_multi_open,(int)PL_multi_close,(line_t)PL_multi_start);
12937 PL_multi_end = 0;
12938 }
12939 if (PL_in_eval & EVAL_WARNONLY) {
12940 PL_in_eval &= ~EVAL_WARNONLY;
12941 Perl_ck_warner_d(aTHX_ packWARN(WARN_SYNTAX), "%" SVf, SVfARG(msg));
12942 }
12943 else {
12944 qerror(msg);
12945 }
12946 }
12947 /* if there was no message then this is a yyquit(), which is actualy handled
12948 * by qerror() with a NULL argument */
12949 if (s == NULL)
12950 qerror(NULL);
12951
12952 PL_in_my = 0;
12953 PL_in_my_stash = NULL;
12954 return 0;
12955 }
12956
12957 STATIC char*
S_swallow_bom(pTHX_ U8 * s)12958 S_swallow_bom(pTHX_ U8 *s)
12959 {
12960 const STRLEN slen = SvCUR(PL_linestr);
12961
12962 PERL_ARGS_ASSERT_SWALLOW_BOM;
12963
12964 switch (s[0]) {
12965 case 0xFF:
12966 if (s[1] == 0xFE) {
12967 /* UTF-16 little-endian? (or UTF-32LE?) */
12968 if (s[2] == 0 && s[3] == 0) /* UTF-32 little-endian */
12969 /* diag_listed_as: Unsupported script encoding %s */
12970 Perl_croak(aTHX_ "Unsupported script encoding UTF-32LE");
12971 #ifndef PERL_NO_UTF16_FILTER
12972 #ifdef DEBUGGING
12973 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (BOM)\n");
12974 #endif
12975 s += 2;
12976 if (PL_bufend > (char*)s) {
12977 s = add_utf16_textfilter(s, TRUE);
12978 }
12979 #else
12980 /* diag_listed_as: Unsupported script encoding %s */
12981 Perl_croak(aTHX_ "Unsupported script encoding UTF-16LE");
12982 #endif
12983 }
12984 break;
12985 case 0xFE:
12986 if (s[1] == 0xFF) { /* UTF-16 big-endian? */
12987 #ifndef PERL_NO_UTF16_FILTER
12988 #ifdef DEBUGGING
12989 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (BOM)\n");
12990 #endif
12991 s += 2;
12992 if (PL_bufend > (char *)s) {
12993 s = add_utf16_textfilter(s, FALSE);
12994 }
12995 #else
12996 /* diag_listed_as: Unsupported script encoding %s */
12997 Perl_croak(aTHX_ "Unsupported script encoding UTF-16BE");
12998 #endif
12999 }
13000 break;
13001 case BOM_UTF8_FIRST_BYTE: {
13002 if (memBEGINs(s+1, slen - 1, BOM_UTF8_TAIL)) {
13003 #ifdef DEBUGGING
13004 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-8 script encoding (BOM)\n");
13005 #endif
13006 s += sizeof(BOM_UTF8) - 1; /* UTF-8 */
13007 }
13008 break;
13009 }
13010 case 0:
13011 if (slen > 3) {
13012 if (s[1] == 0) {
13013 if (s[2] == 0xFE && s[3] == 0xFF) {
13014 /* UTF-32 big-endian */
13015 /* diag_listed_as: Unsupported script encoding %s */
13016 Perl_croak(aTHX_ "Unsupported script encoding UTF-32BE");
13017 }
13018 }
13019 else if (s[2] == 0 && s[3] != 0) {
13020 /* Leading bytes
13021 * 00 xx 00 xx
13022 * are a good indicator of UTF-16BE. */
13023 #ifndef PERL_NO_UTF16_FILTER
13024 #ifdef DEBUGGING
13025 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (no BOM)\n");
13026 #endif
13027 s = add_utf16_textfilter(s, FALSE);
13028 #else
13029 /* diag_listed_as: Unsupported script encoding %s */
13030 Perl_croak(aTHX_ "Unsupported script encoding UTF-16BE");
13031 #endif
13032 }
13033 }
13034 break;
13035
13036 default:
13037 if (slen > 3 && s[1] == 0 && s[2] != 0 && s[3] == 0) {
13038 /* Leading bytes
13039 * xx 00 xx 00
13040 * are a good indicator of UTF-16LE. */
13041 #ifndef PERL_NO_UTF16_FILTER
13042 #ifdef DEBUGGING
13043 if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (no BOM)\n");
13044 #endif
13045 s = add_utf16_textfilter(s, TRUE);
13046 #else
13047 /* diag_listed_as: Unsupported script encoding %s */
13048 Perl_croak(aTHX_ "Unsupported script encoding UTF-16LE");
13049 #endif
13050 }
13051 }
13052 return (char*)s;
13053 }
13054
13055
13056 #ifndef PERL_NO_UTF16_FILTER
13057 static I32
S_utf16_textfilter(pTHX_ int idx,SV * sv,int maxlen)13058 S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen)
13059 {
13060 SV *const filter = FILTER_DATA(idx);
13061 /* We re-use this each time round, throwing the contents away before we
13062 return. */
13063 SV *const utf16_buffer = MUTABLE_SV(IoTOP_GV(filter));
13064 SV *const utf8_buffer = filter;
13065 IV status = IoPAGE(filter);
13066 const bool reverse = cBOOL(IoLINES(filter));
13067 I32 retval;
13068
13069 PERL_ARGS_ASSERT_UTF16_TEXTFILTER;
13070
13071 /* As we're automatically added, at the lowest level, and hence only called
13072 from this file, we can be sure that we're not called in block mode. Hence
13073 don't bother writing code to deal with block mode. */
13074 if (maxlen) {
13075 Perl_croak(aTHX_ "panic: utf16_textfilter called in block mode (for %d characters)", maxlen);
13076 }
13077 if (status < 0) {
13078 Perl_croak(aTHX_ "panic: utf16_textfilter called after error (status=%" IVdf ")", status);
13079 }
13080 DEBUG_P(PerlIO_printf(Perl_debug_log,
13081 "utf16_textfilter(%p,%ce): idx=%d maxlen=%d status=%" IVdf " utf16=%" UVuf " utf8=%" UVuf "\n",
13082 FPTR2DPTR(void *, S_utf16_textfilter),
13083 reverse ? 'l' : 'b', idx, maxlen, status,
13084 (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer)));
13085
13086 while (1) {
13087 STRLEN chars;
13088 STRLEN have;
13089 Size_t newlen;
13090 U8 *end;
13091 /* First, look in our buffer of existing UTF-8 data: */
13092 char *nl = (char *)memchr(SvPVX(utf8_buffer), '\n', SvCUR(utf8_buffer));
13093
13094 if (nl) {
13095 ++nl;
13096 } else if (status == 0) {
13097 /* EOF */
13098 IoPAGE(filter) = 0;
13099 nl = SvEND(utf8_buffer);
13100 }
13101 if (nl) {
13102 STRLEN got = nl - SvPVX(utf8_buffer);
13103 /* Did we have anything to append? */
13104 retval = got != 0;
13105 sv_catpvn(sv, SvPVX(utf8_buffer), got);
13106 /* Everything else in this code works just fine if SVp_POK isn't
13107 set. This, however, needs it, and we need it to work, else
13108 we loop infinitely because the buffer is never consumed. */
13109 sv_chop(utf8_buffer, nl);
13110 break;
13111 }
13112
13113 /* OK, not a complete line there, so need to read some more UTF-16.
13114 Read an extra octect if the buffer currently has an odd number. */
13115 while (1) {
13116 if (status <= 0)
13117 break;
13118 if (SvCUR(utf16_buffer) >= 2) {
13119 /* Location of the high octet of the last complete code point.
13120 Gosh, UTF-16 is a pain. All the benefits of variable length,
13121 *coupled* with all the benefits of partial reads and
13122 endianness. */
13123 const U8 *const last_hi = (U8*)SvPVX(utf16_buffer)
13124 + ((SvCUR(utf16_buffer) & ~1) - (reverse ? 1 : 2));
13125
13126 if (*last_hi < 0xd8 || *last_hi > 0xdb) {
13127 break;
13128 }
13129
13130 /* We have the first half of a surrogate. Read more. */
13131 DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter partial surrogate detected at %p\n", last_hi));
13132 }
13133
13134 status = FILTER_READ(idx + 1, utf16_buffer,
13135 160 + (SvCUR(utf16_buffer) & 1));
13136 DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%" IVdf " SvCUR(sv)=%" UVuf "\n", status, (UV)SvCUR(utf16_buffer)));
13137 DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);});
13138 if (status < 0) {
13139 /* Error */
13140 IoPAGE(filter) = status;
13141 return status;
13142 }
13143 }
13144
13145 /* 'chars' isn't quite the right name, as code points above 0xFFFF
13146 * require 4 bytes per char */
13147 chars = SvCUR(utf16_buffer) >> 1;
13148 have = SvCUR(utf8_buffer);
13149
13150 /* Assume the worst case size as noted by the functions: twice the
13151 * number of input bytes */
13152 SvGROW(utf8_buffer, have + chars * 4 + 1);
13153
13154 if (reverse) {
13155 end = utf16_to_utf8_reversed((U8*)SvPVX(utf16_buffer),
13156 (U8*)SvPVX_const(utf8_buffer) + have,
13157 chars * 2, &newlen);
13158 } else {
13159 end = utf16_to_utf8((U8*)SvPVX(utf16_buffer),
13160 (U8*)SvPVX_const(utf8_buffer) + have,
13161 chars * 2, &newlen);
13162 }
13163 SvCUR_set(utf8_buffer, have + newlen);
13164 *end = '\0';
13165
13166 /* No need to keep this SV "well-formed" with a '\0' after the end, as
13167 it's private to us, and utf16_to_utf8{,reversed} take a
13168 (pointer,length) pair, rather than a NUL-terminated string. */
13169 if(SvCUR(utf16_buffer) & 1) {
13170 *SvPVX(utf16_buffer) = SvEND(utf16_buffer)[-1];
13171 SvCUR_set(utf16_buffer, 1);
13172 } else {
13173 SvCUR_set(utf16_buffer, 0);
13174 }
13175 }
13176 DEBUG_P(PerlIO_printf(Perl_debug_log,
13177 "utf16_textfilter: returns, status=%" IVdf " utf16=%" UVuf " utf8=%" UVuf "\n",
13178 status,
13179 (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer)));
13180 DEBUG_P({ sv_dump(utf8_buffer); sv_dump(sv);});
13181 return retval;
13182 }
13183
13184 static U8 *
S_add_utf16_textfilter(pTHX_ U8 * const s,bool reversed)13185 S_add_utf16_textfilter(pTHX_ U8 *const s, bool reversed)
13186 {
13187 SV *filter = filter_add(S_utf16_textfilter, NULL);
13188
13189 PERL_ARGS_ASSERT_ADD_UTF16_TEXTFILTER;
13190
13191 IoTOP_GV(filter) = MUTABLE_GV(newSVpvn((char *)s, PL_bufend - (char*)s));
13192 SvPVCLEAR(filter);
13193 IoLINES(filter) = reversed;
13194 IoPAGE(filter) = 1; /* Not EOF */
13195
13196 /* Sadly, we have to return a valid pointer, come what may, so we have to
13197 ignore any error return from this. */
13198 SvCUR_set(PL_linestr, 0);
13199 if (FILTER_READ(0, PL_linestr, 0)) {
13200 SvUTF8_on(PL_linestr);
13201 } else {
13202 SvUTF8_on(PL_linestr);
13203 }
13204 PL_bufend = SvEND(PL_linestr);
13205 return (U8*)SvPVX(PL_linestr);
13206 }
13207 #endif
13208
13209 /*
13210 =for apidoc scan_vstring
13211
13212 Returns a pointer to the next character after the parsed
13213 vstring, as well as updating the passed in sv.
13214
13215 Function must be called like
13216
13217 sv = sv_2mortal(newSV(5));
13218 s = scan_vstring(s,e,sv);
13219
13220 where s and e are the start and end of the string.
13221 The sv should already be large enough to store the vstring
13222 passed in, for performance reasons.
13223
13224 This function may croak if fatal warnings are enabled in the
13225 calling scope, hence the sv_2mortal in the example (to prevent
13226 a leak). Make sure to do SvREFCNT_inc afterwards if you use
13227 sv_2mortal.
13228
13229 =cut
13230 */
13231
13232 char *
Perl_scan_vstring(pTHX_ const char * s,const char * const e,SV * sv)13233 Perl_scan_vstring(pTHX_ const char *s, const char *const e, SV *sv)
13234 {
13235 const char *pos = s;
13236 const char *start = s;
13237
13238 PERL_ARGS_ASSERT_SCAN_VSTRING;
13239
13240 if (*pos == 'v') pos++; /* get past 'v' */
13241 while (pos < e && (isDIGIT(*pos) || *pos == '_'))
13242 pos++;
13243 if ( *pos != '.') {
13244 /* this may not be a v-string if followed by => */
13245 const char *next = pos;
13246 while (next < e && isSPACE(*next))
13247 ++next;
13248 if ((e - next) >= 2 && *next == '=' && next[1] == '>' ) {
13249 /* return string not v-string */
13250 sv_setpvn(sv,(char *)s,pos-s);
13251 return (char *)pos;
13252 }
13253 }
13254
13255 if (!isALPHA(*pos)) {
13256 U8 tmpbuf[UTF8_MAXBYTES+1];
13257
13258 if (*s == 'v')
13259 s++; /* get past 'v' */
13260
13261 SvPVCLEAR(sv);
13262
13263 for (;;) {
13264 /* this is atoi() that tolerates underscores */
13265 U8 *tmpend;
13266 UV rev = 0;
13267 const char *end = pos;
13268 UV mult = 1;
13269 while (--end >= s) {
13270 if (*end != '_') {
13271 const UV orev = rev;
13272 rev += (*end - '0') * mult;
13273 mult *= 10;
13274 if (orev > rev)
13275 /* diag_listed_as: Integer overflow in %s number */
13276 Perl_ck_warner_d(aTHX_ packWARN(WARN_OVERFLOW),
13277 "Integer overflow in decimal number");
13278 }
13279 }
13280
13281 /* Append native character for the rev point */
13282 tmpend = uvchr_to_utf8(tmpbuf, rev);
13283 sv_catpvn(sv, (const char*)tmpbuf, tmpend - tmpbuf);
13284 if (!UVCHR_IS_INVARIANT(rev))
13285 SvUTF8_on(sv);
13286 if (pos + 1 < e && *pos == '.' && isDIGIT(pos[1]))
13287 s = ++pos;
13288 else {
13289 s = pos;
13290 break;
13291 }
13292 while (pos < e && (isDIGIT(*pos) || *pos == '_'))
13293 pos++;
13294 }
13295 SvPOK_on(sv);
13296 sv_magic(sv,NULL,PERL_MAGIC_vstring,(const char*)start, pos-start);
13297 SvRMAGICAL_on(sv);
13298 }
13299 return (char *)s;
13300 }
13301
13302 int
Perl_keyword_plugin_standard(pTHX_ char * keyword_ptr,STRLEN keyword_len,OP ** op_ptr)13303 Perl_keyword_plugin_standard(pTHX_
13304 char *keyword_ptr, STRLEN keyword_len, OP **op_ptr)
13305 {
13306 PERL_ARGS_ASSERT_KEYWORD_PLUGIN_STANDARD;
13307 PERL_UNUSED_CONTEXT;
13308 PERL_UNUSED_ARG(keyword_ptr);
13309 PERL_UNUSED_ARG(keyword_len);
13310 PERL_UNUSED_ARG(op_ptr);
13311 return KEYWORD_PLUGIN_DECLINE;
13312 }
13313
13314 STRLEN
Perl_infix_plugin_standard(pTHX_ char * operator_ptr,STRLEN operator_len,struct Perl_custom_infix ** def)13315 Perl_infix_plugin_standard(pTHX_
13316 char *operator_ptr, STRLEN operator_len, struct Perl_custom_infix **def)
13317 {
13318 PERL_ARGS_ASSERT_INFIX_PLUGIN_STANDARD;
13319 PERL_UNUSED_CONTEXT;
13320 PERL_UNUSED_ARG(operator_ptr);
13321 PERL_UNUSED_ARG(operator_len);
13322 PERL_UNUSED_ARG(def);
13323 return 0;
13324 }
13325
13326 /*
13327 =for apidoc_section $lexer
13328 =for apidoc wrap_keyword_plugin
13329
13330 Puts a C function into the chain of keyword plugins. This is the
13331 preferred way to manipulate the L</PL_keyword_plugin> variable.
13332 C<new_plugin> is a pointer to the C function that is to be added to the
13333 keyword plugin chain, and C<old_plugin_p> points to the storage location
13334 where a pointer to the next function in the chain will be stored. The
13335 value of C<new_plugin> is written into the L</PL_keyword_plugin> variable,
13336 while the value previously stored there is written to C<*old_plugin_p>.
13337
13338 L</PL_keyword_plugin> is global to an entire process, and a module wishing
13339 to hook keyword parsing may find itself invoked more than once per
13340 process, typically in different threads. To handle that situation, this
13341 function is idempotent. The location C<*old_plugin_p> must initially
13342 (once per process) contain a null pointer. A C variable of static
13343 duration (declared at file scope, typically also marked C<static> to give
13344 it internal linkage) will be implicitly initialised appropriately, if it
13345 does not have an explicit initialiser. This function will only actually
13346 modify the plugin chain if it finds C<*old_plugin_p> to be null. This
13347 function is also thread safe on the small scale. It uses appropriate
13348 locking to avoid race conditions in accessing L</PL_keyword_plugin>.
13349
13350 When this function is called, the function referenced by C<new_plugin>
13351 must be ready to be called, except for C<*old_plugin_p> being unfilled.
13352 In a threading situation, C<new_plugin> may be called immediately, even
13353 before this function has returned. C<*old_plugin_p> will always be
13354 appropriately set before C<new_plugin> is called. If C<new_plugin>
13355 decides not to do anything special with the identifier that it is given
13356 (which is the usual case for most calls to a keyword plugin), it must
13357 chain the plugin function referenced by C<*old_plugin_p>.
13358
13359 Taken all together, XS code to install a keyword plugin should typically
13360 look something like this:
13361
13362 static Perl_keyword_plugin_t next_keyword_plugin;
13363 static OP *my_keyword_plugin(pTHX_
13364 char *keyword_ptr, STRLEN keyword_len, OP **op_ptr)
13365 {
13366 if (memEQs(keyword_ptr, keyword_len,
13367 "my_new_keyword")) {
13368 ...
13369 } else {
13370 return next_keyword_plugin(aTHX_
13371 keyword_ptr, keyword_len, op_ptr);
13372 }
13373 }
13374 BOOT:
13375 wrap_keyword_plugin(my_keyword_plugin,
13376 &next_keyword_plugin);
13377
13378 Direct access to L</PL_keyword_plugin> should be avoided.
13379
13380 =cut
13381 */
13382
13383 void
Perl_wrap_keyword_plugin(pTHX_ Perl_keyword_plugin_t new_plugin,Perl_keyword_plugin_t * old_plugin_p)13384 Perl_wrap_keyword_plugin(pTHX_
13385 Perl_keyword_plugin_t new_plugin, Perl_keyword_plugin_t *old_plugin_p)
13386 {
13387
13388 PERL_UNUSED_CONTEXT;
13389 PERL_ARGS_ASSERT_WRAP_KEYWORD_PLUGIN;
13390 if (*old_plugin_p) return;
13391 KEYWORD_PLUGIN_MUTEX_LOCK;
13392 if (!*old_plugin_p) {
13393 *old_plugin_p = PL_keyword_plugin;
13394 PL_keyword_plugin = new_plugin;
13395 }
13396 KEYWORD_PLUGIN_MUTEX_UNLOCK;
13397 }
13398
13399 /*
13400 =for apidoc wrap_infix_plugin
13401
13402 B<NOTE:> This API exists entirely for the purpose of making the CPAN module
13403 C<XS::Parse::Infix> work. It is not expected that additional modules will make
13404 use of it; rather, that they should use C<XS::Parse::Infix> to provide parsing
13405 of new infix operators.
13406
13407 Puts a C function into the chain of infix plugins. This is the preferred
13408 way to manipulate the L</PL_infix_plugin> variable. C<new_plugin> is a
13409 pointer to the C function that is to be added to the infix plugin chain, and
13410 C<old_plugin_p> points to a storage location where a pointer to the next
13411 function in the chain will be stored. The value of C<new_plugin> is written
13412 into the L</PL_infix_plugin> variable, while the value previously stored there
13413 is written to C<*old_plugin_p>.
13414
13415 Direct access to L</PL_infix_plugin> should be avoided.
13416
13417 =cut
13418 */
13419
13420 void
Perl_wrap_infix_plugin(pTHX_ Perl_infix_plugin_t new_plugin,Perl_infix_plugin_t * old_plugin_p)13421 Perl_wrap_infix_plugin(pTHX_
13422 Perl_infix_plugin_t new_plugin, Perl_infix_plugin_t *old_plugin_p)
13423 {
13424
13425 PERL_UNUSED_CONTEXT;
13426 PERL_ARGS_ASSERT_WRAP_INFIX_PLUGIN;
13427 if (*old_plugin_p) return;
13428 /* We use the same mutex as for PL_keyword_plugin as it's so rare either
13429 * of them is actually updated; no need for a dedicated one each */
13430 KEYWORD_PLUGIN_MUTEX_LOCK;
13431 if (!*old_plugin_p) {
13432 *old_plugin_p = PL_infix_plugin;
13433 PL_infix_plugin = new_plugin;
13434 }
13435 KEYWORD_PLUGIN_MUTEX_UNLOCK;
13436 }
13437
13438 #define parse_recdescent(g,p) S_parse_recdescent(aTHX_ g,p)
13439 static void
S_parse_recdescent(pTHX_ int gramtype,I32 fakeeof)13440 S_parse_recdescent(pTHX_ int gramtype, I32 fakeeof)
13441 {
13442 SAVEI32(PL_lex_brackets);
13443 if (PL_lex_brackets > 100)
13444 Renew(PL_lex_brackstack, PL_lex_brackets + 10, char);
13445 PL_lex_brackstack[PL_lex_brackets++] = XFAKEEOF;
13446 SAVEI32(PL_lex_allbrackets);
13447 PL_lex_allbrackets = 0;
13448 SAVEI8(PL_lex_fakeeof);
13449 PL_lex_fakeeof = (U8)fakeeof;
13450 if(yyparse(gramtype) && !PL_parser->error_count)
13451 qerror(Perl_mess(aTHX_ "Parse error"));
13452 }
13453
13454 #define parse_recdescent_for_op(g,p) S_parse_recdescent_for_op(aTHX_ g,p)
13455 static OP *
S_parse_recdescent_for_op(pTHX_ int gramtype,I32 fakeeof)13456 S_parse_recdescent_for_op(pTHX_ int gramtype, I32 fakeeof)
13457 {
13458 OP *o;
13459 ENTER;
13460 SAVEVPTR(PL_eval_root);
13461 PL_eval_root = NULL;
13462 parse_recdescent(gramtype, fakeeof);
13463 o = PL_eval_root;
13464 LEAVE;
13465 return o;
13466 }
13467
13468 #define parse_expr(p,f) S_parse_expr(aTHX_ p,f)
13469 static OP *
S_parse_expr(pTHX_ I32 fakeeof,U32 flags)13470 S_parse_expr(pTHX_ I32 fakeeof, U32 flags)
13471 {
13472 OP *exprop;
13473 if (flags & ~PARSE_OPTIONAL)
13474 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_expr");
13475 exprop = parse_recdescent_for_op(GRAMEXPR, fakeeof);
13476 if (!exprop && !(flags & PARSE_OPTIONAL)) {
13477 if (!PL_parser->error_count)
13478 qerror(Perl_mess(aTHX_ "Parse error"));
13479 exprop = newOP(OP_NULL, 0);
13480 }
13481 return exprop;
13482 }
13483
13484 /*
13485 =for apidoc parse_arithexpr
13486
13487 Parse a Perl arithmetic expression. This may contain operators of precedence
13488 down to the bit shift operators. The expression must be followed (and thus
13489 terminated) either by a comparison or lower-precedence operator or by
13490 something that would normally terminate an expression such as semicolon.
13491 If C<flags> has the C<PARSE_OPTIONAL> bit set, then the expression is optional,
13492 otherwise it is mandatory. It is up to the caller to ensure that the
13493 dynamic parser state (L</PL_parser> et al) is correctly set to reflect
13494 the source of the code to be parsed and the lexical context for the
13495 expression.
13496
13497 The op tree representing the expression is returned. If an optional
13498 expression is absent, a null pointer is returned, otherwise the pointer
13499 will be non-null.
13500
13501 If an error occurs in parsing or compilation, in most cases a valid op
13502 tree is returned anyway. The error is reflected in the parser state,
13503 normally resulting in a single exception at the top level of parsing
13504 which covers all the compilation errors that occurred. Some compilation
13505 errors, however, will throw an exception immediately.
13506
13507 =for apidoc Amnh||PARSE_OPTIONAL
13508
13509 =cut
13510
13511 */
13512
13513 OP *
Perl_parse_arithexpr(pTHX_ U32 flags)13514 Perl_parse_arithexpr(pTHX_ U32 flags)
13515 {
13516 return parse_expr(LEX_FAKEEOF_COMPARE, flags);
13517 }
13518
13519 /*
13520 =for apidoc parse_termexpr
13521
13522 Parse a Perl term expression. This may contain operators of precedence
13523 down to the assignment operators. The expression must be followed (and thus
13524 terminated) either by a comma or lower-precedence operator or by
13525 something that would normally terminate an expression such as semicolon.
13526 If C<flags> has the C<PARSE_OPTIONAL> bit set, then the expression is optional,
13527 otherwise it is mandatory. It is up to the caller to ensure that the
13528 dynamic parser state (L</PL_parser> et al) is correctly set to reflect
13529 the source of the code to be parsed and the lexical context for the
13530 expression.
13531
13532 The op tree representing the expression is returned. If an optional
13533 expression is absent, a null pointer is returned, otherwise the pointer
13534 will be non-null.
13535
13536 If an error occurs in parsing or compilation, in most cases a valid op
13537 tree is returned anyway. The error is reflected in the parser state,
13538 normally resulting in a single exception at the top level of parsing
13539 which covers all the compilation errors that occurred. Some compilation
13540 errors, however, will throw an exception immediately.
13541
13542 =cut
13543 */
13544
13545 OP *
Perl_parse_termexpr(pTHX_ U32 flags)13546 Perl_parse_termexpr(pTHX_ U32 flags)
13547 {
13548 return parse_expr(LEX_FAKEEOF_COMMA, flags);
13549 }
13550
13551 /*
13552 =for apidoc parse_listexpr
13553
13554 Parse a Perl list expression. This may contain operators of precedence
13555 down to the comma operator. The expression must be followed (and thus
13556 terminated) either by a low-precedence logic operator such as C<or> or by
13557 something that would normally terminate an expression such as semicolon.
13558 If C<flags> has the C<PARSE_OPTIONAL> bit set, then the expression is optional,
13559 otherwise it is mandatory. It is up to the caller to ensure that the
13560 dynamic parser state (L</PL_parser> et al) is correctly set to reflect
13561 the source of the code to be parsed and the lexical context for the
13562 expression.
13563
13564 The op tree representing the expression is returned. If an optional
13565 expression is absent, a null pointer is returned, otherwise the pointer
13566 will be non-null.
13567
13568 If an error occurs in parsing or compilation, in most cases a valid op
13569 tree is returned anyway. The error is reflected in the parser state,
13570 normally resulting in a single exception at the top level of parsing
13571 which covers all the compilation errors that occurred. Some compilation
13572 errors, however, will throw an exception immediately.
13573
13574 =cut
13575 */
13576
13577 OP *
Perl_parse_listexpr(pTHX_ U32 flags)13578 Perl_parse_listexpr(pTHX_ U32 flags)
13579 {
13580 return parse_expr(LEX_FAKEEOF_LOWLOGIC, flags);
13581 }
13582
13583 /*
13584 =for apidoc parse_fullexpr
13585
13586 Parse a single complete Perl expression. This allows the full
13587 expression grammar, including the lowest-precedence operators such
13588 as C<or>. The expression must be followed (and thus terminated) by a
13589 token that an expression would normally be terminated by: end-of-file,
13590 closing bracketing punctuation, semicolon, or one of the keywords that
13591 signals a postfix expression-statement modifier. If C<flags> has the
13592 C<PARSE_OPTIONAL> bit set, then the expression is optional, otherwise it is
13593 mandatory. It is up to the caller to ensure that the dynamic parser
13594 state (L</PL_parser> et al) is correctly set to reflect the source of
13595 the code to be parsed and the lexical context for the expression.
13596
13597 The op tree representing the expression is returned. If an optional
13598 expression is absent, a null pointer is returned, otherwise the pointer
13599 will be non-null.
13600
13601 If an error occurs in parsing or compilation, in most cases a valid op
13602 tree is returned anyway. The error is reflected in the parser state,
13603 normally resulting in a single exception at the top level of parsing
13604 which covers all the compilation errors that occurred. Some compilation
13605 errors, however, will throw an exception immediately.
13606
13607 =cut
13608 */
13609
13610 OP *
Perl_parse_fullexpr(pTHX_ U32 flags)13611 Perl_parse_fullexpr(pTHX_ U32 flags)
13612 {
13613 return parse_expr(LEX_FAKEEOF_NONEXPR, flags);
13614 }
13615
13616 /*
13617 =for apidoc parse_block
13618
13619 Parse a single complete Perl code block. This consists of an opening
13620 brace, a sequence of statements, and a closing brace. The block
13621 constitutes a lexical scope, so C<my> variables and various compile-time
13622 effects can be contained within it. It is up to the caller to ensure
13623 that the dynamic parser state (L</PL_parser> et al) is correctly set to
13624 reflect the source of the code to be parsed and the lexical context for
13625 the statement.
13626
13627 The op tree representing the code block is returned. This is always a
13628 real op, never a null pointer. It will normally be a C<lineseq> list,
13629 including C<nextstate> or equivalent ops. No ops to construct any kind
13630 of runtime scope are included by virtue of it being a block.
13631
13632 If an error occurs in parsing or compilation, in most cases a valid op
13633 tree (most likely null) is returned anyway. The error is reflected in
13634 the parser state, normally resulting in a single exception at the top
13635 level of parsing which covers all the compilation errors that occurred.
13636 Some compilation errors, however, will throw an exception immediately.
13637
13638 The C<flags> parameter is reserved for future use, and must always
13639 be zero.
13640
13641 =cut
13642 */
13643
13644 OP *
Perl_parse_block(pTHX_ U32 flags)13645 Perl_parse_block(pTHX_ U32 flags)
13646 {
13647 if (flags)
13648 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_block");
13649 return parse_recdescent_for_op(GRAMBLOCK, LEX_FAKEEOF_NEVER);
13650 }
13651
13652 /*
13653 =for apidoc parse_barestmt
13654
13655 Parse a single unadorned Perl statement. This may be a normal imperative
13656 statement or a declaration that has compile-time effect. It does not
13657 include any label or other affixture. It is up to the caller to ensure
13658 that the dynamic parser state (L</PL_parser> et al) is correctly set to
13659 reflect the source of the code to be parsed and the lexical context for
13660 the statement.
13661
13662 The op tree representing the statement is returned. This may be a
13663 null pointer if the statement is null, for example if it was actually
13664 a subroutine definition (which has compile-time side effects). If not
13665 null, it will be ops directly implementing the statement, suitable to
13666 pass to L</newSTATEOP>. It will not normally include a C<nextstate> or
13667 equivalent op (except for those embedded in a scope contained entirely
13668 within the statement).
13669
13670 If an error occurs in parsing or compilation, in most cases a valid op
13671 tree (most likely null) is returned anyway. The error is reflected in
13672 the parser state, normally resulting in a single exception at the top
13673 level of parsing which covers all the compilation errors that occurred.
13674 Some compilation errors, however, will throw an exception immediately.
13675
13676 The C<flags> parameter is reserved for future use, and must always
13677 be zero.
13678
13679 =cut
13680 */
13681
13682 OP *
Perl_parse_barestmt(pTHX_ U32 flags)13683 Perl_parse_barestmt(pTHX_ U32 flags)
13684 {
13685 if (flags)
13686 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_barestmt");
13687 return parse_recdescent_for_op(GRAMBARESTMT, LEX_FAKEEOF_NEVER);
13688 }
13689
13690 /*
13691 =for apidoc parse_label
13692
13693 Parse a single label, possibly optional, of the type that may prefix a
13694 Perl statement. It is up to the caller to ensure that the dynamic parser
13695 state (L</PL_parser> et al) is correctly set to reflect the source of
13696 the code to be parsed. If C<flags> has the C<PARSE_OPTIONAL> bit set, then the
13697 label is optional, otherwise it is mandatory.
13698
13699 The name of the label is returned in the form of a fresh scalar. If an
13700 optional label is absent, a null pointer is returned.
13701
13702 If an error occurs in parsing, which can only occur if the label is
13703 mandatory, a valid label is returned anyway. The error is reflected in
13704 the parser state, normally resulting in a single exception at the top
13705 level of parsing which covers all the compilation errors that occurred.
13706
13707 =cut
13708 */
13709
13710 SV *
Perl_parse_label(pTHX_ U32 flags)13711 Perl_parse_label(pTHX_ U32 flags)
13712 {
13713 if (flags & ~PARSE_OPTIONAL)
13714 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_label");
13715 if (PL_nexttoke) {
13716 PL_parser->yychar = yylex();
13717 if (PL_parser->yychar == LABEL) {
13718 SV * const labelsv = cSVOPx(pl_yylval.opval)->op_sv;
13719 PL_parser->yychar = YYEMPTY;
13720 cSVOPx(pl_yylval.opval)->op_sv = NULL;
13721 op_free(pl_yylval.opval);
13722 return labelsv;
13723 } else {
13724 yyunlex();
13725 goto no_label;
13726 }
13727 } else {
13728 char *s, *t;
13729 STRLEN wlen, bufptr_pos;
13730 lex_read_space(0);
13731 t = s = PL_bufptr;
13732 if (!isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
13733 goto no_label;
13734 t = scan_word6(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &wlen, FALSE);
13735 if (word_takes_any_delimiter(s, wlen))
13736 goto no_label;
13737 bufptr_pos = s - SvPVX(PL_linestr);
13738 PL_bufptr = t;
13739 lex_read_space(LEX_KEEP_PREVIOUS);
13740 t = PL_bufptr;
13741 s = SvPVX(PL_linestr) + bufptr_pos;
13742 if (t[0] == ':' && t[1] != ':') {
13743 PL_oldoldbufptr = PL_oldbufptr;
13744 PL_oldbufptr = s;
13745 PL_bufptr = t+1;
13746 return newSVpvn_flags(s, wlen, UTF ? SVf_UTF8 : 0);
13747 } else {
13748 PL_bufptr = s;
13749 no_label:
13750 if (flags & PARSE_OPTIONAL) {
13751 return NULL;
13752 } else {
13753 qerror(Perl_mess(aTHX_ "Parse error"));
13754 return newSVpvs("x");
13755 }
13756 }
13757 }
13758 }
13759
13760 /*
13761 =for apidoc parse_fullstmt
13762
13763 Parse a single complete Perl statement. This may be a normal imperative
13764 statement or a declaration that has compile-time effect, and may include
13765 optional labels. It is up to the caller to ensure that the dynamic
13766 parser state (L</PL_parser> et al) is correctly set to reflect the source
13767 of the code to be parsed and the lexical context for the statement.
13768
13769 The op tree representing the statement is returned. This may be a
13770 null pointer if the statement is null, for example if it was actually
13771 a subroutine definition (which has compile-time side effects). If not
13772 null, it will be the result of a L</newSTATEOP> call, normally including
13773 a C<nextstate> or equivalent op.
13774
13775 If an error occurs in parsing or compilation, in most cases a valid op
13776 tree (most likely null) is returned anyway. The error is reflected in
13777 the parser state, normally resulting in a single exception at the top
13778 level of parsing which covers all the compilation errors that occurred.
13779 Some compilation errors, however, will throw an exception immediately.
13780
13781 The C<flags> parameter is reserved for future use, and must always
13782 be zero.
13783
13784 =cut
13785 */
13786
13787 OP *
Perl_parse_fullstmt(pTHX_ U32 flags)13788 Perl_parse_fullstmt(pTHX_ U32 flags)
13789 {
13790 if (flags)
13791 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_fullstmt");
13792 return parse_recdescent_for_op(GRAMFULLSTMT, LEX_FAKEEOF_NEVER);
13793 }
13794
13795 /*
13796 =for apidoc parse_stmtseq
13797
13798 Parse a sequence of zero or more Perl statements. These may be normal
13799 imperative statements, including optional labels, or declarations
13800 that have compile-time effect, or any mixture thereof. The statement
13801 sequence ends when a closing brace or end-of-file is encountered in a
13802 place where a new statement could have validly started. It is up to
13803 the caller to ensure that the dynamic parser state (L</PL_parser> et al)
13804 is correctly set to reflect the source of the code to be parsed and the
13805 lexical context for the statements.
13806
13807 The op tree representing the statement sequence is returned. This may
13808 be a null pointer if the statements were all null, for example if there
13809 were no statements or if there were only subroutine definitions (which
13810 have compile-time side effects). If not null, it will be a C<lineseq>
13811 list, normally including C<nextstate> or equivalent ops.
13812
13813 If an error occurs in parsing or compilation, in most cases a valid op
13814 tree is returned anyway. The error is reflected in the parser state,
13815 normally resulting in a single exception at the top level of parsing
13816 which covers all the compilation errors that occurred. Some compilation
13817 errors, however, will throw an exception immediately.
13818
13819 The C<flags> parameter is reserved for future use, and must always
13820 be zero.
13821
13822 =cut
13823 */
13824
13825 OP *
Perl_parse_stmtseq(pTHX_ U32 flags)13826 Perl_parse_stmtseq(pTHX_ U32 flags)
13827 {
13828 OP *stmtseqop;
13829 I32 c;
13830 if (flags)
13831 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_stmtseq");
13832 stmtseqop = parse_recdescent_for_op(GRAMSTMTSEQ, LEX_FAKEEOF_CLOSING);
13833 c = lex_peek_unichar(0);
13834 if (c != -1 && c != /*{*/'}')
13835 qerror(Perl_mess(aTHX_ "Parse error"));
13836 return stmtseqop;
13837 }
13838
13839 /*
13840 =for apidoc parse_subsignature
13841
13842 Parse a subroutine signature declaration. This is the contents of the
13843 parentheses following a named or anonymous subroutine declaration when the
13844 C<signatures> feature is enabled. Note that this function neither expects
13845 nor consumes the opening and closing parentheses around the signature; it
13846 is the caller's job to handle these.
13847
13848 This function must only be called during parsing of a subroutine; after
13849 L</start_subparse> has been called. It might allocate lexical variables on
13850 the pad for the current subroutine.
13851
13852 The op tree to unpack the arguments from the stack at runtime is returned.
13853 This op tree should appear at the beginning of the compiled function. The
13854 caller may wish to use L</op_append_list> to build their function body
13855 after it, or splice it together with the body before calling L</newATTRSUB>.
13856
13857 The C<flags> parameter is reserved for future use, and must always
13858 be zero.
13859
13860 =cut
13861 */
13862
13863 OP *
Perl_parse_subsignature(pTHX_ U32 flags)13864 Perl_parse_subsignature(pTHX_ U32 flags)
13865 {
13866 if (flags)
13867 Perl_croak(aTHX_ "Parsing code internal error (%s)", "parse_subsignature");
13868 return parse_recdescent_for_op(GRAMSUBSIGNATURE, LEX_FAKEEOF_NONEXPR);
13869 }
13870
13871 /*
13872 * ex: set ts=8 sts=4 sw=4 et:
13873 */
13874