1 /* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 * This file contains tables and code adapted from
9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10 * copyright notice:
11
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
20
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
23
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 SOFTWARE.
31
32 *
33 * This file is a home for static inline functions that cannot go in other
34 * header files, because they depend on proto.h (included after most other
35 * headers) or struct definitions.
36 *
37 * Note also perlstatic.h for functions that can't or shouldn't be inlined, but
38 * whose details should be exposed to the compiler, for such things as tail
39 * call optimization.
40 *
41 * Each section names the header file that the functions "belong" to.
42 */
43
44 /* ------------------------------- av.h ------------------------------- */
45
46 /*
47 =for apidoc_section $AV
48 =for apidoc av_count
49 Returns the number of elements in the array C<av>. This is the true length of
50 the array, including any undefined elements. It is always the same as
51 S<C<av_top_index(av) + 1>>.
52
53 =cut
54 */
55 PERL_STATIC_INLINE Size_t
Perl_av_count(pTHX_ AV * av)56 Perl_av_count(pTHX_ AV *av)
57 {
58 PERL_ARGS_ASSERT_AV_COUNT;
59 assert(SvTYPE(av) == SVt_PVAV);
60
61 return AvFILL(av) + 1;
62 }
63
64 /* ------------------------------- av.c ------------------------------- */
65
66 /*
67 =for apidoc av_store_simple
68
69 This is a cut-down version of av_store that assumes that the array is
70 very straightforward - no magic, not readonly, and AvREAL - and that
71 C<key> is not negative. This function MUST NOT be used in situations
72 where any of those assumptions may not hold.
73
74 Stores an SV in an array. The array index is specified as C<key>. It
75 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
76
77 Note that the caller is responsible for suitably incrementing the reference
78 count of C<val> before the call.
79
80 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
81
82 =cut
83 */
84
85 PERL_STATIC_INLINE SV**
Perl_av_store_simple(pTHX_ AV * av,SSize_t key,SV * val)86 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
87 {
88 SV** ary;
89
90 PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
91 assert(SvTYPE(av) == SVt_PVAV);
92 assert(!SvMAGICAL(av));
93 assert(!SvREADONLY(av));
94 assert(AvREAL(av));
95 assert(key > -1);
96
97 ary = AvARRAY(av);
98
99 if (AvFILLp(av) < key) {
100 if (key > AvMAX(av)) {
101 av_extend(av,key);
102 ary = AvARRAY(av);
103 }
104 AvFILLp(av) = key;
105 } else
106 SvREFCNT_dec(ary[key]);
107
108 ary[key] = val;
109 return &ary[key];
110 }
111
112 /*
113 =for apidoc av_fetch_simple
114
115 This is a cut-down version of av_fetch that assumes that the array is
116 very straightforward - no magic, not readonly, and AvREAL - and that
117 C<key> is not negative. This function MUST NOT be used in situations
118 where any of those assumptions may not hold.
119
120 Returns the SV at the specified index in the array. The C<key> is the
121 index. If lval is true, you are guaranteed to get a real SV back (in case
122 it wasn't real before), which you can then modify. Check that the return
123 value is non-null before dereferencing it to a C<SV*>.
124
125 The rough perl equivalent is C<$myarray[$key]>.
126
127 =cut
128 */
129
130 PERL_STATIC_INLINE SV**
Perl_av_fetch_simple(pTHX_ AV * av,SSize_t key,I32 lval)131 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
132 {
133 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
134 assert(SvTYPE(av) == SVt_PVAV);
135 assert(!SvMAGICAL(av));
136 assert(!SvREADONLY(av));
137 assert(AvREAL(av));
138 assert(key > -1);
139
140 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
141 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
142 } else {
143 return &AvARRAY(av)[key];
144 }
145 }
146
147 /*
148 =for apidoc av_push_simple
149
150 This is a cut-down version of av_push that assumes that the array is very
151 straightforward - no magic, not readonly, and AvREAL - and that C<key> is
152 not less than -1. This function MUST NOT be used in situations where any
153 of those assumptions may not hold.
154
155 Pushes an SV (transferring control of one reference count) onto the end of the
156 array. The array will grow automatically to accommodate the addition.
157
158 Perl equivalent: C<push @myarray, $val;>.
159
160 =cut
161 */
162
163 PERL_STATIC_INLINE void
Perl_av_push_simple(pTHX_ AV * av,SV * val)164 Perl_av_push_simple(pTHX_ AV *av, SV *val)
165 {
166 PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
167 assert(SvTYPE(av) == SVt_PVAV);
168 assert(!SvMAGICAL(av));
169 assert(!SvREADONLY(av));
170 assert(AvREAL(av));
171 assert(AvFILLp(av) > -2);
172
173 (void)av_store_simple(av,AvFILLp(av)+1,val);
174 }
175
176 /*
177 =for apidoc av_new_alloc
178
179 This implements L<perlapi/C<newAV_alloc_x>>
180 and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this
181 functionality.
182
183 Creates a new AV and allocates its SV* array.
184
185 This is similar to, but more efficient than doing:
186
187 AV *av = newAV();
188 av_extend(av, key);
189
190 The size parameter is used to pre-allocate a SV* array large enough to
191 hold at least elements C<0..(size-1)>. C<size> must be at least 1.
192
193 The C<zeroflag> parameter controls whether or not the array is NULL
194 initialized.
195
196 =cut
197 */
198
199 PERL_STATIC_INLINE AV *
Perl_av_new_alloc(pTHX_ SSize_t size,bool zeroflag)200 Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag)
201 {
202 AV * const av = newAV();
203 SV** ary;
204 PERL_ARGS_ASSERT_AV_NEW_ALLOC;
205 assert(size > 0);
206
207 Newx(ary, size, SV*); /* Newx performs the memwrap check */
208 AvALLOC(av) = ary;
209 AvARRAY(av) = ary;
210 AvMAX(av) = size - 1;
211
212 if (zeroflag)
213 Zero(ary, size, SV*);
214
215 return av;
216 }
217
218
219 /* remove (AvARRAY(av) - AvALLOC(av)) offset from empty array */
220
221 PERL_STATIC_INLINE void
Perl_av_remove_offset(pTHX_ AV * av)222 Perl_av_remove_offset(pTHX_ AV *av)
223 {
224 PERL_ARGS_ASSERT_AV_REMOVE_OFFSET;
225 assert(AvFILLp(av) == -1);
226 SSize_t i = AvARRAY(av) - AvALLOC(av);
227 if (i) {
228 AvARRAY(av) = AvALLOC(av);
229 AvMAX(av) += i;
230 #ifdef PERL_RC_STACK
231 Zero(AvALLOC(av), i, SV*);
232 #endif
233 }
234 }
235
236
237 /* ------------------------------- cv.h ------------------------------- */
238
239 /*
240 =for apidoc_section $CV
241 =for apidoc CvGV
242 Returns the GV associated with the CV C<sv>, reifying it if necessary.
243
244 =cut
245 */
246 PERL_STATIC_INLINE GV *
Perl_CvGV(pTHX_ CV * sv)247 Perl_CvGV(pTHX_ CV *sv)
248 {
249 PERL_ARGS_ASSERT_CVGV;
250
251 return CvNAMED(sv)
252 ? Perl_cvgv_from_hek(aTHX_ sv)
253 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
254 }
255
256 /*
257 =for apidoc CvDEPTH
258 Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a
259 recursive call.
260
261 =cut
262 */
263 PERL_STATIC_INLINE I32 *
Perl_CvDEPTH(const CV * const sv)264 Perl_CvDEPTH(const CV * const sv)
265 {
266 PERL_ARGS_ASSERT_CVDEPTH;
267 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
268
269 return &((XPVCV*)SvANY(sv))->xcv_depth;
270 }
271
272 /*
273 CvPROTO returns the prototype as stored, which is not necessarily what
274 the interpreter should be using. Specifically, the interpreter assumes
275 that spaces have been stripped, which has been the case if the prototype
276 was added by toke.c, but is generally not the case if it was added elsewhere.
277 Since we can't enforce the spacelessness at assignment time, this routine
278 provides a temporary copy at parse time with spaces removed.
279 I<orig> is the start of the original buffer, I<len> is the length of the
280 prototype and will be updated when this returns.
281 */
282
283 #ifdef PERL_CORE
284 PERL_STATIC_INLINE char *
S_strip_spaces(pTHX_ const char * orig,STRLEN * const len)285 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
286 {
287 SV * tmpsv;
288 char * tmps;
289 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
290 tmps = SvPVX(tmpsv);
291 while ((*len)--) {
292 if (!isSPACE(*orig))
293 *tmps++ = *orig;
294 orig++;
295 }
296 *tmps = '\0';
297 *len = tmps - SvPVX(tmpsv);
298 return SvPVX(tmpsv);
299 }
300 #endif
301
302 /* ------------------------------- iperlsys.h ------------------------------- */
303 #if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS)
304
305 /* Otherwise this function is implemented as macros in iperlsys.h */
306
307 PERL_STATIC_INLINE bool
S_PerlEnv_putenv(pTHX_ char * str)308 S_PerlEnv_putenv(pTHX_ char * str)
309 {
310 PERL_ARGS_ASSERT_PERLENV_PUTENV;
311
312 ENV_LOCK;
313 bool retval = putenv(str);
314 ENV_UNLOCK;
315
316 return retval;
317 }
318
319 #endif
320
321 /* ------------------------------- mg.h ------------------------------- */
322
323 #if defined(PERL_CORE) || defined(PERL_EXT)
324 /* assumes get-magic and stringification have already occurred */
325 PERL_STATIC_INLINE STRLEN
S_MgBYTEPOS(pTHX_ MAGIC * mg,SV * sv,const char * s,STRLEN len)326 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
327 {
328 assert(mg->mg_type == PERL_MAGIC_regex_global);
329 assert(mg->mg_len != -1);
330 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
331 return (STRLEN)mg->mg_len;
332 else {
333 const STRLEN pos = (STRLEN)mg->mg_len;
334 /* Without this check, we may read past the end of the buffer: */
335 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
336 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
337 }
338 }
339 #endif
340
341 /* ------------------------------- pad.h ------------------------------ */
342
343 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
344 PERL_STATIC_INLINE bool
S_PadnameIN_SCOPE(const PADNAME * const pn,const U32 seq)345 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
346 {
347 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
348
349 /* is seq within the range _LOW to _HIGH ?
350 * This is complicated by the fact that PL_cop_seqmax
351 * may have wrapped around at some point */
352 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
353 return FALSE; /* not yet introduced */
354
355 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
356 /* in compiling scope */
357 if (
358 (seq > COP_SEQ_RANGE_LOW(pn))
359 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
360 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
361 )
362 return TRUE;
363 }
364 else if (
365 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
366 ?
367 ( seq > COP_SEQ_RANGE_LOW(pn)
368 || seq <= COP_SEQ_RANGE_HIGH(pn))
369
370 : ( seq > COP_SEQ_RANGE_LOW(pn)
371 && seq <= COP_SEQ_RANGE_HIGH(pn))
372 )
373 return TRUE;
374 return FALSE;
375 }
376 #endif
377
378 /* ------------------------------- pp.h ------------------------------- */
379
380 PERL_STATIC_INLINE Stack_off_t
Perl_TOPMARK(pTHX)381 Perl_TOPMARK(pTHX)
382 {
383 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
384 "MARK top %p %" IVdf "\n",
385 PL_markstack_ptr,
386 (IV)*PL_markstack_ptr)));
387 return *PL_markstack_ptr;
388 }
389
390 PERL_STATIC_INLINE Stack_off_t
Perl_POPMARK(pTHX)391 Perl_POPMARK(pTHX)
392 {
393 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
394 "MARK pop %p %" IVdf "\n",
395 (PL_markstack_ptr-1),
396 (IV)*(PL_markstack_ptr-1))));
397 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
398 return *PL_markstack_ptr--;
399 }
400
401 /*
402 =for apidoc_section $rpp
403
404 =for apidoc rpp_extend
405 Ensures that there is space on the stack to push C<n> items, extending it
406 if necessary.
407
408 =cut
409 */
410
411 PERL_STATIC_INLINE void
Perl_rpp_extend(pTHX_ SSize_t n)412 Perl_rpp_extend(pTHX_ SSize_t n)
413 {
414 PERL_ARGS_ASSERT_RPP_EXTEND;
415
416 EXTEND_HWM_SET(PL_stack_sp, n);
417 #ifndef STRESS_REALLOC
418 if (UNLIKELY(_EXTEND_NEEDS_GROW(PL_stack_sp, n)))
419 #endif
420 {
421 (void)stack_grow(PL_stack_sp, PL_stack_sp, n);
422 }
423 }
424
425
426 /*
427 =for apidoc rpp_popfree_to
428
429 Pop and free all items on the argument stack above C<sp>. On return,
430 C<PL_stack_sp> will be equal to C<sp>.
431
432 =cut
433 */
434
435 PERL_STATIC_INLINE void
Perl_rpp_popfree_to(pTHX_ SV ** sp)436 Perl_rpp_popfree_to(pTHX_ SV **sp)
437 {
438 PERL_ARGS_ASSERT_RPP_POPFREE_TO;
439
440 assert(sp <= PL_stack_sp);
441 #ifdef PERL_RC_STACK
442 assert(rpp_stack_is_rc());
443 while (PL_stack_sp > sp) {
444 SV *sv = *PL_stack_sp--;
445 SvREFCNT_dec(sv);
446 }
447 #else
448 PL_stack_sp = sp;
449 #endif
450 }
451
452
453 /*
454 =for apidoc rpp_popfree_to_NN
455
456 A variant of rpp_popfree_to() which assumes that all the pointers being
457 popped off the stack are non-NULL.
458
459 =cut
460 */
461
462 PERL_STATIC_INLINE void
Perl_rpp_popfree_to_NN(pTHX_ SV ** sp)463 Perl_rpp_popfree_to_NN(pTHX_ SV **sp)
464 {
465 PERL_ARGS_ASSERT_RPP_POPFREE_TO_NN;
466
467 assert(sp <= PL_stack_sp);
468 #ifdef PERL_RC_STACK
469 assert(rpp_stack_is_rc());
470 while (PL_stack_sp > sp) {
471 SV *sv = *PL_stack_sp--;
472 assert(sv);
473 SvREFCNT_dec_NN(sv);
474 }
475 #else
476 PL_stack_sp = sp;
477 #endif
478 }
479
480
481 /*
482 =for apidoc rpp_popfree_1
483
484 Pop and free the top item on the argument stack and update C<PL_stack_sp>.
485
486 =cut
487 */
488
489 PERL_STATIC_INLINE void
Perl_rpp_popfree_1(pTHX)490 Perl_rpp_popfree_1(pTHX)
491 {
492 PERL_ARGS_ASSERT_RPP_POPFREE_1;
493
494 #ifdef PERL_RC_STACK
495 assert(rpp_stack_is_rc());
496 SV *sv = *PL_stack_sp--;
497 SvREFCNT_dec(sv);
498 #else
499 PL_stack_sp--;
500 #endif
501 }
502
503
504 /*
505 =for apidoc rpp_popfree_1_NN
506
507 A variant of rpp_popfree_1() which assumes that the pointer being popped
508 off the stack is non-NULL.
509
510 =cut
511 */
512
513 PERL_STATIC_INLINE void
Perl_rpp_popfree_1_NN(pTHX)514 Perl_rpp_popfree_1_NN(pTHX)
515 {
516 PERL_ARGS_ASSERT_RPP_POPFREE_1_NN;
517
518 assert(*PL_stack_sp);
519 #ifdef PERL_RC_STACK
520 assert(rpp_stack_is_rc());
521 SV *sv = *PL_stack_sp--;
522 SvREFCNT_dec_NN(sv);
523 #else
524 PL_stack_sp--;
525 #endif
526 }
527
528
529 /*
530 =for apidoc rpp_popfree_2
531
532 Pop and free the top two items on the argument stack and update
533 C<PL_stack_sp>.
534
535 =cut
536 */
537
538
539 PERL_STATIC_INLINE void
Perl_rpp_popfree_2(pTHX)540 Perl_rpp_popfree_2(pTHX)
541 {
542 PERL_ARGS_ASSERT_RPP_POPFREE_2;
543
544 #ifdef PERL_RC_STACK
545 assert(rpp_stack_is_rc());
546 for (int i = 0; i < 2; i++) {
547 SV *sv = *PL_stack_sp--;
548 SvREFCNT_dec(sv);
549 }
550 #else
551 PL_stack_sp -= 2;
552 #endif
553 }
554
555
556 /*
557 =for apidoc rpp_popfree_2_NN
558
559 A variant of rpp_popfree_2() which assumes that the two pointers being
560 popped off the stack are non-NULL.
561
562 =cut
563 */
564
565
566 PERL_STATIC_INLINE void
Perl_rpp_popfree_2_NN(pTHX)567 Perl_rpp_popfree_2_NN(pTHX)
568 {
569 PERL_ARGS_ASSERT_RPP_POPFREE_2_NN;
570 #ifdef PERL_RC_STACK
571 SV *sv2 = *PL_stack_sp--;
572 assert(sv2);
573 SV *sv1 = *PL_stack_sp;
574 assert(sv1);
575
576 assert(rpp_stack_is_rc());
577 U32 rc1 = SvREFCNT(sv1);
578 U32 rc2 = SvREFCNT(sv2);
579 /* This expression is intended to be true if either of rc1 or rc2 has
580 * the value 0 or 1, but using only a single branch test, rather
581 * than the two branches that a compiler would plant for a boolean
582 * expression. We are working on the assumption that, most of the
583 * time, neither of the args to a binary function will need to be
584 * freed - they're likely to lex vars, or PADTMPs or whatever.
585 * So give the CPU a single branch that is rarely taken. */
586 if (UNLIKELY( !(rc1>>1) + !(rc2>>1) ))
587 /* at least one of the old SVs needs freeing. Do it the long way */
588 Perl_rpp_free_2_(aTHX_ sv1, sv2, rc1, rc2);
589 else {
590 SvREFCNT(sv1) = rc1 - 1;
591 SvREFCNT(sv2) = rc2 - 1;
592 }
593 PL_stack_sp--;
594 #else
595 PL_stack_sp -= 2;
596 #endif
597 }
598
599
600 /*
601 =for apidoc rpp_pop_1_norc
602
603 Pop and return the top item off the argument stack and update
604 C<PL_stack_sp>. It's similar to rpp_popfree_1(), except that it actually
605 returns a value, and it I<doesn't> decrement the SV's reference count.
606 On non-C<PERL_RC_STACK> builds it actually increments the SV's reference
607 count.
608
609 This is useful in cases where the popped value is immediately embedded
610 somewhere e.g. via av_store(), allowing you skip decrementing and then
611 immediately incrementing the reference count again (and risk prematurely
612 freeing the SV if it had a RC of 1). On non-RC builds, the reference count
613 bookkeeping still works too, which is why it should be used rather than
614 a simple C<*PL_stack_sp-->.
615
616 =cut
617 */
618
619 PERL_STATIC_INLINE SV*
Perl_rpp_pop_1_norc(pTHX)620 Perl_rpp_pop_1_norc(pTHX)
621 {
622 PERL_ARGS_ASSERT_RPP_POP_1_NORC
623
624 SV *sv = *PL_stack_sp--;
625
626 #ifndef PERL_RC_STACK
627 SvREFCNT_inc(sv);
628 #else
629 assert(rpp_stack_is_rc());
630 #endif
631 return sv;
632 }
633
634
635
636 /*
637 =for apidoc rpp_push_1
638 =for apidoc_item rpp_push_IMM
639 =for apidoc_item rpp_push_2
640 =for apidoc_item rpp_xpush_1
641 =for apidoc_item rpp_xpush_IMM
642 =for apidoc_item rpp_xpush_2
643
644 Push one or two SVs onto the stack, incrementing their reference counts
645 and updating C<PL_stack_sp>. With the C<x> variants, it extends the stack
646 first. The C<IMM> variants assume that the single argument is an immortal
647 such as <&PL_sv_undef> and, for efficiency, will skip incrementing its
648 reference count.
649
650 =cut
651 */
652
653 PERL_STATIC_INLINE void
Perl_rpp_push_1(pTHX_ SV * sv)654 Perl_rpp_push_1(pTHX_ SV *sv)
655 {
656 PERL_ARGS_ASSERT_RPP_PUSH_1;
657
658 *++PL_stack_sp = sv;
659 #ifdef PERL_RC_STACK
660 assert(rpp_stack_is_rc());
661 SvREFCNT_inc_simple_void_NN(sv);
662 #endif
663 }
664
665 PERL_STATIC_INLINE void
Perl_rpp_push_IMM(pTHX_ SV * sv)666 Perl_rpp_push_IMM(pTHX_ SV *sv)
667 {
668 PERL_ARGS_ASSERT_RPP_PUSH_IMM;
669
670 assert(SvIMMORTAL(sv));
671 *++PL_stack_sp = sv;
672 #ifdef PERL_RC_STACK
673 assert(rpp_stack_is_rc());
674 #endif
675 }
676
677 PERL_STATIC_INLINE void
Perl_rpp_push_2(pTHX_ SV * sv1,SV * sv2)678 Perl_rpp_push_2(pTHX_ SV *sv1, SV *sv2)
679 {
680 PERL_ARGS_ASSERT_RPP_PUSH_2;
681
682 *++PL_stack_sp = sv1;
683 *++PL_stack_sp = sv2;
684 #ifdef PERL_RC_STACK
685 assert(rpp_stack_is_rc());
686 SvREFCNT_inc_simple_void_NN(sv1);
687 SvREFCNT_inc_simple_void_NN(sv2);
688 #endif
689 }
690
691 PERL_STATIC_INLINE void
Perl_rpp_xpush_1(pTHX_ SV * sv)692 Perl_rpp_xpush_1(pTHX_ SV *sv)
693 {
694 PERL_ARGS_ASSERT_RPP_XPUSH_1;
695
696 rpp_extend(1);
697 rpp_push_1(sv);
698 }
699
700 PERL_STATIC_INLINE void
Perl_rpp_xpush_IMM(pTHX_ SV * sv)701 Perl_rpp_xpush_IMM(pTHX_ SV *sv)
702 {
703 PERL_ARGS_ASSERT_RPP_XPUSH_IMM;
704
705 rpp_extend(1);
706 rpp_push_IMM(sv);
707 }
708
709 PERL_STATIC_INLINE void
Perl_rpp_xpush_2(pTHX_ SV * sv1,SV * sv2)710 Perl_rpp_xpush_2(pTHX_ SV *sv1, SV *sv2)
711 {
712 PERL_ARGS_ASSERT_RPP_XPUSH_2;
713
714 rpp_extend(2);
715 rpp_push_2(sv1, sv2);
716 }
717
718
719 /*
720 =for apidoc rpp_push_1_norc
721
722 Push C<sv> onto the stack without incrementing its reference count, and
723 update C<PL_stack_sp>. On non-PERL_RC_STACK builds, mortalise too.
724
725 This is most useful where an SV has just been created and already has a
726 reference count of 1, but has not yet been anchored anywhere.
727
728 =cut
729 */
730
731 PERL_STATIC_INLINE void
Perl_rpp_push_1_norc(pTHX_ SV * sv)732 Perl_rpp_push_1_norc(pTHX_ SV *sv)
733 {
734 PERL_ARGS_ASSERT_RPP_PUSH_1;
735
736 *++PL_stack_sp = sv;
737 #ifdef PERL_RC_STACK
738 assert(rpp_stack_is_rc());
739 #else
740 sv_2mortal(sv);
741 #endif
742 }
743
744
745 /*
746 =for apidoc rpp_replace_1_1
747 =for apidoc_item rpp_replace_1_1_NN
748 =for apidoc_item rpp_replace_1_IMM_NN
749
750 Replace the current top stack item with C<sv>, while suitably adjusting
751 reference counts. Equivalent to rpp_popfree_1(); rpp_push_1(sv), but
752 is more efficient and handles both SVs being the same.
753
754 The C<_NN> variant assumes that the pointer on the stack to the SV being
755 freed is non-NULL.
756
757 The C<IMM_NN> variant is like the C<_NN> variant, but in addition, assumes
758 that the single argument is an immortal such as <&PL_sv_undef> and, for
759 efficiency, will skip incrementing its reference count.
760
761 =cut
762 */
763
764 PERL_STATIC_INLINE void
Perl_rpp_replace_1_1(pTHX_ SV * sv)765 Perl_rpp_replace_1_1(pTHX_ SV *sv)
766 {
767 PERL_ARGS_ASSERT_RPP_REPLACE_1_1;
768
769 assert(sv);
770 #ifdef PERL_RC_STACK
771 assert(rpp_stack_is_rc());
772 SV *oldsv = *PL_stack_sp;
773 *PL_stack_sp = sv;
774 SvREFCNT_inc_simple_void_NN(sv);
775 SvREFCNT_dec(oldsv);
776 #else
777 *PL_stack_sp = sv;
778 #endif
779 }
780
781
782 PERL_STATIC_INLINE void
Perl_rpp_replace_1_1_NN(pTHX_ SV * sv)783 Perl_rpp_replace_1_1_NN(pTHX_ SV *sv)
784 {
785 PERL_ARGS_ASSERT_RPP_REPLACE_1_1_NN;
786
787 assert(sv);
788 assert(*PL_stack_sp);
789 #ifdef PERL_RC_STACK
790 assert(rpp_stack_is_rc());
791 SV *oldsv = *PL_stack_sp;
792 *PL_stack_sp = sv;
793 SvREFCNT_inc_simple_void_NN(sv);
794 SvREFCNT_dec_NN(oldsv);
795 #else
796 *PL_stack_sp = sv;
797 #endif
798 }
799
800
801 PERL_STATIC_INLINE void
Perl_rpp_replace_1_IMM_NN(pTHX_ SV * sv)802 Perl_rpp_replace_1_IMM_NN(pTHX_ SV *sv)
803 {
804 PERL_ARGS_ASSERT_RPP_REPLACE_1_IMM_NN;
805
806 assert(sv);
807 assert(SvIMMORTAL(sv));
808 assert(*PL_stack_sp);
809 #ifdef PERL_RC_STACK
810 assert(rpp_stack_is_rc());
811 SV *oldsv = *PL_stack_sp;
812 *PL_stack_sp = sv;
813 SvREFCNT_dec_NN(oldsv);
814 #else
815 *PL_stack_sp = sv;
816 #endif
817 }
818
819
820 /*
821 =for apidoc rpp_replace_2_1
822 =for apidoc_item rpp_replace_2_1_NN
823 =for apidoc_item rpp_replace_2_IMM_NN
824
825 Replace the current top to stacks item with C<sv>, while suitably
826 adjusting reference counts. Equivalent to rpp_popfree_2(); rpp_push_1(sv),
827 but is more efficient and handles SVs being the same.
828
829 The C<_NN> variant assumes that the pointers on the stack to the SVs being
830 freed are non-NULL.
831
832 The C<IMM_NN> variant is like the C<_NN> variant, but in addition, assumes
833 that the single argument is an immortal such as <&PL_sv_undef> and, for
834 efficiency, will skip incrementing its reference count.
835 =cut
836 */
837
838 PERL_STATIC_INLINE void
Perl_rpp_replace_2_1(pTHX_ SV * sv)839 Perl_rpp_replace_2_1(pTHX_ SV *sv)
840 {
841 PERL_ARGS_ASSERT_RPP_REPLACE_2_1;
842
843 #ifdef PERL_RC_STACK
844 assert(rpp_stack_is_rc());
845 /* replace PL_stack_sp[-1] first; leave PL_stack_sp[0] in place while
846 * we free [-1], so if an exception occurs, [0] will still be freed.
847 */
848 SV *oldsv = PL_stack_sp[-1];
849 PL_stack_sp[-1] = sv;
850 SvREFCNT_inc_simple_void_NN(sv);
851 SvREFCNT_dec(oldsv);
852 oldsv = *PL_stack_sp--;
853 SvREFCNT_dec(oldsv);
854 #else
855 *--PL_stack_sp = sv;
856 #endif
857 }
858
859
860 /* Private helper function for _NN and _IMM_NN variants.
861 * Assumes sv has already had its ref count incremented,
862 * ready for being put on the stack.
863 * Intended to be small and fast, since it's inlined into many hot parts of
864 * code.
865 */
866
867 PERL_STATIC_INLINE void
Perl_rpp_replace_2_1_COMMON(pTHX_ SV * sv)868 Perl_rpp_replace_2_1_COMMON(pTHX_ SV *sv)
869 {
870
871 assert(sv);
872 #ifdef PERL_RC_STACK
873 SV *sv2 = *PL_stack_sp--;
874 assert(sv2);
875 SV *sv1 = *PL_stack_sp;
876 assert(sv1);
877
878 *PL_stack_sp = sv;
879 assert(rpp_stack_is_rc());
880 U32 rc1 = SvREFCNT(sv1);
881 U32 rc2 = SvREFCNT(sv2);
882 /* This expression is intended to be true if either of rc1 or rc2 has
883 * the value 0 or 1, but using only a single branch test, rather
884 * than the two branches that a compiler would plant for a boolean
885 * expression. We are working on the assumption that, most of the
886 * time, neither of the args to a binary function will need to be
887 * freed - they're likely to lex vars, or PADTMPs or whatever.
888 * So give the CPU a single branch that is rarely taken. */
889 if (UNLIKELY( !(rc1>>1) + !(rc2>>1) ))
890 /* at least one of the old SVs needs freeing. Do it the long way */
891 Perl_rpp_free_2_(aTHX_ sv1, sv2, rc1, rc2);
892 else {
893 SvREFCNT(sv1) = rc1 - 1;
894 SvREFCNT(sv2) = rc2 - 1;
895 }
896 #else
897 *--PL_stack_sp = sv;
898 #endif
899 }
900
901
902 PERL_STATIC_INLINE void
Perl_rpp_replace_2_1_NN(pTHX_ SV * sv)903 Perl_rpp_replace_2_1_NN(pTHX_ SV *sv)
904 {
905 PERL_ARGS_ASSERT_RPP_REPLACE_2_1_NN;
906
907 assert(sv);
908 #ifdef PERL_RC_STACK
909 SvREFCNT_inc_simple_void_NN(sv);
910 #endif
911 rpp_replace_2_1_COMMON(sv);
912 }
913
914
915 PERL_STATIC_INLINE void
Perl_rpp_replace_2_IMM_NN(pTHX_ SV * sv)916 Perl_rpp_replace_2_IMM_NN(pTHX_ SV *sv)
917 {
918 PERL_ARGS_ASSERT_RPP_REPLACE_2_IMM_NN;
919
920 assert(sv);
921 assert(SvIMMORTAL(sv));
922 rpp_replace_2_1_COMMON(sv);
923 }
924
925
926 /*
927 =for apidoc rpp_replace_at
928
929 Replace the SV at address sp within the stack with C<sv>, while suitably
930 adjusting reference counts. Equivalent to C<*sp = sv>, except with proper
931 reference count handling.
932
933 =cut
934 */
935
936 PERL_STATIC_INLINE void
Perl_rpp_replace_at(pTHX_ SV ** sp,SV * sv)937 Perl_rpp_replace_at(pTHX_ SV **sp, SV *sv)
938 {
939 PERL_ARGS_ASSERT_RPP_REPLACE_AT;
940
941 #ifdef PERL_RC_STACK
942 assert(rpp_stack_is_rc());
943 SV *oldsv = *sp;
944 *sp = sv;
945 SvREFCNT_inc_simple_void_NN(sv);
946 SvREFCNT_dec(oldsv);
947 #else
948 *sp = sv;
949 #endif
950 }
951
952
953 /*
954 =for apidoc rpp_replace_at_NN
955
956 A variant of rpp_replace_at() which assumes that the SV pointer on the
957 stack is non-NULL.
958
959 =cut
960 */
961
962 PERL_STATIC_INLINE void
Perl_rpp_replace_at_NN(pTHX_ SV ** sp,SV * sv)963 Perl_rpp_replace_at_NN(pTHX_ SV **sp, SV *sv)
964 {
965 PERL_ARGS_ASSERT_RPP_REPLACE_AT_NN;
966
967 assert(sv);
968 assert(*sp);
969 #ifdef PERL_RC_STACK
970 assert(rpp_stack_is_rc());
971 SV *oldsv = *sp;
972 *sp = sv;
973 SvREFCNT_inc_simple_void_NN(sv);
974 SvREFCNT_dec_NN(oldsv);
975 #else
976 *sp = sv;
977 #endif
978 }
979
980
981 /*
982 =for apidoc rpp_replace_at_norc
983
984 Replace the SV at address sp within the stack with C<sv>, while suitably
985 adjusting the reference count of the old SV. Equivalent to C<*sp = sv>,
986 except with proper reference count handling.
987
988 C<sv>'s reference count doesn't get incremented. On non-C<PERL_RC_STACK>
989 builds, it gets mortalised too.
990
991 This is most useful where an SV has just been created and already has a
992 reference count of 1, but has not yet been anchored anywhere.
993
994 =cut
995 */
996
997 PERL_STATIC_INLINE void
Perl_rpp_replace_at_norc(pTHX_ SV ** sp,SV * sv)998 Perl_rpp_replace_at_norc(pTHX_ SV **sp, SV *sv)
999 {
1000 PERL_ARGS_ASSERT_RPP_REPLACE_AT_NORC;
1001
1002 #ifdef PERL_RC_STACK
1003 assert(rpp_stack_is_rc());
1004 SV *oldsv = *sp;
1005 *sp = sv;
1006 SvREFCNT_dec(oldsv);
1007 #else
1008 *sp = sv;
1009 sv_2mortal(sv);
1010 #endif
1011 }
1012
1013
1014 /*
1015 =for apidoc rpp_replace_at_norc_NN
1016
1017 A variant of rpp_replace_at_norc() which assumes that the SV pointer on the
1018 stack is non-NULL.
1019
1020 =cut
1021 */
1022
1023 PERL_STATIC_INLINE void
Perl_rpp_replace_at_norc_NN(pTHX_ SV ** sp,SV * sv)1024 Perl_rpp_replace_at_norc_NN(pTHX_ SV **sp, SV *sv)
1025 {
1026 PERL_ARGS_ASSERT_RPP_REPLACE_AT_NORC_NN;
1027
1028 assert(*sp);
1029 #ifdef PERL_RC_STACK
1030 assert(rpp_stack_is_rc());
1031 SV *oldsv = *sp;
1032 *sp = sv;
1033 SvREFCNT_dec_NN(oldsv);
1034 #else
1035 *sp = sv;
1036 sv_2mortal(sv);
1037 #endif
1038 }
1039
1040
1041 /*
1042 =for apidoc rpp_context
1043
1044 Impose void, scalar or list context on the stack.
1045 First, pop C<extra> items off the stack, then when C<gimme> is:
1046 C<G_LIST>: return as-is.
1047 C<G_VOID>: pop everything back to C<mark>
1048 C<G_SCALAR>: move the top stack item (or C<&PL_sv_undef> if none) to
1049 C<mark+1> and free everything above it.
1050
1051 =cut
1052 */
1053
1054 PERL_STATIC_INLINE void
Perl_rpp_context(pTHX_ SV ** mark,U8 gimme,SSize_t extra)1055 Perl_rpp_context(pTHX_ SV **mark, U8 gimme, SSize_t extra)
1056 {
1057 PERL_ARGS_ASSERT_RPP_CONTEXT;
1058 assert(extra >= 0);
1059 assert(mark <= PL_stack_sp - extra);
1060
1061 if (gimme == G_LIST)
1062 mark = PL_stack_sp - extra;
1063 else if (gimme == G_SCALAR) {
1064 SV **svp = PL_stack_sp - extra;
1065 mark++;
1066 if (mark > svp) {
1067 /* empty list (plus extra) */
1068 rpp_popfree_to(svp);
1069 rpp_extend(1);
1070 *++PL_stack_sp = &PL_sv_undef;
1071 return;
1072 }
1073 /* swap top and bottom list items */
1074 SV *top = *svp;
1075 *svp = *mark;
1076 *mark = top;
1077 }
1078 rpp_popfree_to(mark);
1079 }
1080
1081
1082
1083
1084 /*
1085 =for apidoc rpp_try_AMAGIC_1
1086 =for apidoc_item rpp_try_AMAGIC_2
1087
1088 Check whether either of the one or two SVs at the top of the stack is
1089 magical or a ref, and in either case handle it specially: invoke get
1090 magic, call an overload method, or replace a ref with a temporary numeric
1091 value, as appropriate. If this function returns true, it indicates that
1092 the correct return value is already on the stack. Intended to be used at
1093 the beginning of the PP function for unary or binary ops.
1094
1095 =cut
1096 */
1097
1098 PERL_STATIC_INLINE bool
Perl_rpp_try_AMAGIC_1(pTHX_ int method,int flags)1099 Perl_rpp_try_AMAGIC_1(pTHX_ int method, int flags)
1100 {
1101 return UNLIKELY((SvFLAGS(*PL_stack_sp) & (SVf_ROK|SVs_GMG)))
1102 && Perl_try_amagic_un(aTHX_ method, flags);
1103 }
1104
1105 PERL_STATIC_INLINE bool
Perl_rpp_try_AMAGIC_2(pTHX_ int method,int flags)1106 Perl_rpp_try_AMAGIC_2(pTHX_ int method, int flags)
1107 {
1108 return UNLIKELY(((SvFLAGS(PL_stack_sp[-1])|SvFLAGS(PL_stack_sp[0]))
1109 & (SVf_ROK|SVs_GMG)))
1110 && Perl_try_amagic_bin(aTHX_ method, flags);
1111 }
1112
1113
1114 /*
1115 =for apidoc rpp_stack_is_rc
1116
1117 Returns a boolean value indicating whether the stack is currently
1118 reference-counted. Note that if the stack is split (bottom half RC, top
1119 half non-RC), this function returns false, even if the top half currently
1120 contains zero items.
1121
1122 =cut
1123 */
1124
1125 PERL_STATIC_INLINE bool
Perl_rpp_stack_is_rc(pTHX)1126 Perl_rpp_stack_is_rc(pTHX)
1127 {
1128 #ifdef PERL_RC_STACK
1129 return AvREAL(PL_curstack) && !PL_curstackinfo->si_stack_nonrc_base;
1130 #else
1131 return 0;
1132 #endif
1133
1134 }
1135
1136
1137 /*
1138 =for apidoc rpp_is_lone
1139
1140 Indicates whether the stacked SV C<sv> (assumed to be not yet popped off
1141 the stack) is only kept alive due to a single reference from the argument
1142 stack and/or and the temps stack.
1143
1144 This can used for example to decide whether the copying of return values
1145 in rvalue context can be skipped, or whether it shouldn't be assigned to
1146 in lvalue context.
1147
1148 =cut
1149 */
1150
1151
1152 PERL_STATIC_INLINE bool
Perl_rpp_is_lone(pTHX_ SV * sv)1153 Perl_rpp_is_lone(pTHX_ SV *sv)
1154 {
1155 #ifdef PERL_RC_STACK
1156 /* note that rpp_is_lone() can be used in wrapped pp functions,
1157 * where technically the stack is no longer ref-counted; but because
1158 * the args are non-RC copies of RC args further down the stack, we
1159 * can't be in a *completely* non-ref stack.
1160 */
1161 assert(AvREAL(PL_curstack));
1162 #endif
1163
1164 return SvREFCNT(sv) <= cBOOL(SvTEMP(sv))
1165 #ifdef PERL_RC_STACK
1166 + 1
1167 && !SvIMMORTAL(sv) /* PL_sv_undef etc are never stealable */
1168 #endif
1169 ;
1170 }
1171
1172
1173 /*
1174 =for apidoc rpp_invoke_xs
1175
1176 Call the XS function associated with C<cv>. Wraps the call if necessary to
1177 handle XS functions which are not aware of reference-counted stacks.
1178
1179 =cut
1180 */
1181
1182
1183 PERL_STATIC_INLINE void
Perl_rpp_invoke_xs(pTHX_ CV * cv)1184 Perl_rpp_invoke_xs(pTHX_ CV *cv)
1185 {
1186 PERL_ARGS_ASSERT_RPP_INVOKE_XS;
1187
1188 #ifdef PERL_RC_STACK
1189 if (!CvXS_RCSTACK(cv))
1190 Perl_xs_wrap(aTHX_ CvXSUB(cv), cv);
1191 else
1192 #endif
1193 CvXSUB(cv)(aTHX_ cv);
1194 }
1195
1196
1197
1198
1199 /* ----------------------------- regexp.h ----------------------------- */
1200
1201 /* PVLVs need to act as a superset of all scalar types - they are basically
1202 * PVMGs with a few extra fields.
1203 * REGEXPs are first class scalars, but have many fields that can't be copied
1204 * into a PVLV body.
1205 *
1206 * Hence we take a different approach - instead of a copy, PVLVs store a pointer
1207 * back to the original body. To avoid increasing the size of PVLVs just for the
1208 * rare case of REGEXP assignment, this pointer is stored in the memory usually
1209 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
1210 * read the pointer from the two possible locations. The macro SvLEN() wraps the
1211 * access to the union's member xpvlenu_len, but there is no equivalent macro
1212 * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
1213 *
1214 * See commit df6b4bd56551f2d3 for more details. */
1215
1216 PERL_STATIC_INLINE struct regexp *
Perl_ReANY(const REGEXP * const re)1217 Perl_ReANY(const REGEXP * const re)
1218 {
1219 XPV* const p = (XPV*)SvANY(re);
1220
1221 PERL_ARGS_ASSERT_REANY;
1222 assert(isREGEXP(re));
1223
1224 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
1225 : (struct regexp *)p;
1226 }
1227
1228 /* ------------------------------- utf8.h ------------------------------- */
1229
1230 /*
1231 =for apidoc_section $unicode
1232 */
1233
1234 PERL_STATIC_INLINE void
Perl_append_utf8_from_native_byte(const U8 byte,U8 ** dest)1235 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
1236 {
1237 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
1238 * encoded string at '*dest', updating '*dest' to include it */
1239
1240 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
1241
1242 if (NATIVE_BYTE_IS_INVARIANT(byte))
1243 *((*dest)++) = byte;
1244 else {
1245 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
1246 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
1247 }
1248 }
1249
1250 /*
1251 =for apidoc valid_utf8_to_uvchr
1252 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
1253 known that the next character in the input UTF-8 string C<s> is well-formed
1254 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
1255 points, and non-Unicode code points are allowed.
1256
1257 =cut
1258
1259 */
1260
1261 PERL_STATIC_INLINE UV
Perl_valid_utf8_to_uvchr(const U8 * s,STRLEN * retlen)1262 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
1263 {
1264 const UV expectlen = UTF8SKIP(s);
1265 const U8* send = s + expectlen;
1266 UV uv = *s;
1267
1268 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
1269
1270 if (retlen) {
1271 *retlen = expectlen;
1272 }
1273
1274 /* An invariant is trivially returned */
1275 if (expectlen == 1) {
1276 return uv;
1277 }
1278
1279 /* Remove the leading bits that indicate the number of bytes, leaving just
1280 * the bits that are part of the value */
1281 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1282
1283 /* Now, loop through the remaining bytes, accumulating each into the
1284 * working total as we go. (I khw tried unrolling the loop for up to 4
1285 * bytes, but there was no performance improvement) */
1286 for (++s; s < send; s++) {
1287 uv = UTF8_ACCUMULATE(uv, *s);
1288 }
1289
1290 return UNI_TO_NATIVE(uv);
1291
1292 }
1293
1294 /*
1295 =for apidoc is_utf8_invariant_string
1296
1297 Returns TRUE if the first C<len> bytes of the string C<s> are the same
1298 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
1299 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
1300 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
1301 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
1302 characters are invariant, but so also are the C1 controls.
1303
1304 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
1305 use this option, that C<s> can't have embedded C<NUL> characters and has to
1306 have a terminating C<NUL> byte).
1307
1308 See also
1309 C<L</is_utf8_string>>,
1310 C<L</is_utf8_string_flags>>,
1311 C<L</is_utf8_string_loc>>,
1312 C<L</is_utf8_string_loc_flags>>,
1313 C<L</is_utf8_string_loclen>>,
1314 C<L</is_utf8_string_loclen_flags>>,
1315 C<L</is_utf8_fixed_width_buf_flags>>,
1316 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1317 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1318 C<L</is_strict_utf8_string>>,
1319 C<L</is_strict_utf8_string_loc>>,
1320 C<L</is_strict_utf8_string_loclen>>,
1321 C<L</is_c9strict_utf8_string>>,
1322 C<L</is_c9strict_utf8_string_loc>>,
1323 and
1324 C<L</is_c9strict_utf8_string_loclen>>.
1325
1326 =cut
1327
1328 */
1329
1330 #define is_utf8_invariant_string(s, len) \
1331 is_utf8_invariant_string_loc(s, len, NULL)
1332
1333 /*
1334 =for apidoc is_utf8_invariant_string_loc
1335
1336 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
1337 the first UTF-8 variant character in the C<ep> pointer; if all characters are
1338 UTF-8 invariant, this function does not change the contents of C<*ep>.
1339
1340 =cut
1341
1342 */
1343
1344 PERL_STATIC_INLINE bool
Perl_is_utf8_invariant_string_loc(const U8 * const s,STRLEN len,const U8 ** ep)1345 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1346 {
1347 const U8* send;
1348 const U8* x = s;
1349
1350 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
1351
1352 if (len == 0) {
1353 len = strlen((const char *)s);
1354 }
1355
1356 send = s + len;
1357
1358 /* This looks like 0x010101... */
1359 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
1360
1361 /* This looks like 0x808080... */
1362 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
1363 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
1364 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
1365
1366 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
1367 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
1368 * optimized out completely on a 32-bit system, and its mask gets optimized out
1369 * on a 64-bit system */
1370 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
1371 | ( PTR2nat(x) >> 1) \
1372 | ( ( (PTR2nat(x) \
1373 & PERL_WORD_BOUNDARY_MASK) >> 2))))
1374
1375 #ifndef EBCDIC
1376
1377 /* Do the word-at-a-time iff there is at least one usable full word. That
1378 * means that after advancing to a word boundary, there still is at least a
1379 * full word left. The number of bytes needed to advance is 'wordsize -
1380 * offset' unless offset is 0. */
1381 if ((STRLEN) (send - x) >= PERL_WORDSIZE
1382
1383 /* This term is wordsize if subword; 0 if not */
1384 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1385
1386 /* 'offset' */
1387 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1388 {
1389
1390 /* Process per-byte until reach word boundary. XXX This loop could be
1391 * eliminated if we knew that this platform had fast unaligned reads */
1392 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1393 if (! UTF8_IS_INVARIANT(*x)) {
1394 if (ep) {
1395 *ep = x;
1396 }
1397
1398 return FALSE;
1399 }
1400 x++;
1401 }
1402
1403 /* Here, we know we have at least one full word to process. Process
1404 * per-word as long as we have at least a full word left */
1405 do {
1406 if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
1407
1408 /* Found a variant. Just return if caller doesn't want its
1409 * exact position */
1410 if (! ep) {
1411 return FALSE;
1412 }
1413
1414 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
1415 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1416
1417 *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x);
1418 assert(*ep >= s && *ep < send);
1419
1420 return FALSE;
1421
1422 # else /* If weird byte order, drop into next loop to do byte-at-a-time
1423 checks. */
1424
1425 break;
1426 # endif
1427 }
1428
1429 x += PERL_WORDSIZE;
1430
1431 } while (x + PERL_WORDSIZE <= send);
1432 }
1433
1434 #endif /* End of ! EBCDIC */
1435
1436 /* Process per-byte */
1437 while (x < send) {
1438 if (! UTF8_IS_INVARIANT(*x)) {
1439 if (ep) {
1440 *ep = x;
1441 }
1442
1443 return FALSE;
1444 }
1445
1446 x++;
1447 }
1448
1449 return TRUE;
1450 }
1451
1452 /* See if the platform has builtins for finding the most/least significant bit,
1453 * and which one is right for using on 32 and 64 bit operands */
1454 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
1455 # if U32SIZE == INTSIZE
1456 # define PERL_CLZ_32 __builtin_clz
1457 # endif
1458 # if defined(U64TYPE) && U64SIZE == INTSIZE
1459 # define PERL_CLZ_64 __builtin_clz
1460 # endif
1461 #endif
1462 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
1463 # if U32SIZE == INTSIZE
1464 # define PERL_CTZ_32 __builtin_ctz
1465 # endif
1466 # if defined(U64TYPE) && U64SIZE == INTSIZE
1467 # define PERL_CTZ_64 __builtin_ctz
1468 # endif
1469 #endif
1470
1471 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
1472 # if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
1473 # define PERL_CLZ_32 __builtin_clzl
1474 # endif
1475 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
1476 # define PERL_CLZ_64 __builtin_clzl
1477 # endif
1478 #endif
1479 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
1480 # if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
1481 # define PERL_CTZ_32 __builtin_ctzl
1482 # endif
1483 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
1484 # define PERL_CTZ_64 __builtin_ctzl
1485 # endif
1486 #endif
1487
1488 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
1489 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
1490 # define PERL_CLZ_32 __builtin_clzll
1491 # endif
1492 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
1493 # define PERL_CLZ_64 __builtin_clzll
1494 # endif
1495 #endif
1496 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
1497 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
1498 # define PERL_CTZ_32 __builtin_ctzll
1499 # endif
1500 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
1501 # define PERL_CTZ_64 __builtin_ctzll
1502 # endif
1503 #endif
1504
1505 #if defined(WIN32)
1506 # include <intrin.h>
1507 /* MinGW warns that it ignores "pragma intrinsic". */
1508 # if defined(_MSC_VER)
1509 # pragma intrinsic(_BitScanForward)
1510 # pragma intrinsic(_BitScanReverse)
1511 # if defined(_WIN64)
1512 # pragma intrinsic(_BitScanForward64)
1513 # pragma intrinsic(_BitScanReverse64)
1514 # endif
1515 # endif
1516 #endif
1517
1518 /* The reason there are not checks to see if ffs() and ffsl() are available for
1519 * determining the lsb, is because these don't improve on the deBruijn method
1520 * fallback, which is just a branchless integer multiply, array element
1521 * retrieval, and shift. The others, even if the function call overhead is
1522 * optimized out, have to cope with the possibility of the input being all
1523 * zeroes, and almost certainly will have conditionals for this eventuality.
1524 * khw, at the time of this commit, looked at the source for both gcc and clang
1525 * to verify this. (gcc used a method inferior to deBruijn.) */
1526
1527 /* Below are functions to find the first, last, or only set bit in a word. On
1528 * platforms with 64-bit capability, there is a pair for each operation; the
1529 * first taking a 64 bit operand, and the second a 32 bit one. The logic is
1530 * the same in each pair, so the second is stripped of most comments. */
1531
1532 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
1533
1534 PERL_STATIC_INLINE unsigned
Perl_lsbit_pos64(U64 word)1535 Perl_lsbit_pos64(U64 word)
1536 {
1537 /* Find the position (0..63) of the least significant set bit in the input
1538 * word */
1539
1540 ASSUME(word != 0);
1541
1542 /* If we can determine that the platform has a usable fast method to get
1543 * this info, use that */
1544
1545 # if defined(PERL_CTZ_64)
1546 # define PERL_HAS_FAST_GET_LSB_POS64
1547
1548 return (unsigned) PERL_CTZ_64(word);
1549
1550 # elif U64SIZE == 8 && defined(_WIN64)
1551 # define PERL_HAS_FAST_GET_LSB_POS64
1552
1553 {
1554 unsigned long index;
1555 _BitScanForward64(&index, word);
1556 return (unsigned)index;
1557 }
1558
1559 # else
1560
1561 /* Here, we didn't find a fast method for finding the lsb. Fall back to
1562 * making the lsb the only set bit in the word, and use our function that
1563 * works on words with a single bit set.
1564 *
1565 * Isolate the lsb;
1566 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
1567 *
1568 * The word will look like this, with a rightmost set bit in position 's':
1569 * ('x's are don't cares, and 'y's are their complements)
1570 * s
1571 * x..x100..00
1572 * y..y011..11 Complement
1573 * y..y100..00 Add 1
1574 * 0..0100..00 And with the original
1575 *
1576 * (Yes, complementing and adding 1 is just taking the negative on 2's
1577 * complement machines, but not on 1's complement ones, and some compilers
1578 * complain about negating an unsigned.)
1579 */
1580 return single_1bit_pos64(word & (~word + 1));
1581
1582 # endif
1583
1584 }
1585
1586 # define lsbit_pos_uintmax_(word) lsbit_pos64(word)
1587 #else /* ! QUAD */
1588 # define lsbit_pos_uintmax_(word) lsbit_pos32(word)
1589 #endif
1590
1591 PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */
Perl_lsbit_pos32(U32 word)1592 Perl_lsbit_pos32(U32 word)
1593 {
1594 /* Find the position (0..31) of the least significant set bit in the input
1595 * word */
1596
1597 ASSUME(word != 0);
1598
1599 #if defined(PERL_CTZ_32)
1600 # define PERL_HAS_FAST_GET_LSB_POS32
1601
1602 return (unsigned) PERL_CTZ_32(word);
1603
1604 #elif U32SIZE == 4 && defined(WIN32)
1605 # define PERL_HAS_FAST_GET_LSB_POS32
1606
1607 {
1608 unsigned long index;
1609 _BitScanForward(&index, word);
1610 return (unsigned)index;
1611 }
1612
1613 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1614 # define PERL_HAS_FAST_GET_LSB_POS32
1615
1616 /* Unlikely, but possible for the platform to have a wider fast operation
1617 * but not a narrower one. But easy enough to handle the case by widening
1618 * the parameter size. */
1619 return lsbit_pos64(word);
1620
1621 #else
1622
1623 return single_1bit_pos32(word & (~word + 1));
1624
1625 #endif
1626
1627 }
1628
1629
1630 /* Convert the leading zeros count to the bit position of the first set bit.
1631 * This just subtracts from the highest position, 31 or 63. But some compilers
1632 * don't optimize this optimally, and so a bit of bit twiddling encourages them
1633 * to do the right thing. It turns out that subtracting a smaller non-negative
1634 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
1635 * the two numbers. To see why, first note that the sum of any number, x, and
1636 * its complement, x', is all ones. So all ones minus x is x'. Then note that
1637 * the xor of x and all ones is x'. */
1638 #define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc))
1639
1640 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
1641
1642 PERL_STATIC_INLINE unsigned
Perl_msbit_pos64(U64 word)1643 Perl_msbit_pos64(U64 word)
1644 {
1645 /* Find the position (0..63) of the most significant set bit in the input
1646 * word */
1647
1648 ASSUME(word != 0);
1649
1650 /* If we can determine that the platform has a usable fast method to get
1651 * this, use that */
1652
1653 # if defined(PERL_CLZ_64)
1654 # define PERL_HAS_FAST_GET_MSB_POS64
1655
1656 return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
1657
1658 # elif U64SIZE == 8 && defined(_WIN64)
1659 # define PERL_HAS_FAST_GET_MSB_POS64
1660
1661 {
1662 unsigned long index;
1663 _BitScanReverse64(&index, word);
1664 return (unsigned)index;
1665 }
1666
1667 # else
1668
1669 /* Here, we didn't find a fast method for finding the msb. Fall back to
1670 * making the msb the only set bit in the word, and use our function that
1671 * works on words with a single bit set.
1672 *
1673 * Isolate the msb; http://codeforces.com/blog/entry/10330
1674 *
1675 * Only the most significant set bit matters. Or'ing word with its right
1676 * shift of 1 makes that bit and the next one to its right both 1.
1677 * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
1678 * ... We end with the msb and all to the right being 1. */
1679 word |= (word >> 1);
1680 word |= (word >> 2);
1681 word |= (word >> 4);
1682 word |= (word >> 8);
1683 word |= (word >> 16);
1684 word |= (word >> 32);
1685
1686 /* Then subtracting the right shift by 1 clears all but the left-most of
1687 * the 1 bits, which is our desired result */
1688 word -= (word >> 1);
1689
1690 /* Now we have a single bit set */
1691 return single_1bit_pos64(word);
1692
1693 # endif
1694
1695 }
1696
1697 # define msbit_pos_uintmax_(word) msbit_pos64(word)
1698 #else /* ! QUAD */
1699 # define msbit_pos_uintmax_(word) msbit_pos32(word)
1700 #endif
1701
1702 PERL_STATIC_INLINE unsigned
Perl_msbit_pos32(U32 word)1703 Perl_msbit_pos32(U32 word)
1704 {
1705 /* Find the position (0..31) of the most significant set bit in the input
1706 * word */
1707
1708 ASSUME(word != 0);
1709
1710 #if defined(PERL_CLZ_32)
1711 # define PERL_HAS_FAST_GET_MSB_POS32
1712
1713 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
1714 #elif U32SIZE == 4 && defined(WIN32)
1715 # define PERL_HAS_FAST_GET_MSB_POS32
1716
1717 {
1718 unsigned long index;
1719 _BitScanReverse(&index, word);
1720 return (unsigned)index;
1721 }
1722
1723 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
1724 # define PERL_HAS_FAST_GET_MSB_POS32
1725
1726 return msbit_pos64(word); /* Let compiler widen parameter */
1727
1728 #else
1729
1730 word |= (word >> 1);
1731 word |= (word >> 2);
1732 word |= (word >> 4);
1733 word |= (word >> 8);
1734 word |= (word >> 16);
1735 word -= (word >> 1);
1736 return single_1bit_pos32(word);
1737
1738 #endif
1739
1740 }
1741
1742 /* Note that if you are working through all the 1 bits in a word, and don't
1743 * care which order you process them in, it is better to use lsbit_pos. This
1744 * is because some platforms have a fast way to find the msb but not the lsb,
1745 * and others vice versa. The code above falls back to use the single
1746 * available fast method when the desired one is missing, and it is cheaper to
1747 * fall back from lsb to msb than the other way around */
1748
1749 #if UVSIZE == U64SIZE
1750 # define msbit_pos(word) msbit_pos64(word)
1751 # define lsbit_pos(word) lsbit_pos64(word)
1752 #elif UVSIZE == U32SIZE
1753 # define msbit_pos(word) msbit_pos32(word)
1754 # define lsbit_pos(word) lsbit_pos32(word)
1755 #endif
1756
1757 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
1758
1759 PERL_STATIC_INLINE unsigned
Perl_single_1bit_pos64(U64 word)1760 Perl_single_1bit_pos64(U64 word)
1761 {
1762 /* Given a 64-bit word known to contain all zero bits except one 1 bit,
1763 * find and return the 1's position: 0..63 */
1764
1765 # ifdef PERL_CORE /* macro not exported */
1766 ASSUME(isPOWER_OF_2(word));
1767 # else
1768 ASSUME(word && (word & (word-1)) == 0);
1769 # endif
1770
1771 /* The only set bit is both the most and least significant bit. If we have
1772 * a fast way of finding either one, use that.
1773 *
1774 * It may appear at first glance that those functions call this one, but
1775 * they don't if the corresponding #define is set */
1776
1777 # ifdef PERL_HAS_FAST_GET_MSB_POS64
1778
1779 return msbit_pos64(word);
1780
1781 # elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1782
1783 return lsbit_pos64(word);
1784
1785 # else
1786
1787 /* The position of the only set bit in a word can be quickly calculated
1788 * using deBruijn sequences. See for example
1789 * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
1790 return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
1791 >> PERL_deBruijnShift64_];
1792 # endif
1793
1794 }
1795
1796 #endif
1797
1798 PERL_STATIC_INLINE unsigned
Perl_single_1bit_pos32(U32 word)1799 Perl_single_1bit_pos32(U32 word)
1800 {
1801 /* Given a 32-bit word known to contain all zero bits except one 1 bit,
1802 * find and return the 1's position: 0..31 */
1803
1804 #ifdef PERL_CORE /* macro not exported */
1805 ASSUME(isPOWER_OF_2(word));
1806 #else
1807 ASSUME(word && (word & (word-1)) == 0);
1808 #endif
1809 #ifdef PERL_HAS_FAST_GET_MSB_POS32
1810
1811 return msbit_pos32(word);
1812
1813 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
1814
1815 return lsbit_pos32(word);
1816
1817 #else
1818
1819 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
1820 >> PERL_deBruijnShift32_];
1821 #endif
1822
1823 }
1824
1825 #ifndef EBCDIC
1826
1827 PERL_STATIC_INLINE unsigned int
Perl_variant_byte_number(PERL_UINTMAX_T word)1828 Perl_variant_byte_number(PERL_UINTMAX_T word)
1829 {
1830 /* This returns the position in a word (0..7) of the first variant byte in
1831 * it. This is a helper function. Note that there are no branches */
1832
1833 /* Get just the msb bits of each byte */
1834 word &= PERL_VARIANTS_WORD_MASK;
1835
1836 /* This should only be called if we know there is a variant byte in the
1837 * word */
1838 assert(word);
1839
1840 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1841
1842 /* Bytes are stored like
1843 * Byte8 ... Byte2 Byte1
1844 * 63..56...15...8 7...0
1845 * so getting the lsb of the whole modified word is getting the msb of the
1846 * first byte that has its msb set */
1847 word = lsbit_pos_uintmax_(word);
1848
1849 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert
1850 * to 0..7 */
1851 return (unsigned int) ((word + 1) >> 3) - 1;
1852
1853 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1854
1855 /* Bytes are stored like
1856 * Byte1 Byte2 ... Byte8
1857 * 63..56 55..47 ... 7...0
1858 * so getting the msb of the whole modified word is getting the msb of the
1859 * first byte that has its msb set */
1860 word = msbit_pos_uintmax_(word);
1861
1862 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert
1863 * to 0..7 */
1864 word = ((word + 1) >> 3) - 1;
1865
1866 /* And invert the result because of the reversed byte order on this
1867 * platform */
1868 word = CHARBITS - word - 1;
1869
1870 return (unsigned int) word;
1871
1872 # else
1873 # error Unexpected byte order
1874 # endif
1875
1876 }
1877
1878 #endif
1879 #if defined(PERL_CORE) || defined(PERL_EXT)
1880
1881 /*
1882 =for apidoc variant_under_utf8_count
1883
1884 This function looks at the sequence of bytes between C<s> and C<e>, which are
1885 assumed to be encoded in ASCII/Latin1, and returns how many of them would
1886 change should the string be translated into UTF-8. Due to the nature of UTF-8,
1887 each of these would occupy two bytes instead of the single one in the input
1888 string. Thus, this function returns the precise number of bytes the string
1889 would expand by when translated to UTF-8.
1890
1891 Unlike most of the other functions that have C<utf8> in their name, the input
1892 to this function is NOT a UTF-8-encoded string. The function name is slightly
1893 I<odd> to emphasize this.
1894
1895 This function is internal to Perl because khw thinks that any XS code that
1896 would want this is probably operating too close to the internals. Presenting a
1897 valid use case could change that.
1898
1899 See also
1900 C<L<perlapi/is_utf8_invariant_string>>
1901 and
1902 C<L<perlapi/is_utf8_invariant_string_loc>>,
1903
1904 =cut
1905
1906 */
1907
1908 PERL_STATIC_INLINE Size_t
S_variant_under_utf8_count(const U8 * const s,const U8 * const e)1909 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1910 {
1911 const U8* x = s;
1912 Size_t count = 0;
1913
1914 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1915
1916 # ifndef EBCDIC
1917
1918 /* Test if the string is long enough to use word-at-a-time. (Logic is the
1919 * same as for is_utf8_invariant_string()) */
1920 if ((STRLEN) (e - x) >= PERL_WORDSIZE
1921 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1922 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1923 {
1924
1925 /* Process per-byte until reach word boundary. XXX This loop could be
1926 * eliminated if we knew that this platform had fast unaligned reads */
1927 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1928 count += ! UTF8_IS_INVARIANT(*x++);
1929 }
1930
1931 /* Process per-word as long as we have at least a full word left */
1932 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1933 explanation of how this works */
1934 PERL_UINTMAX_T increment
1935 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1936 * PERL_COUNT_MULTIPLIER)
1937 >> ((PERL_WORDSIZE - 1) * CHARBITS);
1938 count += (Size_t) increment;
1939 x += PERL_WORDSIZE;
1940 } while (x + PERL_WORDSIZE <= e);
1941 }
1942
1943 # endif
1944
1945 /* Process per-byte */
1946 while (x < e) {
1947 if (! UTF8_IS_INVARIANT(*x)) {
1948 count++;
1949 }
1950
1951 x++;
1952 }
1953
1954 return count;
1955 }
1956
1957 #endif
1958
1959 /* Keep these around for these files */
1960 #if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C)
1961 # undef PERL_WORDSIZE
1962 # undef PERL_COUNT_MULTIPLIER
1963 # undef PERL_WORD_BOUNDARY_MASK
1964 # undef PERL_VARIANTS_WORD_MASK
1965 #endif
1966
1967 /*
1968 =for apidoc is_utf8_string
1969
1970 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1971 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
1972 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1973 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1974 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1975
1976 This function considers Perl's extended UTF-8 to be valid. That means that
1977 code points above Unicode, surrogates, and non-character code points are
1978 considered valid by this function. Use C<L</is_strict_utf8_string>>,
1979 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1980 code points are considered valid.
1981
1982 See also
1983 C<L</is_utf8_invariant_string>>,
1984 C<L</is_utf8_invariant_string_loc>>,
1985 C<L</is_utf8_string_loc>>,
1986 C<L</is_utf8_string_loclen>>,
1987 C<L</is_utf8_fixed_width_buf_flags>>,
1988 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1989 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1990
1991 =cut
1992 */
1993
1994 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
1995
1996 #if defined(PERL_CORE) || defined (PERL_EXT)
1997
1998 /*
1999 =for apidoc is_utf8_non_invariant_string
2000
2001 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
2002 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
2003 UTF-8; otherwise returns FALSE.
2004
2005 A TRUE return means that at least one code point represented by the sequence
2006 either is a wide character not representable as a single byte, or the
2007 representation differs depending on whether the sequence is encoded in UTF-8 or
2008 not.
2009
2010 See also
2011 C<L<perlapi/is_utf8_invariant_string>>,
2012 C<L<perlapi/is_utf8_string>>
2013
2014 =cut
2015
2016 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
2017 It generally needn't be if its string is entirely UTF-8 invariant, and it
2018 shouldn't be if it otherwise contains invalid UTF-8.
2019
2020 It is an internal function because khw thinks that XS code shouldn't be working
2021 at this low a level. A valid use case could change that.
2022
2023 */
2024
2025 PERL_STATIC_INLINE bool
Perl_is_utf8_non_invariant_string(const U8 * const s,STRLEN len)2026 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
2027 {
2028 const U8 * first_variant;
2029
2030 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
2031
2032 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2033 return FALSE;
2034 }
2035
2036 return is_utf8_string(first_variant, len - (first_variant - s));
2037 }
2038
2039 #endif
2040
2041 /*
2042 =for apidoc is_strict_utf8_string
2043
2044 Returns TRUE if the first C<len> bytes of string C<s> form a valid
2045 UTF-8-encoded string that is fully interchangeable by any application using
2046 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
2047 calculated using C<strlen(s)> (which means if you use this option, that C<s>
2048 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
2049 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
2050
2051 This function returns FALSE for strings containing any
2052 code points above the Unicode max of 0x10FFFF, surrogate code points, or
2053 non-character code points.
2054
2055 See also
2056 C<L</is_utf8_invariant_string>>,
2057 C<L</is_utf8_invariant_string_loc>>,
2058 C<L</is_utf8_string>>,
2059 C<L</is_utf8_string_flags>>,
2060 C<L</is_utf8_string_loc>>,
2061 C<L</is_utf8_string_loc_flags>>,
2062 C<L</is_utf8_string_loclen>>,
2063 C<L</is_utf8_string_loclen_flags>>,
2064 C<L</is_utf8_fixed_width_buf_flags>>,
2065 C<L</is_utf8_fixed_width_buf_loc_flags>>,
2066 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
2067 C<L</is_strict_utf8_string_loc>>,
2068 C<L</is_strict_utf8_string_loclen>>,
2069 C<L</is_c9strict_utf8_string>>,
2070 C<L</is_c9strict_utf8_string_loc>>,
2071 and
2072 C<L</is_c9strict_utf8_string_loclen>>.
2073
2074 =cut
2075 */
2076
2077 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
2078
2079 /*
2080 =for apidoc is_c9strict_utf8_string
2081
2082 Returns TRUE if the first C<len> bytes of string C<s> form a valid
2083 UTF-8-encoded string that conforms to
2084 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
2085 otherwise it returns FALSE. If C<len> is 0, it will be calculated using
2086 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
2087 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
2088 characters being ASCII constitute 'a valid UTF-8 string'.
2089
2090 This function returns FALSE for strings containing any code points above the
2091 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
2092 code points per
2093 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
2094
2095 See also
2096 C<L</is_utf8_invariant_string>>,
2097 C<L</is_utf8_invariant_string_loc>>,
2098 C<L</is_utf8_string>>,
2099 C<L</is_utf8_string_flags>>,
2100 C<L</is_utf8_string_loc>>,
2101 C<L</is_utf8_string_loc_flags>>,
2102 C<L</is_utf8_string_loclen>>,
2103 C<L</is_utf8_string_loclen_flags>>,
2104 C<L</is_utf8_fixed_width_buf_flags>>,
2105 C<L</is_utf8_fixed_width_buf_loc_flags>>,
2106 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
2107 C<L</is_strict_utf8_string>>,
2108 C<L</is_strict_utf8_string_loc>>,
2109 C<L</is_strict_utf8_string_loclen>>,
2110 C<L</is_c9strict_utf8_string_loc>>,
2111 and
2112 C<L</is_c9strict_utf8_string_loclen>>.
2113
2114 =cut
2115 */
2116
2117 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
2118
2119 /*
2120 =for apidoc is_utf8_string_flags
2121
2122 Returns TRUE if the first C<len> bytes of string C<s> form a valid
2123 UTF-8 string, subject to the restrictions imposed by C<flags>;
2124 returns FALSE otherwise. If C<len> is 0, it will be calculated
2125 using C<strlen(s)> (which means if you use this option, that C<s> can't have
2126 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
2127 that all characters being ASCII constitute 'a valid UTF-8 string'.
2128
2129 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
2130 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2131 as C<L</is_strict_utf8_string>>; and if C<flags> is
2132 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
2133 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
2134 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
2135 C<L</utf8n_to_uvchr>>, with the same meanings.
2136
2137 See also
2138 C<L</is_utf8_invariant_string>>,
2139 C<L</is_utf8_invariant_string_loc>>,
2140 C<L</is_utf8_string>>,
2141 C<L</is_utf8_string_loc>>,
2142 C<L</is_utf8_string_loc_flags>>,
2143 C<L</is_utf8_string_loclen>>,
2144 C<L</is_utf8_string_loclen_flags>>,
2145 C<L</is_utf8_fixed_width_buf_flags>>,
2146 C<L</is_utf8_fixed_width_buf_loc_flags>>,
2147 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
2148 C<L</is_strict_utf8_string>>,
2149 C<L</is_strict_utf8_string_loc>>,
2150 C<L</is_strict_utf8_string_loclen>>,
2151 C<L</is_c9strict_utf8_string>>,
2152 C<L</is_c9strict_utf8_string_loc>>,
2153 and
2154 C<L</is_c9strict_utf8_string_loclen>>.
2155
2156 =cut
2157 */
2158
2159 PERL_STATIC_INLINE bool
Perl_is_utf8_string_flags(const U8 * s,STRLEN len,const U32 flags)2160 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
2161 {
2162 const U8 * first_variant;
2163
2164 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
2165 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2166 |UTF8_DISALLOW_PERL_EXTENDED)));
2167
2168 if (len == 0) {
2169 len = strlen((const char *)s);
2170 }
2171
2172 if (flags == 0) {
2173 return is_utf8_string(s, len);
2174 }
2175
2176 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2177 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
2178 {
2179 return is_strict_utf8_string(s, len);
2180 }
2181
2182 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2183 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
2184 {
2185 return is_c9strict_utf8_string(s, len);
2186 }
2187
2188 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
2189 const U8* const send = s + len;
2190 const U8* x = first_variant;
2191
2192 while (x < send) {
2193 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
2194 if (UNLIKELY(! cur_len)) {
2195 return FALSE;
2196 }
2197 x += cur_len;
2198 }
2199 }
2200
2201 return TRUE;
2202 }
2203
2204 /*
2205
2206 =for apidoc is_utf8_string_loc
2207
2208 Like C<L</is_utf8_string>> but stores the location of the failure (in the
2209 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2210 "utf8ness success") in the C<ep> pointer.
2211
2212 See also C<L</is_utf8_string_loclen>>.
2213
2214 =cut
2215 */
2216
2217 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
2218
2219 /*
2220
2221 =for apidoc is_utf8_string_loclen
2222
2223 Like C<L</is_utf8_string>> but stores the location of the failure (in the
2224 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2225 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
2226 encoded characters in the C<el> pointer.
2227
2228 See also C<L</is_utf8_string_loc>>.
2229
2230 =cut
2231 */
2232
2233 PERL_STATIC_INLINE bool
Perl_is_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)2234 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
2235 {
2236 const U8 * first_variant;
2237
2238 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
2239
2240 if (len == 0) {
2241 len = strlen((const char *) s);
2242 }
2243
2244 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2245 if (el)
2246 *el = len;
2247
2248 if (ep) {
2249 *ep = s + len;
2250 }
2251
2252 return TRUE;
2253 }
2254
2255 {
2256 const U8* const send = s + len;
2257 const U8* x = first_variant;
2258 STRLEN outlen = first_variant - s;
2259
2260 while (x < send) {
2261 const STRLEN cur_len = isUTF8_CHAR(x, send);
2262 if (UNLIKELY(! cur_len)) {
2263 break;
2264 }
2265 x += cur_len;
2266 outlen++;
2267 }
2268
2269 if (el)
2270 *el = outlen;
2271
2272 if (ep) {
2273 *ep = x;
2274 }
2275
2276 return (x == send);
2277 }
2278 }
2279
2280 /* The perl core arranges to never call the DFA below without there being at
2281 * least one byte available to look at. This allows the DFA to use a do {}
2282 * while loop which means that calling it with a UTF-8 invariant has a single
2283 * conditional, same as the calling code checking for invariance ahead of time.
2284 * And having the calling code remove that conditional speeds up by that
2285 * conditional, the case where it wasn't invariant. So there's no reason to
2286 * check before caling this.
2287 *
2288 * But we don't know this for non-core calls, so have to retain the check for
2289 * them. */
2290 #ifdef PERL_CORE
2291 # define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s))
2292 #else
2293 # define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE
2294 #endif
2295
2296 /*
2297 * DFA for checking input is valid UTF-8 syntax.
2298 *
2299 * This uses adaptations of the table and algorithm given in
2300 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2301 * documentation of the original version. A copyright notice for the original
2302 * version is given at the beginning of this file. The Perl adaptations are
2303 * documented at the definition of PL_extended_utf8_dfa_tab[].
2304 *
2305 * This dfa is fast. There are three exit conditions:
2306 * 1) a well-formed code point, acceptable to the table
2307 * 2) the beginning bytes of an incomplete character, whose completion might
2308 * or might not be acceptable
2309 * 3) unacceptable to the table. Some of the adaptations have certain,
2310 * hopefully less likely to occur, legal inputs be unacceptable to the
2311 * table, so these must be sorted out afterwards.
2312 *
2313 * This macro is a complete implementation of the code executing the DFA. It
2314 * is passed the input sequence bounds and the table to use, and what to do
2315 * for each of the exit conditions. There are three canned actions, likely to
2316 * be the ones you want:
2317 * DFA_RETURN_SUCCESS_
2318 * DFA_RETURN_FAILURE_
2319 * DFA_GOTO_TEASE_APART_FF_
2320 *
2321 * You pass a parameter giving the action to take for each of the three
2322 * possible exit conditions:
2323 *
2324 * 'accept_action' This is executed when the DFA accepts the input.
2325 * DFA_RETURN_SUCCESS_ is the most likely candidate.
2326 * 'reject_action' This is executed when the DFA rejects the input.
2327 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
2328 * you have written code to distinguish the rejecting state
2329 * results. Because it happens in several places, and
2330 * involves #ifdefs, the special action
2331 * DFA_GOTO_TEASE_APART_FF_ is what you want with
2332 * PL_extended_utf8_dfa_tab. On platforms without
2333 * EXTRA_LONG_UTF8, there is no need to tease anything apart,
2334 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you
2335 * need to have a label 'tease_apart_FF' that it will transfer
2336 * to.
2337 * 'incomplete_char_action' This is executed when the DFA ran off the end
2338 * before accepting or rejecting the input.
2339 * DFA_RETURN_FAILURE_ is the likely action, but you could
2340 * have a 'goto', or NOOP. In the latter case the DFA drops
2341 * off the end, and you place your code to handle this case
2342 * immediately after it.
2343 */
2344
2345 #define DFA_RETURN_SUCCESS_ return s - s0
2346 #define DFA_RETURN_FAILURE_ return 0
2347 #ifdef HAS_EXTRA_LONG_UTF8
2348 # define DFA_TEASE_APART_FF_ goto tease_apart_FF
2349 #else
2350 # define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_
2351 #endif
2352
2353 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \
2354 accept_action, \
2355 reject_action, \
2356 incomplete_char_action) \
2357 STMT_START { \
2358 const U8 * s = s0; \
2359 const U8 * e_ = e; \
2360 UV state = 0; \
2361 \
2362 PERL_NON_CORE_CHECK_EMPTY(s, e_); \
2363 \
2364 do { \
2365 state = dfa_tab[256 + state + dfa_tab[*s]]; \
2366 s++; \
2367 \
2368 if (state == 0) { /* Accepting state */ \
2369 accept_action; \
2370 } \
2371 \
2372 if (UNLIKELY(state == 1)) { /* Rejecting state */ \
2373 reject_action; \
2374 } \
2375 } while (s < e_); \
2376 \
2377 /* Here, dropped out of loop before end-of-char */ \
2378 incomplete_char_action; \
2379 } STMT_END
2380
2381
2382 /*
2383
2384 =for apidoc isUTF8_CHAR
2385
2386 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2387 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2388 that represents some code point; otherwise it evaluates to 0. If non-zero, the
2389 value gives how many bytes starting at C<s> comprise the code point's
2390 representation. Any bytes remaining before C<e>, but beyond the ones needed to
2391 form the first code point in C<s>, are not examined.
2392
2393 The code point can be any that will fit in an IV on this machine, using Perl's
2394 extension to official UTF-8 to represent those higher than the Unicode maximum
2395 of 0x10FFFF. That means that this macro is used to efficiently decide if the
2396 next few bytes in C<s> is legal UTF-8 for a single character.
2397
2398 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
2399 defined by Unicode to be fully interchangeable across applications;
2400 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
2401 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
2402 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
2403
2404 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
2405 C<L</is_utf8_string_loclen>> to check entire strings.
2406
2407 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
2408 machines) is a valid UTF-8 character.
2409
2410 =cut
2411
2412 This uses an adaptation of the table and algorithm given in
2413 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2414 documentation of the original version. A copyright notice for the original
2415 version is given at the beginning of this file. The Perl adaptation is
2416 documented at the definition of PL_extended_utf8_dfa_tab[].
2417 */
2418
2419 PERL_STATIC_INLINE Size_t
Perl_isUTF8_CHAR(const U8 * const s0,const U8 * const e)2420 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
2421 {
2422 PERL_ARGS_ASSERT_ISUTF8_CHAR;
2423
2424 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2425 DFA_RETURN_SUCCESS_,
2426 DFA_TEASE_APART_FF_,
2427 DFA_RETURN_FAILURE_);
2428
2429 /* Here, we didn't return success, but dropped out of the loop. In the
2430 * case of PL_extended_utf8_dfa_tab, this means the input is either
2431 * malformed, or the start byte was FF on a platform that the dfa doesn't
2432 * handle FF's. Call a helper function. */
2433
2434 #ifdef HAS_EXTRA_LONG_UTF8
2435
2436 tease_apart_FF:
2437
2438 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2439 * either malformed, or was for the largest possible start byte, which we
2440 * now check, not inline */
2441 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
2442 return 0;
2443 }
2444
2445 return is_utf8_FF_helper_(s0, e,
2446 FALSE /* require full, not partial char */
2447 );
2448 #endif
2449
2450 }
2451
2452 /*
2453
2454 =for apidoc isSTRICT_UTF8_CHAR
2455
2456 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2457 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
2458 Unicode code point completely acceptable for open interchange between all
2459 applications; otherwise it evaluates to 0. If non-zero, the value gives how
2460 many bytes starting at C<s> comprise the code point's representation. Any
2461 bytes remaining before C<e>, but beyond the ones needed to form the first code
2462 point in C<s>, are not examined.
2463
2464 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
2465 be a surrogate nor a non-character code point. Thus this excludes any code
2466 point from Perl's extended UTF-8.
2467
2468 This is used to efficiently decide if the next few bytes in C<s> is
2469 legal Unicode-acceptable UTF-8 for a single character.
2470
2471 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
2472 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
2473 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
2474 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
2475
2476 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
2477 C<L</is_strict_utf8_string_loclen>> to check entire strings.
2478
2479 =cut
2480
2481 This uses an adaptation of the tables and algorithm given in
2482 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2483 documentation of the original version. A copyright notice for the original
2484 version is given at the beginning of this file. The Perl adaptation is
2485 documented at the definition of strict_extended_utf8_dfa_tab[].
2486
2487 */
2488
2489 PERL_STATIC_INLINE Size_t
Perl_isSTRICT_UTF8_CHAR(const U8 * const s0,const U8 * const e)2490 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
2491 {
2492 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
2493
2494 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
2495 DFA_RETURN_SUCCESS_,
2496 goto check_hanguls,
2497 DFA_RETURN_FAILURE_);
2498 check_hanguls:
2499
2500 /* Here, we didn't return success, but dropped out of the loop. In the
2501 * case of PL_strict_utf8_dfa_tab, this means the input is either
2502 * malformed, or was for certain Hanguls; handle them specially */
2503
2504 /* The dfa above drops out for incomplete or illegal inputs, and certain
2505 * legal Hanguls; check and return accordingly */
2506 return is_HANGUL_ED_utf8_safe(s0, e);
2507 }
2508
2509 /*
2510
2511 =for apidoc isC9_STRICT_UTF8_CHAR
2512
2513 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2514 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
2515 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
2516 the value gives how many bytes starting at C<s> comprise the code point's
2517 representation. Any bytes remaining before C<e>, but beyond the ones needed to
2518 form the first code point in C<s>, are not examined.
2519
2520 The largest acceptable code point is the Unicode maximum 0x10FFFF. This
2521 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
2522 code points. This corresponds to
2523 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
2524 which said that non-character code points are merely discouraged rather than
2525 completely forbidden in open interchange. See
2526 L<perlunicode/Noncharacter code points>.
2527
2528 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
2529 C<L</isUTF8_CHAR_flags>> for a more customized definition.
2530
2531 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
2532 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
2533
2534 =cut
2535
2536 This uses an adaptation of the tables and algorithm given in
2537 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2538 documentation of the original version. A copyright notice for the original
2539 version is given at the beginning of this file. The Perl adaptation is
2540 documented at the definition of PL_c9_utf8_dfa_tab[].
2541
2542 */
2543
2544 PERL_STATIC_INLINE Size_t
Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0,const U8 * const e)2545 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
2546 {
2547 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
2548
2549 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
2550 DFA_RETURN_SUCCESS_,
2551 DFA_RETURN_FAILURE_,
2552 DFA_RETURN_FAILURE_);
2553 }
2554
2555 /*
2556
2557 =for apidoc is_strict_utf8_string_loc
2558
2559 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
2560 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2561 "utf8ness success") in the C<ep> pointer.
2562
2563 See also C<L</is_strict_utf8_string_loclen>>.
2564
2565 =cut
2566 */
2567
2568 #define is_strict_utf8_string_loc(s, len, ep) \
2569 is_strict_utf8_string_loclen(s, len, ep, 0)
2570
2571 /*
2572
2573 =for apidoc is_strict_utf8_string_loclen
2574
2575 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
2576 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2577 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
2578 encoded characters in the C<el> pointer.
2579
2580 See also C<L</is_strict_utf8_string_loc>>.
2581
2582 =cut
2583 */
2584
2585 PERL_STATIC_INLINE bool
Perl_is_strict_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)2586 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
2587 {
2588 const U8 * first_variant;
2589
2590 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
2591
2592 if (len == 0) {
2593 len = strlen((const char *) s);
2594 }
2595
2596 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2597 if (el)
2598 *el = len;
2599
2600 if (ep) {
2601 *ep = s + len;
2602 }
2603
2604 return TRUE;
2605 }
2606
2607 {
2608 const U8* const send = s + len;
2609 const U8* x = first_variant;
2610 STRLEN outlen = first_variant - s;
2611
2612 while (x < send) {
2613 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
2614 if (UNLIKELY(! cur_len)) {
2615 break;
2616 }
2617 x += cur_len;
2618 outlen++;
2619 }
2620
2621 if (el)
2622 *el = outlen;
2623
2624 if (ep) {
2625 *ep = x;
2626 }
2627
2628 return (x == send);
2629 }
2630 }
2631
2632 /*
2633
2634 =for apidoc is_c9strict_utf8_string_loc
2635
2636 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
2637 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2638 "utf8ness success") in the C<ep> pointer.
2639
2640 See also C<L</is_c9strict_utf8_string_loclen>>.
2641
2642 =cut
2643 */
2644
2645 #define is_c9strict_utf8_string_loc(s, len, ep) \
2646 is_c9strict_utf8_string_loclen(s, len, ep, 0)
2647
2648 /*
2649
2650 =for apidoc is_c9strict_utf8_string_loclen
2651
2652 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
2653 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2654 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
2655 characters in the C<el> pointer.
2656
2657 See also C<L</is_c9strict_utf8_string_loc>>.
2658
2659 =cut
2660 */
2661
2662 PERL_STATIC_INLINE bool
Perl_is_c9strict_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)2663 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
2664 {
2665 const U8 * first_variant;
2666
2667 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
2668
2669 if (len == 0) {
2670 len = strlen((const char *) s);
2671 }
2672
2673 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2674 if (el)
2675 *el = len;
2676
2677 if (ep) {
2678 *ep = s + len;
2679 }
2680
2681 return TRUE;
2682 }
2683
2684 {
2685 const U8* const send = s + len;
2686 const U8* x = first_variant;
2687 STRLEN outlen = first_variant - s;
2688
2689 while (x < send) {
2690 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
2691 if (UNLIKELY(! cur_len)) {
2692 break;
2693 }
2694 x += cur_len;
2695 outlen++;
2696 }
2697
2698 if (el)
2699 *el = outlen;
2700
2701 if (ep) {
2702 *ep = x;
2703 }
2704
2705 return (x == send);
2706 }
2707 }
2708
2709 /*
2710
2711 =for apidoc is_utf8_string_loc_flags
2712
2713 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
2714 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2715 "utf8ness success") in the C<ep> pointer.
2716
2717 See also C<L</is_utf8_string_loclen_flags>>.
2718
2719 =cut
2720 */
2721
2722 #define is_utf8_string_loc_flags(s, len, ep, flags) \
2723 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
2724
2725
2726 /* The above 3 actual functions could have been moved into the more general one
2727 * just below, and made #defines that call it with the right 'flags'. They are
2728 * currently kept separate to increase their chances of getting inlined */
2729
2730 /*
2731
2732 =for apidoc is_utf8_string_loclen_flags
2733
2734 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
2735 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2736 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
2737 encoded characters in the C<el> pointer.
2738
2739 See also C<L</is_utf8_string_loc_flags>>.
2740
2741 =cut
2742 */
2743
2744 PERL_STATIC_INLINE bool
Perl_is_utf8_string_loclen_flags(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el,const U32 flags)2745 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
2746 {
2747 const U8 * first_variant;
2748
2749 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
2750 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2751 |UTF8_DISALLOW_PERL_EXTENDED)));
2752
2753 if (flags == 0) {
2754 return is_utf8_string_loclen(s, len, ep, el);
2755 }
2756
2757 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2758 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
2759 {
2760 return is_strict_utf8_string_loclen(s, len, ep, el);
2761 }
2762
2763 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2764 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
2765 {
2766 return is_c9strict_utf8_string_loclen(s, len, ep, el);
2767 }
2768
2769 if (len == 0) {
2770 len = strlen((const char *) s);
2771 }
2772
2773 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2774 if (el)
2775 *el = len;
2776
2777 if (ep) {
2778 *ep = s + len;
2779 }
2780
2781 return TRUE;
2782 }
2783
2784 {
2785 const U8* send = s + len;
2786 const U8* x = first_variant;
2787 STRLEN outlen = first_variant - s;
2788
2789 while (x < send) {
2790 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
2791 if (UNLIKELY(! cur_len)) {
2792 break;
2793 }
2794 x += cur_len;
2795 outlen++;
2796 }
2797
2798 if (el)
2799 *el = outlen;
2800
2801 if (ep) {
2802 *ep = x;
2803 }
2804
2805 return (x == send);
2806 }
2807 }
2808
2809 /*
2810 =for apidoc utf8_distance
2811
2812 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
2813 and C<b>.
2814
2815 WARNING: use only if you *know* that the pointers point inside the
2816 same UTF-8 buffer.
2817
2818 =cut
2819 */
2820
2821 PERL_STATIC_INLINE IV
Perl_utf8_distance(pTHX_ const U8 * a,const U8 * b)2822 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
2823 {
2824 PERL_ARGS_ASSERT_UTF8_DISTANCE;
2825
2826 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2827 }
2828
2829 /*
2830 =for apidoc utf8_hop
2831
2832 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2833 forward (if C<off> is positive) or backward (if negative). C<s> does not need
2834 to be pointing to the starting byte of a character. If it isn't, one count of
2835 C<off> will be used up to get to the start of the next character for forward
2836 hops, and to the start of the current character for negative ones.
2837
2838 WARNING: Prefer L</utf8_hop_safe> to this one.
2839
2840 Do NOT use this function unless you B<know> C<off> is within
2841 the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned
2842 on the first byte of a character or just after the last byte of a character.
2843
2844 =cut
2845 */
2846
2847 PERL_STATIC_INLINE U8 *
Perl_utf8_hop(const U8 * s,SSize_t off)2848 Perl_utf8_hop(const U8 *s, SSize_t off)
2849 {
2850 PERL_ARGS_ASSERT_UTF8_HOP;
2851
2852 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2853 * the XXX bitops (especially ~) can create illegal UTF-8.
2854 * In other words: in Perl UTF-8 is not just for Unicode. */
2855
2856 if (off > 0) {
2857
2858 /* Get to next non-continuation byte */
2859 if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2860 do {
2861 s++;
2862 }
2863 while (UTF8_IS_CONTINUATION(*s));
2864 off--;
2865 }
2866
2867 while (off--)
2868 s += UTF8SKIP(s);
2869 }
2870 else {
2871 while (off++) {
2872 s--;
2873 while (UTF8_IS_CONTINUATION(*s))
2874 s--;
2875 }
2876 }
2877
2878 GCC_DIAG_IGNORE(-Wcast-qual)
2879 return (U8 *)s;
2880 GCC_DIAG_RESTORE
2881 }
2882
2883 /*
2884 =for apidoc utf8_hop_forward
2885
2886 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2887 forward. C<s> does not need to be pointing to the starting byte of a
2888 character. If it isn't, one count of C<off> will be used up to get to the
2889 start of the next character.
2890
2891 C<off> must be non-negative.
2892
2893 C<s> must be before or equal to C<end>.
2894
2895 When moving forward it will not move beyond C<end>.
2896
2897 Will not exceed this limit even if the string is not valid "UTF-8".
2898
2899 =cut
2900 */
2901
2902 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_forward(const U8 * s,SSize_t off,const U8 * end)2903 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2904 {
2905 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2906
2907 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2908 * the bitops (especially ~) can create illegal UTF-8.
2909 * In other words: in Perl UTF-8 is not just for Unicode. */
2910
2911 assert(s <= end);
2912 assert(off >= 0);
2913
2914 if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2915 /* Get to next non-continuation byte */
2916 do {
2917 s++;
2918 }
2919 while (UTF8_IS_CONTINUATION(*s));
2920 off--;
2921 }
2922
2923 while (off--) {
2924 STRLEN skip = UTF8SKIP(s);
2925 if ((STRLEN)(end - s) <= skip) {
2926 GCC_DIAG_IGNORE(-Wcast-qual)
2927 return (U8 *)end;
2928 GCC_DIAG_RESTORE
2929 }
2930 s += skip;
2931 }
2932
2933 GCC_DIAG_IGNORE(-Wcast-qual)
2934 return (U8 *)s;
2935 GCC_DIAG_RESTORE
2936 }
2937
2938 /*
2939 =for apidoc utf8_hop_back
2940
2941 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2942 backward. C<s> does not need to be pointing to the starting byte of a
2943 character. If it isn't, one count of C<off> will be used up to get to that
2944 start.
2945
2946 C<off> must be non-positive.
2947
2948 C<s> must be after or equal to C<start>.
2949
2950 When moving backward it will not move before C<start>.
2951
2952 Will not exceed this limit even if the string is not valid "UTF-8".
2953
2954 =cut
2955 */
2956
2957 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_back(const U8 * s,SSize_t off,const U8 * start)2958 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2959 {
2960 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2961
2962 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2963 * the bitops (especially ~) can create illegal UTF-8.
2964 * In other words: in Perl UTF-8 is not just for Unicode. */
2965
2966 assert(start <= s);
2967 assert(off <= 0);
2968
2969 /* Note: if we know that the input is well-formed, we can do per-word
2970 * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
2971 * that. But it was reverted because doing per-word has some
2972 * start-up/tear-down overhead, so only makes sense if the distance to be
2973 * moved is large, and core perl doesn't currently move more than a few
2974 * characters at a time. You can reinstate it if it does become
2975 * advantageous. */
2976 while (off++ && s > start) {
2977 do {
2978 s--;
2979 } while (s > start && UTF8_IS_CONTINUATION(*s));
2980 }
2981
2982 GCC_DIAG_IGNORE(-Wcast-qual)
2983 return (U8 *)s;
2984 GCC_DIAG_RESTORE
2985 }
2986
2987 /*
2988 =for apidoc utf8_hop_safe
2989
2990 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2991 either forward or backward. C<s> does not need to be pointing to the starting
2992 byte of a character. If it isn't, one count of C<off> will be used up to get
2993 to the start of the next character for forward hops, and to the start of the
2994 current character for negative ones.
2995
2996 When moving backward it will not move before C<start>.
2997
2998 When moving forward it will not move beyond C<end>.
2999
3000 Will not exceed those limits even if the string is not valid "UTF-8".
3001
3002 =cut
3003 */
3004
3005 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_safe(const U8 * s,SSize_t off,const U8 * start,const U8 * end)3006 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
3007 {
3008 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
3009
3010 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
3011 * the bitops (especially ~) can create illegal UTF-8.
3012 * In other words: in Perl UTF-8 is not just for Unicode. */
3013
3014 assert(start <= s && s <= end);
3015
3016 if (off >= 0) {
3017 return utf8_hop_forward(s, off, end);
3018 }
3019 else {
3020 return utf8_hop_back(s, off, start);
3021 }
3022 }
3023
3024 /*
3025
3026 =for apidoc isUTF8_CHAR_flags
3027
3028 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
3029 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
3030 that represents some code point, subject to the restrictions given by C<flags>;
3031 otherwise it evaluates to 0. If non-zero, the value gives how many bytes
3032 starting at C<s> comprise the code point's representation. Any bytes remaining
3033 before C<e>, but beyond the ones needed to form the first code point in C<s>,
3034 are not examined.
3035
3036 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
3037 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
3038 as C<L</isSTRICT_UTF8_CHAR>>;
3039 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
3040 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
3041 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
3042 understood by C<L</utf8n_to_uvchr>>, with the same meanings.
3043
3044 The three alternative macros are for the most commonly needed validations; they
3045 are likely to run somewhat faster than this more general one, as they can be
3046 inlined into your code.
3047
3048 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
3049 L</is_utf8_string_loclen_flags> to check entire strings.
3050
3051 =cut
3052 */
3053
3054 PERL_STATIC_INLINE STRLEN
Perl_isUTF8_CHAR_flags(const U8 * const s0,const U8 * const e,const U32 flags)3055 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
3056 {
3057 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
3058 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
3059 |UTF8_DISALLOW_PERL_EXTENDED)));
3060
3061 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
3062 goto check_success,
3063 DFA_TEASE_APART_FF_,
3064 DFA_RETURN_FAILURE_);
3065
3066 check_success:
3067
3068 return is_utf8_char_helper_(s0, e, flags);
3069
3070 #ifdef HAS_EXTRA_LONG_UTF8
3071
3072 tease_apart_FF:
3073
3074 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
3075 * either malformed, or was for the largest possible start byte, which
3076 * indicates perl extended UTF-8, well above the Unicode maximum */
3077 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
3078 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
3079 {
3080 return 0;
3081 }
3082
3083 /* Otherwise examine the sequence not inline */
3084 return is_utf8_FF_helper_(s0, e,
3085 FALSE /* require full, not partial char */
3086 );
3087 #endif
3088
3089 }
3090
3091 /*
3092
3093 =for apidoc is_utf8_valid_partial_char
3094
3095 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
3096 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
3097 points. Otherwise, it returns 1 if there exists at least one non-empty
3098 sequence of bytes that when appended to sequence C<s>, starting at position
3099 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
3100 otherwise returns 0.
3101
3102 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
3103 point.
3104
3105 This is useful when a fixed-length buffer is being tested for being well-formed
3106 UTF-8, but the final few bytes in it don't comprise a full character; that is,
3107 it is split somewhere in the middle of the final code point's UTF-8
3108 representation. (Presumably when the buffer is refreshed with the next chunk
3109 of data, the new first bytes will complete the partial code point.) This
3110 function is used to verify that the final bytes in the current buffer are in
3111 fact the legal beginning of some code point, so that if they aren't, the
3112 failure can be signalled without having to wait for the next read.
3113
3114 =cut
3115 */
3116 #define is_utf8_valid_partial_char(s, e) \
3117 is_utf8_valid_partial_char_flags(s, e, 0)
3118
3119 /*
3120
3121 =for apidoc is_utf8_valid_partial_char_flags
3122
3123 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
3124 or not the input is a valid UTF-8 encoded partial character, but it takes an
3125 extra parameter, C<flags>, which can further restrict which code points are
3126 considered valid.
3127
3128 If C<flags> is 0, this behaves identically to
3129 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
3130 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
3131 there is any sequence of bytes that can complete the input partial character in
3132 such a way that a non-prohibited character is formed, the function returns
3133 TRUE; otherwise FALSE. Non character code points cannot be determined based on
3134 partial character input. But many of the other possible excluded types can be
3135 determined from just the first one or two bytes.
3136
3137 =cut
3138 */
3139
3140 PERL_STATIC_INLINE bool
Perl_is_utf8_valid_partial_char_flags(const U8 * const s0,const U8 * const e,const U32 flags)3141 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
3142 {
3143 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
3144 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
3145 |UTF8_DISALLOW_PERL_EXTENDED)));
3146
3147 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
3148 DFA_RETURN_FAILURE_,
3149 DFA_TEASE_APART_FF_,
3150 NOOP);
3151
3152 /* The NOOP above causes the DFA to drop down here iff the input was a
3153 * partial character. flags=0 => can return TRUE immediately; otherwise we
3154 * need to check (not inline) if the partial character is the beginning of
3155 * a disallowed one */
3156 if (flags == 0) {
3157 return TRUE;
3158 }
3159
3160 return cBOOL(is_utf8_char_helper_(s0, e, flags));
3161
3162 #ifdef HAS_EXTRA_LONG_UTF8
3163
3164 tease_apart_FF:
3165
3166 /* Getting here means the input is either malformed, or, in the case of
3167 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
3168 * latter case has to be extended UTF-8, so can fail immediately if that is
3169 * forbidden */
3170
3171 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
3172 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
3173 {
3174 return 0;
3175 }
3176
3177 return is_utf8_FF_helper_(s0, e,
3178 TRUE /* Require to be a partial character */
3179 );
3180 #endif
3181
3182 }
3183
3184 /*
3185
3186 =for apidoc is_utf8_fixed_width_buf_flags
3187
3188 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
3189 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
3190 otherwise it returns FALSE.
3191
3192 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
3193 without restriction. If the final few bytes of the buffer do not form a
3194 complete code point, this will return TRUE anyway, provided that
3195 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
3196
3197 If C<flags> in non-zero, it can be any combination of the
3198 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
3199 same meanings.
3200
3201 This function differs from C<L</is_utf8_string_flags>> only in that the latter
3202 returns FALSE if the final few bytes of the string don't form a complete code
3203 point.
3204
3205 =cut
3206 */
3207 #define is_utf8_fixed_width_buf_flags(s, len, flags) \
3208 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
3209
3210 /*
3211
3212 =for apidoc is_utf8_fixed_width_buf_loc_flags
3213
3214 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
3215 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
3216 to the beginning of any partial character at the end of the buffer; if there is
3217 no partial character C<*ep> will contain C<s>+C<len>.
3218
3219 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
3220
3221 =cut
3222 */
3223
3224 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
3225 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
3226
3227 /*
3228
3229 =for apidoc is_utf8_fixed_width_buf_loclen_flags
3230
3231 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
3232 complete, valid characters found in the C<el> pointer.
3233
3234 =cut
3235 */
3236
3237 PERL_STATIC_INLINE bool
Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,STRLEN len,const U8 ** ep,STRLEN * el,const U32 flags)3238 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
3239 STRLEN len,
3240 const U8 **ep,
3241 STRLEN *el,
3242 const U32 flags)
3243 {
3244 const U8 * maybe_partial;
3245
3246 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
3247
3248 if (! ep) {
3249 ep = &maybe_partial;
3250 }
3251
3252 /* If it's entirely valid, return that; otherwise see if the only error is
3253 * that the final few bytes are for a partial character */
3254 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
3255 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
3256 }
3257
3258 PERL_STATIC_INLINE UV
Perl_utf8n_to_uvchr_msgs(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors,AV ** msgs)3259 Perl_utf8n_to_uvchr_msgs(const U8 *s,
3260 STRLEN curlen,
3261 STRLEN *retlen,
3262 const U32 flags,
3263 U32 * errors,
3264 AV ** msgs)
3265 {
3266 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
3267 * simple cases, and, if necessary calls a helper function to deal with the
3268 * more complex ones. Almost all well-formed non-problematic code points
3269 * are considered simple, so that it's unlikely that the helper function
3270 * will need to be called.
3271 *
3272 * This is an adaptation of the tables and algorithm given in
3273 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
3274 * comprehensive documentation of the original version. A copyright notice
3275 * for the original version is given at the beginning of this file. The
3276 * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[].
3277 */
3278
3279 const U8 * const s0 = s;
3280 const U8 * send = s0 + curlen;
3281 UV type;
3282 UV uv;
3283
3284 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
3285
3286 /* This dfa is fast. If it accepts the input, it was for a well-formed,
3287 * non-problematic code point, which can be returned immediately.
3288 * Otherwise we call a helper function to figure out the more complicated
3289 * cases. */
3290
3291 /* No calls from core pass in an empty string; non-core need a check */
3292 #ifdef PERL_CORE
3293 assert(curlen > 0);
3294 #else
3295 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
3296 flags, errors, msgs);
3297 #endif
3298
3299 type = PL_strict_utf8_dfa_tab[*s];
3300
3301 /* The table is structured so that 'type' is 0 iff the input byte is
3302 * represented identically regardless of the UTF-8ness of the string */
3303 if (type == 0) { /* UTF-8 invariants are returned unchanged */
3304 uv = *s;
3305 }
3306 else {
3307 UV state = PL_strict_utf8_dfa_tab[256 + type];
3308 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
3309
3310 while (LIKELY(state != 1) && ++s < send) {
3311 type = PL_strict_utf8_dfa_tab[*s];
3312 state = PL_strict_utf8_dfa_tab[256 + state + type];
3313
3314 uv = UTF8_ACCUMULATE(uv, *s);
3315
3316 if (state == 0) {
3317 #ifdef EBCDIC
3318 uv = UNI_TO_NATIVE(uv);
3319 #endif
3320 goto success;
3321 }
3322 }
3323
3324 /* Here is potentially problematic. Use the full mechanism */
3325 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
3326 errors, msgs);
3327 }
3328
3329 success:
3330 if (retlen) {
3331 *retlen = s - s0 + 1;
3332 }
3333 if (errors) {
3334 *errors = 0;
3335 }
3336 if (msgs) {
3337 *msgs = NULL;
3338 }
3339
3340 return uv;
3341 }
3342
3343 PERL_STATIC_INLINE UV
Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 * s,const U8 * send,STRLEN * retlen)3344 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
3345 {
3346 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
3347
3348 assert(s < send);
3349
3350 if (! ckWARN_d(WARN_UTF8)) {
3351
3352 /* EMPTY is not really allowed, and asserts on debugging builds. But
3353 * on non-debugging we have to deal with it, and this causes it to
3354 * return the REPLACEMENT CHARACTER, as the documentation indicates */
3355 return utf8n_to_uvchr(s, send - s, retlen,
3356 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
3357 }
3358 else {
3359 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
3360 if (retlen && ret == 0 && (send <= s || *s != '\0')) {
3361 *retlen = (STRLEN) -1;
3362 }
3363
3364 return ret;
3365 }
3366 }
3367
3368 /* ------------------------------- perl.h ----------------------------- */
3369
3370 /*
3371 =for apidoc_section $utility
3372
3373 =for apidoc is_safe_syscall
3374
3375 Test that the given C<pv> (with length C<len>) doesn't contain any internal
3376 C<NUL> characters.
3377 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
3378 category, and return FALSE.
3379
3380 Return TRUE if the name is safe.
3381
3382 C<what> and C<op_name> are used in any warning.
3383
3384 Used by the C<IS_SAFE_SYSCALL()> macro.
3385
3386 =cut
3387 */
3388
3389 PERL_STATIC_INLINE bool
Perl_is_safe_syscall(pTHX_ const char * pv,STRLEN len,const char * what,const char * op_name)3390 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
3391 {
3392 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
3393 * perl itself uses xce*() functions which accept 8-bit strings.
3394 */
3395
3396 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
3397
3398 if (len > 1) {
3399 char *null_at;
3400 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
3401 SETERRNO(ENOENT, LIB_INVARG);
3402 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
3403 "Invalid \\0 character in %s for %s: %s\\0%s",
3404 what, op_name, pv, null_at+1);
3405 return FALSE;
3406 }
3407 }
3408
3409 return TRUE;
3410 }
3411
3412 /*
3413
3414 Return true if the supplied filename has a newline character
3415 immediately before the first (hopefully only) NUL.
3416
3417 My original look at this incorrectly used the len from SvPV(), but
3418 that's incorrect, since we allow for a NUL in pv[len-1].
3419
3420 So instead, strlen() and work from there.
3421
3422 This allow for the user reading a filename, forgetting to chomp it,
3423 then calling:
3424
3425 open my $foo, "$file\0";
3426
3427 */
3428
3429 #ifdef PERL_CORE
3430
3431 PERL_STATIC_INLINE bool
S_should_warn_nl(const char * pv)3432 S_should_warn_nl(const char *pv)
3433 {
3434 STRLEN len;
3435
3436 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
3437
3438 len = strlen(pv);
3439
3440 return len > 0 && pv[len-1] == '\n';
3441 }
3442
3443 #endif
3444
3445 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
3446
3447 PERL_STATIC_INLINE bool
S_lossless_NV_to_IV(const NV nv,IV * ivp)3448 S_lossless_NV_to_IV(const NV nv, IV *ivp)
3449 {
3450 /* This function determines if the input NV 'nv' may be converted without
3451 * loss of data to an IV. If not, it returns FALSE taking no other action.
3452 * But if it is possible, it does the conversion, returning TRUE, and
3453 * storing the converted result in '*ivp' */
3454
3455 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
3456
3457 # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
3458 /* Normally any comparison with a NaN returns false; if we can't rely
3459 * on that behaviour, check explicitly */
3460 if (UNLIKELY(Perl_isnan(nv))) {
3461 return FALSE;
3462 }
3463 # endif
3464
3465 # ifndef NV_PRESERVES_UV
3466 STATIC_ASSERT_STMT(((UV)1 << NV_PRESERVES_UV_BITS) - 1 <= (UV)IV_MAX);
3467 # endif
3468
3469 /* Written this way so that with an always-false NaN comparison we
3470 * return false */
3471 if (
3472 # ifdef NV_PRESERVES_UV
3473 LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1) &&
3474 # else
3475 /* If the condition below is not satisfied, lower bits of nv's
3476 * integral part is already lost and accurate conversion to integer
3477 * is impossible.
3478 * Note this should be consistent with S_sv_2iuv_common in sv.c. */
3479 Perl_fabs(nv) < (NV) ((UV)1 << NV_PRESERVES_UV_BITS) &&
3480 # endif
3481 (IV) nv == nv) {
3482 *ivp = (IV) nv;
3483 return TRUE;
3484 }
3485 return FALSE;
3486 }
3487
3488 #endif
3489
3490 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
3491
3492 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
3493
3494 #define MAX_CHARSET_NAME_LENGTH 2
3495
3496 PERL_STATIC_INLINE const char *
S_get_regex_charset_name(const U32 flags,STRLEN * const lenp)3497 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
3498 {
3499 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
3500
3501 /* Returns a string that corresponds to the name of the regex character set
3502 * given by 'flags', and *lenp is set the length of that string, which
3503 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
3504
3505 *lenp = 1;
3506 switch (get_regex_charset(flags)) {
3507 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
3508 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
3509 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
3510 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
3511 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3512 *lenp = 2;
3513 return ASCII_MORE_RESTRICT_PAT_MODS;
3514 }
3515 /* The NOT_REACHED; hides an assert() which has a rather complex
3516 * definition in perl.h. */
3517 NOT_REACHED; /* NOTREACHED */
3518 return "?"; /* Unknown */
3519 }
3520
3521 #endif
3522
3523 /*
3524
3525 Return false if any get magic is on the SV other than taint magic.
3526
3527 */
3528
3529 PERL_STATIC_INLINE bool
Perl_sv_only_taint_gmagic(SV * sv)3530 Perl_sv_only_taint_gmagic(SV *sv)
3531 {
3532 MAGIC *mg = SvMAGIC(sv);
3533
3534 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
3535
3536 while (mg) {
3537 if (mg->mg_type != PERL_MAGIC_taint
3538 && !(mg->mg_flags & MGf_GSKIP)
3539 && mg->mg_virtual->svt_get) {
3540 return FALSE;
3541 }
3542 mg = mg->mg_moremagic;
3543 }
3544
3545 return TRUE;
3546 }
3547
3548 /* ------------------ cop.h ------------------------------------------- */
3549
3550 /* implement GIMME_V() macro */
3551
3552 PERL_STATIC_INLINE U8
Perl_gimme_V(pTHX)3553 Perl_gimme_V(pTHX)
3554 {
3555 I32 cxix;
3556 U8 gimme = (PL_op->op_flags & OPf_WANT);
3557
3558 if (gimme)
3559 return gimme;
3560 cxix = PL_curstackinfo->si_cxsubix;
3561 if (cxix < 0)
3562 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
3563 assert(cxstack[cxix].blk_gimme & G_WANT);
3564 return (cxstack[cxix].blk_gimme & G_WANT);
3565 }
3566
3567
3568 /* Enter a block. Push a new base context and return its address. */
3569
3570 PERL_STATIC_INLINE PERL_CONTEXT *
Perl_cx_pushblock(pTHX_ U8 type,U8 gimme,SV ** sp,I32 saveix)3571 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
3572 {
3573 PERL_CONTEXT * cx;
3574
3575 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
3576
3577 CXINC;
3578 cx = CX_CUR();
3579 cx->cx_type = type;
3580 cx->blk_gimme = gimme;
3581 cx->blk_oldsaveix = saveix;
3582 cx->blk_oldsp = (Stack_off_t)(sp - PL_stack_base);
3583 assert(cxstack_ix <= 0
3584 || CxTYPE(cx-1) == CXt_SUBST
3585 || cx->blk_oldsp >= (cx-1)->blk_oldsp);
3586 cx->blk_oldcop = PL_curcop;
3587 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
3588 cx->blk_oldscopesp = PL_scopestack_ix;
3589 cx->blk_oldpm = PL_curpm;
3590 cx->blk_old_tmpsfloor = PL_tmps_floor;
3591
3592 PL_tmps_floor = PL_tmps_ix;
3593 CX_DEBUG(cx, "PUSH");
3594 return cx;
3595 }
3596
3597
3598 /* Exit a block (RETURN and LAST). */
3599
3600 PERL_STATIC_INLINE void
Perl_cx_popblock(pTHX_ PERL_CONTEXT * cx)3601 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
3602 {
3603 PERL_ARGS_ASSERT_CX_POPBLOCK;
3604
3605 CX_DEBUG(cx, "POP");
3606 /* these 3 are common to cx_popblock and cx_topblock */
3607 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
3608 PL_scopestack_ix = cx->blk_oldscopesp;
3609 PL_curpm = cx->blk_oldpm;
3610
3611 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
3612 * and leaves a CX entry lying around for repeated use, so
3613 * skip for multicall */ \
3614 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
3615 || PL_savestack_ix == cx->blk_oldsaveix);
3616 PL_curcop = cx->blk_oldcop;
3617 PL_tmps_floor = cx->blk_old_tmpsfloor;
3618 }
3619
3620 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
3621 * Whereas cx_popblock() restores the state to the point just before
3622 * cx_pushblock() was called, cx_topblock() restores it to the point just
3623 * *after* cx_pushblock() was called. */
3624
3625 PERL_STATIC_INLINE void
Perl_cx_topblock(pTHX_ PERL_CONTEXT * cx)3626 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
3627 {
3628 PERL_ARGS_ASSERT_CX_TOPBLOCK;
3629
3630 CX_DEBUG(cx, "TOP");
3631 /* these 3 are common to cx_popblock and cx_topblock */
3632 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
3633 PL_scopestack_ix = cx->blk_oldscopesp;
3634 PL_curpm = cx->blk_oldpm;
3635 Perl_rpp_popfree_to(aTHX_ PL_stack_base + cx->blk_oldsp);
3636 }
3637
3638
3639 PERL_STATIC_INLINE void
Perl_cx_pushsub(pTHX_ PERL_CONTEXT * cx,CV * cv,OP * retop,bool hasargs)3640 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
3641 {
3642 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
3643
3644 PERL_ARGS_ASSERT_CX_PUSHSUB;
3645
3646 PERL_DTRACE_PROBE_ENTRY(cv);
3647 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
3648 PL_curstackinfo->si_cxsubix = (I32)(cx - PL_curstackinfo->si_cxstack);
3649 cx->blk_sub.cv = cv;
3650 cx->blk_sub.olddepth = CvDEPTH(cv);
3651 cx->blk_sub.prevcomppad = PL_comppad;
3652 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
3653 cx->blk_sub.retop = retop;
3654 SvREFCNT_inc_simple_void_NN(cv);
3655 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
3656 }
3657
3658
3659 /* subsets of cx_popsub() */
3660
3661 PERL_STATIC_INLINE void
Perl_cx_popsub_common(pTHX_ PERL_CONTEXT * cx)3662 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
3663 {
3664 CV *cv;
3665
3666 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
3667 assert(CxTYPE(cx) == CXt_SUB);
3668
3669 PL_comppad = cx->blk_sub.prevcomppad;
3670 PL_curpad = LIKELY(PL_comppad != NULL) ? AvARRAY(PL_comppad) : NULL;
3671 cv = cx->blk_sub.cv;
3672 CvDEPTH(cv) = cx->blk_sub.olddepth;
3673 cx->blk_sub.cv = NULL;
3674 SvREFCNT_dec(cv);
3675 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
3676 }
3677
3678
3679 /* handle the @_ part of leaving a sub */
3680
3681 PERL_STATIC_INLINE void
Perl_cx_popsub_args(pTHX_ PERL_CONTEXT * cx)3682 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
3683 {
3684 AV *av;
3685
3686 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
3687 assert(CxTYPE(cx) == CXt_SUB);
3688 assert(AvARRAY(MUTABLE_AV(
3689 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
3690 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
3691
3692 CX_POP_SAVEARRAY(cx);
3693 av = MUTABLE_AV(PAD_SVl(0));
3694 if (!SvMAGICAL(av) && SvREFCNT(av) == 1
3695 #ifndef PERL_RC_STACK
3696 && !AvREAL(av)
3697 #endif
3698 )
3699 clear_defarray_simple(av);
3700 else
3701 /* abandon @_ if it got reified */
3702 clear_defarray(av, 0);
3703 }
3704
3705
3706 PERL_STATIC_INLINE void
Perl_cx_popsub(pTHX_ PERL_CONTEXT * cx)3707 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
3708 {
3709 PERL_ARGS_ASSERT_CX_POPSUB;
3710 assert(CxTYPE(cx) == CXt_SUB);
3711
3712 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
3713
3714 if (CxHASARGS(cx))
3715 cx_popsub_args(cx);
3716 cx_popsub_common(cx);
3717 }
3718
3719
3720 PERL_STATIC_INLINE void
Perl_cx_pushformat(pTHX_ PERL_CONTEXT * cx,CV * cv,OP * retop,GV * gv)3721 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
3722 {
3723 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
3724
3725 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
3726 PL_curstackinfo->si_cxsubix= (I32)(cx - PL_curstackinfo->si_cxstack);
3727 cx->blk_format.cv = cv;
3728 cx->blk_format.retop = retop;
3729 cx->blk_format.gv = gv;
3730 cx->blk_format.dfoutgv = PL_defoutgv;
3731 cx->blk_format.prevcomppad = PL_comppad;
3732 cx->blk_u16 = 0;
3733
3734 SvREFCNT_inc_simple_void_NN(cv);
3735 CvDEPTH(cv)++;
3736 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
3737 }
3738
3739
3740 PERL_STATIC_INLINE void
Perl_cx_popformat(pTHX_ PERL_CONTEXT * cx)3741 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
3742 {
3743 CV *cv;
3744 GV *dfout;
3745
3746 PERL_ARGS_ASSERT_CX_POPFORMAT;
3747 assert(CxTYPE(cx) == CXt_FORMAT);
3748
3749 dfout = cx->blk_format.dfoutgv;
3750 setdefout(dfout);
3751 cx->blk_format.dfoutgv = NULL;
3752 SvREFCNT_dec_NN(dfout);
3753
3754 PL_comppad = cx->blk_format.prevcomppad;
3755 PL_curpad = LIKELY(PL_comppad != NULL) ? AvARRAY(PL_comppad) : NULL;
3756 cv = cx->blk_format.cv;
3757 cx->blk_format.cv = NULL;
3758 --CvDEPTH(cv);
3759 SvREFCNT_dec_NN(cv);
3760 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
3761 }
3762
3763
3764 PERL_STATIC_INLINE void
Perl_push_evalortry_common(pTHX_ PERL_CONTEXT * cx,OP * retop,SV * namesv)3765 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
3766 {
3767 cx->blk_eval.retop = retop;
3768 cx->blk_eval.old_namesv = namesv;
3769 cx->blk_eval.old_eval_root = PL_eval_root;
3770 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
3771 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
3772 cx->blk_eval.cur_top_env = PL_top_env;
3773
3774 assert(!(PL_in_eval & ~ 0x3F));
3775 assert(!(PL_op->op_type & ~0x1FF));
3776 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
3777 }
3778
3779 PERL_STATIC_INLINE void
Perl_cx_pusheval(pTHX_ PERL_CONTEXT * cx,OP * retop,SV * namesv)3780 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
3781 {
3782 PERL_ARGS_ASSERT_CX_PUSHEVAL;
3783
3784 Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
3785
3786 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
3787 PL_curstackinfo->si_cxsubix = (I32)(cx - PL_curstackinfo->si_cxstack);
3788 }
3789
3790 PERL_STATIC_INLINE void
Perl_cx_pushtry(pTHX_ PERL_CONTEXT * cx,OP * retop)3791 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
3792 {
3793 PERL_ARGS_ASSERT_CX_PUSHTRY;
3794
3795 Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
3796
3797 /* Don't actually change it, just store the current value so it's restored
3798 * by the common popeval */
3799 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
3800 }
3801
3802
3803 PERL_STATIC_INLINE void
Perl_cx_popeval(pTHX_ PERL_CONTEXT * cx)3804 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
3805 {
3806 SV *sv;
3807
3808 PERL_ARGS_ASSERT_CX_POPEVAL;
3809 assert(CxTYPE(cx) == CXt_EVAL);
3810
3811 PL_in_eval = CxOLD_IN_EVAL(cx);
3812 assert(!(PL_in_eval & 0xc0));
3813 PL_eval_root = cx->blk_eval.old_eval_root;
3814 sv = cx->blk_eval.cur_text;
3815 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
3816 cx->blk_eval.cur_text = NULL;
3817 SvREFCNT_dec_NN(sv);
3818 }
3819
3820 sv = cx->blk_eval.old_namesv;
3821 if (sv) {
3822 cx->blk_eval.old_namesv = NULL;
3823 SvREFCNT_dec_NN(sv);
3824 }
3825 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
3826 }
3827
3828
3829 /* push a plain loop, i.e.
3830 * { block }
3831 * while (cond) { block }
3832 * for (init;cond;continue) { block }
3833 * This loop can be last/redo'ed etc.
3834 */
3835
3836 PERL_STATIC_INLINE void
Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT * cx)3837 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
3838 {
3839 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
3840 cx->blk_loop.my_op = cLOOP;
3841 }
3842
3843
3844 /* push a true for loop, i.e.
3845 * for var (list) { block }
3846 */
3847
3848 PERL_STATIC_INLINE void
Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT * cx,void * itervarp,SV * itersave)3849 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
3850 {
3851 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
3852
3853 /* this one line is common with cx_pushloop_plain */
3854 cx->blk_loop.my_op = cLOOP;
3855
3856 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3857 cx->blk_loop.itersave = itersave;
3858 #ifdef USE_ITHREADS
3859 cx->blk_loop.oldcomppad = PL_comppad;
3860 #endif
3861 }
3862
3863
3864 /* pop all loop types, including plain */
3865
3866 PERL_STATIC_INLINE void
Perl_cx_poploop(pTHX_ PERL_CONTEXT * cx)3867 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
3868 {
3869 PERL_ARGS_ASSERT_CX_POPLOOP;
3870
3871 assert(CxTYPE_is_LOOP(cx));
3872 if ( CxTYPE(cx) == CXt_LOOP_ARY
3873 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3874 {
3875 /* Free ary or cur. This assumes that state_u.ary.ary
3876 * aligns with state_u.lazysv.cur. See cx_dup() */
3877 SV *sv = cx->blk_loop.state_u.lazysv.cur;
3878 cx->blk_loop.state_u.lazysv.cur = NULL;
3879 SvREFCNT_dec_NN(sv);
3880 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3881 sv = cx->blk_loop.state_u.lazysv.end;
3882 cx->blk_loop.state_u.lazysv.end = NULL;
3883 SvREFCNT_dec_NN(sv);
3884 }
3885 }
3886 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3887 SV *cursv;
3888 SV **svp = (cx)->blk_loop.itervar_u.svp;
3889 if ((cx->cx_type & CXp_FOR_GV))
3890 svp = &GvSV((GV*)svp);
3891 cursv = *svp;
3892 *svp = cx->blk_loop.itersave;
3893 cx->blk_loop.itersave = NULL;
3894 SvREFCNT_dec(cursv);
3895 }
3896 if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF))
3897 SvREFCNT_dec(cx->blk_loop.itervar_u.svp);
3898 }
3899
3900
3901 PERL_STATIC_INLINE void
Perl_cx_pushwhen(pTHX_ PERL_CONTEXT * cx)3902 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
3903 {
3904 PERL_ARGS_ASSERT_CX_PUSHWHEN;
3905
3906 cx->blk_givwhen.leave_op = cLOGOP->op_other;
3907 }
3908
3909
3910 PERL_STATIC_INLINE void
Perl_cx_popwhen(pTHX_ PERL_CONTEXT * cx)3911 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
3912 {
3913 PERL_ARGS_ASSERT_CX_POPWHEN;
3914 assert(CxTYPE(cx) == CXt_WHEN);
3915
3916 PERL_UNUSED_ARG(cx);
3917 PERL_UNUSED_CONTEXT;
3918 /* currently NOOP */
3919 }
3920
3921
3922 PERL_STATIC_INLINE void
Perl_cx_pushgiven(pTHX_ PERL_CONTEXT * cx,SV * orig_defsv)3923 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
3924 {
3925 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3926
3927 cx->blk_givwhen.leave_op = cLOGOP->op_other;
3928 cx->blk_givwhen.defsv_save = orig_defsv;
3929 }
3930
3931
3932 PERL_STATIC_INLINE void
Perl_cx_popgiven(pTHX_ PERL_CONTEXT * cx)3933 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
3934 {
3935 SV *sv;
3936
3937 PERL_ARGS_ASSERT_CX_POPGIVEN;
3938 assert(CxTYPE(cx) == CXt_GIVEN);
3939
3940 sv = GvSV(PL_defgv);
3941 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3942 cx->blk_givwhen.defsv_save = NULL;
3943 SvREFCNT_dec(sv);
3944 }
3945
3946
3947 /* Make @_ empty in-place in simple cases: a cheap av_clear().
3948 * See Perl_clear_defarray() for non-simple cases */
3949
3950
3951 PERL_STATIC_INLINE void
Perl_clear_defarray_simple(pTHX_ AV * av)3952 Perl_clear_defarray_simple(pTHX_ AV *av)
3953 {
3954 PERL_ARGS_ASSERT_CLEAR_DEFARRAY_SIMPLE;
3955
3956 assert(SvTYPE(av) == SVt_PVAV);
3957 assert(!SvREADONLY(av));
3958 assert(!SvMAGICAL(av));
3959 assert(SvREFCNT(av) == 1);
3960
3961 #ifdef PERL_RC_STACK
3962 assert(AvREAL(av));
3963 /* this code assumes that destructors called here can't free av
3964 * itself, because pad[0] and/or CX pointers will keep it alive */
3965 SSize_t i = AvFILLp(av);
3966 while (i >= 0) {
3967 SV *sv = AvARRAY(av)[i];
3968 AvARRAY(av)[i--] = NULL;
3969 SvREFCNT_dec(sv);
3970 }
3971 #else
3972 assert(!AvREAL(av));
3973 #endif
3974 AvFILLp(av) = -1;
3975 Perl_av_remove_offset(aTHX_ av);
3976 }
3977
3978 /* Switch to a different argument stack.
3979 *
3980 * Note that it doesn't update PL_curstackinfo->si_stack_nonrc_base,
3981 * so this should only be used as part of a general switching between
3982 * stackinfos.
3983 */
3984
3985 PERL_STATIC_INLINE void
Perl_switch_argstack(pTHX_ AV * to)3986 Perl_switch_argstack(pTHX_ AV *to)
3987 {
3988 PERL_ARGS_ASSERT_SWITCH_ARGSTACK;
3989
3990 AvFILLp(PL_curstack) = PL_stack_sp - PL_stack_base;
3991 PL_stack_base = AvARRAY(to);
3992 PL_stack_max = PL_stack_base + AvMAX(to);
3993 PL_stack_sp = PL_stack_base + AvFILLp(to);
3994 PL_curstack = to;
3995 }
3996
3997
3998 /* Push, and switch to a new stackinfo, allocating one if none are spare,
3999 * to get a fresh set of stacks.
4000 * Update all the interpreter variables like PL_curstackinfo,
4001 * PL_stack_sp, etc.
4002 * current flag meanings:
4003 * 1 make the new arg stack AvREAL
4004 */
4005
4006
4007 PERL_STATIC_INLINE void
Perl_push_stackinfo(pTHX_ I32 type,UV flags)4008 Perl_push_stackinfo(pTHX_ I32 type, UV flags)
4009 {
4010 PERL_ARGS_ASSERT_PUSH_STACKINFO;
4011
4012 PERL_SI *next = PL_curstackinfo->si_next;
4013 DEBUG_l({
4014 int i = 0; PERL_SI *p = PL_curstackinfo;
4015 while (p) { i++; p = p->si_prev; }
4016 Perl_deb(aTHX_ "push STACKINFO %d in %s at %s:%d\n",
4017 i, SAFE_FUNCTION__, __FILE__, __LINE__);
4018 })
4019
4020 if (!next) {
4021 next = new_stackinfo_flags(32, 2048/sizeof(PERL_CONTEXT) - 1, flags);
4022 next->si_prev = PL_curstackinfo;
4023 PL_curstackinfo->si_next = next;
4024 }
4025 next->si_type = type;
4026 next->si_cxix = -1;
4027 next->si_cxsubix = -1;
4028 PUSHSTACK_INIT_HWM(next);
4029 #ifdef PERL_RC_STACK
4030 next->si_stack_nonrc_base = 0;
4031 #endif
4032 if (flags & 1)
4033 AvREAL_on(next->si_stack);
4034 else
4035 AvREAL_off(next->si_stack);
4036 AvFILLp(next->si_stack) = 0;
4037 switch_argstack(next->si_stack);
4038 PL_curstackinfo = next;
4039 SET_MARK_OFFSET;
4040 }
4041
4042
4043 /* Pop, then switch to the previous stackinfo and set of stacks.
4044 * Update all the interpreter variables like PL_curstackinfo,
4045 * PL_stack_sp, etc. */
4046
4047 PERL_STATIC_INLINE void
Perl_pop_stackinfo(pTHX)4048 Perl_pop_stackinfo(pTHX)
4049 {
4050 PERL_ARGS_ASSERT_POP_STACKINFO;
4051
4052 PERL_SI * const prev = PL_curstackinfo->si_prev;
4053 DEBUG_l({
4054 int i = -1; PERL_SI *p = PL_curstackinfo;
4055 while (p) { i++; p = p->si_prev; }
4056 Perl_deb(aTHX_ "pop STACKINFO %d in %s at %s:%d\n",
4057 i, SAFE_FUNCTION__, __FILE__, __LINE__);})
4058 if (!prev) {
4059 Perl_croak_popstack();
4060 }
4061
4062 switch_argstack(prev->si_stack);
4063 /* don't free prev here, free them all at the END{} */
4064 PL_curstackinfo = prev;
4065 }
4066
4067
4068
4069 /*
4070 =for apidoc newPADxVOP
4071
4072 Constructs, checks and returns an op containing a pad offset. C<type> is
4073 the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV>
4074 or C<OP_PADCV>. The returned op will have the C<op_targ> field set by
4075 the C<padix> argument.
4076
4077 This is convenient when constructing a large optree in nested function
4078 calls, as it avoids needing to store the pad op directly to set the
4079 C<op_targ> field as a side-effect. For example
4080
4081 o = op_append_elem(OP_LINESEQ, o,
4082 newPADxVOP(OP_PADSV, 0, padix));
4083
4084 =cut
4085 */
4086
4087 PERL_STATIC_INLINE OP *
Perl_newPADxVOP(pTHX_ I32 type,I32 flags,PADOFFSET padix)4088 Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix)
4089 {
4090 PERL_ARGS_ASSERT_NEWPADXVOP;
4091
4092 assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV
4093 || type == OP_PADCV);
4094 OP *o = newOP(type, flags);
4095 o->op_targ = padix;
4096 return o;
4097 }
4098
4099 /* ------------------ util.h ------------------------------------------- */
4100
4101 /*
4102 =for apidoc_section $string
4103
4104 =for apidoc foldEQ
4105
4106 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
4107 same
4108 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
4109 match themselves and their opposite case counterparts. Non-cased and non-ASCII
4110 range bytes match only themselves.
4111
4112 =cut
4113 */
4114
4115 PERL_STATIC_INLINE I32
Perl_foldEQ(pTHX_ const char * s1,const char * s2,I32 len)4116 Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len)
4117 {
4118 PERL_UNUSED_CONTEXT;
4119
4120 const U8 *a = (const U8 *)s1;
4121 const U8 *b = (const U8 *)s2;
4122
4123 PERL_ARGS_ASSERT_FOLDEQ;
4124
4125 assert(len >= 0);
4126
4127 while (len--) {
4128 if (*a != *b && *a != PL_fold[*b])
4129 return 0;
4130 a++,b++;
4131 }
4132 return 1;
4133 }
4134
4135 PERL_STATIC_INLINE I32
Perl_foldEQ_latin1(pTHX_ const char * s1,const char * s2,I32 len)4136 Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len)
4137 {
4138 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
4139 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
4140 * does not check for this. Nor does it check that the strings each have
4141 * at least 'len' characters. */
4142
4143 PERL_UNUSED_CONTEXT;
4144
4145 const U8 *a = (const U8 *)s1;
4146 const U8 *b = (const U8 *)s2;
4147
4148 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
4149
4150 assert(len >= 0);
4151
4152 while (len--) {
4153 if (*a != *b && *a != PL_fold_latin1[*b]) {
4154 return 0;
4155 }
4156 a++, b++;
4157 }
4158 return 1;
4159 }
4160
4161 /*
4162 =for apidoc_section $locale
4163 =for apidoc foldEQ_locale
4164
4165 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
4166 same case-insensitively in the current locale; false otherwise.
4167
4168 =cut
4169 */
4170
4171 PERL_STATIC_INLINE I32
Perl_foldEQ_locale(pTHX_ const char * s1,const char * s2,I32 len)4172 Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len)
4173 {
4174 const U8 *a = (const U8 *)s1;
4175 const U8 *b = (const U8 *)s2;
4176
4177 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
4178
4179 assert(len >= 0);
4180
4181 while (len--) {
4182 if (*a != *b && *a != PL_fold_locale[*b]) {
4183 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4184 "%s:%d: Our records indicate %02x is not a fold of %02x"
4185 " or its mate %02x\n",
4186 __FILE__, __LINE__, *a, *b, PL_fold_locale[*b]));
4187
4188 return 0;
4189 }
4190 a++,b++;
4191 }
4192 return 1;
4193 }
4194
4195 /*
4196 =for apidoc_section $string
4197 =for apidoc my_strnlen
4198
4199 The C library C<strnlen> if available, or a Perl implementation of it.
4200
4201 C<my_strnlen()> computes the length of the string, up to C<maxlen>
4202 bytes. It will never attempt to address more than C<maxlen>
4203 bytes, making it suitable for use with strings that are not
4204 guaranteed to be NUL-terminated.
4205
4206 =cut
4207
4208 Description stolen from http://man.openbsd.org/strnlen.3,
4209 implementation stolen from PostgreSQL.
4210 */
4211 #ifndef HAS_STRNLEN
4212
4213 PERL_STATIC_INLINE Size_t
Perl_my_strnlen(const char * str,Size_t maxlen)4214 Perl_my_strnlen(const char *str, Size_t maxlen)
4215 {
4216 const char *end = (char *) memchr(str, '\0', maxlen);
4217
4218 PERL_ARGS_ASSERT_MY_STRNLEN;
4219
4220 if (end == NULL) return maxlen;
4221 return end - str;
4222 }
4223
4224 #endif
4225
4226 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
4227
4228 PERL_STATIC_INLINE void *
S_my_memrchr(const char * s,const char c,const STRLEN len)4229 S_my_memrchr(const char * s, const char c, const STRLEN len)
4230 {
4231 /* memrchr(), since many platforms lack it */
4232
4233 const char * t = s + len - 1;
4234
4235 PERL_ARGS_ASSERT_MY_MEMRCHR;
4236
4237 while (t >= s) {
4238 if (*t == c) {
4239 return (void *) t;
4240 }
4241 t--;
4242 }
4243
4244 return NULL;
4245 }
4246
4247 #endif
4248
4249 PERL_STATIC_INLINE char *
Perl_mortal_getenv(const char * str)4250 Perl_mortal_getenv(const char * str)
4251 {
4252 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
4253 *
4254 * It's (mostly) thread-safe because it uses a mutex to prevent other
4255 * threads (that look at this mutex) from destroying the result before this
4256 * routine has a chance to copy the result to a place that won't be
4257 * destroyed before the caller gets a chance to handle it. That place is a
4258 * mortal SV. khw chose this over SAVEFREEPV because he is under the
4259 * impression that the SV will hang around longer under more circumstances
4260 *
4261 * The reason it isn't completely thread-safe is that other code could
4262 * simply not pay attention to the mutex. All of the Perl core uses the
4263 * mutex, but it is possible for code from, say XS, to not use this mutex,
4264 * defeating the safety.
4265 *
4266 * getenv() returns, in some implementations, a pointer to a spot in the
4267 * **environ array, which could be invalidated at any time by this or
4268 * another thread changing the environment. Other implementations copy the
4269 * **environ value to a static buffer, returning a pointer to that. That
4270 * buffer might or might not be invalidated by a getenv() call in another
4271 * thread. If it does get zapped, we need an exclusive lock. Otherwise,
4272 * many getenv() calls can safely be running simultaneously, so a
4273 * many-reader (but no simultaneous writers) lock is ok. There is a
4274 * Configure probe to see if another thread destroys the buffer, and the
4275 * mutex is defined accordingly.
4276 *
4277 * But in all cases, using the mutex prevents these problems, as long as
4278 * all code uses the same mutex.
4279 *
4280 * A complication is that this can be called during phases where the
4281 * mortalization process isn't available. These are in interpreter
4282 * destruction or early in construction. khw believes that at these times
4283 * there shouldn't be anything else going on, so plain getenv is safe AS
4284 * LONG AS the caller acts on the return before calling it again. */
4285
4286 char * ret;
4287 dTHX;
4288
4289 PERL_ARGS_ASSERT_MORTAL_GETENV;
4290
4291 /* Can't mortalize without stacks. khw believes that no other threads
4292 * should be running, so no need to lock things, and this may be during a
4293 * phase when locking isn't even available */
4294 if (UNLIKELY(PL_scopestack_ix == 0)) {
4295 return getenv(str);
4296 }
4297
4298 #ifdef PERL_MEM_LOG
4299
4300 /* A major complication arises under PERL_MEM_LOG. When that is active,
4301 * every memory allocation may result in logging, depending on the value of
4302 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for
4303 * saving ENV{foo}'s value (but before saving it), the logging code will
4304 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some
4305 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
4306 * lock a boolean mutex recursively); 3) destroying the getenv() static
4307 * buffer; or 4) destroying the temporary created by this for the copy
4308 * causes a log entry to be made which could cause a new temporary to be
4309 * created, which will need to be destroyed at some point, leading to an
4310 * infinite loop.
4311 *
4312 * The solution adopted here (after some gnashing of teeth) is to detect
4313 * the recursive calls and calls from the logger, and treat them specially.
4314 * Let's say we want to do getenv("foo"). We first find
4315 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
4316 * variable, so no temporary is required. Then we do getenv(foo), and in
4317 * the process of creating a temporary to save it, this function will be
4318 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call,
4319 * we detect that it is such a call and return our saved value instead of
4320 * locking and doing a new getenv(). This solves all of problems 1), 2),
4321 * and 3). Because all the getenv()s are done while the mutex is locked,
4322 * the state cannot have changed. To solve 4), we don't create a temporary
4323 * when this is called from the logging code. That code disposes of the
4324 * return value while the mutex is still locked.
4325 *
4326 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
4327 * digits and 3 particular letters are significant; the rest are ignored by
4328 * the memory logging code. Thus the per-interpreter variable only needs
4329 * to be large enough to save the significant information, the size of
4330 * which is known at compile time. The first byte is extra, reserved for
4331 * flags for our use. To protect against overflowing, only the reserved
4332 * byte, as many digits as don't overflow, and the three letters are
4333 * stored.
4334 *
4335 * The reserved byte has two bits:
4336 * 0x1 if set indicates that if we get here, it is a recursive call of
4337 * getenv()
4338 * 0x2 if set indicates that the call is from the logging code.
4339 *
4340 * If the flag indicates this is a recursive call, just return the stored
4341 * value of PL_mem_log; An empty value gets turned into NULL. */
4342 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
4343 if (PL_mem_log[1] == '\0') {
4344 return NULL;
4345 } else {
4346 return PL_mem_log + 1;
4347 }
4348 }
4349
4350 #endif
4351
4352 GETENV_LOCK;
4353
4354 #ifdef PERL_MEM_LOG
4355
4356 /* Here we are in a critical section. As explained above, we do our own
4357 * getenv(PERL_MEM_LOG), saving the result safely. */
4358 ret = getenv("PERL_MEM_LOG");
4359 if (ret == NULL) { /* No logging active */
4360
4361 /* Return that immediately if called from the logging code */
4362 if (PL_mem_log[0] & 0x2) {
4363 GETENV_UNLOCK;
4364 return NULL;
4365 }
4366
4367 PL_mem_log[1] = '\0';
4368 }
4369 else {
4370 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */
4371
4372 /* There is nothing to prevent the value of PERL_MEM_LOG from being an
4373 * extremely long string. But we want only a few characters from it.
4374 * PL_mem_log has been made large enough to hold just the ones we need.
4375 * First the file descriptor. */
4376 if (isDIGIT(*ret)) {
4377 const char * s = ret;
4378 if (UNLIKELY(*s == '0')) {
4379
4380 /* Reduce multiple leading zeros to a single one. This is to
4381 * allow the caller to change what to do with leading zeros. */
4382 *mem_log_meat++ = '0';
4383 s++;
4384 while (*s == '0') {
4385 s++;
4386 }
4387 }
4388
4389 /* If the input overflows, copy just enough for the result to also
4390 * overflow, plus 1 to make sure */
4391 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
4392 *mem_log_meat++ = *s++;
4393 }
4394 }
4395
4396 /* Then each of the four significant characters */
4397 if (strchr(ret, 'm')) {
4398 *mem_log_meat++ = 'm';
4399 }
4400 if (strchr(ret, 's')) {
4401 *mem_log_meat++ = 's';
4402 }
4403 if (strchr(ret, 't')) {
4404 *mem_log_meat++ = 't';
4405 }
4406 if (strchr(ret, 'c')) {
4407 *mem_log_meat++ = 'c';
4408 }
4409 *mem_log_meat = '\0';
4410
4411 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
4412 }
4413
4414 /* If we are being called from the logger, it only needs the significant
4415 * portion of PERL_MEM_LOG, and doesn't need a safe copy */
4416 if (PL_mem_log[0] & 0x2) {
4417 assert(strEQ(str, "PERL_MEM_LOG"));
4418 GETENV_UNLOCK;
4419 return PL_mem_log + 1;
4420 }
4421
4422 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that
4423 * is coming from other than the logging code, so it should be treated the
4424 * same as any other getenv(), returning the full value, not just the
4425 * significant part, and having its value saved. Set the flag that
4426 * indicates any call to this routine will be a recursion from here */
4427 PL_mem_log[0] = 0x1;
4428
4429 #endif
4430
4431 /* Now get the value of the real desired variable, and save a copy */
4432 ret = getenv(str);
4433
4434 if (ret != NULL) {
4435 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
4436 }
4437
4438 GETENV_UNLOCK;
4439
4440 #ifdef PERL_MEM_LOG
4441
4442 /* Clear the buffer */
4443 Zero(PL_mem_log, sizeof(PL_mem_log), char);
4444
4445 #endif
4446
4447 return ret;
4448 }
4449
4450 PERL_STATIC_INLINE bool
Perl_sv_isbool(pTHX_ const SV * sv)4451 Perl_sv_isbool(pTHX_ const SV *sv)
4452 {
4453 PERL_UNUSED_CONTEXT;
4454 return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv);
4455 }
4456
4457 #ifdef USE_ITHREADS
4458
4459 PERL_STATIC_INLINE AV *
Perl_cop_file_avn(pTHX_ const COP * cop)4460 Perl_cop_file_avn(pTHX_ const COP *cop) {
4461
4462 PERL_ARGS_ASSERT_COP_FILE_AVN;
4463
4464 const char *file = CopFILE(cop);
4465 if (file) {
4466 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
4467 if (gv) {
4468 return GvAVn(gv);
4469 }
4470 else
4471 return NULL;
4472 }
4473 else
4474 return NULL;
4475 }
4476
4477 #endif
4478
4479 PERL_STATIC_INLINE PADNAME *
Perl_padname_refcnt_inc(PADNAME * pn)4480 Perl_padname_refcnt_inc(PADNAME *pn)
4481 {
4482 PadnameREFCNT(pn)++;
4483 return pn;
4484 }
4485
4486 PERL_STATIC_INLINE PADNAMELIST *
Perl_padnamelist_refcnt_inc(PADNAMELIST * pnl)4487 Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl)
4488 {
4489 PadnamelistREFCNT(pnl)++;
4490 return pnl;
4491 }
4492
4493 /* copy a string to a safe spot */
4494
4495 /*
4496 =for apidoc_section $string
4497 =for apidoc savepv
4498
4499 Perl's version of C<strdup()>. Returns a pointer to a newly allocated
4500 string which is a duplicate of C<pv>. The size of the string is
4501 determined by C<strlen()>, which means it may not contain embedded C<NUL>
4502 characters and must have a trailing C<NUL>. To prevent memory leaks, the
4503 memory allocated for the new string needs to be freed when no longer needed.
4504 This can be done with the C<L</Safefree>> function, or
4505 L<C<SAVEFREEPV>|perlguts/SAVEFREEPV(p)>.
4506
4507 On some platforms, Windows for example, all allocated memory owned by a thread
4508 is deallocated when that thread ends. So if you need that not to happen, you
4509 need to use the shared memory functions, such as C<L</savesharedpv>>.
4510
4511 =cut
4512 */
4513
4514 PERL_STATIC_INLINE char *
Perl_savepv(pTHX_ const char * pv)4515 Perl_savepv(pTHX_ const char *pv)
4516 {
4517 PERL_UNUSED_CONTEXT;
4518 if (!pv)
4519 return NULL;
4520 else {
4521 char *newaddr;
4522 const STRLEN pvlen = strlen(pv)+1;
4523 Newx(newaddr, pvlen, char);
4524 return (char*)memcpy(newaddr, pv, pvlen);
4525 }
4526 }
4527
4528 /* same thing but with a known length */
4529
4530 /*
4531 =for apidoc savepvn
4532
4533 Perl's version of what C<strndup()> would be if it existed. Returns a
4534 pointer to a newly allocated string which is a duplicate of the first
4535 C<len> bytes from C<pv>, plus a trailing
4536 C<NUL> byte. The memory allocated for
4537 the new string can be freed with the C<Safefree()> function.
4538
4539 On some platforms, Windows for example, all allocated memory owned by a thread
4540 is deallocated when that thread ends. So if you need that not to happen, you
4541 need to use the shared memory functions, such as C<L</savesharedpvn>>.
4542
4543 =cut
4544 */
4545
4546 PERL_STATIC_INLINE char *
Perl_savepvn(pTHX_ const char * pv,Size_t len)4547 Perl_savepvn(pTHX_ const char *pv, Size_t len)
4548 {
4549 char *newaddr;
4550 PERL_UNUSED_CONTEXT;
4551
4552 Newx(newaddr,len+1,char);
4553 /* Give a meaning to NULL pointer mainly for the use in sv_magic() */
4554 if (pv) {
4555 /* might not be null terminated */
4556 newaddr[len] = '\0';
4557 return (char *) CopyD(pv,newaddr,len,char);
4558 }
4559 else {
4560 return (char *) ZeroD(newaddr,len+1,char);
4561 }
4562 }
4563
4564 /*
4565 =for apidoc savesvpv
4566
4567 A version of C<savepv()>/C<savepvn()> which gets the string to duplicate from
4568 the passed in SV using C<SvPV()>
4569
4570 On some platforms, Windows for example, all allocated memory owned by a thread
4571 is deallocated when that thread ends. So if you need that not to happen, you
4572 need to use the shared memory functions, such as C<L</savesharedsvpv>>.
4573
4574 =cut
4575 */
4576
4577 PERL_STATIC_INLINE char *
Perl_savesvpv(pTHX_ SV * sv)4578 Perl_savesvpv(pTHX_ SV *sv)
4579 {
4580 STRLEN len;
4581 const char * const pv = SvPV_const(sv, len);
4582 char *newaddr;
4583
4584 PERL_ARGS_ASSERT_SAVESVPV;
4585
4586 ++len;
4587 Newx(newaddr,len,char);
4588 return (char *) CopyD(pv,newaddr,len,char);
4589 }
4590
4591 /*
4592 =for apidoc savesharedsvpv
4593
4594 A version of C<savesharedpv()> which allocates the duplicate string in
4595 memory which is shared between threads.
4596
4597 =cut
4598 */
4599
4600 PERL_STATIC_INLINE char *
Perl_savesharedsvpv(pTHX_ SV * sv)4601 Perl_savesharedsvpv(pTHX_ SV *sv)
4602 {
4603 STRLEN len;
4604 const char * const pv = SvPV_const(sv, len);
4605
4606 PERL_ARGS_ASSERT_SAVESHAREDSVPV;
4607
4608 return savesharedpvn(pv, len);
4609 }
4610
4611 #ifndef PERL_GET_CONTEXT_DEFINED
4612
4613 /*
4614 =for apidoc_section $embedding
4615 =for apidoc get_context
4616
4617 Implements L<perlapi/C<PERL_GET_CONTEXT>>, which you should use instead.
4618
4619 =cut
4620 */
4621
4622 PERL_STATIC_INLINE void *
Perl_get_context(void)4623 Perl_get_context(void)
4624 {
4625 # if defined(USE_ITHREADS)
4626 # ifdef OLD_PTHREADS_API
4627 pthread_addr_t t;
4628 int error = pthread_getspecific(PL_thr_key, &t);
4629 if (error)
4630 Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error);
4631 return (void*)t;
4632 # elif defined(I_MACH_CTHREADS)
4633 return (void*)cthread_data(cthread_self());
4634 # else
4635 return (void*)PTHREAD_GETSPECIFIC(PL_thr_key);
4636 # endif
4637 # else
4638 return (void*)NULL;
4639 # endif
4640 }
4641
4642 #endif
4643
4644 PERL_STATIC_INLINE MGVTBL*
Perl_get_vtbl(pTHX_ int vtbl_id)4645 Perl_get_vtbl(pTHX_ int vtbl_id)
4646 {
4647 PERL_UNUSED_CONTEXT;
4648
4649 return (vtbl_id < 0 || vtbl_id >= magic_vtable_max)
4650 ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id;
4651 }
4652
4653 /*
4654 =for apidoc my_strlcat
4655
4656 The C library C<strlcat> if available, or a Perl implementation of it.
4657 This operates on C C<NUL>-terminated strings.
4658
4659 C<my_strlcat()> appends string C<src> to the end of C<dst>. It will append at
4660 most S<C<size - strlen(dst) - 1>> bytes. It will then C<NUL>-terminate,
4661 unless C<size> is 0 or the original C<dst> string was longer than C<size> (in
4662 practice this should not happen as it means that either C<size> is incorrect or
4663 that C<dst> is not a proper C<NUL>-terminated string).
4664
4665 Note that C<size> is the full size of the destination buffer and
4666 the result is guaranteed to be C<NUL>-terminated if there is room. Note that
4667 room for the C<NUL> should be included in C<size>.
4668
4669 The return value is the total length that C<dst> would have if C<size> is
4670 sufficiently large. Thus it is the initial length of C<dst> plus the length of
4671 C<src>. If C<size> is smaller than the return, the excess was not appended.
4672
4673 =cut
4674
4675 Description stolen from http://man.openbsd.org/strlcat.3
4676 */
4677 #ifndef HAS_STRLCAT
4678 PERL_STATIC_INLINE Size_t
Perl_my_strlcat(char * dst,const char * src,Size_t size)4679 Perl_my_strlcat(char *dst, const char *src, Size_t size)
4680 {
4681 Size_t used, length, copy;
4682
4683 used = strlen(dst);
4684 length = strlen(src);
4685 if (size > 0 && used < size - 1) {
4686 copy = (length >= size - used) ? size - used - 1 : length;
4687 memcpy(dst + used, src, copy);
4688 dst[used + copy] = '\0';
4689 }
4690 return used + length;
4691 }
4692 #endif
4693
4694
4695 /*
4696 =for apidoc my_strlcpy
4697
4698 The C library C<strlcpy> if available, or a Perl implementation of it.
4699 This operates on C C<NUL>-terminated strings.
4700
4701 C<my_strlcpy()> copies up to S<C<size - 1>> bytes from the string C<src>
4702 to C<dst>, C<NUL>-terminating the result if C<size> is not 0.
4703
4704 The return value is the total length C<src> would be if the copy completely
4705 succeeded. If it is larger than C<size>, the excess was not copied.
4706
4707 =cut
4708
4709 Description stolen from http://man.openbsd.org/strlcpy.3
4710 */
4711 #ifndef HAS_STRLCPY
4712 PERL_STATIC_INLINE Size_t
Perl_my_strlcpy(char * dst,const char * src,Size_t size)4713 Perl_my_strlcpy(char *dst, const char *src, Size_t size)
4714 {
4715 Size_t length, copy;
4716
4717 length = strlen(src);
4718 if (size > 0) {
4719 copy = (length >= size) ? size - 1 : length;
4720 memcpy(dst, src, copy);
4721 dst[copy] = '\0';
4722 }
4723 return length;
4724 }
4725 #endif
4726
4727 /*
4728 * ex: set ts=8 sts=4 sw=4 et:
4729 */
4730