xref: /openbsd/gnu/usr.bin/perl/inline.h (revision 3d61058a)
1 /*    inline.h
2  *
3  *    Copyright (C) 2012 by Larry Wall and others
4  *
5  *    You may distribute under the terms of either the GNU General Public
6  *    License or the Artistic License, as specified in the README file.
7  *
8  *    This file contains tables and code adapted from
9  *    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10  *    copyright notice:
11 
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13 
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
20 
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
23 
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 SOFTWARE.
31 
32  *
33  * This file is a home for static inline functions that cannot go in other
34  * header files, because they depend on proto.h (included after most other
35  * headers) or struct definitions.
36  *
37  * Note also perlstatic.h for functions that can't or shouldn't be inlined, but
38  * whose details should be exposed to the compiler, for such things as tail
39  * call optimization.
40  *
41  * Each section names the header file that the functions "belong" to.
42  */
43 
44 /* ------------------------------- av.h ------------------------------- */
45 
46 /*
47 =for apidoc_section $AV
48 =for apidoc av_count
49 Returns the number of elements in the array C<av>.  This is the true length of
50 the array, including any undefined elements.  It is always the same as
51 S<C<av_top_index(av) + 1>>.
52 
53 =cut
54 */
55 PERL_STATIC_INLINE Size_t
Perl_av_count(pTHX_ AV * av)56 Perl_av_count(pTHX_ AV *av)
57 {
58     PERL_ARGS_ASSERT_AV_COUNT;
59     assert(SvTYPE(av) == SVt_PVAV);
60 
61     return AvFILL(av) + 1;
62 }
63 
64 /* ------------------------------- av.c ------------------------------- */
65 
66 /*
67 =for apidoc av_store_simple
68 
69 This is a cut-down version of av_store that assumes that the array is
70 very straightforward - no magic, not readonly, and AvREAL - and that
71 C<key> is not negative. This function MUST NOT be used in situations
72 where any of those assumptions may not hold.
73 
74 Stores an SV in an array.  The array index is specified as C<key>. It
75 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
76 
77 Note that the caller is responsible for suitably incrementing the reference
78 count of C<val> before the call.
79 
80 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
81 
82 =cut
83 */
84 
85 PERL_STATIC_INLINE SV**
Perl_av_store_simple(pTHX_ AV * av,SSize_t key,SV * val)86 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
87 {
88     SV** ary;
89 
90     PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
91     assert(SvTYPE(av) == SVt_PVAV);
92     assert(!SvMAGICAL(av));
93     assert(!SvREADONLY(av));
94     assert(AvREAL(av));
95     assert(key > -1);
96 
97     ary = AvARRAY(av);
98 
99     if (AvFILLp(av) < key) {
100         if (key > AvMAX(av)) {
101             av_extend(av,key);
102             ary = AvARRAY(av);
103         }
104         AvFILLp(av) = key;
105     } else
106         SvREFCNT_dec(ary[key]);
107 
108     ary[key] = val;
109     return &ary[key];
110 }
111 
112 /*
113 =for apidoc av_fetch_simple
114 
115 This is a cut-down version of av_fetch that assumes that the array is
116 very straightforward - no magic, not readonly, and AvREAL - and that
117 C<key> is not negative. This function MUST NOT be used in situations
118 where any of those assumptions may not hold.
119 
120 Returns the SV at the specified index in the array.  The C<key> is the
121 index.  If lval is true, you are guaranteed to get a real SV back (in case
122 it wasn't real before), which you can then modify.  Check that the return
123 value is non-null before dereferencing it to a C<SV*>.
124 
125 The rough perl equivalent is C<$myarray[$key]>.
126 
127 =cut
128 */
129 
130 PERL_STATIC_INLINE SV**
Perl_av_fetch_simple(pTHX_ AV * av,SSize_t key,I32 lval)131 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
132 {
133     PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
134     assert(SvTYPE(av) == SVt_PVAV);
135     assert(!SvMAGICAL(av));
136     assert(!SvREADONLY(av));
137     assert(AvREAL(av));
138     assert(key > -1);
139 
140     if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
141         return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
142     } else {
143         return &AvARRAY(av)[key];
144     }
145 }
146 
147 /*
148 =for apidoc av_push_simple
149 
150 This is a cut-down version of av_push that assumes that the array is very
151 straightforward - no magic, not readonly, and AvREAL - and that C<key> is
152 not less than -1. This function MUST NOT be used in situations where any
153 of those assumptions may not hold.
154 
155 Pushes an SV (transferring control of one reference count) onto the end of the
156 array.  The array will grow automatically to accommodate the addition.
157 
158 Perl equivalent: C<push @myarray, $val;>.
159 
160 =cut
161 */
162 
163 PERL_STATIC_INLINE void
Perl_av_push_simple(pTHX_ AV * av,SV * val)164 Perl_av_push_simple(pTHX_ AV *av, SV *val)
165 {
166     PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
167     assert(SvTYPE(av) == SVt_PVAV);
168     assert(!SvMAGICAL(av));
169     assert(!SvREADONLY(av));
170     assert(AvREAL(av));
171     assert(AvFILLp(av) > -2);
172 
173     (void)av_store_simple(av,AvFILLp(av)+1,val);
174 }
175 
176 /*
177 =for apidoc av_new_alloc
178 
179 This implements L<perlapi/C<newAV_alloc_x>>
180 and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this
181 functionality.
182 
183 Creates a new AV and allocates its SV* array.
184 
185 This is similar to, but more efficient than doing:
186 
187     AV *av = newAV();
188     av_extend(av, key);
189 
190 The size parameter is used to pre-allocate a SV* array large enough to
191 hold at least elements C<0..(size-1)>.  C<size> must be at least 1.
192 
193 The C<zeroflag> parameter controls whether or not the array is NULL
194 initialized.
195 
196 =cut
197 */
198 
199 PERL_STATIC_INLINE AV *
Perl_av_new_alloc(pTHX_ SSize_t size,bool zeroflag)200 Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag)
201 {
202     AV * const av = newAV();
203     SV** ary;
204     PERL_ARGS_ASSERT_AV_NEW_ALLOC;
205     assert(size > 0);
206 
207     Newx(ary, size, SV*); /* Newx performs the memwrap check */
208     AvALLOC(av) = ary;
209     AvARRAY(av) = ary;
210     AvMAX(av) = size - 1;
211 
212     if (zeroflag)
213         Zero(ary, size, SV*);
214 
215     return av;
216 }
217 
218 
219 /* remove (AvARRAY(av) - AvALLOC(av)) offset from empty array */
220 
221 PERL_STATIC_INLINE void
Perl_av_remove_offset(pTHX_ AV * av)222 Perl_av_remove_offset(pTHX_ AV *av)
223 {
224     PERL_ARGS_ASSERT_AV_REMOVE_OFFSET;
225     assert(AvFILLp(av) == -1);
226     SSize_t i = AvARRAY(av) - AvALLOC(av);
227     if (i) {
228         AvARRAY(av) = AvALLOC(av);
229         AvMAX(av)   += i;
230 #ifdef PERL_RC_STACK
231         Zero(AvALLOC(av), i, SV*);
232 #endif
233     }
234 }
235 
236 
237 /* ------------------------------- cv.h ------------------------------- */
238 
239 /*
240 =for apidoc_section $CV
241 =for apidoc CvGV
242 Returns the GV associated with the CV C<sv>, reifying it if necessary.
243 
244 =cut
245 */
246 PERL_STATIC_INLINE GV *
Perl_CvGV(pTHX_ CV * sv)247 Perl_CvGV(pTHX_ CV *sv)
248 {
249     PERL_ARGS_ASSERT_CVGV;
250 
251     return CvNAMED(sv)
252         ? Perl_cvgv_from_hek(aTHX_ sv)
253         : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
254 }
255 
256 /*
257 =for apidoc CvDEPTH
258 Returns the recursion level of the CV C<sv>.  Hence >= 2 indicates we are in a
259 recursive call.
260 
261 =cut
262 */
263 PERL_STATIC_INLINE I32 *
Perl_CvDEPTH(const CV * const sv)264 Perl_CvDEPTH(const CV * const sv)
265 {
266     PERL_ARGS_ASSERT_CVDEPTH;
267     assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
268 
269     return &((XPVCV*)SvANY(sv))->xcv_depth;
270 }
271 
272 /*
273  CvPROTO returns the prototype as stored, which is not necessarily what
274  the interpreter should be using. Specifically, the interpreter assumes
275  that spaces have been stripped, which has been the case if the prototype
276  was added by toke.c, but is generally not the case if it was added elsewhere.
277  Since we can't enforce the spacelessness at assignment time, this routine
278  provides a temporary copy at parse time with spaces removed.
279  I<orig> is the start of the original buffer, I<len> is the length of the
280  prototype and will be updated when this returns.
281  */
282 
283 #ifdef PERL_CORE
284 PERL_STATIC_INLINE char *
S_strip_spaces(pTHX_ const char * orig,STRLEN * const len)285 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
286 {
287     SV * tmpsv;
288     char * tmps;
289     tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
290     tmps = SvPVX(tmpsv);
291     while ((*len)--) {
292         if (!isSPACE(*orig))
293             *tmps++ = *orig;
294         orig++;
295     }
296     *tmps = '\0';
297     *len = tmps - SvPVX(tmpsv);
298                 return SvPVX(tmpsv);
299 }
300 #endif
301 
302 /* ------------------------------- iperlsys.h ------------------------------- */
303 #if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS)
304 
305 /* Otherwise this function is implemented as macros in iperlsys.h */
306 
307 PERL_STATIC_INLINE bool
S_PerlEnv_putenv(pTHX_ char * str)308 S_PerlEnv_putenv(pTHX_ char * str)
309 {
310     PERL_ARGS_ASSERT_PERLENV_PUTENV;
311 
312     ENV_LOCK;
313     bool retval = putenv(str);
314     ENV_UNLOCK;
315 
316     return retval;
317 }
318 
319 #endif
320 
321 /* ------------------------------- mg.h ------------------------------- */
322 
323 #if defined(PERL_CORE) || defined(PERL_EXT)
324 /* assumes get-magic and stringification have already occurred */
325 PERL_STATIC_INLINE STRLEN
S_MgBYTEPOS(pTHX_ MAGIC * mg,SV * sv,const char * s,STRLEN len)326 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
327 {
328     assert(mg->mg_type == PERL_MAGIC_regex_global);
329     assert(mg->mg_len != -1);
330     if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
331         return (STRLEN)mg->mg_len;
332     else {
333         const STRLEN pos = (STRLEN)mg->mg_len;
334         /* Without this check, we may read past the end of the buffer: */
335         if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
336         return sv_or_pv_pos_u2b(sv, s, pos, NULL);
337     }
338 }
339 #endif
340 
341 /* ------------------------------- pad.h ------------------------------ */
342 
343 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
344 PERL_STATIC_INLINE bool
S_PadnameIN_SCOPE(const PADNAME * const pn,const U32 seq)345 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
346 {
347     PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
348 
349     /* is seq within the range _LOW to _HIGH ?
350      * This is complicated by the fact that PL_cop_seqmax
351      * may have wrapped around at some point */
352     if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
353         return FALSE; /* not yet introduced */
354 
355     if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
356     /* in compiling scope */
357         if (
358             (seq >  COP_SEQ_RANGE_LOW(pn))
359             ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
360             : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
361         )
362             return TRUE;
363     }
364     else if (
365         (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
366         ?
367             (  seq >  COP_SEQ_RANGE_LOW(pn)
368             || seq <= COP_SEQ_RANGE_HIGH(pn))
369 
370         :    (  seq >  COP_SEQ_RANGE_LOW(pn)
371              && seq <= COP_SEQ_RANGE_HIGH(pn))
372     )
373         return TRUE;
374     return FALSE;
375 }
376 #endif
377 
378 /* ------------------------------- pp.h ------------------------------- */
379 
380 PERL_STATIC_INLINE Stack_off_t
Perl_TOPMARK(pTHX)381 Perl_TOPMARK(pTHX)
382 {
383     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
384                                  "MARK top  %p %" IVdf "\n",
385                                   PL_markstack_ptr,
386                                   (IV)*PL_markstack_ptr)));
387     return *PL_markstack_ptr;
388 }
389 
390 PERL_STATIC_INLINE Stack_off_t
Perl_POPMARK(pTHX)391 Perl_POPMARK(pTHX)
392 {
393     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
394                                  "MARK pop  %p %" IVdf "\n",
395                                   (PL_markstack_ptr-1),
396                                   (IV)*(PL_markstack_ptr-1))));
397     assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
398     return *PL_markstack_ptr--;
399 }
400 
401 /*
402 =for apidoc_section $rpp
403 
404 =for apidoc rpp_extend
405 Ensures that there is space on the stack to push C<n> items, extending it
406 if necessary.
407 
408 =cut
409 */
410 
411 PERL_STATIC_INLINE void
Perl_rpp_extend(pTHX_ SSize_t n)412 Perl_rpp_extend(pTHX_ SSize_t n)
413 {
414     PERL_ARGS_ASSERT_RPP_EXTEND;
415 
416     EXTEND_HWM_SET(PL_stack_sp, n);
417 #ifndef STRESS_REALLOC
418     if (UNLIKELY(_EXTEND_NEEDS_GROW(PL_stack_sp, n)))
419 #endif
420     {
421         (void)stack_grow(PL_stack_sp, PL_stack_sp, n);
422     }
423 }
424 
425 
426 /*
427 =for apidoc rpp_popfree_to
428 
429 Pop and free all items on the argument stack above C<sp>. On return,
430 C<PL_stack_sp> will be equal to C<sp>.
431 
432 =cut
433 */
434 
435 PERL_STATIC_INLINE void
Perl_rpp_popfree_to(pTHX_ SV ** sp)436 Perl_rpp_popfree_to(pTHX_ SV **sp)
437 {
438     PERL_ARGS_ASSERT_RPP_POPFREE_TO;
439 
440     assert(sp <= PL_stack_sp);
441 #ifdef PERL_RC_STACK
442     assert(rpp_stack_is_rc());
443     while (PL_stack_sp > sp) {
444         SV *sv = *PL_stack_sp--;
445         SvREFCNT_dec(sv);
446     }
447 #else
448     PL_stack_sp = sp;
449 #endif
450 }
451 
452 
453 /*
454 =for apidoc rpp_popfree_to_NN
455 
456 A variant of rpp_popfree_to() which assumes that all the pointers being
457 popped off the stack are non-NULL.
458 
459 =cut
460 */
461 
462 PERL_STATIC_INLINE void
Perl_rpp_popfree_to_NN(pTHX_ SV ** sp)463 Perl_rpp_popfree_to_NN(pTHX_ SV **sp)
464 {
465     PERL_ARGS_ASSERT_RPP_POPFREE_TO_NN;
466 
467     assert(sp <= PL_stack_sp);
468 #ifdef PERL_RC_STACK
469     assert(rpp_stack_is_rc());
470     while (PL_stack_sp > sp) {
471         SV *sv = *PL_stack_sp--;
472         assert(sv);
473         SvREFCNT_dec_NN(sv);
474     }
475 #else
476     PL_stack_sp = sp;
477 #endif
478 }
479 
480 
481 /*
482 =for apidoc rpp_popfree_1
483 
484 Pop and free the top item on the argument stack and update C<PL_stack_sp>.
485 
486 =cut
487 */
488 
489 PERL_STATIC_INLINE void
Perl_rpp_popfree_1(pTHX)490 Perl_rpp_popfree_1(pTHX)
491 {
492     PERL_ARGS_ASSERT_RPP_POPFREE_1;
493 
494 #ifdef PERL_RC_STACK
495     assert(rpp_stack_is_rc());
496     SV *sv = *PL_stack_sp--;
497     SvREFCNT_dec(sv);
498 #else
499     PL_stack_sp--;
500 #endif
501 }
502 
503 
504 /*
505 =for apidoc rpp_popfree_1_NN
506 
507 A variant of rpp_popfree_1() which assumes that the pointer being popped
508 off the stack is non-NULL.
509 
510 =cut
511 */
512 
513 PERL_STATIC_INLINE void
Perl_rpp_popfree_1_NN(pTHX)514 Perl_rpp_popfree_1_NN(pTHX)
515 {
516     PERL_ARGS_ASSERT_RPP_POPFREE_1_NN;
517 
518     assert(*PL_stack_sp);
519 #ifdef PERL_RC_STACK
520     assert(rpp_stack_is_rc());
521     SV *sv = *PL_stack_sp--;
522     SvREFCNT_dec_NN(sv);
523 #else
524     PL_stack_sp--;
525 #endif
526 }
527 
528 
529 /*
530 =for apidoc rpp_popfree_2
531 
532 Pop and free the top two items on the argument stack and update
533 C<PL_stack_sp>.
534 
535 =cut
536 */
537 
538 
539 PERL_STATIC_INLINE void
Perl_rpp_popfree_2(pTHX)540 Perl_rpp_popfree_2(pTHX)
541 {
542     PERL_ARGS_ASSERT_RPP_POPFREE_2;
543 
544 #ifdef PERL_RC_STACK
545     assert(rpp_stack_is_rc());
546     for (int i = 0; i < 2; i++) {
547         SV *sv = *PL_stack_sp--;
548         SvREFCNT_dec(sv);
549     }
550 #else
551     PL_stack_sp -= 2;
552 #endif
553 }
554 
555 
556 /*
557 =for apidoc rpp_popfree_2_NN
558 
559 A variant of rpp_popfree_2() which assumes that the two pointers being
560 popped off the stack are non-NULL.
561 
562 =cut
563 */
564 
565 
566 PERL_STATIC_INLINE void
Perl_rpp_popfree_2_NN(pTHX)567 Perl_rpp_popfree_2_NN(pTHX)
568 {
569     PERL_ARGS_ASSERT_RPP_POPFREE_2_NN;
570 #ifdef PERL_RC_STACK
571     SV *sv2 = *PL_stack_sp--;
572     assert(sv2);
573     SV *sv1 = *PL_stack_sp;
574     assert(sv1);
575 
576     assert(rpp_stack_is_rc());
577     U32 rc1 = SvREFCNT(sv1);
578     U32 rc2 = SvREFCNT(sv2);
579     /* This expression is intended to be true if either of rc1 or rc2 has
580      * the value 0 or 1, but using only a single branch test, rather
581      * than the two branches that a compiler would plant for a boolean
582      * expression. We are working on the assumption that, most of the
583      * time, neither of the args to a binary function will need to be
584      * freed - they're likely to lex vars, or PADTMPs or whatever.
585      * So give the CPU a single branch that is rarely taken. */
586     if (UNLIKELY( !(rc1>>1) + !(rc2>>1) ))
587         /* at least one of the old SVs needs freeing. Do it the long way */
588         Perl_rpp_free_2_(aTHX_ sv1, sv2, rc1, rc2);
589     else {
590         SvREFCNT(sv1) = rc1 - 1;
591         SvREFCNT(sv2) = rc2 - 1;
592     }
593     PL_stack_sp--;
594 #else
595     PL_stack_sp -= 2;
596 #endif
597 }
598 
599 
600 /*
601 =for apidoc rpp_pop_1_norc
602 
603 Pop and return the top item off the argument stack and update
604 C<PL_stack_sp>. It's similar to rpp_popfree_1(), except that it actually
605 returns a value, and it I<doesn't> decrement the SV's reference count.
606 On non-C<PERL_RC_STACK> builds it actually increments the SV's reference
607 count.
608 
609 This is useful in cases where the popped value is immediately embedded
610 somewhere e.g. via av_store(), allowing you skip decrementing and then
611 immediately incrementing the reference count again (and risk prematurely
612 freeing the SV if it had a RC of 1). On non-RC builds, the reference count
613 bookkeeping still works too, which is why it should be used rather than
614 a simple C<*PL_stack_sp-->.
615 
616 =cut
617 */
618 
619 PERL_STATIC_INLINE SV*
Perl_rpp_pop_1_norc(pTHX)620 Perl_rpp_pop_1_norc(pTHX)
621 {
622     PERL_ARGS_ASSERT_RPP_POP_1_NORC
623 
624     SV *sv = *PL_stack_sp--;
625 
626 #ifndef PERL_RC_STACK
627     SvREFCNT_inc(sv);
628 #else
629     assert(rpp_stack_is_rc());
630 #endif
631     return sv;
632 }
633 
634 
635 
636 /*
637 =for apidoc      rpp_push_1
638 =for apidoc_item rpp_push_IMM
639 =for apidoc_item rpp_push_2
640 =for apidoc_item rpp_xpush_1
641 =for apidoc_item rpp_xpush_IMM
642 =for apidoc_item rpp_xpush_2
643 
644 Push one or two SVs onto the stack, incrementing their reference counts
645 and updating C<PL_stack_sp>. With the C<x> variants, it extends the stack
646 first. The C<IMM> variants assume that the single argument is an immortal
647 such as <&PL_sv_undef> and, for efficiency, will skip incrementing its
648 reference count.
649 
650 =cut
651 */
652 
653 PERL_STATIC_INLINE void
Perl_rpp_push_1(pTHX_ SV * sv)654 Perl_rpp_push_1(pTHX_ SV *sv)
655 {
656     PERL_ARGS_ASSERT_RPP_PUSH_1;
657 
658     *++PL_stack_sp = sv;
659 #ifdef PERL_RC_STACK
660     assert(rpp_stack_is_rc());
661     SvREFCNT_inc_simple_void_NN(sv);
662 #endif
663 }
664 
665 PERL_STATIC_INLINE void
Perl_rpp_push_IMM(pTHX_ SV * sv)666 Perl_rpp_push_IMM(pTHX_ SV *sv)
667 {
668     PERL_ARGS_ASSERT_RPP_PUSH_IMM;
669 
670     assert(SvIMMORTAL(sv));
671     *++PL_stack_sp = sv;
672 #ifdef PERL_RC_STACK
673     assert(rpp_stack_is_rc());
674 #endif
675 }
676 
677 PERL_STATIC_INLINE void
Perl_rpp_push_2(pTHX_ SV * sv1,SV * sv2)678 Perl_rpp_push_2(pTHX_ SV *sv1, SV *sv2)
679 {
680     PERL_ARGS_ASSERT_RPP_PUSH_2;
681 
682     *++PL_stack_sp = sv1;
683     *++PL_stack_sp = sv2;
684 #ifdef PERL_RC_STACK
685     assert(rpp_stack_is_rc());
686     SvREFCNT_inc_simple_void_NN(sv1);
687     SvREFCNT_inc_simple_void_NN(sv2);
688 #endif
689 }
690 
691 PERL_STATIC_INLINE void
Perl_rpp_xpush_1(pTHX_ SV * sv)692 Perl_rpp_xpush_1(pTHX_ SV *sv)
693 {
694     PERL_ARGS_ASSERT_RPP_XPUSH_1;
695 
696     rpp_extend(1);
697     rpp_push_1(sv);
698 }
699 
700 PERL_STATIC_INLINE void
Perl_rpp_xpush_IMM(pTHX_ SV * sv)701 Perl_rpp_xpush_IMM(pTHX_ SV *sv)
702 {
703     PERL_ARGS_ASSERT_RPP_XPUSH_IMM;
704 
705     rpp_extend(1);
706     rpp_push_IMM(sv);
707 }
708 
709 PERL_STATIC_INLINE void
Perl_rpp_xpush_2(pTHX_ SV * sv1,SV * sv2)710 Perl_rpp_xpush_2(pTHX_ SV *sv1, SV *sv2)
711 {
712     PERL_ARGS_ASSERT_RPP_XPUSH_2;
713 
714     rpp_extend(2);
715     rpp_push_2(sv1, sv2);
716 }
717 
718 
719 /*
720 =for apidoc rpp_push_1_norc
721 
722 Push C<sv> onto the stack without incrementing its reference count, and
723 update C<PL_stack_sp>. On non-PERL_RC_STACK builds, mortalise too.
724 
725 This is most useful where an SV has just been created and already has a
726 reference count of 1, but has not yet been anchored anywhere.
727 
728 =cut
729 */
730 
731 PERL_STATIC_INLINE void
Perl_rpp_push_1_norc(pTHX_ SV * sv)732 Perl_rpp_push_1_norc(pTHX_ SV *sv)
733 {
734     PERL_ARGS_ASSERT_RPP_PUSH_1;
735 
736     *++PL_stack_sp = sv;
737 #ifdef PERL_RC_STACK
738     assert(rpp_stack_is_rc());
739 #else
740     sv_2mortal(sv);
741 #endif
742 }
743 
744 
745 /*
746 =for apidoc      rpp_replace_1_1
747 =for apidoc_item rpp_replace_1_1_NN
748 =for apidoc_item rpp_replace_1_IMM_NN
749 
750 Replace the current top stack item with C<sv>, while suitably adjusting
751 reference counts. Equivalent to rpp_popfree_1(); rpp_push_1(sv), but
752 is more efficient and handles both SVs being the same.
753 
754 The C<_NN> variant assumes that the pointer on the stack to the SV being
755 freed is non-NULL.
756 
757 The C<IMM_NN> variant is like the C<_NN> variant, but in addition, assumes
758 that the single argument is an immortal such as <&PL_sv_undef> and, for
759 efficiency, will skip incrementing its reference count.
760 
761 =cut
762 */
763 
764 PERL_STATIC_INLINE void
Perl_rpp_replace_1_1(pTHX_ SV * sv)765 Perl_rpp_replace_1_1(pTHX_ SV *sv)
766 {
767     PERL_ARGS_ASSERT_RPP_REPLACE_1_1;
768 
769     assert(sv);
770 #ifdef PERL_RC_STACK
771     assert(rpp_stack_is_rc());
772     SV *oldsv = *PL_stack_sp;
773     *PL_stack_sp = sv;
774     SvREFCNT_inc_simple_void_NN(sv);
775     SvREFCNT_dec(oldsv);
776 #else
777     *PL_stack_sp = sv;
778 #endif
779 }
780 
781 
782 PERL_STATIC_INLINE void
Perl_rpp_replace_1_1_NN(pTHX_ SV * sv)783 Perl_rpp_replace_1_1_NN(pTHX_ SV *sv)
784 {
785     PERL_ARGS_ASSERT_RPP_REPLACE_1_1_NN;
786 
787     assert(sv);
788     assert(*PL_stack_sp);
789 #ifdef PERL_RC_STACK
790     assert(rpp_stack_is_rc());
791     SV *oldsv = *PL_stack_sp;
792     *PL_stack_sp = sv;
793     SvREFCNT_inc_simple_void_NN(sv);
794     SvREFCNT_dec_NN(oldsv);
795 #else
796     *PL_stack_sp = sv;
797 #endif
798 }
799 
800 
801 PERL_STATIC_INLINE void
Perl_rpp_replace_1_IMM_NN(pTHX_ SV * sv)802 Perl_rpp_replace_1_IMM_NN(pTHX_ SV *sv)
803 {
804     PERL_ARGS_ASSERT_RPP_REPLACE_1_IMM_NN;
805 
806     assert(sv);
807     assert(SvIMMORTAL(sv));
808     assert(*PL_stack_sp);
809 #ifdef PERL_RC_STACK
810     assert(rpp_stack_is_rc());
811     SV *oldsv = *PL_stack_sp;
812     *PL_stack_sp = sv;
813     SvREFCNT_dec_NN(oldsv);
814 #else
815     *PL_stack_sp = sv;
816 #endif
817 }
818 
819 
820 /*
821 =for apidoc      rpp_replace_2_1
822 =for apidoc_item rpp_replace_2_1_NN
823 =for apidoc_item rpp_replace_2_IMM_NN
824 
825 Replace the current top to stacks item with C<sv>, while suitably
826 adjusting reference counts. Equivalent to rpp_popfree_2(); rpp_push_1(sv),
827 but is more efficient and handles SVs being the same.
828 
829 The C<_NN> variant assumes that the pointers on the stack to the SVs being
830 freed are non-NULL.
831 
832 The C<IMM_NN> variant is like the C<_NN> variant, but in addition, assumes
833 that the single argument is an immortal such as <&PL_sv_undef> and, for
834 efficiency, will skip incrementing its reference count.
835 =cut
836 */
837 
838 PERL_STATIC_INLINE void
Perl_rpp_replace_2_1(pTHX_ SV * sv)839 Perl_rpp_replace_2_1(pTHX_ SV *sv)
840 {
841     PERL_ARGS_ASSERT_RPP_REPLACE_2_1;
842 
843 #ifdef PERL_RC_STACK
844     assert(rpp_stack_is_rc());
845     /* replace PL_stack_sp[-1] first; leave PL_stack_sp[0] in place while
846      * we free [-1], so if an exception occurs, [0] will still be freed.
847      */
848     SV *oldsv = PL_stack_sp[-1];
849     PL_stack_sp[-1] = sv;
850     SvREFCNT_inc_simple_void_NN(sv);
851     SvREFCNT_dec(oldsv);
852     oldsv = *PL_stack_sp--;
853     SvREFCNT_dec(oldsv);
854 #else
855     *--PL_stack_sp = sv;
856 #endif
857 }
858 
859 
860 /* Private helper function for _NN and _IMM_NN variants.
861  * Assumes sv has already had its ref count incremented,
862  * ready for being put on the stack.
863  * Intended to be small and fast, since it's inlined into many hot parts of
864  * code.
865  */
866 
867 PERL_STATIC_INLINE void
Perl_rpp_replace_2_1_COMMON(pTHX_ SV * sv)868 Perl_rpp_replace_2_1_COMMON(pTHX_ SV *sv)
869 {
870 
871     assert(sv);
872 #ifdef PERL_RC_STACK
873     SV *sv2 = *PL_stack_sp--;
874     assert(sv2);
875     SV *sv1 = *PL_stack_sp;
876     assert(sv1);
877 
878     *PL_stack_sp = sv;
879     assert(rpp_stack_is_rc());
880     U32 rc1 = SvREFCNT(sv1);
881     U32 rc2 = SvREFCNT(sv2);
882     /* This expression is intended to be true if either of rc1 or rc2 has
883      * the value 0 or 1, but using only a single branch test, rather
884      * than the two branches that a compiler would plant for a boolean
885      * expression. We are working on the assumption that, most of the
886      * time, neither of the args to a binary function will need to be
887      * freed - they're likely to lex vars, or PADTMPs or whatever.
888      * So give the CPU a single branch that is rarely taken. */
889     if (UNLIKELY( !(rc1>>1) + !(rc2>>1) ))
890         /* at least one of the old SVs needs freeing. Do it the long way */
891         Perl_rpp_free_2_(aTHX_ sv1, sv2, rc1, rc2);
892     else {
893         SvREFCNT(sv1) = rc1 - 1;
894         SvREFCNT(sv2) = rc2 - 1;
895     }
896 #else
897     *--PL_stack_sp = sv;
898 #endif
899 }
900 
901 
902 PERL_STATIC_INLINE void
Perl_rpp_replace_2_1_NN(pTHX_ SV * sv)903 Perl_rpp_replace_2_1_NN(pTHX_ SV *sv)
904 {
905     PERL_ARGS_ASSERT_RPP_REPLACE_2_1_NN;
906 
907     assert(sv);
908 #ifdef PERL_RC_STACK
909     SvREFCNT_inc_simple_void_NN(sv);
910 #endif
911     rpp_replace_2_1_COMMON(sv);
912 }
913 
914 
915 PERL_STATIC_INLINE void
Perl_rpp_replace_2_IMM_NN(pTHX_ SV * sv)916 Perl_rpp_replace_2_IMM_NN(pTHX_ SV *sv)
917 {
918     PERL_ARGS_ASSERT_RPP_REPLACE_2_IMM_NN;
919 
920     assert(sv);
921     assert(SvIMMORTAL(sv));
922     rpp_replace_2_1_COMMON(sv);
923 }
924 
925 
926 /*
927 =for apidoc rpp_replace_at
928 
929 Replace the SV at address sp within the stack with C<sv>, while suitably
930 adjusting reference counts. Equivalent to C<*sp = sv>, except with proper
931 reference count handling.
932 
933 =cut
934 */
935 
936 PERL_STATIC_INLINE void
Perl_rpp_replace_at(pTHX_ SV ** sp,SV * sv)937 Perl_rpp_replace_at(pTHX_ SV **sp, SV *sv)
938 {
939     PERL_ARGS_ASSERT_RPP_REPLACE_AT;
940 
941 #ifdef PERL_RC_STACK
942     assert(rpp_stack_is_rc());
943     SV *oldsv = *sp;
944     *sp = sv;
945     SvREFCNT_inc_simple_void_NN(sv);
946     SvREFCNT_dec(oldsv);
947 #else
948     *sp = sv;
949 #endif
950 }
951 
952 
953 /*
954 =for apidoc rpp_replace_at_NN
955 
956 A variant of rpp_replace_at() which assumes that the SV pointer on the
957 stack is non-NULL.
958 
959 =cut
960 */
961 
962 PERL_STATIC_INLINE void
Perl_rpp_replace_at_NN(pTHX_ SV ** sp,SV * sv)963 Perl_rpp_replace_at_NN(pTHX_ SV **sp, SV *sv)
964 {
965     PERL_ARGS_ASSERT_RPP_REPLACE_AT_NN;
966 
967     assert(sv);
968     assert(*sp);
969 #ifdef PERL_RC_STACK
970     assert(rpp_stack_is_rc());
971     SV *oldsv = *sp;
972     *sp = sv;
973     SvREFCNT_inc_simple_void_NN(sv);
974     SvREFCNT_dec_NN(oldsv);
975 #else
976     *sp = sv;
977 #endif
978 }
979 
980 
981 /*
982 =for apidoc rpp_replace_at_norc
983 
984 Replace the SV at address sp within the stack with C<sv>, while suitably
985 adjusting the reference count of the old SV. Equivalent to C<*sp = sv>,
986 except with proper reference count handling.
987 
988 C<sv>'s reference count doesn't get incremented. On non-C<PERL_RC_STACK>
989 builds, it gets mortalised too.
990 
991 This is most useful where an SV has just been created and already has a
992 reference count of 1, but has not yet been anchored anywhere.
993 
994 =cut
995 */
996 
997 PERL_STATIC_INLINE void
Perl_rpp_replace_at_norc(pTHX_ SV ** sp,SV * sv)998 Perl_rpp_replace_at_norc(pTHX_ SV **sp, SV *sv)
999 {
1000     PERL_ARGS_ASSERT_RPP_REPLACE_AT_NORC;
1001 
1002 #ifdef PERL_RC_STACK
1003     assert(rpp_stack_is_rc());
1004     SV *oldsv = *sp;
1005     *sp = sv;
1006     SvREFCNT_dec(oldsv);
1007 #else
1008     *sp = sv;
1009     sv_2mortal(sv);
1010 #endif
1011 }
1012 
1013 
1014 /*
1015 =for apidoc rpp_replace_at_norc_NN
1016 
1017 A variant of rpp_replace_at_norc() which assumes that the SV pointer on the
1018 stack is non-NULL.
1019 
1020 =cut
1021 */
1022 
1023 PERL_STATIC_INLINE void
Perl_rpp_replace_at_norc_NN(pTHX_ SV ** sp,SV * sv)1024 Perl_rpp_replace_at_norc_NN(pTHX_ SV **sp, SV *sv)
1025 {
1026     PERL_ARGS_ASSERT_RPP_REPLACE_AT_NORC_NN;
1027 
1028     assert(*sp);
1029 #ifdef PERL_RC_STACK
1030     assert(rpp_stack_is_rc());
1031     SV *oldsv = *sp;
1032     *sp = sv;
1033     SvREFCNT_dec_NN(oldsv);
1034 #else
1035     *sp = sv;
1036     sv_2mortal(sv);
1037 #endif
1038 }
1039 
1040 
1041 /*
1042 =for apidoc rpp_context
1043 
1044 Impose void, scalar or list context on the stack.
1045 First, pop C<extra> items off the stack, then when C<gimme> is:
1046 C<G_LIST>:   return as-is.
1047 C<G_VOID>:   pop everything back to C<mark>
1048 C<G_SCALAR>: move the top stack item (or C<&PL_sv_undef> if none) to
1049 C<mark+1> and free everything above it.
1050 
1051 =cut
1052 */
1053 
1054 PERL_STATIC_INLINE void
Perl_rpp_context(pTHX_ SV ** mark,U8 gimme,SSize_t extra)1055 Perl_rpp_context(pTHX_ SV **mark, U8 gimme, SSize_t extra)
1056 {
1057     PERL_ARGS_ASSERT_RPP_CONTEXT;
1058     assert(extra >= 0);
1059     assert(mark <= PL_stack_sp - extra);
1060 
1061     if (gimme == G_LIST)
1062         mark = PL_stack_sp - extra;
1063     else if (gimme == G_SCALAR) {
1064         SV **svp = PL_stack_sp - extra;
1065         mark++;
1066         if (mark > svp) {
1067             /* empty list (plus extra) */
1068             rpp_popfree_to(svp);
1069             rpp_extend(1);
1070             *++PL_stack_sp = &PL_sv_undef;
1071             return;
1072         }
1073         /* swap top and bottom list items */
1074         SV *top = *svp;
1075         *svp = *mark;
1076         *mark = top;
1077      }
1078     rpp_popfree_to(mark);
1079 }
1080 
1081 
1082 
1083 
1084 /*
1085 =for apidoc      rpp_try_AMAGIC_1
1086 =for apidoc_item rpp_try_AMAGIC_2
1087 
1088 Check whether either of the one or two SVs at the top of the stack is
1089 magical or a ref, and in either case handle it specially: invoke get
1090 magic, call an overload method, or replace a ref with a temporary numeric
1091 value, as appropriate. If this function returns true, it indicates that
1092 the correct return value is already on the stack. Intended to be used at
1093 the beginning of the PP function for unary or binary ops.
1094 
1095 =cut
1096 */
1097 
1098 PERL_STATIC_INLINE bool
Perl_rpp_try_AMAGIC_1(pTHX_ int method,int flags)1099 Perl_rpp_try_AMAGIC_1(pTHX_ int method, int flags)
1100 {
1101     return    UNLIKELY((SvFLAGS(*PL_stack_sp) & (SVf_ROK|SVs_GMG)))
1102            && Perl_try_amagic_un(aTHX_ method, flags);
1103 }
1104 
1105 PERL_STATIC_INLINE bool
Perl_rpp_try_AMAGIC_2(pTHX_ int method,int flags)1106 Perl_rpp_try_AMAGIC_2(pTHX_ int method, int flags)
1107 {
1108     return    UNLIKELY(((SvFLAGS(PL_stack_sp[-1])|SvFLAGS(PL_stack_sp[0]))
1109                      & (SVf_ROK|SVs_GMG)))
1110            && Perl_try_amagic_bin(aTHX_ method, flags);
1111 }
1112 
1113 
1114 /*
1115 =for apidoc rpp_stack_is_rc
1116 
1117 Returns a boolean value indicating whether the stack is currently
1118 reference-counted. Note that if the stack is split (bottom half RC, top
1119 half non-RC), this function returns false, even if the top half currently
1120 contains zero items.
1121 
1122 =cut
1123 */
1124 
1125 PERL_STATIC_INLINE bool
Perl_rpp_stack_is_rc(pTHX)1126 Perl_rpp_stack_is_rc(pTHX)
1127 {
1128 #ifdef PERL_RC_STACK
1129     return AvREAL(PL_curstack) && !PL_curstackinfo->si_stack_nonrc_base;
1130 #else
1131     return 0;
1132 #endif
1133 
1134 }
1135 
1136 
1137 /*
1138 =for apidoc rpp_is_lone
1139 
1140 Indicates whether the stacked SV C<sv> (assumed to be not yet popped off
1141 the stack) is only kept alive due to a single reference from the argument
1142 stack and/or and the temps stack.
1143 
1144 This can used for example to decide whether the copying of return values
1145 in rvalue context can be skipped, or whether it shouldn't be assigned to
1146 in lvalue context.
1147 
1148 =cut
1149 */
1150 
1151 
1152 PERL_STATIC_INLINE bool
Perl_rpp_is_lone(pTHX_ SV * sv)1153 Perl_rpp_is_lone(pTHX_ SV *sv)
1154 {
1155 #ifdef PERL_RC_STACK
1156     /* note that rpp_is_lone() can be used in wrapped pp functions,
1157      * where technically the stack is no longer ref-counted; but because
1158      * the args are non-RC copies of RC args further down the stack, we
1159      * can't be in a *completely* non-ref stack.
1160      */
1161     assert(AvREAL(PL_curstack));
1162 #endif
1163 
1164     return SvREFCNT(sv) <= cBOOL(SvTEMP(sv))
1165 #ifdef PERL_RC_STACK
1166                          + 1
1167             && !SvIMMORTAL(sv) /* PL_sv_undef etc are never stealable */
1168 #endif
1169     ;
1170 }
1171 
1172 
1173 /*
1174 =for apidoc rpp_invoke_xs
1175 
1176 Call the XS function associated with C<cv>. Wraps the call if necessary to
1177 handle XS functions which are not aware of reference-counted stacks.
1178 
1179 =cut
1180 */
1181 
1182 
1183 PERL_STATIC_INLINE void
Perl_rpp_invoke_xs(pTHX_ CV * cv)1184 Perl_rpp_invoke_xs(pTHX_ CV *cv)
1185 {
1186     PERL_ARGS_ASSERT_RPP_INVOKE_XS;
1187 
1188 #ifdef PERL_RC_STACK
1189     if (!CvXS_RCSTACK(cv))
1190         Perl_xs_wrap(aTHX_ CvXSUB(cv), cv);
1191     else
1192 #endif
1193         CvXSUB(cv)(aTHX_ cv);
1194 }
1195 
1196 
1197 
1198 
1199 /* ----------------------------- regexp.h ----------------------------- */
1200 
1201 /* PVLVs need to act as a superset of all scalar types - they are basically
1202  * PVMGs with a few extra fields.
1203  * REGEXPs are first class scalars, but have many fields that can't be copied
1204  * into a PVLV body.
1205  *
1206  * Hence we take a different approach - instead of a copy, PVLVs store a pointer
1207  * back to the original body. To avoid increasing the size of PVLVs just for the
1208  * rare case of REGEXP assignment, this pointer is stored in the memory usually
1209  * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
1210  * read the pointer from the two possible locations. The macro SvLEN() wraps the
1211  * access to the union's member xpvlenu_len, but there is no equivalent macro
1212  * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
1213  *
1214  * See commit df6b4bd56551f2d3 for more details. */
1215 
1216 PERL_STATIC_INLINE struct regexp *
Perl_ReANY(const REGEXP * const re)1217 Perl_ReANY(const REGEXP * const re)
1218 {
1219     XPV* const p = (XPV*)SvANY(re);
1220 
1221     PERL_ARGS_ASSERT_REANY;
1222     assert(isREGEXP(re));
1223 
1224     return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
1225                                    : (struct regexp *)p;
1226 }
1227 
1228 /* ------------------------------- utf8.h ------------------------------- */
1229 
1230 /*
1231 =for apidoc_section $unicode
1232 */
1233 
1234 PERL_STATIC_INLINE void
Perl_append_utf8_from_native_byte(const U8 byte,U8 ** dest)1235 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
1236 {
1237     /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
1238      * encoded string at '*dest', updating '*dest' to include it */
1239 
1240     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
1241 
1242     if (NATIVE_BYTE_IS_INVARIANT(byte))
1243         *((*dest)++) = byte;
1244     else {
1245         *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
1246         *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
1247     }
1248 }
1249 
1250 /*
1251 =for apidoc valid_utf8_to_uvchr
1252 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
1253 known that the next character in the input UTF-8 string C<s> is well-formed
1254 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
1255 points, and non-Unicode code points are allowed.
1256 
1257 =cut
1258 
1259  */
1260 
1261 PERL_STATIC_INLINE UV
Perl_valid_utf8_to_uvchr(const U8 * s,STRLEN * retlen)1262 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
1263 {
1264     const UV expectlen = UTF8SKIP(s);
1265     const U8* send = s + expectlen;
1266     UV uv = *s;
1267 
1268     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
1269 
1270     if (retlen) {
1271         *retlen = expectlen;
1272     }
1273 
1274     /* An invariant is trivially returned */
1275     if (expectlen == 1) {
1276         return uv;
1277     }
1278 
1279     /* Remove the leading bits that indicate the number of bytes, leaving just
1280      * the bits that are part of the value */
1281     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1282 
1283     /* Now, loop through the remaining bytes, accumulating each into the
1284      * working total as we go.  (I khw tried unrolling the loop for up to 4
1285      * bytes, but there was no performance improvement) */
1286     for (++s; s < send; s++) {
1287         uv = UTF8_ACCUMULATE(uv, *s);
1288     }
1289 
1290     return UNI_TO_NATIVE(uv);
1291 
1292 }
1293 
1294 /*
1295 =for apidoc is_utf8_invariant_string
1296 
1297 Returns TRUE if the first C<len> bytes of the string C<s> are the same
1298 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
1299 EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
1300 are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
1301 the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
1302 characters are invariant, but so also are the C1 controls.
1303 
1304 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
1305 use this option, that C<s> can't have embedded C<NUL> characters and has to
1306 have a terminating C<NUL> byte).
1307 
1308 See also
1309 C<L</is_utf8_string>>,
1310 C<L</is_utf8_string_flags>>,
1311 C<L</is_utf8_string_loc>>,
1312 C<L</is_utf8_string_loc_flags>>,
1313 C<L</is_utf8_string_loclen>>,
1314 C<L</is_utf8_string_loclen_flags>>,
1315 C<L</is_utf8_fixed_width_buf_flags>>,
1316 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1317 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1318 C<L</is_strict_utf8_string>>,
1319 C<L</is_strict_utf8_string_loc>>,
1320 C<L</is_strict_utf8_string_loclen>>,
1321 C<L</is_c9strict_utf8_string>>,
1322 C<L</is_c9strict_utf8_string_loc>>,
1323 and
1324 C<L</is_c9strict_utf8_string_loclen>>.
1325 
1326 =cut
1327 
1328 */
1329 
1330 #define is_utf8_invariant_string(s, len)                                    \
1331                                 is_utf8_invariant_string_loc(s, len, NULL)
1332 
1333 /*
1334 =for apidoc is_utf8_invariant_string_loc
1335 
1336 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
1337 the first UTF-8 variant character in the C<ep> pointer; if all characters are
1338 UTF-8 invariant, this function does not change the contents of C<*ep>.
1339 
1340 =cut
1341 
1342 */
1343 
1344 PERL_STATIC_INLINE bool
Perl_is_utf8_invariant_string_loc(const U8 * const s,STRLEN len,const U8 ** ep)1345 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
1346 {
1347     const U8* send;
1348     const U8* x = s;
1349 
1350     PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
1351 
1352     if (len == 0) {
1353         len = strlen((const char *)s);
1354     }
1355 
1356     send = s + len;
1357 
1358 /* This looks like 0x010101... */
1359 #  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)
1360 
1361 /* This looks like 0x808080... */
1362 #  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
1363 #  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
1364 #  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
1365 
1366 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
1367  * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
1368  * optimized out completely on a 32-bit system, and its mask gets optimized out
1369  * on a 64-bit system */
1370 #  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
1371                                       |   (  PTR2nat(x) >> 1)                 \
1372                                       | ( ( (PTR2nat(x)                       \
1373                                            & PERL_WORD_BOUNDARY_MASK) >> 2))))
1374 
1375 #ifndef EBCDIC
1376 
1377     /* Do the word-at-a-time iff there is at least one usable full word.  That
1378      * means that after advancing to a word boundary, there still is at least a
1379      * full word left.  The number of bytes needed to advance is 'wordsize -
1380      * offset' unless offset is 0. */
1381     if ((STRLEN) (send - x) >= PERL_WORDSIZE
1382 
1383                             /* This term is wordsize if subword; 0 if not */
1384                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1385 
1386                             /* 'offset' */
1387                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1388     {
1389 
1390         /* Process per-byte until reach word boundary.  XXX This loop could be
1391          * eliminated if we knew that this platform had fast unaligned reads */
1392         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1393             if (! UTF8_IS_INVARIANT(*x)) {
1394                 if (ep) {
1395                     *ep = x;
1396                 }
1397 
1398                 return FALSE;
1399             }
1400             x++;
1401         }
1402 
1403         /* Here, we know we have at least one full word to process.  Process
1404          * per-word as long as we have at least a full word left */
1405         do {
1406             if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
1407 
1408                 /* Found a variant.  Just return if caller doesn't want its
1409                  * exact position */
1410                 if (! ep) {
1411                     return FALSE;
1412                 }
1413 
1414 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
1415      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1416 
1417                 *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x);
1418                 assert(*ep >= s && *ep < send);
1419 
1420                 return FALSE;
1421 
1422 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
1423            checks. */
1424 
1425                 break;
1426 #  endif
1427             }
1428 
1429             x += PERL_WORDSIZE;
1430 
1431         } while (x + PERL_WORDSIZE <= send);
1432     }
1433 
1434 #endif      /* End of ! EBCDIC */
1435 
1436     /* Process per-byte */
1437     while (x < send) {
1438         if (! UTF8_IS_INVARIANT(*x)) {
1439             if (ep) {
1440                 *ep = x;
1441             }
1442 
1443             return FALSE;
1444         }
1445 
1446         x++;
1447     }
1448 
1449     return TRUE;
1450 }
1451 
1452 /* See if the platform has builtins for finding the most/least significant bit,
1453  * and which one is right for using on 32 and 64 bit operands */
1454 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
1455 #  if U32SIZE == INTSIZE
1456 #    define PERL_CLZ_32 __builtin_clz
1457 #  endif
1458 #  if defined(U64TYPE) && U64SIZE == INTSIZE
1459 #    define PERL_CLZ_64 __builtin_clz
1460 #  endif
1461 #endif
1462 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
1463 #  if U32SIZE == INTSIZE
1464 #    define PERL_CTZ_32 __builtin_ctz
1465 #  endif
1466 #  if defined(U64TYPE) && U64SIZE == INTSIZE
1467 #    define PERL_CTZ_64 __builtin_ctz
1468 #  endif
1469 #endif
1470 
1471 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
1472 #  if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
1473 #    define PERL_CLZ_32 __builtin_clzl
1474 #  endif
1475 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
1476 #    define PERL_CLZ_64 __builtin_clzl
1477 #  endif
1478 #endif
1479 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
1480 #  if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
1481 #    define PERL_CTZ_32 __builtin_ctzl
1482 #  endif
1483 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
1484 #    define PERL_CTZ_64 __builtin_ctzl
1485 #  endif
1486 #endif
1487 
1488 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
1489 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
1490 #    define PERL_CLZ_32 __builtin_clzll
1491 #  endif
1492 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
1493 #    define PERL_CLZ_64 __builtin_clzll
1494 #  endif
1495 #endif
1496 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
1497 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
1498 #    define PERL_CTZ_32 __builtin_ctzll
1499 #  endif
1500 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
1501 #    define PERL_CTZ_64 __builtin_ctzll
1502 #  endif
1503 #endif
1504 
1505 #if defined(WIN32)
1506 #  include <intrin.h>
1507    /* MinGW warns that it ignores "pragma intrinsic". */
1508 #  if defined(_MSC_VER)
1509 #    pragma intrinsic(_BitScanForward)
1510 #    pragma intrinsic(_BitScanReverse)
1511 #    if defined(_WIN64)
1512 #      pragma intrinsic(_BitScanForward64)
1513 #      pragma intrinsic(_BitScanReverse64)
1514 #    endif
1515 #  endif
1516 #endif
1517 
1518 /* The reason there are not checks to see if ffs() and ffsl() are available for
1519  * determining the lsb, is because these don't improve on the deBruijn method
1520  * fallback, which is just a branchless integer multiply, array element
1521  * retrieval, and shift.  The others, even if the function call overhead is
1522  * optimized out, have to cope with the possibility of the input being all
1523  * zeroes, and almost certainly will have conditionals for this eventuality.
1524  * khw, at the time of this commit, looked at the source for both gcc and clang
1525  * to verify this.  (gcc used a method inferior to deBruijn.) */
1526 
1527 /* Below are functions to find the first, last, or only set bit in a word.  On
1528  * platforms with 64-bit capability, there is a pair for each operation; the
1529  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
1530  * the same in each pair, so the second is stripped of most comments. */
1531 
1532 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
1533 
1534 PERL_STATIC_INLINE unsigned
Perl_lsbit_pos64(U64 word)1535 Perl_lsbit_pos64(U64 word)
1536 {
1537     /* Find the position (0..63) of the least significant set bit in the input
1538      * word */
1539 
1540     ASSUME(word != 0);
1541 
1542     /* If we can determine that the platform has a usable fast method to get
1543      * this info, use that */
1544 
1545 #  if defined(PERL_CTZ_64)
1546 #    define PERL_HAS_FAST_GET_LSB_POS64
1547 
1548     return (unsigned) PERL_CTZ_64(word);
1549 
1550 #  elif U64SIZE == 8 && defined(_WIN64)
1551 #    define PERL_HAS_FAST_GET_LSB_POS64
1552 
1553     {
1554         unsigned long index;
1555         _BitScanForward64(&index, word);
1556         return (unsigned)index;
1557     }
1558 
1559 #  else
1560 
1561     /* Here, we didn't find a fast method for finding the lsb.  Fall back to
1562      * making the lsb the only set bit in the word, and use our function that
1563      * works on words with a single bit set.
1564      *
1565      * Isolate the lsb;
1566      * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
1567      *
1568      * The word will look like this, with a rightmost set bit in position 's':
1569      * ('x's are don't cares, and 'y's are their complements)
1570      *      s
1571      *  x..x100..00
1572      *  y..y011..11      Complement
1573      *  y..y100..00      Add 1
1574      *  0..0100..00      And with the original
1575      *
1576      *  (Yes, complementing and adding 1 is just taking the negative on 2's
1577      *  complement machines, but not on 1's complement ones, and some compilers
1578      *  complain about negating an unsigned.)
1579      */
1580     return single_1bit_pos64(word & (~word + 1));
1581 
1582 #  endif
1583 
1584 }
1585 
1586 #  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
1587 #else   /* ! QUAD */
1588 #  define lsbit_pos_uintmax_(word) lsbit_pos32(word)
1589 #endif
1590 
1591 PERL_STATIC_INLINE unsigned     /* Like above for 32 bit word */
Perl_lsbit_pos32(U32 word)1592 Perl_lsbit_pos32(U32 word)
1593 {
1594     /* Find the position (0..31) of the least significant set bit in the input
1595      * word */
1596 
1597     ASSUME(word != 0);
1598 
1599 #if defined(PERL_CTZ_32)
1600 #  define PERL_HAS_FAST_GET_LSB_POS32
1601 
1602     return (unsigned) PERL_CTZ_32(word);
1603 
1604 #elif U32SIZE == 4 && defined(WIN32)
1605 #  define PERL_HAS_FAST_GET_LSB_POS32
1606 
1607     {
1608         unsigned long index;
1609         _BitScanForward(&index, word);
1610         return (unsigned)index;
1611     }
1612 
1613 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1614 #  define PERL_HAS_FAST_GET_LSB_POS32
1615 
1616     /* Unlikely, but possible for the platform to have a wider fast operation
1617      * but not a narrower one.  But easy enough to handle the case by widening
1618      * the parameter size. */
1619     return lsbit_pos64(word);
1620 
1621 #else
1622 
1623     return single_1bit_pos32(word & (~word + 1));
1624 
1625 #endif
1626 
1627 }
1628 
1629 
1630 /* Convert the leading zeros count to the bit position of the first set bit.
1631  * This just subtracts from the highest position, 31 or 63.  But some compilers
1632  * don't optimize this optimally, and so a bit of bit twiddling encourages them
1633  * to do the right thing.  It turns out that subtracting a smaller non-negative
1634  * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
1635  * the two numbers.  To see why, first note that the sum of any number, x, and
1636  * its complement, x', is all ones.  So all ones minus x is x'.  Then note that
1637  * the xor of x and all ones is x'. */
1638 #define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) ^ (lzc))
1639 
1640 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
1641 
1642 PERL_STATIC_INLINE unsigned
Perl_msbit_pos64(U64 word)1643 Perl_msbit_pos64(U64 word)
1644 {
1645     /* Find the position (0..63) of the most significant set bit in the input
1646      * word */
1647 
1648     ASSUME(word != 0);
1649 
1650     /* If we can determine that the platform has a usable fast method to get
1651      * this, use that */
1652 
1653 #  if defined(PERL_CLZ_64)
1654 #    define PERL_HAS_FAST_GET_MSB_POS64
1655 
1656     return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
1657 
1658 #  elif U64SIZE == 8 && defined(_WIN64)
1659 #    define PERL_HAS_FAST_GET_MSB_POS64
1660 
1661     {
1662         unsigned long index;
1663         _BitScanReverse64(&index, word);
1664         return (unsigned)index;
1665     }
1666 
1667 #  else
1668 
1669     /* Here, we didn't find a fast method for finding the msb.  Fall back to
1670      * making the msb the only set bit in the word, and use our function that
1671      * works on words with a single bit set.
1672      *
1673      * Isolate the msb; http://codeforces.com/blog/entry/10330
1674      *
1675      * Only the most significant set bit matters.  Or'ing word with its right
1676      * shift of 1 makes that bit and the next one to its right both 1.
1677      * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
1678      * ...  We end with the msb and all to the right being 1. */
1679     word |= (word >>  1);
1680     word |= (word >>  2);
1681     word |= (word >>  4);
1682     word |= (word >>  8);
1683     word |= (word >> 16);
1684     word |= (word >> 32);
1685 
1686     /* Then subtracting the right shift by 1 clears all but the left-most of
1687      * the 1 bits, which is our desired result */
1688     word -= (word >> 1);
1689 
1690     /* Now we have a single bit set */
1691     return single_1bit_pos64(word);
1692 
1693 #  endif
1694 
1695 }
1696 
1697 #  define msbit_pos_uintmax_(word) msbit_pos64(word)
1698 #else   /* ! QUAD */
1699 #  define msbit_pos_uintmax_(word) msbit_pos32(word)
1700 #endif
1701 
1702 PERL_STATIC_INLINE unsigned
Perl_msbit_pos32(U32 word)1703 Perl_msbit_pos32(U32 word)
1704 {
1705     /* Find the position (0..31) of the most significant set bit in the input
1706      * word */
1707 
1708     ASSUME(word != 0);
1709 
1710 #if defined(PERL_CLZ_32)
1711 #  define PERL_HAS_FAST_GET_MSB_POS32
1712 
1713     return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
1714 #elif U32SIZE == 4 && defined(WIN32)
1715 #  define PERL_HAS_FAST_GET_MSB_POS32
1716 
1717     {
1718         unsigned long index;
1719         _BitScanReverse(&index, word);
1720         return (unsigned)index;
1721     }
1722 
1723 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
1724 #  define PERL_HAS_FAST_GET_MSB_POS32
1725 
1726     return msbit_pos64(word);   /* Let compiler widen parameter */
1727 
1728 #else
1729 
1730     word |= (word >>  1);
1731     word |= (word >>  2);
1732     word |= (word >>  4);
1733     word |= (word >>  8);
1734     word |= (word >> 16);
1735     word -= (word >> 1);
1736     return single_1bit_pos32(word);
1737 
1738 #endif
1739 
1740 }
1741 
1742 /* Note that if you are working through all the 1 bits in a word, and don't
1743  * care which order you process them in, it is better to use lsbit_pos.  This
1744  * is because some platforms have a fast way to find the msb but not the lsb,
1745  * and others vice versa.  The code above falls back to use the single
1746  * available fast method when the desired one is missing, and it is cheaper to
1747  * fall back from lsb to msb than the other way around */
1748 
1749 #if UVSIZE == U64SIZE
1750 #  define msbit_pos(word)  msbit_pos64(word)
1751 #  define lsbit_pos(word)  lsbit_pos64(word)
1752 #elif UVSIZE == U32SIZE
1753 #  define msbit_pos(word)  msbit_pos32(word)
1754 #  define lsbit_pos(word)  lsbit_pos32(word)
1755 #endif
1756 
1757 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
1758 
1759 PERL_STATIC_INLINE unsigned
Perl_single_1bit_pos64(U64 word)1760 Perl_single_1bit_pos64(U64 word)
1761 {
1762     /* Given a 64-bit word known to contain all zero bits except one 1 bit,
1763      * find and return the 1's position: 0..63 */
1764 
1765 #  ifdef PERL_CORE    /* macro not exported */
1766     ASSUME(isPOWER_OF_2(word));
1767 #  else
1768     ASSUME(word && (word & (word-1)) == 0);
1769 #  endif
1770 
1771     /* The only set bit is both the most and least significant bit.  If we have
1772      * a fast way of finding either one, use that.
1773      *
1774      * It may appear at first glance that those functions call this one, but
1775      * they don't if the corresponding #define is set */
1776 
1777 #  ifdef PERL_HAS_FAST_GET_MSB_POS64
1778 
1779     return msbit_pos64(word);
1780 
1781 #  elif defined(PERL_HAS_FAST_GET_LSB_POS64)
1782 
1783     return lsbit_pos64(word);
1784 
1785 #  else
1786 
1787     /* The position of the only set bit in a word can be quickly calculated
1788      * using deBruijn sequences.  See for example
1789      * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
1790     return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
1791                                                     >> PERL_deBruijnShift64_];
1792 #  endif
1793 
1794 }
1795 
1796 #endif
1797 
1798 PERL_STATIC_INLINE unsigned
Perl_single_1bit_pos32(U32 word)1799 Perl_single_1bit_pos32(U32 word)
1800 {
1801     /* Given a 32-bit word known to contain all zero bits except one 1 bit,
1802      * find and return the 1's position: 0..31 */
1803 
1804 #ifdef PERL_CORE    /* macro not exported */
1805     ASSUME(isPOWER_OF_2(word));
1806 #else
1807     ASSUME(word && (word & (word-1)) == 0);
1808 #endif
1809 #ifdef PERL_HAS_FAST_GET_MSB_POS32
1810 
1811     return msbit_pos32(word);
1812 
1813 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
1814 
1815     return lsbit_pos32(word);
1816 
1817 #else
1818 
1819     return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
1820                                                     >> PERL_deBruijnShift32_];
1821 #endif
1822 
1823 }
1824 
1825 #ifndef EBCDIC
1826 
1827 PERL_STATIC_INLINE unsigned int
Perl_variant_byte_number(PERL_UINTMAX_T word)1828 Perl_variant_byte_number(PERL_UINTMAX_T word)
1829 {
1830     /* This returns the position in a word (0..7) of the first variant byte in
1831      * it.  This is a helper function.  Note that there are no branches */
1832 
1833     /* Get just the msb bits of each byte */
1834     word &= PERL_VARIANTS_WORD_MASK;
1835 
1836     /* This should only be called if we know there is a variant byte in the
1837      * word */
1838     assert(word);
1839 
1840 #  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1841 
1842     /* Bytes are stored like
1843      *  Byte8 ... Byte2 Byte1
1844      *  63..56...15...8 7...0
1845      * so getting the lsb of the whole modified word is getting the msb of the
1846      * first byte that has its msb set */
1847     word = lsbit_pos_uintmax_(word);
1848 
1849     /* Here, word contains the position 7,15,23,...55,63 of that bit.  Convert
1850      * to 0..7 */
1851     return (unsigned int) ((word + 1) >> 3) - 1;
1852 
1853 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1854 
1855     /* Bytes are stored like
1856      *  Byte1 Byte2  ... Byte8
1857      * 63..56 55..47 ... 7...0
1858      * so getting the msb of the whole modified word is getting the msb of the
1859      * first byte that has its msb set */
1860     word = msbit_pos_uintmax_(word);
1861 
1862     /* Here, word contains the position 63,55,...,23,15,7 of that bit.  Convert
1863      * to 0..7 */
1864     word = ((word + 1) >> 3) - 1;
1865 
1866     /* And invert the result because of the reversed byte order on this
1867      * platform */
1868     word = CHARBITS - word - 1;
1869 
1870     return (unsigned int) word;
1871 
1872 #  else
1873 #    error Unexpected byte order
1874 #  endif
1875 
1876 }
1877 
1878 #endif
1879 #if defined(PERL_CORE) || defined(PERL_EXT)
1880 
1881 /*
1882 =for apidoc variant_under_utf8_count
1883 
1884 This function looks at the sequence of bytes between C<s> and C<e>, which are
1885 assumed to be encoded in ASCII/Latin1, and returns how many of them would
1886 change should the string be translated into UTF-8.  Due to the nature of UTF-8,
1887 each of these would occupy two bytes instead of the single one in the input
1888 string.  Thus, this function returns the precise number of bytes the string
1889 would expand by when translated to UTF-8.
1890 
1891 Unlike most of the other functions that have C<utf8> in their name, the input
1892 to this function is NOT a UTF-8-encoded string.  The function name is slightly
1893 I<odd> to emphasize this.
1894 
1895 This function is internal to Perl because khw thinks that any XS code that
1896 would want this is probably operating too close to the internals.  Presenting a
1897 valid use case could change that.
1898 
1899 See also
1900 C<L<perlapi/is_utf8_invariant_string>>
1901 and
1902 C<L<perlapi/is_utf8_invariant_string_loc>>,
1903 
1904 =cut
1905 
1906 */
1907 
1908 PERL_STATIC_INLINE Size_t
S_variant_under_utf8_count(const U8 * const s,const U8 * const e)1909 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1910 {
1911     const U8* x = s;
1912     Size_t count = 0;
1913 
1914     PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1915 
1916 #  ifndef EBCDIC
1917 
1918     /* Test if the string is long enough to use word-at-a-time.  (Logic is the
1919      * same as for is_utf8_invariant_string()) */
1920     if ((STRLEN) (e - x) >= PERL_WORDSIZE
1921                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1922                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1923     {
1924 
1925         /* Process per-byte until reach word boundary.  XXX This loop could be
1926          * eliminated if we knew that this platform had fast unaligned reads */
1927         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1928             count += ! UTF8_IS_INVARIANT(*x++);
1929         }
1930 
1931         /* Process per-word as long as we have at least a full word left */
1932         do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1933                    explanation of how this works */
1934             PERL_UINTMAX_T increment
1935                 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1936                       * PERL_COUNT_MULTIPLIER)
1937                     >> ((PERL_WORDSIZE - 1) * CHARBITS);
1938             count += (Size_t) increment;
1939             x += PERL_WORDSIZE;
1940         } while (x + PERL_WORDSIZE <= e);
1941     }
1942 
1943 #  endif
1944 
1945     /* Process per-byte */
1946     while (x < e) {
1947         if (! UTF8_IS_INVARIANT(*x)) {
1948             count++;
1949         }
1950 
1951         x++;
1952     }
1953 
1954     return count;
1955 }
1956 
1957 #endif
1958 
1959    /* Keep  these around for these files */
1960 #if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C)
1961 #  undef PERL_WORDSIZE
1962 #  undef PERL_COUNT_MULTIPLIER
1963 #  undef PERL_WORD_BOUNDARY_MASK
1964 #  undef PERL_VARIANTS_WORD_MASK
1965 #endif
1966 
1967 /*
1968 =for apidoc is_utf8_string
1969 
1970 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1971 Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
1972 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1973 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1974 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1975 
1976 This function considers Perl's extended UTF-8 to be valid.  That means that
1977 code points above Unicode, surrogates, and non-character code points are
1978 considered valid by this function.  Use C<L</is_strict_utf8_string>>,
1979 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1980 code points are considered valid.
1981 
1982 See also
1983 C<L</is_utf8_invariant_string>>,
1984 C<L</is_utf8_invariant_string_loc>>,
1985 C<L</is_utf8_string_loc>>,
1986 C<L</is_utf8_string_loclen>>,
1987 C<L</is_utf8_fixed_width_buf_flags>>,
1988 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1989 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1990 
1991 =cut
1992 */
1993 
1994 #define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
1995 
1996 #if defined(PERL_CORE) || defined (PERL_EXT)
1997 
1998 /*
1999 =for apidoc is_utf8_non_invariant_string
2000 
2001 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
2002 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
2003 UTF-8; otherwise returns FALSE.
2004 
2005 A TRUE return means that at least one code point represented by the sequence
2006 either is a wide character not representable as a single byte, or the
2007 representation differs depending on whether the sequence is encoded in UTF-8 or
2008 not.
2009 
2010 See also
2011 C<L<perlapi/is_utf8_invariant_string>>,
2012 C<L<perlapi/is_utf8_string>>
2013 
2014 =cut
2015 
2016 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
2017 It generally needn't be if its string is entirely UTF-8 invariant, and it
2018 shouldn't be if it otherwise contains invalid UTF-8.
2019 
2020 It is an internal function because khw thinks that XS code shouldn't be working
2021 at this low a level.  A valid use case could change that.
2022 
2023 */
2024 
2025 PERL_STATIC_INLINE bool
Perl_is_utf8_non_invariant_string(const U8 * const s,STRLEN len)2026 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
2027 {
2028     const U8 * first_variant;
2029 
2030     PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
2031 
2032     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2033         return FALSE;
2034     }
2035 
2036     return is_utf8_string(first_variant, len - (first_variant - s));
2037 }
2038 
2039 #endif
2040 
2041 /*
2042 =for apidoc is_strict_utf8_string
2043 
2044 Returns TRUE if the first C<len> bytes of string C<s> form a valid
2045 UTF-8-encoded string that is fully interchangeable by any application using
2046 Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
2047 calculated using C<strlen(s)> (which means if you use this option, that C<s>
2048 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
2049 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
2050 
2051 This function returns FALSE for strings containing any
2052 code points above the Unicode max of 0x10FFFF, surrogate code points, or
2053 non-character code points.
2054 
2055 See also
2056 C<L</is_utf8_invariant_string>>,
2057 C<L</is_utf8_invariant_string_loc>>,
2058 C<L</is_utf8_string>>,
2059 C<L</is_utf8_string_flags>>,
2060 C<L</is_utf8_string_loc>>,
2061 C<L</is_utf8_string_loc_flags>>,
2062 C<L</is_utf8_string_loclen>>,
2063 C<L</is_utf8_string_loclen_flags>>,
2064 C<L</is_utf8_fixed_width_buf_flags>>,
2065 C<L</is_utf8_fixed_width_buf_loc_flags>>,
2066 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
2067 C<L</is_strict_utf8_string_loc>>,
2068 C<L</is_strict_utf8_string_loclen>>,
2069 C<L</is_c9strict_utf8_string>>,
2070 C<L</is_c9strict_utf8_string_loc>>,
2071 and
2072 C<L</is_c9strict_utf8_string_loclen>>.
2073 
2074 =cut
2075 */
2076 
2077 #define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
2078 
2079 /*
2080 =for apidoc is_c9strict_utf8_string
2081 
2082 Returns TRUE if the first C<len> bytes of string C<s> form a valid
2083 UTF-8-encoded string that conforms to
2084 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
2085 otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
2086 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
2087 C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
2088 characters being ASCII constitute 'a valid UTF-8 string'.
2089 
2090 This function returns FALSE for strings containing any code points above the
2091 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
2092 code points per
2093 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
2094 
2095 See also
2096 C<L</is_utf8_invariant_string>>,
2097 C<L</is_utf8_invariant_string_loc>>,
2098 C<L</is_utf8_string>>,
2099 C<L</is_utf8_string_flags>>,
2100 C<L</is_utf8_string_loc>>,
2101 C<L</is_utf8_string_loc_flags>>,
2102 C<L</is_utf8_string_loclen>>,
2103 C<L</is_utf8_string_loclen_flags>>,
2104 C<L</is_utf8_fixed_width_buf_flags>>,
2105 C<L</is_utf8_fixed_width_buf_loc_flags>>,
2106 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
2107 C<L</is_strict_utf8_string>>,
2108 C<L</is_strict_utf8_string_loc>>,
2109 C<L</is_strict_utf8_string_loclen>>,
2110 C<L</is_c9strict_utf8_string_loc>>,
2111 and
2112 C<L</is_c9strict_utf8_string_loclen>>.
2113 
2114 =cut
2115 */
2116 
2117 #define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
2118 
2119 /*
2120 =for apidoc is_utf8_string_flags
2121 
2122 Returns TRUE if the first C<len> bytes of string C<s> form a valid
2123 UTF-8 string, subject to the restrictions imposed by C<flags>;
2124 returns FALSE otherwise.  If C<len> is 0, it will be calculated
2125 using C<strlen(s)> (which means if you use this option, that C<s> can't have
2126 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
2127 that all characters being ASCII constitute 'a valid UTF-8 string'.
2128 
2129 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
2130 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2131 as C<L</is_strict_utf8_string>>; and if C<flags> is
2132 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
2133 C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
2134 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
2135 C<L</utf8n_to_uvchr>>, with the same meanings.
2136 
2137 See also
2138 C<L</is_utf8_invariant_string>>,
2139 C<L</is_utf8_invariant_string_loc>>,
2140 C<L</is_utf8_string>>,
2141 C<L</is_utf8_string_loc>>,
2142 C<L</is_utf8_string_loc_flags>>,
2143 C<L</is_utf8_string_loclen>>,
2144 C<L</is_utf8_string_loclen_flags>>,
2145 C<L</is_utf8_fixed_width_buf_flags>>,
2146 C<L</is_utf8_fixed_width_buf_loc_flags>>,
2147 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
2148 C<L</is_strict_utf8_string>>,
2149 C<L</is_strict_utf8_string_loc>>,
2150 C<L</is_strict_utf8_string_loclen>>,
2151 C<L</is_c9strict_utf8_string>>,
2152 C<L</is_c9strict_utf8_string_loc>>,
2153 and
2154 C<L</is_c9strict_utf8_string_loclen>>.
2155 
2156 =cut
2157 */
2158 
2159 PERL_STATIC_INLINE bool
Perl_is_utf8_string_flags(const U8 * s,STRLEN len,const U32 flags)2160 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
2161 {
2162     const U8 * first_variant;
2163 
2164     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
2165     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2166                           |UTF8_DISALLOW_PERL_EXTENDED)));
2167 
2168     if (len == 0) {
2169         len = strlen((const char *)s);
2170     }
2171 
2172     if (flags == 0) {
2173         return is_utf8_string(s, len);
2174     }
2175 
2176     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2177                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
2178     {
2179         return is_strict_utf8_string(s, len);
2180     }
2181 
2182     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2183                                        == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
2184     {
2185         return is_c9strict_utf8_string(s, len);
2186     }
2187 
2188     if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
2189         const U8* const send = s + len;
2190         const U8* x = first_variant;
2191 
2192         while (x < send) {
2193             STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
2194             if (UNLIKELY(! cur_len)) {
2195                 return FALSE;
2196             }
2197             x += cur_len;
2198         }
2199     }
2200 
2201     return TRUE;
2202 }
2203 
2204 /*
2205 
2206 =for apidoc is_utf8_string_loc
2207 
2208 Like C<L</is_utf8_string>> but stores the location of the failure (in the
2209 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2210 "utf8ness success") in the C<ep> pointer.
2211 
2212 See also C<L</is_utf8_string_loclen>>.
2213 
2214 =cut
2215 */
2216 
2217 #define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
2218 
2219 /*
2220 
2221 =for apidoc is_utf8_string_loclen
2222 
2223 Like C<L</is_utf8_string>> but stores the location of the failure (in the
2224 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2225 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
2226 encoded characters in the C<el> pointer.
2227 
2228 See also C<L</is_utf8_string_loc>>.
2229 
2230 =cut
2231 */
2232 
2233 PERL_STATIC_INLINE bool
Perl_is_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)2234 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
2235 {
2236     const U8 * first_variant;
2237 
2238     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
2239 
2240     if (len == 0) {
2241         len = strlen((const char *) s);
2242     }
2243 
2244     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2245         if (el)
2246             *el = len;
2247 
2248         if (ep) {
2249             *ep = s + len;
2250         }
2251 
2252         return TRUE;
2253     }
2254 
2255     {
2256         const U8* const send = s + len;
2257         const U8* x = first_variant;
2258         STRLEN outlen = first_variant - s;
2259 
2260         while (x < send) {
2261             const STRLEN cur_len = isUTF8_CHAR(x, send);
2262             if (UNLIKELY(! cur_len)) {
2263                 break;
2264             }
2265             x += cur_len;
2266             outlen++;
2267         }
2268 
2269         if (el)
2270             *el = outlen;
2271 
2272         if (ep) {
2273             *ep = x;
2274         }
2275 
2276         return (x == send);
2277     }
2278 }
2279 
2280 /* The perl core arranges to never call the DFA below without there being at
2281  * least one byte available to look at.  This allows the DFA to use a do {}
2282  * while loop which means that calling it with a UTF-8 invariant has a single
2283  * conditional, same as the calling code checking for invariance ahead of time.
2284  * And having the calling code remove that conditional speeds up by that
2285  * conditional, the case where it wasn't invariant.  So there's no reason to
2286  * check before caling this.
2287  *
2288  * But we don't know this for non-core calls, so have to retain the check for
2289  * them. */
2290 #ifdef PERL_CORE
2291 #  define PERL_NON_CORE_CHECK_EMPTY(s,e)  assert((e) > (s))
2292 #else
2293 #  define PERL_NON_CORE_CHECK_EMPTY(s,e)  if ((e) <= (s)) return FALSE
2294 #endif
2295 
2296 /*
2297  * DFA for checking input is valid UTF-8 syntax.
2298  *
2299  * This uses adaptations of the table and algorithm given in
2300  * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2301  * documentation of the original version.  A copyright notice for the original
2302  * version is given at the beginning of this file.  The Perl adaptations are
2303  * documented at the definition of PL_extended_utf8_dfa_tab[].
2304  *
2305  * This dfa is fast.  There are three exit conditions:
2306  *  1) a well-formed code point, acceptable to the table
2307  *  2) the beginning bytes of an incomplete character, whose completion might
2308  *     or might not be acceptable
2309  *  3) unacceptable to the table.  Some of the adaptations have certain,
2310  *     hopefully less likely to occur, legal inputs be unacceptable to the
2311  *     table, so these must be sorted out afterwards.
2312  *
2313  * This macro is a complete implementation of the code executing the DFA.  It
2314  * is passed the input sequence bounds and the table to use, and what to do
2315  * for each of the exit conditions.  There are three canned actions, likely to
2316  * be the ones you want:
2317  *      DFA_RETURN_SUCCESS_
2318  *      DFA_RETURN_FAILURE_
2319  *      DFA_GOTO_TEASE_APART_FF_
2320  *
2321  * You pass a parameter giving the action to take for each of the three
2322  * possible exit conditions:
2323  *
2324  * 'accept_action'  This is executed when the DFA accepts the input.
2325  *                  DFA_RETURN_SUCCESS_ is the most likely candidate.
2326  * 'reject_action'  This is executed when the DFA rejects the input.
2327  *                  DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
2328  *                  you have written code to distinguish the rejecting state
2329  *                  results.  Because it happens in several places, and
2330  *                  involves #ifdefs, the special action
2331  *                  DFA_GOTO_TEASE_APART_FF_ is what you want with
2332  *                  PL_extended_utf8_dfa_tab.  On platforms without
2333  *                  EXTRA_LONG_UTF8, there is no need to tease anything apart,
2334  *                  so this evaluates to DFA_RETURN_FAILURE_; otherwise you
2335  *                  need to have a label 'tease_apart_FF' that it will transfer
2336  *                  to.
2337  * 'incomplete_char_action'  This is executed when the DFA ran off the end
2338  *                  before accepting or rejecting the input.
2339  *                  DFA_RETURN_FAILURE_ is the likely action, but you could
2340  *                  have a 'goto', or NOOP.  In the latter case the DFA drops
2341  *                  off the end, and you place your code to handle this case
2342  *                  immediately after it.
2343  */
2344 
2345 #define DFA_RETURN_SUCCESS_      return s - s0
2346 #define DFA_RETURN_FAILURE_      return 0
2347 #ifdef HAS_EXTRA_LONG_UTF8
2348 #  define DFA_TEASE_APART_FF_  goto tease_apart_FF
2349 #else
2350 #  define DFA_TEASE_APART_FF_  DFA_RETURN_FAILURE_
2351 #endif
2352 
2353 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab,                               \
2354                               accept_action,                                \
2355                               reject_action,                                \
2356                               incomplete_char_action)                       \
2357     STMT_START {                                                            \
2358         const U8 * s = s0;                                                  \
2359         const U8 * e_ = e;                                                  \
2360         UV state = 0;                                                       \
2361                                                                             \
2362         PERL_NON_CORE_CHECK_EMPTY(s, e_);                                   \
2363                                                                             \
2364         do {                                                                \
2365             state = dfa_tab[256 + state + dfa_tab[*s]];                     \
2366             s++;                                                            \
2367                                                                             \
2368             if (state == 0) {   /* Accepting state */                       \
2369                 accept_action;                                              \
2370             }                                                               \
2371                                                                             \
2372             if (UNLIKELY(state == 1)) { /* Rejecting state */               \
2373                 reject_action;                                              \
2374             }                                                               \
2375         } while (s < e_);                                                   \
2376                                                                             \
2377         /* Here, dropped out of loop before end-of-char */                  \
2378         incomplete_char_action;                                             \
2379     } STMT_END
2380 
2381 
2382 /*
2383 
2384 =for apidoc isUTF8_CHAR
2385 
2386 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2387 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2388 that represents some code point; otherwise it evaluates to 0.  If non-zero, the
2389 value gives how many bytes starting at C<s> comprise the code point's
2390 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
2391 form the first code point in C<s>, are not examined.
2392 
2393 The code point can be any that will fit in an IV on this machine, using Perl's
2394 extension to official UTF-8 to represent those higher than the Unicode maximum
2395 of 0x10FFFF.  That means that this macro is used to efficiently decide if the
2396 next few bytes in C<s> is legal UTF-8 for a single character.
2397 
2398 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
2399 defined by Unicode to be fully interchangeable across applications;
2400 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
2401 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
2402 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
2403 
2404 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
2405 C<L</is_utf8_string_loclen>> to check entire strings.
2406 
2407 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
2408 machines) is a valid UTF-8 character.
2409 
2410 =cut
2411 
2412 This uses an adaptation of the table and algorithm given in
2413 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2414 documentation of the original version.  A copyright notice for the original
2415 version is given at the beginning of this file.  The Perl adaptation is
2416 documented at the definition of PL_extended_utf8_dfa_tab[].
2417 */
2418 
2419 PERL_STATIC_INLINE Size_t
Perl_isUTF8_CHAR(const U8 * const s0,const U8 * const e)2420 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
2421 {
2422     PERL_ARGS_ASSERT_ISUTF8_CHAR;
2423 
2424     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2425                           DFA_RETURN_SUCCESS_,
2426                           DFA_TEASE_APART_FF_,
2427                           DFA_RETURN_FAILURE_);
2428 
2429     /* Here, we didn't return success, but dropped out of the loop.  In the
2430      * case of PL_extended_utf8_dfa_tab, this means the input is either
2431      * malformed, or the start byte was FF on a platform that the dfa doesn't
2432      * handle FF's.  Call a helper function. */
2433 
2434 #ifdef HAS_EXTRA_LONG_UTF8
2435 
2436   tease_apart_FF:
2437 
2438     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2439      * either malformed, or was for the largest possible start byte, which we
2440      * now check, not inline */
2441     if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
2442         return 0;
2443     }
2444 
2445     return is_utf8_FF_helper_(s0, e,
2446                               FALSE /* require full, not partial char */
2447                              );
2448 #endif
2449 
2450 }
2451 
2452 /*
2453 
2454 =for apidoc isSTRICT_UTF8_CHAR
2455 
2456 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2457 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
2458 Unicode code point completely acceptable for open interchange between all
2459 applications; otherwise it evaluates to 0.  If non-zero, the value gives how
2460 many bytes starting at C<s> comprise the code point's representation.  Any
2461 bytes remaining before C<e>, but beyond the ones needed to form the first code
2462 point in C<s>, are not examined.
2463 
2464 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
2465 be a surrogate nor a non-character code point.  Thus this excludes any code
2466 point from Perl's extended UTF-8.
2467 
2468 This is used to efficiently decide if the next few bytes in C<s> is
2469 legal Unicode-acceptable UTF-8 for a single character.
2470 
2471 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
2472 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
2473 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
2474 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
2475 
2476 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
2477 C<L</is_strict_utf8_string_loclen>> to check entire strings.
2478 
2479 =cut
2480 
2481 This uses an adaptation of the tables and algorithm given in
2482 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2483 documentation of the original version.  A copyright notice for the original
2484 version is given at the beginning of this file.  The Perl adaptation is
2485 documented at the definition of strict_extended_utf8_dfa_tab[].
2486 
2487 */
2488 
2489 PERL_STATIC_INLINE Size_t
Perl_isSTRICT_UTF8_CHAR(const U8 * const s0,const U8 * const e)2490 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
2491 {
2492     PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
2493 
2494     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
2495                           DFA_RETURN_SUCCESS_,
2496                           goto check_hanguls,
2497                           DFA_RETURN_FAILURE_);
2498   check_hanguls:
2499 
2500     /* Here, we didn't return success, but dropped out of the loop.  In the
2501      * case of PL_strict_utf8_dfa_tab, this means the input is either
2502      * malformed, or was for certain Hanguls; handle them specially */
2503 
2504     /* The dfa above drops out for incomplete or illegal inputs, and certain
2505      * legal Hanguls; check and return accordingly */
2506     return is_HANGUL_ED_utf8_safe(s0, e);
2507 }
2508 
2509 /*
2510 
2511 =for apidoc isC9_STRICT_UTF8_CHAR
2512 
2513 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2514 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
2515 Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
2516 the value gives how many bytes starting at C<s> comprise the code point's
2517 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
2518 form the first code point in C<s>, are not examined.
2519 
2520 The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
2521 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
2522 code points.  This corresponds to
2523 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
2524 which said that non-character code points are merely discouraged rather than
2525 completely forbidden in open interchange.  See
2526 L<perlunicode/Noncharacter code points>.
2527 
2528 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
2529 C<L</isUTF8_CHAR_flags>> for a more customized definition.
2530 
2531 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
2532 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
2533 
2534 =cut
2535 
2536 This uses an adaptation of the tables and algorithm given in
2537 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
2538 documentation of the original version.  A copyright notice for the original
2539 version is given at the beginning of this file.  The Perl adaptation is
2540 documented at the definition of PL_c9_utf8_dfa_tab[].
2541 
2542 */
2543 
2544 PERL_STATIC_INLINE Size_t
Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0,const U8 * const e)2545 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
2546 {
2547     PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
2548 
2549     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
2550                           DFA_RETURN_SUCCESS_,
2551                           DFA_RETURN_FAILURE_,
2552                           DFA_RETURN_FAILURE_);
2553 }
2554 
2555 /*
2556 
2557 =for apidoc is_strict_utf8_string_loc
2558 
2559 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
2560 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2561 "utf8ness success") in the C<ep> pointer.
2562 
2563 See also C<L</is_strict_utf8_string_loclen>>.
2564 
2565 =cut
2566 */
2567 
2568 #define is_strict_utf8_string_loc(s, len, ep)                               \
2569                                 is_strict_utf8_string_loclen(s, len, ep, 0)
2570 
2571 /*
2572 
2573 =for apidoc is_strict_utf8_string_loclen
2574 
2575 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
2576 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2577 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
2578 encoded characters in the C<el> pointer.
2579 
2580 See also C<L</is_strict_utf8_string_loc>>.
2581 
2582 =cut
2583 */
2584 
2585 PERL_STATIC_INLINE bool
Perl_is_strict_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)2586 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
2587 {
2588     const U8 * first_variant;
2589 
2590     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
2591 
2592     if (len == 0) {
2593         len = strlen((const char *) s);
2594     }
2595 
2596     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2597         if (el)
2598             *el = len;
2599 
2600         if (ep) {
2601             *ep = s + len;
2602         }
2603 
2604         return TRUE;
2605     }
2606 
2607     {
2608         const U8* const send = s + len;
2609         const U8* x = first_variant;
2610         STRLEN outlen = first_variant - s;
2611 
2612         while (x < send) {
2613             const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
2614             if (UNLIKELY(! cur_len)) {
2615                 break;
2616             }
2617             x += cur_len;
2618             outlen++;
2619         }
2620 
2621         if (el)
2622             *el = outlen;
2623 
2624         if (ep) {
2625             *ep = x;
2626         }
2627 
2628         return (x == send);
2629     }
2630 }
2631 
2632 /*
2633 
2634 =for apidoc is_c9strict_utf8_string_loc
2635 
2636 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
2637 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2638 "utf8ness success") in the C<ep> pointer.
2639 
2640 See also C<L</is_c9strict_utf8_string_loclen>>.
2641 
2642 =cut
2643 */
2644 
2645 #define is_c9strict_utf8_string_loc(s, len, ep)	                            \
2646                             is_c9strict_utf8_string_loclen(s, len, ep, 0)
2647 
2648 /*
2649 
2650 =for apidoc is_c9strict_utf8_string_loclen
2651 
2652 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
2653 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2654 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
2655 characters in the C<el> pointer.
2656 
2657 See also C<L</is_c9strict_utf8_string_loc>>.
2658 
2659 =cut
2660 */
2661 
2662 PERL_STATIC_INLINE bool
Perl_is_c9strict_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)2663 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
2664 {
2665     const U8 * first_variant;
2666 
2667     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
2668 
2669     if (len == 0) {
2670         len = strlen((const char *) s);
2671     }
2672 
2673     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2674         if (el)
2675             *el = len;
2676 
2677         if (ep) {
2678             *ep = s + len;
2679         }
2680 
2681         return TRUE;
2682     }
2683 
2684     {
2685         const U8* const send = s + len;
2686         const U8* x = first_variant;
2687         STRLEN outlen = first_variant - s;
2688 
2689         while (x < send) {
2690             const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
2691             if (UNLIKELY(! cur_len)) {
2692                 break;
2693             }
2694             x += cur_len;
2695             outlen++;
2696         }
2697 
2698         if (el)
2699             *el = outlen;
2700 
2701         if (ep) {
2702             *ep = x;
2703         }
2704 
2705         return (x == send);
2706     }
2707 }
2708 
2709 /*
2710 
2711 =for apidoc is_utf8_string_loc_flags
2712 
2713 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
2714 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2715 "utf8ness success") in the C<ep> pointer.
2716 
2717 See also C<L</is_utf8_string_loclen_flags>>.
2718 
2719 =cut
2720 */
2721 
2722 #define is_utf8_string_loc_flags(s, len, ep, flags)                         \
2723                         is_utf8_string_loclen_flags(s, len, ep, 0, flags)
2724 
2725 
2726 /* The above 3 actual functions could have been moved into the more general one
2727  * just below, and made #defines that call it with the right 'flags'.  They are
2728  * currently kept separate to increase their chances of getting inlined */
2729 
2730 /*
2731 
2732 =for apidoc is_utf8_string_loclen_flags
2733 
2734 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
2735 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
2736 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
2737 encoded characters in the C<el> pointer.
2738 
2739 See also C<L</is_utf8_string_loc_flags>>.
2740 
2741 =cut
2742 */
2743 
2744 PERL_STATIC_INLINE bool
Perl_is_utf8_string_loclen_flags(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el,const U32 flags)2745 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
2746 {
2747     const U8 * first_variant;
2748 
2749     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
2750     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2751                           |UTF8_DISALLOW_PERL_EXTENDED)));
2752 
2753     if (flags == 0) {
2754         return is_utf8_string_loclen(s, len, ep, el);
2755     }
2756 
2757     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2758                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
2759     {
2760         return is_strict_utf8_string_loclen(s, len, ep, el);
2761     }
2762 
2763     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
2764                                     == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
2765     {
2766         return is_c9strict_utf8_string_loclen(s, len, ep, el);
2767     }
2768 
2769     if (len == 0) {
2770         len = strlen((const char *) s);
2771     }
2772 
2773     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
2774         if (el)
2775             *el = len;
2776 
2777         if (ep) {
2778             *ep = s + len;
2779         }
2780 
2781         return TRUE;
2782     }
2783 
2784     {
2785         const U8* send = s + len;
2786         const U8* x = first_variant;
2787         STRLEN outlen = first_variant - s;
2788 
2789         while (x < send) {
2790             const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
2791             if (UNLIKELY(! cur_len)) {
2792                 break;
2793             }
2794             x += cur_len;
2795             outlen++;
2796         }
2797 
2798         if (el)
2799             *el = outlen;
2800 
2801         if (ep) {
2802             *ep = x;
2803         }
2804 
2805         return (x == send);
2806     }
2807 }
2808 
2809 /*
2810 =for apidoc utf8_distance
2811 
2812 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
2813 and C<b>.
2814 
2815 WARNING: use only if you *know* that the pointers point inside the
2816 same UTF-8 buffer.
2817 
2818 =cut
2819 */
2820 
2821 PERL_STATIC_INLINE IV
Perl_utf8_distance(pTHX_ const U8 * a,const U8 * b)2822 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
2823 {
2824     PERL_ARGS_ASSERT_UTF8_DISTANCE;
2825 
2826     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2827 }
2828 
2829 /*
2830 =for apidoc utf8_hop
2831 
2832 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2833 forward (if C<off> is positive) or backward (if negative).  C<s> does not need
2834 to be pointing to the starting byte of a character.  If it isn't, one count of
2835 C<off> will be used up to get to the start of the next character for forward
2836 hops, and to the start of the current character for negative ones.
2837 
2838 WARNING: Prefer L</utf8_hop_safe> to this one.
2839 
2840 Do NOT use this function unless you B<know> C<off> is within
2841 the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned
2842 on the first byte of a character or just after the last byte of a character.
2843 
2844 =cut
2845 */
2846 
2847 PERL_STATIC_INLINE U8 *
Perl_utf8_hop(const U8 * s,SSize_t off)2848 Perl_utf8_hop(const U8 *s, SSize_t off)
2849 {
2850     PERL_ARGS_ASSERT_UTF8_HOP;
2851 
2852     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2853      * the XXX bitops (especially ~) can create illegal UTF-8.
2854      * In other words: in Perl UTF-8 is not just for Unicode. */
2855 
2856     if (off > 0) {
2857 
2858         /* Get to next non-continuation byte */
2859         if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2860             do {
2861                 s++;
2862             }
2863             while (UTF8_IS_CONTINUATION(*s));
2864             off--;
2865         }
2866 
2867         while (off--)
2868             s += UTF8SKIP(s);
2869     }
2870     else {
2871         while (off++) {
2872             s--;
2873             while (UTF8_IS_CONTINUATION(*s))
2874                 s--;
2875         }
2876     }
2877 
2878     GCC_DIAG_IGNORE(-Wcast-qual)
2879     return (U8 *)s;
2880     GCC_DIAG_RESTORE
2881 }
2882 
2883 /*
2884 =for apidoc utf8_hop_forward
2885 
2886 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2887 forward.  C<s> does not need to be pointing to the starting byte of a
2888 character.  If it isn't, one count of C<off> will be used up to get to the
2889 start of the next character.
2890 
2891 C<off> must be non-negative.
2892 
2893 C<s> must be before or equal to C<end>.
2894 
2895 When moving forward it will not move beyond C<end>.
2896 
2897 Will not exceed this limit even if the string is not valid "UTF-8".
2898 
2899 =cut
2900 */
2901 
2902 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_forward(const U8 * s,SSize_t off,const U8 * end)2903 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2904 {
2905     PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2906 
2907     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2908      * the bitops (especially ~) can create illegal UTF-8.
2909      * In other words: in Perl UTF-8 is not just for Unicode. */
2910 
2911     assert(s <= end);
2912     assert(off >= 0);
2913 
2914     if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2915         /* Get to next non-continuation byte */
2916         do {
2917             s++;
2918         }
2919         while (UTF8_IS_CONTINUATION(*s));
2920         off--;
2921     }
2922 
2923     while (off--) {
2924         STRLEN skip = UTF8SKIP(s);
2925         if ((STRLEN)(end - s) <= skip) {
2926             GCC_DIAG_IGNORE(-Wcast-qual)
2927             return (U8 *)end;
2928             GCC_DIAG_RESTORE
2929         }
2930         s += skip;
2931     }
2932 
2933     GCC_DIAG_IGNORE(-Wcast-qual)
2934     return (U8 *)s;
2935     GCC_DIAG_RESTORE
2936 }
2937 
2938 /*
2939 =for apidoc utf8_hop_back
2940 
2941 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2942 backward.  C<s> does not need to be pointing to the starting byte of a
2943 character.  If it isn't, one count of C<off> will be used up to get to that
2944 start.
2945 
2946 C<off> must be non-positive.
2947 
2948 C<s> must be after or equal to C<start>.
2949 
2950 When moving backward it will not move before C<start>.
2951 
2952 Will not exceed this limit even if the string is not valid "UTF-8".
2953 
2954 =cut
2955 */
2956 
2957 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_back(const U8 * s,SSize_t off,const U8 * start)2958 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2959 {
2960     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2961 
2962     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2963      * the bitops (especially ~) can create illegal UTF-8.
2964      * In other words: in Perl UTF-8 is not just for Unicode. */
2965 
2966     assert(start <= s);
2967     assert(off <= 0);
2968 
2969     /* Note: if we know that the input is well-formed, we can do per-word
2970      * hop-back.  Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
2971      * that.  But it was reverted because doing per-word has some
2972      * start-up/tear-down overhead, so only makes sense if the distance to be
2973      * moved is large, and core perl doesn't currently move more than a few
2974      * characters at a time.  You can reinstate it if it does become
2975      * advantageous. */
2976     while (off++ && s > start) {
2977         do {
2978             s--;
2979         } while (s > start && UTF8_IS_CONTINUATION(*s));
2980     }
2981 
2982     GCC_DIAG_IGNORE(-Wcast-qual)
2983     return (U8 *)s;
2984     GCC_DIAG_RESTORE
2985 }
2986 
2987 /*
2988 =for apidoc utf8_hop_safe
2989 
2990 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2991 either forward or backward.  C<s> does not need to be pointing to the starting
2992 byte of a character.  If it isn't, one count of C<off> will be used up to get
2993 to the start of the next character for forward hops, and to the start of the
2994 current character for negative ones.
2995 
2996 When moving backward it will not move before C<start>.
2997 
2998 When moving forward it will not move beyond C<end>.
2999 
3000 Will not exceed those limits even if the string is not valid "UTF-8".
3001 
3002 =cut
3003 */
3004 
3005 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_safe(const U8 * s,SSize_t off,const U8 * start,const U8 * end)3006 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
3007 {
3008     PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
3009 
3010     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
3011      * the bitops (especially ~) can create illegal UTF-8.
3012      * In other words: in Perl UTF-8 is not just for Unicode. */
3013 
3014     assert(start <= s && s <= end);
3015 
3016     if (off >= 0) {
3017         return utf8_hop_forward(s, off, end);
3018     }
3019     else {
3020         return utf8_hop_back(s, off, start);
3021     }
3022 }
3023 
3024 /*
3025 
3026 =for apidoc isUTF8_CHAR_flags
3027 
3028 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
3029 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
3030 that represents some code point, subject to the restrictions given by C<flags>;
3031 otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
3032 starting at C<s> comprise the code point's representation.  Any bytes remaining
3033 before C<e>, but beyond the ones needed to form the first code point in C<s>,
3034 are not examined.
3035 
3036 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
3037 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
3038 as C<L</isSTRICT_UTF8_CHAR>>;
3039 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
3040 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
3041 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
3042 understood by C<L</utf8n_to_uvchr>>, with the same meanings.
3043 
3044 The three alternative macros are for the most commonly needed validations; they
3045 are likely to run somewhat faster than this more general one, as they can be
3046 inlined into your code.
3047 
3048 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
3049 L</is_utf8_string_loclen_flags> to check entire strings.
3050 
3051 =cut
3052 */
3053 
3054 PERL_STATIC_INLINE STRLEN
Perl_isUTF8_CHAR_flags(const U8 * const s0,const U8 * const e,const U32 flags)3055 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
3056 {
3057     PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
3058     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
3059                           |UTF8_DISALLOW_PERL_EXTENDED)));
3060 
3061     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
3062                           goto check_success,
3063                           DFA_TEASE_APART_FF_,
3064                           DFA_RETURN_FAILURE_);
3065 
3066   check_success:
3067 
3068     return is_utf8_char_helper_(s0, e, flags);
3069 
3070 #ifdef HAS_EXTRA_LONG_UTF8
3071 
3072   tease_apart_FF:
3073 
3074     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
3075      * either malformed, or was for the largest possible start byte, which
3076      * indicates perl extended UTF-8, well above the Unicode maximum */
3077     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
3078         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
3079     {
3080         return 0;
3081     }
3082 
3083     /* Otherwise examine the sequence not inline */
3084     return is_utf8_FF_helper_(s0, e,
3085                               FALSE /* require full, not partial char */
3086                              );
3087 #endif
3088 
3089 }
3090 
3091 /*
3092 
3093 =for apidoc is_utf8_valid_partial_char
3094 
3095 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
3096 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
3097 points.  Otherwise, it returns 1 if there exists at least one non-empty
3098 sequence of bytes that when appended to sequence C<s>, starting at position
3099 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
3100 otherwise returns 0.
3101 
3102 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
3103 point.
3104 
3105 This is useful when a fixed-length buffer is being tested for being well-formed
3106 UTF-8, but the final few bytes in it don't comprise a full character; that is,
3107 it is split somewhere in the middle of the final code point's UTF-8
3108 representation.  (Presumably when the buffer is refreshed with the next chunk
3109 of data, the new first bytes will complete the partial code point.)   This
3110 function is used to verify that the final bytes in the current buffer are in
3111 fact the legal beginning of some code point, so that if they aren't, the
3112 failure can be signalled without having to wait for the next read.
3113 
3114 =cut
3115 */
3116 #define is_utf8_valid_partial_char(s, e)                                    \
3117                                 is_utf8_valid_partial_char_flags(s, e, 0)
3118 
3119 /*
3120 
3121 =for apidoc is_utf8_valid_partial_char_flags
3122 
3123 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
3124 or not the input is a valid UTF-8 encoded partial character, but it takes an
3125 extra parameter, C<flags>, which can further restrict which code points are
3126 considered valid.
3127 
3128 If C<flags> is 0, this behaves identically to
3129 C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
3130 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
3131 there is any sequence of bytes that can complete the input partial character in
3132 such a way that a non-prohibited character is formed, the function returns
3133 TRUE; otherwise FALSE.  Non character code points cannot be determined based on
3134 partial character input.  But many  of the other possible excluded types can be
3135 determined from just the first one or two bytes.
3136 
3137 =cut
3138  */
3139 
3140 PERL_STATIC_INLINE bool
Perl_is_utf8_valid_partial_char_flags(const U8 * const s0,const U8 * const e,const U32 flags)3141 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
3142 {
3143     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
3144     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
3145                           |UTF8_DISALLOW_PERL_EXTENDED)));
3146 
3147     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
3148                           DFA_RETURN_FAILURE_,
3149                           DFA_TEASE_APART_FF_,
3150                           NOOP);
3151 
3152     /* The NOOP above causes the DFA to drop down here iff the input was a
3153      * partial character.  flags=0 => can return TRUE immediately; otherwise we
3154      * need to check (not inline) if the partial character is the beginning of
3155      * a disallowed one */
3156     if (flags == 0) {
3157         return TRUE;
3158     }
3159 
3160     return cBOOL(is_utf8_char_helper_(s0, e, flags));
3161 
3162 #ifdef HAS_EXTRA_LONG_UTF8
3163 
3164   tease_apart_FF:
3165 
3166     /* Getting here means the input is either malformed, or, in the case of
3167      * PL_extended_utf8_dfa_tab, was for the largest possible start byte.  The
3168      * latter case has to be extended UTF-8, so can fail immediately if that is
3169      * forbidden */
3170 
3171     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
3172         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
3173     {
3174         return 0;
3175     }
3176 
3177     return is_utf8_FF_helper_(s0, e,
3178                               TRUE /* Require to be a partial character */
3179                              );
3180 #endif
3181 
3182 }
3183 
3184 /*
3185 
3186 =for apidoc is_utf8_fixed_width_buf_flags
3187 
3188 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
3189 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
3190 otherwise it returns FALSE.
3191 
3192 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
3193 without restriction.  If the final few bytes of the buffer do not form a
3194 complete code point, this will return TRUE anyway, provided that
3195 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
3196 
3197 If C<flags> in non-zero, it can be any combination of the
3198 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
3199 same meanings.
3200 
3201 This function differs from C<L</is_utf8_string_flags>> only in that the latter
3202 returns FALSE if the final few bytes of the string don't form a complete code
3203 point.
3204 
3205 =cut
3206  */
3207 #define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
3208                 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
3209 
3210 /*
3211 
3212 =for apidoc is_utf8_fixed_width_buf_loc_flags
3213 
3214 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
3215 failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
3216 to the beginning of any partial character at the end of the buffer; if there is
3217 no partial character C<*ep> will contain C<s>+C<len>.
3218 
3219 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
3220 
3221 =cut
3222 */
3223 
3224 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
3225                 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
3226 
3227 /*
3228 
3229 =for apidoc is_utf8_fixed_width_buf_loclen_flags
3230 
3231 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
3232 complete, valid characters found in the C<el> pointer.
3233 
3234 =cut
3235 */
3236 
3237 PERL_STATIC_INLINE bool
Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,STRLEN len,const U8 ** ep,STRLEN * el,const U32 flags)3238 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
3239                                        STRLEN len,
3240                                        const U8 **ep,
3241                                        STRLEN *el,
3242                                        const U32 flags)
3243 {
3244     const U8 * maybe_partial;
3245 
3246     PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
3247 
3248     if (! ep) {
3249         ep  = &maybe_partial;
3250     }
3251 
3252     /* If it's entirely valid, return that; otherwise see if the only error is
3253      * that the final few bytes are for a partial character */
3254     return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
3255            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
3256 }
3257 
3258 PERL_STATIC_INLINE UV
Perl_utf8n_to_uvchr_msgs(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors,AV ** msgs)3259 Perl_utf8n_to_uvchr_msgs(const U8 *s,
3260                          STRLEN curlen,
3261                          STRLEN *retlen,
3262                          const U32 flags,
3263                          U32 * errors,
3264                          AV ** msgs)
3265 {
3266     /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
3267      * simple cases, and, if necessary calls a helper function to deal with the
3268      * more complex ones.  Almost all well-formed non-problematic code points
3269      * are considered simple, so that it's unlikely that the helper function
3270      * will need to be called.
3271      *
3272      * This is an adaptation of the tables and algorithm given in
3273      * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
3274      * comprehensive documentation of the original version.  A copyright notice
3275      * for the original version is given at the beginning of this file.  The
3276      * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[].
3277      */
3278 
3279     const U8 * const s0 = s;
3280     const U8 * send = s0 + curlen;
3281     UV type;
3282     UV uv;
3283 
3284     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
3285 
3286     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
3287      * non-problematic code point, which can be returned immediately.
3288      * Otherwise we call a helper function to figure out the more complicated
3289      * cases. */
3290 
3291     /* No calls from core pass in an empty string; non-core need a check */
3292 #ifdef PERL_CORE
3293     assert(curlen > 0);
3294 #else
3295     if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
3296                                                         flags, errors, msgs);
3297 #endif
3298 
3299     type = PL_strict_utf8_dfa_tab[*s];
3300 
3301     /* The table is structured so that 'type' is 0 iff the input byte is
3302      * represented identically regardless of the UTF-8ness of the string */
3303     if (type == 0) {   /* UTF-8 invariants are returned unchanged */
3304         uv = *s;
3305     }
3306     else {
3307         UV state = PL_strict_utf8_dfa_tab[256 + type];
3308         uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
3309 
3310         while (LIKELY(state != 1) && ++s < send) {
3311             type  = PL_strict_utf8_dfa_tab[*s];
3312             state = PL_strict_utf8_dfa_tab[256 + state + type];
3313 
3314             uv = UTF8_ACCUMULATE(uv, *s);
3315 
3316             if (state == 0) {
3317 #ifdef EBCDIC
3318                 uv = UNI_TO_NATIVE(uv);
3319 #endif
3320                 goto success;
3321             }
3322         }
3323 
3324         /* Here is potentially problematic.  Use the full mechanism */
3325         return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
3326                                            errors, msgs);
3327     }
3328 
3329   success:
3330     if (retlen) {
3331         *retlen = s - s0 + 1;
3332     }
3333     if (errors) {
3334         *errors = 0;
3335     }
3336     if (msgs) {
3337         *msgs = NULL;
3338     }
3339 
3340     return uv;
3341 }
3342 
3343 PERL_STATIC_INLINE UV
Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 * s,const U8 * send,STRLEN * retlen)3344 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
3345 {
3346     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
3347 
3348     assert(s < send);
3349 
3350     if (! ckWARN_d(WARN_UTF8)) {
3351 
3352         /* EMPTY is not really allowed, and asserts on debugging builds.  But
3353          * on non-debugging we have to deal with it, and this causes it to
3354          * return the REPLACEMENT CHARACTER, as the documentation indicates */
3355         return utf8n_to_uvchr(s, send - s, retlen,
3356                               (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
3357     }
3358     else {
3359         UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
3360         if (retlen && ret == 0 && (send <= s || *s != '\0')) {
3361             *retlen = (STRLEN) -1;
3362         }
3363 
3364         return ret;
3365     }
3366 }
3367 
3368 /* ------------------------------- perl.h ----------------------------- */
3369 
3370 /*
3371 =for apidoc_section $utility
3372 
3373 =for apidoc is_safe_syscall
3374 
3375 Test that the given C<pv> (with length C<len>) doesn't contain any internal
3376 C<NUL> characters.
3377 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
3378 category, and return FALSE.
3379 
3380 Return TRUE if the name is safe.
3381 
3382 C<what> and C<op_name> are used in any warning.
3383 
3384 Used by the C<IS_SAFE_SYSCALL()> macro.
3385 
3386 =cut
3387 */
3388 
3389 PERL_STATIC_INLINE bool
Perl_is_safe_syscall(pTHX_ const char * pv,STRLEN len,const char * what,const char * op_name)3390 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
3391 {
3392     /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
3393      * perl itself uses xce*() functions which accept 8-bit strings.
3394      */
3395 
3396     PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
3397 
3398     if (len > 1) {
3399         char *null_at;
3400         if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
3401                 SETERRNO(ENOENT, LIB_INVARG);
3402                 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
3403                                    "Invalid \\0 character in %s for %s: %s\\0%s",
3404                                    what, op_name, pv, null_at+1);
3405                 return FALSE;
3406         }
3407     }
3408 
3409     return TRUE;
3410 }
3411 
3412 /*
3413 
3414 Return true if the supplied filename has a newline character
3415 immediately before the first (hopefully only) NUL.
3416 
3417 My original look at this incorrectly used the len from SvPV(), but
3418 that's incorrect, since we allow for a NUL in pv[len-1].
3419 
3420 So instead, strlen() and work from there.
3421 
3422 This allow for the user reading a filename, forgetting to chomp it,
3423 then calling:
3424 
3425   open my $foo, "$file\0";
3426 
3427 */
3428 
3429 #ifdef PERL_CORE
3430 
3431 PERL_STATIC_INLINE bool
S_should_warn_nl(const char * pv)3432 S_should_warn_nl(const char *pv)
3433 {
3434     STRLEN len;
3435 
3436     PERL_ARGS_ASSERT_SHOULD_WARN_NL;
3437 
3438     len = strlen(pv);
3439 
3440     return len > 0 && pv[len-1] == '\n';
3441 }
3442 
3443 #endif
3444 
3445 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
3446 
3447 PERL_STATIC_INLINE bool
S_lossless_NV_to_IV(const NV nv,IV * ivp)3448 S_lossless_NV_to_IV(const NV nv, IV *ivp)
3449 {
3450     /* This function determines if the input NV 'nv' may be converted without
3451      * loss of data to an IV.  If not, it returns FALSE taking no other action.
3452      * But if it is possible, it does the conversion, returning TRUE, and
3453      * storing the converted result in '*ivp' */
3454 
3455     PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
3456 
3457 #  if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
3458     /* Normally any comparison with a NaN returns false; if we can't rely
3459      * on that behaviour, check explicitly */
3460     if (UNLIKELY(Perl_isnan(nv))) {
3461         return FALSE;
3462     }
3463 #  endif
3464 
3465 #  ifndef NV_PRESERVES_UV
3466     STATIC_ASSERT_STMT(((UV)1 << NV_PRESERVES_UV_BITS) - 1 <= (UV)IV_MAX);
3467 #  endif
3468 
3469     /* Written this way so that with an always-false NaN comparison we
3470      * return false */
3471     if (
3472 #  ifdef NV_PRESERVES_UV
3473         LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1) &&
3474 #  else
3475         /* If the condition below is not satisfied, lower bits of nv's
3476          * integral part is already lost and accurate conversion to integer
3477          * is impossible.
3478          * Note this should be consistent with S_sv_2iuv_common in sv.c. */
3479         Perl_fabs(nv) < (NV) ((UV)1 << NV_PRESERVES_UV_BITS) &&
3480 #  endif
3481         (IV) nv == nv) {
3482         *ivp = (IV) nv;
3483         return TRUE;
3484     }
3485     return FALSE;
3486 }
3487 
3488 #endif
3489 
3490 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
3491 
3492 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
3493 
3494 #define MAX_CHARSET_NAME_LENGTH 2
3495 
3496 PERL_STATIC_INLINE const char *
S_get_regex_charset_name(const U32 flags,STRLEN * const lenp)3497 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
3498 {
3499     PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
3500 
3501     /* Returns a string that corresponds to the name of the regex character set
3502      * given by 'flags', and *lenp is set the length of that string, which
3503      * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
3504 
3505     *lenp = 1;
3506     switch (get_regex_charset(flags)) {
3507         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
3508         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
3509         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
3510         case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
3511         case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
3512             *lenp = 2;
3513             return ASCII_MORE_RESTRICT_PAT_MODS;
3514     }
3515     /* The NOT_REACHED; hides an assert() which has a rather complex
3516      * definition in perl.h. */
3517     NOT_REACHED; /* NOTREACHED */
3518     return "?";	    /* Unknown */
3519 }
3520 
3521 #endif
3522 
3523 /*
3524 
3525 Return false if any get magic is on the SV other than taint magic.
3526 
3527 */
3528 
3529 PERL_STATIC_INLINE bool
Perl_sv_only_taint_gmagic(SV * sv)3530 Perl_sv_only_taint_gmagic(SV *sv)
3531 {
3532     MAGIC *mg = SvMAGIC(sv);
3533 
3534     PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
3535 
3536     while (mg) {
3537         if (mg->mg_type != PERL_MAGIC_taint
3538             && !(mg->mg_flags & MGf_GSKIP)
3539             && mg->mg_virtual->svt_get) {
3540             return FALSE;
3541         }
3542         mg = mg->mg_moremagic;
3543     }
3544 
3545     return TRUE;
3546 }
3547 
3548 /* ------------------ cop.h ------------------------------------------- */
3549 
3550 /* implement GIMME_V() macro */
3551 
3552 PERL_STATIC_INLINE U8
Perl_gimme_V(pTHX)3553 Perl_gimme_V(pTHX)
3554 {
3555     I32 cxix;
3556     U8  gimme = (PL_op->op_flags & OPf_WANT);
3557 
3558     if (gimme)
3559         return gimme;
3560     cxix = PL_curstackinfo->si_cxsubix;
3561     if (cxix < 0)
3562         return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
3563     assert(cxstack[cxix].blk_gimme & G_WANT);
3564     return (cxstack[cxix].blk_gimme & G_WANT);
3565 }
3566 
3567 
3568 /* Enter a block. Push a new base context and return its address. */
3569 
3570 PERL_STATIC_INLINE PERL_CONTEXT *
Perl_cx_pushblock(pTHX_ U8 type,U8 gimme,SV ** sp,I32 saveix)3571 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
3572 {
3573     PERL_CONTEXT * cx;
3574 
3575     PERL_ARGS_ASSERT_CX_PUSHBLOCK;
3576 
3577     CXINC;
3578     cx = CX_CUR();
3579     cx->cx_type        = type;
3580     cx->blk_gimme      = gimme;
3581     cx->blk_oldsaveix  = saveix;
3582     cx->blk_oldsp      = (Stack_off_t)(sp - PL_stack_base);
3583     assert(cxstack_ix <= 0
3584             || CxTYPE(cx-1) == CXt_SUBST
3585             || cx->blk_oldsp >= (cx-1)->blk_oldsp);
3586     cx->blk_oldcop     = PL_curcop;
3587     cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
3588     cx->blk_oldscopesp = PL_scopestack_ix;
3589     cx->blk_oldpm      = PL_curpm;
3590     cx->blk_old_tmpsfloor = PL_tmps_floor;
3591 
3592     PL_tmps_floor        = PL_tmps_ix;
3593     CX_DEBUG(cx, "PUSH");
3594     return cx;
3595 }
3596 
3597 
3598 /* Exit a block (RETURN and LAST). */
3599 
3600 PERL_STATIC_INLINE void
Perl_cx_popblock(pTHX_ PERL_CONTEXT * cx)3601 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
3602 {
3603     PERL_ARGS_ASSERT_CX_POPBLOCK;
3604 
3605     CX_DEBUG(cx, "POP");
3606     /* these 3 are common to cx_popblock and cx_topblock */
3607     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
3608     PL_scopestack_ix = cx->blk_oldscopesp;
3609     PL_curpm         = cx->blk_oldpm;
3610 
3611     /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
3612      * and leaves a CX entry lying around for repeated use, so
3613      * skip for multicall */                  \
3614     assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
3615             || PL_savestack_ix == cx->blk_oldsaveix);
3616     PL_curcop     = cx->blk_oldcop;
3617     PL_tmps_floor = cx->blk_old_tmpsfloor;
3618 }
3619 
3620 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
3621  * Whereas cx_popblock() restores the state to the point just before
3622  * cx_pushblock() was called,  cx_topblock() restores it to the point just
3623  * *after* cx_pushblock() was called. */
3624 
3625 PERL_STATIC_INLINE void
Perl_cx_topblock(pTHX_ PERL_CONTEXT * cx)3626 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
3627 {
3628     PERL_ARGS_ASSERT_CX_TOPBLOCK;
3629 
3630     CX_DEBUG(cx, "TOP");
3631     /* these 3 are common to cx_popblock and cx_topblock */
3632     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
3633     PL_scopestack_ix = cx->blk_oldscopesp;
3634     PL_curpm         = cx->blk_oldpm;
3635     Perl_rpp_popfree_to(aTHX_ PL_stack_base + cx->blk_oldsp);
3636 }
3637 
3638 
3639 PERL_STATIC_INLINE void
Perl_cx_pushsub(pTHX_ PERL_CONTEXT * cx,CV * cv,OP * retop,bool hasargs)3640 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
3641 {
3642     U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
3643 
3644     PERL_ARGS_ASSERT_CX_PUSHSUB;
3645 
3646     PERL_DTRACE_PROBE_ENTRY(cv);
3647     cx->blk_sub.old_cxsubix     = PL_curstackinfo->si_cxsubix;
3648     PL_curstackinfo->si_cxsubix = (I32)(cx - PL_curstackinfo->si_cxstack);
3649     cx->blk_sub.cv = cv;
3650     cx->blk_sub.olddepth = CvDEPTH(cv);
3651     cx->blk_sub.prevcomppad = PL_comppad;
3652     cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
3653     cx->blk_sub.retop = retop;
3654     SvREFCNT_inc_simple_void_NN(cv);
3655     cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
3656 }
3657 
3658 
3659 /* subsets of cx_popsub() */
3660 
3661 PERL_STATIC_INLINE void
Perl_cx_popsub_common(pTHX_ PERL_CONTEXT * cx)3662 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
3663 {
3664     CV *cv;
3665 
3666     PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
3667     assert(CxTYPE(cx) == CXt_SUB);
3668 
3669     PL_comppad = cx->blk_sub.prevcomppad;
3670     PL_curpad = LIKELY(PL_comppad != NULL) ? AvARRAY(PL_comppad) : NULL;
3671     cv = cx->blk_sub.cv;
3672     CvDEPTH(cv) = cx->blk_sub.olddepth;
3673     cx->blk_sub.cv = NULL;
3674     SvREFCNT_dec(cv);
3675     PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
3676 }
3677 
3678 
3679 /* handle the @_ part of leaving a sub */
3680 
3681 PERL_STATIC_INLINE void
Perl_cx_popsub_args(pTHX_ PERL_CONTEXT * cx)3682 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
3683 {
3684     AV *av;
3685 
3686     PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
3687     assert(CxTYPE(cx) == CXt_SUB);
3688     assert(AvARRAY(MUTABLE_AV(
3689         PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
3690                 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
3691 
3692     CX_POP_SAVEARRAY(cx);
3693     av = MUTABLE_AV(PAD_SVl(0));
3694     if (!SvMAGICAL(av) && SvREFCNT(av) == 1
3695 #ifndef PERL_RC_STACK
3696         && !AvREAL(av)
3697 #endif
3698     )
3699         clear_defarray_simple(av);
3700     else
3701         /* abandon @_ if it got reified */
3702         clear_defarray(av, 0);
3703 }
3704 
3705 
3706 PERL_STATIC_INLINE void
Perl_cx_popsub(pTHX_ PERL_CONTEXT * cx)3707 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
3708 {
3709     PERL_ARGS_ASSERT_CX_POPSUB;
3710     assert(CxTYPE(cx) == CXt_SUB);
3711 
3712     PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
3713 
3714     if (CxHASARGS(cx))
3715         cx_popsub_args(cx);
3716     cx_popsub_common(cx);
3717 }
3718 
3719 
3720 PERL_STATIC_INLINE void
Perl_cx_pushformat(pTHX_ PERL_CONTEXT * cx,CV * cv,OP * retop,GV * gv)3721 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
3722 {
3723     PERL_ARGS_ASSERT_CX_PUSHFORMAT;
3724 
3725     cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
3726     PL_curstackinfo->si_cxsubix= (I32)(cx - PL_curstackinfo->si_cxstack);
3727     cx->blk_format.cv          = cv;
3728     cx->blk_format.retop       = retop;
3729     cx->blk_format.gv          = gv;
3730     cx->blk_format.dfoutgv     = PL_defoutgv;
3731     cx->blk_format.prevcomppad = PL_comppad;
3732     cx->blk_u16                = 0;
3733 
3734     SvREFCNT_inc_simple_void_NN(cv);
3735     CvDEPTH(cv)++;
3736     SvREFCNT_inc_void(cx->blk_format.dfoutgv);
3737 }
3738 
3739 
3740 PERL_STATIC_INLINE void
Perl_cx_popformat(pTHX_ PERL_CONTEXT * cx)3741 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
3742 {
3743     CV *cv;
3744     GV *dfout;
3745 
3746     PERL_ARGS_ASSERT_CX_POPFORMAT;
3747     assert(CxTYPE(cx) == CXt_FORMAT);
3748 
3749     dfout = cx->blk_format.dfoutgv;
3750     setdefout(dfout);
3751     cx->blk_format.dfoutgv = NULL;
3752     SvREFCNT_dec_NN(dfout);
3753 
3754     PL_comppad = cx->blk_format.prevcomppad;
3755     PL_curpad = LIKELY(PL_comppad != NULL) ? AvARRAY(PL_comppad) : NULL;
3756     cv = cx->blk_format.cv;
3757     cx->blk_format.cv = NULL;
3758     --CvDEPTH(cv);
3759     SvREFCNT_dec_NN(cv);
3760     PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
3761 }
3762 
3763 
3764 PERL_STATIC_INLINE void
Perl_push_evalortry_common(pTHX_ PERL_CONTEXT * cx,OP * retop,SV * namesv)3765 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
3766 {
3767     cx->blk_eval.retop         = retop;
3768     cx->blk_eval.old_namesv    = namesv;
3769     cx->blk_eval.old_eval_root = PL_eval_root;
3770     cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
3771     cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
3772     cx->blk_eval.cur_top_env   = PL_top_env;
3773 
3774     assert(!(PL_in_eval     & ~ 0x3F));
3775     assert(!(PL_op->op_type & ~0x1FF));
3776     cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
3777 }
3778 
3779 PERL_STATIC_INLINE void
Perl_cx_pusheval(pTHX_ PERL_CONTEXT * cx,OP * retop,SV * namesv)3780 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
3781 {
3782     PERL_ARGS_ASSERT_CX_PUSHEVAL;
3783 
3784     Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
3785 
3786     cx->blk_eval.old_cxsubix    = PL_curstackinfo->si_cxsubix;
3787     PL_curstackinfo->si_cxsubix = (I32)(cx - PL_curstackinfo->si_cxstack);
3788 }
3789 
3790 PERL_STATIC_INLINE void
Perl_cx_pushtry(pTHX_ PERL_CONTEXT * cx,OP * retop)3791 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
3792 {
3793     PERL_ARGS_ASSERT_CX_PUSHTRY;
3794 
3795     Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
3796 
3797     /* Don't actually change it, just store the current value so it's restored
3798      * by the common popeval */
3799     cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
3800 }
3801 
3802 
3803 PERL_STATIC_INLINE void
Perl_cx_popeval(pTHX_ PERL_CONTEXT * cx)3804 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
3805 {
3806     SV *sv;
3807 
3808     PERL_ARGS_ASSERT_CX_POPEVAL;
3809     assert(CxTYPE(cx) == CXt_EVAL);
3810 
3811     PL_in_eval = CxOLD_IN_EVAL(cx);
3812     assert(!(PL_in_eval & 0xc0));
3813     PL_eval_root = cx->blk_eval.old_eval_root;
3814     sv = cx->blk_eval.cur_text;
3815     if (sv && CxEVAL_TXT_REFCNTED(cx)) {
3816         cx->blk_eval.cur_text = NULL;
3817         SvREFCNT_dec_NN(sv);
3818     }
3819 
3820     sv = cx->blk_eval.old_namesv;
3821     if (sv) {
3822         cx->blk_eval.old_namesv = NULL;
3823         SvREFCNT_dec_NN(sv);
3824     }
3825     PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
3826 }
3827 
3828 
3829 /* push a plain loop, i.e.
3830  *     { block }
3831  *     while (cond) { block }
3832  *     for (init;cond;continue) { block }
3833  * This loop can be last/redo'ed etc.
3834  */
3835 
3836 PERL_STATIC_INLINE void
Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT * cx)3837 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
3838 {
3839     PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
3840     cx->blk_loop.my_op = cLOOP;
3841 }
3842 
3843 
3844 /* push a true for loop, i.e.
3845  *     for var (list) { block }
3846  */
3847 
3848 PERL_STATIC_INLINE void
Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT * cx,void * itervarp,SV * itersave)3849 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
3850 {
3851     PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
3852 
3853     /* this one line is common with cx_pushloop_plain */
3854     cx->blk_loop.my_op = cLOOP;
3855 
3856     cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3857     cx->blk_loop.itersave      = itersave;
3858 #ifdef USE_ITHREADS
3859     cx->blk_loop.oldcomppad = PL_comppad;
3860 #endif
3861 }
3862 
3863 
3864 /* pop all loop types, including plain */
3865 
3866 PERL_STATIC_INLINE void
Perl_cx_poploop(pTHX_ PERL_CONTEXT * cx)3867 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
3868 {
3869     PERL_ARGS_ASSERT_CX_POPLOOP;
3870 
3871     assert(CxTYPE_is_LOOP(cx));
3872     if (  CxTYPE(cx) == CXt_LOOP_ARY
3873        || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3874     {
3875         /* Free ary or cur. This assumes that state_u.ary.ary
3876          * aligns with state_u.lazysv.cur. See cx_dup() */
3877         SV *sv = cx->blk_loop.state_u.lazysv.cur;
3878         cx->blk_loop.state_u.lazysv.cur = NULL;
3879         SvREFCNT_dec_NN(sv);
3880         if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3881             sv = cx->blk_loop.state_u.lazysv.end;
3882             cx->blk_loop.state_u.lazysv.end = NULL;
3883             SvREFCNT_dec_NN(sv);
3884         }
3885     }
3886     if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3887         SV *cursv;
3888         SV **svp = (cx)->blk_loop.itervar_u.svp;
3889         if ((cx->cx_type & CXp_FOR_GV))
3890             svp = &GvSV((GV*)svp);
3891         cursv = *svp;
3892         *svp = cx->blk_loop.itersave;
3893         cx->blk_loop.itersave = NULL;
3894         SvREFCNT_dec(cursv);
3895     }
3896     if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF))
3897         SvREFCNT_dec(cx->blk_loop.itervar_u.svp);
3898 }
3899 
3900 
3901 PERL_STATIC_INLINE void
Perl_cx_pushwhen(pTHX_ PERL_CONTEXT * cx)3902 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
3903 {
3904     PERL_ARGS_ASSERT_CX_PUSHWHEN;
3905 
3906     cx->blk_givwhen.leave_op = cLOGOP->op_other;
3907 }
3908 
3909 
3910 PERL_STATIC_INLINE void
Perl_cx_popwhen(pTHX_ PERL_CONTEXT * cx)3911 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
3912 {
3913     PERL_ARGS_ASSERT_CX_POPWHEN;
3914     assert(CxTYPE(cx) == CXt_WHEN);
3915 
3916     PERL_UNUSED_ARG(cx);
3917     PERL_UNUSED_CONTEXT;
3918     /* currently NOOP */
3919 }
3920 
3921 
3922 PERL_STATIC_INLINE void
Perl_cx_pushgiven(pTHX_ PERL_CONTEXT * cx,SV * orig_defsv)3923 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
3924 {
3925     PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3926 
3927     cx->blk_givwhen.leave_op = cLOGOP->op_other;
3928     cx->blk_givwhen.defsv_save = orig_defsv;
3929 }
3930 
3931 
3932 PERL_STATIC_INLINE void
Perl_cx_popgiven(pTHX_ PERL_CONTEXT * cx)3933 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
3934 {
3935     SV *sv;
3936 
3937     PERL_ARGS_ASSERT_CX_POPGIVEN;
3938     assert(CxTYPE(cx) == CXt_GIVEN);
3939 
3940     sv = GvSV(PL_defgv);
3941     GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3942     cx->blk_givwhen.defsv_save = NULL;
3943     SvREFCNT_dec(sv);
3944 }
3945 
3946 
3947 /* Make @_ empty in-place in simple cases: a cheap av_clear().
3948  * See Perl_clear_defarray() for non-simple cases */
3949 
3950 
3951 PERL_STATIC_INLINE void
Perl_clear_defarray_simple(pTHX_ AV * av)3952 Perl_clear_defarray_simple(pTHX_ AV *av)
3953 {
3954     PERL_ARGS_ASSERT_CLEAR_DEFARRAY_SIMPLE;
3955 
3956     assert(SvTYPE(av) == SVt_PVAV);
3957     assert(!SvREADONLY(av));
3958     assert(!SvMAGICAL(av));
3959     assert(SvREFCNT(av) == 1);
3960 
3961 #ifdef PERL_RC_STACK
3962     assert(AvREAL(av));
3963     /* this code assumes that destructors called here can't free av
3964      * itself, because pad[0] and/or CX pointers will keep it alive */
3965     SSize_t i = AvFILLp(av);
3966     while (i >= 0) {
3967         SV *sv = AvARRAY(av)[i];
3968         AvARRAY(av)[i--] = NULL;
3969         SvREFCNT_dec(sv);
3970     }
3971 #else
3972     assert(!AvREAL(av));
3973 #endif
3974     AvFILLp(av) = -1;
3975     Perl_av_remove_offset(aTHX_ av);
3976 }
3977 
3978 /* Switch to a different argument stack.
3979  *
3980  * Note that it doesn't update PL_curstackinfo->si_stack_nonrc_base,
3981  * so this should only be used as part of a general switching between
3982  * stackinfos.
3983  */
3984 
3985 PERL_STATIC_INLINE void
Perl_switch_argstack(pTHX_ AV * to)3986 Perl_switch_argstack(pTHX_ AV *to)
3987 {
3988     PERL_ARGS_ASSERT_SWITCH_ARGSTACK;
3989 
3990     AvFILLp(PL_curstack) = PL_stack_sp - PL_stack_base;
3991     PL_stack_base = AvARRAY(to);
3992     PL_stack_max  = PL_stack_base + AvMAX(to);
3993     PL_stack_sp   = PL_stack_base + AvFILLp(to);
3994     PL_curstack   = to;
3995 }
3996 
3997 
3998 /* Push, and switch to a new stackinfo, allocating one if none are spare,
3999  * to get a fresh set of stacks.
4000  * Update all the interpreter variables like PL_curstackinfo,
4001  * PL_stack_sp, etc.
4002  * current flag meanings:
4003  *   1 make the new arg stack AvREAL
4004  */
4005 
4006 
4007 PERL_STATIC_INLINE void
Perl_push_stackinfo(pTHX_ I32 type,UV flags)4008 Perl_push_stackinfo(pTHX_ I32 type, UV flags)
4009 {
4010     PERL_ARGS_ASSERT_PUSH_STACKINFO;
4011 
4012     PERL_SI *next = PL_curstackinfo->si_next;
4013     DEBUG_l({
4014         int i = 0; PERL_SI *p = PL_curstackinfo;
4015         while (p) { i++; p = p->si_prev; }
4016         Perl_deb(aTHX_ "push STACKINFO %d in %s at %s:%d\n",
4017                      i, SAFE_FUNCTION__, __FILE__, __LINE__);
4018     })
4019 
4020     if (!next) {
4021         next = new_stackinfo_flags(32, 2048/sizeof(PERL_CONTEXT) - 1, flags);
4022         next->si_prev = PL_curstackinfo;
4023         PL_curstackinfo->si_next = next;
4024     }
4025     next->si_type = type;
4026     next->si_cxix = -1;
4027     next->si_cxsubix = -1;
4028     PUSHSTACK_INIT_HWM(next);
4029 #ifdef PERL_RC_STACK
4030     next->si_stack_nonrc_base = 0;
4031 #endif
4032     if (flags & 1)
4033         AvREAL_on(next->si_stack);
4034     else
4035         AvREAL_off(next->si_stack);
4036     AvFILLp(next->si_stack) = 0;
4037     switch_argstack(next->si_stack);
4038     PL_curstackinfo = next;
4039     SET_MARK_OFFSET;
4040 }
4041 
4042 
4043 /* Pop, then switch to the previous stackinfo and set of stacks.
4044  * Update all the interpreter variables like PL_curstackinfo,
4045  * PL_stack_sp, etc. */
4046 
4047 PERL_STATIC_INLINE void
Perl_pop_stackinfo(pTHX)4048 Perl_pop_stackinfo(pTHX)
4049 {
4050     PERL_ARGS_ASSERT_POP_STACKINFO;
4051 
4052     PERL_SI * const prev = PL_curstackinfo->si_prev;
4053     DEBUG_l({
4054         int i = -1; PERL_SI *p = PL_curstackinfo;
4055         while (p) { i++; p = p->si_prev; }
4056         Perl_deb(aTHX_ "pop  STACKINFO %d in %s at %s:%d\n",
4057                      i, SAFE_FUNCTION__, __FILE__, __LINE__);})
4058     if (!prev) {
4059         Perl_croak_popstack();
4060     }
4061 
4062     switch_argstack(prev->si_stack);
4063     /* don't free prev here, free them all at the END{} */
4064     PL_curstackinfo = prev;
4065 }
4066 
4067 
4068 
4069 /*
4070 =for apidoc newPADxVOP
4071 
4072 Constructs, checks and returns an op containing a pad offset.  C<type> is
4073 the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV>
4074 or C<OP_PADCV>.  The returned op will have the C<op_targ> field set by
4075 the C<padix> argument.
4076 
4077 This is convenient when constructing a large optree in nested function
4078 calls, as it avoids needing to store the pad op directly to set the
4079 C<op_targ> field as a side-effect. For example
4080 
4081     o = op_append_elem(OP_LINESEQ, o,
4082         newPADxVOP(OP_PADSV, 0, padix));
4083 
4084 =cut
4085 */
4086 
4087 PERL_STATIC_INLINE OP *
Perl_newPADxVOP(pTHX_ I32 type,I32 flags,PADOFFSET padix)4088 Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix)
4089 {
4090     PERL_ARGS_ASSERT_NEWPADXVOP;
4091 
4092     assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV
4093             || type == OP_PADCV);
4094     OP *o = newOP(type, flags);
4095     o->op_targ = padix;
4096     return o;
4097 }
4098 
4099 /* ------------------ util.h ------------------------------------------- */
4100 
4101 /*
4102 =for apidoc_section $string
4103 
4104 =for apidoc foldEQ
4105 
4106 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
4107 same
4108 case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
4109 match themselves and their opposite case counterparts.  Non-cased and non-ASCII
4110 range bytes match only themselves.
4111 
4112 =cut
4113 */
4114 
4115 PERL_STATIC_INLINE I32
Perl_foldEQ(pTHX_ const char * s1,const char * s2,I32 len)4116 Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len)
4117 {
4118     PERL_UNUSED_CONTEXT;
4119 
4120     const U8 *a = (const U8 *)s1;
4121     const U8 *b = (const U8 *)s2;
4122 
4123     PERL_ARGS_ASSERT_FOLDEQ;
4124 
4125     assert(len >= 0);
4126 
4127     while (len--) {
4128         if (*a != *b && *a != PL_fold[*b])
4129             return 0;
4130         a++,b++;
4131     }
4132     return 1;
4133 }
4134 
4135 PERL_STATIC_INLINE I32
Perl_foldEQ_latin1(pTHX_ const char * s1,const char * s2,I32 len)4136 Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len)
4137 {
4138     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
4139      * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
4140      * does not check for this.  Nor does it check that the strings each have
4141      * at least 'len' characters. */
4142 
4143     PERL_UNUSED_CONTEXT;
4144 
4145     const U8 *a = (const U8 *)s1;
4146     const U8 *b = (const U8 *)s2;
4147 
4148     PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
4149 
4150     assert(len >= 0);
4151 
4152     while (len--) {
4153         if (*a != *b && *a != PL_fold_latin1[*b]) {
4154             return 0;
4155         }
4156         a++, b++;
4157     }
4158     return 1;
4159 }
4160 
4161 /*
4162 =for apidoc_section $locale
4163 =for apidoc foldEQ_locale
4164 
4165 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
4166 same case-insensitively in the current locale; false otherwise.
4167 
4168 =cut
4169 */
4170 
4171 PERL_STATIC_INLINE I32
Perl_foldEQ_locale(pTHX_ const char * s1,const char * s2,I32 len)4172 Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len)
4173 {
4174     const U8 *a = (const U8 *)s1;
4175     const U8 *b = (const U8 *)s2;
4176 
4177     PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
4178 
4179     assert(len >= 0);
4180 
4181     while (len--) {
4182         if (*a != *b && *a != PL_fold_locale[*b]) {
4183             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
4184                      "%s:%d: Our records indicate %02x is not a fold of %02x"
4185                      " or its mate %02x\n",
4186                      __FILE__, __LINE__, *a, *b, PL_fold_locale[*b]));
4187 
4188             return 0;
4189         }
4190         a++,b++;
4191     }
4192     return 1;
4193 }
4194 
4195 /*
4196 =for apidoc_section $string
4197 =for apidoc my_strnlen
4198 
4199 The C library C<strnlen> if available, or a Perl implementation of it.
4200 
4201 C<my_strnlen()> computes the length of the string, up to C<maxlen>
4202 bytes.  It will never attempt to address more than C<maxlen>
4203 bytes, making it suitable for use with strings that are not
4204 guaranteed to be NUL-terminated.
4205 
4206 =cut
4207 
4208 Description stolen from http://man.openbsd.org/strnlen.3,
4209 implementation stolen from PostgreSQL.
4210 */
4211 #ifndef HAS_STRNLEN
4212 
4213 PERL_STATIC_INLINE Size_t
Perl_my_strnlen(const char * str,Size_t maxlen)4214 Perl_my_strnlen(const char *str, Size_t maxlen)
4215 {
4216     const char *end = (char *) memchr(str, '\0', maxlen);
4217 
4218     PERL_ARGS_ASSERT_MY_STRNLEN;
4219 
4220     if (end == NULL) return maxlen;
4221     return end - str;
4222 }
4223 
4224 #endif
4225 
4226 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
4227 
4228 PERL_STATIC_INLINE void *
S_my_memrchr(const char * s,const char c,const STRLEN len)4229 S_my_memrchr(const char * s, const char c, const STRLEN len)
4230 {
4231     /* memrchr(), since many platforms lack it */
4232 
4233     const char * t = s + len - 1;
4234 
4235     PERL_ARGS_ASSERT_MY_MEMRCHR;
4236 
4237     while (t >= s) {
4238         if (*t == c) {
4239             return (void *) t;
4240         }
4241         t--;
4242     }
4243 
4244     return NULL;
4245 }
4246 
4247 #endif
4248 
4249 PERL_STATIC_INLINE char *
Perl_mortal_getenv(const char * str)4250 Perl_mortal_getenv(const char * str)
4251 {
4252     /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
4253      *
4254      * It's (mostly) thread-safe because it uses a mutex to prevent other
4255      * threads (that look at this mutex) from destroying the result before this
4256      * routine has a chance to copy the result to a place that won't be
4257      * destroyed before the caller gets a chance to handle it.  That place is a
4258      * mortal SV.  khw chose this over SAVEFREEPV because he is under the
4259      * impression that the SV will hang around longer under more circumstances
4260      *
4261      * The reason it isn't completely thread-safe is that other code could
4262      * simply not pay attention to the mutex.  All of the Perl core uses the
4263      * mutex, but it is possible for code from, say XS, to not use this mutex,
4264      * defeating the safety.
4265      *
4266      * getenv() returns, in some implementations, a pointer to a spot in the
4267      * **environ array, which could be invalidated at any time by this or
4268      * another thread changing the environment.  Other implementations copy the
4269      * **environ value to a static buffer, returning a pointer to that.  That
4270      * buffer might or might not be invalidated by a getenv() call in another
4271      * thread.  If it does get zapped, we need an exclusive lock.  Otherwise,
4272      * many getenv() calls can safely be running simultaneously, so a
4273      * many-reader (but no simultaneous writers) lock is ok.  There is a
4274      * Configure probe to see if another thread destroys the buffer, and the
4275      * mutex is defined accordingly.
4276      *
4277      * But in all cases, using the mutex prevents these problems, as long as
4278      * all code uses the same mutex.
4279      *
4280      * A complication is that this can be called during phases where the
4281      * mortalization process isn't available.  These are in interpreter
4282      * destruction or early in construction.  khw believes that at these times
4283      * there shouldn't be anything else going on, so plain getenv is safe AS
4284      * LONG AS the caller acts on the return before calling it again. */
4285 
4286     char * ret;
4287     dTHX;
4288 
4289     PERL_ARGS_ASSERT_MORTAL_GETENV;
4290 
4291     /* Can't mortalize without stacks.  khw believes that no other threads
4292      * should be running, so no need to lock things, and this may be during a
4293      * phase when locking isn't even available */
4294     if (UNLIKELY(PL_scopestack_ix == 0)) {
4295         return getenv(str);
4296     }
4297 
4298 #ifdef PERL_MEM_LOG
4299 
4300     /* A major complication arises under PERL_MEM_LOG.  When that is active,
4301      * every memory allocation may result in logging, depending on the value of
4302      * ENV{PERL_MEM_LOG} at the moment.  That means, as we create the SV for
4303      * saving ENV{foo}'s value (but before saving it), the logging code will
4304      * call us recursively to find out what ENV{PERL_MEM_LOG} is.  Without some
4305      * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
4306      * lock a boolean mutex recursively); 3) destroying the getenv() static
4307      * buffer; or 4) destroying the temporary created by this for the copy
4308      * causes a log entry to be made which could cause a new temporary to be
4309      * created, which will need to be destroyed at some point, leading to an
4310      * infinite loop.
4311      *
4312      * The solution adopted here (after some gnashing of teeth) is to detect
4313      * the recursive calls and calls from the logger, and treat them specially.
4314      * Let's say we want to do getenv("foo").  We first find
4315      * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
4316      * variable, so no temporary is required.  Then we do getenv(foo), and in
4317      * the process of creating a temporary to save it, this function will be
4318      * called recursively to do a getenv(PERL_MEM_LOG).  On the recursed call,
4319      * we detect that it is such a call and return our saved value instead of
4320      * locking and doing a new getenv().  This solves all of problems 1), 2),
4321      * and 3).  Because all the getenv()s are done while the mutex is locked,
4322      * the state cannot have changed.  To solve 4), we don't create a temporary
4323      * when this is called from the logging code.  That code disposes of the
4324      * return value while the mutex is still locked.
4325      *
4326      * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
4327      * digits and 3 particular letters are significant; the rest are ignored by
4328      * the memory logging code.  Thus the per-interpreter variable only needs
4329      * to be large enough to save the significant information, the size of
4330      * which is known at compile time.  The first byte is extra, reserved for
4331      * flags for our use.  To protect against overflowing, only the reserved
4332      * byte, as many digits as don't overflow, and the three letters are
4333      * stored.
4334      *
4335      * The reserved byte has two bits:
4336      *      0x1 if set indicates that if we get here, it is a recursive call of
4337      *          getenv()
4338      *      0x2 if set indicates that the call is from the logging code.
4339      *
4340      * If the flag indicates this is a recursive call, just return the stored
4341      * value of PL_mem_log;  An empty value gets turned into NULL. */
4342     if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
4343         if (PL_mem_log[1] == '\0') {
4344             return NULL;
4345         } else {
4346             return PL_mem_log + 1;
4347         }
4348     }
4349 
4350 #endif
4351 
4352     GETENV_LOCK;
4353 
4354 #ifdef PERL_MEM_LOG
4355 
4356     /* Here we are in a critical section.  As explained above, we do our own
4357      * getenv(PERL_MEM_LOG), saving the result safely. */
4358     ret = getenv("PERL_MEM_LOG");
4359     if (ret == NULL) {  /* No logging active */
4360 
4361         /* Return that immediately if called from the logging code */
4362         if (PL_mem_log[0] & 0x2) {
4363             GETENV_UNLOCK;
4364             return NULL;
4365         }
4366 
4367         PL_mem_log[1] = '\0';
4368     }
4369     else {
4370         char *mem_log_meat = PL_mem_log + 1;    /* first byte reserved */
4371 
4372         /* There is nothing to prevent the value of PERL_MEM_LOG from being an
4373          * extremely long string.  But we want only a few characters from it.
4374          * PL_mem_log has been made large enough to hold just the ones we need.
4375          * First the file descriptor. */
4376         if (isDIGIT(*ret)) {
4377             const char * s = ret;
4378             if (UNLIKELY(*s == '0')) {
4379 
4380                 /* Reduce multiple leading zeros to a single one.  This is to
4381                  * allow the caller to change what to do with leading zeros. */
4382                 *mem_log_meat++ = '0';
4383                 s++;
4384                 while (*s == '0') {
4385                     s++;
4386                 }
4387             }
4388 
4389             /* If the input overflows, copy just enough for the result to also
4390              * overflow, plus 1 to make sure */
4391             while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
4392                 *mem_log_meat++ = *s++;
4393             }
4394         }
4395 
4396         /* Then each of the four significant characters */
4397         if (strchr(ret, 'm')) {
4398             *mem_log_meat++ = 'm';
4399         }
4400         if (strchr(ret, 's')) {
4401             *mem_log_meat++ = 's';
4402         }
4403         if (strchr(ret, 't')) {
4404             *mem_log_meat++ = 't';
4405         }
4406         if (strchr(ret, 'c')) {
4407             *mem_log_meat++ = 'c';
4408         }
4409         *mem_log_meat = '\0';
4410 
4411         assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
4412     }
4413 
4414     /* If we are being called from the logger, it only needs the significant
4415      * portion of PERL_MEM_LOG, and doesn't need a safe copy */
4416     if (PL_mem_log[0] & 0x2) {
4417         assert(strEQ(str, "PERL_MEM_LOG"));
4418         GETENV_UNLOCK;
4419         return PL_mem_log + 1;
4420     }
4421 
4422     /* Here is a generic getenv().  This could be a getenv("PERL_MEM_LOG") that
4423      * is coming from other than the logging code, so it should be treated the
4424      * same as any other getenv(), returning the full value, not just the
4425      * significant part, and having its value saved.  Set the flag that
4426      * indicates any call to this routine will be a recursion from here */
4427     PL_mem_log[0] = 0x1;
4428 
4429 #endif
4430 
4431     /* Now get the value of the real desired variable, and save a copy */
4432     ret = getenv(str);
4433 
4434     if (ret != NULL) {
4435         ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
4436     }
4437 
4438     GETENV_UNLOCK;
4439 
4440 #ifdef PERL_MEM_LOG
4441 
4442     /* Clear the buffer */
4443     Zero(PL_mem_log, sizeof(PL_mem_log), char);
4444 
4445 #endif
4446 
4447     return ret;
4448 }
4449 
4450 PERL_STATIC_INLINE bool
Perl_sv_isbool(pTHX_ const SV * sv)4451 Perl_sv_isbool(pTHX_ const SV *sv)
4452 {
4453     PERL_UNUSED_CONTEXT;
4454     return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv);
4455 }
4456 
4457 #ifdef USE_ITHREADS
4458 
4459 PERL_STATIC_INLINE AV *
Perl_cop_file_avn(pTHX_ const COP * cop)4460 Perl_cop_file_avn(pTHX_ const COP *cop) {
4461 
4462     PERL_ARGS_ASSERT_COP_FILE_AVN;
4463 
4464     const char *file = CopFILE(cop);
4465     if (file) {
4466         GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
4467         if (gv) {
4468             return GvAVn(gv);
4469         }
4470         else
4471             return NULL;
4472      }
4473      else
4474          return NULL;
4475 }
4476 
4477 #endif
4478 
4479 PERL_STATIC_INLINE PADNAME *
Perl_padname_refcnt_inc(PADNAME * pn)4480 Perl_padname_refcnt_inc(PADNAME *pn)
4481 {
4482     PadnameREFCNT(pn)++;
4483     return pn;
4484 }
4485 
4486 PERL_STATIC_INLINE PADNAMELIST *
Perl_padnamelist_refcnt_inc(PADNAMELIST * pnl)4487 Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl)
4488 {
4489     PadnamelistREFCNT(pnl)++;
4490     return pnl;
4491 }
4492 
4493 /* copy a string to a safe spot */
4494 
4495 /*
4496 =for apidoc_section $string
4497 =for apidoc savepv
4498 
4499 Perl's version of C<strdup()>.  Returns a pointer to a newly allocated
4500 string which is a duplicate of C<pv>.  The size of the string is
4501 determined by C<strlen()>, which means it may not contain embedded C<NUL>
4502 characters and must have a trailing C<NUL>.  To prevent memory leaks, the
4503 memory allocated for the new string needs to be freed when no longer needed.
4504 This can be done with the C<L</Safefree>> function, or
4505 L<C<SAVEFREEPV>|perlguts/SAVEFREEPV(p)>.
4506 
4507 On some platforms, Windows for example, all allocated memory owned by a thread
4508 is deallocated when that thread ends.  So if you need that not to happen, you
4509 need to use the shared memory functions, such as C<L</savesharedpv>>.
4510 
4511 =cut
4512 */
4513 
4514 PERL_STATIC_INLINE char *
Perl_savepv(pTHX_ const char * pv)4515 Perl_savepv(pTHX_ const char *pv)
4516 {
4517     PERL_UNUSED_CONTEXT;
4518     if (!pv)
4519         return NULL;
4520     else {
4521         char *newaddr;
4522         const STRLEN pvlen = strlen(pv)+1;
4523         Newx(newaddr, pvlen, char);
4524         return (char*)memcpy(newaddr, pv, pvlen);
4525     }
4526 }
4527 
4528 /* same thing but with a known length */
4529 
4530 /*
4531 =for apidoc savepvn
4532 
4533 Perl's version of what C<strndup()> would be if it existed.  Returns a
4534 pointer to a newly allocated string which is a duplicate of the first
4535 C<len> bytes from C<pv>, plus a trailing
4536 C<NUL> byte.  The memory allocated for
4537 the new string can be freed with the C<Safefree()> function.
4538 
4539 On some platforms, Windows for example, all allocated memory owned by a thread
4540 is deallocated when that thread ends.  So if you need that not to happen, you
4541 need to use the shared memory functions, such as C<L</savesharedpvn>>.
4542 
4543 =cut
4544 */
4545 
4546 PERL_STATIC_INLINE char *
Perl_savepvn(pTHX_ const char * pv,Size_t len)4547 Perl_savepvn(pTHX_ const char *pv, Size_t len)
4548 {
4549     char *newaddr;
4550     PERL_UNUSED_CONTEXT;
4551 
4552     Newx(newaddr,len+1,char);
4553     /* Give a meaning to NULL pointer mainly for the use in sv_magic() */
4554     if (pv) {
4555         /* might not be null terminated */
4556         newaddr[len] = '\0';
4557         return (char *) CopyD(pv,newaddr,len,char);
4558     }
4559     else {
4560         return (char *) ZeroD(newaddr,len+1,char);
4561     }
4562 }
4563 
4564 /*
4565 =for apidoc savesvpv
4566 
4567 A version of C<savepv()>/C<savepvn()> which gets the string to duplicate from
4568 the passed in SV using C<SvPV()>
4569 
4570 On some platforms, Windows for example, all allocated memory owned by a thread
4571 is deallocated when that thread ends.  So if you need that not to happen, you
4572 need to use the shared memory functions, such as C<L</savesharedsvpv>>.
4573 
4574 =cut
4575 */
4576 
4577 PERL_STATIC_INLINE char *
Perl_savesvpv(pTHX_ SV * sv)4578 Perl_savesvpv(pTHX_ SV *sv)
4579 {
4580     STRLEN len;
4581     const char * const pv = SvPV_const(sv, len);
4582     char *newaddr;
4583 
4584     PERL_ARGS_ASSERT_SAVESVPV;
4585 
4586     ++len;
4587     Newx(newaddr,len,char);
4588     return (char *) CopyD(pv,newaddr,len,char);
4589 }
4590 
4591 /*
4592 =for apidoc savesharedsvpv
4593 
4594 A version of C<savesharedpv()> which allocates the duplicate string in
4595 memory which is shared between threads.
4596 
4597 =cut
4598 */
4599 
4600 PERL_STATIC_INLINE char *
Perl_savesharedsvpv(pTHX_ SV * sv)4601 Perl_savesharedsvpv(pTHX_ SV *sv)
4602 {
4603     STRLEN len;
4604     const char * const pv = SvPV_const(sv, len);
4605 
4606     PERL_ARGS_ASSERT_SAVESHAREDSVPV;
4607 
4608     return savesharedpvn(pv, len);
4609 }
4610 
4611 #ifndef PERL_GET_CONTEXT_DEFINED
4612 
4613 /*
4614 =for apidoc_section $embedding
4615 =for apidoc get_context
4616 
4617 Implements L<perlapi/C<PERL_GET_CONTEXT>>, which you should use instead.
4618 
4619 =cut
4620 */
4621 
4622 PERL_STATIC_INLINE void *
Perl_get_context(void)4623 Perl_get_context(void)
4624 {
4625 #  if defined(USE_ITHREADS)
4626 #    ifdef OLD_PTHREADS_API
4627     pthread_addr_t t;
4628     int error = pthread_getspecific(PL_thr_key, &t);
4629     if (error)
4630         Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error);
4631     return (void*)t;
4632 #    elif defined(I_MACH_CTHREADS)
4633     return (void*)cthread_data(cthread_self());
4634 #    else
4635     return (void*)PTHREAD_GETSPECIFIC(PL_thr_key);
4636 #    endif
4637 #  else
4638     return (void*)NULL;
4639 #  endif
4640 }
4641 
4642 #endif
4643 
4644 PERL_STATIC_INLINE MGVTBL*
Perl_get_vtbl(pTHX_ int vtbl_id)4645 Perl_get_vtbl(pTHX_ int vtbl_id)
4646 {
4647     PERL_UNUSED_CONTEXT;
4648 
4649     return (vtbl_id < 0 || vtbl_id >= magic_vtable_max)
4650         ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id;
4651 }
4652 
4653 /*
4654 =for apidoc my_strlcat
4655 
4656 The C library C<strlcat> if available, or a Perl implementation of it.
4657 This operates on C C<NUL>-terminated strings.
4658 
4659 C<my_strlcat()> appends string C<src> to the end of C<dst>.  It will append at
4660 most S<C<size - strlen(dst) - 1>> bytes.  It will then C<NUL>-terminate,
4661 unless C<size> is 0 or the original C<dst> string was longer than C<size> (in
4662 practice this should not happen as it means that either C<size> is incorrect or
4663 that C<dst> is not a proper C<NUL>-terminated string).
4664 
4665 Note that C<size> is the full size of the destination buffer and
4666 the result is guaranteed to be C<NUL>-terminated if there is room.  Note that
4667 room for the C<NUL> should be included in C<size>.
4668 
4669 The return value is the total length that C<dst> would have if C<size> is
4670 sufficiently large.  Thus it is the initial length of C<dst> plus the length of
4671 C<src>.  If C<size> is smaller than the return, the excess was not appended.
4672 
4673 =cut
4674 
4675 Description stolen from http://man.openbsd.org/strlcat.3
4676 */
4677 #ifndef HAS_STRLCAT
4678 PERL_STATIC_INLINE Size_t
Perl_my_strlcat(char * dst,const char * src,Size_t size)4679 Perl_my_strlcat(char *dst, const char *src, Size_t size)
4680 {
4681     Size_t used, length, copy;
4682 
4683     used = strlen(dst);
4684     length = strlen(src);
4685     if (size > 0 && used < size - 1) {
4686         copy = (length >= size - used) ? size - used - 1 : length;
4687         memcpy(dst + used, src, copy);
4688         dst[used + copy] = '\0';
4689     }
4690     return used + length;
4691 }
4692 #endif
4693 
4694 
4695 /*
4696 =for apidoc my_strlcpy
4697 
4698 The C library C<strlcpy> if available, or a Perl implementation of it.
4699 This operates on C C<NUL>-terminated strings.
4700 
4701 C<my_strlcpy()> copies up to S<C<size - 1>> bytes from the string C<src>
4702 to C<dst>, C<NUL>-terminating the result if C<size> is not 0.
4703 
4704 The return value is the total length C<src> would be if the copy completely
4705 succeeded.  If it is larger than C<size>, the excess was not copied.
4706 
4707 =cut
4708 
4709 Description stolen from http://man.openbsd.org/strlcpy.3
4710 */
4711 #ifndef HAS_STRLCPY
4712 PERL_STATIC_INLINE Size_t
Perl_my_strlcpy(char * dst,const char * src,Size_t size)4713 Perl_my_strlcpy(char *dst, const char *src, Size_t size)
4714 {
4715     Size_t length, copy;
4716 
4717     length = strlen(src);
4718     if (size > 0) {
4719         copy = (length >= size) ? size - 1 : length;
4720         memcpy(dst, src, copy);
4721         dst[copy] = '\0';
4722     }
4723     return length;
4724 }
4725 #endif
4726 
4727 /*
4728  * ex: set ts=8 sts=4 sw=4 et:
4729  */
4730