1 /*    inline.h
2  *
3  *    Copyright (C) 2012 by Larry Wall and others
4  *
5  *    You may distribute under the terms of either the GNU General Public
6  *    License or the Artistic License, as specified in the README file.
7  *
8  *    This file contains tables and code adapted from
9  *    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10  *    copyright notice:
11 
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13 
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
20 
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
23 
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 SOFTWARE.
31 
32  *
33  * This file is a home for static inline functions that cannot go in other
34  * header files, because they depend on proto.h (included after most other
35  * headers) or struct definitions.
36  *
37  * Each section names the header file that the functions "belong" to.
38  */
39 
40 /* ------------------------------- av.h ------------------------------- */
41 
42 PERL_STATIC_INLINE SSize_t
Perl_av_top_index(pTHX_ AV * av)43 Perl_av_top_index(pTHX_ AV *av)
44 {
45     PERL_ARGS_ASSERT_AV_TOP_INDEX;
46     assert(SvTYPE(av) == SVt_PVAV);
47 
48     return AvFILL(av);
49 }
50 
51 /* ------------------------------- cv.h ------------------------------- */
52 
53 PERL_STATIC_INLINE GV *
Perl_CvGV(pTHX_ CV * sv)54 Perl_CvGV(pTHX_ CV *sv)
55 {
56     PERL_ARGS_ASSERT_CVGV;
57 
58     return CvNAMED(sv)
59 	? Perl_cvgv_from_hek(aTHX_ sv)
60 	: ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
61 }
62 
63 PERL_STATIC_INLINE I32 *
Perl_CvDEPTH(const CV * const sv)64 Perl_CvDEPTH(const CV * const sv)
65 {
66     PERL_ARGS_ASSERT_CVDEPTH;
67     assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
68 
69     return &((XPVCV*)SvANY(sv))->xcv_depth;
70 }
71 
72 /*
73  CvPROTO returns the prototype as stored, which is not necessarily what
74  the interpreter should be using. Specifically, the interpreter assumes
75  that spaces have been stripped, which has been the case if the prototype
76  was added by toke.c, but is generally not the case if it was added elsewhere.
77  Since we can't enforce the spacelessness at assignment time, this routine
78  provides a temporary copy at parse time with spaces removed.
79  I<orig> is the start of the original buffer, I<len> is the length of the
80  prototype and will be updated when this returns.
81  */
82 
83 #ifdef PERL_CORE
84 PERL_STATIC_INLINE char *
S_strip_spaces(pTHX_ const char * orig,STRLEN * const len)85 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
86 {
87     SV * tmpsv;
88     char * tmps;
89     tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
90     tmps = SvPVX(tmpsv);
91     while ((*len)--) {
92 	if (!isSPACE(*orig))
93 	    *tmps++ = *orig;
94 	orig++;
95     }
96     *tmps = '\0';
97     *len = tmps - SvPVX(tmpsv);
98 		return SvPVX(tmpsv);
99 }
100 #endif
101 
102 /* ------------------------------- mg.h ------------------------------- */
103 
104 #if defined(PERL_CORE) || defined(PERL_EXT)
105 /* assumes get-magic and stringification have already occurred */
106 PERL_STATIC_INLINE STRLEN
S_MgBYTEPOS(pTHX_ MAGIC * mg,SV * sv,const char * s,STRLEN len)107 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
108 {
109     assert(mg->mg_type == PERL_MAGIC_regex_global);
110     assert(mg->mg_len != -1);
111     if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
112 	return (STRLEN)mg->mg_len;
113     else {
114 	const STRLEN pos = (STRLEN)mg->mg_len;
115 	/* Without this check, we may read past the end of the buffer: */
116 	if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
117 	return sv_or_pv_pos_u2b(sv, s, pos, NULL);
118     }
119 }
120 #endif
121 
122 /* ------------------------------- pad.h ------------------------------ */
123 
124 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
125 PERL_STATIC_INLINE bool
S_PadnameIN_SCOPE(const PADNAME * const pn,const U32 seq)126 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
127 {
128     PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
129 
130     /* is seq within the range _LOW to _HIGH ?
131      * This is complicated by the fact that PL_cop_seqmax
132      * may have wrapped around at some point */
133     if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
134 	return FALSE; /* not yet introduced */
135 
136     if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
137     /* in compiling scope */
138 	if (
139 	    (seq >  COP_SEQ_RANGE_LOW(pn))
140 	    ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
141 	    : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
142 	)
143 	    return TRUE;
144     }
145     else if (
146 	(COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
147 	?
148 	    (  seq >  COP_SEQ_RANGE_LOW(pn)
149 	    || seq <= COP_SEQ_RANGE_HIGH(pn))
150 
151 	:    (  seq >  COP_SEQ_RANGE_LOW(pn)
152 	     && seq <= COP_SEQ_RANGE_HIGH(pn))
153     )
154 	return TRUE;
155     return FALSE;
156 }
157 #endif
158 
159 /* ------------------------------- pp.h ------------------------------- */
160 
161 PERL_STATIC_INLINE I32
Perl_TOPMARK(pTHX)162 Perl_TOPMARK(pTHX)
163 {
164     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
165 				 "MARK top  %p %" IVdf "\n",
166 				  PL_markstack_ptr,
167 				  (IV)*PL_markstack_ptr)));
168     return *PL_markstack_ptr;
169 }
170 
171 PERL_STATIC_INLINE I32
Perl_POPMARK(pTHX)172 Perl_POPMARK(pTHX)
173 {
174     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
175 				 "MARK pop  %p %" IVdf "\n",
176 				  (PL_markstack_ptr-1),
177 				  (IV)*(PL_markstack_ptr-1))));
178     assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
179     return *PL_markstack_ptr--;
180 }
181 
182 /* ----------------------------- regexp.h ----------------------------- */
183 
184 PERL_STATIC_INLINE struct regexp *
Perl_ReANY(const REGEXP * const re)185 Perl_ReANY(const REGEXP * const re)
186 {
187     XPV* const p = (XPV*)SvANY(re);
188 
189     PERL_ARGS_ASSERT_REANY;
190     assert(isREGEXP(re));
191 
192     return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
193                                    : (struct regexp *)p;
194 }
195 
196 /* ------------------------------- sv.h ------------------------------- */
197 
198 PERL_STATIC_INLINE bool
Perl_SvTRUE(pTHX_ SV * sv)199 Perl_SvTRUE(pTHX_ SV *sv) {
200     if (!LIKELY(sv))
201         return FALSE;
202     SvGETMAGIC(sv);
203     return SvTRUE_nomg_NN(sv);
204 }
205 
206 PERL_STATIC_INLINE SV *
Perl_SvREFCNT_inc(SV * sv)207 Perl_SvREFCNT_inc(SV *sv)
208 {
209     if (LIKELY(sv != NULL))
210 	SvREFCNT(sv)++;
211     return sv;
212 }
213 PERL_STATIC_INLINE SV *
Perl_SvREFCNT_inc_NN(SV * sv)214 Perl_SvREFCNT_inc_NN(SV *sv)
215 {
216     PERL_ARGS_ASSERT_SVREFCNT_INC_NN;
217 
218     SvREFCNT(sv)++;
219     return sv;
220 }
221 PERL_STATIC_INLINE void
Perl_SvREFCNT_inc_void(SV * sv)222 Perl_SvREFCNT_inc_void(SV *sv)
223 {
224     if (LIKELY(sv != NULL))
225 	SvREFCNT(sv)++;
226 }
227 PERL_STATIC_INLINE void
Perl_SvREFCNT_dec(pTHX_ SV * sv)228 Perl_SvREFCNT_dec(pTHX_ SV *sv)
229 {
230     if (LIKELY(sv != NULL)) {
231 	U32 rc = SvREFCNT(sv);
232 	if (LIKELY(rc > 1))
233 	    SvREFCNT(sv) = rc - 1;
234 	else
235 	    Perl_sv_free2(aTHX_ sv, rc);
236     }
237 }
238 
239 PERL_STATIC_INLINE void
Perl_SvREFCNT_dec_NN(pTHX_ SV * sv)240 Perl_SvREFCNT_dec_NN(pTHX_ SV *sv)
241 {
242     U32 rc = SvREFCNT(sv);
243 
244     PERL_ARGS_ASSERT_SVREFCNT_DEC_NN;
245 
246     if (LIKELY(rc > 1))
247 	SvREFCNT(sv) = rc - 1;
248     else
249 	Perl_sv_free2(aTHX_ sv, rc);
250 }
251 
252 PERL_STATIC_INLINE void
Perl_SvAMAGIC_on(SV * sv)253 Perl_SvAMAGIC_on(SV *sv)
254 {
255     PERL_ARGS_ASSERT_SVAMAGIC_ON;
256     assert(SvROK(sv));
257 
258     if (SvOBJECT(SvRV(sv))) HvAMAGIC_on(SvSTASH(SvRV(sv)));
259 }
260 PERL_STATIC_INLINE void
Perl_SvAMAGIC_off(SV * sv)261 Perl_SvAMAGIC_off(SV *sv)
262 {
263     PERL_ARGS_ASSERT_SVAMAGIC_OFF;
264 
265     if (SvROK(sv) && SvOBJECT(SvRV(sv)))
266 	HvAMAGIC_off(SvSTASH(SvRV(sv)));
267 }
268 
269 PERL_STATIC_INLINE U32
Perl_SvPADSTALE_on(SV * sv)270 Perl_SvPADSTALE_on(SV *sv)
271 {
272     assert(!(SvFLAGS(sv) & SVs_PADTMP));
273     return SvFLAGS(sv) |= SVs_PADSTALE;
274 }
275 PERL_STATIC_INLINE U32
Perl_SvPADSTALE_off(SV * sv)276 Perl_SvPADSTALE_off(SV *sv)
277 {
278     assert(!(SvFLAGS(sv) & SVs_PADTMP));
279     return SvFLAGS(sv) &= ~SVs_PADSTALE;
280 }
281 #if defined(PERL_CORE) || defined (PERL_EXT)
282 PERL_STATIC_INLINE STRLEN
S_sv_or_pv_pos_u2b(pTHX_ SV * sv,const char * pv,STRLEN pos,STRLEN * lenp)283 S_sv_or_pv_pos_u2b(pTHX_ SV *sv, const char *pv, STRLEN pos, STRLEN *lenp)
284 {
285     PERL_ARGS_ASSERT_SV_OR_PV_POS_U2B;
286     if (SvGAMAGIC(sv)) {
287 	U8 *hopped = utf8_hop((U8 *)pv, pos);
288 	if (lenp) *lenp = (STRLEN)(utf8_hop(hopped, *lenp) - hopped);
289 	return (STRLEN)(hopped - (U8 *)pv);
290     }
291     return sv_pos_u2b_flags(sv,pos,lenp,SV_CONST_RETURN);
292 }
293 #endif
294 
295 /* ------------------------------- utf8.h ------------------------------- */
296 
297 /*
298 =head1 Unicode Support
299 */
300 
301 PERL_STATIC_INLINE void
Perl_append_utf8_from_native_byte(const U8 byte,U8 ** dest)302 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
303 {
304     /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
305      * encoded string at '*dest', updating '*dest' to include it */
306 
307     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
308 
309     if (NATIVE_BYTE_IS_INVARIANT(byte))
310         *((*dest)++) = byte;
311     else {
312         *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
313         *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
314     }
315 }
316 
317 /*
318 =for apidoc valid_utf8_to_uvchr
319 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
320 known that the next character in the input UTF-8 string C<s> is well-formed
321 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
322 points, and non-Unicode code points are allowed.
323 
324 =cut
325 
326  */
327 
328 PERL_STATIC_INLINE UV
Perl_valid_utf8_to_uvchr(const U8 * s,STRLEN * retlen)329 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
330 {
331     const UV expectlen = UTF8SKIP(s);
332     const U8* send = s + expectlen;
333     UV uv = *s;
334 
335     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
336 
337     if (retlen) {
338         *retlen = expectlen;
339     }
340 
341     /* An invariant is trivially returned */
342     if (expectlen == 1) {
343 	return uv;
344     }
345 
346     /* Remove the leading bits that indicate the number of bytes, leaving just
347      * the bits that are part of the value */
348     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
349 
350     /* Now, loop through the remaining bytes, accumulating each into the
351      * working total as we go.  (I khw tried unrolling the loop for up to 4
352      * bytes, but there was no performance improvement) */
353     for (++s; s < send; s++) {
354         uv = UTF8_ACCUMULATE(uv, *s);
355     }
356 
357     return UNI_TO_NATIVE(uv);
358 
359 }
360 
361 /*
362 =for apidoc is_utf8_invariant_string
363 
364 Returns TRUE if the first C<len> bytes of the string C<s> are the same
365 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
366 EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
367 are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
368 the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
369 characters are invariant, but so also are the C1 controls.
370 
371 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
372 use this option, that C<s> can't have embedded C<NUL> characters and has to
373 have a terminating C<NUL> byte).
374 
375 See also
376 C<L</is_utf8_string>>,
377 C<L</is_utf8_string_flags>>,
378 C<L</is_utf8_string_loc>>,
379 C<L</is_utf8_string_loc_flags>>,
380 C<L</is_utf8_string_loclen>>,
381 C<L</is_utf8_string_loclen_flags>>,
382 C<L</is_utf8_fixed_width_buf_flags>>,
383 C<L</is_utf8_fixed_width_buf_loc_flags>>,
384 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
385 C<L</is_strict_utf8_string>>,
386 C<L</is_strict_utf8_string_loc>>,
387 C<L</is_strict_utf8_string_loclen>>,
388 C<L</is_c9strict_utf8_string>>,
389 C<L</is_c9strict_utf8_string_loc>>,
390 and
391 C<L</is_c9strict_utf8_string_loclen>>.
392 
393 =cut
394 
395 */
396 
397 #define is_utf8_invariant_string(s, len)                                    \
398                                 is_utf8_invariant_string_loc(s, len, NULL)
399 
400 /*
401 =for apidoc is_utf8_invariant_string_loc
402 
403 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
404 the first UTF-8 variant character in the C<ep> pointer; if all characters are
405 UTF-8 invariant, this function does not change the contents of C<*ep>.
406 
407 =cut
408 
409 */
410 
411 PERL_STATIC_INLINE bool
Perl_is_utf8_invariant_string_loc(const U8 * const s,STRLEN len,const U8 ** ep)412 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
413 {
414     const U8* send;
415     const U8* x = s;
416 
417     PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
418 
419     if (len == 0) {
420         len = strlen((const char *)s);
421     }
422 
423     send = s + len;
424 
425 /* This looks like 0x010101... */
426 #  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)
427 
428 /* This looks like 0x808080... */
429 #  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
430 #  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
431 #  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
432 
433 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
434  * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
435  * optimized out completely on a 32-bit system, and its mask gets optimized out
436  * on a 64-bit system */
437 #  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
438                                       |   (  PTR2nat(x) >> 1)                 \
439                                       | ( ( (PTR2nat(x)                       \
440                                            & PERL_WORD_BOUNDARY_MASK) >> 2))))
441 
442 #ifndef EBCDIC
443 
444     /* Do the word-at-a-time iff there is at least one usable full word.  That
445      * means that after advancing to a word boundary, there still is at least a
446      * full word left.  The number of bytes needed to advance is 'wordsize -
447      * offset' unless offset is 0. */
448     if ((STRLEN) (send - x) >= PERL_WORDSIZE
449 
450                             /* This term is wordsize if subword; 0 if not */
451                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
452 
453                             /* 'offset' */
454                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
455     {
456 
457         /* Process per-byte until reach word boundary.  XXX This loop could be
458          * eliminated if we knew that this platform had fast unaligned reads */
459         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
460             if (! UTF8_IS_INVARIANT(*x)) {
461                 if (ep) {
462                     *ep = x;
463                 }
464 
465                 return FALSE;
466             }
467             x++;
468         }
469 
470         /* Here, we know we have at least one full word to process.  Process
471          * per-word as long as we have at least a full word left */
472         do {
473             if ((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
474 
475                 /* Found a variant.  Just return if caller doesn't want its
476                  * exact position */
477                 if (! ep) {
478                     return FALSE;
479                 }
480 
481 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
482      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
483 
484                 *ep = x + variant_byte_number(* (PERL_UINTMAX_T *) x);
485                 assert(*ep >= s && *ep < send);
486 
487                 return FALSE;
488 
489 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
490            checks. */
491 
492                 break;
493 #  endif
494             }
495 
496             x += PERL_WORDSIZE;
497 
498         } while (x + PERL_WORDSIZE <= send);
499     }
500 
501 #endif      /* End of ! EBCDIC */
502 
503     /* Process per-byte */
504     while (x < send) {
505 	if (! UTF8_IS_INVARIANT(*x)) {
506             if (ep) {
507                 *ep = x;
508             }
509 
510             return FALSE;
511         }
512 
513         x++;
514     }
515 
516     return TRUE;
517 }
518 
519 #ifndef EBCDIC
520 
521 PERL_STATIC_INLINE unsigned int
Perl_variant_byte_number(PERL_UINTMAX_T word)522 Perl_variant_byte_number(PERL_UINTMAX_T word)
523 {
524 
525     /* This returns the position in a word (0..7) of the first variant byte in
526      * it.  This is a helper function.  Note that there are no branches */
527 
528     assert(word);
529 
530     /* Get just the msb bits of each byte */
531     word &= PERL_VARIANTS_WORD_MASK;
532 
533 #  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
534 
535     /* Bytes are stored like
536      *  Byte8 ... Byte2 Byte1
537      *  63..56...15...8 7...0
538      *
539      *  Isolate the lsb;
540      * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
541      *
542      * The word will look like this, with a rightmost set bit in position 's':
543      * ('x's are don't cares)
544      *      s
545      *  x..x100..0
546      *  x..xx10..0      Right shift (rightmost 0 is shifted off)
547      *  x..xx01..1      Subtract 1, turns all the trailing zeros into 1's and
548      *                  the 1 just to their left into a 0; the remainder is
549      *                  untouched
550      *  0..0011..1      The xor with the original, x..xx10..0, clears that
551      *                  remainder, sets the bottom to all 1
552      *  0..0100..0      Add 1 to clear the word except for the bit in 's'
553      *
554      * Another method is to do 'word &= -word'; but it generates a compiler
555      * message on some platforms about taking the negative of an unsigned */
556 
557     word >>= 1;
558     word = 1 + (word ^ (word - 1));
559 
560 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
561 
562     /* Bytes are stored like
563      *  Byte1 Byte2  ... Byte8
564      * 63..56 55..47 ... 7...0
565      *
566      * Isolate the msb; http://codeforces.com/blog/entry/10330
567      *
568      * Only the most significant set bit matters.  Or'ing word with its right
569      * shift of 1 makes that bit and the next one to its right both 1.  Then
570      * right shifting by 2 makes for 4 1-bits in a row. ...  We end with the
571      * msb and all to the right being 1. */
572     word |= word >>  1;
573     word |= word >>  2;
574     word |= word >>  4;
575     word |= word >>  8;
576     word |= word >> 16;
577     word |= word >> 32;  /* This should get optimized out on 32-bit systems. */
578 
579     /* Then subtracting the right shift by 1 clears all but the left-most of
580      * the 1 bits, which is our desired result */
581     word -= (word >> 1);
582 
583 #  else
584 #    error Unexpected byte order
585 #  endif
586 
587     /* Here 'word' has a single bit set: the  msb of the first byte in which it
588      * is set.  Calculate that position in the word.  We can use this
589      * specialized solution: https://stackoverflow.com/a/32339674/1626653,
590      * assumes an 8-bit byte.  (On a 32-bit machine, the larger numbers should
591      * just get shifted off at compile time) */
592     word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
593                         | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
594                         |           (39 <<  24) |           (47 <<  16)
595                         |           (55 <<   8) |           (63 <<   0));
596     word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
597 
598     /* Here, word contains the position 7..63 of that bit.  Convert to 0..7 */
599     word = ((word + 1) >> 3) - 1;
600 
601 #  if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
602 
603     /* And invert the result */
604     word = CHARBITS - word - 1;
605 
606 #  endif
607 
608     return (unsigned int) word;
609 }
610 
611 #endif
612 #if defined(PERL_CORE) || defined(PERL_EXT)
613 
614 /*
615 =for apidoc variant_under_utf8_count
616 
617 This function looks at the sequence of bytes between C<s> and C<e>, which are
618 assumed to be encoded in ASCII/Latin1, and returns how many of them would
619 change should the string be translated into UTF-8.  Due to the nature of UTF-8,
620 each of these would occupy two bytes instead of the single one in the input
621 string.  Thus, this function returns the precise number of bytes the string
622 would expand by when translated to UTF-8.
623 
624 Unlike most of the other functions that have C<utf8> in their name, the input
625 to this function is NOT a UTF-8-encoded string.  The function name is slightly
626 I<odd> to emphasize this.
627 
628 This function is internal to Perl because khw thinks that any XS code that
629 would want this is probably operating too close to the internals.  Presenting a
630 valid use case could change that.
631 
632 See also
633 C<L<perlapi/is_utf8_invariant_string>>
634 and
635 C<L<perlapi/is_utf8_invariant_string_loc>>,
636 
637 =cut
638 
639 */
640 
641 PERL_STATIC_INLINE Size_t
S_variant_under_utf8_count(const U8 * const s,const U8 * const e)642 S_variant_under_utf8_count(const U8* const s, const U8* const e)
643 {
644     const U8* x = s;
645     Size_t count = 0;
646 
647     PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
648 
649 #  ifndef EBCDIC
650 
651     /* Test if the string is long enough to use word-at-a-time.  (Logic is the
652      * same as for is_utf8_invariant_string()) */
653     if ((STRLEN) (e - x) >= PERL_WORDSIZE
654                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
655                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
656     {
657 
658         /* Process per-byte until reach word boundary.  XXX This loop could be
659          * eliminated if we knew that this platform had fast unaligned reads */
660         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
661             count += ! UTF8_IS_INVARIANT(*x++);
662         }
663 
664         /* Process per-word as long as we have at least a full word left */
665         do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
666                    explanation of how this works */
667             PERL_UINTMAX_T increment
668                 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
669                       * PERL_COUNT_MULTIPLIER)
670                     >> ((PERL_WORDSIZE - 1) * CHARBITS);
671             count += (Size_t) increment;
672             x += PERL_WORDSIZE;
673         } while (x + PERL_WORDSIZE <= e);
674     }
675 
676 #  endif
677 
678     /* Process per-byte */
679     while (x < e) {
680 	if (! UTF8_IS_INVARIANT(*x)) {
681             count++;
682         }
683 
684         x++;
685     }
686 
687     return count;
688 }
689 
690 #endif
691 
692 #ifndef PERL_IN_REGEXEC_C   /* Keep  these around for that file */
693 #  undef PERL_WORDSIZE
694 #  undef PERL_COUNT_MULTIPLIER
695 #  undef PERL_WORD_BOUNDARY_MASK
696 #  undef PERL_VARIANTS_WORD_MASK
697 #endif
698 
699 /*
700 =for apidoc is_utf8_string
701 
702 Returns TRUE if the first C<len> bytes of string C<s> form a valid
703 Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
704 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
705 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
706 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
707 
708 This function considers Perl's extended UTF-8 to be valid.  That means that
709 code points above Unicode, surrogates, and non-character code points are
710 considered valid by this function.  Use C<L</is_strict_utf8_string>>,
711 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
712 code points are considered valid.
713 
714 See also
715 C<L</is_utf8_invariant_string>>,
716 C<L</is_utf8_invariant_string_loc>>,
717 C<L</is_utf8_string_loc>>,
718 C<L</is_utf8_string_loclen>>,
719 C<L</is_utf8_fixed_width_buf_flags>>,
720 C<L</is_utf8_fixed_width_buf_loc_flags>>,
721 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
722 
723 =cut
724 */
725 
726 #define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
727 
728 #if defined(PERL_CORE) || defined (PERL_EXT)
729 
730 /*
731 =for apidoc is_utf8_non_invariant_string
732 
733 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
734 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
735 UTF-8; otherwise returns FALSE.
736 
737 A TRUE return means that at least one code point represented by the sequence
738 either is a wide character not representable as a single byte, or the
739 representation differs depending on whether the sequence is encoded in UTF-8 or
740 not.
741 
742 See also
743 C<L<perlapi/is_utf8_invariant_string>>,
744 C<L<perlapi/is_utf8_string>>
745 
746 =cut
747 
748 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
749 It generally needn't be if its string is entirely UTF-8 invariant, and it
750 shouldn't be if it otherwise contains invalid UTF-8.
751 
752 It is an internal function because khw thinks that XS code shouldn't be working
753 at this low a level.  A valid use case could change that.
754 
755 */
756 
757 PERL_STATIC_INLINE bool
Perl_is_utf8_non_invariant_string(const U8 * const s,STRLEN len)758 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
759 {
760     const U8 * first_variant;
761 
762     PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
763 
764     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
765         return FALSE;
766     }
767 
768     return is_utf8_string(first_variant, len - (first_variant - s));
769 }
770 
771 #endif
772 
773 /*
774 =for apidoc is_strict_utf8_string
775 
776 Returns TRUE if the first C<len> bytes of string C<s> form a valid
777 UTF-8-encoded string that is fully interchangeable by any application using
778 Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
779 calculated using C<strlen(s)> (which means if you use this option, that C<s>
780 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
781 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
782 
783 This function returns FALSE for strings containing any
784 code points above the Unicode max of 0x10FFFF, surrogate code points, or
785 non-character code points.
786 
787 See also
788 C<L</is_utf8_invariant_string>>,
789 C<L</is_utf8_invariant_string_loc>>,
790 C<L</is_utf8_string>>,
791 C<L</is_utf8_string_flags>>,
792 C<L</is_utf8_string_loc>>,
793 C<L</is_utf8_string_loc_flags>>,
794 C<L</is_utf8_string_loclen>>,
795 C<L</is_utf8_string_loclen_flags>>,
796 C<L</is_utf8_fixed_width_buf_flags>>,
797 C<L</is_utf8_fixed_width_buf_loc_flags>>,
798 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
799 C<L</is_strict_utf8_string_loc>>,
800 C<L</is_strict_utf8_string_loclen>>,
801 C<L</is_c9strict_utf8_string>>,
802 C<L</is_c9strict_utf8_string_loc>>,
803 and
804 C<L</is_c9strict_utf8_string_loclen>>.
805 
806 =cut
807 */
808 
809 #define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
810 
811 /*
812 =for apidoc is_c9strict_utf8_string
813 
814 Returns TRUE if the first C<len> bytes of string C<s> form a valid
815 UTF-8-encoded string that conforms to
816 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
817 otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
818 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
819 C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
820 characters being ASCII constitute 'a valid UTF-8 string'.
821 
822 This function returns FALSE for strings containing any code points above the
823 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
824 code points per
825 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
826 
827 See also
828 C<L</is_utf8_invariant_string>>,
829 C<L</is_utf8_invariant_string_loc>>,
830 C<L</is_utf8_string>>,
831 C<L</is_utf8_string_flags>>,
832 C<L</is_utf8_string_loc>>,
833 C<L</is_utf8_string_loc_flags>>,
834 C<L</is_utf8_string_loclen>>,
835 C<L</is_utf8_string_loclen_flags>>,
836 C<L</is_utf8_fixed_width_buf_flags>>,
837 C<L</is_utf8_fixed_width_buf_loc_flags>>,
838 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
839 C<L</is_strict_utf8_string>>,
840 C<L</is_strict_utf8_string_loc>>,
841 C<L</is_strict_utf8_string_loclen>>,
842 C<L</is_c9strict_utf8_string_loc>>,
843 and
844 C<L</is_c9strict_utf8_string_loclen>>.
845 
846 =cut
847 */
848 
849 #define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
850 
851 /*
852 =for apidoc is_utf8_string_flags
853 
854 Returns TRUE if the first C<len> bytes of string C<s> form a valid
855 UTF-8 string, subject to the restrictions imposed by C<flags>;
856 returns FALSE otherwise.  If C<len> is 0, it will be calculated
857 using C<strlen(s)> (which means if you use this option, that C<s> can't have
858 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
859 that all characters being ASCII constitute 'a valid UTF-8 string'.
860 
861 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
862 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
863 as C<L</is_strict_utf8_string>>; and if C<flags> is
864 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
865 C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
866 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
867 C<L</utf8n_to_uvchr>>, with the same meanings.
868 
869 See also
870 C<L</is_utf8_invariant_string>>,
871 C<L</is_utf8_invariant_string_loc>>,
872 C<L</is_utf8_string>>,
873 C<L</is_utf8_string_loc>>,
874 C<L</is_utf8_string_loc_flags>>,
875 C<L</is_utf8_string_loclen>>,
876 C<L</is_utf8_string_loclen_flags>>,
877 C<L</is_utf8_fixed_width_buf_flags>>,
878 C<L</is_utf8_fixed_width_buf_loc_flags>>,
879 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
880 C<L</is_strict_utf8_string>>,
881 C<L</is_strict_utf8_string_loc>>,
882 C<L</is_strict_utf8_string_loclen>>,
883 C<L</is_c9strict_utf8_string>>,
884 C<L</is_c9strict_utf8_string_loc>>,
885 and
886 C<L</is_c9strict_utf8_string_loclen>>.
887 
888 =cut
889 */
890 
891 PERL_STATIC_INLINE bool
Perl_is_utf8_string_flags(const U8 * s,STRLEN len,const U32 flags)892 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
893 {
894     const U8 * first_variant;
895 
896     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
897     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
898                           |UTF8_DISALLOW_PERL_EXTENDED)));
899 
900     if (len == 0) {
901         len = strlen((const char *)s);
902     }
903 
904     if (flags == 0) {
905         return is_utf8_string(s, len);
906     }
907 
908     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
909                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
910     {
911         return is_strict_utf8_string(s, len);
912     }
913 
914     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
915                                        == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
916     {
917         return is_c9strict_utf8_string(s, len);
918     }
919 
920     if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
921         const U8* const send = s + len;
922         const U8* x = first_variant;
923 
924         while (x < send) {
925             STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
926             if (UNLIKELY(! cur_len)) {
927                 return FALSE;
928             }
929             x += cur_len;
930         }
931     }
932 
933     return TRUE;
934 }
935 
936 /*
937 
938 =for apidoc is_utf8_string_loc
939 
940 Like C<L</is_utf8_string>> but stores the location of the failure (in the
941 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
942 "utf8ness success") in the C<ep> pointer.
943 
944 See also C<L</is_utf8_string_loclen>>.
945 
946 =cut
947 */
948 
949 #define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
950 
951 /*
952 
953 =for apidoc is_utf8_string_loclen
954 
955 Like C<L</is_utf8_string>> but stores the location of the failure (in the
956 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
957 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
958 encoded characters in the C<el> pointer.
959 
960 See also C<L</is_utf8_string_loc>>.
961 
962 =cut
963 */
964 
965 PERL_STATIC_INLINE bool
Perl_is_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)966 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
967 {
968     const U8 * first_variant;
969 
970     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
971 
972     if (len == 0) {
973         len = strlen((const char *) s);
974     }
975 
976     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
977         if (el)
978             *el = len;
979 
980         if (ep) {
981             *ep = s + len;
982         }
983 
984         return TRUE;
985     }
986 
987     {
988         const U8* const send = s + len;
989         const U8* x = first_variant;
990         STRLEN outlen = first_variant - s;
991 
992         while (x < send) {
993             const STRLEN cur_len = isUTF8_CHAR(x, send);
994             if (UNLIKELY(! cur_len)) {
995                 break;
996             }
997             x += cur_len;
998             outlen++;
999         }
1000 
1001         if (el)
1002             *el = outlen;
1003 
1004         if (ep) {
1005             *ep = x;
1006         }
1007 
1008         return (x == send);
1009     }
1010 }
1011 
1012 /*
1013 
1014 =for apidoc isUTF8_CHAR
1015 
1016 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1017 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1018 that represents some code point; otherwise it evaluates to 0.  If non-zero, the
1019 value gives how many bytes starting at C<s> comprise the code point's
1020 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1021 form the first code point in C<s>, are not examined.
1022 
1023 The code point can be any that will fit in an IV on this machine, using Perl's
1024 extension to official UTF-8 to represent those higher than the Unicode maximum
1025 of 0x10FFFF.  That means that this macro is used to efficiently decide if the
1026 next few bytes in C<s> is legal UTF-8 for a single character.
1027 
1028 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1029 defined by Unicode to be fully interchangeable across applications;
1030 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1031 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1032 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1033 
1034 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1035 C<L</is_utf8_string_loclen>> to check entire strings.
1036 
1037 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1038 machines) is a valid UTF-8 character.
1039 
1040 =cut
1041 
1042 This uses an adaptation of the table and algorithm given in
1043 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1044 documentation of the original version.  A copyright notice for the original
1045 version is given at the beginning of this file.  The Perl adapation is
1046 documented at the definition of PL_extended_utf8_dfa_tab[].
1047 
1048 */
1049 
1050 PERL_STATIC_INLINE Size_t
Perl_isUTF8_CHAR(const U8 * const s0,const U8 * const e)1051 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1052 {
1053     const U8 * s = s0;
1054     UV state = 0;
1055 
1056     PERL_ARGS_ASSERT_ISUTF8_CHAR;
1057 
1058     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
1059      * code point, which can be returned immediately.  Otherwise, it is either
1060      * malformed, or for the start byte FF which the dfa doesn't handle (except
1061      * on 32-bit ASCII platforms where it trivially is an error).  Call a
1062      * helper function for the other platforms. */
1063 
1064     while (s < e && LIKELY(state != 1)) {
1065         state = PL_extended_utf8_dfa_tab[256
1066                                          + state
1067                                          + PL_extended_utf8_dfa_tab[*s]];
1068         if (state != 0) {
1069             s++;
1070             continue;
1071         }
1072 
1073         return s - s0 + 1;
1074     }
1075 
1076 #if defined(UV_IS_QUAD) || defined(EBCDIC)
1077 
1078     if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
1079        return is_utf8_char_helper(s0, e, 0);
1080     }
1081 
1082 #endif
1083 
1084     return 0;
1085 }
1086 
1087 /*
1088 
1089 =for apidoc isSTRICT_UTF8_CHAR
1090 
1091 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1092 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1093 Unicode code point completely acceptable for open interchange between all
1094 applications; otherwise it evaluates to 0.  If non-zero, the value gives how
1095 many bytes starting at C<s> comprise the code point's representation.  Any
1096 bytes remaining before C<e>, but beyond the ones needed to form the first code
1097 point in C<s>, are not examined.
1098 
1099 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1100 be a surrogate nor a non-character code point.  Thus this excludes any code
1101 point from Perl's extended UTF-8.
1102 
1103 This is used to efficiently decide if the next few bytes in C<s> is
1104 legal Unicode-acceptable UTF-8 for a single character.
1105 
1106 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1107 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1108 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1109 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1110 
1111 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1112 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1113 
1114 =cut
1115 
1116 This uses an adaptation of the tables and algorithm given in
1117 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1118 documentation of the original version.  A copyright notice for the original
1119 version is given at the beginning of this file.  The Perl adapation is
1120 documented at the definition of strict_extended_utf8_dfa_tab[].
1121 
1122 */
1123 
1124 PERL_STATIC_INLINE Size_t
Perl_isSTRICT_UTF8_CHAR(const U8 * const s0,const U8 * const e)1125 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1126 {
1127     const U8 * s = s0;
1128     UV state = 0;
1129 
1130     PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1131 
1132     while (s < e && LIKELY(state != 1)) {
1133         state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
1134 
1135         if (state != 0) {
1136             s++;
1137             continue;
1138         }
1139 
1140         return s - s0 + 1;
1141     }
1142 
1143 #ifndef EBCDIC
1144 
1145     /* The dfa above drops out for certain Hanguls; handle them specially */
1146     if (is_HANGUL_ED_utf8_safe(s0, e)) {
1147         return 3;
1148     }
1149 
1150 #endif
1151 
1152     return 0;
1153 }
1154 
1155 /*
1156 
1157 =for apidoc isC9_STRICT_UTF8_CHAR
1158 
1159 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1160 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1161 Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
1162 the value gives how many bytes starting at C<s> comprise the code point's
1163 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1164 form the first code point in C<s>, are not examined.
1165 
1166 The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
1167 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1168 code points.  This corresponds to
1169 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1170 which said that non-character code points are merely discouraged rather than
1171 completely forbidden in open interchange.  See
1172 L<perlunicode/Noncharacter code points>.
1173 
1174 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1175 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1176 
1177 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1178 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1179 
1180 =cut
1181 
1182 This uses an adaptation of the tables and algorithm given in
1183 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1184 documentation of the original version.  A copyright notice for the original
1185 version is given at the beginning of this file.  The Perl adapation is
1186 documented at the definition of PL_c9_utf8_dfa_tab[].
1187 
1188 */
1189 
1190 PERL_STATIC_INLINE Size_t
Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0,const U8 * const e)1191 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1192 {
1193     const U8 * s = s0;
1194     UV state = 0;
1195 
1196     PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1197 
1198     while (s < e && LIKELY(state != 1)) {
1199         state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
1200 
1201         if (state != 0) {
1202             s++;
1203             continue;
1204         }
1205 
1206         return s - s0 + 1;
1207     }
1208 
1209     return 0;
1210 }
1211 
1212 /*
1213 
1214 =for apidoc is_strict_utf8_string_loc
1215 
1216 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1217 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1218 "utf8ness success") in the C<ep> pointer.
1219 
1220 See also C<L</is_strict_utf8_string_loclen>>.
1221 
1222 =cut
1223 */
1224 
1225 #define is_strict_utf8_string_loc(s, len, ep)                               \
1226                                 is_strict_utf8_string_loclen(s, len, ep, 0)
1227 
1228 /*
1229 
1230 =for apidoc is_strict_utf8_string_loclen
1231 
1232 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1233 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1234 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1235 encoded characters in the C<el> pointer.
1236 
1237 See also C<L</is_strict_utf8_string_loc>>.
1238 
1239 =cut
1240 */
1241 
1242 PERL_STATIC_INLINE bool
Perl_is_strict_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)1243 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1244 {
1245     const U8 * first_variant;
1246 
1247     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1248 
1249     if (len == 0) {
1250         len = strlen((const char *) s);
1251     }
1252 
1253     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1254         if (el)
1255             *el = len;
1256 
1257         if (ep) {
1258             *ep = s + len;
1259         }
1260 
1261         return TRUE;
1262     }
1263 
1264     {
1265         const U8* const send = s + len;
1266         const U8* x = first_variant;
1267         STRLEN outlen = first_variant - s;
1268 
1269         while (x < send) {
1270             const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1271             if (UNLIKELY(! cur_len)) {
1272                 break;
1273             }
1274             x += cur_len;
1275             outlen++;
1276         }
1277 
1278         if (el)
1279             *el = outlen;
1280 
1281         if (ep) {
1282             *ep = x;
1283         }
1284 
1285         return (x == send);
1286     }
1287 }
1288 
1289 /*
1290 
1291 =for apidoc is_c9strict_utf8_string_loc
1292 
1293 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1294 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1295 "utf8ness success") in the C<ep> pointer.
1296 
1297 See also C<L</is_c9strict_utf8_string_loclen>>.
1298 
1299 =cut
1300 */
1301 
1302 #define is_c9strict_utf8_string_loc(s, len, ep)	                            \
1303                             is_c9strict_utf8_string_loclen(s, len, ep, 0)
1304 
1305 /*
1306 
1307 =for apidoc is_c9strict_utf8_string_loclen
1308 
1309 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1310 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1311 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1312 characters in the C<el> pointer.
1313 
1314 See also C<L</is_c9strict_utf8_string_loc>>.
1315 
1316 =cut
1317 */
1318 
1319 PERL_STATIC_INLINE bool
Perl_is_c9strict_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)1320 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1321 {
1322     const U8 * first_variant;
1323 
1324     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1325 
1326     if (len == 0) {
1327         len = strlen((const char *) s);
1328     }
1329 
1330     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1331         if (el)
1332             *el = len;
1333 
1334         if (ep) {
1335             *ep = s + len;
1336         }
1337 
1338         return TRUE;
1339     }
1340 
1341     {
1342         const U8* const send = s + len;
1343         const U8* x = first_variant;
1344         STRLEN outlen = first_variant - s;
1345 
1346         while (x < send) {
1347             const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1348             if (UNLIKELY(! cur_len)) {
1349                 break;
1350             }
1351             x += cur_len;
1352             outlen++;
1353         }
1354 
1355         if (el)
1356             *el = outlen;
1357 
1358         if (ep) {
1359             *ep = x;
1360         }
1361 
1362         return (x == send);
1363     }
1364 }
1365 
1366 /*
1367 
1368 =for apidoc is_utf8_string_loc_flags
1369 
1370 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1371 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1372 "utf8ness success") in the C<ep> pointer.
1373 
1374 See also C<L</is_utf8_string_loclen_flags>>.
1375 
1376 =cut
1377 */
1378 
1379 #define is_utf8_string_loc_flags(s, len, ep, flags)                         \
1380                         is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1381 
1382 
1383 /* The above 3 actual functions could have been moved into the more general one
1384  * just below, and made #defines that call it with the right 'flags'.  They are
1385  * currently kept separate to increase their chances of getting inlined */
1386 
1387 /*
1388 
1389 =for apidoc is_utf8_string_loclen_flags
1390 
1391 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1392 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1393 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1394 encoded characters in the C<el> pointer.
1395 
1396 See also C<L</is_utf8_string_loc_flags>>.
1397 
1398 =cut
1399 */
1400 
1401 PERL_STATIC_INLINE bool
Perl_is_utf8_string_loclen_flags(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el,const U32 flags)1402 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1403 {
1404     const U8 * first_variant;
1405 
1406     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1407     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1408                           |UTF8_DISALLOW_PERL_EXTENDED)));
1409 
1410     if (len == 0) {
1411         len = strlen((const char *) s);
1412     }
1413 
1414     if (flags == 0) {
1415         return is_utf8_string_loclen(s, len, ep, el);
1416     }
1417 
1418     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1419                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1420     {
1421         return is_strict_utf8_string_loclen(s, len, ep, el);
1422     }
1423 
1424     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1425                                     == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1426     {
1427         return is_c9strict_utf8_string_loclen(s, len, ep, el);
1428     }
1429 
1430     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1431         if (el)
1432             *el = len;
1433 
1434         if (ep) {
1435             *ep = s + len;
1436         }
1437 
1438         return TRUE;
1439     }
1440 
1441     {
1442         const U8* send = s + len;
1443         const U8* x = first_variant;
1444         STRLEN outlen = first_variant - s;
1445 
1446         while (x < send) {
1447             const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1448             if (UNLIKELY(! cur_len)) {
1449                 break;
1450             }
1451             x += cur_len;
1452             outlen++;
1453         }
1454 
1455         if (el)
1456             *el = outlen;
1457 
1458         if (ep) {
1459             *ep = x;
1460         }
1461 
1462         return (x == send);
1463     }
1464 }
1465 
1466 /*
1467 =for apidoc utf8_distance
1468 
1469 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1470 and C<b>.
1471 
1472 WARNING: use only if you *know* that the pointers point inside the
1473 same UTF-8 buffer.
1474 
1475 =cut
1476 */
1477 
1478 PERL_STATIC_INLINE IV
Perl_utf8_distance(pTHX_ const U8 * a,const U8 * b)1479 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1480 {
1481     PERL_ARGS_ASSERT_UTF8_DISTANCE;
1482 
1483     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
1484 }
1485 
1486 /*
1487 =for apidoc utf8_hop
1488 
1489 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
1490 forward or backward.
1491 
1492 WARNING: do not use the following unless you *know* C<off> is within
1493 the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
1494 on the first byte of character or just after the last byte of a character.
1495 
1496 =cut
1497 */
1498 
1499 PERL_STATIC_INLINE U8 *
Perl_utf8_hop(const U8 * s,SSize_t off)1500 Perl_utf8_hop(const U8 *s, SSize_t off)
1501 {
1502     PERL_ARGS_ASSERT_UTF8_HOP;
1503 
1504     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1505      * the bitops (especially ~) can create illegal UTF-8.
1506      * In other words: in Perl UTF-8 is not just for Unicode. */
1507 
1508     if (off >= 0) {
1509 	while (off--)
1510 	    s += UTF8SKIP(s);
1511     }
1512     else {
1513 	while (off++) {
1514 	    s--;
1515 	    while (UTF8_IS_CONTINUATION(*s))
1516 		s--;
1517 	}
1518     }
1519     GCC_DIAG_IGNORE(-Wcast-qual)
1520     return (U8 *)s;
1521     GCC_DIAG_RESTORE
1522 }
1523 
1524 /*
1525 =for apidoc utf8_hop_forward
1526 
1527 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1528 forward.
1529 
1530 C<off> must be non-negative.
1531 
1532 C<s> must be before or equal to C<end>.
1533 
1534 When moving forward it will not move beyond C<end>.
1535 
1536 Will not exceed this limit even if the string is not valid "UTF-8".
1537 
1538 =cut
1539 */
1540 
1541 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_forward(const U8 * s,SSize_t off,const U8 * end)1542 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
1543 {
1544     PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
1545 
1546     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1547      * the bitops (especially ~) can create illegal UTF-8.
1548      * In other words: in Perl UTF-8 is not just for Unicode. */
1549 
1550     assert(s <= end);
1551     assert(off >= 0);
1552 
1553     while (off--) {
1554         STRLEN skip = UTF8SKIP(s);
1555         if ((STRLEN)(end - s) <= skip) {
1556             GCC_DIAG_IGNORE(-Wcast-qual)
1557             return (U8 *)end;
1558             GCC_DIAG_RESTORE
1559         }
1560         s += skip;
1561     }
1562 
1563     GCC_DIAG_IGNORE(-Wcast-qual)
1564     return (U8 *)s;
1565     GCC_DIAG_RESTORE
1566 }
1567 
1568 /*
1569 =for apidoc utf8_hop_back
1570 
1571 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1572 backward.
1573 
1574 C<off> must be non-positive.
1575 
1576 C<s> must be after or equal to C<start>.
1577 
1578 When moving backward it will not move before C<start>.
1579 
1580 Will not exceed this limit even if the string is not valid "UTF-8".
1581 
1582 =cut
1583 */
1584 
1585 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_back(const U8 * s,SSize_t off,const U8 * start)1586 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
1587 {
1588     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
1589 
1590     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1591      * the bitops (especially ~) can create illegal UTF-8.
1592      * In other words: in Perl UTF-8 is not just for Unicode. */
1593 
1594     assert(start <= s);
1595     assert(off <= 0);
1596 
1597     while (off++ && s > start) {
1598         do {
1599             s--;
1600         } while (UTF8_IS_CONTINUATION(*s) && s > start);
1601     }
1602 
1603     GCC_DIAG_IGNORE(-Wcast-qual)
1604     return (U8 *)s;
1605     GCC_DIAG_RESTORE
1606 }
1607 
1608 /*
1609 =for apidoc utf8_hop_safe
1610 
1611 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
1612 either forward or backward.
1613 
1614 When moving backward it will not move before C<start>.
1615 
1616 When moving forward it will not move beyond C<end>.
1617 
1618 Will not exceed those limits even if the string is not valid "UTF-8".
1619 
1620 =cut
1621 */
1622 
1623 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_safe(const U8 * s,SSize_t off,const U8 * start,const U8 * end)1624 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
1625 {
1626     PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
1627 
1628     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
1629      * the bitops (especially ~) can create illegal UTF-8.
1630      * In other words: in Perl UTF-8 is not just for Unicode. */
1631 
1632     assert(start <= s && s <= end);
1633 
1634     if (off >= 0) {
1635         return utf8_hop_forward(s, off, end);
1636     }
1637     else {
1638         return utf8_hop_back(s, off, start);
1639     }
1640 }
1641 
1642 /*
1643 
1644 =for apidoc is_utf8_valid_partial_char
1645 
1646 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
1647 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
1648 points.  Otherwise, it returns 1 if there exists at least one non-empty
1649 sequence of bytes that when appended to sequence C<s>, starting at position
1650 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
1651 otherwise returns 0.
1652 
1653 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
1654 point.
1655 
1656 This is useful when a fixed-length buffer is being tested for being well-formed
1657 UTF-8, but the final few bytes in it don't comprise a full character; that is,
1658 it is split somewhere in the middle of the final code point's UTF-8
1659 representation.  (Presumably when the buffer is refreshed with the next chunk
1660 of data, the new first bytes will complete the partial code point.)   This
1661 function is used to verify that the final bytes in the current buffer are in
1662 fact the legal beginning of some code point, so that if they aren't, the
1663 failure can be signalled without having to wait for the next read.
1664 
1665 =cut
1666 */
1667 #define is_utf8_valid_partial_char(s, e)                                    \
1668                                 is_utf8_valid_partial_char_flags(s, e, 0)
1669 
1670 /*
1671 
1672 =for apidoc is_utf8_valid_partial_char_flags
1673 
1674 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
1675 or not the input is a valid UTF-8 encoded partial character, but it takes an
1676 extra parameter, C<flags>, which can further restrict which code points are
1677 considered valid.
1678 
1679 If C<flags> is 0, this behaves identically to
1680 C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
1681 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
1682 there is any sequence of bytes that can complete the input partial character in
1683 such a way that a non-prohibited character is formed, the function returns
1684 TRUE; otherwise FALSE.  Non character code points cannot be determined based on
1685 partial character input.  But many  of the other possible excluded types can be
1686 determined from just the first one or two bytes.
1687 
1688 =cut
1689  */
1690 
1691 PERL_STATIC_INLINE bool
Perl_is_utf8_valid_partial_char_flags(const U8 * const s,const U8 * const e,const U32 flags)1692 Perl_is_utf8_valid_partial_char_flags(const U8 * const s, const U8 * const e, const U32 flags)
1693 {
1694     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
1695 
1696     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1697                           |UTF8_DISALLOW_PERL_EXTENDED)));
1698 
1699     if (s >= e || s + UTF8SKIP(s) <= e) {
1700         return FALSE;
1701     }
1702 
1703     return cBOOL(is_utf8_char_helper(s, e, flags));
1704 }
1705 
1706 /*
1707 
1708 =for apidoc is_utf8_fixed_width_buf_flags
1709 
1710 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
1711 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
1712 otherwise it returns FALSE.
1713 
1714 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
1715 without restriction.  If the final few bytes of the buffer do not form a
1716 complete code point, this will return TRUE anyway, provided that
1717 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
1718 
1719 If C<flags> in non-zero, it can be any combination of the
1720 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
1721 same meanings.
1722 
1723 This function differs from C<L</is_utf8_string_flags>> only in that the latter
1724 returns FALSE if the final few bytes of the string don't form a complete code
1725 point.
1726 
1727 =cut
1728  */
1729 #define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
1730                 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
1731 
1732 /*
1733 
1734 =for apidoc is_utf8_fixed_width_buf_loc_flags
1735 
1736 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
1737 failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
1738 to the beginning of any partial character at the end of the buffer; if there is
1739 no partial character C<*ep> will contain C<s>+C<len>.
1740 
1741 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
1742 
1743 =cut
1744 */
1745 
1746 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
1747                 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
1748 
1749 /*
1750 
1751 =for apidoc is_utf8_fixed_width_buf_loclen_flags
1752 
1753 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
1754 complete, valid characters found in the C<el> pointer.
1755 
1756 =cut
1757 */
1758 
1759 PERL_STATIC_INLINE bool
Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,STRLEN len,const U8 ** ep,STRLEN * el,const U32 flags)1760 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
1761                                        STRLEN len,
1762                                        const U8 **ep,
1763                                        STRLEN *el,
1764                                        const U32 flags)
1765 {
1766     const U8 * maybe_partial;
1767 
1768     PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
1769 
1770     if (! ep) {
1771         ep  = &maybe_partial;
1772     }
1773 
1774     /* If it's entirely valid, return that; otherwise see if the only error is
1775      * that the final few bytes are for a partial character */
1776     return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
1777            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
1778 }
1779 
1780 PERL_STATIC_INLINE UV
Perl_utf8n_to_uvchr_msgs(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors,AV ** msgs)1781 Perl_utf8n_to_uvchr_msgs(const U8 *s,
1782                       STRLEN curlen,
1783                       STRLEN *retlen,
1784                       const U32 flags,
1785                       U32 * errors,
1786                       AV ** msgs)
1787 {
1788     /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
1789      * simple cases, and, if necessary calls a helper function to deal with the
1790      * more complex ones.  Almost all well-formed non-problematic code points
1791      * are considered simple, so that it's unlikely that the helper function
1792      * will need to be called.
1793      *
1794      * This is an adaptation of the tables and algorithm given in
1795      * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
1796      * comprehensive documentation of the original version.  A copyright notice
1797      * for the original version is given at the beginning of this file.  The
1798      * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
1799      */
1800 
1801     const U8 * const s0 = s;
1802     const U8 * send = s0 + curlen;
1803     UV uv = 0;      /* The 0 silences some stupid compilers */
1804     UV state = 0;
1805 
1806     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
1807 
1808     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
1809      * non-problematic code point, which can be returned immediately.
1810      * Otherwise we call a helper function to figure out the more complicated
1811      * cases. */
1812 
1813     while (s < send && LIKELY(state != 1)) {
1814         UV type = PL_strict_utf8_dfa_tab[*s];
1815 
1816         uv = (state == 0)
1817              ?  ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
1818              : UTF8_ACCUMULATE(uv, *s);
1819         state = PL_strict_utf8_dfa_tab[256 + state + type];
1820 
1821         if (state != 0) {
1822             s++;
1823             continue;
1824         }
1825 
1826         if (retlen) {
1827             *retlen = s - s0 + 1;
1828         }
1829         if (errors) {
1830             *errors = 0;
1831         }
1832         if (msgs) {
1833             *msgs = NULL;
1834         }
1835 
1836         return UNI_TO_NATIVE(uv);
1837     }
1838 
1839     /* Here is potentially problematic.  Use the full mechanism */
1840     return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
1841 }
1842 
1843 PERL_STATIC_INLINE UV
Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 * s,const U8 * send,STRLEN * retlen)1844 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1845 {
1846     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
1847 
1848     assert(s < send);
1849 
1850     if (! ckWARN_d(WARN_UTF8)) {
1851 
1852         /* EMPTY is not really allowed, and asserts on debugging builds.  But
1853          * on non-debugging we have to deal with it, and this causes it to
1854          * return the REPLACEMENT CHARACTER, as the documentation indicates */
1855         return utf8n_to_uvchr(s, send - s, retlen,
1856                               (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
1857     }
1858     else {
1859         UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
1860         if (retlen && ret == 0 && *s != '\0') {
1861             *retlen = (STRLEN) -1;
1862         }
1863 
1864         return ret;
1865     }
1866 }
1867 
1868 /* ------------------------------- perl.h ----------------------------- */
1869 
1870 /*
1871 =head1 Miscellaneous Functions
1872 
1873 =for apidoc is_safe_syscall
1874 
1875 Test that the given C<pv> (with length C<len>) doesn't contain any internal
1876 C<NUL> characters.
1877 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
1878 category, and return FALSE.
1879 
1880 Return TRUE if the name is safe.
1881 
1882 C<what> and C<op_name> are used in any warning.
1883 
1884 Used by the C<IS_SAFE_SYSCALL()> macro.
1885 
1886 =cut
1887 */
1888 
1889 PERL_STATIC_INLINE bool
Perl_is_safe_syscall(pTHX_ const char * pv,STRLEN len,const char * what,const char * op_name)1890 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
1891 {
1892     /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
1893      * perl itself uses xce*() functions which accept 8-bit strings.
1894      */
1895 
1896     PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
1897 
1898     if (len > 1) {
1899         char *null_at;
1900         if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
1901                 SETERRNO(ENOENT, LIB_INVARG);
1902                 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
1903                                    "Invalid \\0 character in %s for %s: %s\\0%s",
1904                                    what, op_name, pv, null_at+1);
1905                 return FALSE;
1906         }
1907     }
1908 
1909     return TRUE;
1910 }
1911 
1912 /*
1913 
1914 Return true if the supplied filename has a newline character
1915 immediately before the first (hopefully only) NUL.
1916 
1917 My original look at this incorrectly used the len from SvPV(), but
1918 that's incorrect, since we allow for a NUL in pv[len-1].
1919 
1920 So instead, strlen() and work from there.
1921 
1922 This allow for the user reading a filename, forgetting to chomp it,
1923 then calling:
1924 
1925   open my $foo, "$file\0";
1926 
1927 */
1928 
1929 #ifdef PERL_CORE
1930 
1931 PERL_STATIC_INLINE bool
S_should_warn_nl(const char * pv)1932 S_should_warn_nl(const char *pv)
1933 {
1934     STRLEN len;
1935 
1936     PERL_ARGS_ASSERT_SHOULD_WARN_NL;
1937 
1938     len = strlen(pv);
1939 
1940     return len > 0 && pv[len-1] == '\n';
1941 }
1942 
1943 #endif
1944 
1945 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
1946 
1947 PERL_STATIC_INLINE bool
S_lossless_NV_to_IV(const NV nv,IV * ivp)1948 S_lossless_NV_to_IV(const NV nv, IV *ivp)
1949 {
1950     /* This function determines if the input NV 'nv' may be converted without
1951      * loss of data to an IV.  If not, it returns FALSE taking no other action.
1952      * But if it is possible, it does the conversion, returning TRUE, and
1953      * storing the converted result in '*ivp' */
1954 
1955     PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
1956 
1957 #  if  defined(Perl_isnan)
1958 
1959     if (UNLIKELY(Perl_isnan(nv))) {
1960         return FALSE;
1961     }
1962 
1963 #  endif
1964 
1965     if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
1966         return FALSE;
1967     }
1968 
1969     if ((IV) nv != nv) {
1970         return FALSE;
1971     }
1972 
1973     *ivp = (IV) nv;
1974     return TRUE;
1975 }
1976 
1977 #endif
1978 
1979 /* ------------------ regcomp.c, toke.c ------------ */
1980 
1981 #if defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C)
1982 
1983 /*
1984  - regcurly - a little FSA that accepts {\d+,?\d*}
1985     Pulled from reg.c.
1986  */
1987 PERL_STATIC_INLINE bool
S_regcurly(const char * s)1988 S_regcurly(const char *s)
1989 {
1990     PERL_ARGS_ASSERT_REGCURLY;
1991 
1992     if (*s++ != '{')
1993 	return FALSE;
1994     if (!isDIGIT(*s))
1995 	return FALSE;
1996     while (isDIGIT(*s))
1997 	s++;
1998     if (*s == ',') {
1999 	s++;
2000 	while (isDIGIT(*s))
2001 	    s++;
2002     }
2003 
2004     return *s == '}';
2005 }
2006 
2007 #endif
2008 
2009 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2010 
2011 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_C) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2012 
2013 #define MAX_CHARSET_NAME_LENGTH 2
2014 
2015 PERL_STATIC_INLINE const char *
S_get_regex_charset_name(const U32 flags,STRLEN * const lenp)2016 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2017 {
2018     PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2019 
2020     /* Returns a string that corresponds to the name of the regex character set
2021      * given by 'flags', and *lenp is set the length of that string, which
2022      * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2023 
2024     *lenp = 1;
2025     switch (get_regex_charset(flags)) {
2026         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2027         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
2028         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2029 	case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2030 	case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2031 	    *lenp = 2;
2032 	    return ASCII_MORE_RESTRICT_PAT_MODS;
2033     }
2034     /* The NOT_REACHED; hides an assert() which has a rather complex
2035      * definition in perl.h. */
2036     NOT_REACHED; /* NOTREACHED */
2037     return "?";	    /* Unknown */
2038 }
2039 
2040 #endif
2041 
2042 /*
2043 
2044 Return false if any get magic is on the SV other than taint magic.
2045 
2046 */
2047 
2048 PERL_STATIC_INLINE bool
Perl_sv_only_taint_gmagic(SV * sv)2049 Perl_sv_only_taint_gmagic(SV *sv)
2050 {
2051     MAGIC *mg = SvMAGIC(sv);
2052 
2053     PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2054 
2055     while (mg) {
2056         if (mg->mg_type != PERL_MAGIC_taint
2057             && !(mg->mg_flags & MGf_GSKIP)
2058             && mg->mg_virtual->svt_get) {
2059             return FALSE;
2060         }
2061         mg = mg->mg_moremagic;
2062     }
2063 
2064     return TRUE;
2065 }
2066 
2067 /* ------------------ cop.h ------------------------------------------- */
2068 
2069 /* implement GIMME_V() macro */
2070 
2071 PERL_STATIC_INLINE U8
Perl_gimme_V(pTHX)2072 Perl_gimme_V(pTHX)
2073 {
2074     I32 cxix;
2075     U8  gimme = (PL_op->op_flags & OPf_WANT);
2076 
2077     if (gimme)
2078         return gimme;
2079     cxix = PL_curstackinfo->si_cxsubix;
2080     if (cxix < 0)
2081         return G_VOID;
2082     assert(cxstack[cxix].blk_gimme & G_WANT);
2083     return (cxstack[cxix].blk_gimme & G_WANT);
2084 }
2085 
2086 
2087 /* Enter a block. Push a new base context and return its address. */
2088 
2089 PERL_STATIC_INLINE PERL_CONTEXT *
Perl_cx_pushblock(pTHX_ U8 type,U8 gimme,SV ** sp,I32 saveix)2090 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2091 {
2092     PERL_CONTEXT * cx;
2093 
2094     PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2095 
2096     CXINC;
2097     cx = CX_CUR();
2098     cx->cx_type        = type;
2099     cx->blk_gimme      = gimme;
2100     cx->blk_oldsaveix  = saveix;
2101     cx->blk_oldsp      = (I32)(sp - PL_stack_base);
2102     cx->blk_oldcop     = PL_curcop;
2103     cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
2104     cx->blk_oldscopesp = PL_scopestack_ix;
2105     cx->blk_oldpm      = PL_curpm;
2106     cx->blk_old_tmpsfloor = PL_tmps_floor;
2107 
2108     PL_tmps_floor        = PL_tmps_ix;
2109     CX_DEBUG(cx, "PUSH");
2110     return cx;
2111 }
2112 
2113 
2114 /* Exit a block (RETURN and LAST). */
2115 
2116 PERL_STATIC_INLINE void
Perl_cx_popblock(pTHX_ PERL_CONTEXT * cx)2117 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2118 {
2119     PERL_ARGS_ASSERT_CX_POPBLOCK;
2120 
2121     CX_DEBUG(cx, "POP");
2122     /* these 3 are common to cx_popblock and cx_topblock */
2123     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2124     PL_scopestack_ix = cx->blk_oldscopesp;
2125     PL_curpm         = cx->blk_oldpm;
2126 
2127     /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2128      * and leaves a CX entry lying around for repeated use, so
2129      * skip for multicall */                  \
2130     assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2131             || PL_savestack_ix == cx->blk_oldsaveix);
2132     PL_curcop     = cx->blk_oldcop;
2133     PL_tmps_floor = cx->blk_old_tmpsfloor;
2134 }
2135 
2136 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2137  * Whereas cx_popblock() restores the state to the point just before
2138  * cx_pushblock() was called,  cx_topblock() restores it to the point just
2139  * *after* cx_pushblock() was called. */
2140 
2141 PERL_STATIC_INLINE void
Perl_cx_topblock(pTHX_ PERL_CONTEXT * cx)2142 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2143 {
2144     PERL_ARGS_ASSERT_CX_TOPBLOCK;
2145 
2146     CX_DEBUG(cx, "TOP");
2147     /* these 3 are common to cx_popblock and cx_topblock */
2148     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2149     PL_scopestack_ix = cx->blk_oldscopesp;
2150     PL_curpm         = cx->blk_oldpm;
2151 
2152     PL_stack_sp      = PL_stack_base + cx->blk_oldsp;
2153 }
2154 
2155 
2156 PERL_STATIC_INLINE void
Perl_cx_pushsub(pTHX_ PERL_CONTEXT * cx,CV * cv,OP * retop,bool hasargs)2157 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2158 {
2159     U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2160 
2161     PERL_ARGS_ASSERT_CX_PUSHSUB;
2162 
2163     PERL_DTRACE_PROBE_ENTRY(cv);
2164     cx->blk_sub.old_cxsubix     = PL_curstackinfo->si_cxsubix;
2165     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2166     cx->blk_sub.cv = cv;
2167     cx->blk_sub.olddepth = CvDEPTH(cv);
2168     cx->blk_sub.prevcomppad = PL_comppad;
2169     cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2170     cx->blk_sub.retop = retop;
2171     SvREFCNT_inc_simple_void_NN(cv);
2172     cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2173 }
2174 
2175 
2176 /* subsets of cx_popsub() */
2177 
2178 PERL_STATIC_INLINE void
Perl_cx_popsub_common(pTHX_ PERL_CONTEXT * cx)2179 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2180 {
2181     CV *cv;
2182 
2183     PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2184     assert(CxTYPE(cx) == CXt_SUB);
2185 
2186     PL_comppad = cx->blk_sub.prevcomppad;
2187     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2188     cv = cx->blk_sub.cv;
2189     CvDEPTH(cv) = cx->blk_sub.olddepth;
2190     cx->blk_sub.cv = NULL;
2191     SvREFCNT_dec(cv);
2192     PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2193 }
2194 
2195 
2196 /* handle the @_ part of leaving a sub */
2197 
2198 PERL_STATIC_INLINE void
Perl_cx_popsub_args(pTHX_ PERL_CONTEXT * cx)2199 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2200 {
2201     AV *av;
2202 
2203     PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2204     assert(CxTYPE(cx) == CXt_SUB);
2205     assert(AvARRAY(MUTABLE_AV(
2206         PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2207                 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2208 
2209     CX_POP_SAVEARRAY(cx);
2210     av = MUTABLE_AV(PAD_SVl(0));
2211     if (UNLIKELY(AvREAL(av)))
2212         /* abandon @_ if it got reified */
2213         clear_defarray(av, 0);
2214     else {
2215         CLEAR_ARGARRAY(av);
2216     }
2217 }
2218 
2219 
2220 PERL_STATIC_INLINE void
Perl_cx_popsub(pTHX_ PERL_CONTEXT * cx)2221 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2222 {
2223     PERL_ARGS_ASSERT_CX_POPSUB;
2224     assert(CxTYPE(cx) == CXt_SUB);
2225 
2226     PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2227 
2228     if (CxHASARGS(cx))
2229         cx_popsub_args(cx);
2230     cx_popsub_common(cx);
2231 }
2232 
2233 
2234 PERL_STATIC_INLINE void
Perl_cx_pushformat(pTHX_ PERL_CONTEXT * cx,CV * cv,OP * retop,GV * gv)2235 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2236 {
2237     PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2238 
2239     cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2240     PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2241     cx->blk_format.cv          = cv;
2242     cx->blk_format.retop       = retop;
2243     cx->blk_format.gv          = gv;
2244     cx->blk_format.dfoutgv     = PL_defoutgv;
2245     cx->blk_format.prevcomppad = PL_comppad;
2246     cx->blk_u16                = 0;
2247 
2248     SvREFCNT_inc_simple_void_NN(cv);
2249     CvDEPTH(cv)++;
2250     SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2251 }
2252 
2253 
2254 PERL_STATIC_INLINE void
Perl_cx_popformat(pTHX_ PERL_CONTEXT * cx)2255 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2256 {
2257     CV *cv;
2258     GV *dfout;
2259 
2260     PERL_ARGS_ASSERT_CX_POPFORMAT;
2261     assert(CxTYPE(cx) == CXt_FORMAT);
2262 
2263     dfout = cx->blk_format.dfoutgv;
2264     setdefout(dfout);
2265     cx->blk_format.dfoutgv = NULL;
2266     SvREFCNT_dec_NN(dfout);
2267 
2268     PL_comppad = cx->blk_format.prevcomppad;
2269     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2270     cv = cx->blk_format.cv;
2271     cx->blk_format.cv = NULL;
2272     --CvDEPTH(cv);
2273     SvREFCNT_dec_NN(cv);
2274     PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2275 }
2276 
2277 
2278 PERL_STATIC_INLINE void
Perl_cx_pusheval(pTHX_ PERL_CONTEXT * cx,OP * retop,SV * namesv)2279 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2280 {
2281     PERL_ARGS_ASSERT_CX_PUSHEVAL;
2282 
2283     cx->blk_eval.old_cxsubix   = PL_curstackinfo->si_cxsubix;
2284     PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2285     cx->blk_eval.retop         = retop;
2286     cx->blk_eval.old_namesv    = namesv;
2287     cx->blk_eval.old_eval_root = PL_eval_root;
2288     cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
2289     cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
2290     cx->blk_eval.cur_top_env   = PL_top_env;
2291 
2292     assert(!(PL_in_eval     & ~ 0x3F));
2293     assert(!(PL_op->op_type & ~0x1FF));
2294     cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2295 }
2296 
2297 
2298 PERL_STATIC_INLINE void
Perl_cx_popeval(pTHX_ PERL_CONTEXT * cx)2299 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2300 {
2301     SV *sv;
2302 
2303     PERL_ARGS_ASSERT_CX_POPEVAL;
2304     assert(CxTYPE(cx) == CXt_EVAL);
2305 
2306     PL_in_eval = CxOLD_IN_EVAL(cx);
2307     assert(!(PL_in_eval & 0xc0));
2308     PL_eval_root = cx->blk_eval.old_eval_root;
2309     sv = cx->blk_eval.cur_text;
2310     if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2311         cx->blk_eval.cur_text = NULL;
2312         SvREFCNT_dec_NN(sv);
2313     }
2314 
2315     sv = cx->blk_eval.old_namesv;
2316     if (sv) {
2317         cx->blk_eval.old_namesv = NULL;
2318         SvREFCNT_dec_NN(sv);
2319     }
2320     PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2321 }
2322 
2323 
2324 /* push a plain loop, i.e.
2325  *     { block }
2326  *     while (cond) { block }
2327  *     for (init;cond;continue) { block }
2328  * This loop can be last/redo'ed etc.
2329  */
2330 
2331 PERL_STATIC_INLINE void
Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT * cx)2332 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
2333 {
2334     PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
2335     cx->blk_loop.my_op = cLOOP;
2336 }
2337 
2338 
2339 /* push a true for loop, i.e.
2340  *     for var (list) { block }
2341  */
2342 
2343 PERL_STATIC_INLINE void
Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT * cx,void * itervarp,SV * itersave)2344 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
2345 {
2346     PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
2347 
2348     /* this one line is common with cx_pushloop_plain */
2349     cx->blk_loop.my_op = cLOOP;
2350 
2351     cx->blk_loop.itervar_u.svp = (SV**)itervarp;
2352     cx->blk_loop.itersave      = itersave;
2353 #ifdef USE_ITHREADS
2354     cx->blk_loop.oldcomppad = PL_comppad;
2355 #endif
2356 }
2357 
2358 
2359 /* pop all loop types, including plain */
2360 
2361 PERL_STATIC_INLINE void
Perl_cx_poploop(pTHX_ PERL_CONTEXT * cx)2362 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
2363 {
2364     PERL_ARGS_ASSERT_CX_POPLOOP;
2365 
2366     assert(CxTYPE_is_LOOP(cx));
2367     if (  CxTYPE(cx) == CXt_LOOP_ARY
2368        || CxTYPE(cx) == CXt_LOOP_LAZYSV)
2369     {
2370         /* Free ary or cur. This assumes that state_u.ary.ary
2371          * aligns with state_u.lazysv.cur. See cx_dup() */
2372         SV *sv = cx->blk_loop.state_u.lazysv.cur;
2373         cx->blk_loop.state_u.lazysv.cur = NULL;
2374         SvREFCNT_dec_NN(sv);
2375         if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
2376             sv = cx->blk_loop.state_u.lazysv.end;
2377             cx->blk_loop.state_u.lazysv.end = NULL;
2378             SvREFCNT_dec_NN(sv);
2379         }
2380     }
2381     if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
2382         SV *cursv;
2383         SV **svp = (cx)->blk_loop.itervar_u.svp;
2384         if ((cx->cx_type & CXp_FOR_GV))
2385             svp = &GvSV((GV*)svp);
2386         cursv = *svp;
2387         *svp = cx->blk_loop.itersave;
2388         cx->blk_loop.itersave = NULL;
2389         SvREFCNT_dec(cursv);
2390     }
2391 }
2392 
2393 
2394 PERL_STATIC_INLINE void
Perl_cx_pushwhen(pTHX_ PERL_CONTEXT * cx)2395 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
2396 {
2397     PERL_ARGS_ASSERT_CX_PUSHWHEN;
2398 
2399     cx->blk_givwhen.leave_op = cLOGOP->op_other;
2400 }
2401 
2402 
2403 PERL_STATIC_INLINE void
Perl_cx_popwhen(pTHX_ PERL_CONTEXT * cx)2404 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
2405 {
2406     PERL_ARGS_ASSERT_CX_POPWHEN;
2407     assert(CxTYPE(cx) == CXt_WHEN);
2408 
2409     PERL_UNUSED_ARG(cx);
2410     PERL_UNUSED_CONTEXT;
2411     /* currently NOOP */
2412 }
2413 
2414 
2415 PERL_STATIC_INLINE void
Perl_cx_pushgiven(pTHX_ PERL_CONTEXT * cx,SV * orig_defsv)2416 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
2417 {
2418     PERL_ARGS_ASSERT_CX_PUSHGIVEN;
2419 
2420     cx->blk_givwhen.leave_op = cLOGOP->op_other;
2421     cx->blk_givwhen.defsv_save = orig_defsv;
2422 }
2423 
2424 
2425 PERL_STATIC_INLINE void
Perl_cx_popgiven(pTHX_ PERL_CONTEXT * cx)2426 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
2427 {
2428     SV *sv;
2429 
2430     PERL_ARGS_ASSERT_CX_POPGIVEN;
2431     assert(CxTYPE(cx) == CXt_GIVEN);
2432 
2433     sv = GvSV(PL_defgv);
2434     GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
2435     cx->blk_givwhen.defsv_save = NULL;
2436     SvREFCNT_dec(sv);
2437 }
2438 
2439 /* ------------------ util.h ------------------------------------------- */
2440 
2441 /*
2442 =head1 Miscellaneous Functions
2443 
2444 =for apidoc foldEQ
2445 
2446 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2447 same
2448 case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
2449 match themselves and their opposite case counterparts.  Non-cased and non-ASCII
2450 range bytes match only themselves.
2451 
2452 =cut
2453 */
2454 
2455 PERL_STATIC_INLINE I32
Perl_foldEQ(const char * s1,const char * s2,I32 len)2456 Perl_foldEQ(const char *s1, const char *s2, I32 len)
2457 {
2458     const U8 *a = (const U8 *)s1;
2459     const U8 *b = (const U8 *)s2;
2460 
2461     PERL_ARGS_ASSERT_FOLDEQ;
2462 
2463     assert(len >= 0);
2464 
2465     while (len--) {
2466 	if (*a != *b && *a != PL_fold[*b])
2467 	    return 0;
2468 	a++,b++;
2469     }
2470     return 1;
2471 }
2472 
2473 PERL_STATIC_INLINE I32
Perl_foldEQ_latin1(const char * s1,const char * s2,I32 len)2474 Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
2475 {
2476     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
2477      * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
2478      * does not check for this.  Nor does it check that the strings each have
2479      * at least 'len' characters. */
2480 
2481     const U8 *a = (const U8 *)s1;
2482     const U8 *b = (const U8 *)s2;
2483 
2484     PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
2485 
2486     assert(len >= 0);
2487 
2488     while (len--) {
2489 	if (*a != *b && *a != PL_fold_latin1[*b]) {
2490 	    return 0;
2491 	}
2492 	a++, b++;
2493     }
2494     return 1;
2495 }
2496 
2497 /*
2498 =for apidoc foldEQ_locale
2499 
2500 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
2501 same case-insensitively in the current locale; false otherwise.
2502 
2503 =cut
2504 */
2505 
2506 PERL_STATIC_INLINE I32
Perl_foldEQ_locale(const char * s1,const char * s2,I32 len)2507 Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
2508 {
2509     dVAR;
2510     const U8 *a = (const U8 *)s1;
2511     const U8 *b = (const U8 *)s2;
2512 
2513     PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
2514 
2515     assert(len >= 0);
2516 
2517     while (len--) {
2518 	if (*a != *b && *a != PL_fold_locale[*b])
2519 	    return 0;
2520 	a++,b++;
2521     }
2522     return 1;
2523 }
2524 
2525 /*
2526 =for apidoc my_strnlen
2527 
2528 The C library C<strnlen> if available, or a Perl implementation of it.
2529 
2530 C<my_strnlen()> computes the length of the string, up to C<maxlen>
2531 characters.  It will never attempt to address more than C<maxlen>
2532 characters, making it suitable for use with strings that are not
2533 guaranteed to be NUL-terminated.
2534 
2535 =cut
2536 
2537 Description stolen from http://man.openbsd.org/strnlen.3,
2538 implementation stolen from PostgreSQL.
2539 */
2540 #ifndef HAS_STRNLEN
2541 
2542 PERL_STATIC_INLINE Size_t
Perl_my_strnlen(const char * str,Size_t maxlen)2543 Perl_my_strnlen(const char *str, Size_t maxlen)
2544 {
2545     const char *end = (char *) memchr(str, '\0', maxlen);
2546 
2547     PERL_ARGS_ASSERT_MY_STRNLEN;
2548 
2549     if (end == NULL) return maxlen;
2550     return end - str;
2551 }
2552 
2553 #endif
2554 
2555 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
2556 
2557 PERL_STATIC_INLINE void *
S_my_memrchr(const char * s,const char c,const STRLEN len)2558 S_my_memrchr(const char * s, const char c, const STRLEN len)
2559 {
2560     /* memrchr(), since many platforms lack it */
2561 
2562     const char * t = s + len - 1;
2563 
2564     PERL_ARGS_ASSERT_MY_MEMRCHR;
2565 
2566     while (t >= s) {
2567         if (*t == c) {
2568             return (void *) t;
2569         }
2570         t--;
2571     }
2572 
2573     return NULL;
2574 }
2575 
2576 #endif
2577 
2578 PERL_STATIC_INLINE char *
Perl_mortal_getenv(const char * str)2579 Perl_mortal_getenv(const char * str)
2580 {
2581     /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
2582      *
2583      * It's (mostly) thread-safe because it uses a mutex to prevent
2584      * simultaneous access from other threads that use the same mutex, and
2585      * makes a copy of the result before releasing that mutex.  All of the Perl
2586      * core uses that mutex, but, like all mutexes, everything has to cooperate
2587      * for it to completely work.  It is possible for code from, say XS, to not
2588      * use this mutex, defeating the safety.
2589      *
2590      * On some platforms, getenv() is not sequential-call-safe, because
2591      * subsequent calls destroy the static storage inside the C library
2592      * returned by an earlier call.  The result must be copied or completely
2593      * acted upon before a subsequent getenv call.  Those calls could come from
2594      * another thread.  Again, making a copy while controlling the mutex
2595      * prevents these problems..
2596      *
2597      * To prevent leaks, the copy is made by creating a new SV containing it,
2598      * mortalizing the SV, and returning the SV's string (the copy).  Thus this
2599      * is a drop-in replacement for getenv().
2600      *
2601      * A complication is that this can be called during phases where the
2602      * mortalization process isn't available.  These are in interpreter
2603      * destruction or early in construction.  khw believes that at these times
2604      * there shouldn't be anything else going on, so plain getenv is safe AS
2605      * LONG AS the caller acts on the return before calling it again. */
2606 
2607     char * ret;
2608     dTHX;
2609 
2610     PERL_ARGS_ASSERT_MORTAL_GETENV;
2611 
2612     /* Can't mortalize without stacks.  khw believes that no other threads
2613      * should be running, so no need to lock things, and this may be during a
2614      * phase when locking isn't even available */
2615     if (UNLIKELY(PL_scopestack_ix == 0)) {
2616         return getenv(str);
2617     }
2618 
2619     ENV_LOCK;
2620 
2621     ret = getenv(str);
2622 
2623     if (ret != NULL) {
2624         ret = SvPVX(sv_2mortal(newSVpv(ret, 0)));
2625     }
2626 
2627     ENV_UNLOCK;
2628     return ret;
2629 }
2630 
2631 /*
2632  * ex: set ts=8 sts=4 sw=4 et:
2633  */
2634