xref: /openbsd/gnu/usr.bin/perl/inline.h (revision e0680481)
1 /*    inline.h
2  *
3  *    Copyright (C) 2012 by Larry Wall and others
4  *
5  *    You may distribute under the terms of either the GNU General Public
6  *    License or the Artistic License, as specified in the README file.
7  *
8  *    This file contains tables and code adapted from
9  *    https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10  *    copyright notice:
11 
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13 
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
20 
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
23 
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 SOFTWARE.
31 
32  *
33  * This file is a home for static inline functions that cannot go in other
34  * header files, because they depend on proto.h (included after most other
35  * headers) or struct definitions.
36  *
37  * Note also perlstatic.h for functions that can't or shouldn't be inlined, but
38  * whose details should be exposed to the compiler, for such things as tail
39  * call optimization.
40  *
41  * Each section names the header file that the functions "belong" to.
42  */
43 
44 /* ------------------------------- av.h ------------------------------- */
45 
46 /*
47 =for apidoc_section $AV
48 =for apidoc av_count
49 Returns the number of elements in the array C<av>.  This is the true length of
50 the array, including any undefined elements.  It is always the same as
51 S<C<av_top_index(av) + 1>>.
52 
53 =cut
54 */
55 PERL_STATIC_INLINE Size_t
Perl_av_count(pTHX_ AV * av)56 Perl_av_count(pTHX_ AV *av)
57 {
58     PERL_ARGS_ASSERT_AV_COUNT;
59     assert(SvTYPE(av) == SVt_PVAV);
60 
61     return AvFILL(av) + 1;
62 }
63 
64 /* ------------------------------- av.c ------------------------------- */
65 
66 /*
67 =for apidoc av_store_simple
68 
69 This is a cut-down version of av_store that assumes that the array is
70 very straightforward - no magic, not readonly, and AvREAL - and that
71 C<key> is not negative. This function MUST NOT be used in situations
72 where any of those assumptions may not hold.
73 
74 Stores an SV in an array.  The array index is specified as C<key>. It
75 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
76 
77 Note that the caller is responsible for suitably incrementing the reference
78 count of C<val> before the call.
79 
80 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
81 
82 =cut
83 */
84 
85 PERL_STATIC_INLINE SV**
Perl_av_store_simple(pTHX_ AV * av,SSize_t key,SV * val)86 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
87 {
88     SV** ary;
89 
90     PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
91     assert(SvTYPE(av) == SVt_PVAV);
92     assert(!SvMAGICAL(av));
93     assert(!SvREADONLY(av));
94     assert(AvREAL(av));
95     assert(key > -1);
96 
97     ary = AvARRAY(av);
98 
99     if (AvFILLp(av) < key) {
100         if (key > AvMAX(av)) {
101             av_extend(av,key);
102             ary = AvARRAY(av);
103         }
104         AvFILLp(av) = key;
105     } else
106         SvREFCNT_dec(ary[key]);
107 
108     ary[key] = val;
109     return &ary[key];
110 }
111 
112 /*
113 =for apidoc av_fetch_simple
114 
115 This is a cut-down version of av_fetch that assumes that the array is
116 very straightforward - no magic, not readonly, and AvREAL - and that
117 C<key> is not negative. This function MUST NOT be used in situations
118 where any of those assumptions may not hold.
119 
120 Returns the SV at the specified index in the array.  The C<key> is the
121 index.  If lval is true, you are guaranteed to get a real SV back (in case
122 it wasn't real before), which you can then modify.  Check that the return
123 value is non-null before dereferencing it to a C<SV*>.
124 
125 The rough perl equivalent is C<$myarray[$key]>.
126 
127 =cut
128 */
129 
130 PERL_STATIC_INLINE SV**
Perl_av_fetch_simple(pTHX_ AV * av,SSize_t key,I32 lval)131 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
132 {
133     PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
134     assert(SvTYPE(av) == SVt_PVAV);
135     assert(!SvMAGICAL(av));
136     assert(!SvREADONLY(av));
137     assert(AvREAL(av));
138     assert(key > -1);
139 
140     if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
141         return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
142     } else {
143         return &AvARRAY(av)[key];
144     }
145 }
146 
147 /*
148 =for apidoc av_push_simple
149 
150 This is a cut-down version of av_push that assumes that the array is very
151 straightforward - no magic, not readonly, and AvREAL - and that C<key> is
152 not less than -1. This function MUST NOT be used in situations where any
153 of those assumptions may not hold.
154 
155 Pushes an SV (transferring control of one reference count) onto the end of the
156 array.  The array will grow automatically to accommodate the addition.
157 
158 Perl equivalent: C<push @myarray, $val;>.
159 
160 =cut
161 */
162 
163 PERL_STATIC_INLINE void
Perl_av_push_simple(pTHX_ AV * av,SV * val)164 Perl_av_push_simple(pTHX_ AV *av, SV *val)
165 {
166     PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
167     assert(SvTYPE(av) == SVt_PVAV);
168     assert(!SvMAGICAL(av));
169     assert(!SvREADONLY(av));
170     assert(AvREAL(av));
171     assert(AvFILLp(av) > -2);
172 
173     (void)av_store_simple(av,AvFILLp(av)+1,val);
174 }
175 
176 /*
177 =for apidoc av_new_alloc
178 
179 This implements L<perlapi/C<newAV_alloc_x>>
180 and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this
181 functionality.
182 
183 Creates a new AV and allocates its SV* array.
184 
185 This is similar to, but more efficient than doing:
186 
187     AV *av = newAV();
188     av_extend(av, key);
189 
190 The size parameter is used to pre-allocate a SV* array large enough to
191 hold at least elements C<0..(size-1)>.  C<size> must be at least 1.
192 
193 The C<zeroflag> parameter controls whether or not the array is NULL
194 initialized.
195 
196 =cut
197 */
198 
199 PERL_STATIC_INLINE AV *
Perl_av_new_alloc(pTHX_ SSize_t size,bool zeroflag)200 Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag)
201 {
202     AV * const av = newAV();
203     SV** ary;
204     PERL_ARGS_ASSERT_AV_NEW_ALLOC;
205     assert(size > 0);
206 
207     Newx(ary, size, SV*); /* Newx performs the memwrap check */
208     AvALLOC(av) = ary;
209     AvARRAY(av) = ary;
210     AvMAX(av) = size - 1;
211 
212     if (zeroflag)
213         Zero(ary, size, SV*);
214 
215     return av;
216 }
217 
218 
219 /* ------------------------------- cv.h ------------------------------- */
220 
221 /*
222 =for apidoc_section $CV
223 =for apidoc CvGV
224 Returns the GV associated with the CV C<sv>, reifying it if necessary.
225 
226 =cut
227 */
228 PERL_STATIC_INLINE GV *
Perl_CvGV(pTHX_ CV * sv)229 Perl_CvGV(pTHX_ CV *sv)
230 {
231     PERL_ARGS_ASSERT_CVGV;
232 
233     return CvNAMED(sv)
234         ? Perl_cvgv_from_hek(aTHX_ sv)
235         : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
236 }
237 
238 /*
239 =for apidoc CvDEPTH
240 Returns the recursion level of the CV C<sv>.  Hence >= 2 indicates we are in a
241 recursive call.
242 
243 =cut
244 */
245 PERL_STATIC_INLINE I32 *
Perl_CvDEPTH(const CV * const sv)246 Perl_CvDEPTH(const CV * const sv)
247 {
248     PERL_ARGS_ASSERT_CVDEPTH;
249     assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
250 
251     return &((XPVCV*)SvANY(sv))->xcv_depth;
252 }
253 
254 /*
255  CvPROTO returns the prototype as stored, which is not necessarily what
256  the interpreter should be using. Specifically, the interpreter assumes
257  that spaces have been stripped, which has been the case if the prototype
258  was added by toke.c, but is generally not the case if it was added elsewhere.
259  Since we can't enforce the spacelessness at assignment time, this routine
260  provides a temporary copy at parse time with spaces removed.
261  I<orig> is the start of the original buffer, I<len> is the length of the
262  prototype and will be updated when this returns.
263  */
264 
265 #ifdef PERL_CORE
266 PERL_STATIC_INLINE char *
S_strip_spaces(pTHX_ const char * orig,STRLEN * const len)267 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
268 {
269     SV * tmpsv;
270     char * tmps;
271     tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
272     tmps = SvPVX(tmpsv);
273     while ((*len)--) {
274         if (!isSPACE(*orig))
275             *tmps++ = *orig;
276         orig++;
277     }
278     *tmps = '\0';
279     *len = tmps - SvPVX(tmpsv);
280                 return SvPVX(tmpsv);
281 }
282 #endif
283 
284 /* ------------------------------- iperlsys.h ------------------------------- */
285 #if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS)
286 
287 /* Otherwise this function is implemented as macros in iperlsys.h */
288 
289 PERL_STATIC_INLINE bool
S_PerlEnv_putenv(pTHX_ char * str)290 S_PerlEnv_putenv(pTHX_ char * str)
291 {
292     PERL_ARGS_ASSERT_PERLENV_PUTENV;
293 
294     ENV_LOCK;
295     bool retval = putenv(str);
296     ENV_UNLOCK;
297 
298     return retval;
299 }
300 
301 #endif
302 
303 /* ------------------------------- mg.h ------------------------------- */
304 
305 #if defined(PERL_CORE) || defined(PERL_EXT)
306 /* assumes get-magic and stringification have already occurred */
307 PERL_STATIC_INLINE STRLEN
S_MgBYTEPOS(pTHX_ MAGIC * mg,SV * sv,const char * s,STRLEN len)308 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
309 {
310     assert(mg->mg_type == PERL_MAGIC_regex_global);
311     assert(mg->mg_len != -1);
312     if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
313         return (STRLEN)mg->mg_len;
314     else {
315         const STRLEN pos = (STRLEN)mg->mg_len;
316         /* Without this check, we may read past the end of the buffer: */
317         if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
318         return sv_or_pv_pos_u2b(sv, s, pos, NULL);
319     }
320 }
321 #endif
322 
323 /* ------------------------------- pad.h ------------------------------ */
324 
325 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
326 PERL_STATIC_INLINE bool
S_PadnameIN_SCOPE(const PADNAME * const pn,const U32 seq)327 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
328 {
329     PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
330 
331     /* is seq within the range _LOW to _HIGH ?
332      * This is complicated by the fact that PL_cop_seqmax
333      * may have wrapped around at some point */
334     if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
335         return FALSE; /* not yet introduced */
336 
337     if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
338     /* in compiling scope */
339         if (
340             (seq >  COP_SEQ_RANGE_LOW(pn))
341             ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
342             : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
343         )
344             return TRUE;
345     }
346     else if (
347         (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
348         ?
349             (  seq >  COP_SEQ_RANGE_LOW(pn)
350             || seq <= COP_SEQ_RANGE_HIGH(pn))
351 
352         :    (  seq >  COP_SEQ_RANGE_LOW(pn)
353              && seq <= COP_SEQ_RANGE_HIGH(pn))
354     )
355         return TRUE;
356     return FALSE;
357 }
358 #endif
359 
360 /* ------------------------------- pp.h ------------------------------- */
361 
362 PERL_STATIC_INLINE I32
Perl_TOPMARK(pTHX)363 Perl_TOPMARK(pTHX)
364 {
365     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
366                                  "MARK top  %p %" IVdf "\n",
367                                   PL_markstack_ptr,
368                                   (IV)*PL_markstack_ptr)));
369     return *PL_markstack_ptr;
370 }
371 
372 PERL_STATIC_INLINE I32
Perl_POPMARK(pTHX)373 Perl_POPMARK(pTHX)
374 {
375     DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
376                                  "MARK pop  %p %" IVdf "\n",
377                                   (PL_markstack_ptr-1),
378                                   (IV)*(PL_markstack_ptr-1))));
379     assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
380     return *PL_markstack_ptr--;
381 }
382 
383 /* ----------------------------- regexp.h ----------------------------- */
384 
385 /* PVLVs need to act as a superset of all scalar types - they are basically
386  * PVMGs with a few extra fields.
387  * REGEXPs are first class scalars, but have many fields that can't be copied
388  * into a PVLV body.
389  *
390  * Hence we take a different approach - instead of a copy, PVLVs store a pointer
391  * back to the original body. To avoid increasing the size of PVLVs just for the
392  * rare case of REGEXP assignment, this pointer is stored in the memory usually
393  * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
394  * read the pointer from the two possible locations. The macro SvLEN() wraps the
395  * access to the union's member xpvlenu_len, but there is no equivalent macro
396  * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
397  *
398  * See commit df6b4bd56551f2d3 for more details. */
399 
400 PERL_STATIC_INLINE struct regexp *
Perl_ReANY(const REGEXP * const re)401 Perl_ReANY(const REGEXP * const re)
402 {
403     XPV* const p = (XPV*)SvANY(re);
404 
405     PERL_ARGS_ASSERT_REANY;
406     assert(isREGEXP(re));
407 
408     return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
409                                    : (struct regexp *)p;
410 }
411 
412 /* ------------------------------- utf8.h ------------------------------- */
413 
414 /*
415 =for apidoc_section $unicode
416 */
417 
418 PERL_STATIC_INLINE void
Perl_append_utf8_from_native_byte(const U8 byte,U8 ** dest)419 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
420 {
421     /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
422      * encoded string at '*dest', updating '*dest' to include it */
423 
424     PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
425 
426     if (NATIVE_BYTE_IS_INVARIANT(byte))
427         *((*dest)++) = byte;
428     else {
429         *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
430         *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
431     }
432 }
433 
434 /*
435 =for apidoc valid_utf8_to_uvchr
436 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
437 known that the next character in the input UTF-8 string C<s> is well-formed
438 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>.  Surrogates, non-character code
439 points, and non-Unicode code points are allowed.
440 
441 =cut
442 
443  */
444 
445 PERL_STATIC_INLINE UV
Perl_valid_utf8_to_uvchr(const U8 * s,STRLEN * retlen)446 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
447 {
448     const UV expectlen = UTF8SKIP(s);
449     const U8* send = s + expectlen;
450     UV uv = *s;
451 
452     PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
453 
454     if (retlen) {
455         *retlen = expectlen;
456     }
457 
458     /* An invariant is trivially returned */
459     if (expectlen == 1) {
460         return uv;
461     }
462 
463     /* Remove the leading bits that indicate the number of bytes, leaving just
464      * the bits that are part of the value */
465     uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
466 
467     /* Now, loop through the remaining bytes, accumulating each into the
468      * working total as we go.  (I khw tried unrolling the loop for up to 4
469      * bytes, but there was no performance improvement) */
470     for (++s; s < send; s++) {
471         uv = UTF8_ACCUMULATE(uv, *s);
472     }
473 
474     return UNI_TO_NATIVE(uv);
475 
476 }
477 
478 /*
479 =for apidoc is_utf8_invariant_string
480 
481 Returns TRUE if the first C<len> bytes of the string C<s> are the same
482 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
483 EBCDIC machines); otherwise it returns FALSE.  That is, it returns TRUE if they
484 are UTF-8 invariant.  On ASCII-ish machines, all the ASCII characters and only
485 the ASCII characters fit this definition.  On EBCDIC machines, the ASCII-range
486 characters are invariant, but so also are the C1 controls.
487 
488 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
489 use this option, that C<s> can't have embedded C<NUL> characters and has to
490 have a terminating C<NUL> byte).
491 
492 See also
493 C<L</is_utf8_string>>,
494 C<L</is_utf8_string_flags>>,
495 C<L</is_utf8_string_loc>>,
496 C<L</is_utf8_string_loc_flags>>,
497 C<L</is_utf8_string_loclen>>,
498 C<L</is_utf8_string_loclen_flags>>,
499 C<L</is_utf8_fixed_width_buf_flags>>,
500 C<L</is_utf8_fixed_width_buf_loc_flags>>,
501 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
502 C<L</is_strict_utf8_string>>,
503 C<L</is_strict_utf8_string_loc>>,
504 C<L</is_strict_utf8_string_loclen>>,
505 C<L</is_c9strict_utf8_string>>,
506 C<L</is_c9strict_utf8_string_loc>>,
507 and
508 C<L</is_c9strict_utf8_string_loclen>>.
509 
510 =cut
511 
512 */
513 
514 #define is_utf8_invariant_string(s, len)                                    \
515                                 is_utf8_invariant_string_loc(s, len, NULL)
516 
517 /*
518 =for apidoc is_utf8_invariant_string_loc
519 
520 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
521 the first UTF-8 variant character in the C<ep> pointer; if all characters are
522 UTF-8 invariant, this function does not change the contents of C<*ep>.
523 
524 =cut
525 
526 */
527 
528 PERL_STATIC_INLINE bool
Perl_is_utf8_invariant_string_loc(const U8 * const s,STRLEN len,const U8 ** ep)529 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
530 {
531     const U8* send;
532     const U8* x = s;
533 
534     PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
535 
536     if (len == 0) {
537         len = strlen((const char *)s);
538     }
539 
540     send = s + len;
541 
542 /* This looks like 0x010101... */
543 #  define PERL_COUNT_MULTIPLIER   (~ (UINTMAX_C(0)) / 0xFF)
544 
545 /* This looks like 0x808080... */
546 #  define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
547 #  define PERL_WORDSIZE            sizeof(PERL_UINTMAX_T)
548 #  define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
549 
550 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
551  * or'ing together the lowest bits of 'x'.  Hopefully the final term gets
552  * optimized out completely on a 32-bit system, and its mask gets optimized out
553  * on a 64-bit system */
554 #  define PERL_IS_SUBWORD_ADDR(x) (1 & (       PTR2nat(x)                     \
555                                       |   (  PTR2nat(x) >> 1)                 \
556                                       | ( ( (PTR2nat(x)                       \
557                                            & PERL_WORD_BOUNDARY_MASK) >> 2))))
558 
559 #ifndef EBCDIC
560 
561     /* Do the word-at-a-time iff there is at least one usable full word.  That
562      * means that after advancing to a word boundary, there still is at least a
563      * full word left.  The number of bytes needed to advance is 'wordsize -
564      * offset' unless offset is 0. */
565     if ((STRLEN) (send - x) >= PERL_WORDSIZE
566 
567                             /* This term is wordsize if subword; 0 if not */
568                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
569 
570                             /* 'offset' */
571                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
572     {
573 
574         /* Process per-byte until reach word boundary.  XXX This loop could be
575          * eliminated if we knew that this platform had fast unaligned reads */
576         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
577             if (! UTF8_IS_INVARIANT(*x)) {
578                 if (ep) {
579                     *ep = x;
580                 }
581 
582                 return FALSE;
583             }
584             x++;
585         }
586 
587         /* Here, we know we have at least one full word to process.  Process
588          * per-word as long as we have at least a full word left */
589         do {
590             if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK)  {
591 
592                 /* Found a variant.  Just return if caller doesn't want its
593                  * exact position */
594                 if (! ep) {
595                     return FALSE;
596                 }
597 
598 #  if   BYTEORDER == 0x1234 || BYTEORDER == 0x12345678    \
599      || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
600 
601                 *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x);
602                 assert(*ep >= s && *ep < send);
603 
604                 return FALSE;
605 
606 #  else   /* If weird byte order, drop into next loop to do byte-at-a-time
607            checks. */
608 
609                 break;
610 #  endif
611             }
612 
613             x += PERL_WORDSIZE;
614 
615         } while (x + PERL_WORDSIZE <= send);
616     }
617 
618 #endif      /* End of ! EBCDIC */
619 
620     /* Process per-byte */
621     while (x < send) {
622         if (! UTF8_IS_INVARIANT(*x)) {
623             if (ep) {
624                 *ep = x;
625             }
626 
627             return FALSE;
628         }
629 
630         x++;
631     }
632 
633     return TRUE;
634 }
635 
636 /* See if the platform has builtins for finding the most/least significant bit,
637  * and which one is right for using on 32 and 64 bit operands */
638 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
639 #  if U32SIZE == INTSIZE
640 #    define PERL_CLZ_32 __builtin_clz
641 #  endif
642 #  if defined(U64TYPE) && U64SIZE == INTSIZE
643 #    define PERL_CLZ_64 __builtin_clz
644 #  endif
645 #endif
646 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
647 #  if U32SIZE == INTSIZE
648 #    define PERL_CTZ_32 __builtin_ctz
649 #  endif
650 #  if defined(U64TYPE) && U64SIZE == INTSIZE
651 #    define PERL_CTZ_64 __builtin_ctz
652 #  endif
653 #endif
654 
655 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
656 #  if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
657 #    define PERL_CLZ_32 __builtin_clzl
658 #  endif
659 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
660 #    define PERL_CLZ_64 __builtin_clzl
661 #  endif
662 #endif
663 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
664 #  if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
665 #    define PERL_CTZ_32 __builtin_ctzl
666 #  endif
667 #  if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
668 #    define PERL_CTZ_64 __builtin_ctzl
669 #  endif
670 #endif
671 
672 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
673 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
674 #    define PERL_CLZ_32 __builtin_clzll
675 #  endif
676 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
677 #    define PERL_CLZ_64 __builtin_clzll
678 #  endif
679 #endif
680 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
681 #  if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
682 #    define PERL_CTZ_32 __builtin_ctzll
683 #  endif
684 #  if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
685 #    define PERL_CTZ_64 __builtin_ctzll
686 #  endif
687 #endif
688 
689 #if defined(_MSC_VER)
690 #  include <intrin.h>
691 #  pragma intrinsic(_BitScanForward)
692 #  pragma intrinsic(_BitScanReverse)
693 #  ifdef _WIN64
694 #    pragma intrinsic(_BitScanForward64)
695 #    pragma intrinsic(_BitScanReverse64)
696 #  endif
697 #endif
698 
699 /* The reason there are not checks to see if ffs() and ffsl() are available for
700  * determining the lsb, is because these don't improve on the deBruijn method
701  * fallback, which is just a branchless integer multiply, array element
702  * retrieval, and shift.  The others, even if the function call overhead is
703  * optimized out, have to cope with the possibility of the input being all
704  * zeroes, and almost certainly will have conditionals for this eventuality.
705  * khw, at the time of this commit, looked at the source for both gcc and clang
706  * to verify this.  (gcc used a method inferior to deBruijn.) */
707 
708 /* Below are functions to find the first, last, or only set bit in a word.  On
709  * platforms with 64-bit capability, there is a pair for each operation; the
710  * first taking a 64 bit operand, and the second a 32 bit one.  The logic is
711  * the same in each pair, so the second is stripped of most comments. */
712 
713 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
714 
715 PERL_STATIC_INLINE unsigned
Perl_lsbit_pos64(U64 word)716 Perl_lsbit_pos64(U64 word)
717 {
718     /* Find the position (0..63) of the least significant set bit in the input
719      * word */
720 
721     ASSUME(word != 0);
722 
723     /* If we can determine that the platform has a usable fast method to get
724      * this info, use that */
725 
726 #  if defined(PERL_CTZ_64)
727 #    define PERL_HAS_FAST_GET_LSB_POS64
728 
729     return (unsigned) PERL_CTZ_64(word);
730 
731 #  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
732 #    define PERL_HAS_FAST_GET_LSB_POS64
733 
734     {
735         unsigned long index;
736         _BitScanForward64(&index, word);
737         return (unsigned)index;
738     }
739 
740 #  else
741 
742     /* Here, we didn't find a fast method for finding the lsb.  Fall back to
743      * making the lsb the only set bit in the word, and use our function that
744      * works on words with a single bit set.
745      *
746      * Isolate the lsb;
747      * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
748      *
749      * The word will look like this, with a rightmost set bit in position 's':
750      * ('x's are don't cares, and 'y's are their complements)
751      *      s
752      *  x..x100..00
753      *  y..y011..11      Complement
754      *  y..y100..00      Add 1
755      *  0..0100..00      And with the original
756      *
757      *  (Yes, complementing and adding 1 is just taking the negative on 2's
758      *  complement machines, but not on 1's complement ones, and some compilers
759      *  complain about negating an unsigned.)
760      */
761     return single_1bit_pos64(word & (~word + 1));
762 
763 #  endif
764 
765 }
766 
767 #  define lsbit_pos_uintmax_(word) lsbit_pos64(word)
768 #else   /* ! QUAD */
769 #  define lsbit_pos_uintmax_(word) lsbit_pos32(word)
770 #endif
771 
772 PERL_STATIC_INLINE unsigned     /* Like above for 32 bit word */
Perl_lsbit_pos32(U32 word)773 Perl_lsbit_pos32(U32 word)
774 {
775     /* Find the position (0..31) of the least significant set bit in the input
776      * word */
777 
778     ASSUME(word != 0);
779 
780 #if defined(PERL_CTZ_32)
781 #  define PERL_HAS_FAST_GET_LSB_POS32
782 
783     return (unsigned) PERL_CTZ_32(word);
784 
785 #elif U32SIZE == 4 && defined(_MSC_VER)
786 #  define PERL_HAS_FAST_GET_LSB_POS32
787 
788     {
789         unsigned long index;
790         _BitScanForward(&index, word);
791         return (unsigned)index;
792     }
793 
794 #else
795 
796     return single_1bit_pos32(word & (~word + 1));
797 
798 #endif
799 
800 }
801 
802 
803 /* Convert the leading zeros count to the bit position of the first set bit.
804  * This just subtracts from the highest position, 31 or 63.  But some compilers
805  * don't optimize this optimally, and so a bit of bit twiddling encourages them
806  * to do the right thing.  It turns out that subtracting a smaller non-negative
807  * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
808  * the two numbers.  To see why, first note that the sum of any number, x, and
809  * its complement, x', is all ones.  So all ones minus x is x'.  Then note that
810  * the xor of x and all ones is x'. */
811 #define LZC_TO_MSBIT_POS_(size, lzc)  ((size##SIZE * CHARBITS - 1) ^ (lzc))
812 
813 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
814 
815 PERL_STATIC_INLINE unsigned
Perl_msbit_pos64(U64 word)816 Perl_msbit_pos64(U64 word)
817 {
818     /* Find the position (0..63) of the most significant set bit in the input
819      * word */
820 
821     ASSUME(word != 0);
822 
823     /* If we can determine that the platform has a usable fast method to get
824      * this, use that */
825 
826 #  if defined(PERL_CLZ_64)
827 #    define PERL_HAS_FAST_GET_MSB_POS64
828 
829     return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
830 
831 #  elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
832 #    define PERL_HAS_FAST_GET_MSB_POS64
833 
834     {
835         unsigned long index;
836         _BitScanReverse64(&index, word);
837         return (unsigned)index;
838     }
839 
840 #  else
841 
842     /* Here, we didn't find a fast method for finding the msb.  Fall back to
843      * making the msb the only set bit in the word, and use our function that
844      * works on words with a single bit set.
845      *
846      * Isolate the msb; http://codeforces.com/blog/entry/10330
847      *
848      * Only the most significant set bit matters.  Or'ing word with its right
849      * shift of 1 makes that bit and the next one to its right both 1.
850      * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
851      * ...  We end with the msb and all to the right being 1. */
852     word |= (word >>  1);
853     word |= (word >>  2);
854     word |= (word >>  4);
855     word |= (word >>  8);
856     word |= (word >> 16);
857     word |= (word >> 32);
858 
859     /* Then subtracting the right shift by 1 clears all but the left-most of
860      * the 1 bits, which is our desired result */
861     word -= (word >> 1);
862 
863     /* Now we have a single bit set */
864     return single_1bit_pos64(word);
865 
866 #  endif
867 
868 }
869 
870 #  define msbit_pos_uintmax_(word) msbit_pos64(word)
871 #else   /* ! QUAD */
872 #  define msbit_pos_uintmax_(word) msbit_pos32(word)
873 #endif
874 
875 PERL_STATIC_INLINE unsigned
Perl_msbit_pos32(U32 word)876 Perl_msbit_pos32(U32 word)
877 {
878     /* Find the position (0..31) of the most significant set bit in the input
879      * word */
880 
881     ASSUME(word != 0);
882 
883 #if defined(PERL_CLZ_32)
884 #  define PERL_HAS_FAST_GET_MSB_POS32
885 
886     return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
887 
888 #elif U32SIZE == 4 && defined(_MSC_VER)
889 #  define PERL_HAS_FAST_GET_MSB_POS32
890 
891     {
892         unsigned long index;
893         _BitScanReverse(&index, word);
894         return (unsigned)index;
895     }
896 
897 #else
898 
899     word |= (word >>  1);
900     word |= (word >>  2);
901     word |= (word >>  4);
902     word |= (word >>  8);
903     word |= (word >> 16);
904     word -= (word >> 1);
905     return single_1bit_pos32(word);
906 
907 #endif
908 
909 }
910 
911 #if UVSIZE == U64SIZE
912 #  define msbit_pos(word)  msbit_pos64(word)
913 #  define lsbit_pos(word)  lsbit_pos64(word)
914 #elif UVSIZE == U32SIZE
915 #  define msbit_pos(word)  msbit_pos32(word)
916 #  define lsbit_pos(word)  lsbit_pos32(word)
917 #endif
918 
919 #ifdef U64TYPE  /* HAS_QUAD not usable outside the core */
920 
921 PERL_STATIC_INLINE unsigned
Perl_single_1bit_pos64(U64 word)922 Perl_single_1bit_pos64(U64 word)
923 {
924     /* Given a 64-bit word known to contain all zero bits except one 1 bit,
925      * find and return the 1's position: 0..63 */
926 
927 #  ifdef PERL_CORE    /* macro not exported */
928     ASSUME(isPOWER_OF_2(word));
929 #  else
930     ASSUME(word && (word & (word-1)) == 0);
931 #  endif
932 
933     /* The only set bit is both the most and least significant bit.  If we have
934      * a fast way of finding either one, use that.
935      *
936      * It may appear at first glance that those functions call this one, but
937      * they don't if the corresponding #define is set */
938 
939 #  ifdef PERL_HAS_FAST_GET_MSB_POS64
940 
941     return msbit_pos64(word);
942 
943 #  elif defined(PERL_HAS_FAST_GET_LSB_POS64)
944 
945     return lsbit_pos64(word);
946 
947 #  else
948 
949     /* The position of the only set bit in a word can be quickly calculated
950      * using deBruijn sequences.  See for example
951      * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
952     return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
953                                                     >> PERL_deBruijnShift64_];
954 #  endif
955 
956 }
957 
958 #endif
959 
960 PERL_STATIC_INLINE unsigned
Perl_single_1bit_pos32(U32 word)961 Perl_single_1bit_pos32(U32 word)
962 {
963     /* Given a 32-bit word known to contain all zero bits except one 1 bit,
964      * find and return the 1's position: 0..31 */
965 
966 #ifdef PERL_CORE    /* macro not exported */
967     ASSUME(isPOWER_OF_2(word));
968 #else
969     ASSUME(word && (word & (word-1)) == 0);
970 #endif
971 #ifdef PERL_HAS_FAST_GET_MSB_POS32
972 
973     return msbit_pos32(word);
974 
975 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
976 
977     return lsbit_pos32(word);
978 
979 /* Unlikely, but possible for the platform to have a wider fast operation but
980  * not a narrower one.  But easy enough to handle the case by widening the
981  * parameter size.  (Going the other way, emulating 64 bit by two 32 bit ops
982  * would be slower than the deBruijn method.) */
983 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
984 
985     return msbit_pos64(word);
986 
987 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
988 
989     return lsbit_pos64(word);
990 
991 #else
992 
993     return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
994                                                     >> PERL_deBruijnShift32_];
995 #endif
996 
997 }
998 
999 #ifndef EBCDIC
1000 
1001 PERL_STATIC_INLINE unsigned int
Perl_variant_byte_number(PERL_UINTMAX_T word)1002 Perl_variant_byte_number(PERL_UINTMAX_T word)
1003 {
1004     /* This returns the position in a word (0..7) of the first variant byte in
1005      * it.  This is a helper function.  Note that there are no branches */
1006 
1007     /* Get just the msb bits of each byte */
1008     word &= PERL_VARIANTS_WORD_MASK;
1009 
1010     /* This should only be called if we know there is a variant byte in the
1011      * word */
1012     assert(word);
1013 
1014 #  if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1015 
1016     /* Bytes are stored like
1017      *  Byte8 ... Byte2 Byte1
1018      *  63..56...15...8 7...0
1019      * so getting the lsb of the whole modified word is getting the msb of the
1020      * first byte that has its msb set */
1021     word = lsbit_pos_uintmax_(word);
1022 
1023     /* Here, word contains the position 7,15,23,...55,63 of that bit.  Convert
1024      * to 0..7 */
1025     return (unsigned int) ((word + 1) >> 3) - 1;
1026 
1027 #  elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1028 
1029     /* Bytes are stored like
1030      *  Byte1 Byte2  ... Byte8
1031      * 63..56 55..47 ... 7...0
1032      * so getting the msb of the whole modified word is getting the msb of the
1033      * first byte that has its msb set */
1034     word = msbit_pos_uintmax_(word);
1035 
1036     /* Here, word contains the position 63,55,...,23,15,7 of that bit.  Convert
1037      * to 0..7 */
1038     word = ((word + 1) >> 3) - 1;
1039 
1040     /* And invert the result because of the reversed byte order on this
1041      * platform */
1042     word = CHARBITS - word - 1;
1043 
1044     return (unsigned int) word;
1045 
1046 #  else
1047 #    error Unexpected byte order
1048 #  endif
1049 
1050 }
1051 
1052 #endif
1053 #if defined(PERL_CORE) || defined(PERL_EXT)
1054 
1055 /*
1056 =for apidoc variant_under_utf8_count
1057 
1058 This function looks at the sequence of bytes between C<s> and C<e>, which are
1059 assumed to be encoded in ASCII/Latin1, and returns how many of them would
1060 change should the string be translated into UTF-8.  Due to the nature of UTF-8,
1061 each of these would occupy two bytes instead of the single one in the input
1062 string.  Thus, this function returns the precise number of bytes the string
1063 would expand by when translated to UTF-8.
1064 
1065 Unlike most of the other functions that have C<utf8> in their name, the input
1066 to this function is NOT a UTF-8-encoded string.  The function name is slightly
1067 I<odd> to emphasize this.
1068 
1069 This function is internal to Perl because khw thinks that any XS code that
1070 would want this is probably operating too close to the internals.  Presenting a
1071 valid use case could change that.
1072 
1073 See also
1074 C<L<perlapi/is_utf8_invariant_string>>
1075 and
1076 C<L<perlapi/is_utf8_invariant_string_loc>>,
1077 
1078 =cut
1079 
1080 */
1081 
1082 PERL_STATIC_INLINE Size_t
S_variant_under_utf8_count(const U8 * const s,const U8 * const e)1083 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1084 {
1085     const U8* x = s;
1086     Size_t count = 0;
1087 
1088     PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1089 
1090 #  ifndef EBCDIC
1091 
1092     /* Test if the string is long enough to use word-at-a-time.  (Logic is the
1093      * same as for is_utf8_invariant_string()) */
1094     if ((STRLEN) (e - x) >= PERL_WORDSIZE
1095                           + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1096                           - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1097     {
1098 
1099         /* Process per-byte until reach word boundary.  XXX This loop could be
1100          * eliminated if we knew that this platform had fast unaligned reads */
1101         while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1102             count += ! UTF8_IS_INVARIANT(*x++);
1103         }
1104 
1105         /* Process per-word as long as we have at least a full word left */
1106         do {    /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1107                    explanation of how this works */
1108             PERL_UINTMAX_T increment
1109                 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1110                       * PERL_COUNT_MULTIPLIER)
1111                     >> ((PERL_WORDSIZE - 1) * CHARBITS);
1112             count += (Size_t) increment;
1113             x += PERL_WORDSIZE;
1114         } while (x + PERL_WORDSIZE <= e);
1115     }
1116 
1117 #  endif
1118 
1119     /* Process per-byte */
1120     while (x < e) {
1121         if (! UTF8_IS_INVARIANT(*x)) {
1122             count++;
1123         }
1124 
1125         x++;
1126     }
1127 
1128     return count;
1129 }
1130 
1131 #endif
1132 
1133    /* Keep  these around for these files */
1134 #if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C)
1135 #  undef PERL_WORDSIZE
1136 #  undef PERL_COUNT_MULTIPLIER
1137 #  undef PERL_WORD_BOUNDARY_MASK
1138 #  undef PERL_VARIANTS_WORD_MASK
1139 #endif
1140 
1141 /*
1142 =for apidoc is_utf8_string
1143 
1144 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1145 Perl-extended-UTF-8 string; returns FALSE otherwise.  If C<len> is 0, it will
1146 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1147 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1148 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1149 
1150 This function considers Perl's extended UTF-8 to be valid.  That means that
1151 code points above Unicode, surrogates, and non-character code points are
1152 considered valid by this function.  Use C<L</is_strict_utf8_string>>,
1153 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1154 code points are considered valid.
1155 
1156 See also
1157 C<L</is_utf8_invariant_string>>,
1158 C<L</is_utf8_invariant_string_loc>>,
1159 C<L</is_utf8_string_loc>>,
1160 C<L</is_utf8_string_loclen>>,
1161 C<L</is_utf8_fixed_width_buf_flags>>,
1162 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1163 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1164 
1165 =cut
1166 */
1167 
1168 #define is_utf8_string(s, len)  is_utf8_string_loclen(s, len, NULL, NULL)
1169 
1170 #if defined(PERL_CORE) || defined (PERL_EXT)
1171 
1172 /*
1173 =for apidoc is_utf8_non_invariant_string
1174 
1175 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1176 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1177 UTF-8; otherwise returns FALSE.
1178 
1179 A TRUE return means that at least one code point represented by the sequence
1180 either is a wide character not representable as a single byte, or the
1181 representation differs depending on whether the sequence is encoded in UTF-8 or
1182 not.
1183 
1184 See also
1185 C<L<perlapi/is_utf8_invariant_string>>,
1186 C<L<perlapi/is_utf8_string>>
1187 
1188 =cut
1189 
1190 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
1191 It generally needn't be if its string is entirely UTF-8 invariant, and it
1192 shouldn't be if it otherwise contains invalid UTF-8.
1193 
1194 It is an internal function because khw thinks that XS code shouldn't be working
1195 at this low a level.  A valid use case could change that.
1196 
1197 */
1198 
1199 PERL_STATIC_INLINE bool
Perl_is_utf8_non_invariant_string(const U8 * const s,STRLEN len)1200 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
1201 {
1202     const U8 * first_variant;
1203 
1204     PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1205 
1206     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1207         return FALSE;
1208     }
1209 
1210     return is_utf8_string(first_variant, len - (first_variant - s));
1211 }
1212 
1213 #endif
1214 
1215 /*
1216 =for apidoc is_strict_utf8_string
1217 
1218 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1219 UTF-8-encoded string that is fully interchangeable by any application using
1220 Unicode rules; otherwise it returns FALSE.  If C<len> is 0, it will be
1221 calculated using C<strlen(s)> (which means if you use this option, that C<s>
1222 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1223 byte).  Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1224 
1225 This function returns FALSE for strings containing any
1226 code points above the Unicode max of 0x10FFFF, surrogate code points, or
1227 non-character code points.
1228 
1229 See also
1230 C<L</is_utf8_invariant_string>>,
1231 C<L</is_utf8_invariant_string_loc>>,
1232 C<L</is_utf8_string>>,
1233 C<L</is_utf8_string_flags>>,
1234 C<L</is_utf8_string_loc>>,
1235 C<L</is_utf8_string_loc_flags>>,
1236 C<L</is_utf8_string_loclen>>,
1237 C<L</is_utf8_string_loclen_flags>>,
1238 C<L</is_utf8_fixed_width_buf_flags>>,
1239 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1240 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1241 C<L</is_strict_utf8_string_loc>>,
1242 C<L</is_strict_utf8_string_loclen>>,
1243 C<L</is_c9strict_utf8_string>>,
1244 C<L</is_c9strict_utf8_string_loc>>,
1245 and
1246 C<L</is_c9strict_utf8_string_loclen>>.
1247 
1248 =cut
1249 */
1250 
1251 #define is_strict_utf8_string(s, len)  is_strict_utf8_string_loclen(s, len, NULL, NULL)
1252 
1253 /*
1254 =for apidoc is_c9strict_utf8_string
1255 
1256 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1257 UTF-8-encoded string that conforms to
1258 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1259 otherwise it returns FALSE.  If C<len> is 0, it will be calculated using
1260 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1261 C<NUL> characters and has to have a terminating C<NUL> byte).  Note that all
1262 characters being ASCII constitute 'a valid UTF-8 string'.
1263 
1264 This function returns FALSE for strings containing any code points above the
1265 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1266 code points per
1267 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1268 
1269 See also
1270 C<L</is_utf8_invariant_string>>,
1271 C<L</is_utf8_invariant_string_loc>>,
1272 C<L</is_utf8_string>>,
1273 C<L</is_utf8_string_flags>>,
1274 C<L</is_utf8_string_loc>>,
1275 C<L</is_utf8_string_loc_flags>>,
1276 C<L</is_utf8_string_loclen>>,
1277 C<L</is_utf8_string_loclen_flags>>,
1278 C<L</is_utf8_fixed_width_buf_flags>>,
1279 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1280 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1281 C<L</is_strict_utf8_string>>,
1282 C<L</is_strict_utf8_string_loc>>,
1283 C<L</is_strict_utf8_string_loclen>>,
1284 C<L</is_c9strict_utf8_string_loc>>,
1285 and
1286 C<L</is_c9strict_utf8_string_loclen>>.
1287 
1288 =cut
1289 */
1290 
1291 #define is_c9strict_utf8_string(s, len)  is_c9strict_utf8_string_loclen(s, len, NULL, 0)
1292 
1293 /*
1294 =for apidoc is_utf8_string_flags
1295 
1296 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1297 UTF-8 string, subject to the restrictions imposed by C<flags>;
1298 returns FALSE otherwise.  If C<len> is 0, it will be calculated
1299 using C<strlen(s)> (which means if you use this option, that C<s> can't have
1300 embedded C<NUL> characters and has to have a terminating C<NUL> byte).  Note
1301 that all characters being ASCII constitute 'a valid UTF-8 string'.
1302 
1303 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1304 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1305 as C<L</is_strict_utf8_string>>; and if C<flags> is
1306 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1307 C<L</is_c9strict_utf8_string>>.  Otherwise C<flags> may be any
1308 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1309 C<L</utf8n_to_uvchr>>, with the same meanings.
1310 
1311 See also
1312 C<L</is_utf8_invariant_string>>,
1313 C<L</is_utf8_invariant_string_loc>>,
1314 C<L</is_utf8_string>>,
1315 C<L</is_utf8_string_loc>>,
1316 C<L</is_utf8_string_loc_flags>>,
1317 C<L</is_utf8_string_loclen>>,
1318 C<L</is_utf8_string_loclen_flags>>,
1319 C<L</is_utf8_fixed_width_buf_flags>>,
1320 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1321 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1322 C<L</is_strict_utf8_string>>,
1323 C<L</is_strict_utf8_string_loc>>,
1324 C<L</is_strict_utf8_string_loclen>>,
1325 C<L</is_c9strict_utf8_string>>,
1326 C<L</is_c9strict_utf8_string_loc>>,
1327 and
1328 C<L</is_c9strict_utf8_string_loclen>>.
1329 
1330 =cut
1331 */
1332 
1333 PERL_STATIC_INLINE bool
Perl_is_utf8_string_flags(const U8 * s,STRLEN len,const U32 flags)1334 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1335 {
1336     const U8 * first_variant;
1337 
1338     PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1339     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1340                           |UTF8_DISALLOW_PERL_EXTENDED)));
1341 
1342     if (len == 0) {
1343         len = strlen((const char *)s);
1344     }
1345 
1346     if (flags == 0) {
1347         return is_utf8_string(s, len);
1348     }
1349 
1350     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1351                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1352     {
1353         return is_strict_utf8_string(s, len);
1354     }
1355 
1356     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1357                                        == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1358     {
1359         return is_c9strict_utf8_string(s, len);
1360     }
1361 
1362     if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1363         const U8* const send = s + len;
1364         const U8* x = first_variant;
1365 
1366         while (x < send) {
1367             STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1368             if (UNLIKELY(! cur_len)) {
1369                 return FALSE;
1370             }
1371             x += cur_len;
1372         }
1373     }
1374 
1375     return TRUE;
1376 }
1377 
1378 /*
1379 
1380 =for apidoc is_utf8_string_loc
1381 
1382 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1383 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1384 "utf8ness success") in the C<ep> pointer.
1385 
1386 See also C<L</is_utf8_string_loclen>>.
1387 
1388 =cut
1389 */
1390 
1391 #define is_utf8_string_loc(s, len, ep)  is_utf8_string_loclen(s, len, ep, 0)
1392 
1393 /*
1394 
1395 =for apidoc is_utf8_string_loclen
1396 
1397 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1398 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1399 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1400 encoded characters in the C<el> pointer.
1401 
1402 See also C<L</is_utf8_string_loc>>.
1403 
1404 =cut
1405 */
1406 
1407 PERL_STATIC_INLINE bool
Perl_is_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)1408 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1409 {
1410     const U8 * first_variant;
1411 
1412     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1413 
1414     if (len == 0) {
1415         len = strlen((const char *) s);
1416     }
1417 
1418     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1419         if (el)
1420             *el = len;
1421 
1422         if (ep) {
1423             *ep = s + len;
1424         }
1425 
1426         return TRUE;
1427     }
1428 
1429     {
1430         const U8* const send = s + len;
1431         const U8* x = first_variant;
1432         STRLEN outlen = first_variant - s;
1433 
1434         while (x < send) {
1435             const STRLEN cur_len = isUTF8_CHAR(x, send);
1436             if (UNLIKELY(! cur_len)) {
1437                 break;
1438             }
1439             x += cur_len;
1440             outlen++;
1441         }
1442 
1443         if (el)
1444             *el = outlen;
1445 
1446         if (ep) {
1447             *ep = x;
1448         }
1449 
1450         return (x == send);
1451     }
1452 }
1453 
1454 /* The perl core arranges to never call the DFA below without there being at
1455  * least one byte available to look at.  This allows the DFA to use a do {}
1456  * while loop which means that calling it with a UTF-8 invariant has a single
1457  * conditional, same as the calling code checking for invariance ahead of time.
1458  * And having the calling code remove that conditional speeds up by that
1459  * conditional, the case where it wasn't invariant.  So there's no reason to
1460  * check before caling this.
1461  *
1462  * But we don't know this for non-core calls, so have to retain the check for
1463  * them. */
1464 #ifdef PERL_CORE
1465 #  define PERL_NON_CORE_CHECK_EMPTY(s,e)  assert((e) > (s))
1466 #else
1467 #  define PERL_NON_CORE_CHECK_EMPTY(s,e)  if ((e) <= (s)) return FALSE
1468 #endif
1469 
1470 /*
1471  * DFA for checking input is valid UTF-8 syntax.
1472  *
1473  * This uses adaptations of the table and algorithm given in
1474  * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1475  * documentation of the original version.  A copyright notice for the original
1476  * version is given at the beginning of this file.  The Perl adaptations are
1477  * documented at the definition of PL_extended_utf8_dfa_tab[].
1478  *
1479  * This dfa is fast.  There are three exit conditions:
1480  *  1) a well-formed code point, acceptable to the table
1481  *  2) the beginning bytes of an incomplete character, whose completion might
1482  *     or might not be acceptable
1483  *  3) unacceptable to the table.  Some of the adaptations have certain,
1484  *     hopefully less likely to occur, legal inputs be unacceptable to the
1485  *     table, so these must be sorted out afterwards.
1486  *
1487  * This macro is a complete implementation of the code executing the DFA.  It
1488  * is passed the input sequence bounds and the table to use, and what to do
1489  * for each of the exit conditions.  There are three canned actions, likely to
1490  * be the ones you want:
1491  *      DFA_RETURN_SUCCESS_
1492  *      DFA_RETURN_FAILURE_
1493  *      DFA_GOTO_TEASE_APART_FF_
1494  *
1495  * You pass a parameter giving the action to take for each of the three
1496  * possible exit conditions:
1497  *
1498  * 'accept_action'  This is executed when the DFA accepts the input.
1499  *                  DFA_RETURN_SUCCESS_ is the most likely candidate.
1500  * 'reject_action'  This is executed when the DFA rejects the input.
1501  *                  DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1502  *                  you have written code to distinguish the rejecting state
1503  *                  results.  Because it happens in several places, and
1504  *                  involves #ifdefs, the special action
1505  *                  DFA_GOTO_TEASE_APART_FF_ is what you want with
1506  *                  PL_extended_utf8_dfa_tab.  On platforms without
1507  *                  EXTRA_LONG_UTF8, there is no need to tease anything apart,
1508  *                  so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1509  *                  need to have a label 'tease_apart_FF' that it will transfer
1510  *                  to.
1511  * 'incomplete_char_action'  This is executed when the DFA ran off the end
1512  *                  before accepting or rejecting the input.
1513  *                  DFA_RETURN_FAILURE_ is the likely action, but you could
1514  *                  have a 'goto', or NOOP.  In the latter case the DFA drops
1515  *                  off the end, and you place your code to handle this case
1516  *                  immediately after it.
1517  */
1518 
1519 #define DFA_RETURN_SUCCESS_      return s - s0
1520 #define DFA_RETURN_FAILURE_      return 0
1521 #ifdef HAS_EXTRA_LONG_UTF8
1522 #  define DFA_TEASE_APART_FF_  goto tease_apart_FF
1523 #else
1524 #  define DFA_TEASE_APART_FF_  DFA_RETURN_FAILURE_
1525 #endif
1526 
1527 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab,                               \
1528                               accept_action,                                \
1529                               reject_action,                                \
1530                               incomplete_char_action)                       \
1531     STMT_START {                                                            \
1532         const U8 * s = s0;                                                  \
1533         const U8 * e_ = e;                                                  \
1534         UV state = 0;                                                       \
1535                                                                             \
1536         PERL_NON_CORE_CHECK_EMPTY(s, e_);                                   \
1537                                                                             \
1538         do {                                                                \
1539             state = dfa_tab[256 + state + dfa_tab[*s]];                     \
1540             s++;                                                            \
1541                                                                             \
1542             if (state == 0) {   /* Accepting state */                       \
1543                 accept_action;                                              \
1544             }                                                               \
1545                                                                             \
1546             if (UNLIKELY(state == 1)) { /* Rejecting state */               \
1547                 reject_action;                                              \
1548             }                                                               \
1549         } while (s < e_);                                                   \
1550                                                                             \
1551         /* Here, dropped out of loop before end-of-char */                  \
1552         incomplete_char_action;                                             \
1553     } STMT_END
1554 
1555 
1556 /*
1557 
1558 =for apidoc isUTF8_CHAR
1559 
1560 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1561 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1562 that represents some code point; otherwise it evaluates to 0.  If non-zero, the
1563 value gives how many bytes starting at C<s> comprise the code point's
1564 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1565 form the first code point in C<s>, are not examined.
1566 
1567 The code point can be any that will fit in an IV on this machine, using Perl's
1568 extension to official UTF-8 to represent those higher than the Unicode maximum
1569 of 0x10FFFF.  That means that this macro is used to efficiently decide if the
1570 next few bytes in C<s> is legal UTF-8 for a single character.
1571 
1572 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1573 defined by Unicode to be fully interchangeable across applications;
1574 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1575 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1576 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1577 
1578 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1579 C<L</is_utf8_string_loclen>> to check entire strings.
1580 
1581 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1582 machines) is a valid UTF-8 character.
1583 
1584 =cut
1585 
1586 This uses an adaptation of the table and algorithm given in
1587 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1588 documentation of the original version.  A copyright notice for the original
1589 version is given at the beginning of this file.  The Perl adaptation is
1590 documented at the definition of PL_extended_utf8_dfa_tab[].
1591 */
1592 
1593 PERL_STATIC_INLINE Size_t
Perl_isUTF8_CHAR(const U8 * const s0,const U8 * const e)1594 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1595 {
1596     PERL_ARGS_ASSERT_ISUTF8_CHAR;
1597 
1598     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1599                           DFA_RETURN_SUCCESS_,
1600                           DFA_TEASE_APART_FF_,
1601                           DFA_RETURN_FAILURE_);
1602 
1603     /* Here, we didn't return success, but dropped out of the loop.  In the
1604      * case of PL_extended_utf8_dfa_tab, this means the input is either
1605      * malformed, or the start byte was FF on a platform that the dfa doesn't
1606      * handle FF's.  Call a helper function. */
1607 
1608 #ifdef HAS_EXTRA_LONG_UTF8
1609 
1610   tease_apart_FF:
1611 
1612     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1613      * either malformed, or was for the largest possible start byte, which we
1614      * now check, not inline */
1615     if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1616         return 0;
1617     }
1618 
1619     return is_utf8_FF_helper_(s0, e,
1620                               FALSE /* require full, not partial char */
1621                              );
1622 #endif
1623 
1624 }
1625 
1626 /*
1627 
1628 =for apidoc isSTRICT_UTF8_CHAR
1629 
1630 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1631 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1632 Unicode code point completely acceptable for open interchange between all
1633 applications; otherwise it evaluates to 0.  If non-zero, the value gives how
1634 many bytes starting at C<s> comprise the code point's representation.  Any
1635 bytes remaining before C<e>, but beyond the ones needed to form the first code
1636 point in C<s>, are not examined.
1637 
1638 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1639 be a surrogate nor a non-character code point.  Thus this excludes any code
1640 point from Perl's extended UTF-8.
1641 
1642 This is used to efficiently decide if the next few bytes in C<s> is
1643 legal Unicode-acceptable UTF-8 for a single character.
1644 
1645 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1646 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1647 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1648 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1649 
1650 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1651 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1652 
1653 =cut
1654 
1655 This uses an adaptation of the tables and algorithm given in
1656 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1657 documentation of the original version.  A copyright notice for the original
1658 version is given at the beginning of this file.  The Perl adaptation is
1659 documented at the definition of strict_extended_utf8_dfa_tab[].
1660 
1661 */
1662 
1663 PERL_STATIC_INLINE Size_t
Perl_isSTRICT_UTF8_CHAR(const U8 * const s0,const U8 * const e)1664 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1665 {
1666     PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1667 
1668     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1669                           DFA_RETURN_SUCCESS_,
1670                           goto check_hanguls,
1671                           DFA_RETURN_FAILURE_);
1672   check_hanguls:
1673 
1674     /* Here, we didn't return success, but dropped out of the loop.  In the
1675      * case of PL_strict_utf8_dfa_tab, this means the input is either
1676      * malformed, or was for certain Hanguls; handle them specially */
1677 
1678     /* The dfa above drops out for incomplete or illegal inputs, and certain
1679      * legal Hanguls; check and return accordingly */
1680     return is_HANGUL_ED_utf8_safe(s0, e);
1681 }
1682 
1683 /*
1684 
1685 =for apidoc isC9_STRICT_UTF8_CHAR
1686 
1687 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1688 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1689 Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
1690 the value gives how many bytes starting at C<s> comprise the code point's
1691 representation.  Any bytes remaining before C<e>, but beyond the ones needed to
1692 form the first code point in C<s>, are not examined.
1693 
1694 The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
1695 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1696 code points.  This corresponds to
1697 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1698 which said that non-character code points are merely discouraged rather than
1699 completely forbidden in open interchange.  See
1700 L<perlunicode/Noncharacter code points>.
1701 
1702 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1703 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1704 
1705 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1706 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1707 
1708 =cut
1709 
1710 This uses an adaptation of the tables and algorithm given in
1711 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1712 documentation of the original version.  A copyright notice for the original
1713 version is given at the beginning of this file.  The Perl adaptation is
1714 documented at the definition of PL_c9_utf8_dfa_tab[].
1715 
1716 */
1717 
1718 PERL_STATIC_INLINE Size_t
Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0,const U8 * const e)1719 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1720 {
1721     PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1722 
1723     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1724                           DFA_RETURN_SUCCESS_,
1725                           DFA_RETURN_FAILURE_,
1726                           DFA_RETURN_FAILURE_);
1727 }
1728 
1729 /*
1730 
1731 =for apidoc is_strict_utf8_string_loc
1732 
1733 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1734 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1735 "utf8ness success") in the C<ep> pointer.
1736 
1737 See also C<L</is_strict_utf8_string_loclen>>.
1738 
1739 =cut
1740 */
1741 
1742 #define is_strict_utf8_string_loc(s, len, ep)                               \
1743                                 is_strict_utf8_string_loclen(s, len, ep, 0)
1744 
1745 /*
1746 
1747 =for apidoc is_strict_utf8_string_loclen
1748 
1749 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1750 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1751 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1752 encoded characters in the C<el> pointer.
1753 
1754 See also C<L</is_strict_utf8_string_loc>>.
1755 
1756 =cut
1757 */
1758 
1759 PERL_STATIC_INLINE bool
Perl_is_strict_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)1760 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1761 {
1762     const U8 * first_variant;
1763 
1764     PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1765 
1766     if (len == 0) {
1767         len = strlen((const char *) s);
1768     }
1769 
1770     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1771         if (el)
1772             *el = len;
1773 
1774         if (ep) {
1775             *ep = s + len;
1776         }
1777 
1778         return TRUE;
1779     }
1780 
1781     {
1782         const U8* const send = s + len;
1783         const U8* x = first_variant;
1784         STRLEN outlen = first_variant - s;
1785 
1786         while (x < send) {
1787             const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1788             if (UNLIKELY(! cur_len)) {
1789                 break;
1790             }
1791             x += cur_len;
1792             outlen++;
1793         }
1794 
1795         if (el)
1796             *el = outlen;
1797 
1798         if (ep) {
1799             *ep = x;
1800         }
1801 
1802         return (x == send);
1803     }
1804 }
1805 
1806 /*
1807 
1808 =for apidoc is_c9strict_utf8_string_loc
1809 
1810 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1811 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1812 "utf8ness success") in the C<ep> pointer.
1813 
1814 See also C<L</is_c9strict_utf8_string_loclen>>.
1815 
1816 =cut
1817 */
1818 
1819 #define is_c9strict_utf8_string_loc(s, len, ep)	                            \
1820                             is_c9strict_utf8_string_loclen(s, len, ep, 0)
1821 
1822 /*
1823 
1824 =for apidoc is_c9strict_utf8_string_loclen
1825 
1826 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1827 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1828 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1829 characters in the C<el> pointer.
1830 
1831 See also C<L</is_c9strict_utf8_string_loc>>.
1832 
1833 =cut
1834 */
1835 
1836 PERL_STATIC_INLINE bool
Perl_is_c9strict_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)1837 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1838 {
1839     const U8 * first_variant;
1840 
1841     PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1842 
1843     if (len == 0) {
1844         len = strlen((const char *) s);
1845     }
1846 
1847     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1848         if (el)
1849             *el = len;
1850 
1851         if (ep) {
1852             *ep = s + len;
1853         }
1854 
1855         return TRUE;
1856     }
1857 
1858     {
1859         const U8* const send = s + len;
1860         const U8* x = first_variant;
1861         STRLEN outlen = first_variant - s;
1862 
1863         while (x < send) {
1864             const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1865             if (UNLIKELY(! cur_len)) {
1866                 break;
1867             }
1868             x += cur_len;
1869             outlen++;
1870         }
1871 
1872         if (el)
1873             *el = outlen;
1874 
1875         if (ep) {
1876             *ep = x;
1877         }
1878 
1879         return (x == send);
1880     }
1881 }
1882 
1883 /*
1884 
1885 =for apidoc is_utf8_string_loc_flags
1886 
1887 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1888 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1889 "utf8ness success") in the C<ep> pointer.
1890 
1891 See also C<L</is_utf8_string_loclen_flags>>.
1892 
1893 =cut
1894 */
1895 
1896 #define is_utf8_string_loc_flags(s, len, ep, flags)                         \
1897                         is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1898 
1899 
1900 /* The above 3 actual functions could have been moved into the more general one
1901  * just below, and made #defines that call it with the right 'flags'.  They are
1902  * currently kept separate to increase their chances of getting inlined */
1903 
1904 /*
1905 
1906 =for apidoc is_utf8_string_loclen_flags
1907 
1908 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1909 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1910 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1911 encoded characters in the C<el> pointer.
1912 
1913 See also C<L</is_utf8_string_loc_flags>>.
1914 
1915 =cut
1916 */
1917 
1918 PERL_STATIC_INLINE bool
Perl_is_utf8_string_loclen_flags(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el,const U32 flags)1919 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1920 {
1921     const U8 * first_variant;
1922 
1923     PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1924     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1925                           |UTF8_DISALLOW_PERL_EXTENDED)));
1926 
1927     if (len == 0) {
1928         len = strlen((const char *) s);
1929     }
1930 
1931     if (flags == 0) {
1932         return is_utf8_string_loclen(s, len, ep, el);
1933     }
1934 
1935     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1936                                         == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1937     {
1938         return is_strict_utf8_string_loclen(s, len, ep, el);
1939     }
1940 
1941     if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1942                                     == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1943     {
1944         return is_c9strict_utf8_string_loclen(s, len, ep, el);
1945     }
1946 
1947     if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1948         if (el)
1949             *el = len;
1950 
1951         if (ep) {
1952             *ep = s + len;
1953         }
1954 
1955         return TRUE;
1956     }
1957 
1958     {
1959         const U8* send = s + len;
1960         const U8* x = first_variant;
1961         STRLEN outlen = first_variant - s;
1962 
1963         while (x < send) {
1964             const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1965             if (UNLIKELY(! cur_len)) {
1966                 break;
1967             }
1968             x += cur_len;
1969             outlen++;
1970         }
1971 
1972         if (el)
1973             *el = outlen;
1974 
1975         if (ep) {
1976             *ep = x;
1977         }
1978 
1979         return (x == send);
1980     }
1981 }
1982 
1983 /*
1984 =for apidoc utf8_distance
1985 
1986 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1987 and C<b>.
1988 
1989 WARNING: use only if you *know* that the pointers point inside the
1990 same UTF-8 buffer.
1991 
1992 =cut
1993 */
1994 
1995 PERL_STATIC_INLINE IV
Perl_utf8_distance(pTHX_ const U8 * a,const U8 * b)1996 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1997 {
1998     PERL_ARGS_ASSERT_UTF8_DISTANCE;
1999 
2000     return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2001 }
2002 
2003 /*
2004 =for apidoc utf8_hop
2005 
2006 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2007 forward (if C<off> is positive) or backward (if negative).  C<s> does not need
2008 to be pointing to the starting byte of a character.  If it isn't, one count of
2009 C<off> will be used up to get to the start of the next character for forward
2010 hops, and to the start of the current character for negative ones.
2011 
2012 WARNING: Prefer L</utf8_hop_safe> to this one.
2013 
2014 Do NOT use this function unless you B<know> C<off> is within
2015 the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned
2016 on the first byte of a character or just after the last byte of a character.
2017 
2018 =cut
2019 */
2020 
2021 PERL_STATIC_INLINE U8 *
Perl_utf8_hop(const U8 * s,SSize_t off)2022 Perl_utf8_hop(const U8 *s, SSize_t off)
2023 {
2024     PERL_ARGS_ASSERT_UTF8_HOP;
2025 
2026     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2027      * the XXX bitops (especially ~) can create illegal UTF-8.
2028      * In other words: in Perl UTF-8 is not just for Unicode. */
2029 
2030     if (off > 0) {
2031 
2032         /* Get to next non-continuation byte */
2033         if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2034             do {
2035                 s++;
2036             }
2037             while (UTF8_IS_CONTINUATION(*s));
2038             off--;
2039         }
2040 
2041         while (off--)
2042             s += UTF8SKIP(s);
2043     }
2044     else {
2045         while (off++) {
2046             s--;
2047             while (UTF8_IS_CONTINUATION(*s))
2048                 s--;
2049         }
2050     }
2051 
2052     GCC_DIAG_IGNORE(-Wcast-qual)
2053     return (U8 *)s;
2054     GCC_DIAG_RESTORE
2055 }
2056 
2057 /*
2058 =for apidoc utf8_hop_forward
2059 
2060 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2061 forward.  C<s> does not need to be pointing to the starting byte of a
2062 character.  If it isn't, one count of C<off> will be used up to get to the
2063 start of the next character.
2064 
2065 C<off> must be non-negative.
2066 
2067 C<s> must be before or equal to C<end>.
2068 
2069 When moving forward it will not move beyond C<end>.
2070 
2071 Will not exceed this limit even if the string is not valid "UTF-8".
2072 
2073 =cut
2074 */
2075 
2076 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_forward(const U8 * s,SSize_t off,const U8 * end)2077 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2078 {
2079     PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2080 
2081     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2082      * the bitops (especially ~) can create illegal UTF-8.
2083      * In other words: in Perl UTF-8 is not just for Unicode. */
2084 
2085     assert(s <= end);
2086     assert(off >= 0);
2087 
2088     if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2089         /* Get to next non-continuation byte */
2090         do {
2091             s++;
2092         }
2093         while (UTF8_IS_CONTINUATION(*s));
2094         off--;
2095     }
2096 
2097     while (off--) {
2098         STRLEN skip = UTF8SKIP(s);
2099         if ((STRLEN)(end - s) <= skip) {
2100             GCC_DIAG_IGNORE(-Wcast-qual)
2101             return (U8 *)end;
2102             GCC_DIAG_RESTORE
2103         }
2104         s += skip;
2105     }
2106 
2107     GCC_DIAG_IGNORE(-Wcast-qual)
2108     return (U8 *)s;
2109     GCC_DIAG_RESTORE
2110 }
2111 
2112 /*
2113 =for apidoc utf8_hop_back
2114 
2115 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2116 backward.  C<s> does not need to be pointing to the starting byte of a
2117 character.  If it isn't, one count of C<off> will be used up to get to that
2118 start.
2119 
2120 C<off> must be non-positive.
2121 
2122 C<s> must be after or equal to C<start>.
2123 
2124 When moving backward it will not move before C<start>.
2125 
2126 Will not exceed this limit even if the string is not valid "UTF-8".
2127 
2128 =cut
2129 */
2130 
2131 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_back(const U8 * s,SSize_t off,const U8 * start)2132 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2133 {
2134     PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2135 
2136     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2137      * the bitops (especially ~) can create illegal UTF-8.
2138      * In other words: in Perl UTF-8 is not just for Unicode. */
2139 
2140     assert(start <= s);
2141     assert(off <= 0);
2142 
2143     /* Note: if we know that the input is well-formed, we can do per-word
2144      * hop-back.  Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
2145      * that.  But it was reverted because doing per-word has some
2146      * start-up/tear-down overhead, so only makes sense if the distance to be
2147      * moved is large, and core perl doesn't currently move more than a few
2148      * characters at a time.  You can reinstate it if it does become
2149      * advantageous. */
2150     while (off++ && s > start) {
2151         do {
2152             s--;
2153         } while (UTF8_IS_CONTINUATION(*s) && s > start);
2154     }
2155 
2156     GCC_DIAG_IGNORE(-Wcast-qual)
2157     return (U8 *)s;
2158     GCC_DIAG_RESTORE
2159 }
2160 
2161 /*
2162 =for apidoc utf8_hop_safe
2163 
2164 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2165 either forward or backward.  C<s> does not need to be pointing to the starting
2166 byte of a character.  If it isn't, one count of C<off> will be used up to get
2167 to the start of the next character for forward hops, and to the start of the
2168 current character for negative ones.
2169 
2170 When moving backward it will not move before C<start>.
2171 
2172 When moving forward it will not move beyond C<end>.
2173 
2174 Will not exceed those limits even if the string is not valid "UTF-8".
2175 
2176 =cut
2177 */
2178 
2179 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_safe(const U8 * s,SSize_t off,const U8 * start,const U8 * end)2180 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2181 {
2182     PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2183 
2184     /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2185      * the bitops (especially ~) can create illegal UTF-8.
2186      * In other words: in Perl UTF-8 is not just for Unicode. */
2187 
2188     assert(start <= s && s <= end);
2189 
2190     if (off >= 0) {
2191         return utf8_hop_forward(s, off, end);
2192     }
2193     else {
2194         return utf8_hop_back(s, off, start);
2195     }
2196 }
2197 
2198 /*
2199 
2200 =for apidoc isUTF8_CHAR_flags
2201 
2202 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2203 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2204 that represents some code point, subject to the restrictions given by C<flags>;
2205 otherwise it evaluates to 0.  If non-zero, the value gives how many bytes
2206 starting at C<s> comprise the code point's representation.  Any bytes remaining
2207 before C<e>, but beyond the ones needed to form the first code point in C<s>,
2208 are not examined.
2209 
2210 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2211 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2212 as C<L</isSTRICT_UTF8_CHAR>>;
2213 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2214 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2215 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2216 understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2217 
2218 The three alternative macros are for the most commonly needed validations; they
2219 are likely to run somewhat faster than this more general one, as they can be
2220 inlined into your code.
2221 
2222 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2223 L</is_utf8_string_loclen_flags> to check entire strings.
2224 
2225 =cut
2226 */
2227 
2228 PERL_STATIC_INLINE STRLEN
Perl_isUTF8_CHAR_flags(const U8 * const s0,const U8 * const e,const U32 flags)2229 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2230 {
2231     PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2232     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2233                           |UTF8_DISALLOW_PERL_EXTENDED)));
2234 
2235     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2236                           goto check_success,
2237                           DFA_TEASE_APART_FF_,
2238                           DFA_RETURN_FAILURE_);
2239 
2240   check_success:
2241 
2242     return is_utf8_char_helper_(s0, e, flags);
2243 
2244 #ifdef HAS_EXTRA_LONG_UTF8
2245 
2246   tease_apart_FF:
2247 
2248     /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2249      * either malformed, or was for the largest possible start byte, which
2250      * indicates perl extended UTF-8, well above the Unicode maximum */
2251     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2252         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2253     {
2254         return 0;
2255     }
2256 
2257     /* Otherwise examine the sequence not inline */
2258     return is_utf8_FF_helper_(s0, e,
2259                               FALSE /* require full, not partial char */
2260                              );
2261 #endif
2262 
2263 }
2264 
2265 /*
2266 
2267 =for apidoc is_utf8_valid_partial_char
2268 
2269 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2270 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2271 points.  Otherwise, it returns 1 if there exists at least one non-empty
2272 sequence of bytes that when appended to sequence C<s>, starting at position
2273 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2274 otherwise returns 0.
2275 
2276 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2277 point.
2278 
2279 This is useful when a fixed-length buffer is being tested for being well-formed
2280 UTF-8, but the final few bytes in it don't comprise a full character; that is,
2281 it is split somewhere in the middle of the final code point's UTF-8
2282 representation.  (Presumably when the buffer is refreshed with the next chunk
2283 of data, the new first bytes will complete the partial code point.)   This
2284 function is used to verify that the final bytes in the current buffer are in
2285 fact the legal beginning of some code point, so that if they aren't, the
2286 failure can be signalled without having to wait for the next read.
2287 
2288 =cut
2289 */
2290 #define is_utf8_valid_partial_char(s, e)                                    \
2291                                 is_utf8_valid_partial_char_flags(s, e, 0)
2292 
2293 /*
2294 
2295 =for apidoc is_utf8_valid_partial_char_flags
2296 
2297 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2298 or not the input is a valid UTF-8 encoded partial character, but it takes an
2299 extra parameter, C<flags>, which can further restrict which code points are
2300 considered valid.
2301 
2302 If C<flags> is 0, this behaves identically to
2303 C<L</is_utf8_valid_partial_char>>.  Otherwise C<flags> can be any combination
2304 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>.  If
2305 there is any sequence of bytes that can complete the input partial character in
2306 such a way that a non-prohibited character is formed, the function returns
2307 TRUE; otherwise FALSE.  Non character code points cannot be determined based on
2308 partial character input.  But many  of the other possible excluded types can be
2309 determined from just the first one or two bytes.
2310 
2311 =cut
2312  */
2313 
2314 PERL_STATIC_INLINE bool
Perl_is_utf8_valid_partial_char_flags(const U8 * const s0,const U8 * const e,const U32 flags)2315 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2316 {
2317     PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
2318     assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2319                           |UTF8_DISALLOW_PERL_EXTENDED)));
2320 
2321     PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2322                           DFA_RETURN_FAILURE_,
2323                           DFA_TEASE_APART_FF_,
2324                           NOOP);
2325 
2326     /* The NOOP above causes the DFA to drop down here iff the input was a
2327      * partial character.  flags=0 => can return TRUE immediately; otherwise we
2328      * need to check (not inline) if the partial character is the beginning of
2329      * a disallowed one */
2330     if (flags == 0) {
2331         return TRUE;
2332     }
2333 
2334     return cBOOL(is_utf8_char_helper_(s0, e, flags));
2335 
2336 #ifdef HAS_EXTRA_LONG_UTF8
2337 
2338   tease_apart_FF:
2339 
2340     /* Getting here means the input is either malformed, or, in the case of
2341      * PL_extended_utf8_dfa_tab, was for the largest possible start byte.  The
2342      * latter case has to be extended UTF-8, so can fail immediately if that is
2343      * forbidden */
2344 
2345     if (   *s0 != I8_TO_NATIVE_UTF8(0xFF)
2346         || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2347     {
2348         return 0;
2349     }
2350 
2351     return is_utf8_FF_helper_(s0, e,
2352                               TRUE /* Require to be a partial character */
2353                              );
2354 #endif
2355 
2356 }
2357 
2358 /*
2359 
2360 =for apidoc is_utf8_fixed_width_buf_flags
2361 
2362 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2363 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2364 otherwise it returns FALSE.
2365 
2366 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2367 without restriction.  If the final few bytes of the buffer do not form a
2368 complete code point, this will return TRUE anyway, provided that
2369 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2370 
2371 If C<flags> in non-zero, it can be any combination of the
2372 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2373 same meanings.
2374 
2375 This function differs from C<L</is_utf8_string_flags>> only in that the latter
2376 returns FALSE if the final few bytes of the string don't form a complete code
2377 point.
2378 
2379 =cut
2380  */
2381 #define is_utf8_fixed_width_buf_flags(s, len, flags)                        \
2382                 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2383 
2384 /*
2385 
2386 =for apidoc is_utf8_fixed_width_buf_loc_flags
2387 
2388 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2389 failure in the C<ep> pointer.  If the function returns TRUE, C<*ep> will point
2390 to the beginning of any partial character at the end of the buffer; if there is
2391 no partial character C<*ep> will contain C<s>+C<len>.
2392 
2393 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2394 
2395 =cut
2396 */
2397 
2398 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags)               \
2399                 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2400 
2401 /*
2402 
2403 =for apidoc is_utf8_fixed_width_buf_loclen_flags
2404 
2405 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2406 complete, valid characters found in the C<el> pointer.
2407 
2408 =cut
2409 */
2410 
2411 PERL_STATIC_INLINE bool
Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,STRLEN len,const U8 ** ep,STRLEN * el,const U32 flags)2412 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
2413                                        STRLEN len,
2414                                        const U8 **ep,
2415                                        STRLEN *el,
2416                                        const U32 flags)
2417 {
2418     const U8 * maybe_partial;
2419 
2420     PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2421 
2422     if (! ep) {
2423         ep  = &maybe_partial;
2424     }
2425 
2426     /* If it's entirely valid, return that; otherwise see if the only error is
2427      * that the final few bytes are for a partial character */
2428     return    is_utf8_string_loclen_flags(s, len, ep, el, flags)
2429            || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2430 }
2431 
2432 PERL_STATIC_INLINE UV
Perl_utf8n_to_uvchr_msgs(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors,AV ** msgs)2433 Perl_utf8n_to_uvchr_msgs(const U8 *s,
2434                          STRLEN curlen,
2435                          STRLEN *retlen,
2436                          const U32 flags,
2437                          U32 * errors,
2438                          AV ** msgs)
2439 {
2440     /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
2441      * simple cases, and, if necessary calls a helper function to deal with the
2442      * more complex ones.  Almost all well-formed non-problematic code points
2443      * are considered simple, so that it's unlikely that the helper function
2444      * will need to be called.
2445      *
2446      * This is an adaptation of the tables and algorithm given in
2447      * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
2448      * comprehensive documentation of the original version.  A copyright notice
2449      * for the original version is given at the beginning of this file.  The
2450      * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[].
2451      */
2452 
2453     const U8 * const s0 = s;
2454     const U8 * send = s0 + curlen;
2455     UV type;
2456     UV uv;
2457 
2458     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2459 
2460     /* This dfa is fast.  If it accepts the input, it was for a well-formed,
2461      * non-problematic code point, which can be returned immediately.
2462      * Otherwise we call a helper function to figure out the more complicated
2463      * cases. */
2464 
2465     /* No calls from core pass in an empty string; non-core need a check */
2466 #ifdef PERL_CORE
2467     assert(curlen > 0);
2468 #else
2469     if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2470                                                         flags, errors, msgs);
2471 #endif
2472 
2473     type = PL_strict_utf8_dfa_tab[*s];
2474 
2475     /* The table is structured so that 'type' is 0 iff the input byte is
2476      * represented identically regardless of the UTF-8ness of the string */
2477     if (type == 0) {   /* UTF-8 invariants are returned unchanged */
2478         uv = *s;
2479     }
2480     else {
2481         UV state = PL_strict_utf8_dfa_tab[256 + type];
2482         uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
2483 
2484         while (++s < send) {
2485             type  = PL_strict_utf8_dfa_tab[*s];
2486             state = PL_strict_utf8_dfa_tab[256 + state + type];
2487 
2488             uv = UTF8_ACCUMULATE(uv, *s);
2489 
2490             if (state == 0) {
2491 #ifdef EBCDIC
2492                 uv = UNI_TO_NATIVE(uv);
2493 #endif
2494                 goto success;
2495             }
2496 
2497             if (UNLIKELY(state == 1)) {
2498                 break;
2499             }
2500         }
2501 
2502         /* Here is potentially problematic.  Use the full mechanism */
2503         return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2504                                            errors, msgs);
2505     }
2506 
2507   success:
2508     if (retlen) {
2509         *retlen = s - s0 + 1;
2510     }
2511     if (errors) {
2512         *errors = 0;
2513     }
2514     if (msgs) {
2515         *msgs = NULL;
2516     }
2517 
2518     return uv;
2519 }
2520 
2521 PERL_STATIC_INLINE UV
Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 * s,const U8 * send,STRLEN * retlen)2522 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2523 {
2524     PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2525 
2526     assert(s < send);
2527 
2528     if (! ckWARN_d(WARN_UTF8)) {
2529 
2530         /* EMPTY is not really allowed, and asserts on debugging builds.  But
2531          * on non-debugging we have to deal with it, and this causes it to
2532          * return the REPLACEMENT CHARACTER, as the documentation indicates */
2533         return utf8n_to_uvchr(s, send - s, retlen,
2534                               (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2535     }
2536     else {
2537         UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2538         if (retlen && ret == 0 && (send <= s || *s != '\0')) {
2539             *retlen = (STRLEN) -1;
2540         }
2541 
2542         return ret;
2543     }
2544 }
2545 
2546 /* ------------------------------- perl.h ----------------------------- */
2547 
2548 /*
2549 =for apidoc_section $utility
2550 
2551 =for apidoc is_safe_syscall
2552 
2553 Test that the given C<pv> (with length C<len>) doesn't contain any internal
2554 C<NUL> characters.
2555 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2556 category, and return FALSE.
2557 
2558 Return TRUE if the name is safe.
2559 
2560 C<what> and C<op_name> are used in any warning.
2561 
2562 Used by the C<IS_SAFE_SYSCALL()> macro.
2563 
2564 =cut
2565 */
2566 
2567 PERL_STATIC_INLINE bool
Perl_is_safe_syscall(pTHX_ const char * pv,STRLEN len,const char * what,const char * op_name)2568 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2569 {
2570     /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2571      * perl itself uses xce*() functions which accept 8-bit strings.
2572      */
2573 
2574     PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2575 
2576     if (len > 1) {
2577         char *null_at;
2578         if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2579                 SETERRNO(ENOENT, LIB_INVARG);
2580                 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2581                                    "Invalid \\0 character in %s for %s: %s\\0%s",
2582                                    what, op_name, pv, null_at+1);
2583                 return FALSE;
2584         }
2585     }
2586 
2587     return TRUE;
2588 }
2589 
2590 /*
2591 
2592 Return true if the supplied filename has a newline character
2593 immediately before the first (hopefully only) NUL.
2594 
2595 My original look at this incorrectly used the len from SvPV(), but
2596 that's incorrect, since we allow for a NUL in pv[len-1].
2597 
2598 So instead, strlen() and work from there.
2599 
2600 This allow for the user reading a filename, forgetting to chomp it,
2601 then calling:
2602 
2603   open my $foo, "$file\0";
2604 
2605 */
2606 
2607 #ifdef PERL_CORE
2608 
2609 PERL_STATIC_INLINE bool
S_should_warn_nl(const char * pv)2610 S_should_warn_nl(const char *pv)
2611 {
2612     STRLEN len;
2613 
2614     PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2615 
2616     len = strlen(pv);
2617 
2618     return len > 0 && pv[len-1] == '\n';
2619 }
2620 
2621 #endif
2622 
2623 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2624 
2625 PERL_STATIC_INLINE bool
S_lossless_NV_to_IV(const NV nv,IV * ivp)2626 S_lossless_NV_to_IV(const NV nv, IV *ivp)
2627 {
2628     /* This function determines if the input NV 'nv' may be converted without
2629      * loss of data to an IV.  If not, it returns FALSE taking no other action.
2630      * But if it is possible, it does the conversion, returning TRUE, and
2631      * storing the converted result in '*ivp' */
2632 
2633     PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2634 
2635 #  if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2636     /* Normally any comparison with a NaN returns false; if we can't rely
2637      * on that behaviour, check explicitly */
2638     if (UNLIKELY(Perl_isnan(nv))) {
2639         return FALSE;
2640     }
2641 #  endif
2642 
2643     /* Written this way so that with an always-false NaN comparison we
2644      * return false */
2645     if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2646         return FALSE;
2647     }
2648 
2649     if ((IV) nv != nv) {
2650         return FALSE;
2651     }
2652 
2653     *ivp = (IV) nv;
2654     return TRUE;
2655 }
2656 
2657 #endif
2658 
2659 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2660 
2661 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2662 
2663 #define MAX_CHARSET_NAME_LENGTH 2
2664 
2665 PERL_STATIC_INLINE const char *
S_get_regex_charset_name(const U32 flags,STRLEN * const lenp)2666 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2667 {
2668     PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2669 
2670     /* Returns a string that corresponds to the name of the regex character set
2671      * given by 'flags', and *lenp is set the length of that string, which
2672      * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2673 
2674     *lenp = 1;
2675     switch (get_regex_charset(flags)) {
2676         case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2677         case REGEX_LOCALE_CHARSET:  return LOCALE_PAT_MODS;
2678         case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2679         case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2680         case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2681             *lenp = 2;
2682             return ASCII_MORE_RESTRICT_PAT_MODS;
2683     }
2684     /* The NOT_REACHED; hides an assert() which has a rather complex
2685      * definition in perl.h. */
2686     NOT_REACHED; /* NOTREACHED */
2687     return "?";	    /* Unknown */
2688 }
2689 
2690 #endif
2691 
2692 /*
2693 
2694 Return false if any get magic is on the SV other than taint magic.
2695 
2696 */
2697 
2698 PERL_STATIC_INLINE bool
Perl_sv_only_taint_gmagic(SV * sv)2699 Perl_sv_only_taint_gmagic(SV *sv)
2700 {
2701     MAGIC *mg = SvMAGIC(sv);
2702 
2703     PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2704 
2705     while (mg) {
2706         if (mg->mg_type != PERL_MAGIC_taint
2707             && !(mg->mg_flags & MGf_GSKIP)
2708             && mg->mg_virtual->svt_get) {
2709             return FALSE;
2710         }
2711         mg = mg->mg_moremagic;
2712     }
2713 
2714     return TRUE;
2715 }
2716 
2717 /* ------------------ cop.h ------------------------------------------- */
2718 
2719 /* implement GIMME_V() macro */
2720 
2721 PERL_STATIC_INLINE U8
Perl_gimme_V(pTHX)2722 Perl_gimme_V(pTHX)
2723 {
2724     I32 cxix;
2725     U8  gimme = (PL_op->op_flags & OPf_WANT);
2726 
2727     if (gimme)
2728         return gimme;
2729     cxix = PL_curstackinfo->si_cxsubix;
2730     if (cxix < 0)
2731         return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2732     assert(cxstack[cxix].blk_gimme & G_WANT);
2733     return (cxstack[cxix].blk_gimme & G_WANT);
2734 }
2735 
2736 
2737 /* Enter a block. Push a new base context and return its address. */
2738 
2739 PERL_STATIC_INLINE PERL_CONTEXT *
Perl_cx_pushblock(pTHX_ U8 type,U8 gimme,SV ** sp,I32 saveix)2740 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2741 {
2742     PERL_CONTEXT * cx;
2743 
2744     PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2745 
2746     CXINC;
2747     cx = CX_CUR();
2748     cx->cx_type        = type;
2749     cx->blk_gimme      = gimme;
2750     cx->blk_oldsaveix  = saveix;
2751     cx->blk_oldsp      = (I32)(sp - PL_stack_base);
2752     cx->blk_oldcop     = PL_curcop;
2753     cx->blk_oldmarksp  = (I32)(PL_markstack_ptr - PL_markstack);
2754     cx->blk_oldscopesp = PL_scopestack_ix;
2755     cx->blk_oldpm      = PL_curpm;
2756     cx->blk_old_tmpsfloor = PL_tmps_floor;
2757 
2758     PL_tmps_floor        = PL_tmps_ix;
2759     CX_DEBUG(cx, "PUSH");
2760     return cx;
2761 }
2762 
2763 
2764 /* Exit a block (RETURN and LAST). */
2765 
2766 PERL_STATIC_INLINE void
Perl_cx_popblock(pTHX_ PERL_CONTEXT * cx)2767 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2768 {
2769     PERL_ARGS_ASSERT_CX_POPBLOCK;
2770 
2771     CX_DEBUG(cx, "POP");
2772     /* these 3 are common to cx_popblock and cx_topblock */
2773     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2774     PL_scopestack_ix = cx->blk_oldscopesp;
2775     PL_curpm         = cx->blk_oldpm;
2776 
2777     /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2778      * and leaves a CX entry lying around for repeated use, so
2779      * skip for multicall */                  \
2780     assert(   (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2781             || PL_savestack_ix == cx->blk_oldsaveix);
2782     PL_curcop     = cx->blk_oldcop;
2783     PL_tmps_floor = cx->blk_old_tmpsfloor;
2784 }
2785 
2786 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2787  * Whereas cx_popblock() restores the state to the point just before
2788  * cx_pushblock() was called,  cx_topblock() restores it to the point just
2789  * *after* cx_pushblock() was called. */
2790 
2791 PERL_STATIC_INLINE void
Perl_cx_topblock(pTHX_ PERL_CONTEXT * cx)2792 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2793 {
2794     PERL_ARGS_ASSERT_CX_TOPBLOCK;
2795 
2796     CX_DEBUG(cx, "TOP");
2797     /* these 3 are common to cx_popblock and cx_topblock */
2798     PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2799     PL_scopestack_ix = cx->blk_oldscopesp;
2800     PL_curpm         = cx->blk_oldpm;
2801 
2802     PL_stack_sp      = PL_stack_base + cx->blk_oldsp;
2803 }
2804 
2805 
2806 PERL_STATIC_INLINE void
Perl_cx_pushsub(pTHX_ PERL_CONTEXT * cx,CV * cv,OP * retop,bool hasargs)2807 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2808 {
2809     U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2810 
2811     PERL_ARGS_ASSERT_CX_PUSHSUB;
2812 
2813     PERL_DTRACE_PROBE_ENTRY(cv);
2814     cx->blk_sub.old_cxsubix     = PL_curstackinfo->si_cxsubix;
2815     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2816     cx->blk_sub.cv = cv;
2817     cx->blk_sub.olddepth = CvDEPTH(cv);
2818     cx->blk_sub.prevcomppad = PL_comppad;
2819     cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2820     cx->blk_sub.retop = retop;
2821     SvREFCNT_inc_simple_void_NN(cv);
2822     cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2823 }
2824 
2825 
2826 /* subsets of cx_popsub() */
2827 
2828 PERL_STATIC_INLINE void
Perl_cx_popsub_common(pTHX_ PERL_CONTEXT * cx)2829 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2830 {
2831     CV *cv;
2832 
2833     PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2834     assert(CxTYPE(cx) == CXt_SUB);
2835 
2836     PL_comppad = cx->blk_sub.prevcomppad;
2837     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2838     cv = cx->blk_sub.cv;
2839     CvDEPTH(cv) = cx->blk_sub.olddepth;
2840     cx->blk_sub.cv = NULL;
2841     SvREFCNT_dec(cv);
2842     PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2843 }
2844 
2845 
2846 /* handle the @_ part of leaving a sub */
2847 
2848 PERL_STATIC_INLINE void
Perl_cx_popsub_args(pTHX_ PERL_CONTEXT * cx)2849 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2850 {
2851     AV *av;
2852 
2853     PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2854     assert(CxTYPE(cx) == CXt_SUB);
2855     assert(AvARRAY(MUTABLE_AV(
2856         PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2857                 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2858 
2859     CX_POP_SAVEARRAY(cx);
2860     av = MUTABLE_AV(PAD_SVl(0));
2861     if (UNLIKELY(AvREAL(av)))
2862         /* abandon @_ if it got reified */
2863         clear_defarray(av, 0);
2864     else {
2865         CLEAR_ARGARRAY(av);
2866     }
2867 }
2868 
2869 
2870 PERL_STATIC_INLINE void
Perl_cx_popsub(pTHX_ PERL_CONTEXT * cx)2871 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2872 {
2873     PERL_ARGS_ASSERT_CX_POPSUB;
2874     assert(CxTYPE(cx) == CXt_SUB);
2875 
2876     PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2877 
2878     if (CxHASARGS(cx))
2879         cx_popsub_args(cx);
2880     cx_popsub_common(cx);
2881 }
2882 
2883 
2884 PERL_STATIC_INLINE void
Perl_cx_pushformat(pTHX_ PERL_CONTEXT * cx,CV * cv,OP * retop,GV * gv)2885 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2886 {
2887     PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2888 
2889     cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2890     PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2891     cx->blk_format.cv          = cv;
2892     cx->blk_format.retop       = retop;
2893     cx->blk_format.gv          = gv;
2894     cx->blk_format.dfoutgv     = PL_defoutgv;
2895     cx->blk_format.prevcomppad = PL_comppad;
2896     cx->blk_u16                = 0;
2897 
2898     SvREFCNT_inc_simple_void_NN(cv);
2899     CvDEPTH(cv)++;
2900     SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2901 }
2902 
2903 
2904 PERL_STATIC_INLINE void
Perl_cx_popformat(pTHX_ PERL_CONTEXT * cx)2905 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2906 {
2907     CV *cv;
2908     GV *dfout;
2909 
2910     PERL_ARGS_ASSERT_CX_POPFORMAT;
2911     assert(CxTYPE(cx) == CXt_FORMAT);
2912 
2913     dfout = cx->blk_format.dfoutgv;
2914     setdefout(dfout);
2915     cx->blk_format.dfoutgv = NULL;
2916     SvREFCNT_dec_NN(dfout);
2917 
2918     PL_comppad = cx->blk_format.prevcomppad;
2919     PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2920     cv = cx->blk_format.cv;
2921     cx->blk_format.cv = NULL;
2922     --CvDEPTH(cv);
2923     SvREFCNT_dec_NN(cv);
2924     PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2925 }
2926 
2927 
2928 PERL_STATIC_INLINE void
Perl_push_evalortry_common(pTHX_ PERL_CONTEXT * cx,OP * retop,SV * namesv)2929 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2930 {
2931     cx->blk_eval.retop         = retop;
2932     cx->blk_eval.old_namesv    = namesv;
2933     cx->blk_eval.old_eval_root = PL_eval_root;
2934     cx->blk_eval.cur_text      = PL_parser ? PL_parser->linestr : NULL;
2935     cx->blk_eval.cv            = NULL; /* later set by doeval_compile() */
2936     cx->blk_eval.cur_top_env   = PL_top_env;
2937 
2938     assert(!(PL_in_eval     & ~ 0x3F));
2939     assert(!(PL_op->op_type & ~0x1FF));
2940     cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2941 }
2942 
2943 PERL_STATIC_INLINE void
Perl_cx_pusheval(pTHX_ PERL_CONTEXT * cx,OP * retop,SV * namesv)2944 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2945 {
2946     PERL_ARGS_ASSERT_CX_PUSHEVAL;
2947 
2948     Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2949 
2950     cx->blk_eval.old_cxsubix    = PL_curstackinfo->si_cxsubix;
2951     PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2952 }
2953 
2954 PERL_STATIC_INLINE void
Perl_cx_pushtry(pTHX_ PERL_CONTEXT * cx,OP * retop)2955 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2956 {
2957     PERL_ARGS_ASSERT_CX_PUSHTRY;
2958 
2959     Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2960 
2961     /* Don't actually change it, just store the current value so it's restored
2962      * by the common popeval */
2963     cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2964 }
2965 
2966 
2967 PERL_STATIC_INLINE void
Perl_cx_popeval(pTHX_ PERL_CONTEXT * cx)2968 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2969 {
2970     SV *sv;
2971 
2972     PERL_ARGS_ASSERT_CX_POPEVAL;
2973     assert(CxTYPE(cx) == CXt_EVAL);
2974 
2975     PL_in_eval = CxOLD_IN_EVAL(cx);
2976     assert(!(PL_in_eval & 0xc0));
2977     PL_eval_root = cx->blk_eval.old_eval_root;
2978     sv = cx->blk_eval.cur_text;
2979     if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2980         cx->blk_eval.cur_text = NULL;
2981         SvREFCNT_dec_NN(sv);
2982     }
2983 
2984     sv = cx->blk_eval.old_namesv;
2985     if (sv) {
2986         cx->blk_eval.old_namesv = NULL;
2987         SvREFCNT_dec_NN(sv);
2988     }
2989     PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2990 }
2991 
2992 
2993 /* push a plain loop, i.e.
2994  *     { block }
2995  *     while (cond) { block }
2996  *     for (init;cond;continue) { block }
2997  * This loop can be last/redo'ed etc.
2998  */
2999 
3000 PERL_STATIC_INLINE void
Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT * cx)3001 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
3002 {
3003     PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
3004     cx->blk_loop.my_op = cLOOP;
3005 }
3006 
3007 
3008 /* push a true for loop, i.e.
3009  *     for var (list) { block }
3010  */
3011 
3012 PERL_STATIC_INLINE void
Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT * cx,void * itervarp,SV * itersave)3013 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
3014 {
3015     PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
3016 
3017     /* this one line is common with cx_pushloop_plain */
3018     cx->blk_loop.my_op = cLOOP;
3019 
3020     cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3021     cx->blk_loop.itersave      = itersave;
3022 #ifdef USE_ITHREADS
3023     cx->blk_loop.oldcomppad = PL_comppad;
3024 #endif
3025 }
3026 
3027 
3028 /* pop all loop types, including plain */
3029 
3030 PERL_STATIC_INLINE void
Perl_cx_poploop(pTHX_ PERL_CONTEXT * cx)3031 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
3032 {
3033     PERL_ARGS_ASSERT_CX_POPLOOP;
3034 
3035     assert(CxTYPE_is_LOOP(cx));
3036     if (  CxTYPE(cx) == CXt_LOOP_ARY
3037        || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3038     {
3039         /* Free ary or cur. This assumes that state_u.ary.ary
3040          * aligns with state_u.lazysv.cur. See cx_dup() */
3041         SV *sv = cx->blk_loop.state_u.lazysv.cur;
3042         cx->blk_loop.state_u.lazysv.cur = NULL;
3043         SvREFCNT_dec_NN(sv);
3044         if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3045             sv = cx->blk_loop.state_u.lazysv.end;
3046             cx->blk_loop.state_u.lazysv.end = NULL;
3047             SvREFCNT_dec_NN(sv);
3048         }
3049     }
3050     if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3051         SV *cursv;
3052         SV **svp = (cx)->blk_loop.itervar_u.svp;
3053         if ((cx->cx_type & CXp_FOR_GV))
3054             svp = &GvSV((GV*)svp);
3055         cursv = *svp;
3056         *svp = cx->blk_loop.itersave;
3057         cx->blk_loop.itersave = NULL;
3058         SvREFCNT_dec(cursv);
3059     }
3060     if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF))
3061         SvREFCNT_dec(cx->blk_loop.itervar_u.svp);
3062 }
3063 
3064 
3065 PERL_STATIC_INLINE void
Perl_cx_pushwhen(pTHX_ PERL_CONTEXT * cx)3066 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
3067 {
3068     PERL_ARGS_ASSERT_CX_PUSHWHEN;
3069 
3070     cx->blk_givwhen.leave_op = cLOGOP->op_other;
3071 }
3072 
3073 
3074 PERL_STATIC_INLINE void
Perl_cx_popwhen(pTHX_ PERL_CONTEXT * cx)3075 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
3076 {
3077     PERL_ARGS_ASSERT_CX_POPWHEN;
3078     assert(CxTYPE(cx) == CXt_WHEN);
3079 
3080     PERL_UNUSED_ARG(cx);
3081     PERL_UNUSED_CONTEXT;
3082     /* currently NOOP */
3083 }
3084 
3085 
3086 PERL_STATIC_INLINE void
Perl_cx_pushgiven(pTHX_ PERL_CONTEXT * cx,SV * orig_defsv)3087 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
3088 {
3089     PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3090 
3091     cx->blk_givwhen.leave_op = cLOGOP->op_other;
3092     cx->blk_givwhen.defsv_save = orig_defsv;
3093 }
3094 
3095 
3096 PERL_STATIC_INLINE void
Perl_cx_popgiven(pTHX_ PERL_CONTEXT * cx)3097 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
3098 {
3099     SV *sv;
3100 
3101     PERL_ARGS_ASSERT_CX_POPGIVEN;
3102     assert(CxTYPE(cx) == CXt_GIVEN);
3103 
3104     sv = GvSV(PL_defgv);
3105     GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3106     cx->blk_givwhen.defsv_save = NULL;
3107     SvREFCNT_dec(sv);
3108 }
3109 
3110 /*
3111 =for apidoc newPADxVOP
3112 
3113 Constructs, checks and returns an op containing a pad offset.  C<type> is
3114 the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV>
3115 or C<OP_PADCV>.  The returned op will have the C<op_targ> field set by
3116 the C<padix> argument.
3117 
3118 This is convenient when constructing a large optree in nested function
3119 calls, as it avoids needing to store the pad op directly to set the
3120 C<op_targ> field as a side-effect. For example
3121 
3122     o = op_append_elem(OP_LINESEQ, o,
3123         newPADxVOP(OP_PADSV, 0, padix));
3124 
3125 =cut
3126 */
3127 
3128 PERL_STATIC_INLINE OP *
Perl_newPADxVOP(pTHX_ I32 type,I32 flags,PADOFFSET padix)3129 Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix)
3130 {
3131     PERL_ARGS_ASSERT_NEWPADXVOP;
3132 
3133     assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV
3134             || type == OP_PADCV);
3135     OP *o = newOP(type, flags);
3136     o->op_targ = padix;
3137     return o;
3138 }
3139 
3140 /* ------------------ util.h ------------------------------------------- */
3141 
3142 /*
3143 =for apidoc_section $string
3144 
3145 =for apidoc foldEQ
3146 
3147 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3148 same
3149 case-insensitively; false otherwise.  Uppercase and lowercase ASCII range bytes
3150 match themselves and their opposite case counterparts.  Non-cased and non-ASCII
3151 range bytes match only themselves.
3152 
3153 =cut
3154 */
3155 
3156 PERL_STATIC_INLINE I32
Perl_foldEQ(pTHX_ const char * s1,const char * s2,I32 len)3157 Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len)
3158 {
3159     const U8 *a = (const U8 *)s1;
3160     const U8 *b = (const U8 *)s2;
3161 
3162     PERL_ARGS_ASSERT_FOLDEQ;
3163 
3164     assert(len >= 0);
3165 
3166     while (len--) {
3167         if (*a != *b && *a != PL_fold[*b])
3168             return 0;
3169         a++,b++;
3170     }
3171     return 1;
3172 }
3173 
3174 PERL_STATIC_INLINE I32
Perl_foldEQ_latin1(pTHX_ const char * s1,const char * s2,I32 len)3175 Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len)
3176 {
3177     /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
3178      * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3179      * does not check for this.  Nor does it check that the strings each have
3180      * at least 'len' characters. */
3181 
3182     const U8 *a = (const U8 *)s1;
3183     const U8 *b = (const U8 *)s2;
3184 
3185     PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3186 
3187     assert(len >= 0);
3188 
3189     while (len--) {
3190         if (*a != *b && *a != PL_fold_latin1[*b]) {
3191             return 0;
3192         }
3193         a++, b++;
3194     }
3195     return 1;
3196 }
3197 
3198 /*
3199 =for apidoc_section $locale
3200 =for apidoc foldEQ_locale
3201 
3202 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3203 same case-insensitively in the current locale; false otherwise.
3204 
3205 =cut
3206 */
3207 
3208 PERL_STATIC_INLINE I32
Perl_foldEQ_locale(pTHX_ const char * s1,const char * s2,I32 len)3209 Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len)
3210 {
3211     const U8 *a = (const U8 *)s1;
3212     const U8 *b = (const U8 *)s2;
3213 
3214     PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3215 
3216     assert(len >= 0);
3217 
3218     while (len--) {
3219         if (*a != *b && *a != PL_fold_locale[*b]) {
3220             DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3221                      "%s:%d: Our records indicate %02x is not a fold of %02x"
3222                      " or its mate %02x\n",
3223                      __FILE__, __LINE__, *a, *b, PL_fold_locale[*b]));
3224 
3225             return 0;
3226         }
3227         a++,b++;
3228     }
3229     return 1;
3230 }
3231 
3232 /*
3233 =for apidoc_section $string
3234 =for apidoc my_strnlen
3235 
3236 The C library C<strnlen> if available, or a Perl implementation of it.
3237 
3238 C<my_strnlen()> computes the length of the string, up to C<maxlen>
3239 characters.  It will never attempt to address more than C<maxlen>
3240 characters, making it suitable for use with strings that are not
3241 guaranteed to be NUL-terminated.
3242 
3243 =cut
3244 
3245 Description stolen from http://man.openbsd.org/strnlen.3,
3246 implementation stolen from PostgreSQL.
3247 */
3248 #ifndef HAS_STRNLEN
3249 
3250 PERL_STATIC_INLINE Size_t
Perl_my_strnlen(const char * str,Size_t maxlen)3251 Perl_my_strnlen(const char *str, Size_t maxlen)
3252 {
3253     const char *end = (char *) memchr(str, '\0', maxlen);
3254 
3255     PERL_ARGS_ASSERT_MY_STRNLEN;
3256 
3257     if (end == NULL) return maxlen;
3258     return end - str;
3259 }
3260 
3261 #endif
3262 
3263 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3264 
3265 PERL_STATIC_INLINE void *
S_my_memrchr(const char * s,const char c,const STRLEN len)3266 S_my_memrchr(const char * s, const char c, const STRLEN len)
3267 {
3268     /* memrchr(), since many platforms lack it */
3269 
3270     const char * t = s + len - 1;
3271 
3272     PERL_ARGS_ASSERT_MY_MEMRCHR;
3273 
3274     while (t >= s) {
3275         if (*t == c) {
3276             return (void *) t;
3277         }
3278         t--;
3279     }
3280 
3281     return NULL;
3282 }
3283 
3284 #endif
3285 
3286 PERL_STATIC_INLINE char *
Perl_mortal_getenv(const char * str)3287 Perl_mortal_getenv(const char * str)
3288 {
3289     /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3290      *
3291      * It's (mostly) thread-safe because it uses a mutex to prevent other
3292      * threads (that look at this mutex) from destroying the result before this
3293      * routine has a chance to copy the result to a place that won't be
3294      * destroyed before the caller gets a chance to handle it.  That place is a
3295      * mortal SV.  khw chose this over SAVEFREEPV because he is under the
3296      * impression that the SV will hang around longer under more circumstances
3297      *
3298      * The reason it isn't completely thread-safe is that other code could
3299      * simply not pay attention to the mutex.  All of the Perl core uses the
3300      * mutex, but it is possible for code from, say XS, to not use this mutex,
3301      * defeating the safety.
3302      *
3303      * getenv() returns, in some implementations, a pointer to a spot in the
3304      * **environ array, which could be invalidated at any time by this or
3305      * another thread changing the environment.  Other implementations copy the
3306      * **environ value to a static buffer, returning a pointer to that.  That
3307      * buffer might or might not be invalidated by a getenv() call in another
3308      * thread.  If it does get zapped, we need an exclusive lock.  Otherwise,
3309      * many getenv() calls can safely be running simultaneously, so a
3310      * many-reader (but no simultaneous writers) lock is ok.  There is a
3311      * Configure probe to see if another thread destroys the buffer, and the
3312      * mutex is defined accordingly.
3313      *
3314      * But in all cases, using the mutex prevents these problems, as long as
3315      * all code uses the same mutex.
3316      *
3317      * A complication is that this can be called during phases where the
3318      * mortalization process isn't available.  These are in interpreter
3319      * destruction or early in construction.  khw believes that at these times
3320      * there shouldn't be anything else going on, so plain getenv is safe AS
3321      * LONG AS the caller acts on the return before calling it again. */
3322 
3323     char * ret;
3324     dTHX;
3325 
3326     PERL_ARGS_ASSERT_MORTAL_GETENV;
3327 
3328     /* Can't mortalize without stacks.  khw believes that no other threads
3329      * should be running, so no need to lock things, and this may be during a
3330      * phase when locking isn't even available */
3331     if (UNLIKELY(PL_scopestack_ix == 0)) {
3332         return getenv(str);
3333     }
3334 
3335 #ifdef PERL_MEM_LOG
3336 
3337     /* A major complication arises under PERL_MEM_LOG.  When that is active,
3338      * every memory allocation may result in logging, depending on the value of
3339      * ENV{PERL_MEM_LOG} at the moment.  That means, as we create the SV for
3340      * saving ENV{foo}'s value (but before saving it), the logging code will
3341      * call us recursively to find out what ENV{PERL_MEM_LOG} is.  Without some
3342      * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3343      * lock a boolean mutex recursively); 3) destroying the getenv() static
3344      * buffer; or 4) destroying the temporary created by this for the copy
3345      * causes a log entry to be made which could cause a new temporary to be
3346      * created, which will need to be destroyed at some point, leading to an
3347      * infinite loop.
3348      *
3349      * The solution adopted here (after some gnashing of teeth) is to detect
3350      * the recursive calls and calls from the logger, and treat them specially.
3351      * Let's say we want to do getenv("foo").  We first find
3352      * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3353      * variable, so no temporary is required.  Then we do getenv(foo), and in
3354      * the process of creating a temporary to save it, this function will be
3355      * called recursively to do a getenv(PERL_MEM_LOG).  On the recursed call,
3356      * we detect that it is such a call and return our saved value instead of
3357      * locking and doing a new getenv().  This solves all of problems 1), 2),
3358      * and 3).  Because all the getenv()s are done while the mutex is locked,
3359      * the state cannot have changed.  To solve 4), we don't create a temporary
3360      * when this is called from the logging code.  That code disposes of the
3361      * return value while the mutex is still locked.
3362      *
3363      * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3364      * digits and 3 particular letters are significant; the rest are ignored by
3365      * the memory logging code.  Thus the per-interpreter variable only needs
3366      * to be large enough to save the significant information, the size of
3367      * which is known at compile time.  The first byte is extra, reserved for
3368      * flags for our use.  To protect against overflowing, only the reserved
3369      * byte, as many digits as don't overflow, and the three letters are
3370      * stored.
3371      *
3372      * The reserved byte has two bits:
3373      *      0x1 if set indicates that if we get here, it is a recursive call of
3374      *          getenv()
3375      *      0x2 if set indicates that the call is from the logging code.
3376      *
3377      * If the flag indicates this is a recursive call, just return the stored
3378      * value of PL_mem_log;  An empty value gets turned into NULL. */
3379     if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3380         if (PL_mem_log[1] == '\0') {
3381             return NULL;
3382         } else {
3383             return PL_mem_log + 1;
3384         }
3385     }
3386 
3387 #endif
3388 
3389     GETENV_LOCK;
3390 
3391 #ifdef PERL_MEM_LOG
3392 
3393     /* Here we are in a critical section.  As explained above, we do our own
3394      * getenv(PERL_MEM_LOG), saving the result safely. */
3395     ret = getenv("PERL_MEM_LOG");
3396     if (ret == NULL) {  /* No logging active */
3397 
3398         /* Return that immediately if called from the logging code */
3399         if (PL_mem_log[0] & 0x2) {
3400             GETENV_UNLOCK;
3401             return NULL;
3402         }
3403 
3404         PL_mem_log[1] = '\0';
3405     }
3406     else {
3407         char *mem_log_meat = PL_mem_log + 1;    /* first byte reserved */
3408 
3409         /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3410          * extremely long string.  But we want only a few characters from it.
3411          * PL_mem_log has been made large enough to hold just the ones we need.
3412          * First the file descriptor. */
3413         if (isDIGIT(*ret)) {
3414             const char * s = ret;
3415             if (UNLIKELY(*s == '0')) {
3416 
3417                 /* Reduce multiple leading zeros to a single one.  This is to
3418                  * allow the caller to change what to do with leading zeros. */
3419                 *mem_log_meat++ = '0';
3420                 s++;
3421                 while (*s == '0') {
3422                     s++;
3423                 }
3424             }
3425 
3426             /* If the input overflows, copy just enough for the result to also
3427              * overflow, plus 1 to make sure */
3428             while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3429                 *mem_log_meat++ = *s++;
3430             }
3431         }
3432 
3433         /* Then each of the four significant characters */
3434         if (strchr(ret, 'm')) {
3435             *mem_log_meat++ = 'm';
3436         }
3437         if (strchr(ret, 's')) {
3438             *mem_log_meat++ = 's';
3439         }
3440         if (strchr(ret, 't')) {
3441             *mem_log_meat++ = 't';
3442         }
3443         if (strchr(ret, 'c')) {
3444             *mem_log_meat++ = 'c';
3445         }
3446         *mem_log_meat = '\0';
3447 
3448         assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3449     }
3450 
3451     /* If we are being called from the logger, it only needs the significant
3452      * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3453     if (PL_mem_log[0] & 0x2) {
3454         assert(strEQ(str, "PERL_MEM_LOG"));
3455         GETENV_UNLOCK;
3456         return PL_mem_log + 1;
3457     }
3458 
3459     /* Here is a generic getenv().  This could be a getenv("PERL_MEM_LOG") that
3460      * is coming from other than the logging code, so it should be treated the
3461      * same as any other getenv(), returning the full value, not just the
3462      * significant part, and having its value saved.  Set the flag that
3463      * indicates any call to this routine will be a recursion from here */
3464     PL_mem_log[0] = 0x1;
3465 
3466 #endif
3467 
3468     /* Now get the value of the real desired variable, and save a copy */
3469     ret = getenv(str);
3470 
3471     if (ret != NULL) {
3472         ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
3473     }
3474 
3475     GETENV_UNLOCK;
3476 
3477 #ifdef PERL_MEM_LOG
3478 
3479     /* Clear the buffer */
3480     Zero(PL_mem_log, sizeof(PL_mem_log), char);
3481 
3482 #endif
3483 
3484     return ret;
3485 }
3486 
3487 PERL_STATIC_INLINE bool
Perl_sv_isbool(pTHX_ const SV * sv)3488 Perl_sv_isbool(pTHX_ const SV *sv)
3489 {
3490     return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv);
3491 }
3492 
3493 #ifdef USE_ITHREADS
3494 
3495 PERL_STATIC_INLINE AV *
Perl_cop_file_avn(pTHX_ const COP * cop)3496 Perl_cop_file_avn(pTHX_ const COP *cop) {
3497 
3498     PERL_ARGS_ASSERT_COP_FILE_AVN;
3499 
3500     const char *file = CopFILE(cop);
3501     if (file) {
3502         GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3503         if (gv) {
3504             return GvAVn(gv);
3505         }
3506         else
3507             return NULL;
3508      }
3509      else
3510          return NULL;
3511 }
3512 
3513 #endif
3514 
3515 PERL_STATIC_INLINE PADNAME *
Perl_padname_refcnt_inc(PADNAME * pn)3516 Perl_padname_refcnt_inc(PADNAME *pn)
3517 {
3518     PadnameREFCNT(pn)++;
3519     return pn;
3520 }
3521 
3522 PERL_STATIC_INLINE PADNAMELIST *
Perl_padnamelist_refcnt_inc(PADNAMELIST * pnl)3523 Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl)
3524 {
3525     PadnamelistREFCNT(pnl)++;
3526     return pnl;
3527 }
3528 
3529 /* copy a string to a safe spot */
3530 
3531 /*
3532 =for apidoc_section $string
3533 =for apidoc savepv
3534 
3535 Perl's version of C<strdup()>.  Returns a pointer to a newly allocated
3536 string which is a duplicate of C<pv>.  The size of the string is
3537 determined by C<strlen()>, which means it may not contain embedded C<NUL>
3538 characters and must have a trailing C<NUL>.  To prevent memory leaks, the
3539 memory allocated for the new string needs to be freed when no longer needed.
3540 This can be done with the C<L</Safefree>> function, or
3541 L<C<SAVEFREEPV>|perlguts/SAVEFREEPV(p)>.
3542 
3543 On some platforms, Windows for example, all allocated memory owned by a thread
3544 is deallocated when that thread ends.  So if you need that not to happen, you
3545 need to use the shared memory functions, such as C<L</savesharedpv>>.
3546 
3547 =cut
3548 */
3549 
3550 PERL_STATIC_INLINE char *
Perl_savepv(pTHX_ const char * pv)3551 Perl_savepv(pTHX_ const char *pv)
3552 {
3553     PERL_UNUSED_CONTEXT;
3554     if (!pv)
3555         return NULL;
3556     else {
3557         char *newaddr;
3558         const STRLEN pvlen = strlen(pv)+1;
3559         Newx(newaddr, pvlen, char);
3560         return (char*)memcpy(newaddr, pv, pvlen);
3561     }
3562 }
3563 
3564 /* same thing but with a known length */
3565 
3566 /*
3567 =for apidoc savepvn
3568 
3569 Perl's version of what C<strndup()> would be if it existed.  Returns a
3570 pointer to a newly allocated string which is a duplicate of the first
3571 C<len> bytes from C<pv>, plus a trailing
3572 C<NUL> byte.  The memory allocated for
3573 the new string can be freed with the C<Safefree()> function.
3574 
3575 On some platforms, Windows for example, all allocated memory owned by a thread
3576 is deallocated when that thread ends.  So if you need that not to happen, you
3577 need to use the shared memory functions, such as C<L</savesharedpvn>>.
3578 
3579 =cut
3580 */
3581 
3582 PERL_STATIC_INLINE char *
Perl_savepvn(pTHX_ const char * pv,Size_t len)3583 Perl_savepvn(pTHX_ const char *pv, Size_t len)
3584 {
3585     char *newaddr;
3586     PERL_UNUSED_CONTEXT;
3587 
3588     Newx(newaddr,len+1,char);
3589     /* Give a meaning to NULL pointer mainly for the use in sv_magic() */
3590     if (pv) {
3591         /* might not be null terminated */
3592         newaddr[len] = '\0';
3593         return (char *) CopyD(pv,newaddr,len,char);
3594     }
3595     else {
3596         return (char *) ZeroD(newaddr,len+1,char);
3597     }
3598 }
3599 
3600 /*
3601 =for apidoc savesvpv
3602 
3603 A version of C<savepv()>/C<savepvn()> which gets the string to duplicate from
3604 the passed in SV using C<SvPV()>
3605 
3606 On some platforms, Windows for example, all allocated memory owned by a thread
3607 is deallocated when that thread ends.  So if you need that not to happen, you
3608 need to use the shared memory functions, such as C<L</savesharedsvpv>>.
3609 
3610 =cut
3611 */
3612 
3613 PERL_STATIC_INLINE char *
Perl_savesvpv(pTHX_ SV * sv)3614 Perl_savesvpv(pTHX_ SV *sv)
3615 {
3616     STRLEN len;
3617     const char * const pv = SvPV_const(sv, len);
3618     char *newaddr;
3619 
3620     PERL_ARGS_ASSERT_SAVESVPV;
3621 
3622     ++len;
3623     Newx(newaddr,len,char);
3624     return (char *) CopyD(pv,newaddr,len,char);
3625 }
3626 
3627 /*
3628 =for apidoc savesharedsvpv
3629 
3630 A version of C<savesharedpv()> which allocates the duplicate string in
3631 memory which is shared between threads.
3632 
3633 =cut
3634 */
3635 
3636 PERL_STATIC_INLINE char *
Perl_savesharedsvpv(pTHX_ SV * sv)3637 Perl_savesharedsvpv(pTHX_ SV *sv)
3638 {
3639     STRLEN len;
3640     const char * const pv = SvPV_const(sv, len);
3641 
3642     PERL_ARGS_ASSERT_SAVESHAREDSVPV;
3643 
3644     return savesharedpvn(pv, len);
3645 }
3646 
3647 #ifndef PERL_GET_CONTEXT_DEFINED
3648 
3649 /*
3650 =for apidoc_section $embedding
3651 =for apidoc get_context
3652 
3653 Implements L<perlapi/C<PERL_GET_CONTEXT>>, which you should use instead.
3654 
3655 =cut
3656 */
3657 
3658 PERL_STATIC_INLINE void *
Perl_get_context(void)3659 Perl_get_context(void)
3660 {
3661 #  if defined(USE_ITHREADS)
3662 #    ifdef OLD_PTHREADS_API
3663     pthread_addr_t t;
3664     int error = pthread_getspecific(PL_thr_key, &t);
3665     if (error)
3666         Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error);
3667     return (void*)t;
3668 #    elif defined(I_MACH_CTHREADS)
3669     return (void*)cthread_data(cthread_self());
3670 #    else
3671     return (void*)PTHREAD_GETSPECIFIC(PL_thr_key);
3672 #    endif
3673 #  else
3674     return (void*)NULL;
3675 #  endif
3676 }
3677 
3678 #endif
3679 
3680 PERL_STATIC_INLINE MGVTBL*
Perl_get_vtbl(pTHX_ int vtbl_id)3681 Perl_get_vtbl(pTHX_ int vtbl_id)
3682 {
3683     PERL_UNUSED_CONTEXT;
3684 
3685     return (vtbl_id < 0 || vtbl_id >= magic_vtable_max)
3686         ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id;
3687 }
3688 
3689 /*
3690 =for apidoc my_strlcat
3691 
3692 The C library C<strlcat> if available, or a Perl implementation of it.
3693 This operates on C C<NUL>-terminated strings.
3694 
3695 C<my_strlcat()> appends string C<src> to the end of C<dst>.  It will append at
3696 most S<C<size - strlen(dst) - 1>> characters.  It will then C<NUL>-terminate,
3697 unless C<size> is 0 or the original C<dst> string was longer than C<size> (in
3698 practice this should not happen as it means that either C<size> is incorrect or
3699 that C<dst> is not a proper C<NUL>-terminated string).
3700 
3701 Note that C<size> is the full size of the destination buffer and
3702 the result is guaranteed to be C<NUL>-terminated if there is room.  Note that
3703 room for the C<NUL> should be included in C<size>.
3704 
3705 The return value is the total length that C<dst> would have if C<size> is
3706 sufficiently large.  Thus it is the initial length of C<dst> plus the length of
3707 C<src>.  If C<size> is smaller than the return, the excess was not appended.
3708 
3709 =cut
3710 
3711 Description stolen from http://man.openbsd.org/strlcat.3
3712 */
3713 #ifndef HAS_STRLCAT
3714 PERL_STATIC_INLINE Size_t
Perl_my_strlcat(char * dst,const char * src,Size_t size)3715 Perl_my_strlcat(char *dst, const char *src, Size_t size)
3716 {
3717     Size_t used, length, copy;
3718 
3719     used = strlen(dst);
3720     length = strlen(src);
3721     if (size > 0 && used < size - 1) {
3722         copy = (length >= size - used) ? size - used - 1 : length;
3723         memcpy(dst + used, src, copy);
3724         dst[used + copy] = '\0';
3725     }
3726     return used + length;
3727 }
3728 #endif
3729 
3730 
3731 /*
3732 =for apidoc my_strlcpy
3733 
3734 The C library C<strlcpy> if available, or a Perl implementation of it.
3735 This operates on C C<NUL>-terminated strings.
3736 
3737 C<my_strlcpy()> copies up to S<C<size - 1>> characters from the string C<src>
3738 to C<dst>, C<NUL>-terminating the result if C<size> is not 0.
3739 
3740 The return value is the total length C<src> would be if the copy completely
3741 succeeded.  If it is larger than C<size>, the excess was not copied.
3742 
3743 =cut
3744 
3745 Description stolen from http://man.openbsd.org/strlcpy.3
3746 */
3747 #ifndef HAS_STRLCPY
3748 PERL_STATIC_INLINE Size_t
Perl_my_strlcpy(char * dst,const char * src,Size_t size)3749 Perl_my_strlcpy(char *dst, const char *src, Size_t size)
3750 {
3751     Size_t length, copy;
3752 
3753     length = strlen(src);
3754     if (size > 0) {
3755         copy = (length >= size) ? size - 1 : length;
3756         memcpy(dst, src, copy);
3757         dst[copy] = '\0';
3758     }
3759     return length;
3760 }
3761 #endif
3762 
3763 /*
3764  * ex: set ts=8 sts=4 sw=4 et:
3765  */
3766