1 /* inline.h
2 *
3 * Copyright (C) 2012 by Larry Wall and others
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 * This file contains tables and code adapted from
9 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
10 * copyright notice:
11
12 Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
14 Permission is hereby granted, free of charge, to any person obtaining a copy of
15 this software and associated documentation files (the "Software"), to deal in
16 the Software without restriction, including without limitation the rights to
17 use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
18 of the Software, and to permit persons to whom the Software is furnished to do
19 so, subject to the following conditions:
20
21 The above copyright notice and this permission notice shall be included in all
22 copies or substantial portions of the Software.
23
24 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 SOFTWARE.
31
32 *
33 * This file is a home for static inline functions that cannot go in other
34 * header files, because they depend on proto.h (included after most other
35 * headers) or struct definitions.
36 *
37 * Note also perlstatic.h for functions that can't or shouldn't be inlined, but
38 * whose details should be exposed to the compiler, for such things as tail
39 * call optimization.
40 *
41 * Each section names the header file that the functions "belong" to.
42 */
43
44 /* ------------------------------- av.h ------------------------------- */
45
46 /*
47 =for apidoc_section $AV
48 =for apidoc av_count
49 Returns the number of elements in the array C<av>. This is the true length of
50 the array, including any undefined elements. It is always the same as
51 S<C<av_top_index(av) + 1>>.
52
53 =cut
54 */
55 PERL_STATIC_INLINE Size_t
Perl_av_count(pTHX_ AV * av)56 Perl_av_count(pTHX_ AV *av)
57 {
58 PERL_ARGS_ASSERT_AV_COUNT;
59 assert(SvTYPE(av) == SVt_PVAV);
60
61 return AvFILL(av) + 1;
62 }
63
64 /* ------------------------------- av.c ------------------------------- */
65
66 /*
67 =for apidoc av_store_simple
68
69 This is a cut-down version of av_store that assumes that the array is
70 very straightforward - no magic, not readonly, and AvREAL - and that
71 C<key> is not negative. This function MUST NOT be used in situations
72 where any of those assumptions may not hold.
73
74 Stores an SV in an array. The array index is specified as C<key>. It
75 can be dereferenced to get the C<SV*> that was stored there (= C<val>)).
76
77 Note that the caller is responsible for suitably incrementing the reference
78 count of C<val> before the call.
79
80 Approximate Perl equivalent: C<splice(@myarray, $key, 1, $val)>.
81
82 =cut
83 */
84
85 PERL_STATIC_INLINE SV**
Perl_av_store_simple(pTHX_ AV * av,SSize_t key,SV * val)86 Perl_av_store_simple(pTHX_ AV *av, SSize_t key, SV *val)
87 {
88 SV** ary;
89
90 PERL_ARGS_ASSERT_AV_STORE_SIMPLE;
91 assert(SvTYPE(av) == SVt_PVAV);
92 assert(!SvMAGICAL(av));
93 assert(!SvREADONLY(av));
94 assert(AvREAL(av));
95 assert(key > -1);
96
97 ary = AvARRAY(av);
98
99 if (AvFILLp(av) < key) {
100 if (key > AvMAX(av)) {
101 av_extend(av,key);
102 ary = AvARRAY(av);
103 }
104 AvFILLp(av) = key;
105 } else
106 SvREFCNT_dec(ary[key]);
107
108 ary[key] = val;
109 return &ary[key];
110 }
111
112 /*
113 =for apidoc av_fetch_simple
114
115 This is a cut-down version of av_fetch that assumes that the array is
116 very straightforward - no magic, not readonly, and AvREAL - and that
117 C<key> is not negative. This function MUST NOT be used in situations
118 where any of those assumptions may not hold.
119
120 Returns the SV at the specified index in the array. The C<key> is the
121 index. If lval is true, you are guaranteed to get a real SV back (in case
122 it wasn't real before), which you can then modify. Check that the return
123 value is non-null before dereferencing it to a C<SV*>.
124
125 The rough perl equivalent is C<$myarray[$key]>.
126
127 =cut
128 */
129
130 PERL_STATIC_INLINE SV**
Perl_av_fetch_simple(pTHX_ AV * av,SSize_t key,I32 lval)131 Perl_av_fetch_simple(pTHX_ AV *av, SSize_t key, I32 lval)
132 {
133 PERL_ARGS_ASSERT_AV_FETCH_SIMPLE;
134 assert(SvTYPE(av) == SVt_PVAV);
135 assert(!SvMAGICAL(av));
136 assert(!SvREADONLY(av));
137 assert(AvREAL(av));
138 assert(key > -1);
139
140 if ( (key > AvFILLp(av)) || !AvARRAY(av)[key]) {
141 return lval ? av_store_simple(av,key,newSV_type(SVt_NULL)) : NULL;
142 } else {
143 return &AvARRAY(av)[key];
144 }
145 }
146
147 /*
148 =for apidoc av_push_simple
149
150 This is a cut-down version of av_push that assumes that the array is very
151 straightforward - no magic, not readonly, and AvREAL - and that C<key> is
152 not less than -1. This function MUST NOT be used in situations where any
153 of those assumptions may not hold.
154
155 Pushes an SV (transferring control of one reference count) onto the end of the
156 array. The array will grow automatically to accommodate the addition.
157
158 Perl equivalent: C<push @myarray, $val;>.
159
160 =cut
161 */
162
163 PERL_STATIC_INLINE void
Perl_av_push_simple(pTHX_ AV * av,SV * val)164 Perl_av_push_simple(pTHX_ AV *av, SV *val)
165 {
166 PERL_ARGS_ASSERT_AV_PUSH_SIMPLE;
167 assert(SvTYPE(av) == SVt_PVAV);
168 assert(!SvMAGICAL(av));
169 assert(!SvREADONLY(av));
170 assert(AvREAL(av));
171 assert(AvFILLp(av) > -2);
172
173 (void)av_store_simple(av,AvFILLp(av)+1,val);
174 }
175
176 /*
177 =for apidoc av_new_alloc
178
179 This implements L<perlapi/C<newAV_alloc_x>>
180 and L<perlapi/C<newAV_alloc_xz>>, which are the public API for this
181 functionality.
182
183 Creates a new AV and allocates its SV* array.
184
185 This is similar to, but more efficient than doing:
186
187 AV *av = newAV();
188 av_extend(av, key);
189
190 The size parameter is used to pre-allocate a SV* array large enough to
191 hold at least elements C<0..(size-1)>. C<size> must be at least 1.
192
193 The C<zeroflag> parameter controls whether or not the array is NULL
194 initialized.
195
196 =cut
197 */
198
199 PERL_STATIC_INLINE AV *
Perl_av_new_alloc(pTHX_ SSize_t size,bool zeroflag)200 Perl_av_new_alloc(pTHX_ SSize_t size, bool zeroflag)
201 {
202 AV * const av = newAV();
203 SV** ary;
204 PERL_ARGS_ASSERT_AV_NEW_ALLOC;
205 assert(size > 0);
206
207 Newx(ary, size, SV*); /* Newx performs the memwrap check */
208 AvALLOC(av) = ary;
209 AvARRAY(av) = ary;
210 AvMAX(av) = size - 1;
211
212 if (zeroflag)
213 Zero(ary, size, SV*);
214
215 return av;
216 }
217
218
219 /* ------------------------------- cv.h ------------------------------- */
220
221 /*
222 =for apidoc_section $CV
223 =for apidoc CvGV
224 Returns the GV associated with the CV C<sv>, reifying it if necessary.
225
226 =cut
227 */
228 PERL_STATIC_INLINE GV *
Perl_CvGV(pTHX_ CV * sv)229 Perl_CvGV(pTHX_ CV *sv)
230 {
231 PERL_ARGS_ASSERT_CVGV;
232
233 return CvNAMED(sv)
234 ? Perl_cvgv_from_hek(aTHX_ sv)
235 : ((XPVCV*)MUTABLE_PTR(SvANY(sv)))->xcv_gv_u.xcv_gv;
236 }
237
238 /*
239 =for apidoc CvDEPTH
240 Returns the recursion level of the CV C<sv>. Hence >= 2 indicates we are in a
241 recursive call.
242
243 =cut
244 */
245 PERL_STATIC_INLINE I32 *
Perl_CvDEPTH(const CV * const sv)246 Perl_CvDEPTH(const CV * const sv)
247 {
248 PERL_ARGS_ASSERT_CVDEPTH;
249 assert(SvTYPE(sv) == SVt_PVCV || SvTYPE(sv) == SVt_PVFM);
250
251 return &((XPVCV*)SvANY(sv))->xcv_depth;
252 }
253
254 /*
255 CvPROTO returns the prototype as stored, which is not necessarily what
256 the interpreter should be using. Specifically, the interpreter assumes
257 that spaces have been stripped, which has been the case if the prototype
258 was added by toke.c, but is generally not the case if it was added elsewhere.
259 Since we can't enforce the spacelessness at assignment time, this routine
260 provides a temporary copy at parse time with spaces removed.
261 I<orig> is the start of the original buffer, I<len> is the length of the
262 prototype and will be updated when this returns.
263 */
264
265 #ifdef PERL_CORE
266 PERL_STATIC_INLINE char *
S_strip_spaces(pTHX_ const char * orig,STRLEN * const len)267 S_strip_spaces(pTHX_ const char * orig, STRLEN * const len)
268 {
269 SV * tmpsv;
270 char * tmps;
271 tmpsv = newSVpvn_flags(orig, *len, SVs_TEMP);
272 tmps = SvPVX(tmpsv);
273 while ((*len)--) {
274 if (!isSPACE(*orig))
275 *tmps++ = *orig;
276 orig++;
277 }
278 *tmps = '\0';
279 *len = tmps - SvPVX(tmpsv);
280 return SvPVX(tmpsv);
281 }
282 #endif
283
284 /* ------------------------------- iperlsys.h ------------------------------- */
285 #if ! defined(PERL_IMPLICIT_SYS) && defined(USE_ITHREADS)
286
287 /* Otherwise this function is implemented as macros in iperlsys.h */
288
289 PERL_STATIC_INLINE bool
S_PerlEnv_putenv(pTHX_ char * str)290 S_PerlEnv_putenv(pTHX_ char * str)
291 {
292 PERL_ARGS_ASSERT_PERLENV_PUTENV;
293
294 ENV_LOCK;
295 bool retval = putenv(str);
296 ENV_UNLOCK;
297
298 return retval;
299 }
300
301 #endif
302
303 /* ------------------------------- mg.h ------------------------------- */
304
305 #if defined(PERL_CORE) || defined(PERL_EXT)
306 /* assumes get-magic and stringification have already occurred */
307 PERL_STATIC_INLINE STRLEN
S_MgBYTEPOS(pTHX_ MAGIC * mg,SV * sv,const char * s,STRLEN len)308 S_MgBYTEPOS(pTHX_ MAGIC *mg, SV *sv, const char *s, STRLEN len)
309 {
310 assert(mg->mg_type == PERL_MAGIC_regex_global);
311 assert(mg->mg_len != -1);
312 if (mg->mg_flags & MGf_BYTES || !DO_UTF8(sv))
313 return (STRLEN)mg->mg_len;
314 else {
315 const STRLEN pos = (STRLEN)mg->mg_len;
316 /* Without this check, we may read past the end of the buffer: */
317 if (pos > sv_or_pv_len_utf8(sv, s, len)) return len+1;
318 return sv_or_pv_pos_u2b(sv, s, pos, NULL);
319 }
320 }
321 #endif
322
323 /* ------------------------------- pad.h ------------------------------ */
324
325 #if defined(PERL_IN_PAD_C) || defined(PERL_IN_OP_C)
326 PERL_STATIC_INLINE bool
S_PadnameIN_SCOPE(const PADNAME * const pn,const U32 seq)327 S_PadnameIN_SCOPE(const PADNAME * const pn, const U32 seq)
328 {
329 PERL_ARGS_ASSERT_PADNAMEIN_SCOPE;
330
331 /* is seq within the range _LOW to _HIGH ?
332 * This is complicated by the fact that PL_cop_seqmax
333 * may have wrapped around at some point */
334 if (COP_SEQ_RANGE_LOW(pn) == PERL_PADSEQ_INTRO)
335 return FALSE; /* not yet introduced */
336
337 if (COP_SEQ_RANGE_HIGH(pn) == PERL_PADSEQ_INTRO) {
338 /* in compiling scope */
339 if (
340 (seq > COP_SEQ_RANGE_LOW(pn))
341 ? (seq - COP_SEQ_RANGE_LOW(pn) < (U32_MAX >> 1))
342 : (COP_SEQ_RANGE_LOW(pn) - seq > (U32_MAX >> 1))
343 )
344 return TRUE;
345 }
346 else if (
347 (COP_SEQ_RANGE_LOW(pn) > COP_SEQ_RANGE_HIGH(pn))
348 ?
349 ( seq > COP_SEQ_RANGE_LOW(pn)
350 || seq <= COP_SEQ_RANGE_HIGH(pn))
351
352 : ( seq > COP_SEQ_RANGE_LOW(pn)
353 && seq <= COP_SEQ_RANGE_HIGH(pn))
354 )
355 return TRUE;
356 return FALSE;
357 }
358 #endif
359
360 /* ------------------------------- pp.h ------------------------------- */
361
362 PERL_STATIC_INLINE I32
Perl_TOPMARK(pTHX)363 Perl_TOPMARK(pTHX)
364 {
365 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
366 "MARK top %p %" IVdf "\n",
367 PL_markstack_ptr,
368 (IV)*PL_markstack_ptr)));
369 return *PL_markstack_ptr;
370 }
371
372 PERL_STATIC_INLINE I32
Perl_POPMARK(pTHX)373 Perl_POPMARK(pTHX)
374 {
375 DEBUG_s(DEBUG_v(PerlIO_printf(Perl_debug_log,
376 "MARK pop %p %" IVdf "\n",
377 (PL_markstack_ptr-1),
378 (IV)*(PL_markstack_ptr-1))));
379 assert((PL_markstack_ptr > PL_markstack) || !"MARK underflow");
380 return *PL_markstack_ptr--;
381 }
382
383 /* ----------------------------- regexp.h ----------------------------- */
384
385 /* PVLVs need to act as a superset of all scalar types - they are basically
386 * PVMGs with a few extra fields.
387 * REGEXPs are first class scalars, but have many fields that can't be copied
388 * into a PVLV body.
389 *
390 * Hence we take a different approach - instead of a copy, PVLVs store a pointer
391 * back to the original body. To avoid increasing the size of PVLVs just for the
392 * rare case of REGEXP assignment, this pointer is stored in the memory usually
393 * used for SvLEN(). Hence the check for SVt_PVLV below, and the ? : ternary to
394 * read the pointer from the two possible locations. The macro SvLEN() wraps the
395 * access to the union's member xpvlenu_len, but there is no equivalent macro
396 * for wrapping the union's member xpvlenu_rx, hence the direct reference here.
397 *
398 * See commit df6b4bd56551f2d3 for more details. */
399
400 PERL_STATIC_INLINE struct regexp *
Perl_ReANY(const REGEXP * const re)401 Perl_ReANY(const REGEXP * const re)
402 {
403 XPV* const p = (XPV*)SvANY(re);
404
405 PERL_ARGS_ASSERT_REANY;
406 assert(isREGEXP(re));
407
408 return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
409 : (struct regexp *)p;
410 }
411
412 /* ------------------------------- utf8.h ------------------------------- */
413
414 /*
415 =for apidoc_section $unicode
416 */
417
418 PERL_STATIC_INLINE void
Perl_append_utf8_from_native_byte(const U8 byte,U8 ** dest)419 Perl_append_utf8_from_native_byte(const U8 byte, U8** dest)
420 {
421 /* Takes an input 'byte' (Latin1 or EBCDIC) and appends it to the UTF-8
422 * encoded string at '*dest', updating '*dest' to include it */
423
424 PERL_ARGS_ASSERT_APPEND_UTF8_FROM_NATIVE_BYTE;
425
426 if (NATIVE_BYTE_IS_INVARIANT(byte))
427 *((*dest)++) = byte;
428 else {
429 *((*dest)++) = UTF8_EIGHT_BIT_HI(byte);
430 *((*dest)++) = UTF8_EIGHT_BIT_LO(byte);
431 }
432 }
433
434 /*
435 =for apidoc valid_utf8_to_uvchr
436 Like C<L<perlapi/utf8_to_uvchr_buf>>, but should only be called when it is
437 known that the next character in the input UTF-8 string C<s> is well-formed
438 (I<e.g.>, it passes C<L<perlapi/isUTF8_CHAR>>. Surrogates, non-character code
439 points, and non-Unicode code points are allowed.
440
441 =cut
442
443 */
444
445 PERL_STATIC_INLINE UV
Perl_valid_utf8_to_uvchr(const U8 * s,STRLEN * retlen)446 Perl_valid_utf8_to_uvchr(const U8 *s, STRLEN *retlen)
447 {
448 const UV expectlen = UTF8SKIP(s);
449 const U8* send = s + expectlen;
450 UV uv = *s;
451
452 PERL_ARGS_ASSERT_VALID_UTF8_TO_UVCHR;
453
454 if (retlen) {
455 *retlen = expectlen;
456 }
457
458 /* An invariant is trivially returned */
459 if (expectlen == 1) {
460 return uv;
461 }
462
463 /* Remove the leading bits that indicate the number of bytes, leaving just
464 * the bits that are part of the value */
465 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
466
467 /* Now, loop through the remaining bytes, accumulating each into the
468 * working total as we go. (I khw tried unrolling the loop for up to 4
469 * bytes, but there was no performance improvement) */
470 for (++s; s < send; s++) {
471 uv = UTF8_ACCUMULATE(uv, *s);
472 }
473
474 return UNI_TO_NATIVE(uv);
475
476 }
477
478 /*
479 =for apidoc is_utf8_invariant_string
480
481 Returns TRUE if the first C<len> bytes of the string C<s> are the same
482 regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
483 EBCDIC machines); otherwise it returns FALSE. That is, it returns TRUE if they
484 are UTF-8 invariant. On ASCII-ish machines, all the ASCII characters and only
485 the ASCII characters fit this definition. On EBCDIC machines, the ASCII-range
486 characters are invariant, but so also are the C1 controls.
487
488 If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
489 use this option, that C<s> can't have embedded C<NUL> characters and has to
490 have a terminating C<NUL> byte).
491
492 See also
493 C<L</is_utf8_string>>,
494 C<L</is_utf8_string_flags>>,
495 C<L</is_utf8_string_loc>>,
496 C<L</is_utf8_string_loc_flags>>,
497 C<L</is_utf8_string_loclen>>,
498 C<L</is_utf8_string_loclen_flags>>,
499 C<L</is_utf8_fixed_width_buf_flags>>,
500 C<L</is_utf8_fixed_width_buf_loc_flags>>,
501 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
502 C<L</is_strict_utf8_string>>,
503 C<L</is_strict_utf8_string_loc>>,
504 C<L</is_strict_utf8_string_loclen>>,
505 C<L</is_c9strict_utf8_string>>,
506 C<L</is_c9strict_utf8_string_loc>>,
507 and
508 C<L</is_c9strict_utf8_string_loclen>>.
509
510 =cut
511
512 */
513
514 #define is_utf8_invariant_string(s, len) \
515 is_utf8_invariant_string_loc(s, len, NULL)
516
517 /*
518 =for apidoc is_utf8_invariant_string_loc
519
520 Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
521 the first UTF-8 variant character in the C<ep> pointer; if all characters are
522 UTF-8 invariant, this function does not change the contents of C<*ep>.
523
524 =cut
525
526 */
527
528 PERL_STATIC_INLINE bool
Perl_is_utf8_invariant_string_loc(const U8 * const s,STRLEN len,const U8 ** ep)529 Perl_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
530 {
531 const U8* send;
532 const U8* x = s;
533
534 PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
535
536 if (len == 0) {
537 len = strlen((const char *)s);
538 }
539
540 send = s + len;
541
542 /* This looks like 0x010101... */
543 # define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
544
545 /* This looks like 0x808080... */
546 # define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
547 # define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
548 # define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
549
550 /* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
551 * or'ing together the lowest bits of 'x'. Hopefully the final term gets
552 * optimized out completely on a 32-bit system, and its mask gets optimized out
553 * on a 64-bit system */
554 # define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
555 | ( PTR2nat(x) >> 1) \
556 | ( ( (PTR2nat(x) \
557 & PERL_WORD_BOUNDARY_MASK) >> 2))))
558
559 #ifndef EBCDIC
560
561 /* Do the word-at-a-time iff there is at least one usable full word. That
562 * means that after advancing to a word boundary, there still is at least a
563 * full word left. The number of bytes needed to advance is 'wordsize -
564 * offset' unless offset is 0. */
565 if ((STRLEN) (send - x) >= PERL_WORDSIZE
566
567 /* This term is wordsize if subword; 0 if not */
568 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
569
570 /* 'offset' */
571 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
572 {
573
574 /* Process per-byte until reach word boundary. XXX This loop could be
575 * eliminated if we knew that this platform had fast unaligned reads */
576 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
577 if (! UTF8_IS_INVARIANT(*x)) {
578 if (ep) {
579 *ep = x;
580 }
581
582 return FALSE;
583 }
584 x++;
585 }
586
587 /* Here, we know we have at least one full word to process. Process
588 * per-word as long as we have at least a full word left */
589 do {
590 if ((* (const PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) {
591
592 /* Found a variant. Just return if caller doesn't want its
593 * exact position */
594 if (! ep) {
595 return FALSE;
596 }
597
598 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
599 || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
600
601 *ep = x + variant_byte_number(* (const PERL_UINTMAX_T *) x);
602 assert(*ep >= s && *ep < send);
603
604 return FALSE;
605
606 # else /* If weird byte order, drop into next loop to do byte-at-a-time
607 checks. */
608
609 break;
610 # endif
611 }
612
613 x += PERL_WORDSIZE;
614
615 } while (x + PERL_WORDSIZE <= send);
616 }
617
618 #endif /* End of ! EBCDIC */
619
620 /* Process per-byte */
621 while (x < send) {
622 if (! UTF8_IS_INVARIANT(*x)) {
623 if (ep) {
624 *ep = x;
625 }
626
627 return FALSE;
628 }
629
630 x++;
631 }
632
633 return TRUE;
634 }
635
636 /* See if the platform has builtins for finding the most/least significant bit,
637 * and which one is right for using on 32 and 64 bit operands */
638 #if (__has_builtin(__builtin_clz) || PERL_GCC_VERSION_GE(3,4,0))
639 # if U32SIZE == INTSIZE
640 # define PERL_CLZ_32 __builtin_clz
641 # endif
642 # if defined(U64TYPE) && U64SIZE == INTSIZE
643 # define PERL_CLZ_64 __builtin_clz
644 # endif
645 #endif
646 #if (__has_builtin(__builtin_ctz) || PERL_GCC_VERSION_GE(3,4,0))
647 # if U32SIZE == INTSIZE
648 # define PERL_CTZ_32 __builtin_ctz
649 # endif
650 # if defined(U64TYPE) && U64SIZE == INTSIZE
651 # define PERL_CTZ_64 __builtin_ctz
652 # endif
653 #endif
654
655 #if (__has_builtin(__builtin_clzl) || PERL_GCC_VERSION_GE(3,4,0))
656 # if U32SIZE == LONGSIZE && ! defined(PERL_CLZ_32)
657 # define PERL_CLZ_32 __builtin_clzl
658 # endif
659 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CLZ_64)
660 # define PERL_CLZ_64 __builtin_clzl
661 # endif
662 #endif
663 #if (__has_builtin(__builtin_ctzl) || PERL_GCC_VERSION_GE(3,4,0))
664 # if U32SIZE == LONGSIZE && ! defined(PERL_CTZ_32)
665 # define PERL_CTZ_32 __builtin_ctzl
666 # endif
667 # if defined(U64TYPE) && U64SIZE == LONGSIZE && ! defined(PERL_CTZ_64)
668 # define PERL_CTZ_64 __builtin_ctzl
669 # endif
670 #endif
671
672 #if (__has_builtin(__builtin_clzll) || PERL_GCC_VERSION_GE(3,4,0))
673 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_32)
674 # define PERL_CLZ_32 __builtin_clzll
675 # endif
676 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CLZ_64)
677 # define PERL_CLZ_64 __builtin_clzll
678 # endif
679 #endif
680 #if (__has_builtin(__builtin_ctzll) || PERL_GCC_VERSION_GE(3,4,0))
681 # if U32SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_32)
682 # define PERL_CTZ_32 __builtin_ctzll
683 # endif
684 # if defined(U64TYPE) && U64SIZE == LONGLONGSIZE && ! defined(PERL_CTZ_64)
685 # define PERL_CTZ_64 __builtin_ctzll
686 # endif
687 #endif
688
689 #if defined(_MSC_VER)
690 # include <intrin.h>
691 # pragma intrinsic(_BitScanForward)
692 # pragma intrinsic(_BitScanReverse)
693 # ifdef _WIN64
694 # pragma intrinsic(_BitScanForward64)
695 # pragma intrinsic(_BitScanReverse64)
696 # endif
697 #endif
698
699 /* The reason there are not checks to see if ffs() and ffsl() are available for
700 * determining the lsb, is because these don't improve on the deBruijn method
701 * fallback, which is just a branchless integer multiply, array element
702 * retrieval, and shift. The others, even if the function call overhead is
703 * optimized out, have to cope with the possibility of the input being all
704 * zeroes, and almost certainly will have conditionals for this eventuality.
705 * khw, at the time of this commit, looked at the source for both gcc and clang
706 * to verify this. (gcc used a method inferior to deBruijn.) */
707
708 /* Below are functions to find the first, last, or only set bit in a word. On
709 * platforms with 64-bit capability, there is a pair for each operation; the
710 * first taking a 64 bit operand, and the second a 32 bit one. The logic is
711 * the same in each pair, so the second is stripped of most comments. */
712
713 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
714
715 PERL_STATIC_INLINE unsigned
Perl_lsbit_pos64(U64 word)716 Perl_lsbit_pos64(U64 word)
717 {
718 /* Find the position (0..63) of the least significant set bit in the input
719 * word */
720
721 ASSUME(word != 0);
722
723 /* If we can determine that the platform has a usable fast method to get
724 * this info, use that */
725
726 # if defined(PERL_CTZ_64)
727 # define PERL_HAS_FAST_GET_LSB_POS64
728
729 return (unsigned) PERL_CTZ_64(word);
730
731 # elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
732 # define PERL_HAS_FAST_GET_LSB_POS64
733
734 {
735 unsigned long index;
736 _BitScanForward64(&index, word);
737 return (unsigned)index;
738 }
739
740 # else
741
742 /* Here, we didn't find a fast method for finding the lsb. Fall back to
743 * making the lsb the only set bit in the word, and use our function that
744 * works on words with a single bit set.
745 *
746 * Isolate the lsb;
747 * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
748 *
749 * The word will look like this, with a rightmost set bit in position 's':
750 * ('x's are don't cares, and 'y's are their complements)
751 * s
752 * x..x100..00
753 * y..y011..11 Complement
754 * y..y100..00 Add 1
755 * 0..0100..00 And with the original
756 *
757 * (Yes, complementing and adding 1 is just taking the negative on 2's
758 * complement machines, but not on 1's complement ones, and some compilers
759 * complain about negating an unsigned.)
760 */
761 return single_1bit_pos64(word & (~word + 1));
762
763 # endif
764
765 }
766
767 # define lsbit_pos_uintmax_(word) lsbit_pos64(word)
768 #else /* ! QUAD */
769 # define lsbit_pos_uintmax_(word) lsbit_pos32(word)
770 #endif
771
772 PERL_STATIC_INLINE unsigned /* Like above for 32 bit word */
Perl_lsbit_pos32(U32 word)773 Perl_lsbit_pos32(U32 word)
774 {
775 /* Find the position (0..31) of the least significant set bit in the input
776 * word */
777
778 ASSUME(word != 0);
779
780 #if defined(PERL_CTZ_32)
781 # define PERL_HAS_FAST_GET_LSB_POS32
782
783 return (unsigned) PERL_CTZ_32(word);
784
785 #elif U32SIZE == 4 && defined(_MSC_VER)
786 # define PERL_HAS_FAST_GET_LSB_POS32
787
788 {
789 unsigned long index;
790 _BitScanForward(&index, word);
791 return (unsigned)index;
792 }
793
794 #else
795
796 return single_1bit_pos32(word & (~word + 1));
797
798 #endif
799
800 }
801
802
803 /* Convert the leading zeros count to the bit position of the first set bit.
804 * This just subtracts from the highest position, 31 or 63. But some compilers
805 * don't optimize this optimally, and so a bit of bit twiddling encourages them
806 * to do the right thing. It turns out that subtracting a smaller non-negative
807 * number 'x' from 2**n-1 for any n is the same as taking the exclusive-or of
808 * the two numbers. To see why, first note that the sum of any number, x, and
809 * its complement, x', is all ones. So all ones minus x is x'. Then note that
810 * the xor of x and all ones is x'. */
811 #define LZC_TO_MSBIT_POS_(size, lzc) ((size##SIZE * CHARBITS - 1) ^ (lzc))
812
813 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
814
815 PERL_STATIC_INLINE unsigned
Perl_msbit_pos64(U64 word)816 Perl_msbit_pos64(U64 word)
817 {
818 /* Find the position (0..63) of the most significant set bit in the input
819 * word */
820
821 ASSUME(word != 0);
822
823 /* If we can determine that the platform has a usable fast method to get
824 * this, use that */
825
826 # if defined(PERL_CLZ_64)
827 # define PERL_HAS_FAST_GET_MSB_POS64
828
829 return (unsigned) LZC_TO_MSBIT_POS_(U64, PERL_CLZ_64(word));
830
831 # elif U64SIZE == 8 && defined(_WIN64) && defined(_MSC_VER)
832 # define PERL_HAS_FAST_GET_MSB_POS64
833
834 {
835 unsigned long index;
836 _BitScanReverse64(&index, word);
837 return (unsigned)index;
838 }
839
840 # else
841
842 /* Here, we didn't find a fast method for finding the msb. Fall back to
843 * making the msb the only set bit in the word, and use our function that
844 * works on words with a single bit set.
845 *
846 * Isolate the msb; http://codeforces.com/blog/entry/10330
847 *
848 * Only the most significant set bit matters. Or'ing word with its right
849 * shift of 1 makes that bit and the next one to its right both 1.
850 * Repeating that with the right shift of 2 makes for 4 1-bits in a row.
851 * ... We end with the msb and all to the right being 1. */
852 word |= (word >> 1);
853 word |= (word >> 2);
854 word |= (word >> 4);
855 word |= (word >> 8);
856 word |= (word >> 16);
857 word |= (word >> 32);
858
859 /* Then subtracting the right shift by 1 clears all but the left-most of
860 * the 1 bits, which is our desired result */
861 word -= (word >> 1);
862
863 /* Now we have a single bit set */
864 return single_1bit_pos64(word);
865
866 # endif
867
868 }
869
870 # define msbit_pos_uintmax_(word) msbit_pos64(word)
871 #else /* ! QUAD */
872 # define msbit_pos_uintmax_(word) msbit_pos32(word)
873 #endif
874
875 PERL_STATIC_INLINE unsigned
Perl_msbit_pos32(U32 word)876 Perl_msbit_pos32(U32 word)
877 {
878 /* Find the position (0..31) of the most significant set bit in the input
879 * word */
880
881 ASSUME(word != 0);
882
883 #if defined(PERL_CLZ_32)
884 # define PERL_HAS_FAST_GET_MSB_POS32
885
886 return (unsigned) LZC_TO_MSBIT_POS_(U32, PERL_CLZ_32(word));
887
888 #elif U32SIZE == 4 && defined(_MSC_VER)
889 # define PERL_HAS_FAST_GET_MSB_POS32
890
891 {
892 unsigned long index;
893 _BitScanReverse(&index, word);
894 return (unsigned)index;
895 }
896
897 #else
898
899 word |= (word >> 1);
900 word |= (word >> 2);
901 word |= (word >> 4);
902 word |= (word >> 8);
903 word |= (word >> 16);
904 word -= (word >> 1);
905 return single_1bit_pos32(word);
906
907 #endif
908
909 }
910
911 #if UVSIZE == U64SIZE
912 # define msbit_pos(word) msbit_pos64(word)
913 # define lsbit_pos(word) lsbit_pos64(word)
914 #elif UVSIZE == U32SIZE
915 # define msbit_pos(word) msbit_pos32(word)
916 # define lsbit_pos(word) lsbit_pos32(word)
917 #endif
918
919 #ifdef U64TYPE /* HAS_QUAD not usable outside the core */
920
921 PERL_STATIC_INLINE unsigned
Perl_single_1bit_pos64(U64 word)922 Perl_single_1bit_pos64(U64 word)
923 {
924 /* Given a 64-bit word known to contain all zero bits except one 1 bit,
925 * find and return the 1's position: 0..63 */
926
927 # ifdef PERL_CORE /* macro not exported */
928 ASSUME(isPOWER_OF_2(word));
929 # else
930 ASSUME(word && (word & (word-1)) == 0);
931 # endif
932
933 /* The only set bit is both the most and least significant bit. If we have
934 * a fast way of finding either one, use that.
935 *
936 * It may appear at first glance that those functions call this one, but
937 * they don't if the corresponding #define is set */
938
939 # ifdef PERL_HAS_FAST_GET_MSB_POS64
940
941 return msbit_pos64(word);
942
943 # elif defined(PERL_HAS_FAST_GET_LSB_POS64)
944
945 return lsbit_pos64(word);
946
947 # else
948
949 /* The position of the only set bit in a word can be quickly calculated
950 * using deBruijn sequences. See for example
951 * https://en.wikipedia.org/wiki/De_Bruijn_sequence */
952 return PL_deBruijn_bitpos_tab64[(word * PERL_deBruijnMagic64_)
953 >> PERL_deBruijnShift64_];
954 # endif
955
956 }
957
958 #endif
959
960 PERL_STATIC_INLINE unsigned
Perl_single_1bit_pos32(U32 word)961 Perl_single_1bit_pos32(U32 word)
962 {
963 /* Given a 32-bit word known to contain all zero bits except one 1 bit,
964 * find and return the 1's position: 0..31 */
965
966 #ifdef PERL_CORE /* macro not exported */
967 ASSUME(isPOWER_OF_2(word));
968 #else
969 ASSUME(word && (word & (word-1)) == 0);
970 #endif
971 #ifdef PERL_HAS_FAST_GET_MSB_POS32
972
973 return msbit_pos32(word);
974
975 #elif defined(PERL_HAS_FAST_GET_LSB_POS32)
976
977 return lsbit_pos32(word);
978
979 /* Unlikely, but possible for the platform to have a wider fast operation but
980 * not a narrower one. But easy enough to handle the case by widening the
981 * parameter size. (Going the other way, emulating 64 bit by two 32 bit ops
982 * would be slower than the deBruijn method.) */
983 #elif defined(PERL_HAS_FAST_GET_MSB_POS64)
984
985 return msbit_pos64(word);
986
987 #elif defined(PERL_HAS_FAST_GET_LSB_POS64)
988
989 return lsbit_pos64(word);
990
991 #else
992
993 return PL_deBruijn_bitpos_tab32[(word * PERL_deBruijnMagic32_)
994 >> PERL_deBruijnShift32_];
995 #endif
996
997 }
998
999 #ifndef EBCDIC
1000
1001 PERL_STATIC_INLINE unsigned int
Perl_variant_byte_number(PERL_UINTMAX_T word)1002 Perl_variant_byte_number(PERL_UINTMAX_T word)
1003 {
1004 /* This returns the position in a word (0..7) of the first variant byte in
1005 * it. This is a helper function. Note that there are no branches */
1006
1007 /* Get just the msb bits of each byte */
1008 word &= PERL_VARIANTS_WORD_MASK;
1009
1010 /* This should only be called if we know there is a variant byte in the
1011 * word */
1012 assert(word);
1013
1014 # if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
1015
1016 /* Bytes are stored like
1017 * Byte8 ... Byte2 Byte1
1018 * 63..56...15...8 7...0
1019 * so getting the lsb of the whole modified word is getting the msb of the
1020 * first byte that has its msb set */
1021 word = lsbit_pos_uintmax_(word);
1022
1023 /* Here, word contains the position 7,15,23,...55,63 of that bit. Convert
1024 * to 0..7 */
1025 return (unsigned int) ((word + 1) >> 3) - 1;
1026
1027 # elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
1028
1029 /* Bytes are stored like
1030 * Byte1 Byte2 ... Byte8
1031 * 63..56 55..47 ... 7...0
1032 * so getting the msb of the whole modified word is getting the msb of the
1033 * first byte that has its msb set */
1034 word = msbit_pos_uintmax_(word);
1035
1036 /* Here, word contains the position 63,55,...,23,15,7 of that bit. Convert
1037 * to 0..7 */
1038 word = ((word + 1) >> 3) - 1;
1039
1040 /* And invert the result because of the reversed byte order on this
1041 * platform */
1042 word = CHARBITS - word - 1;
1043
1044 return (unsigned int) word;
1045
1046 # else
1047 # error Unexpected byte order
1048 # endif
1049
1050 }
1051
1052 #endif
1053 #if defined(PERL_CORE) || defined(PERL_EXT)
1054
1055 /*
1056 =for apidoc variant_under_utf8_count
1057
1058 This function looks at the sequence of bytes between C<s> and C<e>, which are
1059 assumed to be encoded in ASCII/Latin1, and returns how many of them would
1060 change should the string be translated into UTF-8. Due to the nature of UTF-8,
1061 each of these would occupy two bytes instead of the single one in the input
1062 string. Thus, this function returns the precise number of bytes the string
1063 would expand by when translated to UTF-8.
1064
1065 Unlike most of the other functions that have C<utf8> in their name, the input
1066 to this function is NOT a UTF-8-encoded string. The function name is slightly
1067 I<odd> to emphasize this.
1068
1069 This function is internal to Perl because khw thinks that any XS code that
1070 would want this is probably operating too close to the internals. Presenting a
1071 valid use case could change that.
1072
1073 See also
1074 C<L<perlapi/is_utf8_invariant_string>>
1075 and
1076 C<L<perlapi/is_utf8_invariant_string_loc>>,
1077
1078 =cut
1079
1080 */
1081
1082 PERL_STATIC_INLINE Size_t
S_variant_under_utf8_count(const U8 * const s,const U8 * const e)1083 S_variant_under_utf8_count(const U8* const s, const U8* const e)
1084 {
1085 const U8* x = s;
1086 Size_t count = 0;
1087
1088 PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
1089
1090 # ifndef EBCDIC
1091
1092 /* Test if the string is long enough to use word-at-a-time. (Logic is the
1093 * same as for is_utf8_invariant_string()) */
1094 if ((STRLEN) (e - x) >= PERL_WORDSIZE
1095 + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
1096 - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
1097 {
1098
1099 /* Process per-byte until reach word boundary. XXX This loop could be
1100 * eliminated if we knew that this platform had fast unaligned reads */
1101 while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
1102 count += ! UTF8_IS_INVARIANT(*x++);
1103 }
1104
1105 /* Process per-word as long as we have at least a full word left */
1106 do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
1107 explanation of how this works */
1108 PERL_UINTMAX_T increment
1109 = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
1110 * PERL_COUNT_MULTIPLIER)
1111 >> ((PERL_WORDSIZE - 1) * CHARBITS);
1112 count += (Size_t) increment;
1113 x += PERL_WORDSIZE;
1114 } while (x + PERL_WORDSIZE <= e);
1115 }
1116
1117 # endif
1118
1119 /* Process per-byte */
1120 while (x < e) {
1121 if (! UTF8_IS_INVARIANT(*x)) {
1122 count++;
1123 }
1124
1125 x++;
1126 }
1127
1128 return count;
1129 }
1130
1131 #endif
1132
1133 /* Keep these around for these files */
1134 #if ! defined(PERL_IN_REGEXEC_C) && ! defined(PERL_IN_UTF8_C)
1135 # undef PERL_WORDSIZE
1136 # undef PERL_COUNT_MULTIPLIER
1137 # undef PERL_WORD_BOUNDARY_MASK
1138 # undef PERL_VARIANTS_WORD_MASK
1139 #endif
1140
1141 /*
1142 =for apidoc is_utf8_string
1143
1144 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1145 Perl-extended-UTF-8 string; returns FALSE otherwise. If C<len> is 0, it will
1146 be calculated using C<strlen(s)> (which means if you use this option, that C<s>
1147 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1148 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1149
1150 This function considers Perl's extended UTF-8 to be valid. That means that
1151 code points above Unicode, surrogates, and non-character code points are
1152 considered valid by this function. Use C<L</is_strict_utf8_string>>,
1153 C<L</is_c9strict_utf8_string>>, or C<L</is_utf8_string_flags>> to restrict what
1154 code points are considered valid.
1155
1156 See also
1157 C<L</is_utf8_invariant_string>>,
1158 C<L</is_utf8_invariant_string_loc>>,
1159 C<L</is_utf8_string_loc>>,
1160 C<L</is_utf8_string_loclen>>,
1161 C<L</is_utf8_fixed_width_buf_flags>>,
1162 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1163 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1164
1165 =cut
1166 */
1167
1168 #define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
1169
1170 #if defined(PERL_CORE) || defined (PERL_EXT)
1171
1172 /*
1173 =for apidoc is_utf8_non_invariant_string
1174
1175 Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
1176 C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
1177 UTF-8; otherwise returns FALSE.
1178
1179 A TRUE return means that at least one code point represented by the sequence
1180 either is a wide character not representable as a single byte, or the
1181 representation differs depending on whether the sequence is encoded in UTF-8 or
1182 not.
1183
1184 See also
1185 C<L<perlapi/is_utf8_invariant_string>>,
1186 C<L<perlapi/is_utf8_string>>
1187
1188 =cut
1189
1190 This is commonly used to determine if a SV's UTF-8 flag should be turned on.
1191 It generally needn't be if its string is entirely UTF-8 invariant, and it
1192 shouldn't be if it otherwise contains invalid UTF-8.
1193
1194 It is an internal function because khw thinks that XS code shouldn't be working
1195 at this low a level. A valid use case could change that.
1196
1197 */
1198
1199 PERL_STATIC_INLINE bool
Perl_is_utf8_non_invariant_string(const U8 * const s,STRLEN len)1200 Perl_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
1201 {
1202 const U8 * first_variant;
1203
1204 PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
1205
1206 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1207 return FALSE;
1208 }
1209
1210 return is_utf8_string(first_variant, len - (first_variant - s));
1211 }
1212
1213 #endif
1214
1215 /*
1216 =for apidoc is_strict_utf8_string
1217
1218 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1219 UTF-8-encoded string that is fully interchangeable by any application using
1220 Unicode rules; otherwise it returns FALSE. If C<len> is 0, it will be
1221 calculated using C<strlen(s)> (which means if you use this option, that C<s>
1222 can't have embedded C<NUL> characters and has to have a terminating C<NUL>
1223 byte). Note that all characters being ASCII constitute 'a valid UTF-8 string'.
1224
1225 This function returns FALSE for strings containing any
1226 code points above the Unicode max of 0x10FFFF, surrogate code points, or
1227 non-character code points.
1228
1229 See also
1230 C<L</is_utf8_invariant_string>>,
1231 C<L</is_utf8_invariant_string_loc>>,
1232 C<L</is_utf8_string>>,
1233 C<L</is_utf8_string_flags>>,
1234 C<L</is_utf8_string_loc>>,
1235 C<L</is_utf8_string_loc_flags>>,
1236 C<L</is_utf8_string_loclen>>,
1237 C<L</is_utf8_string_loclen_flags>>,
1238 C<L</is_utf8_fixed_width_buf_flags>>,
1239 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1240 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1241 C<L</is_strict_utf8_string_loc>>,
1242 C<L</is_strict_utf8_string_loclen>>,
1243 C<L</is_c9strict_utf8_string>>,
1244 C<L</is_c9strict_utf8_string_loc>>,
1245 and
1246 C<L</is_c9strict_utf8_string_loclen>>.
1247
1248 =cut
1249 */
1250
1251 #define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
1252
1253 /*
1254 =for apidoc is_c9strict_utf8_string
1255
1256 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1257 UTF-8-encoded string that conforms to
1258 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>;
1259 otherwise it returns FALSE. If C<len> is 0, it will be calculated using
1260 C<strlen(s)> (which means if you use this option, that C<s> can't have embedded
1261 C<NUL> characters and has to have a terminating C<NUL> byte). Note that all
1262 characters being ASCII constitute 'a valid UTF-8 string'.
1263
1264 This function returns FALSE for strings containing any code points above the
1265 Unicode max of 0x10FFFF or surrogate code points, but accepts non-character
1266 code points per
1267 L<Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1268
1269 See also
1270 C<L</is_utf8_invariant_string>>,
1271 C<L</is_utf8_invariant_string_loc>>,
1272 C<L</is_utf8_string>>,
1273 C<L</is_utf8_string_flags>>,
1274 C<L</is_utf8_string_loc>>,
1275 C<L</is_utf8_string_loc_flags>>,
1276 C<L</is_utf8_string_loclen>>,
1277 C<L</is_utf8_string_loclen_flags>>,
1278 C<L</is_utf8_fixed_width_buf_flags>>,
1279 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1280 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1281 C<L</is_strict_utf8_string>>,
1282 C<L</is_strict_utf8_string_loc>>,
1283 C<L</is_strict_utf8_string_loclen>>,
1284 C<L</is_c9strict_utf8_string_loc>>,
1285 and
1286 C<L</is_c9strict_utf8_string_loclen>>.
1287
1288 =cut
1289 */
1290
1291 #define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
1292
1293 /*
1294 =for apidoc is_utf8_string_flags
1295
1296 Returns TRUE if the first C<len> bytes of string C<s> form a valid
1297 UTF-8 string, subject to the restrictions imposed by C<flags>;
1298 returns FALSE otherwise. If C<len> is 0, it will be calculated
1299 using C<strlen(s)> (which means if you use this option, that C<s> can't have
1300 embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
1301 that all characters being ASCII constitute 'a valid UTF-8 string'.
1302
1303 If C<flags> is 0, this gives the same results as C<L</is_utf8_string>>; if
1304 C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
1305 as C<L</is_strict_utf8_string>>; and if C<flags> is
1306 C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives the same results as
1307 C<L</is_c9strict_utf8_string>>. Otherwise C<flags> may be any
1308 combination of the C<UTF8_DISALLOW_I<foo>> flags understood by
1309 C<L</utf8n_to_uvchr>>, with the same meanings.
1310
1311 See also
1312 C<L</is_utf8_invariant_string>>,
1313 C<L</is_utf8_invariant_string_loc>>,
1314 C<L</is_utf8_string>>,
1315 C<L</is_utf8_string_loc>>,
1316 C<L</is_utf8_string_loc_flags>>,
1317 C<L</is_utf8_string_loclen>>,
1318 C<L</is_utf8_string_loclen_flags>>,
1319 C<L</is_utf8_fixed_width_buf_flags>>,
1320 C<L</is_utf8_fixed_width_buf_loc_flags>>,
1321 C<L</is_utf8_fixed_width_buf_loclen_flags>>,
1322 C<L</is_strict_utf8_string>>,
1323 C<L</is_strict_utf8_string_loc>>,
1324 C<L</is_strict_utf8_string_loclen>>,
1325 C<L</is_c9strict_utf8_string>>,
1326 C<L</is_c9strict_utf8_string_loc>>,
1327 and
1328 C<L</is_c9strict_utf8_string_loclen>>.
1329
1330 =cut
1331 */
1332
1333 PERL_STATIC_INLINE bool
Perl_is_utf8_string_flags(const U8 * s,STRLEN len,const U32 flags)1334 Perl_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
1335 {
1336 const U8 * first_variant;
1337
1338 PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
1339 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1340 |UTF8_DISALLOW_PERL_EXTENDED)));
1341
1342 if (len == 0) {
1343 len = strlen((const char *)s);
1344 }
1345
1346 if (flags == 0) {
1347 return is_utf8_string(s, len);
1348 }
1349
1350 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1351 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1352 {
1353 return is_strict_utf8_string(s, len);
1354 }
1355
1356 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1357 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1358 {
1359 return is_c9strict_utf8_string(s, len);
1360 }
1361
1362 if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
1363 const U8* const send = s + len;
1364 const U8* x = first_variant;
1365
1366 while (x < send) {
1367 STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1368 if (UNLIKELY(! cur_len)) {
1369 return FALSE;
1370 }
1371 x += cur_len;
1372 }
1373 }
1374
1375 return TRUE;
1376 }
1377
1378 /*
1379
1380 =for apidoc is_utf8_string_loc
1381
1382 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1383 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1384 "utf8ness success") in the C<ep> pointer.
1385
1386 See also C<L</is_utf8_string_loclen>>.
1387
1388 =cut
1389 */
1390
1391 #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
1392
1393 /*
1394
1395 =for apidoc is_utf8_string_loclen
1396
1397 Like C<L</is_utf8_string>> but stores the location of the failure (in the
1398 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1399 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1400 encoded characters in the C<el> pointer.
1401
1402 See also C<L</is_utf8_string_loc>>.
1403
1404 =cut
1405 */
1406
1407 PERL_STATIC_INLINE bool
Perl_is_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)1408 Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1409 {
1410 const U8 * first_variant;
1411
1412 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
1413
1414 if (len == 0) {
1415 len = strlen((const char *) s);
1416 }
1417
1418 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1419 if (el)
1420 *el = len;
1421
1422 if (ep) {
1423 *ep = s + len;
1424 }
1425
1426 return TRUE;
1427 }
1428
1429 {
1430 const U8* const send = s + len;
1431 const U8* x = first_variant;
1432 STRLEN outlen = first_variant - s;
1433
1434 while (x < send) {
1435 const STRLEN cur_len = isUTF8_CHAR(x, send);
1436 if (UNLIKELY(! cur_len)) {
1437 break;
1438 }
1439 x += cur_len;
1440 outlen++;
1441 }
1442
1443 if (el)
1444 *el = outlen;
1445
1446 if (ep) {
1447 *ep = x;
1448 }
1449
1450 return (x == send);
1451 }
1452 }
1453
1454 /* The perl core arranges to never call the DFA below without there being at
1455 * least one byte available to look at. This allows the DFA to use a do {}
1456 * while loop which means that calling it with a UTF-8 invariant has a single
1457 * conditional, same as the calling code checking for invariance ahead of time.
1458 * And having the calling code remove that conditional speeds up by that
1459 * conditional, the case where it wasn't invariant. So there's no reason to
1460 * check before caling this.
1461 *
1462 * But we don't know this for non-core calls, so have to retain the check for
1463 * them. */
1464 #ifdef PERL_CORE
1465 # define PERL_NON_CORE_CHECK_EMPTY(s,e) assert((e) > (s))
1466 #else
1467 # define PERL_NON_CORE_CHECK_EMPTY(s,e) if ((e) <= (s)) return FALSE
1468 #endif
1469
1470 /*
1471 * DFA for checking input is valid UTF-8 syntax.
1472 *
1473 * This uses adaptations of the table and algorithm given in
1474 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1475 * documentation of the original version. A copyright notice for the original
1476 * version is given at the beginning of this file. The Perl adaptations are
1477 * documented at the definition of PL_extended_utf8_dfa_tab[].
1478 *
1479 * This dfa is fast. There are three exit conditions:
1480 * 1) a well-formed code point, acceptable to the table
1481 * 2) the beginning bytes of an incomplete character, whose completion might
1482 * or might not be acceptable
1483 * 3) unacceptable to the table. Some of the adaptations have certain,
1484 * hopefully less likely to occur, legal inputs be unacceptable to the
1485 * table, so these must be sorted out afterwards.
1486 *
1487 * This macro is a complete implementation of the code executing the DFA. It
1488 * is passed the input sequence bounds and the table to use, and what to do
1489 * for each of the exit conditions. There are three canned actions, likely to
1490 * be the ones you want:
1491 * DFA_RETURN_SUCCESS_
1492 * DFA_RETURN_FAILURE_
1493 * DFA_GOTO_TEASE_APART_FF_
1494 *
1495 * You pass a parameter giving the action to take for each of the three
1496 * possible exit conditions:
1497 *
1498 * 'accept_action' This is executed when the DFA accepts the input.
1499 * DFA_RETURN_SUCCESS_ is the most likely candidate.
1500 * 'reject_action' This is executed when the DFA rejects the input.
1501 * DFA_RETURN_FAILURE_ is a candidate, or 'goto label' where
1502 * you have written code to distinguish the rejecting state
1503 * results. Because it happens in several places, and
1504 * involves #ifdefs, the special action
1505 * DFA_GOTO_TEASE_APART_FF_ is what you want with
1506 * PL_extended_utf8_dfa_tab. On platforms without
1507 * EXTRA_LONG_UTF8, there is no need to tease anything apart,
1508 * so this evaluates to DFA_RETURN_FAILURE_; otherwise you
1509 * need to have a label 'tease_apart_FF' that it will transfer
1510 * to.
1511 * 'incomplete_char_action' This is executed when the DFA ran off the end
1512 * before accepting or rejecting the input.
1513 * DFA_RETURN_FAILURE_ is the likely action, but you could
1514 * have a 'goto', or NOOP. In the latter case the DFA drops
1515 * off the end, and you place your code to handle this case
1516 * immediately after it.
1517 */
1518
1519 #define DFA_RETURN_SUCCESS_ return s - s0
1520 #define DFA_RETURN_FAILURE_ return 0
1521 #ifdef HAS_EXTRA_LONG_UTF8
1522 # define DFA_TEASE_APART_FF_ goto tease_apart_FF
1523 #else
1524 # define DFA_TEASE_APART_FF_ DFA_RETURN_FAILURE_
1525 #endif
1526
1527 #define PERL_IS_UTF8_CHAR_DFA(s0, e, dfa_tab, \
1528 accept_action, \
1529 reject_action, \
1530 incomplete_char_action) \
1531 STMT_START { \
1532 const U8 * s = s0; \
1533 const U8 * e_ = e; \
1534 UV state = 0; \
1535 \
1536 PERL_NON_CORE_CHECK_EMPTY(s, e_); \
1537 \
1538 do { \
1539 state = dfa_tab[256 + state + dfa_tab[*s]]; \
1540 s++; \
1541 \
1542 if (state == 0) { /* Accepting state */ \
1543 accept_action; \
1544 } \
1545 \
1546 if (UNLIKELY(state == 1)) { /* Rejecting state */ \
1547 reject_action; \
1548 } \
1549 } while (s < e_); \
1550 \
1551 /* Here, dropped out of loop before end-of-char */ \
1552 incomplete_char_action; \
1553 } STMT_END
1554
1555
1556 /*
1557
1558 =for apidoc isUTF8_CHAR
1559
1560 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1561 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
1562 that represents some code point; otherwise it evaluates to 0. If non-zero, the
1563 value gives how many bytes starting at C<s> comprise the code point's
1564 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1565 form the first code point in C<s>, are not examined.
1566
1567 The code point can be any that will fit in an IV on this machine, using Perl's
1568 extension to official UTF-8 to represent those higher than the Unicode maximum
1569 of 0x10FFFF. That means that this macro is used to efficiently decide if the
1570 next few bytes in C<s> is legal UTF-8 for a single character.
1571
1572 Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
1573 defined by Unicode to be fully interchangeable across applications;
1574 C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1575 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1576 code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1577
1578 Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
1579 C<L</is_utf8_string_loclen>> to check entire strings.
1580
1581 Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
1582 machines) is a valid UTF-8 character.
1583
1584 =cut
1585
1586 This uses an adaptation of the table and algorithm given in
1587 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1588 documentation of the original version. A copyright notice for the original
1589 version is given at the beginning of this file. The Perl adaptation is
1590 documented at the definition of PL_extended_utf8_dfa_tab[].
1591 */
1592
1593 PERL_STATIC_INLINE Size_t
Perl_isUTF8_CHAR(const U8 * const s0,const U8 * const e)1594 Perl_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
1595 {
1596 PERL_ARGS_ASSERT_ISUTF8_CHAR;
1597
1598 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
1599 DFA_RETURN_SUCCESS_,
1600 DFA_TEASE_APART_FF_,
1601 DFA_RETURN_FAILURE_);
1602
1603 /* Here, we didn't return success, but dropped out of the loop. In the
1604 * case of PL_extended_utf8_dfa_tab, this means the input is either
1605 * malformed, or the start byte was FF on a platform that the dfa doesn't
1606 * handle FF's. Call a helper function. */
1607
1608 #ifdef HAS_EXTRA_LONG_UTF8
1609
1610 tease_apart_FF:
1611
1612 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
1613 * either malformed, or was for the largest possible start byte, which we
1614 * now check, not inline */
1615 if (*s0 != I8_TO_NATIVE_UTF8(0xFF)) {
1616 return 0;
1617 }
1618
1619 return is_utf8_FF_helper_(s0, e,
1620 FALSE /* require full, not partial char */
1621 );
1622 #endif
1623
1624 }
1625
1626 /*
1627
1628 =for apidoc isSTRICT_UTF8_CHAR
1629
1630 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1631 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1632 Unicode code point completely acceptable for open interchange between all
1633 applications; otherwise it evaluates to 0. If non-zero, the value gives how
1634 many bytes starting at C<s> comprise the code point's representation. Any
1635 bytes remaining before C<e>, but beyond the ones needed to form the first code
1636 point in C<s>, are not examined.
1637
1638 The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
1639 be a surrogate nor a non-character code point. Thus this excludes any code
1640 point from Perl's extended UTF-8.
1641
1642 This is used to efficiently decide if the next few bytes in C<s> is
1643 legal Unicode-acceptable UTF-8 for a single character.
1644
1645 Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
1646 #9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
1647 code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
1648 and C<L</isUTF8_CHAR_flags>> for a more customized definition.
1649
1650 Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
1651 C<L</is_strict_utf8_string_loclen>> to check entire strings.
1652
1653 =cut
1654
1655 This uses an adaptation of the tables and algorithm given in
1656 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1657 documentation of the original version. A copyright notice for the original
1658 version is given at the beginning of this file. The Perl adaptation is
1659 documented at the definition of strict_extended_utf8_dfa_tab[].
1660
1661 */
1662
1663 PERL_STATIC_INLINE Size_t
Perl_isSTRICT_UTF8_CHAR(const U8 * const s0,const U8 * const e)1664 Perl_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1665 {
1666 PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
1667
1668 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_strict_utf8_dfa_tab,
1669 DFA_RETURN_SUCCESS_,
1670 goto check_hanguls,
1671 DFA_RETURN_FAILURE_);
1672 check_hanguls:
1673
1674 /* Here, we didn't return success, but dropped out of the loop. In the
1675 * case of PL_strict_utf8_dfa_tab, this means the input is either
1676 * malformed, or was for certain Hanguls; handle them specially */
1677
1678 /* The dfa above drops out for incomplete or illegal inputs, and certain
1679 * legal Hanguls; check and return accordingly */
1680 return is_HANGUL_ED_utf8_safe(s0, e);
1681 }
1682
1683 /*
1684
1685 =for apidoc isC9_STRICT_UTF8_CHAR
1686
1687 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
1688 looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
1689 Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
1690 the value gives how many bytes starting at C<s> comprise the code point's
1691 representation. Any bytes remaining before C<e>, but beyond the ones needed to
1692 form the first code point in C<s>, are not examined.
1693
1694 The largest acceptable code point is the Unicode maximum 0x10FFFF. This
1695 differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
1696 code points. This corresponds to
1697 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
1698 which said that non-character code points are merely discouraged rather than
1699 completely forbidden in open interchange. See
1700 L<perlunicode/Noncharacter code points>.
1701
1702 Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
1703 C<L</isUTF8_CHAR_flags>> for a more customized definition.
1704
1705 Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
1706 C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
1707
1708 =cut
1709
1710 This uses an adaptation of the tables and algorithm given in
1711 https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
1712 documentation of the original version. A copyright notice for the original
1713 version is given at the beginning of this file. The Perl adaptation is
1714 documented at the definition of PL_c9_utf8_dfa_tab[].
1715
1716 */
1717
1718 PERL_STATIC_INLINE Size_t
Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0,const U8 * const e)1719 Perl_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
1720 {
1721 PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
1722
1723 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_c9_utf8_dfa_tab,
1724 DFA_RETURN_SUCCESS_,
1725 DFA_RETURN_FAILURE_,
1726 DFA_RETURN_FAILURE_);
1727 }
1728
1729 /*
1730
1731 =for apidoc is_strict_utf8_string_loc
1732
1733 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1734 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1735 "utf8ness success") in the C<ep> pointer.
1736
1737 See also C<L</is_strict_utf8_string_loclen>>.
1738
1739 =cut
1740 */
1741
1742 #define is_strict_utf8_string_loc(s, len, ep) \
1743 is_strict_utf8_string_loclen(s, len, ep, 0)
1744
1745 /*
1746
1747 =for apidoc is_strict_utf8_string_loclen
1748
1749 Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
1750 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1751 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1752 encoded characters in the C<el> pointer.
1753
1754 See also C<L</is_strict_utf8_string_loc>>.
1755
1756 =cut
1757 */
1758
1759 PERL_STATIC_INLINE bool
Perl_is_strict_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)1760 Perl_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1761 {
1762 const U8 * first_variant;
1763
1764 PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
1765
1766 if (len == 0) {
1767 len = strlen((const char *) s);
1768 }
1769
1770 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1771 if (el)
1772 *el = len;
1773
1774 if (ep) {
1775 *ep = s + len;
1776 }
1777
1778 return TRUE;
1779 }
1780
1781 {
1782 const U8* const send = s + len;
1783 const U8* x = first_variant;
1784 STRLEN outlen = first_variant - s;
1785
1786 while (x < send) {
1787 const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
1788 if (UNLIKELY(! cur_len)) {
1789 break;
1790 }
1791 x += cur_len;
1792 outlen++;
1793 }
1794
1795 if (el)
1796 *el = outlen;
1797
1798 if (ep) {
1799 *ep = x;
1800 }
1801
1802 return (x == send);
1803 }
1804 }
1805
1806 /*
1807
1808 =for apidoc is_c9strict_utf8_string_loc
1809
1810 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1811 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1812 "utf8ness success") in the C<ep> pointer.
1813
1814 See also C<L</is_c9strict_utf8_string_loclen>>.
1815
1816 =cut
1817 */
1818
1819 #define is_c9strict_utf8_string_loc(s, len, ep) \
1820 is_c9strict_utf8_string_loclen(s, len, ep, 0)
1821
1822 /*
1823
1824 =for apidoc is_c9strict_utf8_string_loclen
1825
1826 Like C<L</is_c9strict_utf8_string>> but stores the location of the failure (in
1827 the case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1828 "utf8ness success") in the C<ep> pointer, and the number of UTF-8 encoded
1829 characters in the C<el> pointer.
1830
1831 See also C<L</is_c9strict_utf8_string_loc>>.
1832
1833 =cut
1834 */
1835
1836 PERL_STATIC_INLINE bool
Perl_is_c9strict_utf8_string_loclen(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el)1837 Perl_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
1838 {
1839 const U8 * first_variant;
1840
1841 PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
1842
1843 if (len == 0) {
1844 len = strlen((const char *) s);
1845 }
1846
1847 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1848 if (el)
1849 *el = len;
1850
1851 if (ep) {
1852 *ep = s + len;
1853 }
1854
1855 return TRUE;
1856 }
1857
1858 {
1859 const U8* const send = s + len;
1860 const U8* x = first_variant;
1861 STRLEN outlen = first_variant - s;
1862
1863 while (x < send) {
1864 const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
1865 if (UNLIKELY(! cur_len)) {
1866 break;
1867 }
1868 x += cur_len;
1869 outlen++;
1870 }
1871
1872 if (el)
1873 *el = outlen;
1874
1875 if (ep) {
1876 *ep = x;
1877 }
1878
1879 return (x == send);
1880 }
1881 }
1882
1883 /*
1884
1885 =for apidoc is_utf8_string_loc_flags
1886
1887 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1888 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1889 "utf8ness success") in the C<ep> pointer.
1890
1891 See also C<L</is_utf8_string_loclen_flags>>.
1892
1893 =cut
1894 */
1895
1896 #define is_utf8_string_loc_flags(s, len, ep, flags) \
1897 is_utf8_string_loclen_flags(s, len, ep, 0, flags)
1898
1899
1900 /* The above 3 actual functions could have been moved into the more general one
1901 * just below, and made #defines that call it with the right 'flags'. They are
1902 * currently kept separate to increase their chances of getting inlined */
1903
1904 /*
1905
1906 =for apidoc is_utf8_string_loclen_flags
1907
1908 Like C<L</is_utf8_string_flags>> but stores the location of the failure (in the
1909 case of "utf8ness failure") or the location C<s>+C<len> (in the case of
1910 "utf8ness success") in the C<ep> pointer, and the number of UTF-8
1911 encoded characters in the C<el> pointer.
1912
1913 See also C<L</is_utf8_string_loc_flags>>.
1914
1915 =cut
1916 */
1917
1918 PERL_STATIC_INLINE bool
Perl_is_utf8_string_loclen_flags(const U8 * s,STRLEN len,const U8 ** ep,STRLEN * el,const U32 flags)1919 Perl_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
1920 {
1921 const U8 * first_variant;
1922
1923 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
1924 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
1925 |UTF8_DISALLOW_PERL_EXTENDED)));
1926
1927 if (len == 0) {
1928 len = strlen((const char *) s);
1929 }
1930
1931 if (flags == 0) {
1932 return is_utf8_string_loclen(s, len, ep, el);
1933 }
1934
1935 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1936 == UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
1937 {
1938 return is_strict_utf8_string_loclen(s, len, ep, el);
1939 }
1940
1941 if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
1942 == UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
1943 {
1944 return is_c9strict_utf8_string_loclen(s, len, ep, el);
1945 }
1946
1947 if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
1948 if (el)
1949 *el = len;
1950
1951 if (ep) {
1952 *ep = s + len;
1953 }
1954
1955 return TRUE;
1956 }
1957
1958 {
1959 const U8* send = s + len;
1960 const U8* x = first_variant;
1961 STRLEN outlen = first_variant - s;
1962
1963 while (x < send) {
1964 const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
1965 if (UNLIKELY(! cur_len)) {
1966 break;
1967 }
1968 x += cur_len;
1969 outlen++;
1970 }
1971
1972 if (el)
1973 *el = outlen;
1974
1975 if (ep) {
1976 *ep = x;
1977 }
1978
1979 return (x == send);
1980 }
1981 }
1982
1983 /*
1984 =for apidoc utf8_distance
1985
1986 Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
1987 and C<b>.
1988
1989 WARNING: use only if you *know* that the pointers point inside the
1990 same UTF-8 buffer.
1991
1992 =cut
1993 */
1994
1995 PERL_STATIC_INLINE IV
Perl_utf8_distance(pTHX_ const U8 * a,const U8 * b)1996 Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
1997 {
1998 PERL_ARGS_ASSERT_UTF8_DISTANCE;
1999
2000 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
2001 }
2002
2003 /*
2004 =for apidoc utf8_hop
2005
2006 Return the UTF-8 pointer C<s> displaced by C<off> characters, either
2007 forward (if C<off> is positive) or backward (if negative). C<s> does not need
2008 to be pointing to the starting byte of a character. If it isn't, one count of
2009 C<off> will be used up to get to the start of the next character for forward
2010 hops, and to the start of the current character for negative ones.
2011
2012 WARNING: Prefer L</utf8_hop_safe> to this one.
2013
2014 Do NOT use this function unless you B<know> C<off> is within
2015 the UTF-8 data pointed to by C<s> B<and> that on entry C<s> is aligned
2016 on the first byte of a character or just after the last byte of a character.
2017
2018 =cut
2019 */
2020
2021 PERL_STATIC_INLINE U8 *
Perl_utf8_hop(const U8 * s,SSize_t off)2022 Perl_utf8_hop(const U8 *s, SSize_t off)
2023 {
2024 PERL_ARGS_ASSERT_UTF8_HOP;
2025
2026 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2027 * the XXX bitops (especially ~) can create illegal UTF-8.
2028 * In other words: in Perl UTF-8 is not just for Unicode. */
2029
2030 if (off > 0) {
2031
2032 /* Get to next non-continuation byte */
2033 if (UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2034 do {
2035 s++;
2036 }
2037 while (UTF8_IS_CONTINUATION(*s));
2038 off--;
2039 }
2040
2041 while (off--)
2042 s += UTF8SKIP(s);
2043 }
2044 else {
2045 while (off++) {
2046 s--;
2047 while (UTF8_IS_CONTINUATION(*s))
2048 s--;
2049 }
2050 }
2051
2052 GCC_DIAG_IGNORE(-Wcast-qual)
2053 return (U8 *)s;
2054 GCC_DIAG_RESTORE
2055 }
2056
2057 /*
2058 =for apidoc utf8_hop_forward
2059
2060 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2061 forward. C<s> does not need to be pointing to the starting byte of a
2062 character. If it isn't, one count of C<off> will be used up to get to the
2063 start of the next character.
2064
2065 C<off> must be non-negative.
2066
2067 C<s> must be before or equal to C<end>.
2068
2069 When moving forward it will not move beyond C<end>.
2070
2071 Will not exceed this limit even if the string is not valid "UTF-8".
2072
2073 =cut
2074 */
2075
2076 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_forward(const U8 * s,SSize_t off,const U8 * end)2077 Perl_utf8_hop_forward(const U8 *s, SSize_t off, const U8 *end)
2078 {
2079 PERL_ARGS_ASSERT_UTF8_HOP_FORWARD;
2080
2081 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2082 * the bitops (especially ~) can create illegal UTF-8.
2083 * In other words: in Perl UTF-8 is not just for Unicode. */
2084
2085 assert(s <= end);
2086 assert(off >= 0);
2087
2088 if (off && UNLIKELY(UTF8_IS_CONTINUATION(*s))) {
2089 /* Get to next non-continuation byte */
2090 do {
2091 s++;
2092 }
2093 while (UTF8_IS_CONTINUATION(*s));
2094 off--;
2095 }
2096
2097 while (off--) {
2098 STRLEN skip = UTF8SKIP(s);
2099 if ((STRLEN)(end - s) <= skip) {
2100 GCC_DIAG_IGNORE(-Wcast-qual)
2101 return (U8 *)end;
2102 GCC_DIAG_RESTORE
2103 }
2104 s += skip;
2105 }
2106
2107 GCC_DIAG_IGNORE(-Wcast-qual)
2108 return (U8 *)s;
2109 GCC_DIAG_RESTORE
2110 }
2111
2112 /*
2113 =for apidoc utf8_hop_back
2114
2115 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2116 backward. C<s> does not need to be pointing to the starting byte of a
2117 character. If it isn't, one count of C<off> will be used up to get to that
2118 start.
2119
2120 C<off> must be non-positive.
2121
2122 C<s> must be after or equal to C<start>.
2123
2124 When moving backward it will not move before C<start>.
2125
2126 Will not exceed this limit even if the string is not valid "UTF-8".
2127
2128 =cut
2129 */
2130
2131 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_back(const U8 * s,SSize_t off,const U8 * start)2132 Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
2133 {
2134 PERL_ARGS_ASSERT_UTF8_HOP_BACK;
2135
2136 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2137 * the bitops (especially ~) can create illegal UTF-8.
2138 * In other words: in Perl UTF-8 is not just for Unicode. */
2139
2140 assert(start <= s);
2141 assert(off <= 0);
2142
2143 /* Note: if we know that the input is well-formed, we can do per-word
2144 * hop-back. Commit d6ad3b72778369a84a215b498d8d60d5b03aa1af implemented
2145 * that. But it was reverted because doing per-word has some
2146 * start-up/tear-down overhead, so only makes sense if the distance to be
2147 * moved is large, and core perl doesn't currently move more than a few
2148 * characters at a time. You can reinstate it if it does become
2149 * advantageous. */
2150 while (off++ && s > start) {
2151 do {
2152 s--;
2153 } while (UTF8_IS_CONTINUATION(*s) && s > start);
2154 }
2155
2156 GCC_DIAG_IGNORE(-Wcast-qual)
2157 return (U8 *)s;
2158 GCC_DIAG_RESTORE
2159 }
2160
2161 /*
2162 =for apidoc utf8_hop_safe
2163
2164 Return the UTF-8 pointer C<s> displaced by up to C<off> characters,
2165 either forward or backward. C<s> does not need to be pointing to the starting
2166 byte of a character. If it isn't, one count of C<off> will be used up to get
2167 to the start of the next character for forward hops, and to the start of the
2168 current character for negative ones.
2169
2170 When moving backward it will not move before C<start>.
2171
2172 When moving forward it will not move beyond C<end>.
2173
2174 Will not exceed those limits even if the string is not valid "UTF-8".
2175
2176 =cut
2177 */
2178
2179 PERL_STATIC_INLINE U8 *
Perl_utf8_hop_safe(const U8 * s,SSize_t off,const U8 * start,const U8 * end)2180 Perl_utf8_hop_safe(const U8 *s, SSize_t off, const U8 *start, const U8 *end)
2181 {
2182 PERL_ARGS_ASSERT_UTF8_HOP_SAFE;
2183
2184 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
2185 * the bitops (especially ~) can create illegal UTF-8.
2186 * In other words: in Perl UTF-8 is not just for Unicode. */
2187
2188 assert(start <= s && s <= end);
2189
2190 if (off >= 0) {
2191 return utf8_hop_forward(s, off, end);
2192 }
2193 else {
2194 return utf8_hop_back(s, off, start);
2195 }
2196 }
2197
2198 /*
2199
2200 =for apidoc isUTF8_CHAR_flags
2201
2202 Evaluates to non-zero if the first few bytes of the string starting at C<s> and
2203 looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
2204 that represents some code point, subject to the restrictions given by C<flags>;
2205 otherwise it evaluates to 0. If non-zero, the value gives how many bytes
2206 starting at C<s> comprise the code point's representation. Any bytes remaining
2207 before C<e>, but beyond the ones needed to form the first code point in C<s>,
2208 are not examined.
2209
2210 If C<flags> is 0, this gives the same results as C<L</isUTF8_CHAR>>;
2211 if C<flags> is C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, this gives the same results
2212 as C<L</isSTRICT_UTF8_CHAR>>;
2213 and if C<flags> is C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE>, this gives
2214 the same results as C<L</isC9_STRICT_UTF8_CHAR>>.
2215 Otherwise C<flags> may be any combination of the C<UTF8_DISALLOW_I<foo>> flags
2216 understood by C<L</utf8n_to_uvchr>>, with the same meanings.
2217
2218 The three alternative macros are for the most commonly needed validations; they
2219 are likely to run somewhat faster than this more general one, as they can be
2220 inlined into your code.
2221
2222 Use L</is_utf8_string_flags>, L</is_utf8_string_loc_flags>, and
2223 L</is_utf8_string_loclen_flags> to check entire strings.
2224
2225 =cut
2226 */
2227
2228 PERL_STATIC_INLINE STRLEN
Perl_isUTF8_CHAR_flags(const U8 * const s0,const U8 * const e,const U32 flags)2229 Perl_isUTF8_CHAR_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2230 {
2231 PERL_ARGS_ASSERT_ISUTF8_CHAR_FLAGS;
2232 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2233 |UTF8_DISALLOW_PERL_EXTENDED)));
2234
2235 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2236 goto check_success,
2237 DFA_TEASE_APART_FF_,
2238 DFA_RETURN_FAILURE_);
2239
2240 check_success:
2241
2242 return is_utf8_char_helper_(s0, e, flags);
2243
2244 #ifdef HAS_EXTRA_LONG_UTF8
2245
2246 tease_apart_FF:
2247
2248 /* In the case of PL_extended_utf8_dfa_tab, getting here means the input is
2249 * either malformed, or was for the largest possible start byte, which
2250 * indicates perl extended UTF-8, well above the Unicode maximum */
2251 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2252 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2253 {
2254 return 0;
2255 }
2256
2257 /* Otherwise examine the sequence not inline */
2258 return is_utf8_FF_helper_(s0, e,
2259 FALSE /* require full, not partial char */
2260 );
2261 #endif
2262
2263 }
2264
2265 /*
2266
2267 =for apidoc is_utf8_valid_partial_char
2268
2269 Returns 0 if the sequence of bytes starting at C<s> and looking no further than
2270 S<C<e - 1>> is the UTF-8 encoding, as extended by Perl, for one or more code
2271 points. Otherwise, it returns 1 if there exists at least one non-empty
2272 sequence of bytes that when appended to sequence C<s>, starting at position
2273 C<e> causes the entire sequence to be the well-formed UTF-8 of some code point;
2274 otherwise returns 0.
2275
2276 In other words this returns TRUE if C<s> points to a partial UTF-8-encoded code
2277 point.
2278
2279 This is useful when a fixed-length buffer is being tested for being well-formed
2280 UTF-8, but the final few bytes in it don't comprise a full character; that is,
2281 it is split somewhere in the middle of the final code point's UTF-8
2282 representation. (Presumably when the buffer is refreshed with the next chunk
2283 of data, the new first bytes will complete the partial code point.) This
2284 function is used to verify that the final bytes in the current buffer are in
2285 fact the legal beginning of some code point, so that if they aren't, the
2286 failure can be signalled without having to wait for the next read.
2287
2288 =cut
2289 */
2290 #define is_utf8_valid_partial_char(s, e) \
2291 is_utf8_valid_partial_char_flags(s, e, 0)
2292
2293 /*
2294
2295 =for apidoc is_utf8_valid_partial_char_flags
2296
2297 Like C<L</is_utf8_valid_partial_char>>, it returns a boolean giving whether
2298 or not the input is a valid UTF-8 encoded partial character, but it takes an
2299 extra parameter, C<flags>, which can further restrict which code points are
2300 considered valid.
2301
2302 If C<flags> is 0, this behaves identically to
2303 C<L</is_utf8_valid_partial_char>>. Otherwise C<flags> can be any combination
2304 of the C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>. If
2305 there is any sequence of bytes that can complete the input partial character in
2306 such a way that a non-prohibited character is formed, the function returns
2307 TRUE; otherwise FALSE. Non character code points cannot be determined based on
2308 partial character input. But many of the other possible excluded types can be
2309 determined from just the first one or two bytes.
2310
2311 =cut
2312 */
2313
2314 PERL_STATIC_INLINE bool
Perl_is_utf8_valid_partial_char_flags(const U8 * const s0,const U8 * const e,const U32 flags)2315 Perl_is_utf8_valid_partial_char_flags(const U8 * const s0, const U8 * const e, const U32 flags)
2316 {
2317 PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
2318 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
2319 |UTF8_DISALLOW_PERL_EXTENDED)));
2320
2321 PERL_IS_UTF8_CHAR_DFA(s0, e, PL_extended_utf8_dfa_tab,
2322 DFA_RETURN_FAILURE_,
2323 DFA_TEASE_APART_FF_,
2324 NOOP);
2325
2326 /* The NOOP above causes the DFA to drop down here iff the input was a
2327 * partial character. flags=0 => can return TRUE immediately; otherwise we
2328 * need to check (not inline) if the partial character is the beginning of
2329 * a disallowed one */
2330 if (flags == 0) {
2331 return TRUE;
2332 }
2333
2334 return cBOOL(is_utf8_char_helper_(s0, e, flags));
2335
2336 #ifdef HAS_EXTRA_LONG_UTF8
2337
2338 tease_apart_FF:
2339
2340 /* Getting here means the input is either malformed, or, in the case of
2341 * PL_extended_utf8_dfa_tab, was for the largest possible start byte. The
2342 * latter case has to be extended UTF-8, so can fail immediately if that is
2343 * forbidden */
2344
2345 if ( *s0 != I8_TO_NATIVE_UTF8(0xFF)
2346 || (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_PERL_EXTENDED)))
2347 {
2348 return 0;
2349 }
2350
2351 return is_utf8_FF_helper_(s0, e,
2352 TRUE /* Require to be a partial character */
2353 );
2354 #endif
2355
2356 }
2357
2358 /*
2359
2360 =for apidoc is_utf8_fixed_width_buf_flags
2361
2362 Returns TRUE if the fixed-width buffer starting at C<s> with length C<len>
2363 is entirely valid UTF-8, subject to the restrictions given by C<flags>;
2364 otherwise it returns FALSE.
2365
2366 If C<flags> is 0, any well-formed UTF-8, as extended by Perl, is accepted
2367 without restriction. If the final few bytes of the buffer do not form a
2368 complete code point, this will return TRUE anyway, provided that
2369 C<L</is_utf8_valid_partial_char_flags>> returns TRUE for them.
2370
2371 If C<flags> in non-zero, it can be any combination of the
2372 C<UTF8_DISALLOW_I<foo>> flags accepted by C<L</utf8n_to_uvchr>>, and with the
2373 same meanings.
2374
2375 This function differs from C<L</is_utf8_string_flags>> only in that the latter
2376 returns FALSE if the final few bytes of the string don't form a complete code
2377 point.
2378
2379 =cut
2380 */
2381 #define is_utf8_fixed_width_buf_flags(s, len, flags) \
2382 is_utf8_fixed_width_buf_loclen_flags(s, len, 0, 0, flags)
2383
2384 /*
2385
2386 =for apidoc is_utf8_fixed_width_buf_loc_flags
2387
2388 Like C<L</is_utf8_fixed_width_buf_flags>> but stores the location of the
2389 failure in the C<ep> pointer. If the function returns TRUE, C<*ep> will point
2390 to the beginning of any partial character at the end of the buffer; if there is
2391 no partial character C<*ep> will contain C<s>+C<len>.
2392
2393 See also C<L</is_utf8_fixed_width_buf_loclen_flags>>.
2394
2395 =cut
2396 */
2397
2398 #define is_utf8_fixed_width_buf_loc_flags(s, len, loc, flags) \
2399 is_utf8_fixed_width_buf_loclen_flags(s, len, loc, 0, flags)
2400
2401 /*
2402
2403 =for apidoc is_utf8_fixed_width_buf_loclen_flags
2404
2405 Like C<L</is_utf8_fixed_width_buf_loc_flags>> but stores the number of
2406 complete, valid characters found in the C<el> pointer.
2407
2408 =cut
2409 */
2410
2411 PERL_STATIC_INLINE bool
Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,STRLEN len,const U8 ** ep,STRLEN * el,const U32 flags)2412 Perl_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
2413 STRLEN len,
2414 const U8 **ep,
2415 STRLEN *el,
2416 const U32 flags)
2417 {
2418 const U8 * maybe_partial;
2419
2420 PERL_ARGS_ASSERT_IS_UTF8_FIXED_WIDTH_BUF_LOCLEN_FLAGS;
2421
2422 if (! ep) {
2423 ep = &maybe_partial;
2424 }
2425
2426 /* If it's entirely valid, return that; otherwise see if the only error is
2427 * that the final few bytes are for a partial character */
2428 return is_utf8_string_loclen_flags(s, len, ep, el, flags)
2429 || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
2430 }
2431
2432 PERL_STATIC_INLINE UV
Perl_utf8n_to_uvchr_msgs(const U8 * s,STRLEN curlen,STRLEN * retlen,const U32 flags,U32 * errors,AV ** msgs)2433 Perl_utf8n_to_uvchr_msgs(const U8 *s,
2434 STRLEN curlen,
2435 STRLEN *retlen,
2436 const U32 flags,
2437 U32 * errors,
2438 AV ** msgs)
2439 {
2440 /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
2441 * simple cases, and, if necessary calls a helper function to deal with the
2442 * more complex ones. Almost all well-formed non-problematic code points
2443 * are considered simple, so that it's unlikely that the helper function
2444 * will need to be called.
2445 *
2446 * This is an adaptation of the tables and algorithm given in
2447 * https://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
2448 * comprehensive documentation of the original version. A copyright notice
2449 * for the original version is given at the beginning of this file. The
2450 * Perl adaptation is documented at the definition of PL_strict_utf8_dfa_tab[].
2451 */
2452
2453 const U8 * const s0 = s;
2454 const U8 * send = s0 + curlen;
2455 UV type;
2456 UV uv;
2457
2458 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
2459
2460 /* This dfa is fast. If it accepts the input, it was for a well-formed,
2461 * non-problematic code point, which can be returned immediately.
2462 * Otherwise we call a helper function to figure out the more complicated
2463 * cases. */
2464
2465 /* No calls from core pass in an empty string; non-core need a check */
2466 #ifdef PERL_CORE
2467 assert(curlen > 0);
2468 #else
2469 if (curlen == 0) return _utf8n_to_uvchr_msgs_helper(s0, 0, retlen,
2470 flags, errors, msgs);
2471 #endif
2472
2473 type = PL_strict_utf8_dfa_tab[*s];
2474
2475 /* The table is structured so that 'type' is 0 iff the input byte is
2476 * represented identically regardless of the UTF-8ness of the string */
2477 if (type == 0) { /* UTF-8 invariants are returned unchanged */
2478 uv = *s;
2479 }
2480 else {
2481 UV state = PL_strict_utf8_dfa_tab[256 + type];
2482 uv = (0xff >> type) & NATIVE_UTF8_TO_I8(*s);
2483
2484 while (++s < send) {
2485 type = PL_strict_utf8_dfa_tab[*s];
2486 state = PL_strict_utf8_dfa_tab[256 + state + type];
2487
2488 uv = UTF8_ACCUMULATE(uv, *s);
2489
2490 if (state == 0) {
2491 #ifdef EBCDIC
2492 uv = UNI_TO_NATIVE(uv);
2493 #endif
2494 goto success;
2495 }
2496
2497 if (UNLIKELY(state == 1)) {
2498 break;
2499 }
2500 }
2501
2502 /* Here is potentially problematic. Use the full mechanism */
2503 return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags,
2504 errors, msgs);
2505 }
2506
2507 success:
2508 if (retlen) {
2509 *retlen = s - s0 + 1;
2510 }
2511 if (errors) {
2512 *errors = 0;
2513 }
2514 if (msgs) {
2515 *msgs = NULL;
2516 }
2517
2518 return uv;
2519 }
2520
2521 PERL_STATIC_INLINE UV
Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 * s,const U8 * send,STRLEN * retlen)2522 Perl_utf8_to_uvchr_buf_helper(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
2523 {
2524 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF_HELPER;
2525
2526 assert(s < send);
2527
2528 if (! ckWARN_d(WARN_UTF8)) {
2529
2530 /* EMPTY is not really allowed, and asserts on debugging builds. But
2531 * on non-debugging we have to deal with it, and this causes it to
2532 * return the REPLACEMENT CHARACTER, as the documentation indicates */
2533 return utf8n_to_uvchr(s, send - s, retlen,
2534 (UTF8_ALLOW_ANY | UTF8_ALLOW_EMPTY));
2535 }
2536 else {
2537 UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
2538 if (retlen && ret == 0 && (send <= s || *s != '\0')) {
2539 *retlen = (STRLEN) -1;
2540 }
2541
2542 return ret;
2543 }
2544 }
2545
2546 /* ------------------------------- perl.h ----------------------------- */
2547
2548 /*
2549 =for apidoc_section $utility
2550
2551 =for apidoc is_safe_syscall
2552
2553 Test that the given C<pv> (with length C<len>) doesn't contain any internal
2554 C<NUL> characters.
2555 If it does, set C<errno> to C<ENOENT>, optionally warn using the C<syscalls>
2556 category, and return FALSE.
2557
2558 Return TRUE if the name is safe.
2559
2560 C<what> and C<op_name> are used in any warning.
2561
2562 Used by the C<IS_SAFE_SYSCALL()> macro.
2563
2564 =cut
2565 */
2566
2567 PERL_STATIC_INLINE bool
Perl_is_safe_syscall(pTHX_ const char * pv,STRLEN len,const char * what,const char * op_name)2568 Perl_is_safe_syscall(pTHX_ const char *pv, STRLEN len, const char *what, const char *op_name)
2569 {
2570 /* While the Windows CE API provides only UCS-16 (or UTF-16) APIs
2571 * perl itself uses xce*() functions which accept 8-bit strings.
2572 */
2573
2574 PERL_ARGS_ASSERT_IS_SAFE_SYSCALL;
2575
2576 if (len > 1) {
2577 char *null_at;
2578 if (UNLIKELY((null_at = (char *)memchr(pv, 0, len-1)) != NULL)) {
2579 SETERRNO(ENOENT, LIB_INVARG);
2580 Perl_ck_warner(aTHX_ packWARN(WARN_SYSCALLS),
2581 "Invalid \\0 character in %s for %s: %s\\0%s",
2582 what, op_name, pv, null_at+1);
2583 return FALSE;
2584 }
2585 }
2586
2587 return TRUE;
2588 }
2589
2590 /*
2591
2592 Return true if the supplied filename has a newline character
2593 immediately before the first (hopefully only) NUL.
2594
2595 My original look at this incorrectly used the len from SvPV(), but
2596 that's incorrect, since we allow for a NUL in pv[len-1].
2597
2598 So instead, strlen() and work from there.
2599
2600 This allow for the user reading a filename, forgetting to chomp it,
2601 then calling:
2602
2603 open my $foo, "$file\0";
2604
2605 */
2606
2607 #ifdef PERL_CORE
2608
2609 PERL_STATIC_INLINE bool
S_should_warn_nl(const char * pv)2610 S_should_warn_nl(const char *pv)
2611 {
2612 STRLEN len;
2613
2614 PERL_ARGS_ASSERT_SHOULD_WARN_NL;
2615
2616 len = strlen(pv);
2617
2618 return len > 0 && pv[len-1] == '\n';
2619 }
2620
2621 #endif
2622
2623 #if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
2624
2625 PERL_STATIC_INLINE bool
S_lossless_NV_to_IV(const NV nv,IV * ivp)2626 S_lossless_NV_to_IV(const NV nv, IV *ivp)
2627 {
2628 /* This function determines if the input NV 'nv' may be converted without
2629 * loss of data to an IV. If not, it returns FALSE taking no other action.
2630 * But if it is possible, it does the conversion, returning TRUE, and
2631 * storing the converted result in '*ivp' */
2632
2633 PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
2634
2635 # if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
2636 /* Normally any comparison with a NaN returns false; if we can't rely
2637 * on that behaviour, check explicitly */
2638 if (UNLIKELY(Perl_isnan(nv))) {
2639 return FALSE;
2640 }
2641 # endif
2642
2643 /* Written this way so that with an always-false NaN comparison we
2644 * return false */
2645 if (!(LIKELY(nv >= (NV) IV_MIN) && LIKELY(nv < IV_MAX_P1))) {
2646 return FALSE;
2647 }
2648
2649 if ((IV) nv != nv) {
2650 return FALSE;
2651 }
2652
2653 *ivp = (IV) nv;
2654 return TRUE;
2655 }
2656
2657 #endif
2658
2659 /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
2660
2661 #if defined(PERL_IN_PP_C) || defined(PERL_IN_REGCOMP_ANY) || defined(PERL_IN_TOKE_C) || defined(PERL_IN_UNIVERSAL_C)
2662
2663 #define MAX_CHARSET_NAME_LENGTH 2
2664
2665 PERL_STATIC_INLINE const char *
S_get_regex_charset_name(const U32 flags,STRLEN * const lenp)2666 S_get_regex_charset_name(const U32 flags, STRLEN* const lenp)
2667 {
2668 PERL_ARGS_ASSERT_GET_REGEX_CHARSET_NAME;
2669
2670 /* Returns a string that corresponds to the name of the regex character set
2671 * given by 'flags', and *lenp is set the length of that string, which
2672 * cannot exceed MAX_CHARSET_NAME_LENGTH characters */
2673
2674 *lenp = 1;
2675 switch (get_regex_charset(flags)) {
2676 case REGEX_DEPENDS_CHARSET: return DEPENDS_PAT_MODS;
2677 case REGEX_LOCALE_CHARSET: return LOCALE_PAT_MODS;
2678 case REGEX_UNICODE_CHARSET: return UNICODE_PAT_MODS;
2679 case REGEX_ASCII_RESTRICTED_CHARSET: return ASCII_RESTRICT_PAT_MODS;
2680 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
2681 *lenp = 2;
2682 return ASCII_MORE_RESTRICT_PAT_MODS;
2683 }
2684 /* The NOT_REACHED; hides an assert() which has a rather complex
2685 * definition in perl.h. */
2686 NOT_REACHED; /* NOTREACHED */
2687 return "?"; /* Unknown */
2688 }
2689
2690 #endif
2691
2692 /*
2693
2694 Return false if any get magic is on the SV other than taint magic.
2695
2696 */
2697
2698 PERL_STATIC_INLINE bool
Perl_sv_only_taint_gmagic(SV * sv)2699 Perl_sv_only_taint_gmagic(SV *sv)
2700 {
2701 MAGIC *mg = SvMAGIC(sv);
2702
2703 PERL_ARGS_ASSERT_SV_ONLY_TAINT_GMAGIC;
2704
2705 while (mg) {
2706 if (mg->mg_type != PERL_MAGIC_taint
2707 && !(mg->mg_flags & MGf_GSKIP)
2708 && mg->mg_virtual->svt_get) {
2709 return FALSE;
2710 }
2711 mg = mg->mg_moremagic;
2712 }
2713
2714 return TRUE;
2715 }
2716
2717 /* ------------------ cop.h ------------------------------------------- */
2718
2719 /* implement GIMME_V() macro */
2720
2721 PERL_STATIC_INLINE U8
Perl_gimme_V(pTHX)2722 Perl_gimme_V(pTHX)
2723 {
2724 I32 cxix;
2725 U8 gimme = (PL_op->op_flags & OPf_WANT);
2726
2727 if (gimme)
2728 return gimme;
2729 cxix = PL_curstackinfo->si_cxsubix;
2730 if (cxix < 0)
2731 return PL_curstackinfo->si_type == PERLSI_SORT ? G_SCALAR: G_VOID;
2732 assert(cxstack[cxix].blk_gimme & G_WANT);
2733 return (cxstack[cxix].blk_gimme & G_WANT);
2734 }
2735
2736
2737 /* Enter a block. Push a new base context and return its address. */
2738
2739 PERL_STATIC_INLINE PERL_CONTEXT *
Perl_cx_pushblock(pTHX_ U8 type,U8 gimme,SV ** sp,I32 saveix)2740 Perl_cx_pushblock(pTHX_ U8 type, U8 gimme, SV** sp, I32 saveix)
2741 {
2742 PERL_CONTEXT * cx;
2743
2744 PERL_ARGS_ASSERT_CX_PUSHBLOCK;
2745
2746 CXINC;
2747 cx = CX_CUR();
2748 cx->cx_type = type;
2749 cx->blk_gimme = gimme;
2750 cx->blk_oldsaveix = saveix;
2751 cx->blk_oldsp = (I32)(sp - PL_stack_base);
2752 cx->blk_oldcop = PL_curcop;
2753 cx->blk_oldmarksp = (I32)(PL_markstack_ptr - PL_markstack);
2754 cx->blk_oldscopesp = PL_scopestack_ix;
2755 cx->blk_oldpm = PL_curpm;
2756 cx->blk_old_tmpsfloor = PL_tmps_floor;
2757
2758 PL_tmps_floor = PL_tmps_ix;
2759 CX_DEBUG(cx, "PUSH");
2760 return cx;
2761 }
2762
2763
2764 /* Exit a block (RETURN and LAST). */
2765
2766 PERL_STATIC_INLINE void
Perl_cx_popblock(pTHX_ PERL_CONTEXT * cx)2767 Perl_cx_popblock(pTHX_ PERL_CONTEXT *cx)
2768 {
2769 PERL_ARGS_ASSERT_CX_POPBLOCK;
2770
2771 CX_DEBUG(cx, "POP");
2772 /* these 3 are common to cx_popblock and cx_topblock */
2773 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2774 PL_scopestack_ix = cx->blk_oldscopesp;
2775 PL_curpm = cx->blk_oldpm;
2776
2777 /* LEAVE_SCOPE() should have made this true. /(?{})/ cheats
2778 * and leaves a CX entry lying around for repeated use, so
2779 * skip for multicall */ \
2780 assert( (CxTYPE(cx) == CXt_SUB && CxMULTICALL(cx))
2781 || PL_savestack_ix == cx->blk_oldsaveix);
2782 PL_curcop = cx->blk_oldcop;
2783 PL_tmps_floor = cx->blk_old_tmpsfloor;
2784 }
2785
2786 /* Continue a block elsewhere (e.g. NEXT, REDO, GOTO).
2787 * Whereas cx_popblock() restores the state to the point just before
2788 * cx_pushblock() was called, cx_topblock() restores it to the point just
2789 * *after* cx_pushblock() was called. */
2790
2791 PERL_STATIC_INLINE void
Perl_cx_topblock(pTHX_ PERL_CONTEXT * cx)2792 Perl_cx_topblock(pTHX_ PERL_CONTEXT *cx)
2793 {
2794 PERL_ARGS_ASSERT_CX_TOPBLOCK;
2795
2796 CX_DEBUG(cx, "TOP");
2797 /* these 3 are common to cx_popblock and cx_topblock */
2798 PL_markstack_ptr = PL_markstack + cx->blk_oldmarksp;
2799 PL_scopestack_ix = cx->blk_oldscopesp;
2800 PL_curpm = cx->blk_oldpm;
2801
2802 PL_stack_sp = PL_stack_base + cx->blk_oldsp;
2803 }
2804
2805
2806 PERL_STATIC_INLINE void
Perl_cx_pushsub(pTHX_ PERL_CONTEXT * cx,CV * cv,OP * retop,bool hasargs)2807 Perl_cx_pushsub(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, bool hasargs)
2808 {
2809 U8 phlags = CX_PUSHSUB_GET_LVALUE_MASK(Perl_was_lvalue_sub);
2810
2811 PERL_ARGS_ASSERT_CX_PUSHSUB;
2812
2813 PERL_DTRACE_PROBE_ENTRY(cv);
2814 cx->blk_sub.old_cxsubix = PL_curstackinfo->si_cxsubix;
2815 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2816 cx->blk_sub.cv = cv;
2817 cx->blk_sub.olddepth = CvDEPTH(cv);
2818 cx->blk_sub.prevcomppad = PL_comppad;
2819 cx->cx_type |= (hasargs) ? CXp_HASARGS : 0;
2820 cx->blk_sub.retop = retop;
2821 SvREFCNT_inc_simple_void_NN(cv);
2822 cx->blk_u16 = PL_op->op_private & (phlags|OPpDEREF);
2823 }
2824
2825
2826 /* subsets of cx_popsub() */
2827
2828 PERL_STATIC_INLINE void
Perl_cx_popsub_common(pTHX_ PERL_CONTEXT * cx)2829 Perl_cx_popsub_common(pTHX_ PERL_CONTEXT *cx)
2830 {
2831 CV *cv;
2832
2833 PERL_ARGS_ASSERT_CX_POPSUB_COMMON;
2834 assert(CxTYPE(cx) == CXt_SUB);
2835
2836 PL_comppad = cx->blk_sub.prevcomppad;
2837 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2838 cv = cx->blk_sub.cv;
2839 CvDEPTH(cv) = cx->blk_sub.olddepth;
2840 cx->blk_sub.cv = NULL;
2841 SvREFCNT_dec(cv);
2842 PL_curstackinfo->si_cxsubix = cx->blk_sub.old_cxsubix;
2843 }
2844
2845
2846 /* handle the @_ part of leaving a sub */
2847
2848 PERL_STATIC_INLINE void
Perl_cx_popsub_args(pTHX_ PERL_CONTEXT * cx)2849 Perl_cx_popsub_args(pTHX_ PERL_CONTEXT *cx)
2850 {
2851 AV *av;
2852
2853 PERL_ARGS_ASSERT_CX_POPSUB_ARGS;
2854 assert(CxTYPE(cx) == CXt_SUB);
2855 assert(AvARRAY(MUTABLE_AV(
2856 PadlistARRAY(CvPADLIST(cx->blk_sub.cv))[
2857 CvDEPTH(cx->blk_sub.cv)])) == PL_curpad);
2858
2859 CX_POP_SAVEARRAY(cx);
2860 av = MUTABLE_AV(PAD_SVl(0));
2861 if (UNLIKELY(AvREAL(av)))
2862 /* abandon @_ if it got reified */
2863 clear_defarray(av, 0);
2864 else {
2865 CLEAR_ARGARRAY(av);
2866 }
2867 }
2868
2869
2870 PERL_STATIC_INLINE void
Perl_cx_popsub(pTHX_ PERL_CONTEXT * cx)2871 Perl_cx_popsub(pTHX_ PERL_CONTEXT *cx)
2872 {
2873 PERL_ARGS_ASSERT_CX_POPSUB;
2874 assert(CxTYPE(cx) == CXt_SUB);
2875
2876 PERL_DTRACE_PROBE_RETURN(cx->blk_sub.cv);
2877
2878 if (CxHASARGS(cx))
2879 cx_popsub_args(cx);
2880 cx_popsub_common(cx);
2881 }
2882
2883
2884 PERL_STATIC_INLINE void
Perl_cx_pushformat(pTHX_ PERL_CONTEXT * cx,CV * cv,OP * retop,GV * gv)2885 Perl_cx_pushformat(pTHX_ PERL_CONTEXT *cx, CV *cv, OP *retop, GV *gv)
2886 {
2887 PERL_ARGS_ASSERT_CX_PUSHFORMAT;
2888
2889 cx->blk_format.old_cxsubix = PL_curstackinfo->si_cxsubix;
2890 PL_curstackinfo->si_cxsubix= cx - PL_curstackinfo->si_cxstack;
2891 cx->blk_format.cv = cv;
2892 cx->blk_format.retop = retop;
2893 cx->blk_format.gv = gv;
2894 cx->blk_format.dfoutgv = PL_defoutgv;
2895 cx->blk_format.prevcomppad = PL_comppad;
2896 cx->blk_u16 = 0;
2897
2898 SvREFCNT_inc_simple_void_NN(cv);
2899 CvDEPTH(cv)++;
2900 SvREFCNT_inc_void(cx->blk_format.dfoutgv);
2901 }
2902
2903
2904 PERL_STATIC_INLINE void
Perl_cx_popformat(pTHX_ PERL_CONTEXT * cx)2905 Perl_cx_popformat(pTHX_ PERL_CONTEXT *cx)
2906 {
2907 CV *cv;
2908 GV *dfout;
2909
2910 PERL_ARGS_ASSERT_CX_POPFORMAT;
2911 assert(CxTYPE(cx) == CXt_FORMAT);
2912
2913 dfout = cx->blk_format.dfoutgv;
2914 setdefout(dfout);
2915 cx->blk_format.dfoutgv = NULL;
2916 SvREFCNT_dec_NN(dfout);
2917
2918 PL_comppad = cx->blk_format.prevcomppad;
2919 PL_curpad = LIKELY(PL_comppad) ? AvARRAY(PL_comppad) : NULL;
2920 cv = cx->blk_format.cv;
2921 cx->blk_format.cv = NULL;
2922 --CvDEPTH(cv);
2923 SvREFCNT_dec_NN(cv);
2924 PL_curstackinfo->si_cxsubix = cx->blk_format.old_cxsubix;
2925 }
2926
2927
2928 PERL_STATIC_INLINE void
Perl_push_evalortry_common(pTHX_ PERL_CONTEXT * cx,OP * retop,SV * namesv)2929 Perl_push_evalortry_common(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2930 {
2931 cx->blk_eval.retop = retop;
2932 cx->blk_eval.old_namesv = namesv;
2933 cx->blk_eval.old_eval_root = PL_eval_root;
2934 cx->blk_eval.cur_text = PL_parser ? PL_parser->linestr : NULL;
2935 cx->blk_eval.cv = NULL; /* later set by doeval_compile() */
2936 cx->blk_eval.cur_top_env = PL_top_env;
2937
2938 assert(!(PL_in_eval & ~ 0x3F));
2939 assert(!(PL_op->op_type & ~0x1FF));
2940 cx->blk_u16 = (PL_in_eval & 0x3F) | ((U16)PL_op->op_type << 7);
2941 }
2942
2943 PERL_STATIC_INLINE void
Perl_cx_pusheval(pTHX_ PERL_CONTEXT * cx,OP * retop,SV * namesv)2944 Perl_cx_pusheval(pTHX_ PERL_CONTEXT *cx, OP *retop, SV *namesv)
2945 {
2946 PERL_ARGS_ASSERT_CX_PUSHEVAL;
2947
2948 Perl_push_evalortry_common(aTHX_ cx, retop, namesv);
2949
2950 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2951 PL_curstackinfo->si_cxsubix = cx - PL_curstackinfo->si_cxstack;
2952 }
2953
2954 PERL_STATIC_INLINE void
Perl_cx_pushtry(pTHX_ PERL_CONTEXT * cx,OP * retop)2955 Perl_cx_pushtry(pTHX_ PERL_CONTEXT *cx, OP *retop)
2956 {
2957 PERL_ARGS_ASSERT_CX_PUSHTRY;
2958
2959 Perl_push_evalortry_common(aTHX_ cx, retop, NULL);
2960
2961 /* Don't actually change it, just store the current value so it's restored
2962 * by the common popeval */
2963 cx->blk_eval.old_cxsubix = PL_curstackinfo->si_cxsubix;
2964 }
2965
2966
2967 PERL_STATIC_INLINE void
Perl_cx_popeval(pTHX_ PERL_CONTEXT * cx)2968 Perl_cx_popeval(pTHX_ PERL_CONTEXT *cx)
2969 {
2970 SV *sv;
2971
2972 PERL_ARGS_ASSERT_CX_POPEVAL;
2973 assert(CxTYPE(cx) == CXt_EVAL);
2974
2975 PL_in_eval = CxOLD_IN_EVAL(cx);
2976 assert(!(PL_in_eval & 0xc0));
2977 PL_eval_root = cx->blk_eval.old_eval_root;
2978 sv = cx->blk_eval.cur_text;
2979 if (sv && CxEVAL_TXT_REFCNTED(cx)) {
2980 cx->blk_eval.cur_text = NULL;
2981 SvREFCNT_dec_NN(sv);
2982 }
2983
2984 sv = cx->blk_eval.old_namesv;
2985 if (sv) {
2986 cx->blk_eval.old_namesv = NULL;
2987 SvREFCNT_dec_NN(sv);
2988 }
2989 PL_curstackinfo->si_cxsubix = cx->blk_eval.old_cxsubix;
2990 }
2991
2992
2993 /* push a plain loop, i.e.
2994 * { block }
2995 * while (cond) { block }
2996 * for (init;cond;continue) { block }
2997 * This loop can be last/redo'ed etc.
2998 */
2999
3000 PERL_STATIC_INLINE void
Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT * cx)3001 Perl_cx_pushloop_plain(pTHX_ PERL_CONTEXT *cx)
3002 {
3003 PERL_ARGS_ASSERT_CX_PUSHLOOP_PLAIN;
3004 cx->blk_loop.my_op = cLOOP;
3005 }
3006
3007
3008 /* push a true for loop, i.e.
3009 * for var (list) { block }
3010 */
3011
3012 PERL_STATIC_INLINE void
Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT * cx,void * itervarp,SV * itersave)3013 Perl_cx_pushloop_for(pTHX_ PERL_CONTEXT *cx, void *itervarp, SV* itersave)
3014 {
3015 PERL_ARGS_ASSERT_CX_PUSHLOOP_FOR;
3016
3017 /* this one line is common with cx_pushloop_plain */
3018 cx->blk_loop.my_op = cLOOP;
3019
3020 cx->blk_loop.itervar_u.svp = (SV**)itervarp;
3021 cx->blk_loop.itersave = itersave;
3022 #ifdef USE_ITHREADS
3023 cx->blk_loop.oldcomppad = PL_comppad;
3024 #endif
3025 }
3026
3027
3028 /* pop all loop types, including plain */
3029
3030 PERL_STATIC_INLINE void
Perl_cx_poploop(pTHX_ PERL_CONTEXT * cx)3031 Perl_cx_poploop(pTHX_ PERL_CONTEXT *cx)
3032 {
3033 PERL_ARGS_ASSERT_CX_POPLOOP;
3034
3035 assert(CxTYPE_is_LOOP(cx));
3036 if ( CxTYPE(cx) == CXt_LOOP_ARY
3037 || CxTYPE(cx) == CXt_LOOP_LAZYSV)
3038 {
3039 /* Free ary or cur. This assumes that state_u.ary.ary
3040 * aligns with state_u.lazysv.cur. See cx_dup() */
3041 SV *sv = cx->blk_loop.state_u.lazysv.cur;
3042 cx->blk_loop.state_u.lazysv.cur = NULL;
3043 SvREFCNT_dec_NN(sv);
3044 if (CxTYPE(cx) == CXt_LOOP_LAZYSV) {
3045 sv = cx->blk_loop.state_u.lazysv.end;
3046 cx->blk_loop.state_u.lazysv.end = NULL;
3047 SvREFCNT_dec_NN(sv);
3048 }
3049 }
3050 if (cx->cx_type & (CXp_FOR_PAD|CXp_FOR_GV)) {
3051 SV *cursv;
3052 SV **svp = (cx)->blk_loop.itervar_u.svp;
3053 if ((cx->cx_type & CXp_FOR_GV))
3054 svp = &GvSV((GV*)svp);
3055 cursv = *svp;
3056 *svp = cx->blk_loop.itersave;
3057 cx->blk_loop.itersave = NULL;
3058 SvREFCNT_dec(cursv);
3059 }
3060 if (cx->cx_type & (CXp_FOR_GV|CXp_FOR_LVREF))
3061 SvREFCNT_dec(cx->blk_loop.itervar_u.svp);
3062 }
3063
3064
3065 PERL_STATIC_INLINE void
Perl_cx_pushwhen(pTHX_ PERL_CONTEXT * cx)3066 Perl_cx_pushwhen(pTHX_ PERL_CONTEXT *cx)
3067 {
3068 PERL_ARGS_ASSERT_CX_PUSHWHEN;
3069
3070 cx->blk_givwhen.leave_op = cLOGOP->op_other;
3071 }
3072
3073
3074 PERL_STATIC_INLINE void
Perl_cx_popwhen(pTHX_ PERL_CONTEXT * cx)3075 Perl_cx_popwhen(pTHX_ PERL_CONTEXT *cx)
3076 {
3077 PERL_ARGS_ASSERT_CX_POPWHEN;
3078 assert(CxTYPE(cx) == CXt_WHEN);
3079
3080 PERL_UNUSED_ARG(cx);
3081 PERL_UNUSED_CONTEXT;
3082 /* currently NOOP */
3083 }
3084
3085
3086 PERL_STATIC_INLINE void
Perl_cx_pushgiven(pTHX_ PERL_CONTEXT * cx,SV * orig_defsv)3087 Perl_cx_pushgiven(pTHX_ PERL_CONTEXT *cx, SV *orig_defsv)
3088 {
3089 PERL_ARGS_ASSERT_CX_PUSHGIVEN;
3090
3091 cx->blk_givwhen.leave_op = cLOGOP->op_other;
3092 cx->blk_givwhen.defsv_save = orig_defsv;
3093 }
3094
3095
3096 PERL_STATIC_INLINE void
Perl_cx_popgiven(pTHX_ PERL_CONTEXT * cx)3097 Perl_cx_popgiven(pTHX_ PERL_CONTEXT *cx)
3098 {
3099 SV *sv;
3100
3101 PERL_ARGS_ASSERT_CX_POPGIVEN;
3102 assert(CxTYPE(cx) == CXt_GIVEN);
3103
3104 sv = GvSV(PL_defgv);
3105 GvSV(PL_defgv) = cx->blk_givwhen.defsv_save;
3106 cx->blk_givwhen.defsv_save = NULL;
3107 SvREFCNT_dec(sv);
3108 }
3109
3110 /*
3111 =for apidoc newPADxVOP
3112
3113 Constructs, checks and returns an op containing a pad offset. C<type> is
3114 the opcode, which should be one of C<OP_PADSV>, C<OP_PADAV>, C<OP_PADHV>
3115 or C<OP_PADCV>. The returned op will have the C<op_targ> field set by
3116 the C<padix> argument.
3117
3118 This is convenient when constructing a large optree in nested function
3119 calls, as it avoids needing to store the pad op directly to set the
3120 C<op_targ> field as a side-effect. For example
3121
3122 o = op_append_elem(OP_LINESEQ, o,
3123 newPADxVOP(OP_PADSV, 0, padix));
3124
3125 =cut
3126 */
3127
3128 PERL_STATIC_INLINE OP *
Perl_newPADxVOP(pTHX_ I32 type,I32 flags,PADOFFSET padix)3129 Perl_newPADxVOP(pTHX_ I32 type, I32 flags, PADOFFSET padix)
3130 {
3131 PERL_ARGS_ASSERT_NEWPADXVOP;
3132
3133 assert(type == OP_PADSV || type == OP_PADAV || type == OP_PADHV
3134 || type == OP_PADCV);
3135 OP *o = newOP(type, flags);
3136 o->op_targ = padix;
3137 return o;
3138 }
3139
3140 /* ------------------ util.h ------------------------------------------- */
3141
3142 /*
3143 =for apidoc_section $string
3144
3145 =for apidoc foldEQ
3146
3147 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3148 same
3149 case-insensitively; false otherwise. Uppercase and lowercase ASCII range bytes
3150 match themselves and their opposite case counterparts. Non-cased and non-ASCII
3151 range bytes match only themselves.
3152
3153 =cut
3154 */
3155
3156 PERL_STATIC_INLINE I32
Perl_foldEQ(pTHX_ const char * s1,const char * s2,I32 len)3157 Perl_foldEQ(pTHX_ const char *s1, const char *s2, I32 len)
3158 {
3159 const U8 *a = (const U8 *)s1;
3160 const U8 *b = (const U8 *)s2;
3161
3162 PERL_ARGS_ASSERT_FOLDEQ;
3163
3164 assert(len >= 0);
3165
3166 while (len--) {
3167 if (*a != *b && *a != PL_fold[*b])
3168 return 0;
3169 a++,b++;
3170 }
3171 return 1;
3172 }
3173
3174 PERL_STATIC_INLINE I32
Perl_foldEQ_latin1(pTHX_ const char * s1,const char * s2,I32 len)3175 Perl_foldEQ_latin1(pTHX_ const char *s1, const char *s2, I32 len)
3176 {
3177 /* Compare non-UTF-8 using Unicode (Latin1) semantics. Works on all folds
3178 * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
3179 * does not check for this. Nor does it check that the strings each have
3180 * at least 'len' characters. */
3181
3182 const U8 *a = (const U8 *)s1;
3183 const U8 *b = (const U8 *)s2;
3184
3185 PERL_ARGS_ASSERT_FOLDEQ_LATIN1;
3186
3187 assert(len >= 0);
3188
3189 while (len--) {
3190 if (*a != *b && *a != PL_fold_latin1[*b]) {
3191 return 0;
3192 }
3193 a++, b++;
3194 }
3195 return 1;
3196 }
3197
3198 /*
3199 =for apidoc_section $locale
3200 =for apidoc foldEQ_locale
3201
3202 Returns true if the leading C<len> bytes of the strings C<s1> and C<s2> are the
3203 same case-insensitively in the current locale; false otherwise.
3204
3205 =cut
3206 */
3207
3208 PERL_STATIC_INLINE I32
Perl_foldEQ_locale(pTHX_ const char * s1,const char * s2,I32 len)3209 Perl_foldEQ_locale(pTHX_ const char *s1, const char *s2, I32 len)
3210 {
3211 const U8 *a = (const U8 *)s1;
3212 const U8 *b = (const U8 *)s2;
3213
3214 PERL_ARGS_ASSERT_FOLDEQ_LOCALE;
3215
3216 assert(len >= 0);
3217
3218 while (len--) {
3219 if (*a != *b && *a != PL_fold_locale[*b]) {
3220 DEBUG_Lv(PerlIO_printf(Perl_debug_log,
3221 "%s:%d: Our records indicate %02x is not a fold of %02x"
3222 " or its mate %02x\n",
3223 __FILE__, __LINE__, *a, *b, PL_fold_locale[*b]));
3224
3225 return 0;
3226 }
3227 a++,b++;
3228 }
3229 return 1;
3230 }
3231
3232 /*
3233 =for apidoc_section $string
3234 =for apidoc my_strnlen
3235
3236 The C library C<strnlen> if available, or a Perl implementation of it.
3237
3238 C<my_strnlen()> computes the length of the string, up to C<maxlen>
3239 characters. It will never attempt to address more than C<maxlen>
3240 characters, making it suitable for use with strings that are not
3241 guaranteed to be NUL-terminated.
3242
3243 =cut
3244
3245 Description stolen from http://man.openbsd.org/strnlen.3,
3246 implementation stolen from PostgreSQL.
3247 */
3248 #ifndef HAS_STRNLEN
3249
3250 PERL_STATIC_INLINE Size_t
Perl_my_strnlen(const char * str,Size_t maxlen)3251 Perl_my_strnlen(const char *str, Size_t maxlen)
3252 {
3253 const char *end = (char *) memchr(str, '\0', maxlen);
3254
3255 PERL_ARGS_ASSERT_MY_STRNLEN;
3256
3257 if (end == NULL) return maxlen;
3258 return end - str;
3259 }
3260
3261 #endif
3262
3263 #if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
3264
3265 PERL_STATIC_INLINE void *
S_my_memrchr(const char * s,const char c,const STRLEN len)3266 S_my_memrchr(const char * s, const char c, const STRLEN len)
3267 {
3268 /* memrchr(), since many platforms lack it */
3269
3270 const char * t = s + len - 1;
3271
3272 PERL_ARGS_ASSERT_MY_MEMRCHR;
3273
3274 while (t >= s) {
3275 if (*t == c) {
3276 return (void *) t;
3277 }
3278 t--;
3279 }
3280
3281 return NULL;
3282 }
3283
3284 #endif
3285
3286 PERL_STATIC_INLINE char *
Perl_mortal_getenv(const char * str)3287 Perl_mortal_getenv(const char * str)
3288 {
3289 /* This implements a (mostly) thread-safe, sequential-call-safe getenv().
3290 *
3291 * It's (mostly) thread-safe because it uses a mutex to prevent other
3292 * threads (that look at this mutex) from destroying the result before this
3293 * routine has a chance to copy the result to a place that won't be
3294 * destroyed before the caller gets a chance to handle it. That place is a
3295 * mortal SV. khw chose this over SAVEFREEPV because he is under the
3296 * impression that the SV will hang around longer under more circumstances
3297 *
3298 * The reason it isn't completely thread-safe is that other code could
3299 * simply not pay attention to the mutex. All of the Perl core uses the
3300 * mutex, but it is possible for code from, say XS, to not use this mutex,
3301 * defeating the safety.
3302 *
3303 * getenv() returns, in some implementations, a pointer to a spot in the
3304 * **environ array, which could be invalidated at any time by this or
3305 * another thread changing the environment. Other implementations copy the
3306 * **environ value to a static buffer, returning a pointer to that. That
3307 * buffer might or might not be invalidated by a getenv() call in another
3308 * thread. If it does get zapped, we need an exclusive lock. Otherwise,
3309 * many getenv() calls can safely be running simultaneously, so a
3310 * many-reader (but no simultaneous writers) lock is ok. There is a
3311 * Configure probe to see if another thread destroys the buffer, and the
3312 * mutex is defined accordingly.
3313 *
3314 * But in all cases, using the mutex prevents these problems, as long as
3315 * all code uses the same mutex.
3316 *
3317 * A complication is that this can be called during phases where the
3318 * mortalization process isn't available. These are in interpreter
3319 * destruction or early in construction. khw believes that at these times
3320 * there shouldn't be anything else going on, so plain getenv is safe AS
3321 * LONG AS the caller acts on the return before calling it again. */
3322
3323 char * ret;
3324 dTHX;
3325
3326 PERL_ARGS_ASSERT_MORTAL_GETENV;
3327
3328 /* Can't mortalize without stacks. khw believes that no other threads
3329 * should be running, so no need to lock things, and this may be during a
3330 * phase when locking isn't even available */
3331 if (UNLIKELY(PL_scopestack_ix == 0)) {
3332 return getenv(str);
3333 }
3334
3335 #ifdef PERL_MEM_LOG
3336
3337 /* A major complication arises under PERL_MEM_LOG. When that is active,
3338 * every memory allocation may result in logging, depending on the value of
3339 * ENV{PERL_MEM_LOG} at the moment. That means, as we create the SV for
3340 * saving ENV{foo}'s value (but before saving it), the logging code will
3341 * call us recursively to find out what ENV{PERL_MEM_LOG} is. Without some
3342 * care that could lead to: 1) infinite recursion; or 2) deadlock (trying to
3343 * lock a boolean mutex recursively); 3) destroying the getenv() static
3344 * buffer; or 4) destroying the temporary created by this for the copy
3345 * causes a log entry to be made which could cause a new temporary to be
3346 * created, which will need to be destroyed at some point, leading to an
3347 * infinite loop.
3348 *
3349 * The solution adopted here (after some gnashing of teeth) is to detect
3350 * the recursive calls and calls from the logger, and treat them specially.
3351 * Let's say we want to do getenv("foo"). We first find
3352 * getenv(PERL_MEM_LOG) and save it to a fixed-length per-interpreter
3353 * variable, so no temporary is required. Then we do getenv(foo), and in
3354 * the process of creating a temporary to save it, this function will be
3355 * called recursively to do a getenv(PERL_MEM_LOG). On the recursed call,
3356 * we detect that it is such a call and return our saved value instead of
3357 * locking and doing a new getenv(). This solves all of problems 1), 2),
3358 * and 3). Because all the getenv()s are done while the mutex is locked,
3359 * the state cannot have changed. To solve 4), we don't create a temporary
3360 * when this is called from the logging code. That code disposes of the
3361 * return value while the mutex is still locked.
3362 *
3363 * The value of getenv(PERL_MEM_LOG) can be anything, but only initial
3364 * digits and 3 particular letters are significant; the rest are ignored by
3365 * the memory logging code. Thus the per-interpreter variable only needs
3366 * to be large enough to save the significant information, the size of
3367 * which is known at compile time. The first byte is extra, reserved for
3368 * flags for our use. To protect against overflowing, only the reserved
3369 * byte, as many digits as don't overflow, and the three letters are
3370 * stored.
3371 *
3372 * The reserved byte has two bits:
3373 * 0x1 if set indicates that if we get here, it is a recursive call of
3374 * getenv()
3375 * 0x2 if set indicates that the call is from the logging code.
3376 *
3377 * If the flag indicates this is a recursive call, just return the stored
3378 * value of PL_mem_log; An empty value gets turned into NULL. */
3379 if (strEQ(str, "PERL_MEM_LOG") && PL_mem_log[0] & 0x1) {
3380 if (PL_mem_log[1] == '\0') {
3381 return NULL;
3382 } else {
3383 return PL_mem_log + 1;
3384 }
3385 }
3386
3387 #endif
3388
3389 GETENV_LOCK;
3390
3391 #ifdef PERL_MEM_LOG
3392
3393 /* Here we are in a critical section. As explained above, we do our own
3394 * getenv(PERL_MEM_LOG), saving the result safely. */
3395 ret = getenv("PERL_MEM_LOG");
3396 if (ret == NULL) { /* No logging active */
3397
3398 /* Return that immediately if called from the logging code */
3399 if (PL_mem_log[0] & 0x2) {
3400 GETENV_UNLOCK;
3401 return NULL;
3402 }
3403
3404 PL_mem_log[1] = '\0';
3405 }
3406 else {
3407 char *mem_log_meat = PL_mem_log + 1; /* first byte reserved */
3408
3409 /* There is nothing to prevent the value of PERL_MEM_LOG from being an
3410 * extremely long string. But we want only a few characters from it.
3411 * PL_mem_log has been made large enough to hold just the ones we need.
3412 * First the file descriptor. */
3413 if (isDIGIT(*ret)) {
3414 const char * s = ret;
3415 if (UNLIKELY(*s == '0')) {
3416
3417 /* Reduce multiple leading zeros to a single one. This is to
3418 * allow the caller to change what to do with leading zeros. */
3419 *mem_log_meat++ = '0';
3420 s++;
3421 while (*s == '0') {
3422 s++;
3423 }
3424 }
3425
3426 /* If the input overflows, copy just enough for the result to also
3427 * overflow, plus 1 to make sure */
3428 while (isDIGIT(*s) && s < ret + TYPE_DIGITS(UV) + 1) {
3429 *mem_log_meat++ = *s++;
3430 }
3431 }
3432
3433 /* Then each of the four significant characters */
3434 if (strchr(ret, 'm')) {
3435 *mem_log_meat++ = 'm';
3436 }
3437 if (strchr(ret, 's')) {
3438 *mem_log_meat++ = 's';
3439 }
3440 if (strchr(ret, 't')) {
3441 *mem_log_meat++ = 't';
3442 }
3443 if (strchr(ret, 'c')) {
3444 *mem_log_meat++ = 'c';
3445 }
3446 *mem_log_meat = '\0';
3447
3448 assert(mem_log_meat < PL_mem_log + sizeof(PL_mem_log));
3449 }
3450
3451 /* If we are being called from the logger, it only needs the significant
3452 * portion of PERL_MEM_LOG, and doesn't need a safe copy */
3453 if (PL_mem_log[0] & 0x2) {
3454 assert(strEQ(str, "PERL_MEM_LOG"));
3455 GETENV_UNLOCK;
3456 return PL_mem_log + 1;
3457 }
3458
3459 /* Here is a generic getenv(). This could be a getenv("PERL_MEM_LOG") that
3460 * is coming from other than the logging code, so it should be treated the
3461 * same as any other getenv(), returning the full value, not just the
3462 * significant part, and having its value saved. Set the flag that
3463 * indicates any call to this routine will be a recursion from here */
3464 PL_mem_log[0] = 0x1;
3465
3466 #endif
3467
3468 /* Now get the value of the real desired variable, and save a copy */
3469 ret = getenv(str);
3470
3471 if (ret != NULL) {
3472 ret = SvPVX( newSVpvn_flags(ret, strlen(ret) ,SVs_TEMP) );
3473 }
3474
3475 GETENV_UNLOCK;
3476
3477 #ifdef PERL_MEM_LOG
3478
3479 /* Clear the buffer */
3480 Zero(PL_mem_log, sizeof(PL_mem_log), char);
3481
3482 #endif
3483
3484 return ret;
3485 }
3486
3487 PERL_STATIC_INLINE bool
Perl_sv_isbool(pTHX_ const SV * sv)3488 Perl_sv_isbool(pTHX_ const SV *sv)
3489 {
3490 return SvBoolFlagsOK(sv) && BOOL_INTERNALS_sv_isbool(sv);
3491 }
3492
3493 #ifdef USE_ITHREADS
3494
3495 PERL_STATIC_INLINE AV *
Perl_cop_file_avn(pTHX_ const COP * cop)3496 Perl_cop_file_avn(pTHX_ const COP *cop) {
3497
3498 PERL_ARGS_ASSERT_COP_FILE_AVN;
3499
3500 const char *file = CopFILE(cop);
3501 if (file) {
3502 GV *gv = gv_fetchfile_flags(file, strlen(file), GVF_NOADD);
3503 if (gv) {
3504 return GvAVn(gv);
3505 }
3506 else
3507 return NULL;
3508 }
3509 else
3510 return NULL;
3511 }
3512
3513 #endif
3514
3515 PERL_STATIC_INLINE PADNAME *
Perl_padname_refcnt_inc(PADNAME * pn)3516 Perl_padname_refcnt_inc(PADNAME *pn)
3517 {
3518 PadnameREFCNT(pn)++;
3519 return pn;
3520 }
3521
3522 PERL_STATIC_INLINE PADNAMELIST *
Perl_padnamelist_refcnt_inc(PADNAMELIST * pnl)3523 Perl_padnamelist_refcnt_inc(PADNAMELIST *pnl)
3524 {
3525 PadnamelistREFCNT(pnl)++;
3526 return pnl;
3527 }
3528
3529 /* copy a string to a safe spot */
3530
3531 /*
3532 =for apidoc_section $string
3533 =for apidoc savepv
3534
3535 Perl's version of C<strdup()>. Returns a pointer to a newly allocated
3536 string which is a duplicate of C<pv>. The size of the string is
3537 determined by C<strlen()>, which means it may not contain embedded C<NUL>
3538 characters and must have a trailing C<NUL>. To prevent memory leaks, the
3539 memory allocated for the new string needs to be freed when no longer needed.
3540 This can be done with the C<L</Safefree>> function, or
3541 L<C<SAVEFREEPV>|perlguts/SAVEFREEPV(p)>.
3542
3543 On some platforms, Windows for example, all allocated memory owned by a thread
3544 is deallocated when that thread ends. So if you need that not to happen, you
3545 need to use the shared memory functions, such as C<L</savesharedpv>>.
3546
3547 =cut
3548 */
3549
3550 PERL_STATIC_INLINE char *
Perl_savepv(pTHX_ const char * pv)3551 Perl_savepv(pTHX_ const char *pv)
3552 {
3553 PERL_UNUSED_CONTEXT;
3554 if (!pv)
3555 return NULL;
3556 else {
3557 char *newaddr;
3558 const STRLEN pvlen = strlen(pv)+1;
3559 Newx(newaddr, pvlen, char);
3560 return (char*)memcpy(newaddr, pv, pvlen);
3561 }
3562 }
3563
3564 /* same thing but with a known length */
3565
3566 /*
3567 =for apidoc savepvn
3568
3569 Perl's version of what C<strndup()> would be if it existed. Returns a
3570 pointer to a newly allocated string which is a duplicate of the first
3571 C<len> bytes from C<pv>, plus a trailing
3572 C<NUL> byte. The memory allocated for
3573 the new string can be freed with the C<Safefree()> function.
3574
3575 On some platforms, Windows for example, all allocated memory owned by a thread
3576 is deallocated when that thread ends. So if you need that not to happen, you
3577 need to use the shared memory functions, such as C<L</savesharedpvn>>.
3578
3579 =cut
3580 */
3581
3582 PERL_STATIC_INLINE char *
Perl_savepvn(pTHX_ const char * pv,Size_t len)3583 Perl_savepvn(pTHX_ const char *pv, Size_t len)
3584 {
3585 char *newaddr;
3586 PERL_UNUSED_CONTEXT;
3587
3588 Newx(newaddr,len+1,char);
3589 /* Give a meaning to NULL pointer mainly for the use in sv_magic() */
3590 if (pv) {
3591 /* might not be null terminated */
3592 newaddr[len] = '\0';
3593 return (char *) CopyD(pv,newaddr,len,char);
3594 }
3595 else {
3596 return (char *) ZeroD(newaddr,len+1,char);
3597 }
3598 }
3599
3600 /*
3601 =for apidoc savesvpv
3602
3603 A version of C<savepv()>/C<savepvn()> which gets the string to duplicate from
3604 the passed in SV using C<SvPV()>
3605
3606 On some platforms, Windows for example, all allocated memory owned by a thread
3607 is deallocated when that thread ends. So if you need that not to happen, you
3608 need to use the shared memory functions, such as C<L</savesharedsvpv>>.
3609
3610 =cut
3611 */
3612
3613 PERL_STATIC_INLINE char *
Perl_savesvpv(pTHX_ SV * sv)3614 Perl_savesvpv(pTHX_ SV *sv)
3615 {
3616 STRLEN len;
3617 const char * const pv = SvPV_const(sv, len);
3618 char *newaddr;
3619
3620 PERL_ARGS_ASSERT_SAVESVPV;
3621
3622 ++len;
3623 Newx(newaddr,len,char);
3624 return (char *) CopyD(pv,newaddr,len,char);
3625 }
3626
3627 /*
3628 =for apidoc savesharedsvpv
3629
3630 A version of C<savesharedpv()> which allocates the duplicate string in
3631 memory which is shared between threads.
3632
3633 =cut
3634 */
3635
3636 PERL_STATIC_INLINE char *
Perl_savesharedsvpv(pTHX_ SV * sv)3637 Perl_savesharedsvpv(pTHX_ SV *sv)
3638 {
3639 STRLEN len;
3640 const char * const pv = SvPV_const(sv, len);
3641
3642 PERL_ARGS_ASSERT_SAVESHAREDSVPV;
3643
3644 return savesharedpvn(pv, len);
3645 }
3646
3647 #ifndef PERL_GET_CONTEXT_DEFINED
3648
3649 /*
3650 =for apidoc_section $embedding
3651 =for apidoc get_context
3652
3653 Implements L<perlapi/C<PERL_GET_CONTEXT>>, which you should use instead.
3654
3655 =cut
3656 */
3657
3658 PERL_STATIC_INLINE void *
Perl_get_context(void)3659 Perl_get_context(void)
3660 {
3661 # if defined(USE_ITHREADS)
3662 # ifdef OLD_PTHREADS_API
3663 pthread_addr_t t;
3664 int error = pthread_getspecific(PL_thr_key, &t);
3665 if (error)
3666 Perl_croak_nocontext("panic: pthread_getspecific, error=%d", error);
3667 return (void*)t;
3668 # elif defined(I_MACH_CTHREADS)
3669 return (void*)cthread_data(cthread_self());
3670 # else
3671 return (void*)PTHREAD_GETSPECIFIC(PL_thr_key);
3672 # endif
3673 # else
3674 return (void*)NULL;
3675 # endif
3676 }
3677
3678 #endif
3679
3680 PERL_STATIC_INLINE MGVTBL*
Perl_get_vtbl(pTHX_ int vtbl_id)3681 Perl_get_vtbl(pTHX_ int vtbl_id)
3682 {
3683 PERL_UNUSED_CONTEXT;
3684
3685 return (vtbl_id < 0 || vtbl_id >= magic_vtable_max)
3686 ? NULL : (MGVTBL*)PL_magic_vtables + vtbl_id;
3687 }
3688
3689 /*
3690 =for apidoc my_strlcat
3691
3692 The C library C<strlcat> if available, or a Perl implementation of it.
3693 This operates on C C<NUL>-terminated strings.
3694
3695 C<my_strlcat()> appends string C<src> to the end of C<dst>. It will append at
3696 most S<C<size - strlen(dst) - 1>> characters. It will then C<NUL>-terminate,
3697 unless C<size> is 0 or the original C<dst> string was longer than C<size> (in
3698 practice this should not happen as it means that either C<size> is incorrect or
3699 that C<dst> is not a proper C<NUL>-terminated string).
3700
3701 Note that C<size> is the full size of the destination buffer and
3702 the result is guaranteed to be C<NUL>-terminated if there is room. Note that
3703 room for the C<NUL> should be included in C<size>.
3704
3705 The return value is the total length that C<dst> would have if C<size> is
3706 sufficiently large. Thus it is the initial length of C<dst> plus the length of
3707 C<src>. If C<size> is smaller than the return, the excess was not appended.
3708
3709 =cut
3710
3711 Description stolen from http://man.openbsd.org/strlcat.3
3712 */
3713 #ifndef HAS_STRLCAT
3714 PERL_STATIC_INLINE Size_t
Perl_my_strlcat(char * dst,const char * src,Size_t size)3715 Perl_my_strlcat(char *dst, const char *src, Size_t size)
3716 {
3717 Size_t used, length, copy;
3718
3719 used = strlen(dst);
3720 length = strlen(src);
3721 if (size > 0 && used < size - 1) {
3722 copy = (length >= size - used) ? size - used - 1 : length;
3723 memcpy(dst + used, src, copy);
3724 dst[used + copy] = '\0';
3725 }
3726 return used + length;
3727 }
3728 #endif
3729
3730
3731 /*
3732 =for apidoc my_strlcpy
3733
3734 The C library C<strlcpy> if available, or a Perl implementation of it.
3735 This operates on C C<NUL>-terminated strings.
3736
3737 C<my_strlcpy()> copies up to S<C<size - 1>> characters from the string C<src>
3738 to C<dst>, C<NUL>-terminating the result if C<size> is not 0.
3739
3740 The return value is the total length C<src> would be if the copy completely
3741 succeeded. If it is larger than C<size>, the excess was not copied.
3742
3743 =cut
3744
3745 Description stolen from http://man.openbsd.org/strlcpy.3
3746 */
3747 #ifndef HAS_STRLCPY
3748 PERL_STATIC_INLINE Size_t
Perl_my_strlcpy(char * dst,const char * src,Size_t size)3749 Perl_my_strlcpy(char *dst, const char *src, Size_t size)
3750 {
3751 Size_t length, copy;
3752
3753 length = strlen(src);
3754 if (size > 0) {
3755 copy = (length >= size) ? size - 1 : length;
3756 memcpy(dst, src, copy);
3757 dst[copy] = '\0';
3758 }
3759 return length;
3760 }
3761 #endif
3762
3763 /*
3764 * ex: set ts=8 sts=4 sw=4 et:
3765 */
3766