1################################################################################
2##
3##  Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz.
4##  Version 2.x, Copyright (C) 2001, Paul Marquess.
5##  Version 1.x, Copyright (C) 1999, Kenneth Albanowski.
6##
7##  This program is free software; you can redistribute it and/or
8##  modify it under the same terms as Perl itself.
9##
10################################################################################
11
12=provides
13
14__UNDEFINED__
15my_strnlen
16SvUOK
17utf8_to_uvchr_buf
18
19=dontwarn
20
21_ppport_utf8_to_uvchr_buf_callee
22_ppport_MIN
23
24=implementation
25
26#define _ppport_MIN(a,b) (((a) <= (b)) ? (a) : (b))
27
28__UNDEFINED__  sv_setuv(sv, uv)                     \
29               STMT_START {                         \
30                 UV TeMpUv = uv;                    \
31                 if (TeMpUv <= IV_MAX)              \
32                   sv_setiv(sv, TeMpUv);            \
33                 else                               \
34                   sv_setnv(sv, (double)TeMpUv);    \
35               } STMT_END
36
37__UNDEFINED__  newSVuv(uv)     ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv))
38
39__UNDEFINED__  sv_2uv(sv)      ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv)))
40__UNDEFINED__  SvUVX(sv)       ((UV)SvIVX(sv))
41__UNDEFINED__  SvUVXx(sv)      SvUVX(sv)
42__UNDEFINED__  SvUV(sv)        (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv))
43__UNDEFINED__  SvUVx(sv)       ((PL_Sv = (sv)), SvUV(PL_Sv))
44
45/* Hint: sv_uv
46 * Always use the SvUVx() macro instead of sv_uv().
47 */
48__UNDEFINED__  sv_uv(sv)       SvUVx(sv)
49
50#if !defined(SvUOK) && defined(SvIOK_UV)
51#  define SvUOK(sv) SvIOK_UV(sv)
52#endif
53
54__UNDEFINED__  XST_mUV(i,v)    (ST(i) = sv_2mortal(newSVuv(v))  )
55__UNDEFINED__  XSRETURN_UV(v)  STMT_START { XST_mUV(0,v);  XSRETURN(1); } STMT_END
56
57__UNDEFINED__  PUSHu(u)        STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG;  } STMT_END
58__UNDEFINED__  XPUSHu(u)       STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END
59
60#if defined UTF8SKIP
61
62/* Don't use official version because it uses MIN, which may not be available */
63#undef UTF8_SAFE_SKIP
64
65__UNDEFINED__  UTF8_SAFE_SKIP(s, e)  (                                          \
66                                      ((((e) - (s)) <= 0)                       \
67                                      ? 0                                       \
68                                      : _ppport_MIN(((e) - (s)), UTF8SKIP(s))))
69#endif
70
71#if !defined(my_strnlen)
72#if { NEED my_strnlen }
73
74STRLEN
75my_strnlen(const char *str, Size_t maxlen)
76{
77    const char *p = str;
78
79    while(maxlen-- && *p)
80        p++;
81
82    return p - str;
83}
84
85#endif
86#endif
87
88#if { VERSION < 5.31.2 }
89        /* Versions prior to this accepted things that are now considered
90         * malformations, and didn't return -1 on error with warnings enabled
91         * */
92#  undef utf8_to_uvchr_buf
93#endif
94
95/* This implementation brings modern, generally more restricted standards to
96 * utf8_to_uvchr_buf.  Some of these are security related, and clearly must
97 * be done.  But its arguable that the others need not, and hence should not.
98 * The reason they're here is that a module that intends to play with the
99 * latest perls shoud be able to work the same in all releases.  An example is
100 * that perl no longer accepts any UV for a code point, but limits them to
101 * IV_MAX or below.  This is for future internal use of the larger code points.
102 * If it turns out that some of these changes are breaking code that isn't
103 * intended to work with modern perls, the tighter restrictions could be
104 * relaxed.  khw thinks this is unlikely, but has been wrong in the past. */
105
106#ifndef utf8_to_uvchr_buf
107   /* Choose which underlying implementation to use.  At least one must be
108    * present or the perl is too early to handle this function */
109#  if defined(utf8n_to_uvchr) || defined(utf8_to_uv)
110#    if defined(utf8n_to_uvchr)   /* This is the preferred implementation */
111#      define _ppport_utf8_to_uvchr_buf_callee utf8n_to_uvchr
112#    else
113#      define _ppport_utf8_to_uvchr_buf_callee utf8_to_uv
114#    endif
115
116#  endif
117
118#ifdef _ppport_utf8_to_uvchr_buf_callee
119#  if { NEED utf8_to_uvchr_buf }
120
121UV
122utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
123{
124    UV ret;
125    STRLEN curlen;
126    bool overflows = 0;
127    const U8 *cur_s = s;
128    const bool do_warnings = ckWARN_d(WARN_UTF8);
129
130    if (send > s) {
131        curlen = send - s;
132    }
133    else {
134        assert(0);  /* Modern perls die under this circumstance */
135        curlen = 0;
136        if (! do_warnings) {    /* Handle empty here if no warnings needed */
137            if (retlen) *retlen = 0;
138            return UNICODE_REPLACEMENT;
139        }
140    }
141
142    /* The modern version allows anything that evaluates to a legal UV, but not
143     * overlongs nor an empty input */
144    ret = _ppport_utf8_to_uvchr_buf_callee(
145                s, curlen, retlen,   (UTF8_ALLOW_ANYUV
146                                  & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
147
148    /* But actually, modern versions restrict the UV to being no more than what
149     * an IV can hold */
150    if (ret > PERL_INT_MAX) {
151        overflows = 1;
152    }
153
154#    if { VERSION < 5.26.0 }
155#      ifndef EBCDIC
156
157        /* There are bugs in versions earlier than this on non-EBCDIC platforms
158         * in which it did not detect all instances of overflow, which could be
159         * a security hole.  Also, earlier versions did not allow the overflow
160         * malformation under any circumstances, and modern ones do.  So we
161         * need to check here.  */
162
163    else if (curlen > 0 && *s >= 0xFE) {
164
165        /* If the main routine detected overflow, great; it returned 0.  But if the
166         * input's first byte indicates it could overflow, we need to verify.
167         * First, on a 32-bit machine the first byte being at least \xFE
168         * automatically is overflow */
169        if (sizeof(ret) < 8) {
170            overflows = 1;
171        }
172        else {
173            const U8 highest[] =    /* 2*63-1 */
174                        "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
175            const U8 *cur_h = highest;
176
177            for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
178                if (UNLIKELY(*cur_s == *cur_h)) {
179                    continue;
180                }
181
182                /* If this byte is larger than the corresponding highest UTF-8
183                * byte, the sequence overflows; otherwise the byte is less than
184                * (as we handled the equality case above), and so the sequence
185                * doesn't overflow */
186                overflows = *cur_s > *cur_h;
187                break;
188
189            }
190
191            /* Here, either we set the bool and broke out of the loop, or got
192             * to the end and all bytes are the same which indicates it doesn't
193             * overflow. */
194        }
195    }
196
197#      endif
198#    endif  /* < 5.26 */
199
200    if (UNLIKELY(overflows)) {
201        if (! do_warnings) {
202            if (retlen) {
203                *retlen = _ppport_MIN(*retlen, UTF8SKIP(s));
204                *retlen = _ppport_MIN(*retlen, curlen);
205            }
206            return UNICODE_REPLACEMENT;
207        }
208        else {
209
210            /* On versions that correctly detect overflow, but forbid it
211             * always, 0 will be returned, but also a warning will have been
212             * raised.  Don't repeat it */
213            if (ret != 0) {
214                /* We use the error message in use from 5.8-5.14 */
215                Perl_warner(aTHX_ packWARN(WARN_UTF8),
216                    "Malformed UTF-8 character (overflow at 0x%" UVxf
217                    ", byte 0x%02x, after start byte 0x%02x)",
218                    ret, *cur_s, *s);
219            }
220            if (retlen) {
221                *retlen = (STRLEN) -1;
222            }
223            return 0;
224        }
225    }
226
227    /* If failed and warnings are off, to emulate the behavior of the real
228     * utf8_to_uvchr(), try again, allowing anything.  (Note a return of 0 is
229     * ok if the input was '\0') */
230    if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
231
232        /* If curlen is 0, we already handled the case where warnings are
233         * disabled, so this 'if' will be true, and we won't look at the
234         * contents of 's' */
235        if (do_warnings) {
236            *retlen = (STRLEN) -1;
237        }
238        else {
239            ret = _ppport_utf8_to_uvchr_buf_callee(
240                                            s, curlen, retlen, UTF8_ALLOW_ANY);
241            /* Override with the REPLACEMENT character, as that is what the
242             * modern version of this function returns */
243            ret = UNICODE_REPLACEMENT;
244
245#           if { VERSION < 5.16.0 }
246
247            /* Versions earlier than this don't necessarily return the proper
248             * length.  It should not extend past the end of string, nor past
249             * what the first byte indicates the length is, nor past the
250             * continuation characters */
251            if (retlen && *retlen >= 0) {
252                *retlen = _ppport_MIN(*retlen, curlen);
253                *retlen = _ppport_MIN(*retlen, UTF8SKIP(s));
254                unsigned int i = 1;
255                do {
256                    if (s[i] < 0x80 || s[i] > 0xBF) {
257                        *retlen = i;
258                        break;
259                    }
260                } while (++i < *retlen);
261            }
262
263#           endif
264
265        }
266    }
267
268    return ret;
269}
270
271#  endif
272#endif
273#endif
274
275#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
276#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
277                        to read past a NUL, making it much less likely to read
278                        off the end of the buffer.  A NUL indicates the start
279                        of the next character anyway.  If the input isn't
280                        NUL-terminated, the function remains unsafe, as it
281                        always has been. */
282
283__UNDEFINED__  utf8_to_uvchr(s, lp)                                             \
284    ((*(s) == '\0')                                                             \
285    ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */        \
286    : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
287
288#endif
289
290=xsinit
291
292#define NEED_my_strnlen
293#define NEED_utf8_to_uvchr_buf
294
295=xsubs
296
297SV *
298sv_setuv(uv)
299        UV uv
300        CODE:
301                RETVAL = newSViv(1);
302                sv_setuv(RETVAL, uv);
303        OUTPUT:
304                RETVAL
305
306SV *
307newSVuv(uv)
308        UV uv
309        CODE:
310                RETVAL = newSVuv(uv);
311        OUTPUT:
312                RETVAL
313
314UV
315sv_2uv(sv)
316        SV *sv
317        CODE:
318                RETVAL = sv_2uv(sv);
319        OUTPUT:
320                RETVAL
321
322UV
323SvUVx(sv)
324        SV *sv
325        CODE:
326                sv--;
327                RETVAL = SvUVx(++sv);
328        OUTPUT:
329                RETVAL
330
331void
332XSRETURN_UV()
333        PPCODE:
334                XSRETURN_UV(42);
335
336void
337PUSHu()
338        PREINIT:
339                dTARG;
340        PPCODE:
341                TARG = sv_newmortal();
342                EXTEND(SP, 1);
343                PUSHu(42);
344                XSRETURN(1);
345
346void
347XPUSHu()
348        PREINIT:
349                dTARG;
350        PPCODE:
351                TARG = sv_newmortal();
352                XPUSHu(43);
353                XSRETURN(1);
354
355STRLEN
356UTF8_SAFE_SKIP(s, adjustment)
357        unsigned char * s
358        int adjustment
359        CODE:
360            /* Instead of passing in an 'e' ptr, use the real end, adjusted */
361            RETVAL = UTF8_SAFE_SKIP(s, s + UTF8SKIP(s) + adjustment);
362        OUTPUT:
363            RETVAL
364
365STRLEN
366my_strnlen(s, max)
367        char * s
368        STRLEN max
369        CODE:
370            RETVAL= my_strnlen(s, max);
371        OUTPUT:
372            RETVAL
373
374AV *
375utf8_to_uvchr_buf(s, adjustment)
376        unsigned char *s
377        int adjustment
378        PREINIT:
379            AV *av;
380            STRLEN len;
381        CODE:
382            av = newAV();
383            av_push(av, newSVuv(utf8_to_uvchr_buf(s,
384                                                  s + UTF8SKIP(s) + adjustment,
385                                                  &len)));
386            if (len == (STRLEN) -1) {
387                av_push(av, newSViv(-1));
388            }
389            else {
390                av_push(av, newSVuv(len));
391            }
392            RETVAL = av;
393        OUTPUT:
394                RETVAL
395
396AV *
397utf8_to_uvchr(s)
398        unsigned char *s
399        PREINIT:
400            AV *av;
401            STRLEN len;
402        CODE:
403            av = newAV();
404            av_push(av, newSVuv(utf8_to_uvchr(s, &len)));
405            if (len == (STRLEN) -1) {
406                av_push(av, newSViv(-1));
407            }
408            else {
409                av_push(av, newSVuv(len));
410            }
411            RETVAL = av;
412        OUTPUT:
413                RETVAL
414
415=tests plan => 52
416
417ok(&Devel::PPPort::sv_setuv(42), 42);
418ok(&Devel::PPPort::newSVuv(123), 123);
419ok(&Devel::PPPort::sv_2uv("4711"), 4711);
420ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559);
421ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559);
422ok(&Devel::PPPort::SvUVx(1735928559), 1735928559);
423ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef);
424ok(&Devel::PPPort::XSRETURN_UV(), 42);
425ok(&Devel::PPPort::PUSHu(), 42);
426ok(&Devel::PPPort::XPUSHu(), 43);
427ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
428ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
429ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3);
430
431my $ret = &Devel::PPPort::utf8_to_uvchr("A");
432ok($ret->[0], ord("A"));
433ok($ret->[1], 1);
434
435$ret = &Devel::PPPort::utf8_to_uvchr("\0");
436ok($ret->[0], 0);
437ok($ret->[1], 1);
438
439$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
440ok($ret->[0], ord("A"));
441ok($ret->[1], 1);
442
443$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
444ok($ret->[0], 0);
445ok($ret->[1], 1);
446
447if (ord("A") != 65) {   # tests not valid for EBCDIC
448    ok(1, 1) for 1 .. (2 + 4 + (5 * 5));
449}
450else {
451    $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
452    ok($ret->[0], 0x100);
453    ok($ret->[1], 2);
454
455    my @warnings;
456    local $SIG{__WARN__} = sub { push @warnings, @_; };
457
458    {
459        use warnings 'utf8';
460        $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
461        ok($ret->[0], 0);
462        ok($ret->[1], -1);
463
464        no warnings;
465        $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
466        ok($ret->[0], 0xFFFD);
467        ok($ret->[1], 1);
468    }
469
470    my @buf_tests = (
471        {
472            input      => "A",
473            adjustment => -1,
474            warning    => qr/empty/,
475            no_warnings_returned_length => 0,
476        },
477        {
478            input      => "\xc4\xc5",
479            adjustment => 0,
480            warning    => qr/non-continuation/,
481            no_warnings_returned_length => 1,
482        },
483        {
484            input      => "\xc4\x80",
485            adjustment => -1,
486            warning    => qr/short|1 byte, need 2/,
487            no_warnings_returned_length => 1,
488        },
489        {
490            input      => "\xc0\x81",
491            adjustment => 0,
492            warning    => qr/overlong|2 bytes, need 1/,
493            no_warnings_returned_length => 2,
494        },
495        {                 # Old algorithm supposedly failed to detect this
496            input      => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
497            adjustment => 0,
498            warning    => qr/overflow/,
499            no_warnings_returned_length => 13,
500        },
501    );
502
503    # An empty input is an assertion failure on debugging builds.  It is
504    # deliberately the first test.
505    require Config; import Config;
506    use vars '%Config';
507    if ($Config{ccflags} =~ /-DDEBUGGING/) {
508        shift @buf_tests;
509        ok(1, 1) for 1..5;
510    }
511
512    for my $test (@buf_tests) {
513        my $input = $test->{'input'};
514        my $adjustment = $test->{'adjustment'};
515        my $display = 'utf8_to_uvchr_buf("';
516        for (my $i = 0; $i < length($input) + $adjustment; $i++) {
517            $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
518        }
519
520        $display .= '")';
521        my $warning = $test->{'warning'};
522
523        undef @warnings;
524        use warnings 'utf8';
525        $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
526        ok($ret->[0], 0,  "returned value $display; warnings enabled");
527        ok($ret->[1], -1, "returned length $display; warnings enabled");
528        my $all_warnings = join "; ", @warnings;
529        my $contains = grep { $_ =~ $warning } $all_warnings;
530        ok($contains, 1, $display . "; '$all_warnings' contains '$warning'");
531
532        undef @warnings;
533        no warnings 'utf8';
534        $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
535        ok($ret->[0], 0xFFFD,  "returned value $display; warnings disabled");
536        ok($ret->[1], $test->{'no_warnings_returned_length'},
537                      "returned length $display; warnings disabled");
538    }
539}
540