1<?php
2
3/*
4 * This file is part of the Symfony package.
5 *
6 * (c) Fabien Potencier <fabien@symfony.com>
7 *
8 * For the full copyright and license information, please view the LICENSE
9 * file that was distributed with this source code.
10 */
11
12namespace Symfony\Polyfill\Intl\Grapheme;
13
// \PCRE_VERSION is a string such as "10.42 2022-12-11". PHP does not treat it as a numeric
// string, so comparing it to '8.32' with >= is a byte-wise string comparison in which every
// PCRE2 release ("10.x") sorts *before* "8.32" and never gets the native \X.
// Compare the leading version number numerically instead: PCRE 8.x is trusted from 8.32 on
// (see the note above Grapheme::GRAPHEME_CLUSTER_RX); PCRE2 builds older than 10.39 keep using
// the hand-written fallback, exactly as they did before.
// NOTE(review): the 10.39 cut-off is deliberately conservative - confirm against the PCRE2
// ChangeLog before lowering it.
\define('SYMFONY_GRAPHEME_CLUSTER_RX', ((float) \PCRE_VERSION < 10 ? (float) \PCRE_VERSION >= 8.32 : (float) \PCRE_VERSION >= 10.39) ? '\X' : Grapheme::GRAPHEME_CLUSTER_RX);
15
16/**
17 * Partial intl implementation in pure PHP.
18 *
19 * Implemented:
20 * - grapheme_extract  - Extract a sequence of grapheme clusters from a text buffer, which must be encoded in UTF-8
21 * - grapheme_stripos  - Find position (in grapheme units) of first occurrence of a case-insensitive string
22 * - grapheme_stristr  - Returns part of haystack string from the first occurrence of case-insensitive needle to the end of haystack
23 * - grapheme_strlen   - Get string length in grapheme units
24 * - grapheme_strpos   - Find position (in grapheme units) of first occurrence of a string
25 * - grapheme_strripos - Find position (in grapheme units) of last occurrence of a case-insensitive string
26 * - grapheme_strrpos  - Find position (in grapheme units) of last occurrence of a string
27 * - grapheme_strstr   - Returns part of haystack string from the first occurrence of needle to the end of haystack
28 * - grapheme_substr   - Return part of a string
29 *
30 * @author Nicolas Grekas <p@tchwork.com>
31 *
32 * @internal
33 */
34final class Grapheme
35{
    // Shape of the pattern below, written with Unicode grapheme-break property names:
    // (CRLF|([ZWNJ-ZWJ]|T+|L*(LV?V+|LV|LVT)T*|L+|[^Control])[Extend]*|[Control])
    // This regular expression is a workaround for http://bugs.exim.org/1279
38    public const GRAPHEME_CLUSTER_RX = '(?:\r\n|(?:[ -~\x{200C}\x{200D}]|[ᆨ-ᇹ]+|[ᄀ-ᅟ]*(?:[가개갸걔거게겨계고과괘괴교구궈궤귀규그긔기까깨꺄꺠꺼께껴꼐꼬꽈꽤꾀꾜꾸꿔꿰뀌뀨끄끠끼나내냐냬너네녀녜노놔놰뇌뇨누눠눼뉘뉴느늬니다대댜댸더데뎌뎨도돠돼되됴두둬뒈뒤듀드듸디따때땨떄떠떼뗘뗴또똬뙈뙤뚀뚜뚸뛔뛰뜌뜨띄띠라래랴럐러레려례로롸뢔뢰료루뤄뤠뤼류르릐리마매먀먜머메며몌모뫄뫠뫼묘무뭐뭬뮈뮤므믜미바배뱌뱨버베벼볘보봐봬뵈뵤부붜붸뷔뷰브븨비빠빼뺘뺴뻐뻬뼈뼤뽀뽜뽸뾔뾰뿌뿨쀄쀠쀼쁘쁴삐사새샤섀서세셔셰소솨쇄쇠쇼수숴쉐쉬슈스싀시싸쌔쌰썌써쎄쎠쎼쏘쏴쐐쐬쑈쑤쒀쒜쒸쓔쓰씌씨아애야얘어에여예오와왜외요우워웨위유으의이자재쟈쟤저제져졔조좌좨죄죠주줘줴쥐쥬즈즤지짜째쨔쨰쩌쩨쪄쪠쪼쫘쫴쬐쬬쭈쭤쮀쮜쮸쯔쯰찌차채챠챼처체쳐쳬초촤쵀최쵸추춰췌취츄츠츼치카캐캬컈커케켜켸코콰쾌쾨쿄쿠쿼퀘퀴큐크킈키타태탸턔터테텨톄토톼퇘퇴툐투퉈퉤튀튜트틔티파패퍄퍠퍼페펴폐포퐈퐤푀표푸풔풰퓌퓨프픠피하해햐햬허헤혀혜호화홰회효후훠훼휘휴흐희히]?[ᅠ-ᆢ]+|[가-힣])[ᆨ-ᇹ]*|[ᄀ-ᅟ]+|[^\p{Cc}\p{Cf}\p{Zl}\p{Zp}])[\p{Mn}\p{Me}\x{09BE}\x{09D7}\x{0B3E}\x{0B57}\x{0BBE}\x{0BD7}\x{0CC2}\x{0CD5}\x{0CD6}\x{0D3E}\x{0D57}\x{0DCF}\x{0DDF}\x{200C}\x{200D}\x{1D165}\x{1D16E}-\x{1D172}]*|[\p{Cc}\p{Cf}\p{Zl}\p{Zp}])';
39
    /**
     * Extra case-folding pairs used by grapheme_position() for case-insensitive searches
     * when the MB_CASE_FOLD_SIMPLE constant is not defined: after lower-casing, every string
     * of the first list is replaced by the string at the same index in the second list.
     */
    private const CASE_FOLD = [
        ['µ', 'ſ', "\xCD\x85", 'ς', "\xCF\x90", "\xCF\x91", "\xCF\x95", "\xCF\x96", "\xCF\xB0", "\xCF\xB1", "\xCF\xB5", "\xE1\xBA\x9B", "\xE1\xBE\xBE"],
        ['μ', 's', 'ι',        'σ', 'β',        'θ',        'φ',        'π',        'κ',        'ρ',        'ε',        "\xE1\xB9\xA1", 'ι'],
    ];
44
45    public static function grapheme_extract($s, $size, $type = \GRAPHEME_EXTR_COUNT, $start = 0, &$next = 0)
46    {
47        if (0 > $start) {
48            $start = \strlen($s) + $start;
49        }
50
51        if (!is_scalar($s)) {
52            $hasError = false;
53            set_error_handler(function () use (&$hasError) { $hasError = true; });
54            $next = substr($s, $start);
55            restore_error_handler();
56            if ($hasError) {
57                substr($s, $start);
58                $s = '';
59            } else {
60                $s = $next;
61            }
62        } else {
63            $s = substr($s, $start);
64        }
65        $size = (int) $size;
66        $type = (int) $type;
67        $start = (int) $start;
68
69        if (\GRAPHEME_EXTR_COUNT !== $type && \GRAPHEME_EXTR_MAXBYTES !== $type && \GRAPHEME_EXTR_MAXCHARS !== $type) {
70            if (80000 > \PHP_VERSION_ID) {
71                return false;
72            }
73
74            throw new \ValueError('grapheme_extract(): Argument #3 ($type) must be one of GRAPHEME_EXTR_COUNT, GRAPHEME_EXTR_MAXBYTES, or GRAPHEME_EXTR_MAXCHARS');
75        }
76
77        if (!isset($s[0]) || 0 > $size || 0 > $start) {
78            return false;
79        }
80        if (0 === $size) {
81            return '';
82        }
83
84        $next = $start;
85
86        $s = preg_split('/('.SYMFONY_GRAPHEME_CLUSTER_RX.')/u', "\r\n".$s, $size + 1, \PREG_SPLIT_NO_EMPTY | \PREG_SPLIT_DELIM_CAPTURE);
87
88        if (!isset($s[1])) {
89            return false;
90        }
91
92        $i = 1;
93        $ret = '';
94
95        do {
96            if (\GRAPHEME_EXTR_COUNT === $type) {
97                --$size;
98            } elseif (\GRAPHEME_EXTR_MAXBYTES === $type) {
99                $size -= \strlen($s[$i]);
100            } else {
101                $size -= iconv_strlen($s[$i], 'UTF-8//IGNORE');
102            }
103
104            if ($size >= 0) {
105                $ret .= $s[$i];
106            }
107        } while (isset($s[++$i]) && $size > 0);
108
109        $next += \strlen($ret);
110
111        return $ret;
112    }
113
114    public static function grapheme_strlen($s)
115    {
116        preg_replace('/'.SYMFONY_GRAPHEME_CLUSTER_RX.'/u', '', $s, -1, $len);
117
118        return 0 === $len && '' !== $s ? null : $len;
119    }
120
121    public static function grapheme_substr($s, $start, $len = null)
122    {
123        if (null === $len) {
124            $len = 2147483647;
125        }
126
127        preg_match_all('/'.SYMFONY_GRAPHEME_CLUSTER_RX.'/u', $s, $s);
128
129        $slen = \count($s[0]);
130        $start = (int) $start;
131
132        if (0 > $start) {
133            $start += $slen;
134        }
135        if (0 > $start) {
136            if (\PHP_VERSION_ID < 80000) {
137                return false;
138            }
139
140            $start = 0;
141        }
142        if ($start >= $slen) {
143            return \PHP_VERSION_ID >= 80000 ? '' : false;
144        }
145
146        $rem = $slen - $start;
147
148        if (0 > $len) {
149            $len += $rem;
150        }
151        if (0 === $len) {
152            return '';
153        }
154        if (0 > $len) {
155            return \PHP_VERSION_ID >= 80000 ? '' : false;
156        }
157        if ($len > $rem) {
158            $len = $rem;
159        }
160
161        return implode('', \array_slice($s[0], $start, $len));
162    }
163
164    public static function grapheme_strpos($s, $needle, $offset = 0)
165    {
166        return self::grapheme_position($s, $needle, $offset, 0);
167    }
168
169    public static function grapheme_stripos($s, $needle, $offset = 0)
170    {
171        return self::grapheme_position($s, $needle, $offset, 1);
172    }
173
174    public static function grapheme_strrpos($s, $needle, $offset = 0)
175    {
176        return self::grapheme_position($s, $needle, $offset, 2);
177    }
178
179    public static function grapheme_strripos($s, $needle, $offset = 0)
180    {
181        return self::grapheme_position($s, $needle, $offset, 3);
182    }
183
184    public static function grapheme_stristr($s, $needle, $beforeNeedle = false)
185    {
186        return mb_stristr($s, $needle, $beforeNeedle, 'UTF-8');
187    }
188
189    public static function grapheme_strstr($s, $needle, $beforeNeedle = false)
190    {
191        return mb_strstr($s, $needle, $beforeNeedle, 'UTF-8');
192    }
193
194    private static function grapheme_position($s, $needle, $offset, $mode)
195    {
196        $needle = (string) $needle;
197        if (80000 > \PHP_VERSION_ID && !preg_match('/./us', $needle)) {
198            return false;
199        }
200        $s = (string) $s;
201        if (!preg_match('/./us', $s)) {
202            return false;
203        }
204        if ($offset > 0) {
205            $s = self::grapheme_substr($s, $offset);
206        } elseif ($offset < 0) {
207            if (2 > $mode) {
208                $offset += self::grapheme_strlen($s);
209                $s = self::grapheme_substr($s, $offset);
210                if (0 > $offset) {
211                    $offset = 0;
212                }
213            } elseif (0 > $offset += self::grapheme_strlen($needle)) {
214                $s = self::grapheme_substr($s, 0, $offset);
215                $offset = 0;
216            } else {
217                $offset = 0;
218            }
219        }
220
221        // As UTF-8 is self-synchronizing, and we have ensured the strings are valid UTF-8,
222        // we can use normal binary string functions here. For case-insensitive searches,
223        // case fold the strings first.
224        $caseInsensitive = $mode & 1;
225        $reverse = $mode & 2;
226        if ($caseInsensitive) {
227            // Use the same case folding mode as mbstring does for mb_stripos().
228            // Stick to SIMPLE case folding to avoid changing the length of the string, which
229            // might result in offsets being shifted.
230            $mode = \defined('MB_CASE_FOLD_SIMPLE') ? \MB_CASE_FOLD_SIMPLE : \MB_CASE_LOWER;
231            $s = mb_convert_case($s, $mode, 'UTF-8');
232            $needle = mb_convert_case($needle, $mode, 'UTF-8');
233
234            if (!\defined('MB_CASE_FOLD_SIMPLE')) {
235                $s = str_replace(self::CASE_FOLD[0], self::CASE_FOLD[1], $s);
236                $needle = str_replace(self::CASE_FOLD[0], self::CASE_FOLD[1], $needle);
237            }
238        }
239        if ($reverse) {
240            $needlePos = strrpos($s, $needle);
241        } else {
242            $needlePos = strpos($s, $needle);
243        }
244
245        return false !== $needlePos ? self::grapheme_strlen(substr($s, 0, $needlePos)) + $offset : false;
246    }
247}
248