1 #ifdef PERL_EXT_RE_BUILD
2 #include "re_top.h"
3 #endif
4
5 #include "EXTERN.h"
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_INVLIST_C
9 #include "perl.h"
10
11 #ifdef PERL_IN_XSUB_RE
12 # include "re_comp.h"
13 #else
14 # include "regcomp.h"
15 #endif
16
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
20
21
22 void
Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist,const UV offset,const U8 * bitmap,const Size_t len)23 Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len)
24 {
25 PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST;
26
27 /* As the name says. The zeroth bit corresponds to the code point given by
28 * 'offset' */
29
30 UV start, end;
31
32 Zero(bitmap, len, U8);
33
34 invlist_iterinit(invlist);
35 while (invlist_iternext(invlist, &start, &end)) {
36 assert(start >= offset);
37
38 for (UV i = start; i <= end; i++) {
39 UV adjusted = i - offset;
40
41 BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted);
42 }
43 }
44 invlist_iterfinish(invlist);
45 }
46
47 void
Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap,const Size_t bitmap_len,SV ** invlist,const UV offset)48 Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset)
49 {
50 PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP;
51
52 /* As the name says. The zeroth bit corresponds to the code point given by
53 * 'offset' */
54
55 Size_t i;
56
57 for (i = 0; i < bitmap_len; i++) {
58 if (BITMAP_TEST(bitmap, i)) {
59 int start = i++;
60
61 /* Save a little work by adding a range all at once instead of bit
62 * by bit */
63 while (i < bitmap_len && BITMAP_TEST(bitmap, i)) {
64 i++;
65 }
66
67 *invlist = _add_range_to_invlist(*invlist,
68 start + offset,
69 i + offset - 1);
70 }
71 }
72 }
73
74 /* This section of code defines the inversion list object and its methods. The
75 * interfaces are highly subject to change, so as much as possible is static to
76 * this file. An inversion list is here implemented as a malloc'd C UV array
77 * as an SVt_INVLIST scalar.
78 *
79 * An inversion list for Unicode is an array of code points, sorted by ordinal
80 * number. Each element gives the code point that begins a range that extends
81 * up-to but not including the code point given by the next element. The final
82 * element gives the first code point of a range that extends to the platform's
83 * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4],
84 * ...) give ranges whose code points are all in the inversion list. We say
85 * that those ranges are in the set. The odd-numbered elements give ranges
86 * whose code points are not in the inversion list, and hence not in the set.
87 * Thus, element [0] is the first code point in the list. Element [1]
88 * is the first code point beyond that not in the list; and element [2] is the
89 * first code point beyond that that is in the list. In other words, the first
90 * range is invlist[0]..(invlist[1]-1), and all code points in that range are
91 * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and
92 * all code points in that range are not in the inversion list. The third
93 * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
94 * list, and so forth. Thus every element whose index is divisible by two
95 * gives the beginning of a range that is in the list, and every element whose
96 * index is not divisible by two gives the beginning of a range not in the
97 * list. If the final element's index is divisible by two, the inversion list
98 * extends to the platform's infinity; otherwise the highest code point in the
99 * inversion list is the contents of that element minus 1.
100 *
101 * A range that contains just a single code point N will look like
102 * invlist[i] == N
103 * invlist[i+1] == N+1
104 *
105 * If N is UV_MAX (the highest representable code point on the machine), N+1 is
106 * impossible to represent, so element [i+1] is omitted. The single element
107 * inversion list
108 * invlist[0] == UV_MAX
109 * contains just UV_MAX, but is interpreted as matching to infinity.
110 *
111 * Taking the complement (inverting) an inversion list is quite simple, if the
112 * first element is 0, remove it; otherwise add a 0 element at the beginning.
113 * This implementation reserves an element at the beginning of each inversion
114 * list to always contain 0; there is an additional flag in the header which
115 * indicates if the list begins at the 0, or is offset to begin at the next
116 * element. This means that the inversion list can be inverted without any
117 * copying; just flip the flag.
118 *
119 * More about inversion lists can be found in "Unicode Demystified"
120 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
121 *
122 * The inversion list data structure is currently implemented as an SV pointing
123 * to an array of UVs that the SV thinks are bytes. This allows us to have an
124 * array of UV whose memory management is automatically handled by the existing
125 * facilities for SV's.
126 *
127 * Some of the methods should always be private to the implementation, and some
128 * should eventually be made public */
129
130 /* The header definitions are in F<invlist_inline.h> */
131
132 #ifndef PERL_IN_XSUB_RE
133
134 PERL_STATIC_INLINE UV*
S__invlist_array_init(SV * const invlist,const bool will_have_0)135 S__invlist_array_init(SV* const invlist, const bool will_have_0)
136 {
137 /* Returns a pointer to the first element in the inversion list's array.
138 * This is called upon initialization of an inversion list. Where the
139 * array begins depends on whether the list has the code point U+0000 in it
140 * or not. The other parameter tells it whether the code that follows this
141 * call is about to put a 0 in the inversion list or not. The first
142 * element is either the element reserved for 0, if TRUE, or the element
143 * after it, if FALSE */
144
145 bool* offset = get_invlist_offset_addr(invlist);
146 UV* zero_addr = (UV *) SvPVX(invlist);
147
148 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
149
150 /* Must be empty */
151 assert(! _invlist_len(invlist));
152
153 *zero_addr = 0;
154
155 /* 1^1 = 0; 1^0 = 1 */
156 *offset = 1 ^ will_have_0;
157 return zero_addr + *offset;
158 }
159
160 STATIC void
S_invlist_replace_list_destroys_src(pTHX_ SV * dest,SV * src)161 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
162 {
163 /* Replaces the inversion list in 'dest' with the one from 'src'. It
164 * steals the list from 'src', so 'src' is made to have a NULL list. This
165 * is similar to what SvSetMagicSV() would do, if it were implemented on
166 * inversion lists, though this routine avoids a copy */
167
168 const UV src_len = _invlist_len(src);
169 const bool src_offset = *get_invlist_offset_addr(src);
170 const STRLEN src_byte_len = SvLEN(src);
171 char * array = SvPVX(src);
172
173 #ifndef NO_TAINT_SUPPORT
174 const int oldtainted = TAINT_get;
175 #endif
176
177 PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
178
179 assert(is_invlist(src));
180 assert(is_invlist(dest));
181 assert(! invlist_is_iterating(src));
182 assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
183
184 /* Make sure it ends in the right place with a NUL, as our inversion list
185 * manipulations aren't careful to keep this true, but sv_usepvn_flags()
186 * asserts it */
187 array[src_byte_len - 1] = '\0';
188
189 TAINT_NOT; /* Otherwise it breaks */
190 sv_usepvn_flags(dest,
191 (char *) array,
192 src_byte_len - 1,
193
194 /* This flag is documented to cause a copy to be avoided */
195 SV_HAS_TRAILING_NUL);
196 TAINT_set(oldtainted);
197 SvPV_set(src, 0);
198 SvLEN_set(src, 0);
199 SvCUR_set(src, 0);
200
201 /* Finish up copying over the other fields in an inversion list */
202 *get_invlist_offset_addr(dest) = src_offset;
203 invlist_set_len(dest, src_len, src_offset);
204 *get_invlist_previous_index_addr(dest) = 0;
205 invlist_iterfinish(dest);
206 }
207
208 PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(SV * invlist)209 S_get_invlist_previous_index_addr(SV* invlist)
210 {
211 /* Return the address of the IV that is reserved to hold the cached index
212 * */
213 PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
214
215 assert(is_invlist(invlist));
216
217 return &(((XINVLIST*) SvANY(invlist))->prev_index);
218 }
219
220 PERL_STATIC_INLINE IV
S_invlist_previous_index(SV * const invlist)221 S_invlist_previous_index(SV* const invlist)
222 {
223 /* Returns cached index of previous search */
224
225 PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
226
227 return *get_invlist_previous_index_addr(invlist);
228 }
229
230 PERL_STATIC_INLINE void
S_invlist_set_previous_index(SV * const invlist,const IV index)231 S_invlist_set_previous_index(SV* const invlist, const IV index)
232 {
233 /* Caches <index> for later retrieval */
234
235 PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
236
237 assert(index == 0 || index < (int) _invlist_len(invlist));
238
239 *get_invlist_previous_index_addr(invlist) = index;
240 }
241
242 PERL_STATIC_INLINE void
S_invlist_trim(SV * invlist)243 S_invlist_trim(SV* invlist)
244 {
245 /* Free the not currently-being-used space in an inversion list */
246
247 /* But don't free up the space needed for the 0 UV that is always at the
248 * beginning of the list, nor the trailing NUL */
249 const UV min_size = TO_INTERNAL_SIZE(1) + 1;
250
251 PERL_ARGS_ASSERT_INVLIST_TRIM;
252
253 assert(is_invlist(invlist));
254
255 SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
256 }
257
258 PERL_STATIC_INLINE void
S_invlist_clear(pTHX_ SV * invlist)259 S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */
260 {
261 PERL_ARGS_ASSERT_INVLIST_CLEAR;
262
263 assert(is_invlist(invlist));
264
265 invlist_set_len(invlist, 0, 0);
266 invlist_trim(invlist);
267 }
268
269 PERL_STATIC_INLINE UV
S_invlist_max(const SV * const invlist)270 S_invlist_max(const SV* const invlist)
271 {
272 /* Returns the maximum number of elements storable in the inversion list's
273 * array, without having to realloc() */
274
275 PERL_ARGS_ASSERT_INVLIST_MAX;
276
277 assert(is_invlist(invlist));
278
279 /* Assumes worst case, in which the 0 element is not counted in the
280 * inversion list, so subtracts 1 for that */
281 return SvLEN(invlist) == 0 /* This happens under _new_invlist_C_array */
282 ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
283 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
284 }
285
286 STATIC void
S_initialize_invlist_guts(pTHX_ SV * invlist,const Size_t initial_size)287 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
288 {
289 PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
290
291 /* First 1 is in case the zero element isn't in the list; second 1 is for
292 * trailing NUL */
293 SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
294 invlist_set_len(invlist, 0, 0);
295
296 /* Force iterinit() to be used to get iteration to work */
297 invlist_iterfinish(invlist);
298
299 *get_invlist_previous_index_addr(invlist) = 0;
300 SvPOK_on(invlist); /* This allows B to extract the PV */
301 }
302
303 SV*
Perl__new_invlist(pTHX_ IV initial_size)304 Perl__new_invlist(pTHX_ IV initial_size)
305 {
306
307 /* Return a pointer to a newly constructed inversion list, with enough
308 * space to store 'initial_size' elements. If that number is negative, a
309 * system default is used instead */
310
311 SV* new_list;
312
313 if (initial_size < 0) {
314 initial_size = 10;
315 }
316
317 new_list = newSV_type(SVt_INVLIST);
318 initialize_invlist_guts(new_list, initial_size);
319
320 return new_list;
321 }
322
323 SV*
Perl__new_invlist_C_array(pTHX_ const UV * const list)324 Perl__new_invlist_C_array(pTHX_ const UV* const list)
325 {
326 /* Return a pointer to a newly constructed inversion list, initialized to
327 * point to <list>, which has to be in the exact correct inversion list
328 * form, including internal fields. Thus this is a dangerous routine that
329 * should not be used in the wrong hands. The passed in 'list' contains
330 * several header fields at the beginning that are not part of the
331 * inversion list body proper */
332
333 const STRLEN length = (STRLEN) list[0];
334 const UV version_id = list[1];
335 const bool offset = cBOOL(list[2]);
336 #define HEADER_LENGTH 3
337 /* If any of the above changes in any way, you must change HEADER_LENGTH
338 * (if appropriate) and regenerate INVLIST_VERSION_ID by running
339 * perl -E 'say int(rand 2**31-1)'
340 */
341 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
342 data structure type, so that one being
343 passed in can be validated to be an
344 inversion list of the correct vintage.
345 */
346
347 SV* invlist = newSV_type(SVt_INVLIST);
348
349 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
350
351 if (version_id != INVLIST_VERSION_ID) {
352 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
353 }
354
355 /* The generated array passed in includes header elements that aren't part
356 * of the list proper, so start it just after them */
357 SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
358
359 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
360 shouldn't touch it */
361
362 *(get_invlist_offset_addr(invlist)) = offset;
363
364 /* The 'length' passed to us is the physical number of elements in the
365 * inversion list. But if there is an offset the logical number is one
366 * less than that */
367 invlist_set_len(invlist, length - offset, offset);
368
369 invlist_set_previous_index(invlist, 0);
370
371 /* Initialize the iteration pointer. */
372 invlist_iterfinish(invlist);
373
374 SvREADONLY_on(invlist);
375 SvPOK_on(invlist);
376
377 return invlist;
378 }
379
380 STATIC void
S__append_range_to_invlist(pTHX_ SV * const invlist,const UV start,const UV end)381 S__append_range_to_invlist(pTHX_ SV* const invlist,
382 const UV start, const UV end)
383 {
384 /* Subject to change or removal. Append the range from 'start' to 'end' at
385 * the end of the inversion list. The range must be above any existing
386 * ones. */
387
388 UV* array;
389 UV max = invlist_max(invlist);
390 UV len = _invlist_len(invlist);
391 bool offset;
392
393 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
394
395 if (len == 0) { /* Empty lists must be initialized */
396 offset = start != 0;
397 array = _invlist_array_init(invlist, ! offset);
398 }
399 else {
400 /* Here, the existing list is non-empty. The current max entry in the
401 * list is generally the first value not in the set, except when the
402 * set extends to the end of permissible values, in which case it is
403 * the first entry in that final set, and so this call is an attempt to
404 * append out-of-order */
405
406 UV final_element = len - 1;
407 array = invlist_array(invlist);
408 if ( array[final_element] > start
409 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
410 {
411 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
412 array[final_element], start,
413 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
414 }
415
416 /* Here, it is a legal append. If the new range begins 1 above the end
417 * of the range below it, it is extending the range below it, so the
418 * new first value not in the set is one greater than the newly
419 * extended range. */
420 offset = *get_invlist_offset_addr(invlist);
421 if (array[final_element] == start) {
422 if (end != UV_MAX) {
423 array[final_element] = end + 1;
424 }
425 else {
426 /* But if the end is the maximum representable on the machine,
427 * assume that infinity was actually what was meant. Just let
428 * the range that this would extend to have no end */
429 invlist_set_len(invlist, len - 1, offset);
430 }
431 return;
432 }
433 }
434
435 /* Here the new range doesn't extend any existing set. Add it */
436
437 len += 2; /* Includes an element each for the start and end of range */
438
439 /* If wll overflow the existing space, extend, which may cause the array to
440 * be moved */
441 if (max < len) {
442 invlist_extend(invlist, len);
443
444 /* Have to set len here to avoid assert failure in invlist_array() */
445 invlist_set_len(invlist, len, offset);
446
447 array = invlist_array(invlist);
448 }
449 else {
450 invlist_set_len(invlist, len, offset);
451 }
452
453 /* The next item on the list starts the range, the one after that is
454 * one past the new range. */
455 array[len - 2] = start;
456 if (end != UV_MAX) {
457 array[len - 1] = end + 1;
458 }
459 else {
460 /* But if the end is the maximum representable on the machine, just let
461 * the range have no end */
462 invlist_set_len(invlist, len - 1, offset);
463 }
464 }
465
466 SSize_t
Perl__invlist_search(SV * const invlist,const UV cp)467 Perl__invlist_search(SV* const invlist, const UV cp)
468 {
469 /* Searches the inversion list for the entry that contains the input code
470 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
471 * return value is the index into the list's array of the range that
472 * contains <cp>, that is, 'i' such that
473 * array[i] <= cp < array[i+1]
474 */
475
476 IV low = 0;
477 IV mid;
478 IV high = _invlist_len(invlist);
479 const IV highest_element = high - 1;
480 const UV* array;
481
482 PERL_ARGS_ASSERT__INVLIST_SEARCH;
483
484 /* If list is empty, return failure. */
485 if (UNLIKELY(high == 0)) {
486 return -1;
487 }
488
489 /* (We can't get the array unless we know the list is non-empty) */
490 array = invlist_array(invlist);
491
492 mid = invlist_previous_index(invlist);
493 assert(mid >=0);
494 if (UNLIKELY(mid > highest_element)) {
495 mid = highest_element;
496 }
497
498 /* <mid> contains the cache of the result of the previous call to this
499 * function (0 the first time). See if this call is for the same result,
500 * or if it is for mid-1. This is under the theory that calls to this
501 * function will often be for related code points that are near each other.
502 * And benchmarks show that caching gives better results. We also test
503 * here if the code point is within the bounds of the list. These tests
504 * replace others that would have had to be made anyway to make sure that
505 * the array bounds were not exceeded, and these give us extra information
506 * at the same time */
507 if (cp >= array[mid]) {
508 if (cp >= array[highest_element]) {
509 return highest_element;
510 }
511
512 /* Here, array[mid] <= cp < array[highest_element]. This means that
513 * the final element is not the answer, so can exclude it; it also
514 * means that <mid> is not the final element, so can refer to 'mid + 1'
515 * safely */
516 if (cp < array[mid + 1]) {
517 return mid;
518 }
519 high--;
520 low = mid + 1;
521 }
522 else { /* cp < aray[mid] */
523 if (cp < array[0]) { /* Fail if outside the array */
524 return -1;
525 }
526 high = mid;
527 if (cp >= array[mid - 1]) {
528 goto found_entry;
529 }
530 }
531
532 /* Binary search. What we are looking for is <i> such that
533 * array[i] <= cp < array[i+1]
534 * The loop below converges on the i+1. Note that there may not be an
535 * (i+1)th element in the array, and things work nonetheless */
536 while (low < high) {
537 mid = (low + high) / 2;
538 assert(mid <= highest_element);
539 if (array[mid] <= cp) { /* cp >= array[mid] */
540 low = mid + 1;
541
542 /* We could do this extra test to exit the loop early.
543 if (cp < array[low]) {
544 return mid;
545 }
546 */
547 }
548 else { /* cp < array[mid] */
549 high = mid;
550 }
551 }
552
553 found_entry:
554 high--;
555 invlist_set_previous_index(invlist, high);
556 return high;
557 }
558
559 void
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** output)560 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
561 const bool complement_b, SV** output)
562 {
563 /* Take the union of two inversion lists and point '*output' to it. On
564 * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
565 * even 'a' or 'b'). If to an inversion list, the contents of the original
566 * list will be replaced by the union. The first list, 'a', may be
567 * NULL, in which case a copy of the second list is placed in '*output'.
568 * If 'complement_b' is TRUE, the union is taken of the complement
569 * (inversion) of 'b' instead of b itself.
570 *
571 * The basis for this comes from "Unicode Demystified" Chapter 13 by
572 * Richard Gillam, published by Addison-Wesley, and explained at some
573 * length there. The preface says to incorporate its examples into your
574 * code at your own risk.
575 *
576 * The algorithm is like a merge sort. */
577
578 const UV* array_a; /* a's array */
579 const UV* array_b;
580 UV len_a; /* length of a's array */
581 UV len_b;
582
583 SV* u; /* the resulting union */
584 UV* array_u;
585 UV len_u = 0;
586
587 UV i_a = 0; /* current index into a's array */
588 UV i_b = 0;
589 UV i_u = 0;
590
591 /* running count, as explained in the algorithm source book; items are
592 * stopped accumulating and are output when the count changes to/from 0.
593 * The count is incremented when we start a range that's in an input's set,
594 * and decremented when we start a range that's not in a set. So this
595 * variable can be 0, 1, or 2. When it is 0 neither input is in their set,
596 * and hence nothing goes into the union; 1, just one of the inputs is in
597 * its set (and its current range gets added to the union); and 2 when both
598 * inputs are in their sets. */
599 UV count = 0;
600
601 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
602 assert(a != b);
603 assert(*output == NULL || is_invlist(*output));
604
605 len_b = _invlist_len(b);
606 if (len_b == 0) {
607
608 /* Here, 'b' is empty, hence it's complement is all possible code
609 * points. So if the union includes the complement of 'b', it includes
610 * everything, and we need not even look at 'a'. It's easiest to
611 * create a new inversion list that matches everything. */
612 if (complement_b) {
613 SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
614
615 if (*output == NULL) { /* If the output didn't exist, just point it
616 at the new list */
617 *output = everything;
618 }
619 else { /* Otherwise, replace its contents with the new list */
620 invlist_replace_list_destroys_src(*output, everything);
621 SvREFCNT_dec_NN(everything);
622 }
623
624 return;
625 }
626
627 /* Here, we don't want the complement of 'b', and since 'b' is empty,
628 * the union will come entirely from 'a'. If 'a' is NULL or empty, the
629 * output will be empty */
630
631 if (a == NULL || _invlist_len(a) == 0) {
632 if (*output == NULL) {
633 *output = _new_invlist(0);
634 }
635 else {
636 invlist_clear(*output);
637 }
638 return;
639 }
640
641 /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
642 * union. We can just return a copy of 'a' if '*output' doesn't point
643 * to an existing list */
644 if (*output == NULL) {
645 *output = invlist_clone(a, NULL);
646 return;
647 }
648
649 /* If the output is to overwrite 'a', we have a no-op, as it's
650 * already in 'a' */
651 if (*output == a) {
652 return;
653 }
654
655 /* Here, '*output' is to be overwritten by 'a' */
656 u = invlist_clone(a, NULL);
657 invlist_replace_list_destroys_src(*output, u);
658 SvREFCNT_dec_NN(u);
659
660 return;
661 }
662
663 /* Here 'b' is not empty. See about 'a' */
664
665 if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
666
667 /* Here, 'a' is empty (and b is not). That means the union will come
668 * entirely from 'b'. If '*output' is NULL, we can directly return a
669 * clone of 'b'. Otherwise, we replace the contents of '*output' with
670 * the clone */
671
672 SV ** dest = (*output == NULL) ? output : &u;
673 *dest = invlist_clone(b, NULL);
674 if (complement_b) {
675 _invlist_invert(*dest);
676 }
677
678 if (dest == &u) {
679 invlist_replace_list_destroys_src(*output, u);
680 SvREFCNT_dec_NN(u);
681 }
682
683 return;
684 }
685
686 /* Here both lists exist and are non-empty */
687 array_a = invlist_array(a);
688 array_b = invlist_array(b);
689
690 /* If are to take the union of 'a' with the complement of b, set it
691 * up so are looking at b's complement. */
692 if (complement_b) {
693
694 /* To complement, we invert: if the first element is 0, remove it. To
695 * do this, we just pretend the array starts one later */
696 if (array_b[0] == 0) {
697 array_b++;
698 len_b--;
699 }
700 else {
701
702 /* But if the first element is not zero, we pretend the list starts
703 * at the 0 that is always stored immediately before the array. */
704 array_b--;
705 len_b++;
706 }
707 }
708
709 /* Size the union for the worst case: that the sets are completely
710 * disjoint */
711 u = _new_invlist(len_a + len_b);
712
713 /* Will contain U+0000 if either component does */
714 array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0)
715 || (len_b > 0 && array_b[0] == 0));
716
717 /* Go through each input list item by item, stopping when have exhausted
718 * one of them */
719 while (i_a < len_a && i_b < len_b) {
720 UV cp; /* The element to potentially add to the union's array */
721 bool cp_in_set; /* is it in the input list's set or not */
722
723 /* We need to take one or the other of the two inputs for the union.
724 * Since we are merging two sorted lists, we take the smaller of the
725 * next items. In case of a tie, we take first the one that is in its
726 * set. If we first took the one not in its set, it would decrement
727 * the count, possibly to 0 which would cause it to be output as ending
728 * the range, and the next time through we would take the same number,
729 * and output it again as beginning the next range. By doing it the
730 * opposite way, there is no possibility that the count will be
731 * momentarily decremented to 0, and thus the two adjoining ranges will
732 * be seamlessly merged. (In a tie and both are in the set or both not
733 * in the set, it doesn't matter which we take first.) */
734 if ( array_a[i_a] < array_b[i_b]
735 || ( array_a[i_a] == array_b[i_b]
736 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
737 {
738 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
739 cp = array_a[i_a++];
740 }
741 else {
742 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
743 cp = array_b[i_b++];
744 }
745
746 /* Here, have chosen which of the two inputs to look at. Only output
747 * if the running count changes to/from 0, which marks the
748 * beginning/end of a range that's in the set */
749 if (cp_in_set) {
750 if (count == 0) {
751 array_u[i_u++] = cp;
752 }
753 count++;
754 }
755 else {
756 count--;
757 if (count == 0) {
758 array_u[i_u++] = cp;
759 }
760 }
761 }
762
763
764 /* The loop above increments the index into exactly one of the input lists
765 * each iteration, and ends when either index gets to its list end. That
766 * means the other index is lower than its end, and so something is
767 * remaining in that one. We decrement 'count', as explained below, if
768 * that list is in its set. (i_a and i_b each currently index the element
769 * beyond the one we care about.) */
770 if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
771 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
772 {
773 count--;
774 }
775
776 /* Above we decremented 'count' if the list that had unexamined elements in
777 * it was in its set. This has made it so that 'count' being non-zero
778 * means there isn't anything left to output; and 'count' equal to 0 means
779 * that what is left to output is precisely that which is left in the
780 * non-exhausted input list.
781 *
782 * To see why, note first that the exhausted input obviously has nothing
783 * left to add to the union. If it was in its set at its end, that means
784 * the set extends from here to the platform's infinity, and hence so does
785 * the union and the non-exhausted set is irrelevant. The exhausted set
786 * also contributed 1 to 'count'. If 'count' was 2, it got decremented to
787 * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
788 * 'count' remains at 1. This is consistent with the decremented 'count'
789 * != 0 meaning there's nothing left to add to the union.
790 *
791 * But if the exhausted input wasn't in its set, it contributed 0 to
792 * 'count', and the rest of the union will be whatever the other input is.
793 * If 'count' was 0, neither list was in its set, and 'count' remains 0;
794 * otherwise it gets decremented to 0. This is consistent with 'count'
795 * == 0 meaning the remainder of the union is whatever is left in the
796 * non-exhausted list. */
797 if (count != 0) {
798 len_u = i_u;
799 }
800 else {
801 IV copy_count = len_a - i_a;
802 if (copy_count > 0) { /* The non-exhausted input is 'a' */
803 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
804 }
805 else { /* The non-exhausted input is b */
806 copy_count = len_b - i_b;
807 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
808 }
809 len_u = i_u + copy_count;
810 }
811
812 /* Set the result to the final length, which can change the pointer to
813 * array_u, so re-find it. (Note that it is unlikely that this will
814 * change, as we are shrinking the space, not enlarging it) */
815 if (len_u != _invlist_len(u)) {
816 invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
817 invlist_trim(u);
818 array_u = invlist_array(u);
819 }
820
821 if (*output == NULL) { /* Simply return the new inversion list */
822 *output = u;
823 }
824 else {
825 /* Otherwise, overwrite the inversion list that was in '*output'. We
826 * could instead free '*output', and then set it to 'u', but experience
827 * has shown [perl #127392] that if the input is a mortal, we can get a
828 * huge build-up of these during regex compilation before they get
829 * freed. */
830 invlist_replace_list_destroys_src(*output, u);
831 SvREFCNT_dec_NN(u);
832 }
833
834 return;
835 }
836
837 void
Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** i)838 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
839 const bool complement_b, SV** i)
840 {
841 /* Take the intersection of two inversion lists and point '*i' to it. On
842 * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
843 * even 'a' or 'b'). If to an inversion list, the contents of the original
844 * list will be replaced by the intersection. The first list, 'a', may be
845 * NULL, in which case '*i' will be an empty list. If 'complement_b' is
846 * TRUE, the result will be the intersection of 'a' and the complement (or
847 * inversion) of 'b' instead of 'b' directly.
848 *
849 * The basis for this comes from "Unicode Demystified" Chapter 13 by
850 * Richard Gillam, published by Addison-Wesley, and explained at some
851 * length there. The preface says to incorporate its examples into your
852 * code at your own risk. In fact, it had bugs
853 *
854 * The algorithm is like a merge sort, and is essentially the same as the
855 * union above
856 */
857
858 const UV* array_a; /* a's array */
859 const UV* array_b;
860 UV len_a; /* length of a's array */
861 UV len_b;
862
863 SV* r; /* the resulting intersection */
864 UV* array_r;
865 UV len_r = 0;
866
867 UV i_a = 0; /* current index into a's array */
868 UV i_b = 0;
869 UV i_r = 0;
870
871 /* running count of how many of the two inputs are postitioned at ranges
872 * that are in their sets. As explained in the algorithm source book,
873 * items are stopped accumulating and are output when the count changes
874 * to/from 2. The count is incremented when we start a range that's in an
875 * input's set, and decremented when we start a range that's not in a set.
876 * Only when it is 2 are we in the intersection. */
877 UV count = 0;
878
879 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
880 assert(a != b);
881 assert(*i == NULL || is_invlist(*i));
882
883 /* Special case if either one is empty */
884 len_a = (a == NULL) ? 0 : _invlist_len(a);
885 if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
886 if (len_a != 0 && complement_b) {
887
888 /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
889 * must be empty. Here, also we are using 'b's complement, which
890 * hence must be every possible code point. Thus the intersection
891 * is simply 'a'. */
892
893 if (*i == a) { /* No-op */
894 return;
895 }
896
897 if (*i == NULL) {
898 *i = invlist_clone(a, NULL);
899 return;
900 }
901
902 r = invlist_clone(a, NULL);
903 invlist_replace_list_destroys_src(*i, r);
904 SvREFCNT_dec_NN(r);
905 return;
906 }
907
908 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
909 * intersection must be empty */
910 if (*i == NULL) {
911 *i = _new_invlist(0);
912 return;
913 }
914
915 invlist_clear(*i);
916 return;
917 }
918
919 /* Here both lists exist and are non-empty */
920 array_a = invlist_array(a);
921 array_b = invlist_array(b);
922
923 /* If are to take the intersection of 'a' with the complement of b, set it
924 * up so are looking at b's complement. */
925 if (complement_b) {
926
927 /* To complement, we invert: if the first element is 0, remove it. To
928 * do this, we just pretend the array starts one later */
929 if (array_b[0] == 0) {
930 array_b++;
931 len_b--;
932 }
933 else {
934
935 /* But if the first element is not zero, we pretend the list starts
936 * at the 0 that is always stored immediately before the array. */
937 array_b--;
938 len_b++;
939 }
940 }
941
942 /* Size the intersection for the worst case: that the intersection ends up
943 * fragmenting everything to be completely disjoint */
944 r= _new_invlist(len_a + len_b);
945
946 /* Will contain U+0000 iff both components do */
947 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
948 && len_b > 0 && array_b[0] == 0);
949
950 /* Go through each list item by item, stopping when have exhausted one of
951 * them */
952 while (i_a < len_a && i_b < len_b) {
953 UV cp; /* The element to potentially add to the intersection's
954 array */
955 bool cp_in_set; /* Is it in the input list's set or not */
956
957 /* We need to take one or the other of the two inputs for the
958 * intersection. Since we are merging two sorted lists, we take the
959 * smaller of the next items. In case of a tie, we take first the one
960 * that is not in its set (a difference from the union algorithm). If
961 * we first took the one in its set, it would increment the count,
962 * possibly to 2 which would cause it to be output as starting a range
963 * in the intersection, and the next time through we would take that
964 * same number, and output it again as ending the set. By doing the
965 * opposite of this, there is no possibility that the count will be
966 * momentarily incremented to 2. (In a tie and both are in the set or
967 * both not in the set, it doesn't matter which we take first.) */
968 if ( array_a[i_a] < array_b[i_b]
969 || ( array_a[i_a] == array_b[i_b]
970 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
971 {
972 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
973 cp = array_a[i_a++];
974 }
975 else {
976 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
977 cp= array_b[i_b++];
978 }
979
980 /* Here, have chosen which of the two inputs to look at. Only output
981 * if the running count changes to/from 2, which marks the
982 * beginning/end of a range that's in the intersection */
983 if (cp_in_set) {
984 count++;
985 if (count == 2) {
986 array_r[i_r++] = cp;
987 }
988 }
989 else {
990 if (count == 2) {
991 array_r[i_r++] = cp;
992 }
993 count--;
994 }
995
996 }
997
998 /* The loop above increments the index into exactly one of the input lists
999 * each iteration, and ends when either index gets to its list end. That
1000 * means the other index is lower than its end, and so something is
1001 * remaining in that one. We increment 'count', as explained below, if the
1002 * exhausted list was in its set. (i_a and i_b each currently index the
1003 * element beyond the one we care about.) */
1004 if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
1005 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
1006 {
1007 count++;
1008 }
1009
1010 /* Above we incremented 'count' if the exhausted list was in its set. This
1011 * has made it so that 'count' being below 2 means there is nothing left to
1012 * output; otheriwse what's left to add to the intersection is precisely
1013 * that which is left in the non-exhausted input list.
1014 *
1015 * To see why, note first that the exhausted input obviously has nothing
1016 * left to affect the intersection. If it was in its set at its end, that
1017 * means the set extends from here to the platform's infinity, and hence
1018 * anything in the non-exhausted's list will be in the intersection, and
1019 * anything not in it won't be. Hence, the rest of the intersection is
1020 * precisely what's in the non-exhausted list The exhausted set also
1021 * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing
1022 * it means 'count' is now at least 2. This is consistent with the
1023 * incremented 'count' being >= 2 means to add the non-exhausted list to
1024 * the intersection.
1025 *
1026 * But if the exhausted input wasn't in its set, it contributed 0 to
1027 * 'count', and the intersection can't include anything further; the
1028 * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get
1029 * incremented. This is consistent with 'count' being < 2 meaning nothing
1030 * further to add to the intersection. */
1031 if (count < 2) { /* Nothing left to put in the intersection. */
1032 len_r = i_r;
1033 }
1034 else { /* copy the non-exhausted list, unchanged. */
1035 IV copy_count = len_a - i_a;
1036 if (copy_count > 0) { /* a is the one with stuff left */
1037 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
1038 }
1039 else { /* b is the one with stuff left */
1040 copy_count = len_b - i_b;
1041 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
1042 }
1043 len_r = i_r + copy_count;
1044 }
1045
1046 /* Set the result to the final length, which can change the pointer to
1047 * array_r, so re-find it. (Note that it is unlikely that this will
1048 * change, as we are shrinking the space, not enlarging it) */
1049 if (len_r != _invlist_len(r)) {
1050 invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
1051 invlist_trim(r);
1052 array_r = invlist_array(r);
1053 }
1054
1055 if (*i == NULL) { /* Simply return the calculated intersection */
1056 *i = r;
1057 }
1058 else { /* Otherwise, replace the existing inversion list in '*i'. We could
1059 instead free '*i', and then set it to 'r', but experience has
1060 shown [perl #127392] that if the input is a mortal, we can get a
1061 huge build-up of these during regex compilation before they get
1062 freed. */
1063 if (len_r) {
1064 invlist_replace_list_destroys_src(*i, r);
1065 }
1066 else {
1067 invlist_clear(*i);
1068 }
1069 SvREFCNT_dec_NN(r);
1070 }
1071
1072 return;
1073 }
1074
1075 SV*
Perl__add_range_to_invlist(pTHX_ SV * invlist,UV start,UV end)1076 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
1077 {
1078 /* Add the range from 'start' to 'end' inclusive to the inversion list's
1079 * set. A pointer to the inversion list is returned. This may actually be
1080 * a new list, in which case the passed in one has been destroyed. The
1081 * passed-in inversion list can be NULL, in which case a new one is created
1082 * with just the one range in it. The new list is not necessarily
1083 * NUL-terminated. Space is not freed if the inversion list shrinks as a
1084 * result of this function. The gain would not be large, and in many
1085 * cases, this is called multiple times on a single inversion list, so
1086 * anything freed may almost immediately be needed again.
1087 *
1088 * This used to mostly call the 'union' routine, but that is much more
1089 * heavyweight than really needed for a single range addition */
1090
1091 UV* array; /* The array implementing the inversion list */
1092 UV len; /* How many elements in 'array' */
1093 SSize_t i_s; /* index into the invlist array where 'start'
1094 should go */
1095 SSize_t i_e = 0; /* And the index where 'end' should go */
1096 UV cur_highest; /* The highest code point in the inversion list
1097 upon entry to this function */
1098
1099 /* This range becomes the whole inversion list if none already existed */
1100 if (invlist == NULL) {
1101 invlist = _new_invlist(2);
1102 _append_range_to_invlist(invlist, start, end);
1103 return invlist;
1104 }
1105
1106 /* Likewise, if the inversion list is currently empty */
1107 len = _invlist_len(invlist);
1108 if (len == 0) {
1109 _append_range_to_invlist(invlist, start, end);
1110 return invlist;
1111 }
1112
1113 /* Starting here, we have to know the internals of the list */
1114 array = invlist_array(invlist);
1115
1116 /* If the new range ends higher than the current highest ... */
1117 cur_highest = invlist_highest(invlist);
1118 if (end > cur_highest) {
1119
1120 /* If the whole range is higher, we can just append it */
1121 if (start > cur_highest) {
1122 _append_range_to_invlist(invlist, start, end);
1123 return invlist;
1124 }
1125
1126 /* Otherwise, add the portion that is higher ... */
1127 _append_range_to_invlist(invlist, cur_highest + 1, end);
1128
1129 /* ... and continue on below to handle the rest. As a result of the
1130 * above append, we know that the index of the end of the range is the
1131 * final even numbered one of the array. Recall that the final element
1132 * always starts a range that extends to infinity. If that range is in
1133 * the set (meaning the set goes from here to infinity), it will be an
1134 * even index, but if it isn't in the set, it's odd, and the final
1135 * range in the set is one less, which is even. */
1136 if (end == UV_MAX) {
1137 i_e = len;
1138 }
1139 else {
1140 i_e = len - 2;
1141 }
1142 }
1143
1144 /* We have dealt with appending, now see about prepending. If the new
1145 * range starts lower than the current lowest ... */
1146 if (start < array[0]) {
1147
1148 /* Adding something which has 0 in it is somewhat tricky, and uncommon.
1149 * Let the union code handle it, rather than having to know the
1150 * trickiness in two code places. */
1151 if (UNLIKELY(start == 0)) {
1152 SV* range_invlist;
1153
1154 range_invlist = _new_invlist(2);
1155 _append_range_to_invlist(range_invlist, start, end);
1156
1157 _invlist_union(invlist, range_invlist, &invlist);
1158
1159 SvREFCNT_dec_NN(range_invlist);
1160
1161 return invlist;
1162 }
1163
1164 /* If the whole new range comes before the first entry, and doesn't
1165 * extend it, we have to insert it as an additional range */
1166 if (end < array[0] - 1) {
1167 i_s = i_e = -1;
1168 goto splice_in_new_range;
1169 }
1170
1171 /* Here the new range adjoins the existing first range, extending it
1172 * downwards. */
1173 array[0] = start;
1174
1175 /* And continue on below to handle the rest. We know that the index of
1176 * the beginning of the range is the first one of the array */
1177 i_s = 0;
1178 }
1179 else { /* Not prepending any part of the new range to the existing list.
1180 * Find where in the list it should go. This finds i_s, such that:
1181 * invlist[i_s] <= start < array[i_s+1]
1182 */
1183 i_s = _invlist_search(invlist, start);
1184 }
1185
1186 /* At this point, any extending before the beginning of the inversion list
1187 * and/or after the end has been done. This has made it so that, in the
1188 * code below, each endpoint of the new range is either in a range that is
1189 * in the set, or is in a gap between two ranges that are. This means we
1190 * don't have to worry about exceeding the array bounds.
1191 *
1192 * Find where in the list the new range ends (but we can skip this if we
1193 * have already determined what it is, or if it will be the same as i_s,
1194 * which we already have computed) */
1195 if (i_e == 0) {
1196 i_e = (start == end)
1197 ? i_s
1198 : _invlist_search(invlist, end);
1199 }
1200
1201 /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e]
1202 * is a range that goes to infinity there is no element at invlist[i_e+1],
1203 * so only the first relation holds. */
1204
1205 if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
1206
1207 /* Here, the ranges on either side of the beginning of the new range
1208 * are in the set, and this range starts in the gap between them.
1209 *
1210 * The new range extends the range above it downwards if the new range
1211 * ends at or above that range's start */
1212 const bool extends_the_range_above = ( end == UV_MAX
1213 || end + 1 >= array[i_s+1]);
1214
1215 /* The new range extends the range below it upwards if it begins just
1216 * after where that range ends */
1217 if (start == array[i_s]) {
1218
1219 /* If the new range fills the entire gap between the other ranges,
1220 * they will get merged together. Other ranges may also get
1221 * merged, depending on how many of them the new range spans. In
1222 * the general case, we do the merge later, just once, after we
1223 * figure out how many to merge. But in the case where the new
1224 * range exactly spans just this one gap (possibly extending into
1225 * the one above), we do the merge here, and an early exit. This
1226 * is done here to avoid having to special case later. */
1227 if (i_e - i_s <= 1) {
1228
1229 /* If i_e - i_s == 1, it means that the new range terminates
1230 * within the range above, and hence 'extends_the_range_above'
1231 * must be true. (If the range above it extends to infinity,
1232 * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
1233 * will be 0, so no harm done.) */
1234 if (extends_the_range_above) {
1235 Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
1236 invlist_set_len(invlist,
1237 len - 2,
1238 *(get_invlist_offset_addr(invlist)));
1239 return invlist;
1240 }
1241
1242 /* Here, i_e must == i_s. We keep them in sync, as they apply
1243 * to the same range, and below we are about to decrement i_s
1244 * */
1245 i_e--;
1246 }
1247
1248 /* Here, the new range is adjacent to the one below. (It may also
1249 * span beyond the range above, but that will get resolved later.)
1250 * Extend the range below to include this one. */
1251 array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
1252 i_s--;
1253 start = array[i_s];
1254 }
1255 else if (extends_the_range_above) {
1256
1257 /* Here the new range only extends the range above it, but not the
1258 * one below. It merges with the one above. Again, we keep i_e
1259 * and i_s in sync if they point to the same range */
1260 if (i_e == i_s) {
1261 i_e++;
1262 }
1263 i_s++;
1264 array[i_s] = start;
1265 }
1266 }
1267
1268 /* Here, we've dealt with the new range start extending any adjoining
1269 * existing ranges.
1270 *
1271 * If the new range extends to infinity, it is now the final one,
1272 * regardless of what was there before */
1273 if (UNLIKELY(end == UV_MAX)) {
1274 invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
1275 return invlist;
1276 }
1277
1278 /* If i_e started as == i_s, it has also been dealt with,
1279 * and been updated to the new i_s, which will fail the following if */
1280 if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
1281
1282 /* Here, the ranges on either side of the end of the new range are in
1283 * the set, and this range ends in the gap between them.
1284 *
1285 * If this range is adjacent to (hence extends) the range above it, it
1286 * becomes part of that range; likewise if it extends the range below,
1287 * it becomes part of that range */
1288 if (end + 1 == array[i_e+1]) {
1289 i_e++;
1290 array[i_e] = start;
1291 }
1292 else if (start <= array[i_e]) {
1293 array[i_e] = end + 1;
1294 i_e--;
1295 }
1296 }
1297
1298 if (i_s == i_e) {
1299
1300 /* If the range fits entirely in an existing range (as possibly already
1301 * extended above), it doesn't add anything new */
1302 if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
1303 return invlist;
1304 }
1305
1306 /* Here, no part of the range is in the list. Must add it. It will
1307 * occupy 2 more slots */
1308 splice_in_new_range:
1309
1310 invlist_extend(invlist, len + 2);
1311 array = invlist_array(invlist);
1312 /* Move the rest of the array down two slots. Don't include any
1313 * trailing NUL */
1314 Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
1315
1316 /* Do the actual splice */
1317 array[i_e+1] = start;
1318 array[i_e+2] = end + 1;
1319 invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
1320 return invlist;
1321 }
1322
1323 /* Here the new range crossed the boundaries of a pre-existing range. The
1324 * code above has adjusted things so that both ends are in ranges that are
1325 * in the set. This means everything in between must also be in the set.
1326 * Just squash things together */
1327 Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
1328 invlist_set_len(invlist,
1329 len - i_e + i_s,
1330 *(get_invlist_offset_addr(invlist)));
1331
1332 return invlist;
1333 }
1334
1335 SV*
Perl__setup_canned_invlist(pTHX_ const STRLEN size,const UV element0,UV ** other_elements_ptr)1336 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
1337 UV** other_elements_ptr)
1338 {
1339 /* Create and return an inversion list whose contents are to be populated
1340 * by the caller. The caller gives the number of elements (in 'size') and
1341 * the very first element ('element0'). This function will set
1342 * '*other_elements_ptr' to an array of UVs, where the remaining elements
1343 * are to be placed.
1344 *
1345 * Obviously there is some trust involved that the caller will properly
1346 * fill in the other elements of the array.
1347 *
1348 * (The first element needs to be passed in, as the underlying code does
1349 * things differently depending on whether it is zero or non-zero) */
1350
1351 SV* invlist = _new_invlist(size);
1352 bool offset;
1353
1354 PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
1355
1356 invlist = add_cp_to_invlist(invlist, element0);
1357 offset = *get_invlist_offset_addr(invlist);
1358
1359 invlist_set_len(invlist, size, offset);
1360 *other_elements_ptr = invlist_array(invlist) + 1;
1361 return invlist;
1362 }
1363
1364 #endif
1365
1366 #ifndef PERL_IN_XSUB_RE
1367 void
Perl__invlist_invert(pTHX_ SV * const invlist)1368 Perl__invlist_invert(pTHX_ SV* const invlist)
1369 {
1370 /* Complement the input inversion list. This adds a 0 if the list didn't
1371 * have a zero; removes it otherwise. As described above, the data
1372 * structure is set up so that this is very efficient */
1373
1374 PERL_ARGS_ASSERT__INVLIST_INVERT;
1375
1376 assert(! invlist_is_iterating(invlist));
1377
1378 /* The inverse of matching nothing is matching everything */
1379 if (_invlist_len(invlist) == 0) {
1380 _append_range_to_invlist(invlist, 0, UV_MAX);
1381 return;
1382 }
1383
1384 *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
1385 }
1386
1387 SV*
Perl_invlist_clone(pTHX_ SV * const invlist,SV * new_invlist)1388 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
1389 {
1390 /* Return a new inversion list that is a copy of the input one, which is
1391 * unchanged. The new list will not be mortal even if the old one was. */
1392
1393 const STRLEN nominal_length = _invlist_len(invlist);
1394 const STRLEN physical_length = SvCUR(invlist);
1395 const bool offset = *(get_invlist_offset_addr(invlist));
1396
1397 PERL_ARGS_ASSERT_INVLIST_CLONE;
1398
1399 if (new_invlist == NULL) {
1400 new_invlist = _new_invlist(nominal_length);
1401 }
1402 else {
1403 sv_upgrade(new_invlist, SVt_INVLIST);
1404 initialize_invlist_guts(new_invlist, nominal_length);
1405 }
1406
1407 *(get_invlist_offset_addr(new_invlist)) = offset;
1408 invlist_set_len(new_invlist, nominal_length, offset);
1409 Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
1410
1411 return new_invlist;
1412 }
1413
1414 #endif
1415
1416
1417 #ifndef PERL_IN_XSUB_RE
1418 void
Perl__invlist_dump(pTHX_ PerlIO * file,I32 level,const char * const indent,SV * const invlist)1419 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
1420 const char * const indent, SV* const invlist)
1421 {
1422 /* Designed to be called only by do_sv_dump(). Dumps out the ranges of the
1423 * inversion list 'invlist' to 'file' at 'level' Each line is prefixed by
1424 * the string 'indent'. The output looks like this:
1425 [0] 0x000A .. 0x000D
1426 [2] 0x0085
1427 [4] 0x2028 .. 0x2029
1428 [6] 0x3104 .. INFTY
1429 * This means that the first range of code points matched by the list are
1430 * 0xA through 0xD; the second range contains only the single code point
1431 * 0x85, etc. An inversion list is an array of UVs. Two array elements
1432 * are used to define each range (except if the final range extends to
1433 * infinity, only a single element is needed). The array index of the
1434 * first element for the corresponding range is given in brackets. */
1435
1436 UV start, end;
1437 STRLEN count = 0;
1438
1439 PERL_ARGS_ASSERT__INVLIST_DUMP;
1440
1441 if (invlist_is_iterating(invlist)) {
1442 Perl_dump_indent(aTHX_ level, file,
1443 "%sCan't dump inversion list because is in middle of iterating\n",
1444 indent);
1445 return;
1446 }
1447
1448 invlist_iterinit(invlist);
1449 while (invlist_iternext(invlist, &start, &end)) {
1450 if (end == UV_MAX) {
1451 Perl_dump_indent(aTHX_ level, file,
1452 "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
1453 indent, (UV)count, start);
1454 }
1455 else if (end != start) {
1456 Perl_dump_indent(aTHX_ level, file,
1457 "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
1458 indent, (UV)count, start, end);
1459 }
1460 else {
1461 Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
1462 indent, (UV)count, start);
1463 }
1464 count += 2;
1465 }
1466 }
1467
1468 #endif
1469
1470 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
1471 bool
Perl__invlistEQ(pTHX_ SV * const a,SV * const b,const bool complement_b)1472 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
1473 {
1474 /* Return a boolean as to if the two passed in inversion lists are
1475 * identical. The final argument, if TRUE, says to take the complement of
1476 * the second inversion list before doing the comparison */
1477
1478 const UV len_a = _invlist_len(a);
1479 UV len_b = _invlist_len(b);
1480
1481 const UV* array_a = NULL;
1482 const UV* array_b = NULL;
1483
1484 PERL_ARGS_ASSERT__INVLISTEQ;
1485
1486 /* This code avoids accessing the arrays unless it knows the length is
1487 * non-zero */
1488
1489 if (len_a == 0) {
1490 if (len_b == 0) {
1491 return ! complement_b;
1492 }
1493 }
1494 else {
1495 array_a = invlist_array(a);
1496 }
1497
1498 if (len_b != 0) {
1499 array_b = invlist_array(b);
1500 }
1501
1502 /* If are to compare 'a' with the complement of b, set it
1503 * up so are looking at b's complement. */
1504 if (complement_b) {
1505
1506 /* The complement of nothing is everything, so <a> would have to have
1507 * just one element, starting at zero (ending at infinity) */
1508 if (len_b == 0) {
1509 return (len_a == 1 && array_a[0] == 0);
1510 }
1511 if (array_b[0] == 0) {
1512
1513 /* Otherwise, to complement, we invert. Here, the first element is
1514 * 0, just remove it. To do this, we just pretend the array starts
1515 * one later */
1516
1517 array_b++;
1518 len_b--;
1519 }
1520 else {
1521
1522 /* But if the first element is not zero, we pretend the list starts
1523 * at the 0 that is always stored immediately before the array. */
1524 array_b--;
1525 len_b++;
1526 }
1527 }
1528
1529 return len_a == len_b
1530 && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
1531
1532 }
1533 #endif
1534
1535 #undef HEADER_LENGTH
1536 #undef TO_INTERNAL_SIZE
1537 #undef FROM_INTERNAL_SIZE
1538 #undef INVLIST_VERSION_ID
1539
1540 /* End of inversion list object */
1541