xref: /openbsd/gnu/usr.bin/perl/regcomp_invlist.c (revision 5486feef)
1 #ifdef PERL_EXT_RE_BUILD
2 #include "re_top.h"
3 #endif
4 
5 #include "EXTERN.h"
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_INVLIST_C
9 #include "perl.h"
10 
11 #ifdef PERL_IN_XSUB_RE
12 #  include "re_comp.h"
13 #else
14 #  include "regcomp.h"
15 #endif
16 
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
20 
21 #ifdef PERL_RE_BUILD_AUX
22 void
Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist,const UV offset,const U8 * bitmap,const Size_t len)23 Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len)
24 {
25     PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST;
26 
27     /* As the name says.  The zeroth bit corresponds to the code point given by
28      * 'offset' */
29 
30     UV start, end;
31 
32     Zero(bitmap, len, U8);
33 
34     invlist_iterinit(invlist);
35     while (invlist_iternext(invlist, &start, &end)) {
36         assert(start >= offset);
37 
38         for (UV i = start; i <= end; i++) {
39             UV adjusted = i - offset;
40 
41             BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted);
42         }
43     }
44     invlist_iterfinish(invlist);
45 }
46 
47 void
Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap,const Size_t bitmap_len,SV ** invlist,const UV offset)48 Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset)
49 {
50     PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP;
51 
52     /* As the name says.  The zeroth bit corresponds to the code point given by
53      * 'offset' */
54 
55     Size_t i;
56 
57     for (i = 0; i < bitmap_len; i++) {
58         if (BITMAP_TEST(bitmap, i)) {
59             int start = i++;
60 
61             /* Save a little work by adding a range all at once instead of bit
62              * by bit */
63             while (i < bitmap_len && BITMAP_TEST(bitmap, i)) {
64                 i++;
65             }
66 
67             *invlist = _add_range_to_invlist(*invlist,
68                                              start + offset,
69                                              i + offset - 1);
70         }
71     }
72 }
73 #endif /* PERL_RE_BUILD_AUX */
74 
75 /* This section of code defines the inversion list object and its methods.  The
76  * interfaces are highly subject to change, so as much as possible is static to
77  * this file.  An inversion list is here implemented as a malloc'd C UV array
78  * as an SVt_INVLIST scalar.
79  *
80  * An inversion list for Unicode is an array of code points, sorted by ordinal
81  * number.  Each element gives the code point that begins a range that extends
82  * up-to but not including the code point given by the next element.  The final
83  * element gives the first code point of a range that extends to the platform's
84  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
85  * ...) give ranges whose code points are all in the inversion list.  We say
86  * that those ranges are in the set.  The odd-numbered elements give ranges
87  * whose code points are not in the inversion list, and hence not in the set.
88  * Thus, element [0] is the first code point in the list.  Element [1]
89  * is the first code point beyond that not in the list; and element [2] is the
90  * first code point beyond that that is in the list.  In other words, the first
91  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
92  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
93  * all code points in that range are not in the inversion list.  The third
94  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
95  * list, and so forth.  Thus every element whose index is divisible by two
96  * gives the beginning of a range that is in the list, and every element whose
97  * index is not divisible by two gives the beginning of a range not in the
98  * list.  If the final element's index is divisible by two, the inversion list
99  * extends to the platform's infinity; otherwise the highest code point in the
100  * inversion list is the contents of that element minus 1.
101  *
102  * A range that contains just a single code point N will look like
103  *  invlist[i]   == N
104  *  invlist[i+1] == N+1
105  *
106  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
107  * impossible to represent, so element [i+1] is omitted.  The single element
108  * inversion list
109  *  invlist[0] == UV_MAX
110  * contains just UV_MAX, but is interpreted as matching to infinity.
111  *
112  * Taking the complement (inverting) an inversion list is quite simple, if the
113  * first element is 0, remove it; otherwise add a 0 element at the beginning.
114  * This implementation reserves an element at the beginning of each inversion
115  * list to always contain 0; there is an additional flag in the header which
116  * indicates if the list begins at the 0, or is offset to begin at the next
117  * element.  This means that the inversion list can be inverted without any
118  * copying; just flip the flag.
119  *
120  * More about inversion lists can be found in "Unicode Demystified"
121  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
122  *
123  * The inversion list data structure is currently implemented as an SV pointing
124  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
125  * array of UV whose memory management is automatically handled by the existing
126  * facilities for SV's.
127  *
128  * Some of the methods should always be private to the implementation, and some
129  * should eventually be made public */
130 
131 /* The header definitions are in F<invlist_inline.h> */
132 
133 #ifndef PERL_IN_XSUB_RE
134 
135 PERL_STATIC_INLINE UV*
S__invlist_array_init(SV * const invlist,const bool will_have_0)136 S__invlist_array_init(SV* const invlist, const bool will_have_0)
137 {
138     /* Returns a pointer to the first element in the inversion list's array.
139      * This is called upon initialization of an inversion list.  Where the
140      * array begins depends on whether the list has the code point U+0000 in it
141      * or not.  The other parameter tells it whether the code that follows this
142      * call is about to put a 0 in the inversion list or not.  The first
143      * element is either the element reserved for 0, if TRUE, or the element
144      * after it, if FALSE */
145 
146     bool* offset = get_invlist_offset_addr(invlist);
147     UV* zero_addr = (UV *) SvPVX(invlist);
148 
149     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
150 
151     /* Must be empty */
152     assert(! _invlist_len(invlist));
153 
154     *zero_addr = 0;
155 
156     /* 1^1 = 0; 1^0 = 1 */
157     *offset = 1 ^ will_have_0;
158     return zero_addr + *offset;
159 }
160 
161 STATIC void
S_invlist_replace_list_destroys_src(pTHX_ SV * dest,SV * src)162 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
163 {
164     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
165      * steals the list from 'src', so 'src' is made to have a NULL list.  This
166      * is similar to what SvSetMagicSV() would do, if it were implemented on
167      * inversion lists, though this routine avoids a copy */
168 
169     const UV src_len          = _invlist_len(src);
170     const bool src_offset     = *get_invlist_offset_addr(src);
171     const STRLEN src_byte_len = SvLEN(src);
172     char * array              = SvPVX(src);
173 
174 #ifndef NO_TAINT_SUPPORT
175     const int oldtainted = TAINT_get;
176 #endif
177 
178     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
179 
180     assert(is_invlist(src));
181     assert(is_invlist(dest));
182     assert(! invlist_is_iterating(src));
183     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
184 
185     /* Make sure it ends in the right place with a NUL, as our inversion list
186      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
187      * asserts it */
188     array[src_byte_len - 1] = '\0';
189 
190     TAINT_NOT;      /* Otherwise it breaks */
191     sv_usepvn_flags(dest,
192                     (char *) array,
193                     src_byte_len - 1,
194 
195                     /* This flag is documented to cause a copy to be avoided */
196                     SV_HAS_TRAILING_NUL);
197     TAINT_set(oldtainted);
198     SvPV_set(src, 0);
199     SvLEN_set(src, 0);
200     SvCUR_set(src, 0);
201 
202     /* Finish up copying over the other fields in an inversion list */
203     *get_invlist_offset_addr(dest) = src_offset;
204     invlist_set_len(dest, src_len, src_offset);
205     *get_invlist_previous_index_addr(dest) = 0;
206     invlist_iterfinish(dest);
207 }
208 
209 PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(SV * invlist)210 S_get_invlist_previous_index_addr(SV* invlist)
211 {
212     /* Return the address of the IV that is reserved to hold the cached index
213      * */
214     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
215 
216     assert(is_invlist(invlist));
217 
218     return &(((XINVLIST*) SvANY(invlist))->prev_index);
219 }
220 
221 PERL_STATIC_INLINE IV
S_invlist_previous_index(SV * const invlist)222 S_invlist_previous_index(SV* const invlist)
223 {
224     /* Returns cached index of previous search */
225 
226     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
227 
228     return *get_invlist_previous_index_addr(invlist);
229 }
230 
231 PERL_STATIC_INLINE void
S_invlist_set_previous_index(SV * const invlist,const IV index)232 S_invlist_set_previous_index(SV* const invlist, const IV index)
233 {
234     /* Caches <index> for later retrieval */
235 
236     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
237 
238     assert(index == 0 || index < (int) _invlist_len(invlist));
239 
240     *get_invlist_previous_index_addr(invlist) = index;
241 }
242 
243 PERL_STATIC_INLINE void
S_invlist_trim(SV * invlist)244 S_invlist_trim(SV* invlist)
245 {
246     /* Free the not currently-being-used space in an inversion list */
247 
248     /* But don't free up the space needed for the 0 UV that is always at the
249      * beginning of the list, nor the trailing NUL */
250     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
251 
252     PERL_ARGS_ASSERT_INVLIST_TRIM;
253 
254     assert(is_invlist(invlist));
255 
256     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
257 }
258 
259 PERL_STATIC_INLINE void
S_invlist_clear(pTHX_ SV * invlist)260 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
261 {
262     PERL_ARGS_ASSERT_INVLIST_CLEAR;
263 
264     assert(is_invlist(invlist));
265 
266     invlist_set_len(invlist, 0, 0);
267     invlist_trim(invlist);
268 }
269 
270 PERL_STATIC_INLINE UV
S_invlist_max(const SV * const invlist)271 S_invlist_max(const SV* const invlist)
272 {
273     /* Returns the maximum number of elements storable in the inversion list's
274      * array, without having to realloc() */
275 
276     PERL_ARGS_ASSERT_INVLIST_MAX;
277 
278     assert(is_invlist(invlist));
279 
280     /* Assumes worst case, in which the 0 element is not counted in the
281      * inversion list, so subtracts 1 for that */
282     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
283            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
284            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
285 }
286 
287 STATIC void
S_initialize_invlist_guts(pTHX_ SV * invlist,const Size_t initial_size)288 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
289 {
290     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
291 
292     /* First 1 is in case the zero element isn't in the list; second 1 is for
293      * trailing NUL */
294     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
295     invlist_set_len(invlist, 0, 0);
296 
297     /* Force iterinit() to be used to get iteration to work */
298     invlist_iterfinish(invlist);
299 
300     *get_invlist_previous_index_addr(invlist) = 0;
301     SvPOK_on(invlist);  /* This allows B to extract the PV */
302 }
303 
304 SV*
Perl__new_invlist(pTHX_ IV initial_size)305 Perl__new_invlist(pTHX_ IV initial_size)
306 {
307 
308     /* Return a pointer to a newly constructed inversion list, with enough
309      * space to store 'initial_size' elements.  If that number is negative, a
310      * system default is used instead */
311 
312     SV* new_list;
313 
314     if (initial_size < 0) {
315         initial_size = 10;
316     }
317 
318     new_list = newSV_type(SVt_INVLIST);
319     initialize_invlist_guts(new_list, initial_size);
320 
321     return new_list;
322 }
323 
324 SV*
Perl__new_invlist_C_array(pTHX_ const UV * const list)325 Perl__new_invlist_C_array(pTHX_ const UV* const list)
326 {
327     /* Return a pointer to a newly constructed inversion list, initialized to
328      * point to <list>, which has to be in the exact correct inversion list
329      * form, including internal fields.  Thus this is a dangerous routine that
330      * should not be used in the wrong hands.  The passed in 'list' contains
331      * several header fields at the beginning that are not part of the
332      * inversion list body proper */
333 
334     const STRLEN length = (STRLEN) list[0];
335     const UV version_id =          list[1];
336     const bool offset   =    cBOOL(list[2]);
337 #define HEADER_LENGTH 3
338     /* If any of the above changes in any way, you must change HEADER_LENGTH
339      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
340      *      perl -E 'say int(rand 2**31-1)'
341      */
342 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
343                                         data structure type, so that one being
344                                         passed in can be validated to be an
345                                         inversion list of the correct vintage.
346                                        */
347 
348     SV* invlist = newSV_type(SVt_INVLIST);
349 
350     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
351 
352     if (version_id != INVLIST_VERSION_ID) {
353         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
354     }
355 
356     /* The generated array passed in includes header elements that aren't part
357      * of the list proper, so start it just after them */
358     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
359 
360     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
361                                shouldn't touch it */
362 
363     *(get_invlist_offset_addr(invlist)) = offset;
364 
365     /* The 'length' passed to us is the physical number of elements in the
366      * inversion list.  But if there is an offset the logical number is one
367      * less than that */
368     invlist_set_len(invlist, length  - offset, offset);
369 
370     invlist_set_previous_index(invlist, 0);
371 
372     /* Initialize the iteration pointer. */
373     invlist_iterfinish(invlist);
374 
375     SvREADONLY_on(invlist);
376     SvPOK_on(invlist);
377 
378     return invlist;
379 }
380 
381 STATIC void
S__append_range_to_invlist(pTHX_ SV * const invlist,const UV start,const UV end)382 S__append_range_to_invlist(pTHX_ SV* const invlist,
383                                  const UV start, const UV end)
384 {
385    /* Subject to change or removal.  Append the range from 'start' to 'end' at
386     * the end of the inversion list.  The range must be above any existing
387     * ones. */
388 
389     UV* array;
390     UV max = invlist_max(invlist);
391     UV len = _invlist_len(invlist);
392     bool offset;
393 
394     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
395 
396     if (len == 0) { /* Empty lists must be initialized */
397         offset = start != 0;
398         array = _invlist_array_init(invlist, ! offset);
399     }
400     else {
401         /* Here, the existing list is non-empty. The current max entry in the
402          * list is generally the first value not in the set, except when the
403          * set extends to the end of permissible values, in which case it is
404          * the first entry in that final set, and so this call is an attempt to
405          * append out-of-order */
406 
407         UV final_element = len - 1;
408         array = invlist_array(invlist);
409         if (   array[final_element] > start
410             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
411         {
412             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
413                      array[final_element], start,
414                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
415         }
416 
417         /* Here, it is a legal append.  If the new range begins 1 above the end
418          * of the range below it, it is extending the range below it, so the
419          * new first value not in the set is one greater than the newly
420          * extended range.  */
421         offset = *get_invlist_offset_addr(invlist);
422         if (array[final_element] == start) {
423             if (end != UV_MAX) {
424                 array[final_element] = end + 1;
425             }
426             else {
427                 /* But if the end is the maximum representable on the machine,
428                  * assume that infinity was actually what was meant.  Just let
429                  * the range that this would extend to have no end */
430                 invlist_set_len(invlist, len - 1, offset);
431             }
432             return;
433         }
434     }
435 
436     /* Here the new range doesn't extend any existing set.  Add it */
437 
438     len += 2;   /* Includes an element each for the start and end of range */
439 
440     /* If wll overflow the existing space, extend, which may cause the array to
441      * be moved */
442     if (max < len) {
443         invlist_extend(invlist, len);
444 
445         /* Have to set len here to avoid assert failure in invlist_array() */
446         invlist_set_len(invlist, len, offset);
447 
448         array = invlist_array(invlist);
449     }
450     else {
451         invlist_set_len(invlist, len, offset);
452     }
453 
454     /* The next item on the list starts the range, the one after that is
455      * one past the new range.  */
456     array[len - 2] = start;
457     if (end != UV_MAX) {
458         array[len - 1] = end + 1;
459     }
460     else {
461         /* But if the end is the maximum representable on the machine, just let
462          * the range have no end */
463         invlist_set_len(invlist, len - 1, offset);
464     }
465 }
466 
467 SSize_t
Perl__invlist_search(SV * const invlist,const UV cp)468 Perl__invlist_search(SV* const invlist, const UV cp)
469 {
470     /* Searches the inversion list for the entry that contains the input code
471      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
472      * return value is the index into the list's array of the range that
473      * contains <cp>, that is, 'i' such that
474      *  array[i] <= cp < array[i+1]
475      */
476 
477     IV low = 0;
478     IV mid;
479     IV high = _invlist_len(invlist);
480     const IV highest_element = high - 1;
481     const UV* array;
482 
483     PERL_ARGS_ASSERT__INVLIST_SEARCH;
484 
485     /* If list is empty, return failure. */
486     if (UNLIKELY(high == 0)) {
487         return -1;
488     }
489 
490     /* (We can't get the array unless we know the list is non-empty) */
491     array = invlist_array(invlist);
492 
493     mid = invlist_previous_index(invlist);
494     assert(mid >=0);
495     if (UNLIKELY(mid > highest_element)) {
496         mid = highest_element;
497     }
498 
499     /* <mid> contains the cache of the result of the previous call to this
500      * function (0 the first time).  See if this call is for the same result,
501      * or if it is for mid-1.  This is under the theory that calls to this
502      * function will often be for related code points that are near each other.
503      * And benchmarks show that caching gives better results.  We also test
504      * here if the code point is within the bounds of the list.  These tests
505      * replace others that would have had to be made anyway to make sure that
506      * the array bounds were not exceeded, and these give us extra information
507      * at the same time */
508     if (cp >= array[mid]) {
509         if (cp >= array[highest_element]) {
510             return highest_element;
511         }
512 
513         /* Here, array[mid] <= cp < array[highest_element].  This means that
514          * the final element is not the answer, so can exclude it; it also
515          * means that <mid> is not the final element, so can refer to 'mid + 1'
516          * safely */
517         if (cp < array[mid + 1]) {
518             return mid;
519         }
520         high--;
521         low = mid + 1;
522     }
523     else { /* cp < aray[mid] */
524         if (cp < array[0]) { /* Fail if outside the array */
525             return -1;
526         }
527         high = mid;
528         if (cp >= array[mid - 1]) {
529             goto found_entry;
530         }
531     }
532 
533     /* Binary search.  What we are looking for is <i> such that
534      *  array[i] <= cp < array[i+1]
535      * The loop below converges on the i+1.  Note that there may not be an
536      * (i+1)th element in the array, and things work nonetheless */
537     while (low < high) {
538         mid = (low + high) / 2;
539         assert(mid <= highest_element);
540         if (array[mid] <= cp) { /* cp >= array[mid] */
541             low = mid + 1;
542 
543             /* We could do this extra test to exit the loop early.
544             if (cp < array[low]) {
545                 return mid;
546             }
547             */
548         }
549         else { /* cp < array[mid] */
550             high = mid;
551         }
552     }
553 
554   found_entry:
555     high--;
556     invlist_set_previous_index(invlist, high);
557     return high;
558 }
559 
560 void
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** output)561 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
562                                          const bool complement_b, SV** output)
563 {
564     /* Take the union of two inversion lists and point '*output' to it.  On
565      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
566      * even 'a' or 'b').  If to an inversion list, the contents of the original
567      * list will be replaced by the union.  The first list, 'a', may be
568      * NULL, in which case a copy of the second list is placed in '*output'.
569      * If 'complement_b' is TRUE, the union is taken of the complement
570      * (inversion) of 'b' instead of b itself.
571      *
572      * The basis for this comes from "Unicode Demystified" Chapter 13 by
573      * Richard Gillam, published by Addison-Wesley, and explained at some
574      * length there.  The preface says to incorporate its examples into your
575      * code at your own risk.
576      *
577      * The algorithm is like a merge sort. */
578 
579     const UV* array_a;    /* a's array */
580     const UV* array_b;
581     UV len_a;       /* length of a's array */
582     UV len_b;
583 
584     SV* u;                      /* the resulting union */
585     UV* array_u;
586     UV len_u = 0;
587 
588     UV i_a = 0;             /* current index into a's array */
589     UV i_b = 0;
590     UV i_u = 0;
591 
592     /* running count, as explained in the algorithm source book; items are
593      * stopped accumulating and are output when the count changes to/from 0.
594      * The count is incremented when we start a range that's in an input's set,
595      * and decremented when we start a range that's not in a set.  So this
596      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
597      * and hence nothing goes into the union; 1, just one of the inputs is in
598      * its set (and its current range gets added to the union); and 2 when both
599      * inputs are in their sets.  */
600     UV count = 0;
601 
602     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
603     assert(a != b);
604     assert(*output == NULL || is_invlist(*output));
605 
606     len_b = _invlist_len(b);
607     if (len_b == 0) {
608 
609         /* Here, 'b' is empty, hence it's complement is all possible code
610          * points.  So if the union includes the complement of 'b', it includes
611          * everything, and we need not even look at 'a'.  It's easiest to
612          * create a new inversion list that matches everything.  */
613         if (complement_b) {
614             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
615 
616             if (*output == NULL) { /* If the output didn't exist, just point it
617                                       at the new list */
618                 *output = everything;
619             }
620             else { /* Otherwise, replace its contents with the new list */
621                 invlist_replace_list_destroys_src(*output, everything);
622                 SvREFCNT_dec_NN(everything);
623             }
624 
625             return;
626         }
627 
628         /* Here, we don't want the complement of 'b', and since 'b' is empty,
629          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
630          * output will be empty */
631 
632         if (a == NULL || _invlist_len(a) == 0) {
633             if (*output == NULL) {
634                 *output = _new_invlist(0);
635             }
636             else {
637                 invlist_clear(*output);
638             }
639             return;
640         }
641 
642         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
643          * union.  We can just return a copy of 'a' if '*output' doesn't point
644          * to an existing list */
645         if (*output == NULL) {
646             *output = invlist_clone(a, NULL);
647             return;
648         }
649 
650         /* If the output is to overwrite 'a', we have a no-op, as it's
651          * already in 'a' */
652         if (*output == a) {
653             return;
654         }
655 
656         /* Here, '*output' is to be overwritten by 'a' */
657         u = invlist_clone(a, NULL);
658         invlist_replace_list_destroys_src(*output, u);
659         SvREFCNT_dec_NN(u);
660 
661         return;
662     }
663 
664     /* Here 'b' is not empty.  See about 'a' */
665 
666     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
667 
668         /* Here, 'a' is empty (and b is not).  That means the union will come
669          * entirely from 'b'.  If '*output' is NULL, we can directly return a
670          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
671          * the clone */
672 
673         SV ** dest = (*output == NULL) ? output : &u;
674         *dest = invlist_clone(b, NULL);
675         if (complement_b) {
676             _invlist_invert(*dest);
677         }
678 
679         if (dest == &u) {
680             invlist_replace_list_destroys_src(*output, u);
681             SvREFCNT_dec_NN(u);
682         }
683 
684         return;
685     }
686 
687     /* Here both lists exist and are non-empty */
688     array_a = invlist_array(a);
689     array_b = invlist_array(b);
690 
691     /* If are to take the union of 'a' with the complement of b, set it
692      * up so are looking at b's complement. */
693     if (complement_b) {
694 
695         /* To complement, we invert: if the first element is 0, remove it.  To
696          * do this, we just pretend the array starts one later */
697         if (array_b[0] == 0) {
698             array_b++;
699             len_b--;
700         }
701         else {
702 
703             /* But if the first element is not zero, we pretend the list starts
704              * at the 0 that is always stored immediately before the array. */
705             array_b--;
706             len_b++;
707         }
708     }
709 
710     /* Size the union for the worst case: that the sets are completely
711      * disjoint */
712     u = _new_invlist(len_a + len_b);
713 
714     /* Will contain U+0000 if either component does */
715     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
716                                       || (len_b > 0 && array_b[0] == 0));
717 
718     /* Go through each input list item by item, stopping when have exhausted
719      * one of them */
720     while (i_a < len_a && i_b < len_b) {
721         UV cp;      /* The element to potentially add to the union's array */
722         bool cp_in_set;   /* is it in the input list's set or not */
723 
724         /* We need to take one or the other of the two inputs for the union.
725          * Since we are merging two sorted lists, we take the smaller of the
726          * next items.  In case of a tie, we take first the one that is in its
727          * set.  If we first took the one not in its set, it would decrement
728          * the count, possibly to 0 which would cause it to be output as ending
729          * the range, and the next time through we would take the same number,
730          * and output it again as beginning the next range.  By doing it the
731          * opposite way, there is no possibility that the count will be
732          * momentarily decremented to 0, and thus the two adjoining ranges will
733          * be seamlessly merged.  (In a tie and both are in the set or both not
734          * in the set, it doesn't matter which we take first.) */
735         if (       array_a[i_a] < array_b[i_b]
736             || (   array_a[i_a] == array_b[i_b]
737                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
738         {
739             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
740             cp = array_a[i_a++];
741         }
742         else {
743             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
744             cp = array_b[i_b++];
745         }
746 
747         /* Here, have chosen which of the two inputs to look at.  Only output
748          * if the running count changes to/from 0, which marks the
749          * beginning/end of a range that's in the set */
750         if (cp_in_set) {
751             if (count == 0) {
752                 array_u[i_u++] = cp;
753             }
754             count++;
755         }
756         else {
757             count--;
758             if (count == 0) {
759                 array_u[i_u++] = cp;
760             }
761         }
762     }
763 
764 
765     /* The loop above increments the index into exactly one of the input lists
766      * each iteration, and ends when either index gets to its list end.  That
767      * means the other index is lower than its end, and so something is
768      * remaining in that one.  We decrement 'count', as explained below, if
769      * that list is in its set.  (i_a and i_b each currently index the element
770      * beyond the one we care about.) */
771     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
772         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
773     {
774         count--;
775     }
776 
777     /* Above we decremented 'count' if the list that had unexamined elements in
778      * it was in its set.  This has made it so that 'count' being non-zero
779      * means there isn't anything left to output; and 'count' equal to 0 means
780      * that what is left to output is precisely that which is left in the
781      * non-exhausted input list.
782      *
783      * To see why, note first that the exhausted input obviously has nothing
784      * left to add to the union.  If it was in its set at its end, that means
785      * the set extends from here to the platform's infinity, and hence so does
786      * the union and the non-exhausted set is irrelevant.  The exhausted set
787      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
788      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
789      * 'count' remains at 1.  This is consistent with the decremented 'count'
790      * != 0 meaning there's nothing left to add to the union.
791      *
792      * But if the exhausted input wasn't in its set, it contributed 0 to
793      * 'count', and the rest of the union will be whatever the other input is.
794      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
795      * otherwise it gets decremented to 0.  This is consistent with 'count'
796      * == 0 meaning the remainder of the union is whatever is left in the
797      * non-exhausted list. */
798     if (count != 0) {
799         len_u = i_u;
800     }
801     else {
802         IV copy_count = len_a - i_a;
803         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
804             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
805         }
806         else { /* The non-exhausted input is b */
807             copy_count = len_b - i_b;
808             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
809         }
810         len_u = i_u + copy_count;
811     }
812 
813     /* Set the result to the final length, which can change the pointer to
814      * array_u, so re-find it.  (Note that it is unlikely that this will
815      * change, as we are shrinking the space, not enlarging it) */
816     if (len_u != _invlist_len(u)) {
817         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
818         invlist_trim(u);
819         array_u = invlist_array(u);
820     }
821 
822     if (*output == NULL) {  /* Simply return the new inversion list */
823         *output = u;
824     }
825     else {
826         /* Otherwise, overwrite the inversion list that was in '*output'.  We
827          * could instead free '*output', and then set it to 'u', but experience
828          * has shown [perl #127392] that if the input is a mortal, we can get a
829          * huge build-up of these during regex compilation before they get
830          * freed. */
831         invlist_replace_list_destroys_src(*output, u);
832         SvREFCNT_dec_NN(u);
833     }
834 
835     return;
836 }
837 
838 void
Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** i)839 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
840                                                const bool complement_b, SV** i)
841 {
842     /* Take the intersection of two inversion lists and point '*i' to it.  On
843      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
844      * even 'a' or 'b').  If to an inversion list, the contents of the original
845      * list will be replaced by the intersection.  The first list, 'a', may be
846      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
847      * TRUE, the result will be the intersection of 'a' and the complement (or
848      * inversion) of 'b' instead of 'b' directly.
849      *
850      * The basis for this comes from "Unicode Demystified" Chapter 13 by
851      * Richard Gillam, published by Addison-Wesley, and explained at some
852      * length there.  The preface says to incorporate its examples into your
853      * code at your own risk.  In fact, it had bugs
854      *
855      * The algorithm is like a merge sort, and is essentially the same as the
856      * union above
857      */
858 
859     const UV* array_a;          /* a's array */
860     const UV* array_b;
861     UV len_a;   /* length of a's array */
862     UV len_b;
863 
864     SV* r;                   /* the resulting intersection */
865     UV* array_r;
866     UV len_r = 0;
867 
868     UV i_a = 0;             /* current index into a's array */
869     UV i_b = 0;
870     UV i_r = 0;
871 
872     /* running count of how many of the two inputs are postitioned at ranges
873      * that are in their sets.  As explained in the algorithm source book,
874      * items are stopped accumulating and are output when the count changes
875      * to/from 2.  The count is incremented when we start a range that's in an
876      * input's set, and decremented when we start a range that's not in a set.
877      * Only when it is 2 are we in the intersection. */
878     UV count = 0;
879 
880     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
881     assert(a != b);
882     assert(*i == NULL || is_invlist(*i));
883 
884     /* Special case if either one is empty */
885     len_a = (a == NULL) ? 0 : _invlist_len(a);
886     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
887         if (len_a != 0 && complement_b) {
888 
889             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
890              * must be empty.  Here, also we are using 'b's complement, which
891              * hence must be every possible code point.  Thus the intersection
892              * is simply 'a'. */
893 
894             if (*i == a) {  /* No-op */
895                 return;
896             }
897 
898             if (*i == NULL) {
899                 *i = invlist_clone(a, NULL);
900                 return;
901             }
902 
903             r = invlist_clone(a, NULL);
904             invlist_replace_list_destroys_src(*i, r);
905             SvREFCNT_dec_NN(r);
906             return;
907         }
908 
909         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
910          * intersection must be empty */
911         if (*i == NULL) {
912             *i = _new_invlist(0);
913             return;
914         }
915 
916         invlist_clear(*i);
917         return;
918     }
919 
920     /* Here both lists exist and are non-empty */
921     array_a = invlist_array(a);
922     array_b = invlist_array(b);
923 
924     /* If are to take the intersection of 'a' with the complement of b, set it
925      * up so are looking at b's complement. */
926     if (complement_b) {
927 
928         /* To complement, we invert: if the first element is 0, remove it.  To
929          * do this, we just pretend the array starts one later */
930         if (array_b[0] == 0) {
931             array_b++;
932             len_b--;
933         }
934         else {
935 
936             /* But if the first element is not zero, we pretend the list starts
937              * at the 0 that is always stored immediately before the array. */
938             array_b--;
939             len_b++;
940         }
941     }
942 
943     /* Size the intersection for the worst case: that the intersection ends up
944      * fragmenting everything to be completely disjoint */
945     r= _new_invlist(len_a + len_b);
946 
947     /* Will contain U+0000 iff both components do */
948     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
949                                      && len_b > 0 && array_b[0] == 0);
950 
951     /* Go through each list item by item, stopping when have exhausted one of
952      * them */
953     while (i_a < len_a && i_b < len_b) {
954         UV cp;      /* The element to potentially add to the intersection's
955                        array */
956         bool cp_in_set; /* Is it in the input list's set or not */
957 
958         /* We need to take one or the other of the two inputs for the
959          * intersection.  Since we are merging two sorted lists, we take the
960          * smaller of the next items.  In case of a tie, we take first the one
961          * that is not in its set (a difference from the union algorithm).  If
962          * we first took the one in its set, it would increment the count,
963          * possibly to 2 which would cause it to be output as starting a range
964          * in the intersection, and the next time through we would take that
965          * same number, and output it again as ending the set.  By doing the
966          * opposite of this, there is no possibility that the count will be
967          * momentarily incremented to 2.  (In a tie and both are in the set or
968          * both not in the set, it doesn't matter which we take first.) */
969         if (       array_a[i_a] < array_b[i_b]
970             || (   array_a[i_a] == array_b[i_b]
971                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
972         {
973             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
974             cp = array_a[i_a++];
975         }
976         else {
977             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
978             cp= array_b[i_b++];
979         }
980 
981         /* Here, have chosen which of the two inputs to look at.  Only output
982          * if the running count changes to/from 2, which marks the
983          * beginning/end of a range that's in the intersection */
984         if (cp_in_set) {
985             count++;
986             if (count == 2) {
987                 array_r[i_r++] = cp;
988             }
989         }
990         else {
991             if (count == 2) {
992                 array_r[i_r++] = cp;
993             }
994             count--;
995         }
996 
997     }
998 
999     /* The loop above increments the index into exactly one of the input lists
1000      * each iteration, and ends when either index gets to its list end.  That
1001      * means the other index is lower than its end, and so something is
1002      * remaining in that one.  We increment 'count', as explained below, if the
1003      * exhausted list was in its set.  (i_a and i_b each currently index the
1004      * element beyond the one we care about.) */
1005     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
1006         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
1007     {
1008         count++;
1009     }
1010 
1011     /* Above we incremented 'count' if the exhausted list was in its set.  This
1012      * has made it so that 'count' being below 2 means there is nothing left to
1013      * output; otheriwse what's left to add to the intersection is precisely
1014      * that which is left in the non-exhausted input list.
1015      *
1016      * To see why, note first that the exhausted input obviously has nothing
1017      * left to affect the intersection.  If it was in its set at its end, that
1018      * means the set extends from here to the platform's infinity, and hence
1019      * anything in the non-exhausted's list will be in the intersection, and
1020      * anything not in it won't be.  Hence, the rest of the intersection is
1021      * precisely what's in the non-exhausted list  The exhausted set also
1022      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
1023      * it means 'count' is now at least 2.  This is consistent with the
1024      * incremented 'count' being >= 2 means to add the non-exhausted list to
1025      * the intersection.
1026      *
1027      * But if the exhausted input wasn't in its set, it contributed 0 to
1028      * 'count', and the intersection can't include anything further; the
1029      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
1030      * incremented.  This is consistent with 'count' being < 2 meaning nothing
1031      * further to add to the intersection. */
1032     if (count < 2) { /* Nothing left to put in the intersection. */
1033         len_r = i_r;
1034     }
1035     else { /* copy the non-exhausted list, unchanged. */
1036         IV copy_count = len_a - i_a;
1037         if (copy_count > 0) {   /* a is the one with stuff left */
1038             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
1039         }
1040         else {  /* b is the one with stuff left */
1041             copy_count = len_b - i_b;
1042             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
1043         }
1044         len_r = i_r + copy_count;
1045     }
1046 
1047     /* Set the result to the final length, which can change the pointer to
1048      * array_r, so re-find it.  (Note that it is unlikely that this will
1049      * change, as we are shrinking the space, not enlarging it) */
1050     if (len_r != _invlist_len(r)) {
1051         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
1052         invlist_trim(r);
1053         array_r = invlist_array(r);
1054     }
1055 
1056     if (*i == NULL) { /* Simply return the calculated intersection */
1057         *i = r;
1058     }
1059     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
1060               instead free '*i', and then set it to 'r', but experience has
1061               shown [perl #127392] that if the input is a mortal, we can get a
1062               huge build-up of these during regex compilation before they get
1063               freed. */
1064         if (len_r) {
1065             invlist_replace_list_destroys_src(*i, r);
1066         }
1067         else {
1068             invlist_clear(*i);
1069         }
1070         SvREFCNT_dec_NN(r);
1071     }
1072 
1073     return;
1074 }
1075 
1076 SV*
Perl__add_range_to_invlist(pTHX_ SV * invlist,UV start,UV end)1077 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
1078 {
1079     /* Add the range from 'start' to 'end' inclusive to the inversion list's
1080      * set.  A pointer to the inversion list is returned.  This may actually be
1081      * a new list, in which case the passed in one has been destroyed.  The
1082      * passed-in inversion list can be NULL, in which case a new one is created
1083      * with just the one range in it.  The new list is not necessarily
1084      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
1085      * result of this function.  The gain would not be large, and in many
1086      * cases, this is called multiple times on a single inversion list, so
1087      * anything freed may almost immediately be needed again.
1088      *
1089      * This used to mostly call the 'union' routine, but that is much more
1090      * heavyweight than really needed for a single range addition */
1091 
1092     UV* array;              /* The array implementing the inversion list */
1093     UV len;                 /* How many elements in 'array' */
1094     SSize_t i_s;            /* index into the invlist array where 'start'
1095                                should go */
1096     SSize_t i_e = 0;        /* And the index where 'end' should go */
1097     UV cur_highest;         /* The highest code point in the inversion list
1098                                upon entry to this function */
1099 
1100     /* This range becomes the whole inversion list if none already existed */
1101     if (invlist == NULL) {
1102         invlist = _new_invlist(2);
1103         _append_range_to_invlist(invlist, start, end);
1104         return invlist;
1105     }
1106 
1107     /* Likewise, if the inversion list is currently empty */
1108     len = _invlist_len(invlist);
1109     if (len == 0) {
1110         _append_range_to_invlist(invlist, start, end);
1111         return invlist;
1112     }
1113 
1114     /* Starting here, we have to know the internals of the list */
1115     array = invlist_array(invlist);
1116 
1117     /* If the new range ends higher than the current highest ... */
1118     cur_highest = invlist_highest(invlist);
1119     if (end > cur_highest) {
1120 
1121         /* If the whole range is higher, we can just append it */
1122         if (start > cur_highest) {
1123             _append_range_to_invlist(invlist, start, end);
1124             return invlist;
1125         }
1126 
1127         /* Otherwise, add the portion that is higher ... */
1128         _append_range_to_invlist(invlist, cur_highest + 1, end);
1129 
1130         /* ... and continue on below to handle the rest.  As a result of the
1131          * above append, we know that the index of the end of the range is the
1132          * final even numbered one of the array.  Recall that the final element
1133          * always starts a range that extends to infinity.  If that range is in
1134          * the set (meaning the set goes from here to infinity), it will be an
1135          * even index, but if it isn't in the set, it's odd, and the final
1136          * range in the set is one less, which is even. */
1137         if (end == UV_MAX) {
1138             i_e = len;
1139         }
1140         else {
1141             i_e = len - 2;
1142         }
1143     }
1144 
1145     /* We have dealt with appending, now see about prepending.  If the new
1146      * range starts lower than the current lowest ... */
1147     if (start < array[0]) {
1148 
1149         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
1150          * Let the union code handle it, rather than having to know the
1151          * trickiness in two code places.  */
1152         if (UNLIKELY(start == 0)) {
1153             SV* range_invlist;
1154 
1155             range_invlist = _new_invlist(2);
1156             _append_range_to_invlist(range_invlist, start, end);
1157 
1158             _invlist_union(invlist, range_invlist, &invlist);
1159 
1160             SvREFCNT_dec_NN(range_invlist);
1161 
1162             return invlist;
1163         }
1164 
1165         /* If the whole new range comes before the first entry, and doesn't
1166          * extend it, we have to insert it as an additional range */
1167         if (end < array[0] - 1) {
1168             i_s = i_e = -1;
1169             goto splice_in_new_range;
1170         }
1171 
1172         /* Here the new range adjoins the existing first range, extending it
1173          * downwards. */
1174         array[0] = start;
1175 
1176         /* And continue on below to handle the rest.  We know that the index of
1177          * the beginning of the range is the first one of the array */
1178         i_s = 0;
1179     }
1180     else { /* Not prepending any part of the new range to the existing list.
1181             * Find where in the list it should go.  This finds i_s, such that:
1182             *     invlist[i_s] <= start < array[i_s+1]
1183             */
1184         i_s = _invlist_search(invlist, start);
1185     }
1186 
1187     /* At this point, any extending before the beginning of the inversion list
1188      * and/or after the end has been done.  This has made it so that, in the
1189      * code below, each endpoint of the new range is either in a range that is
1190      * in the set, or is in a gap between two ranges that are.  This means we
1191      * don't have to worry about exceeding the array bounds.
1192      *
1193      * Find where in the list the new range ends (but we can skip this if we
1194      * have already determined what it is, or if it will be the same as i_s,
1195      * which we already have computed) */
1196     if (i_e == 0) {
1197         i_e = (start == end)
1198               ? i_s
1199               : _invlist_search(invlist, end);
1200     }
1201 
1202     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
1203      * is a range that goes to infinity there is no element at invlist[i_e+1],
1204      * so only the first relation holds. */
1205 
1206     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
1207 
1208         /* Here, the ranges on either side of the beginning of the new range
1209          * are in the set, and this range starts in the gap between them.
1210          *
1211          * The new range extends the range above it downwards if the new range
1212          * ends at or above that range's start */
1213         const bool extends_the_range_above = (   end == UV_MAX
1214                                               || end + 1 >= array[i_s+1]);
1215 
1216         /* The new range extends the range below it upwards if it begins just
1217          * after where that range ends */
1218         if (start == array[i_s]) {
1219 
1220             /* If the new range fills the entire gap between the other ranges,
1221              * they will get merged together.  Other ranges may also get
1222              * merged, depending on how many of them the new range spans.  In
1223              * the general case, we do the merge later, just once, after we
1224              * figure out how many to merge.  But in the case where the new
1225              * range exactly spans just this one gap (possibly extending into
1226              * the one above), we do the merge here, and an early exit.  This
1227              * is done here to avoid having to special case later. */
1228             if (i_e - i_s <= 1) {
1229 
1230                 /* If i_e - i_s == 1, it means that the new range terminates
1231                  * within the range above, and hence 'extends_the_range_above'
1232                  * must be true.  (If the range above it extends to infinity,
1233                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
1234                  * will be 0, so no harm done.) */
1235                 if (extends_the_range_above) {
1236                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
1237                     invlist_set_len(invlist,
1238                                     len - 2,
1239                                     *(get_invlist_offset_addr(invlist)));
1240                     return invlist;
1241                 }
1242 
1243                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
1244                  * to the same range, and below we are about to decrement i_s
1245                  * */
1246                 i_e--;
1247             }
1248 
1249             /* Here, the new range is adjacent to the one below.  (It may also
1250              * span beyond the range above, but that will get resolved later.)
1251              * Extend the range below to include this one. */
1252             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
1253             i_s--;
1254             start = array[i_s];
1255         }
1256         else if (extends_the_range_above) {
1257 
1258             /* Here the new range only extends the range above it, but not the
1259              * one below.  It merges with the one above.  Again, we keep i_e
1260              * and i_s in sync if they point to the same range */
1261             if (i_e == i_s) {
1262                 i_e++;
1263             }
1264             i_s++;
1265             array[i_s] = start;
1266         }
1267     }
1268 
1269     /* Here, we've dealt with the new range start extending any adjoining
1270      * existing ranges.
1271      *
1272      * If the new range extends to infinity, it is now the final one,
1273      * regardless of what was there before */
1274     if (UNLIKELY(end == UV_MAX)) {
1275         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
1276         return invlist;
1277     }
1278 
1279     /* If i_e started as == i_s, it has also been dealt with,
1280      * and been updated to the new i_s, which will fail the following if */
1281     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
1282 
1283         /* Here, the ranges on either side of the end of the new range are in
1284          * the set, and this range ends in the gap between them.
1285          *
1286          * If this range is adjacent to (hence extends) the range above it, it
1287          * becomes part of that range; likewise if it extends the range below,
1288          * it becomes part of that range */
1289         if (end + 1 == array[i_e+1]) {
1290             i_e++;
1291             array[i_e] = start;
1292         }
1293         else if (start <= array[i_e]) {
1294             array[i_e] = end + 1;
1295             i_e--;
1296         }
1297     }
1298 
1299     if (i_s == i_e) {
1300 
1301         /* If the range fits entirely in an existing range (as possibly already
1302          * extended above), it doesn't add anything new */
1303         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
1304             return invlist;
1305         }
1306 
1307         /* Here, no part of the range is in the list.  Must add it.  It will
1308          * occupy 2 more slots */
1309       splice_in_new_range:
1310 
1311         invlist_extend(invlist, len + 2);
1312         array = invlist_array(invlist);
1313         /* Move the rest of the array down two slots. Don't include any
1314          * trailing NUL */
1315         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
1316 
1317         /* Do the actual splice */
1318         array[i_e+1] = start;
1319         array[i_e+2] = end + 1;
1320         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
1321         return invlist;
1322     }
1323 
1324     /* Here the new range crossed the boundaries of a pre-existing range.  The
1325      * code above has adjusted things so that both ends are in ranges that are
1326      * in the set.  This means everything in between must also be in the set.
1327      * Just squash things together */
1328     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
1329     invlist_set_len(invlist,
1330                     len - i_e + i_s,
1331                     *(get_invlist_offset_addr(invlist)));
1332 
1333     return invlist;
1334 }
1335 
1336 SV*
Perl__setup_canned_invlist(pTHX_ const STRLEN size,const UV element0,UV ** other_elements_ptr)1337 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
1338                                  UV** other_elements_ptr)
1339 {
1340     /* Create and return an inversion list whose contents are to be populated
1341      * by the caller.  The caller gives the number of elements (in 'size') and
1342      * the very first element ('element0').  This function will set
1343      * '*other_elements_ptr' to an array of UVs, where the remaining elements
1344      * are to be placed.
1345      *
1346      * Obviously there is some trust involved that the caller will properly
1347      * fill in the other elements of the array.
1348      *
1349      * (The first element needs to be passed in, as the underlying code does
1350      * things differently depending on whether it is zero or non-zero) */
1351 
1352     SV* invlist = _new_invlist(size);
1353     bool offset;
1354 
1355     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
1356 
1357     invlist = add_cp_to_invlist(invlist, element0);
1358     offset = *get_invlist_offset_addr(invlist);
1359 
1360     invlist_set_len(invlist, size, offset);
1361     *other_elements_ptr = invlist_array(invlist) + 1;
1362     return invlist;
1363 }
1364 
1365 #endif
1366 
1367 #ifndef PERL_IN_XSUB_RE
1368 void
Perl__invlist_invert(pTHX_ SV * const invlist)1369 Perl__invlist_invert(pTHX_ SV* const invlist)
1370 {
1371     /* Complement the input inversion list.  This adds a 0 if the list didn't
1372      * have a zero; removes it otherwise.  As described above, the data
1373      * structure is set up so that this is very efficient */
1374 
1375     PERL_ARGS_ASSERT__INVLIST_INVERT;
1376 
1377     assert(! invlist_is_iterating(invlist));
1378 
1379     /* The inverse of matching nothing is matching everything */
1380     if (_invlist_len(invlist) == 0) {
1381         _append_range_to_invlist(invlist, 0, UV_MAX);
1382         return;
1383     }
1384 
1385     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
1386 }
1387 
1388 SV*
Perl_invlist_clone(pTHX_ SV * const invlist,SV * new_invlist)1389 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
1390 {
1391     /* Return a new inversion list that is a copy of the input one, which is
1392      * unchanged.  The new list will not be mortal even if the old one was. */
1393 
1394     const STRLEN nominal_length = _invlist_len(invlist);
1395     const STRLEN physical_length = SvCUR(invlist);
1396     const bool offset = *(get_invlist_offset_addr(invlist));
1397 
1398     PERL_ARGS_ASSERT_INVLIST_CLONE;
1399 
1400     if (new_invlist == NULL) {
1401         new_invlist = _new_invlist(nominal_length);
1402     }
1403     else {
1404         sv_upgrade(new_invlist, SVt_INVLIST);
1405         initialize_invlist_guts(new_invlist, nominal_length);
1406     }
1407 
1408     *(get_invlist_offset_addr(new_invlist)) = offset;
1409     invlist_set_len(new_invlist, nominal_length, offset);
1410     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
1411 
1412     return new_invlist;
1413 }
1414 
1415 #endif
1416 
1417 
1418 #ifndef PERL_IN_XSUB_RE
1419 void
Perl__invlist_dump(pTHX_ PerlIO * file,I32 level,const char * const indent,SV * const invlist)1420 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
1421                          const char * const indent, SV* const invlist)
1422 {
1423     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
1424      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
1425      * the string 'indent'.  The output looks like this:
1426          [0] 0x000A .. 0x000D
1427          [2] 0x0085
1428          [4] 0x2028 .. 0x2029
1429          [6] 0x3104 .. INFTY
1430      * This means that the first range of code points matched by the list are
1431      * 0xA through 0xD; the second range contains only the single code point
1432      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
1433      * are used to define each range (except if the final range extends to
1434      * infinity, only a single element is needed).  The array index of the
1435      * first element for the corresponding range is given in brackets. */
1436 
1437     UV start, end;
1438     STRLEN count = 0;
1439 
1440     PERL_ARGS_ASSERT__INVLIST_DUMP;
1441 
1442     if (invlist_is_iterating(invlist)) {
1443         Perl_dump_indent(aTHX_ level, file,
1444              "%sCan't dump inversion list because is in middle of iterating\n",
1445              indent);
1446         return;
1447     }
1448 
1449     invlist_iterinit(invlist);
1450     while (invlist_iternext(invlist, &start, &end)) {
1451         if (end == UV_MAX) {
1452             Perl_dump_indent(aTHX_ level, file,
1453                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
1454                                    indent, (UV)count, start);
1455         }
1456         else if (end != start) {
1457             Perl_dump_indent(aTHX_ level, file,
1458                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
1459                                 indent, (UV)count, start,         end);
1460         }
1461         else {
1462             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
1463                                             indent, (UV)count, start);
1464         }
1465         count += 2;
1466     }
1467 }
1468 
1469 #endif
1470 
1471 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
1472 bool
Perl__invlistEQ(pTHX_ SV * const a,SV * const b,const bool complement_b)1473 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
1474 {
1475     /* Return a boolean as to if the two passed in inversion lists are
1476      * identical.  The final argument, if TRUE, says to take the complement of
1477      * the second inversion list before doing the comparison */
1478 
1479     const UV len_a = _invlist_len(a);
1480     UV len_b = _invlist_len(b);
1481 
1482     const UV* array_a = NULL;
1483     const UV* array_b = NULL;
1484 
1485     PERL_ARGS_ASSERT__INVLISTEQ;
1486 
1487     /* This code avoids accessing the arrays unless it knows the length is
1488      * non-zero */
1489 
1490     if (len_a == 0) {
1491         if (len_b == 0) {
1492             return ! complement_b;
1493         }
1494     }
1495     else {
1496         array_a = invlist_array(a);
1497     }
1498 
1499     if (len_b != 0) {
1500         array_b = invlist_array(b);
1501     }
1502 
1503     /* If are to compare 'a' with the complement of b, set it
1504      * up so are looking at b's complement. */
1505     if (complement_b) {
1506 
1507         /* The complement of nothing is everything, so <a> would have to have
1508          * just one element, starting at zero (ending at infinity) */
1509         if (len_b == 0) {
1510             return (len_a == 1 && array_a[0] == 0);
1511         }
1512         if (array_b[0] == 0) {
1513 
1514             /* Otherwise, to complement, we invert.  Here, the first element is
1515              * 0, just remove it.  To do this, we just pretend the array starts
1516              * one later */
1517 
1518             array_b++;
1519             len_b--;
1520         }
1521         else {
1522 
1523             /* But if the first element is not zero, we pretend the list starts
1524              * at the 0 that is always stored immediately before the array. */
1525             array_b--;
1526             len_b++;
1527         }
1528     }
1529 
1530     return    len_a == len_b
1531            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
1532 
1533 }
1534 #endif
1535 
1536 #undef HEADER_LENGTH
1537 #undef TO_INTERNAL_SIZE
1538 #undef FROM_INTERNAL_SIZE
1539 #undef INVLIST_VERSION_ID
1540 
1541 /* End of inversion list object */
1542