xref: /openbsd/gnu/usr.bin/perl/regcomp_invlist.c (revision f2a19305)
1*f2a19305Safresh1 #ifdef PERL_EXT_RE_BUILD
2*f2a19305Safresh1 #include "re_top.h"
3*f2a19305Safresh1 #endif
4*f2a19305Safresh1 
5*f2a19305Safresh1 #include "EXTERN.h"
6*f2a19305Safresh1 #define PERL_IN_REGEX_ENGINE
7*f2a19305Safresh1 #define PERL_IN_REGCOMP_ANY
8*f2a19305Safresh1 #define PERL_IN_REGCOMP_INVLIST_C
9*f2a19305Safresh1 #include "perl.h"
10*f2a19305Safresh1 
11*f2a19305Safresh1 #ifdef PERL_IN_XSUB_RE
12*f2a19305Safresh1 #  include "re_comp.h"
13*f2a19305Safresh1 #else
14*f2a19305Safresh1 #  include "regcomp.h"
15*f2a19305Safresh1 #endif
16*f2a19305Safresh1 
17*f2a19305Safresh1 #include "invlist_inline.h"
18*f2a19305Safresh1 #include "unicode_constants.h"
19*f2a19305Safresh1 #include "regcomp_internal.h"
20*f2a19305Safresh1 
21*f2a19305Safresh1 
22*f2a19305Safresh1 void
Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist,const UV offset,const U8 * bitmap,const Size_t len)23*f2a19305Safresh1 Perl_populate_bitmap_from_invlist(pTHX_ SV * invlist, const UV offset, const U8 * bitmap, const Size_t len)
24*f2a19305Safresh1 {
25*f2a19305Safresh1     PERL_ARGS_ASSERT_POPULATE_BITMAP_FROM_INVLIST;
26*f2a19305Safresh1 
27*f2a19305Safresh1     /* As the name says.  The zeroth bit corresponds to the code point given by
28*f2a19305Safresh1      * 'offset' */
29*f2a19305Safresh1 
30*f2a19305Safresh1     UV start, end;
31*f2a19305Safresh1 
32*f2a19305Safresh1     Zero(bitmap, len, U8);
33*f2a19305Safresh1 
34*f2a19305Safresh1     invlist_iterinit(invlist);
35*f2a19305Safresh1     while (invlist_iternext(invlist, &start, &end)) {
36*f2a19305Safresh1         assert(start >= offset);
37*f2a19305Safresh1 
38*f2a19305Safresh1         for (UV i = start; i <= end; i++) {
39*f2a19305Safresh1             UV adjusted = i - offset;
40*f2a19305Safresh1 
41*f2a19305Safresh1             BITMAP_BYTE(bitmap, adjusted) |= BITMAP_BIT(adjusted);
42*f2a19305Safresh1         }
43*f2a19305Safresh1     }
44*f2a19305Safresh1     invlist_iterfinish(invlist);
45*f2a19305Safresh1 }
46*f2a19305Safresh1 
47*f2a19305Safresh1 void
Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap,const Size_t bitmap_len,SV ** invlist,const UV offset)48*f2a19305Safresh1 Perl_populate_invlist_from_bitmap(pTHX_ const U8 * bitmap, const Size_t bitmap_len, SV ** invlist, const UV offset)
49*f2a19305Safresh1 {
50*f2a19305Safresh1     PERL_ARGS_ASSERT_POPULATE_INVLIST_FROM_BITMAP;
51*f2a19305Safresh1 
52*f2a19305Safresh1     /* As the name says.  The zeroth bit corresponds to the code point given by
53*f2a19305Safresh1      * 'offset' */
54*f2a19305Safresh1 
55*f2a19305Safresh1     Size_t i;
56*f2a19305Safresh1 
57*f2a19305Safresh1     for (i = 0; i < bitmap_len; i++) {
58*f2a19305Safresh1         if (BITMAP_TEST(bitmap, i)) {
59*f2a19305Safresh1             int start = i++;
60*f2a19305Safresh1 
61*f2a19305Safresh1             /* Save a little work by adding a range all at once instead of bit
62*f2a19305Safresh1              * by bit */
63*f2a19305Safresh1             while (i < bitmap_len && BITMAP_TEST(bitmap, i)) {
64*f2a19305Safresh1                 i++;
65*f2a19305Safresh1             }
66*f2a19305Safresh1 
67*f2a19305Safresh1             *invlist = _add_range_to_invlist(*invlist,
68*f2a19305Safresh1                                              start + offset,
69*f2a19305Safresh1                                              i + offset - 1);
70*f2a19305Safresh1         }
71*f2a19305Safresh1     }
72*f2a19305Safresh1 }
73*f2a19305Safresh1 
74*f2a19305Safresh1 /* This section of code defines the inversion list object and its methods.  The
75*f2a19305Safresh1  * interfaces are highly subject to change, so as much as possible is static to
76*f2a19305Safresh1  * this file.  An inversion list is here implemented as a malloc'd C UV array
77*f2a19305Safresh1  * as an SVt_INVLIST scalar.
78*f2a19305Safresh1  *
79*f2a19305Safresh1  * An inversion list for Unicode is an array of code points, sorted by ordinal
80*f2a19305Safresh1  * number.  Each element gives the code point that begins a range that extends
81*f2a19305Safresh1  * up-to but not including the code point given by the next element.  The final
82*f2a19305Safresh1  * element gives the first code point of a range that extends to the platform's
83*f2a19305Safresh1  * infinity.  The even-numbered elements (invlist[0], invlist[2], invlist[4],
84*f2a19305Safresh1  * ...) give ranges whose code points are all in the inversion list.  We say
85*f2a19305Safresh1  * that those ranges are in the set.  The odd-numbered elements give ranges
86*f2a19305Safresh1  * whose code points are not in the inversion list, and hence not in the set.
87*f2a19305Safresh1  * Thus, element [0] is the first code point in the list.  Element [1]
88*f2a19305Safresh1  * is the first code point beyond that not in the list; and element [2] is the
89*f2a19305Safresh1  * first code point beyond that that is in the list.  In other words, the first
90*f2a19305Safresh1  * range is invlist[0]..(invlist[1]-1), and all code points in that range are
91*f2a19305Safresh1  * in the inversion list.  The second range is invlist[1]..(invlist[2]-1), and
92*f2a19305Safresh1  * all code points in that range are not in the inversion list.  The third
93*f2a19305Safresh1  * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion
94*f2a19305Safresh1  * list, and so forth.  Thus every element whose index is divisible by two
95*f2a19305Safresh1  * gives the beginning of a range that is in the list, and every element whose
96*f2a19305Safresh1  * index is not divisible by two gives the beginning of a range not in the
97*f2a19305Safresh1  * list.  If the final element's index is divisible by two, the inversion list
98*f2a19305Safresh1  * extends to the platform's infinity; otherwise the highest code point in the
99*f2a19305Safresh1  * inversion list is the contents of that element minus 1.
100*f2a19305Safresh1  *
101*f2a19305Safresh1  * A range that contains just a single code point N will look like
102*f2a19305Safresh1  *  invlist[i]   == N
103*f2a19305Safresh1  *  invlist[i+1] == N+1
104*f2a19305Safresh1  *
105*f2a19305Safresh1  * If N is UV_MAX (the highest representable code point on the machine), N+1 is
106*f2a19305Safresh1  * impossible to represent, so element [i+1] is omitted.  The single element
107*f2a19305Safresh1  * inversion list
108*f2a19305Safresh1  *  invlist[0] == UV_MAX
109*f2a19305Safresh1  * contains just UV_MAX, but is interpreted as matching to infinity.
110*f2a19305Safresh1  *
111*f2a19305Safresh1  * Taking the complement (inverting) an inversion list is quite simple, if the
112*f2a19305Safresh1  * first element is 0, remove it; otherwise add a 0 element at the beginning.
113*f2a19305Safresh1  * This implementation reserves an element at the beginning of each inversion
114*f2a19305Safresh1  * list to always contain 0; there is an additional flag in the header which
115*f2a19305Safresh1  * indicates if the list begins at the 0, or is offset to begin at the next
116*f2a19305Safresh1  * element.  This means that the inversion list can be inverted without any
117*f2a19305Safresh1  * copying; just flip the flag.
118*f2a19305Safresh1  *
119*f2a19305Safresh1  * More about inversion lists can be found in "Unicode Demystified"
120*f2a19305Safresh1  * Chapter 13 by Richard Gillam, published by Addison-Wesley.
121*f2a19305Safresh1  *
122*f2a19305Safresh1  * The inversion list data structure is currently implemented as an SV pointing
123*f2a19305Safresh1  * to an array of UVs that the SV thinks are bytes.  This allows us to have an
124*f2a19305Safresh1  * array of UV whose memory management is automatically handled by the existing
125*f2a19305Safresh1  * facilities for SV's.
126*f2a19305Safresh1  *
127*f2a19305Safresh1  * Some of the methods should always be private to the implementation, and some
128*f2a19305Safresh1  * should eventually be made public */
129*f2a19305Safresh1 
130*f2a19305Safresh1 /* The header definitions are in F<invlist_inline.h> */
131*f2a19305Safresh1 
132*f2a19305Safresh1 #ifndef PERL_IN_XSUB_RE
133*f2a19305Safresh1 
134*f2a19305Safresh1 PERL_STATIC_INLINE UV*
S__invlist_array_init(SV * const invlist,const bool will_have_0)135*f2a19305Safresh1 S__invlist_array_init(SV* const invlist, const bool will_have_0)
136*f2a19305Safresh1 {
137*f2a19305Safresh1     /* Returns a pointer to the first element in the inversion list's array.
138*f2a19305Safresh1      * This is called upon initialization of an inversion list.  Where the
139*f2a19305Safresh1      * array begins depends on whether the list has the code point U+0000 in it
140*f2a19305Safresh1      * or not.  The other parameter tells it whether the code that follows this
141*f2a19305Safresh1      * call is about to put a 0 in the inversion list or not.  The first
142*f2a19305Safresh1      * element is either the element reserved for 0, if TRUE, or the element
143*f2a19305Safresh1      * after it, if FALSE */
144*f2a19305Safresh1 
145*f2a19305Safresh1     bool* offset = get_invlist_offset_addr(invlist);
146*f2a19305Safresh1     UV* zero_addr = (UV *) SvPVX(invlist);
147*f2a19305Safresh1 
148*f2a19305Safresh1     PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
149*f2a19305Safresh1 
150*f2a19305Safresh1     /* Must be empty */
151*f2a19305Safresh1     assert(! _invlist_len(invlist));
152*f2a19305Safresh1 
153*f2a19305Safresh1     *zero_addr = 0;
154*f2a19305Safresh1 
155*f2a19305Safresh1     /* 1^1 = 0; 1^0 = 1 */
156*f2a19305Safresh1     *offset = 1 ^ will_have_0;
157*f2a19305Safresh1     return zero_addr + *offset;
158*f2a19305Safresh1 }
159*f2a19305Safresh1 
160*f2a19305Safresh1 STATIC void
S_invlist_replace_list_destroys_src(pTHX_ SV * dest,SV * src)161*f2a19305Safresh1 S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src)
162*f2a19305Safresh1 {
163*f2a19305Safresh1     /* Replaces the inversion list in 'dest' with the one from 'src'.  It
164*f2a19305Safresh1      * steals the list from 'src', so 'src' is made to have a NULL list.  This
165*f2a19305Safresh1      * is similar to what SvSetMagicSV() would do, if it were implemented on
166*f2a19305Safresh1      * inversion lists, though this routine avoids a copy */
167*f2a19305Safresh1 
168*f2a19305Safresh1     const UV src_len          = _invlist_len(src);
169*f2a19305Safresh1     const bool src_offset     = *get_invlist_offset_addr(src);
170*f2a19305Safresh1     const STRLEN src_byte_len = SvLEN(src);
171*f2a19305Safresh1     char * array              = SvPVX(src);
172*f2a19305Safresh1 
173*f2a19305Safresh1 #ifndef NO_TAINT_SUPPORT
174*f2a19305Safresh1     const int oldtainted = TAINT_get;
175*f2a19305Safresh1 #endif
176*f2a19305Safresh1 
177*f2a19305Safresh1     PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC;
178*f2a19305Safresh1 
179*f2a19305Safresh1     assert(is_invlist(src));
180*f2a19305Safresh1     assert(is_invlist(dest));
181*f2a19305Safresh1     assert(! invlist_is_iterating(src));
182*f2a19305Safresh1     assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src));
183*f2a19305Safresh1 
184*f2a19305Safresh1     /* Make sure it ends in the right place with a NUL, as our inversion list
185*f2a19305Safresh1      * manipulations aren't careful to keep this true, but sv_usepvn_flags()
186*f2a19305Safresh1      * asserts it */
187*f2a19305Safresh1     array[src_byte_len - 1] = '\0';
188*f2a19305Safresh1 
189*f2a19305Safresh1     TAINT_NOT;      /* Otherwise it breaks */
190*f2a19305Safresh1     sv_usepvn_flags(dest,
191*f2a19305Safresh1                     (char *) array,
192*f2a19305Safresh1                     src_byte_len - 1,
193*f2a19305Safresh1 
194*f2a19305Safresh1                     /* This flag is documented to cause a copy to be avoided */
195*f2a19305Safresh1                     SV_HAS_TRAILING_NUL);
196*f2a19305Safresh1     TAINT_set(oldtainted);
197*f2a19305Safresh1     SvPV_set(src, 0);
198*f2a19305Safresh1     SvLEN_set(src, 0);
199*f2a19305Safresh1     SvCUR_set(src, 0);
200*f2a19305Safresh1 
201*f2a19305Safresh1     /* Finish up copying over the other fields in an inversion list */
202*f2a19305Safresh1     *get_invlist_offset_addr(dest) = src_offset;
203*f2a19305Safresh1     invlist_set_len(dest, src_len, src_offset);
204*f2a19305Safresh1     *get_invlist_previous_index_addr(dest) = 0;
205*f2a19305Safresh1     invlist_iterfinish(dest);
206*f2a19305Safresh1 }
207*f2a19305Safresh1 
208*f2a19305Safresh1 PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(SV * invlist)209*f2a19305Safresh1 S_get_invlist_previous_index_addr(SV* invlist)
210*f2a19305Safresh1 {
211*f2a19305Safresh1     /* Return the address of the IV that is reserved to hold the cached index
212*f2a19305Safresh1      * */
213*f2a19305Safresh1     PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
214*f2a19305Safresh1 
215*f2a19305Safresh1     assert(is_invlist(invlist));
216*f2a19305Safresh1 
217*f2a19305Safresh1     return &(((XINVLIST*) SvANY(invlist))->prev_index);
218*f2a19305Safresh1 }
219*f2a19305Safresh1 
220*f2a19305Safresh1 PERL_STATIC_INLINE IV
S_invlist_previous_index(SV * const invlist)221*f2a19305Safresh1 S_invlist_previous_index(SV* const invlist)
222*f2a19305Safresh1 {
223*f2a19305Safresh1     /* Returns cached index of previous search */
224*f2a19305Safresh1 
225*f2a19305Safresh1     PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
226*f2a19305Safresh1 
227*f2a19305Safresh1     return *get_invlist_previous_index_addr(invlist);
228*f2a19305Safresh1 }
229*f2a19305Safresh1 
230*f2a19305Safresh1 PERL_STATIC_INLINE void
S_invlist_set_previous_index(SV * const invlist,const IV index)231*f2a19305Safresh1 S_invlist_set_previous_index(SV* const invlist, const IV index)
232*f2a19305Safresh1 {
233*f2a19305Safresh1     /* Caches <index> for later retrieval */
234*f2a19305Safresh1 
235*f2a19305Safresh1     PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
236*f2a19305Safresh1 
237*f2a19305Safresh1     assert(index == 0 || index < (int) _invlist_len(invlist));
238*f2a19305Safresh1 
239*f2a19305Safresh1     *get_invlist_previous_index_addr(invlist) = index;
240*f2a19305Safresh1 }
241*f2a19305Safresh1 
242*f2a19305Safresh1 PERL_STATIC_INLINE void
S_invlist_trim(SV * invlist)243*f2a19305Safresh1 S_invlist_trim(SV* invlist)
244*f2a19305Safresh1 {
245*f2a19305Safresh1     /* Free the not currently-being-used space in an inversion list */
246*f2a19305Safresh1 
247*f2a19305Safresh1     /* But don't free up the space needed for the 0 UV that is always at the
248*f2a19305Safresh1      * beginning of the list, nor the trailing NUL */
249*f2a19305Safresh1     const UV min_size = TO_INTERNAL_SIZE(1) + 1;
250*f2a19305Safresh1 
251*f2a19305Safresh1     PERL_ARGS_ASSERT_INVLIST_TRIM;
252*f2a19305Safresh1 
253*f2a19305Safresh1     assert(is_invlist(invlist));
254*f2a19305Safresh1 
255*f2a19305Safresh1     SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1));
256*f2a19305Safresh1 }
257*f2a19305Safresh1 
258*f2a19305Safresh1 PERL_STATIC_INLINE void
S_invlist_clear(pTHX_ SV * invlist)259*f2a19305Safresh1 S_invlist_clear(pTHX_ SV* invlist)    /* Empty the inversion list */
260*f2a19305Safresh1 {
261*f2a19305Safresh1     PERL_ARGS_ASSERT_INVLIST_CLEAR;
262*f2a19305Safresh1 
263*f2a19305Safresh1     assert(is_invlist(invlist));
264*f2a19305Safresh1 
265*f2a19305Safresh1     invlist_set_len(invlist, 0, 0);
266*f2a19305Safresh1     invlist_trim(invlist);
267*f2a19305Safresh1 }
268*f2a19305Safresh1 
269*f2a19305Safresh1 PERL_STATIC_INLINE UV
S_invlist_max(const SV * const invlist)270*f2a19305Safresh1 S_invlist_max(const SV* const invlist)
271*f2a19305Safresh1 {
272*f2a19305Safresh1     /* Returns the maximum number of elements storable in the inversion list's
273*f2a19305Safresh1      * array, without having to realloc() */
274*f2a19305Safresh1 
275*f2a19305Safresh1     PERL_ARGS_ASSERT_INVLIST_MAX;
276*f2a19305Safresh1 
277*f2a19305Safresh1     assert(is_invlist(invlist));
278*f2a19305Safresh1 
279*f2a19305Safresh1     /* Assumes worst case, in which the 0 element is not counted in the
280*f2a19305Safresh1      * inversion list, so subtracts 1 for that */
281*f2a19305Safresh1     return SvLEN(invlist) == 0  /* This happens under _new_invlist_C_array */
282*f2a19305Safresh1            ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1
283*f2a19305Safresh1            : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1;
284*f2a19305Safresh1 }
285*f2a19305Safresh1 
286*f2a19305Safresh1 STATIC void
S_initialize_invlist_guts(pTHX_ SV * invlist,const Size_t initial_size)287*f2a19305Safresh1 S_initialize_invlist_guts(pTHX_ SV* invlist, const Size_t initial_size)
288*f2a19305Safresh1 {
289*f2a19305Safresh1     PERL_ARGS_ASSERT_INITIALIZE_INVLIST_GUTS;
290*f2a19305Safresh1 
291*f2a19305Safresh1     /* First 1 is in case the zero element isn't in the list; second 1 is for
292*f2a19305Safresh1      * trailing NUL */
293*f2a19305Safresh1     SvGROW(invlist, TO_INTERNAL_SIZE(initial_size + 1) + 1);
294*f2a19305Safresh1     invlist_set_len(invlist, 0, 0);
295*f2a19305Safresh1 
296*f2a19305Safresh1     /* Force iterinit() to be used to get iteration to work */
297*f2a19305Safresh1     invlist_iterfinish(invlist);
298*f2a19305Safresh1 
299*f2a19305Safresh1     *get_invlist_previous_index_addr(invlist) = 0;
300*f2a19305Safresh1     SvPOK_on(invlist);  /* This allows B to extract the PV */
301*f2a19305Safresh1 }
302*f2a19305Safresh1 
303*f2a19305Safresh1 SV*
Perl__new_invlist(pTHX_ IV initial_size)304*f2a19305Safresh1 Perl__new_invlist(pTHX_ IV initial_size)
305*f2a19305Safresh1 {
306*f2a19305Safresh1 
307*f2a19305Safresh1     /* Return a pointer to a newly constructed inversion list, with enough
308*f2a19305Safresh1      * space to store 'initial_size' elements.  If that number is negative, a
309*f2a19305Safresh1      * system default is used instead */
310*f2a19305Safresh1 
311*f2a19305Safresh1     SV* new_list;
312*f2a19305Safresh1 
313*f2a19305Safresh1     if (initial_size < 0) {
314*f2a19305Safresh1         initial_size = 10;
315*f2a19305Safresh1     }
316*f2a19305Safresh1 
317*f2a19305Safresh1     new_list = newSV_type(SVt_INVLIST);
318*f2a19305Safresh1     initialize_invlist_guts(new_list, initial_size);
319*f2a19305Safresh1 
320*f2a19305Safresh1     return new_list;
321*f2a19305Safresh1 }
322*f2a19305Safresh1 
323*f2a19305Safresh1 SV*
Perl__new_invlist_C_array(pTHX_ const UV * const list)324*f2a19305Safresh1 Perl__new_invlist_C_array(pTHX_ const UV* const list)
325*f2a19305Safresh1 {
326*f2a19305Safresh1     /* Return a pointer to a newly constructed inversion list, initialized to
327*f2a19305Safresh1      * point to <list>, which has to be in the exact correct inversion list
328*f2a19305Safresh1      * form, including internal fields.  Thus this is a dangerous routine that
329*f2a19305Safresh1      * should not be used in the wrong hands.  The passed in 'list' contains
330*f2a19305Safresh1      * several header fields at the beginning that are not part of the
331*f2a19305Safresh1      * inversion list body proper */
332*f2a19305Safresh1 
333*f2a19305Safresh1     const STRLEN length = (STRLEN) list[0];
334*f2a19305Safresh1     const UV version_id =          list[1];
335*f2a19305Safresh1     const bool offset   =    cBOOL(list[2]);
336*f2a19305Safresh1 #define HEADER_LENGTH 3
337*f2a19305Safresh1     /* If any of the above changes in any way, you must change HEADER_LENGTH
338*f2a19305Safresh1      * (if appropriate) and regenerate INVLIST_VERSION_ID by running
339*f2a19305Safresh1      *      perl -E 'say int(rand 2**31-1)'
340*f2a19305Safresh1      */
341*f2a19305Safresh1 #define INVLIST_VERSION_ID 148565664 /* This is a combination of a version and
342*f2a19305Safresh1                                         data structure type, so that one being
343*f2a19305Safresh1                                         passed in can be validated to be an
344*f2a19305Safresh1                                         inversion list of the correct vintage.
345*f2a19305Safresh1                                        */
346*f2a19305Safresh1 
347*f2a19305Safresh1     SV* invlist = newSV_type(SVt_INVLIST);
348*f2a19305Safresh1 
349*f2a19305Safresh1     PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
350*f2a19305Safresh1 
351*f2a19305Safresh1     if (version_id != INVLIST_VERSION_ID) {
352*f2a19305Safresh1         Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
353*f2a19305Safresh1     }
354*f2a19305Safresh1 
355*f2a19305Safresh1     /* The generated array passed in includes header elements that aren't part
356*f2a19305Safresh1      * of the list proper, so start it just after them */
357*f2a19305Safresh1     SvPV_set(invlist, (char *) (list + HEADER_LENGTH));
358*f2a19305Safresh1 
359*f2a19305Safresh1     SvLEN_set(invlist, 0);  /* Means we own the contents, and the system
360*f2a19305Safresh1                                shouldn't touch it */
361*f2a19305Safresh1 
362*f2a19305Safresh1     *(get_invlist_offset_addr(invlist)) = offset;
363*f2a19305Safresh1 
364*f2a19305Safresh1     /* The 'length' passed to us is the physical number of elements in the
365*f2a19305Safresh1      * inversion list.  But if there is an offset the logical number is one
366*f2a19305Safresh1      * less than that */
367*f2a19305Safresh1     invlist_set_len(invlist, length  - offset, offset);
368*f2a19305Safresh1 
369*f2a19305Safresh1     invlist_set_previous_index(invlist, 0);
370*f2a19305Safresh1 
371*f2a19305Safresh1     /* Initialize the iteration pointer. */
372*f2a19305Safresh1     invlist_iterfinish(invlist);
373*f2a19305Safresh1 
374*f2a19305Safresh1     SvREADONLY_on(invlist);
375*f2a19305Safresh1     SvPOK_on(invlist);
376*f2a19305Safresh1 
377*f2a19305Safresh1     return invlist;
378*f2a19305Safresh1 }
379*f2a19305Safresh1 
380*f2a19305Safresh1 STATIC void
S__append_range_to_invlist(pTHX_ SV * const invlist,const UV start,const UV end)381*f2a19305Safresh1 S__append_range_to_invlist(pTHX_ SV* const invlist,
382*f2a19305Safresh1                                  const UV start, const UV end)
383*f2a19305Safresh1 {
384*f2a19305Safresh1    /* Subject to change or removal.  Append the range from 'start' to 'end' at
385*f2a19305Safresh1     * the end of the inversion list.  The range must be above any existing
386*f2a19305Safresh1     * ones. */
387*f2a19305Safresh1 
388*f2a19305Safresh1     UV* array;
389*f2a19305Safresh1     UV max = invlist_max(invlist);
390*f2a19305Safresh1     UV len = _invlist_len(invlist);
391*f2a19305Safresh1     bool offset;
392*f2a19305Safresh1 
393*f2a19305Safresh1     PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
394*f2a19305Safresh1 
395*f2a19305Safresh1     if (len == 0) { /* Empty lists must be initialized */
396*f2a19305Safresh1         offset = start != 0;
397*f2a19305Safresh1         array = _invlist_array_init(invlist, ! offset);
398*f2a19305Safresh1     }
399*f2a19305Safresh1     else {
400*f2a19305Safresh1         /* Here, the existing list is non-empty. The current max entry in the
401*f2a19305Safresh1          * list is generally the first value not in the set, except when the
402*f2a19305Safresh1          * set extends to the end of permissible values, in which case it is
403*f2a19305Safresh1          * the first entry in that final set, and so this call is an attempt to
404*f2a19305Safresh1          * append out-of-order */
405*f2a19305Safresh1 
406*f2a19305Safresh1         UV final_element = len - 1;
407*f2a19305Safresh1         array = invlist_array(invlist);
408*f2a19305Safresh1         if (   array[final_element] > start
409*f2a19305Safresh1             || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
410*f2a19305Safresh1         {
411*f2a19305Safresh1             Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c",
412*f2a19305Safresh1                      array[final_element], start,
413*f2a19305Safresh1                      ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
414*f2a19305Safresh1         }
415*f2a19305Safresh1 
416*f2a19305Safresh1         /* Here, it is a legal append.  If the new range begins 1 above the end
417*f2a19305Safresh1          * of the range below it, it is extending the range below it, so the
418*f2a19305Safresh1          * new first value not in the set is one greater than the newly
419*f2a19305Safresh1          * extended range.  */
420*f2a19305Safresh1         offset = *get_invlist_offset_addr(invlist);
421*f2a19305Safresh1         if (array[final_element] == start) {
422*f2a19305Safresh1             if (end != UV_MAX) {
423*f2a19305Safresh1                 array[final_element] = end + 1;
424*f2a19305Safresh1             }
425*f2a19305Safresh1             else {
426*f2a19305Safresh1                 /* But if the end is the maximum representable on the machine,
427*f2a19305Safresh1                  * assume that infinity was actually what was meant.  Just let
428*f2a19305Safresh1                  * the range that this would extend to have no end */
429*f2a19305Safresh1                 invlist_set_len(invlist, len - 1, offset);
430*f2a19305Safresh1             }
431*f2a19305Safresh1             return;
432*f2a19305Safresh1         }
433*f2a19305Safresh1     }
434*f2a19305Safresh1 
435*f2a19305Safresh1     /* Here the new range doesn't extend any existing set.  Add it */
436*f2a19305Safresh1 
437*f2a19305Safresh1     len += 2;   /* Includes an element each for the start and end of range */
438*f2a19305Safresh1 
439*f2a19305Safresh1     /* If wll overflow the existing space, extend, which may cause the array to
440*f2a19305Safresh1      * be moved */
441*f2a19305Safresh1     if (max < len) {
442*f2a19305Safresh1         invlist_extend(invlist, len);
443*f2a19305Safresh1 
444*f2a19305Safresh1         /* Have to set len here to avoid assert failure in invlist_array() */
445*f2a19305Safresh1         invlist_set_len(invlist, len, offset);
446*f2a19305Safresh1 
447*f2a19305Safresh1         array = invlist_array(invlist);
448*f2a19305Safresh1     }
449*f2a19305Safresh1     else {
450*f2a19305Safresh1         invlist_set_len(invlist, len, offset);
451*f2a19305Safresh1     }
452*f2a19305Safresh1 
453*f2a19305Safresh1     /* The next item on the list starts the range, the one after that is
454*f2a19305Safresh1      * one past the new range.  */
455*f2a19305Safresh1     array[len - 2] = start;
456*f2a19305Safresh1     if (end != UV_MAX) {
457*f2a19305Safresh1         array[len - 1] = end + 1;
458*f2a19305Safresh1     }
459*f2a19305Safresh1     else {
460*f2a19305Safresh1         /* But if the end is the maximum representable on the machine, just let
461*f2a19305Safresh1          * the range have no end */
462*f2a19305Safresh1         invlist_set_len(invlist, len - 1, offset);
463*f2a19305Safresh1     }
464*f2a19305Safresh1 }
465*f2a19305Safresh1 
466*f2a19305Safresh1 SSize_t
Perl__invlist_search(SV * const invlist,const UV cp)467*f2a19305Safresh1 Perl__invlist_search(SV* const invlist, const UV cp)
468*f2a19305Safresh1 {
469*f2a19305Safresh1     /* Searches the inversion list for the entry that contains the input code
470*f2a19305Safresh1      * point <cp>.  If <cp> is not in the list, -1 is returned.  Otherwise, the
471*f2a19305Safresh1      * return value is the index into the list's array of the range that
472*f2a19305Safresh1      * contains <cp>, that is, 'i' such that
473*f2a19305Safresh1      *  array[i] <= cp < array[i+1]
474*f2a19305Safresh1      */
475*f2a19305Safresh1 
476*f2a19305Safresh1     IV low = 0;
477*f2a19305Safresh1     IV mid;
478*f2a19305Safresh1     IV high = _invlist_len(invlist);
479*f2a19305Safresh1     const IV highest_element = high - 1;
480*f2a19305Safresh1     const UV* array;
481*f2a19305Safresh1 
482*f2a19305Safresh1     PERL_ARGS_ASSERT__INVLIST_SEARCH;
483*f2a19305Safresh1 
484*f2a19305Safresh1     /* If list is empty, return failure. */
485*f2a19305Safresh1     if (UNLIKELY(high == 0)) {
486*f2a19305Safresh1         return -1;
487*f2a19305Safresh1     }
488*f2a19305Safresh1 
489*f2a19305Safresh1     /* (We can't get the array unless we know the list is non-empty) */
490*f2a19305Safresh1     array = invlist_array(invlist);
491*f2a19305Safresh1 
492*f2a19305Safresh1     mid = invlist_previous_index(invlist);
493*f2a19305Safresh1     assert(mid >=0);
494*f2a19305Safresh1     if (UNLIKELY(mid > highest_element)) {
495*f2a19305Safresh1         mid = highest_element;
496*f2a19305Safresh1     }
497*f2a19305Safresh1 
498*f2a19305Safresh1     /* <mid> contains the cache of the result of the previous call to this
499*f2a19305Safresh1      * function (0 the first time).  See if this call is for the same result,
500*f2a19305Safresh1      * or if it is for mid-1.  This is under the theory that calls to this
501*f2a19305Safresh1      * function will often be for related code points that are near each other.
502*f2a19305Safresh1      * And benchmarks show that caching gives better results.  We also test
503*f2a19305Safresh1      * here if the code point is within the bounds of the list.  These tests
504*f2a19305Safresh1      * replace others that would have had to be made anyway to make sure that
505*f2a19305Safresh1      * the array bounds were not exceeded, and these give us extra information
506*f2a19305Safresh1      * at the same time */
507*f2a19305Safresh1     if (cp >= array[mid]) {
508*f2a19305Safresh1         if (cp >= array[highest_element]) {
509*f2a19305Safresh1             return highest_element;
510*f2a19305Safresh1         }
511*f2a19305Safresh1 
512*f2a19305Safresh1         /* Here, array[mid] <= cp < array[highest_element].  This means that
513*f2a19305Safresh1          * the final element is not the answer, so can exclude it; it also
514*f2a19305Safresh1          * means that <mid> is not the final element, so can refer to 'mid + 1'
515*f2a19305Safresh1          * safely */
516*f2a19305Safresh1         if (cp < array[mid + 1]) {
517*f2a19305Safresh1             return mid;
518*f2a19305Safresh1         }
519*f2a19305Safresh1         high--;
520*f2a19305Safresh1         low = mid + 1;
521*f2a19305Safresh1     }
522*f2a19305Safresh1     else { /* cp < aray[mid] */
523*f2a19305Safresh1         if (cp < array[0]) { /* Fail if outside the array */
524*f2a19305Safresh1             return -1;
525*f2a19305Safresh1         }
526*f2a19305Safresh1         high = mid;
527*f2a19305Safresh1         if (cp >= array[mid - 1]) {
528*f2a19305Safresh1             goto found_entry;
529*f2a19305Safresh1         }
530*f2a19305Safresh1     }
531*f2a19305Safresh1 
532*f2a19305Safresh1     /* Binary search.  What we are looking for is <i> such that
533*f2a19305Safresh1      *  array[i] <= cp < array[i+1]
534*f2a19305Safresh1      * The loop below converges on the i+1.  Note that there may not be an
535*f2a19305Safresh1      * (i+1)th element in the array, and things work nonetheless */
536*f2a19305Safresh1     while (low < high) {
537*f2a19305Safresh1         mid = (low + high) / 2;
538*f2a19305Safresh1         assert(mid <= highest_element);
539*f2a19305Safresh1         if (array[mid] <= cp) { /* cp >= array[mid] */
540*f2a19305Safresh1             low = mid + 1;
541*f2a19305Safresh1 
542*f2a19305Safresh1             /* We could do this extra test to exit the loop early.
543*f2a19305Safresh1             if (cp < array[low]) {
544*f2a19305Safresh1                 return mid;
545*f2a19305Safresh1             }
546*f2a19305Safresh1             */
547*f2a19305Safresh1         }
548*f2a19305Safresh1         else { /* cp < array[mid] */
549*f2a19305Safresh1             high = mid;
550*f2a19305Safresh1         }
551*f2a19305Safresh1     }
552*f2a19305Safresh1 
553*f2a19305Safresh1   found_entry:
554*f2a19305Safresh1     high--;
555*f2a19305Safresh1     invlist_set_previous_index(invlist, high);
556*f2a19305Safresh1     return high;
557*f2a19305Safresh1 }
558*f2a19305Safresh1 
559*f2a19305Safresh1 void
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** output)560*f2a19305Safresh1 Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
561*f2a19305Safresh1                                          const bool complement_b, SV** output)
562*f2a19305Safresh1 {
563*f2a19305Safresh1     /* Take the union of two inversion lists and point '*output' to it.  On
564*f2a19305Safresh1      * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
565*f2a19305Safresh1      * even 'a' or 'b').  If to an inversion list, the contents of the original
566*f2a19305Safresh1      * list will be replaced by the union.  The first list, 'a', may be
567*f2a19305Safresh1      * NULL, in which case a copy of the second list is placed in '*output'.
568*f2a19305Safresh1      * If 'complement_b' is TRUE, the union is taken of the complement
569*f2a19305Safresh1      * (inversion) of 'b' instead of b itself.
570*f2a19305Safresh1      *
571*f2a19305Safresh1      * The basis for this comes from "Unicode Demystified" Chapter 13 by
572*f2a19305Safresh1      * Richard Gillam, published by Addison-Wesley, and explained at some
573*f2a19305Safresh1      * length there.  The preface says to incorporate its examples into your
574*f2a19305Safresh1      * code at your own risk.
575*f2a19305Safresh1      *
576*f2a19305Safresh1      * The algorithm is like a merge sort. */
577*f2a19305Safresh1 
578*f2a19305Safresh1     const UV* array_a;    /* a's array */
579*f2a19305Safresh1     const UV* array_b;
580*f2a19305Safresh1     UV len_a;       /* length of a's array */
581*f2a19305Safresh1     UV len_b;
582*f2a19305Safresh1 
583*f2a19305Safresh1     SV* u;                      /* the resulting union */
584*f2a19305Safresh1     UV* array_u;
585*f2a19305Safresh1     UV len_u = 0;
586*f2a19305Safresh1 
587*f2a19305Safresh1     UV i_a = 0;             /* current index into a's array */
588*f2a19305Safresh1     UV i_b = 0;
589*f2a19305Safresh1     UV i_u = 0;
590*f2a19305Safresh1 
591*f2a19305Safresh1     /* running count, as explained in the algorithm source book; items are
592*f2a19305Safresh1      * stopped accumulating and are output when the count changes to/from 0.
593*f2a19305Safresh1      * The count is incremented when we start a range that's in an input's set,
594*f2a19305Safresh1      * and decremented when we start a range that's not in a set.  So this
595*f2a19305Safresh1      * variable can be 0, 1, or 2.  When it is 0 neither input is in their set,
596*f2a19305Safresh1      * and hence nothing goes into the union; 1, just one of the inputs is in
597*f2a19305Safresh1      * its set (and its current range gets added to the union); and 2 when both
598*f2a19305Safresh1      * inputs are in their sets.  */
599*f2a19305Safresh1     UV count = 0;
600*f2a19305Safresh1 
601*f2a19305Safresh1     PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
602*f2a19305Safresh1     assert(a != b);
603*f2a19305Safresh1     assert(*output == NULL || is_invlist(*output));
604*f2a19305Safresh1 
605*f2a19305Safresh1     len_b = _invlist_len(b);
606*f2a19305Safresh1     if (len_b == 0) {
607*f2a19305Safresh1 
608*f2a19305Safresh1         /* Here, 'b' is empty, hence it's complement is all possible code
609*f2a19305Safresh1          * points.  So if the union includes the complement of 'b', it includes
610*f2a19305Safresh1          * everything, and we need not even look at 'a'.  It's easiest to
611*f2a19305Safresh1          * create a new inversion list that matches everything.  */
612*f2a19305Safresh1         if (complement_b) {
613*f2a19305Safresh1             SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX);
614*f2a19305Safresh1 
615*f2a19305Safresh1             if (*output == NULL) { /* If the output didn't exist, just point it
616*f2a19305Safresh1                                       at the new list */
617*f2a19305Safresh1                 *output = everything;
618*f2a19305Safresh1             }
619*f2a19305Safresh1             else { /* Otherwise, replace its contents with the new list */
620*f2a19305Safresh1                 invlist_replace_list_destroys_src(*output, everything);
621*f2a19305Safresh1                 SvREFCNT_dec_NN(everything);
622*f2a19305Safresh1             }
623*f2a19305Safresh1 
624*f2a19305Safresh1             return;
625*f2a19305Safresh1         }
626*f2a19305Safresh1 
627*f2a19305Safresh1         /* Here, we don't want the complement of 'b', and since 'b' is empty,
628*f2a19305Safresh1          * the union will come entirely from 'a'.  If 'a' is NULL or empty, the
629*f2a19305Safresh1          * output will be empty */
630*f2a19305Safresh1 
631*f2a19305Safresh1         if (a == NULL || _invlist_len(a) == 0) {
632*f2a19305Safresh1             if (*output == NULL) {
633*f2a19305Safresh1                 *output = _new_invlist(0);
634*f2a19305Safresh1             }
635*f2a19305Safresh1             else {
636*f2a19305Safresh1                 invlist_clear(*output);
637*f2a19305Safresh1             }
638*f2a19305Safresh1             return;
639*f2a19305Safresh1         }
640*f2a19305Safresh1 
641*f2a19305Safresh1         /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the
642*f2a19305Safresh1          * union.  We can just return a copy of 'a' if '*output' doesn't point
643*f2a19305Safresh1          * to an existing list */
644*f2a19305Safresh1         if (*output == NULL) {
645*f2a19305Safresh1             *output = invlist_clone(a, NULL);
646*f2a19305Safresh1             return;
647*f2a19305Safresh1         }
648*f2a19305Safresh1 
649*f2a19305Safresh1         /* If the output is to overwrite 'a', we have a no-op, as it's
650*f2a19305Safresh1          * already in 'a' */
651*f2a19305Safresh1         if (*output == a) {
652*f2a19305Safresh1             return;
653*f2a19305Safresh1         }
654*f2a19305Safresh1 
655*f2a19305Safresh1         /* Here, '*output' is to be overwritten by 'a' */
656*f2a19305Safresh1         u = invlist_clone(a, NULL);
657*f2a19305Safresh1         invlist_replace_list_destroys_src(*output, u);
658*f2a19305Safresh1         SvREFCNT_dec_NN(u);
659*f2a19305Safresh1 
660*f2a19305Safresh1         return;
661*f2a19305Safresh1     }
662*f2a19305Safresh1 
663*f2a19305Safresh1     /* Here 'b' is not empty.  See about 'a' */
664*f2a19305Safresh1 
665*f2a19305Safresh1     if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
666*f2a19305Safresh1 
667*f2a19305Safresh1         /* Here, 'a' is empty (and b is not).  That means the union will come
668*f2a19305Safresh1          * entirely from 'b'.  If '*output' is NULL, we can directly return a
669*f2a19305Safresh1          * clone of 'b'.  Otherwise, we replace the contents of '*output' with
670*f2a19305Safresh1          * the clone */
671*f2a19305Safresh1 
672*f2a19305Safresh1         SV ** dest = (*output == NULL) ? output : &u;
673*f2a19305Safresh1         *dest = invlist_clone(b, NULL);
674*f2a19305Safresh1         if (complement_b) {
675*f2a19305Safresh1             _invlist_invert(*dest);
676*f2a19305Safresh1         }
677*f2a19305Safresh1 
678*f2a19305Safresh1         if (dest == &u) {
679*f2a19305Safresh1             invlist_replace_list_destroys_src(*output, u);
680*f2a19305Safresh1             SvREFCNT_dec_NN(u);
681*f2a19305Safresh1         }
682*f2a19305Safresh1 
683*f2a19305Safresh1         return;
684*f2a19305Safresh1     }
685*f2a19305Safresh1 
686*f2a19305Safresh1     /* Here both lists exist and are non-empty */
687*f2a19305Safresh1     array_a = invlist_array(a);
688*f2a19305Safresh1     array_b = invlist_array(b);
689*f2a19305Safresh1 
690*f2a19305Safresh1     /* If are to take the union of 'a' with the complement of b, set it
691*f2a19305Safresh1      * up so are looking at b's complement. */
692*f2a19305Safresh1     if (complement_b) {
693*f2a19305Safresh1 
694*f2a19305Safresh1         /* To complement, we invert: if the first element is 0, remove it.  To
695*f2a19305Safresh1          * do this, we just pretend the array starts one later */
696*f2a19305Safresh1         if (array_b[0] == 0) {
697*f2a19305Safresh1             array_b++;
698*f2a19305Safresh1             len_b--;
699*f2a19305Safresh1         }
700*f2a19305Safresh1         else {
701*f2a19305Safresh1 
702*f2a19305Safresh1             /* But if the first element is not zero, we pretend the list starts
703*f2a19305Safresh1              * at the 0 that is always stored immediately before the array. */
704*f2a19305Safresh1             array_b--;
705*f2a19305Safresh1             len_b++;
706*f2a19305Safresh1         }
707*f2a19305Safresh1     }
708*f2a19305Safresh1 
709*f2a19305Safresh1     /* Size the union for the worst case: that the sets are completely
710*f2a19305Safresh1      * disjoint */
711*f2a19305Safresh1     u = _new_invlist(len_a + len_b);
712*f2a19305Safresh1 
713*f2a19305Safresh1     /* Will contain U+0000 if either component does */
714*f2a19305Safresh1     array_u = _invlist_array_init(u, (    len_a > 0 && array_a[0] == 0)
715*f2a19305Safresh1                                       || (len_b > 0 && array_b[0] == 0));
716*f2a19305Safresh1 
717*f2a19305Safresh1     /* Go through each input list item by item, stopping when have exhausted
718*f2a19305Safresh1      * one of them */
719*f2a19305Safresh1     while (i_a < len_a && i_b < len_b) {
720*f2a19305Safresh1         UV cp;      /* The element to potentially add to the union's array */
721*f2a19305Safresh1         bool cp_in_set;   /* is it in the input list's set or not */
722*f2a19305Safresh1 
723*f2a19305Safresh1         /* We need to take one or the other of the two inputs for the union.
724*f2a19305Safresh1          * Since we are merging two sorted lists, we take the smaller of the
725*f2a19305Safresh1          * next items.  In case of a tie, we take first the one that is in its
726*f2a19305Safresh1          * set.  If we first took the one not in its set, it would decrement
727*f2a19305Safresh1          * the count, possibly to 0 which would cause it to be output as ending
728*f2a19305Safresh1          * the range, and the next time through we would take the same number,
729*f2a19305Safresh1          * and output it again as beginning the next range.  By doing it the
730*f2a19305Safresh1          * opposite way, there is no possibility that the count will be
731*f2a19305Safresh1          * momentarily decremented to 0, and thus the two adjoining ranges will
732*f2a19305Safresh1          * be seamlessly merged.  (In a tie and both are in the set or both not
733*f2a19305Safresh1          * in the set, it doesn't matter which we take first.) */
734*f2a19305Safresh1         if (       array_a[i_a] < array_b[i_b]
735*f2a19305Safresh1             || (   array_a[i_a] == array_b[i_b]
736*f2a19305Safresh1                 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
737*f2a19305Safresh1         {
738*f2a19305Safresh1             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
739*f2a19305Safresh1             cp = array_a[i_a++];
740*f2a19305Safresh1         }
741*f2a19305Safresh1         else {
742*f2a19305Safresh1             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
743*f2a19305Safresh1             cp = array_b[i_b++];
744*f2a19305Safresh1         }
745*f2a19305Safresh1 
746*f2a19305Safresh1         /* Here, have chosen which of the two inputs to look at.  Only output
747*f2a19305Safresh1          * if the running count changes to/from 0, which marks the
748*f2a19305Safresh1          * beginning/end of a range that's in the set */
749*f2a19305Safresh1         if (cp_in_set) {
750*f2a19305Safresh1             if (count == 0) {
751*f2a19305Safresh1                 array_u[i_u++] = cp;
752*f2a19305Safresh1             }
753*f2a19305Safresh1             count++;
754*f2a19305Safresh1         }
755*f2a19305Safresh1         else {
756*f2a19305Safresh1             count--;
757*f2a19305Safresh1             if (count == 0) {
758*f2a19305Safresh1                 array_u[i_u++] = cp;
759*f2a19305Safresh1             }
760*f2a19305Safresh1         }
761*f2a19305Safresh1     }
762*f2a19305Safresh1 
763*f2a19305Safresh1 
764*f2a19305Safresh1     /* The loop above increments the index into exactly one of the input lists
765*f2a19305Safresh1      * each iteration, and ends when either index gets to its list end.  That
766*f2a19305Safresh1      * means the other index is lower than its end, and so something is
767*f2a19305Safresh1      * remaining in that one.  We decrement 'count', as explained below, if
768*f2a19305Safresh1      * that list is in its set.  (i_a and i_b each currently index the element
769*f2a19305Safresh1      * beyond the one we care about.) */
770*f2a19305Safresh1     if (   (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
771*f2a19305Safresh1         || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
772*f2a19305Safresh1     {
773*f2a19305Safresh1         count--;
774*f2a19305Safresh1     }
775*f2a19305Safresh1 
776*f2a19305Safresh1     /* Above we decremented 'count' if the list that had unexamined elements in
777*f2a19305Safresh1      * it was in its set.  This has made it so that 'count' being non-zero
778*f2a19305Safresh1      * means there isn't anything left to output; and 'count' equal to 0 means
779*f2a19305Safresh1      * that what is left to output is precisely that which is left in the
780*f2a19305Safresh1      * non-exhausted input list.
781*f2a19305Safresh1      *
782*f2a19305Safresh1      * To see why, note first that the exhausted input obviously has nothing
783*f2a19305Safresh1      * left to add to the union.  If it was in its set at its end, that means
784*f2a19305Safresh1      * the set extends from here to the platform's infinity, and hence so does
785*f2a19305Safresh1      * the union and the non-exhausted set is irrelevant.  The exhausted set
786*f2a19305Safresh1      * also contributed 1 to 'count'.  If 'count' was 2, it got decremented to
787*f2a19305Safresh1      * 1, but if it was 1, the non-exhausted set wasn't in its set, and so
788*f2a19305Safresh1      * 'count' remains at 1.  This is consistent with the decremented 'count'
789*f2a19305Safresh1      * != 0 meaning there's nothing left to add to the union.
790*f2a19305Safresh1      *
791*f2a19305Safresh1      * But if the exhausted input wasn't in its set, it contributed 0 to
792*f2a19305Safresh1      * 'count', and the rest of the union will be whatever the other input is.
793*f2a19305Safresh1      * If 'count' was 0, neither list was in its set, and 'count' remains 0;
794*f2a19305Safresh1      * otherwise it gets decremented to 0.  This is consistent with 'count'
795*f2a19305Safresh1      * == 0 meaning the remainder of the union is whatever is left in the
796*f2a19305Safresh1      * non-exhausted list. */
797*f2a19305Safresh1     if (count != 0) {
798*f2a19305Safresh1         len_u = i_u;
799*f2a19305Safresh1     }
800*f2a19305Safresh1     else {
801*f2a19305Safresh1         IV copy_count = len_a - i_a;
802*f2a19305Safresh1         if (copy_count > 0) {   /* The non-exhausted input is 'a' */
803*f2a19305Safresh1             Copy(array_a + i_a, array_u + i_u, copy_count, UV);
804*f2a19305Safresh1         }
805*f2a19305Safresh1         else { /* The non-exhausted input is b */
806*f2a19305Safresh1             copy_count = len_b - i_b;
807*f2a19305Safresh1             Copy(array_b + i_b, array_u + i_u, copy_count, UV);
808*f2a19305Safresh1         }
809*f2a19305Safresh1         len_u = i_u + copy_count;
810*f2a19305Safresh1     }
811*f2a19305Safresh1 
812*f2a19305Safresh1     /* Set the result to the final length, which can change the pointer to
813*f2a19305Safresh1      * array_u, so re-find it.  (Note that it is unlikely that this will
814*f2a19305Safresh1      * change, as we are shrinking the space, not enlarging it) */
815*f2a19305Safresh1     if (len_u != _invlist_len(u)) {
816*f2a19305Safresh1         invlist_set_len(u, len_u, *get_invlist_offset_addr(u));
817*f2a19305Safresh1         invlist_trim(u);
818*f2a19305Safresh1         array_u = invlist_array(u);
819*f2a19305Safresh1     }
820*f2a19305Safresh1 
821*f2a19305Safresh1     if (*output == NULL) {  /* Simply return the new inversion list */
822*f2a19305Safresh1         *output = u;
823*f2a19305Safresh1     }
824*f2a19305Safresh1     else {
825*f2a19305Safresh1         /* Otherwise, overwrite the inversion list that was in '*output'.  We
826*f2a19305Safresh1          * could instead free '*output', and then set it to 'u', but experience
827*f2a19305Safresh1          * has shown [perl #127392] that if the input is a mortal, we can get a
828*f2a19305Safresh1          * huge build-up of these during regex compilation before they get
829*f2a19305Safresh1          * freed. */
830*f2a19305Safresh1         invlist_replace_list_destroys_src(*output, u);
831*f2a19305Safresh1         SvREFCNT_dec_NN(u);
832*f2a19305Safresh1     }
833*f2a19305Safresh1 
834*f2a19305Safresh1     return;
835*f2a19305Safresh1 }
836*f2a19305Safresh1 
837*f2a19305Safresh1 void
Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV * const a,SV * const b,const bool complement_b,SV ** i)838*f2a19305Safresh1 Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b,
839*f2a19305Safresh1                                                const bool complement_b, SV** i)
840*f2a19305Safresh1 {
841*f2a19305Safresh1     /* Take the intersection of two inversion lists and point '*i' to it.  On
842*f2a19305Safresh1      * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly
843*f2a19305Safresh1      * even 'a' or 'b').  If to an inversion list, the contents of the original
844*f2a19305Safresh1      * list will be replaced by the intersection.  The first list, 'a', may be
845*f2a19305Safresh1      * NULL, in which case '*i' will be an empty list.  If 'complement_b' is
846*f2a19305Safresh1      * TRUE, the result will be the intersection of 'a' and the complement (or
847*f2a19305Safresh1      * inversion) of 'b' instead of 'b' directly.
848*f2a19305Safresh1      *
849*f2a19305Safresh1      * The basis for this comes from "Unicode Demystified" Chapter 13 by
850*f2a19305Safresh1      * Richard Gillam, published by Addison-Wesley, and explained at some
851*f2a19305Safresh1      * length there.  The preface says to incorporate its examples into your
852*f2a19305Safresh1      * code at your own risk.  In fact, it had bugs
853*f2a19305Safresh1      *
854*f2a19305Safresh1      * The algorithm is like a merge sort, and is essentially the same as the
855*f2a19305Safresh1      * union above
856*f2a19305Safresh1      */
857*f2a19305Safresh1 
858*f2a19305Safresh1     const UV* array_a;          /* a's array */
859*f2a19305Safresh1     const UV* array_b;
860*f2a19305Safresh1     UV len_a;   /* length of a's array */
861*f2a19305Safresh1     UV len_b;
862*f2a19305Safresh1 
863*f2a19305Safresh1     SV* r;                   /* the resulting intersection */
864*f2a19305Safresh1     UV* array_r;
865*f2a19305Safresh1     UV len_r = 0;
866*f2a19305Safresh1 
867*f2a19305Safresh1     UV i_a = 0;             /* current index into a's array */
868*f2a19305Safresh1     UV i_b = 0;
869*f2a19305Safresh1     UV i_r = 0;
870*f2a19305Safresh1 
871*f2a19305Safresh1     /* running count of how many of the two inputs are postitioned at ranges
872*f2a19305Safresh1      * that are in their sets.  As explained in the algorithm source book,
873*f2a19305Safresh1      * items are stopped accumulating and are output when the count changes
874*f2a19305Safresh1      * to/from 2.  The count is incremented when we start a range that's in an
875*f2a19305Safresh1      * input's set, and decremented when we start a range that's not in a set.
876*f2a19305Safresh1      * Only when it is 2 are we in the intersection. */
877*f2a19305Safresh1     UV count = 0;
878*f2a19305Safresh1 
879*f2a19305Safresh1     PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
880*f2a19305Safresh1     assert(a != b);
881*f2a19305Safresh1     assert(*i == NULL || is_invlist(*i));
882*f2a19305Safresh1 
883*f2a19305Safresh1     /* Special case if either one is empty */
884*f2a19305Safresh1     len_a = (a == NULL) ? 0 : _invlist_len(a);
885*f2a19305Safresh1     if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
886*f2a19305Safresh1         if (len_a != 0 && complement_b) {
887*f2a19305Safresh1 
888*f2a19305Safresh1             /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b'
889*f2a19305Safresh1              * must be empty.  Here, also we are using 'b's complement, which
890*f2a19305Safresh1              * hence must be every possible code point.  Thus the intersection
891*f2a19305Safresh1              * is simply 'a'. */
892*f2a19305Safresh1 
893*f2a19305Safresh1             if (*i == a) {  /* No-op */
894*f2a19305Safresh1                 return;
895*f2a19305Safresh1             }
896*f2a19305Safresh1 
897*f2a19305Safresh1             if (*i == NULL) {
898*f2a19305Safresh1                 *i = invlist_clone(a, NULL);
899*f2a19305Safresh1                 return;
900*f2a19305Safresh1             }
901*f2a19305Safresh1 
902*f2a19305Safresh1             r = invlist_clone(a, NULL);
903*f2a19305Safresh1             invlist_replace_list_destroys_src(*i, r);
904*f2a19305Safresh1             SvREFCNT_dec_NN(r);
905*f2a19305Safresh1             return;
906*f2a19305Safresh1         }
907*f2a19305Safresh1 
908*f2a19305Safresh1         /* Here, 'a' or 'b' is empty and not using the complement of 'b'.  The
909*f2a19305Safresh1          * intersection must be empty */
910*f2a19305Safresh1         if (*i == NULL) {
911*f2a19305Safresh1             *i = _new_invlist(0);
912*f2a19305Safresh1             return;
913*f2a19305Safresh1         }
914*f2a19305Safresh1 
915*f2a19305Safresh1         invlist_clear(*i);
916*f2a19305Safresh1         return;
917*f2a19305Safresh1     }
918*f2a19305Safresh1 
919*f2a19305Safresh1     /* Here both lists exist and are non-empty */
920*f2a19305Safresh1     array_a = invlist_array(a);
921*f2a19305Safresh1     array_b = invlist_array(b);
922*f2a19305Safresh1 
923*f2a19305Safresh1     /* If are to take the intersection of 'a' with the complement of b, set it
924*f2a19305Safresh1      * up so are looking at b's complement. */
925*f2a19305Safresh1     if (complement_b) {
926*f2a19305Safresh1 
927*f2a19305Safresh1         /* To complement, we invert: if the first element is 0, remove it.  To
928*f2a19305Safresh1          * do this, we just pretend the array starts one later */
929*f2a19305Safresh1         if (array_b[0] == 0) {
930*f2a19305Safresh1             array_b++;
931*f2a19305Safresh1             len_b--;
932*f2a19305Safresh1         }
933*f2a19305Safresh1         else {
934*f2a19305Safresh1 
935*f2a19305Safresh1             /* But if the first element is not zero, we pretend the list starts
936*f2a19305Safresh1              * at the 0 that is always stored immediately before the array. */
937*f2a19305Safresh1             array_b--;
938*f2a19305Safresh1             len_b++;
939*f2a19305Safresh1         }
940*f2a19305Safresh1     }
941*f2a19305Safresh1 
942*f2a19305Safresh1     /* Size the intersection for the worst case: that the intersection ends up
943*f2a19305Safresh1      * fragmenting everything to be completely disjoint */
944*f2a19305Safresh1     r= _new_invlist(len_a + len_b);
945*f2a19305Safresh1 
946*f2a19305Safresh1     /* Will contain U+0000 iff both components do */
947*f2a19305Safresh1     array_r = _invlist_array_init(r,    len_a > 0 && array_a[0] == 0
948*f2a19305Safresh1                                      && len_b > 0 && array_b[0] == 0);
949*f2a19305Safresh1 
950*f2a19305Safresh1     /* Go through each list item by item, stopping when have exhausted one of
951*f2a19305Safresh1      * them */
952*f2a19305Safresh1     while (i_a < len_a && i_b < len_b) {
953*f2a19305Safresh1         UV cp;      /* The element to potentially add to the intersection's
954*f2a19305Safresh1                        array */
955*f2a19305Safresh1         bool cp_in_set; /* Is it in the input list's set or not */
956*f2a19305Safresh1 
957*f2a19305Safresh1         /* We need to take one or the other of the two inputs for the
958*f2a19305Safresh1          * intersection.  Since we are merging two sorted lists, we take the
959*f2a19305Safresh1          * smaller of the next items.  In case of a tie, we take first the one
960*f2a19305Safresh1          * that is not in its set (a difference from the union algorithm).  If
961*f2a19305Safresh1          * we first took the one in its set, it would increment the count,
962*f2a19305Safresh1          * possibly to 2 which would cause it to be output as starting a range
963*f2a19305Safresh1          * in the intersection, and the next time through we would take that
964*f2a19305Safresh1          * same number, and output it again as ending the set.  By doing the
965*f2a19305Safresh1          * opposite of this, there is no possibility that the count will be
966*f2a19305Safresh1          * momentarily incremented to 2.  (In a tie and both are in the set or
967*f2a19305Safresh1          * both not in the set, it doesn't matter which we take first.) */
968*f2a19305Safresh1         if (       array_a[i_a] < array_b[i_b]
969*f2a19305Safresh1             || (   array_a[i_a] == array_b[i_b]
970*f2a19305Safresh1                 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
971*f2a19305Safresh1         {
972*f2a19305Safresh1             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
973*f2a19305Safresh1             cp = array_a[i_a++];
974*f2a19305Safresh1         }
975*f2a19305Safresh1         else {
976*f2a19305Safresh1             cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
977*f2a19305Safresh1             cp= array_b[i_b++];
978*f2a19305Safresh1         }
979*f2a19305Safresh1 
980*f2a19305Safresh1         /* Here, have chosen which of the two inputs to look at.  Only output
981*f2a19305Safresh1          * if the running count changes to/from 2, which marks the
982*f2a19305Safresh1          * beginning/end of a range that's in the intersection */
983*f2a19305Safresh1         if (cp_in_set) {
984*f2a19305Safresh1             count++;
985*f2a19305Safresh1             if (count == 2) {
986*f2a19305Safresh1                 array_r[i_r++] = cp;
987*f2a19305Safresh1             }
988*f2a19305Safresh1         }
989*f2a19305Safresh1         else {
990*f2a19305Safresh1             if (count == 2) {
991*f2a19305Safresh1                 array_r[i_r++] = cp;
992*f2a19305Safresh1             }
993*f2a19305Safresh1             count--;
994*f2a19305Safresh1         }
995*f2a19305Safresh1 
996*f2a19305Safresh1     }
997*f2a19305Safresh1 
998*f2a19305Safresh1     /* The loop above increments the index into exactly one of the input lists
999*f2a19305Safresh1      * each iteration, and ends when either index gets to its list end.  That
1000*f2a19305Safresh1      * means the other index is lower than its end, and so something is
1001*f2a19305Safresh1      * remaining in that one.  We increment 'count', as explained below, if the
1002*f2a19305Safresh1      * exhausted list was in its set.  (i_a and i_b each currently index the
1003*f2a19305Safresh1      * element beyond the one we care about.) */
1004*f2a19305Safresh1     if (   (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
1005*f2a19305Safresh1         || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
1006*f2a19305Safresh1     {
1007*f2a19305Safresh1         count++;
1008*f2a19305Safresh1     }
1009*f2a19305Safresh1 
1010*f2a19305Safresh1     /* Above we incremented 'count' if the exhausted list was in its set.  This
1011*f2a19305Safresh1      * has made it so that 'count' being below 2 means there is nothing left to
1012*f2a19305Safresh1      * output; otheriwse what's left to add to the intersection is precisely
1013*f2a19305Safresh1      * that which is left in the non-exhausted input list.
1014*f2a19305Safresh1      *
1015*f2a19305Safresh1      * To see why, note first that the exhausted input obviously has nothing
1016*f2a19305Safresh1      * left to affect the intersection.  If it was in its set at its end, that
1017*f2a19305Safresh1      * means the set extends from here to the platform's infinity, and hence
1018*f2a19305Safresh1      * anything in the non-exhausted's list will be in the intersection, and
1019*f2a19305Safresh1      * anything not in it won't be.  Hence, the rest of the intersection is
1020*f2a19305Safresh1      * precisely what's in the non-exhausted list  The exhausted set also
1021*f2a19305Safresh1      * contributed 1 to 'count', meaning 'count' was at least 1.  Incrementing
1022*f2a19305Safresh1      * it means 'count' is now at least 2.  This is consistent with the
1023*f2a19305Safresh1      * incremented 'count' being >= 2 means to add the non-exhausted list to
1024*f2a19305Safresh1      * the intersection.
1025*f2a19305Safresh1      *
1026*f2a19305Safresh1      * But if the exhausted input wasn't in its set, it contributed 0 to
1027*f2a19305Safresh1      * 'count', and the intersection can't include anything further; the
1028*f2a19305Safresh1      * non-exhausted set is irrelevant.  'count' was at most 1, and doesn't get
1029*f2a19305Safresh1      * incremented.  This is consistent with 'count' being < 2 meaning nothing
1030*f2a19305Safresh1      * further to add to the intersection. */
1031*f2a19305Safresh1     if (count < 2) { /* Nothing left to put in the intersection. */
1032*f2a19305Safresh1         len_r = i_r;
1033*f2a19305Safresh1     }
1034*f2a19305Safresh1     else { /* copy the non-exhausted list, unchanged. */
1035*f2a19305Safresh1         IV copy_count = len_a - i_a;
1036*f2a19305Safresh1         if (copy_count > 0) {   /* a is the one with stuff left */
1037*f2a19305Safresh1             Copy(array_a + i_a, array_r + i_r, copy_count, UV);
1038*f2a19305Safresh1         }
1039*f2a19305Safresh1         else {  /* b is the one with stuff left */
1040*f2a19305Safresh1             copy_count = len_b - i_b;
1041*f2a19305Safresh1             Copy(array_b + i_b, array_r + i_r, copy_count, UV);
1042*f2a19305Safresh1         }
1043*f2a19305Safresh1         len_r = i_r + copy_count;
1044*f2a19305Safresh1     }
1045*f2a19305Safresh1 
1046*f2a19305Safresh1     /* Set the result to the final length, which can change the pointer to
1047*f2a19305Safresh1      * array_r, so re-find it.  (Note that it is unlikely that this will
1048*f2a19305Safresh1      * change, as we are shrinking the space, not enlarging it) */
1049*f2a19305Safresh1     if (len_r != _invlist_len(r)) {
1050*f2a19305Safresh1         invlist_set_len(r, len_r, *get_invlist_offset_addr(r));
1051*f2a19305Safresh1         invlist_trim(r);
1052*f2a19305Safresh1         array_r = invlist_array(r);
1053*f2a19305Safresh1     }
1054*f2a19305Safresh1 
1055*f2a19305Safresh1     if (*i == NULL) { /* Simply return the calculated intersection */
1056*f2a19305Safresh1         *i = r;
1057*f2a19305Safresh1     }
1058*f2a19305Safresh1     else { /* Otherwise, replace the existing inversion list in '*i'.  We could
1059*f2a19305Safresh1               instead free '*i', and then set it to 'r', but experience has
1060*f2a19305Safresh1               shown [perl #127392] that if the input is a mortal, we can get a
1061*f2a19305Safresh1               huge build-up of these during regex compilation before they get
1062*f2a19305Safresh1               freed. */
1063*f2a19305Safresh1         if (len_r) {
1064*f2a19305Safresh1             invlist_replace_list_destroys_src(*i, r);
1065*f2a19305Safresh1         }
1066*f2a19305Safresh1         else {
1067*f2a19305Safresh1             invlist_clear(*i);
1068*f2a19305Safresh1         }
1069*f2a19305Safresh1         SvREFCNT_dec_NN(r);
1070*f2a19305Safresh1     }
1071*f2a19305Safresh1 
1072*f2a19305Safresh1     return;
1073*f2a19305Safresh1 }
1074*f2a19305Safresh1 
1075*f2a19305Safresh1 SV*
Perl__add_range_to_invlist(pTHX_ SV * invlist,UV start,UV end)1076*f2a19305Safresh1 Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end)
1077*f2a19305Safresh1 {
1078*f2a19305Safresh1     /* Add the range from 'start' to 'end' inclusive to the inversion list's
1079*f2a19305Safresh1      * set.  A pointer to the inversion list is returned.  This may actually be
1080*f2a19305Safresh1      * a new list, in which case the passed in one has been destroyed.  The
1081*f2a19305Safresh1      * passed-in inversion list can be NULL, in which case a new one is created
1082*f2a19305Safresh1      * with just the one range in it.  The new list is not necessarily
1083*f2a19305Safresh1      * NUL-terminated.  Space is not freed if the inversion list shrinks as a
1084*f2a19305Safresh1      * result of this function.  The gain would not be large, and in many
1085*f2a19305Safresh1      * cases, this is called multiple times on a single inversion list, so
1086*f2a19305Safresh1      * anything freed may almost immediately be needed again.
1087*f2a19305Safresh1      *
1088*f2a19305Safresh1      * This used to mostly call the 'union' routine, but that is much more
1089*f2a19305Safresh1      * heavyweight than really needed for a single range addition */
1090*f2a19305Safresh1 
1091*f2a19305Safresh1     UV* array;              /* The array implementing the inversion list */
1092*f2a19305Safresh1     UV len;                 /* How many elements in 'array' */
1093*f2a19305Safresh1     SSize_t i_s;            /* index into the invlist array where 'start'
1094*f2a19305Safresh1                                should go */
1095*f2a19305Safresh1     SSize_t i_e = 0;        /* And the index where 'end' should go */
1096*f2a19305Safresh1     UV cur_highest;         /* The highest code point in the inversion list
1097*f2a19305Safresh1                                upon entry to this function */
1098*f2a19305Safresh1 
1099*f2a19305Safresh1     /* This range becomes the whole inversion list if none already existed */
1100*f2a19305Safresh1     if (invlist == NULL) {
1101*f2a19305Safresh1         invlist = _new_invlist(2);
1102*f2a19305Safresh1         _append_range_to_invlist(invlist, start, end);
1103*f2a19305Safresh1         return invlist;
1104*f2a19305Safresh1     }
1105*f2a19305Safresh1 
1106*f2a19305Safresh1     /* Likewise, if the inversion list is currently empty */
1107*f2a19305Safresh1     len = _invlist_len(invlist);
1108*f2a19305Safresh1     if (len == 0) {
1109*f2a19305Safresh1         _append_range_to_invlist(invlist, start, end);
1110*f2a19305Safresh1         return invlist;
1111*f2a19305Safresh1     }
1112*f2a19305Safresh1 
1113*f2a19305Safresh1     /* Starting here, we have to know the internals of the list */
1114*f2a19305Safresh1     array = invlist_array(invlist);
1115*f2a19305Safresh1 
1116*f2a19305Safresh1     /* If the new range ends higher than the current highest ... */
1117*f2a19305Safresh1     cur_highest = invlist_highest(invlist);
1118*f2a19305Safresh1     if (end > cur_highest) {
1119*f2a19305Safresh1 
1120*f2a19305Safresh1         /* If the whole range is higher, we can just append it */
1121*f2a19305Safresh1         if (start > cur_highest) {
1122*f2a19305Safresh1             _append_range_to_invlist(invlist, start, end);
1123*f2a19305Safresh1             return invlist;
1124*f2a19305Safresh1         }
1125*f2a19305Safresh1 
1126*f2a19305Safresh1         /* Otherwise, add the portion that is higher ... */
1127*f2a19305Safresh1         _append_range_to_invlist(invlist, cur_highest + 1, end);
1128*f2a19305Safresh1 
1129*f2a19305Safresh1         /* ... and continue on below to handle the rest.  As a result of the
1130*f2a19305Safresh1          * above append, we know that the index of the end of the range is the
1131*f2a19305Safresh1          * final even numbered one of the array.  Recall that the final element
1132*f2a19305Safresh1          * always starts a range that extends to infinity.  If that range is in
1133*f2a19305Safresh1          * the set (meaning the set goes from here to infinity), it will be an
1134*f2a19305Safresh1          * even index, but if it isn't in the set, it's odd, and the final
1135*f2a19305Safresh1          * range in the set is one less, which is even. */
1136*f2a19305Safresh1         if (end == UV_MAX) {
1137*f2a19305Safresh1             i_e = len;
1138*f2a19305Safresh1         }
1139*f2a19305Safresh1         else {
1140*f2a19305Safresh1             i_e = len - 2;
1141*f2a19305Safresh1         }
1142*f2a19305Safresh1     }
1143*f2a19305Safresh1 
1144*f2a19305Safresh1     /* We have dealt with appending, now see about prepending.  If the new
1145*f2a19305Safresh1      * range starts lower than the current lowest ... */
1146*f2a19305Safresh1     if (start < array[0]) {
1147*f2a19305Safresh1 
1148*f2a19305Safresh1         /* Adding something which has 0 in it is somewhat tricky, and uncommon.
1149*f2a19305Safresh1          * Let the union code handle it, rather than having to know the
1150*f2a19305Safresh1          * trickiness in two code places.  */
1151*f2a19305Safresh1         if (UNLIKELY(start == 0)) {
1152*f2a19305Safresh1             SV* range_invlist;
1153*f2a19305Safresh1 
1154*f2a19305Safresh1             range_invlist = _new_invlist(2);
1155*f2a19305Safresh1             _append_range_to_invlist(range_invlist, start, end);
1156*f2a19305Safresh1 
1157*f2a19305Safresh1             _invlist_union(invlist, range_invlist, &invlist);
1158*f2a19305Safresh1 
1159*f2a19305Safresh1             SvREFCNT_dec_NN(range_invlist);
1160*f2a19305Safresh1 
1161*f2a19305Safresh1             return invlist;
1162*f2a19305Safresh1         }
1163*f2a19305Safresh1 
1164*f2a19305Safresh1         /* If the whole new range comes before the first entry, and doesn't
1165*f2a19305Safresh1          * extend it, we have to insert it as an additional range */
1166*f2a19305Safresh1         if (end < array[0] - 1) {
1167*f2a19305Safresh1             i_s = i_e = -1;
1168*f2a19305Safresh1             goto splice_in_new_range;
1169*f2a19305Safresh1         }
1170*f2a19305Safresh1 
1171*f2a19305Safresh1         /* Here the new range adjoins the existing first range, extending it
1172*f2a19305Safresh1          * downwards. */
1173*f2a19305Safresh1         array[0] = start;
1174*f2a19305Safresh1 
1175*f2a19305Safresh1         /* And continue on below to handle the rest.  We know that the index of
1176*f2a19305Safresh1          * the beginning of the range is the first one of the array */
1177*f2a19305Safresh1         i_s = 0;
1178*f2a19305Safresh1     }
1179*f2a19305Safresh1     else { /* Not prepending any part of the new range to the existing list.
1180*f2a19305Safresh1             * Find where in the list it should go.  This finds i_s, such that:
1181*f2a19305Safresh1             *     invlist[i_s] <= start < array[i_s+1]
1182*f2a19305Safresh1             */
1183*f2a19305Safresh1         i_s = _invlist_search(invlist, start);
1184*f2a19305Safresh1     }
1185*f2a19305Safresh1 
1186*f2a19305Safresh1     /* At this point, any extending before the beginning of the inversion list
1187*f2a19305Safresh1      * and/or after the end has been done.  This has made it so that, in the
1188*f2a19305Safresh1      * code below, each endpoint of the new range is either in a range that is
1189*f2a19305Safresh1      * in the set, or is in a gap between two ranges that are.  This means we
1190*f2a19305Safresh1      * don't have to worry about exceeding the array bounds.
1191*f2a19305Safresh1      *
1192*f2a19305Safresh1      * Find where in the list the new range ends (but we can skip this if we
1193*f2a19305Safresh1      * have already determined what it is, or if it will be the same as i_s,
1194*f2a19305Safresh1      * which we already have computed) */
1195*f2a19305Safresh1     if (i_e == 0) {
1196*f2a19305Safresh1         i_e = (start == end)
1197*f2a19305Safresh1               ? i_s
1198*f2a19305Safresh1               : _invlist_search(invlist, end);
1199*f2a19305Safresh1     }
1200*f2a19305Safresh1 
1201*f2a19305Safresh1     /* Here generally invlist[i_e] <= end < array[i_e+1].  But if invlist[i_e]
1202*f2a19305Safresh1      * is a range that goes to infinity there is no element at invlist[i_e+1],
1203*f2a19305Safresh1      * so only the first relation holds. */
1204*f2a19305Safresh1 
1205*f2a19305Safresh1     if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
1206*f2a19305Safresh1 
1207*f2a19305Safresh1         /* Here, the ranges on either side of the beginning of the new range
1208*f2a19305Safresh1          * are in the set, and this range starts in the gap between them.
1209*f2a19305Safresh1          *
1210*f2a19305Safresh1          * The new range extends the range above it downwards if the new range
1211*f2a19305Safresh1          * ends at or above that range's start */
1212*f2a19305Safresh1         const bool extends_the_range_above = (   end == UV_MAX
1213*f2a19305Safresh1                                               || end + 1 >= array[i_s+1]);
1214*f2a19305Safresh1 
1215*f2a19305Safresh1         /* The new range extends the range below it upwards if it begins just
1216*f2a19305Safresh1          * after where that range ends */
1217*f2a19305Safresh1         if (start == array[i_s]) {
1218*f2a19305Safresh1 
1219*f2a19305Safresh1             /* If the new range fills the entire gap between the other ranges,
1220*f2a19305Safresh1              * they will get merged together.  Other ranges may also get
1221*f2a19305Safresh1              * merged, depending on how many of them the new range spans.  In
1222*f2a19305Safresh1              * the general case, we do the merge later, just once, after we
1223*f2a19305Safresh1              * figure out how many to merge.  But in the case where the new
1224*f2a19305Safresh1              * range exactly spans just this one gap (possibly extending into
1225*f2a19305Safresh1              * the one above), we do the merge here, and an early exit.  This
1226*f2a19305Safresh1              * is done here to avoid having to special case later. */
1227*f2a19305Safresh1             if (i_e - i_s <= 1) {
1228*f2a19305Safresh1 
1229*f2a19305Safresh1                 /* If i_e - i_s == 1, it means that the new range terminates
1230*f2a19305Safresh1                  * within the range above, and hence 'extends_the_range_above'
1231*f2a19305Safresh1                  * must be true.  (If the range above it extends to infinity,
1232*f2a19305Safresh1                  * 'i_s+2' will be above the array's limit, but 'len-i_s-2'
1233*f2a19305Safresh1                  * will be 0, so no harm done.) */
1234*f2a19305Safresh1                 if (extends_the_range_above) {
1235*f2a19305Safresh1                     Move(array + i_s + 2, array + i_s, len - i_s - 2, UV);
1236*f2a19305Safresh1                     invlist_set_len(invlist,
1237*f2a19305Safresh1                                     len - 2,
1238*f2a19305Safresh1                                     *(get_invlist_offset_addr(invlist)));
1239*f2a19305Safresh1                     return invlist;
1240*f2a19305Safresh1                 }
1241*f2a19305Safresh1 
1242*f2a19305Safresh1                 /* Here, i_e must == i_s.  We keep them in sync, as they apply
1243*f2a19305Safresh1                  * to the same range, and below we are about to decrement i_s
1244*f2a19305Safresh1                  * */
1245*f2a19305Safresh1                 i_e--;
1246*f2a19305Safresh1             }
1247*f2a19305Safresh1 
1248*f2a19305Safresh1             /* Here, the new range is adjacent to the one below.  (It may also
1249*f2a19305Safresh1              * span beyond the range above, but that will get resolved later.)
1250*f2a19305Safresh1              * Extend the range below to include this one. */
1251*f2a19305Safresh1             array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1;
1252*f2a19305Safresh1             i_s--;
1253*f2a19305Safresh1             start = array[i_s];
1254*f2a19305Safresh1         }
1255*f2a19305Safresh1         else if (extends_the_range_above) {
1256*f2a19305Safresh1 
1257*f2a19305Safresh1             /* Here the new range only extends the range above it, but not the
1258*f2a19305Safresh1              * one below.  It merges with the one above.  Again, we keep i_e
1259*f2a19305Safresh1              * and i_s in sync if they point to the same range */
1260*f2a19305Safresh1             if (i_e == i_s) {
1261*f2a19305Safresh1                 i_e++;
1262*f2a19305Safresh1             }
1263*f2a19305Safresh1             i_s++;
1264*f2a19305Safresh1             array[i_s] = start;
1265*f2a19305Safresh1         }
1266*f2a19305Safresh1     }
1267*f2a19305Safresh1 
1268*f2a19305Safresh1     /* Here, we've dealt with the new range start extending any adjoining
1269*f2a19305Safresh1      * existing ranges.
1270*f2a19305Safresh1      *
1271*f2a19305Safresh1      * If the new range extends to infinity, it is now the final one,
1272*f2a19305Safresh1      * regardless of what was there before */
1273*f2a19305Safresh1     if (UNLIKELY(end == UV_MAX)) {
1274*f2a19305Safresh1         invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist)));
1275*f2a19305Safresh1         return invlist;
1276*f2a19305Safresh1     }
1277*f2a19305Safresh1 
1278*f2a19305Safresh1     /* If i_e started as == i_s, it has also been dealt with,
1279*f2a19305Safresh1      * and been updated to the new i_s, which will fail the following if */
1280*f2a19305Safresh1     if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) {
1281*f2a19305Safresh1 
1282*f2a19305Safresh1         /* Here, the ranges on either side of the end of the new range are in
1283*f2a19305Safresh1          * the set, and this range ends in the gap between them.
1284*f2a19305Safresh1          *
1285*f2a19305Safresh1          * If this range is adjacent to (hence extends) the range above it, it
1286*f2a19305Safresh1          * becomes part of that range; likewise if it extends the range below,
1287*f2a19305Safresh1          * it becomes part of that range */
1288*f2a19305Safresh1         if (end + 1 == array[i_e+1]) {
1289*f2a19305Safresh1             i_e++;
1290*f2a19305Safresh1             array[i_e] = start;
1291*f2a19305Safresh1         }
1292*f2a19305Safresh1         else if (start <= array[i_e]) {
1293*f2a19305Safresh1             array[i_e] = end + 1;
1294*f2a19305Safresh1             i_e--;
1295*f2a19305Safresh1         }
1296*f2a19305Safresh1     }
1297*f2a19305Safresh1 
1298*f2a19305Safresh1     if (i_s == i_e) {
1299*f2a19305Safresh1 
1300*f2a19305Safresh1         /* If the range fits entirely in an existing range (as possibly already
1301*f2a19305Safresh1          * extended above), it doesn't add anything new */
1302*f2a19305Safresh1         if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) {
1303*f2a19305Safresh1             return invlist;
1304*f2a19305Safresh1         }
1305*f2a19305Safresh1 
1306*f2a19305Safresh1         /* Here, no part of the range is in the list.  Must add it.  It will
1307*f2a19305Safresh1          * occupy 2 more slots */
1308*f2a19305Safresh1       splice_in_new_range:
1309*f2a19305Safresh1 
1310*f2a19305Safresh1         invlist_extend(invlist, len + 2);
1311*f2a19305Safresh1         array = invlist_array(invlist);
1312*f2a19305Safresh1         /* Move the rest of the array down two slots. Don't include any
1313*f2a19305Safresh1          * trailing NUL */
1314*f2a19305Safresh1         Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV);
1315*f2a19305Safresh1 
1316*f2a19305Safresh1         /* Do the actual splice */
1317*f2a19305Safresh1         array[i_e+1] = start;
1318*f2a19305Safresh1         array[i_e+2] = end + 1;
1319*f2a19305Safresh1         invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist)));
1320*f2a19305Safresh1         return invlist;
1321*f2a19305Safresh1     }
1322*f2a19305Safresh1 
1323*f2a19305Safresh1     /* Here the new range crossed the boundaries of a pre-existing range.  The
1324*f2a19305Safresh1      * code above has adjusted things so that both ends are in ranges that are
1325*f2a19305Safresh1      * in the set.  This means everything in between must also be in the set.
1326*f2a19305Safresh1      * Just squash things together */
1327*f2a19305Safresh1     Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV);
1328*f2a19305Safresh1     invlist_set_len(invlist,
1329*f2a19305Safresh1                     len - i_e + i_s,
1330*f2a19305Safresh1                     *(get_invlist_offset_addr(invlist)));
1331*f2a19305Safresh1 
1332*f2a19305Safresh1     return invlist;
1333*f2a19305Safresh1 }
1334*f2a19305Safresh1 
1335*f2a19305Safresh1 SV*
Perl__setup_canned_invlist(pTHX_ const STRLEN size,const UV element0,UV ** other_elements_ptr)1336*f2a19305Safresh1 Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0,
1337*f2a19305Safresh1                                  UV** other_elements_ptr)
1338*f2a19305Safresh1 {
1339*f2a19305Safresh1     /* Create and return an inversion list whose contents are to be populated
1340*f2a19305Safresh1      * by the caller.  The caller gives the number of elements (in 'size') and
1341*f2a19305Safresh1      * the very first element ('element0').  This function will set
1342*f2a19305Safresh1      * '*other_elements_ptr' to an array of UVs, where the remaining elements
1343*f2a19305Safresh1      * are to be placed.
1344*f2a19305Safresh1      *
1345*f2a19305Safresh1      * Obviously there is some trust involved that the caller will properly
1346*f2a19305Safresh1      * fill in the other elements of the array.
1347*f2a19305Safresh1      *
1348*f2a19305Safresh1      * (The first element needs to be passed in, as the underlying code does
1349*f2a19305Safresh1      * things differently depending on whether it is zero or non-zero) */
1350*f2a19305Safresh1 
1351*f2a19305Safresh1     SV* invlist = _new_invlist(size);
1352*f2a19305Safresh1     bool offset;
1353*f2a19305Safresh1 
1354*f2a19305Safresh1     PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST;
1355*f2a19305Safresh1 
1356*f2a19305Safresh1     invlist = add_cp_to_invlist(invlist, element0);
1357*f2a19305Safresh1     offset = *get_invlist_offset_addr(invlist);
1358*f2a19305Safresh1 
1359*f2a19305Safresh1     invlist_set_len(invlist, size, offset);
1360*f2a19305Safresh1     *other_elements_ptr = invlist_array(invlist) + 1;
1361*f2a19305Safresh1     return invlist;
1362*f2a19305Safresh1 }
1363*f2a19305Safresh1 
1364*f2a19305Safresh1 #endif
1365*f2a19305Safresh1 
1366*f2a19305Safresh1 #ifndef PERL_IN_XSUB_RE
1367*f2a19305Safresh1 void
Perl__invlist_invert(pTHX_ SV * const invlist)1368*f2a19305Safresh1 Perl__invlist_invert(pTHX_ SV* const invlist)
1369*f2a19305Safresh1 {
1370*f2a19305Safresh1     /* Complement the input inversion list.  This adds a 0 if the list didn't
1371*f2a19305Safresh1      * have a zero; removes it otherwise.  As described above, the data
1372*f2a19305Safresh1      * structure is set up so that this is very efficient */
1373*f2a19305Safresh1 
1374*f2a19305Safresh1     PERL_ARGS_ASSERT__INVLIST_INVERT;
1375*f2a19305Safresh1 
1376*f2a19305Safresh1     assert(! invlist_is_iterating(invlist));
1377*f2a19305Safresh1 
1378*f2a19305Safresh1     /* The inverse of matching nothing is matching everything */
1379*f2a19305Safresh1     if (_invlist_len(invlist) == 0) {
1380*f2a19305Safresh1         _append_range_to_invlist(invlist, 0, UV_MAX);
1381*f2a19305Safresh1         return;
1382*f2a19305Safresh1     }
1383*f2a19305Safresh1 
1384*f2a19305Safresh1     *get_invlist_offset_addr(invlist) = ! *get_invlist_offset_addr(invlist);
1385*f2a19305Safresh1 }
1386*f2a19305Safresh1 
1387*f2a19305Safresh1 SV*
Perl_invlist_clone(pTHX_ SV * const invlist,SV * new_invlist)1388*f2a19305Safresh1 Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist)
1389*f2a19305Safresh1 {
1390*f2a19305Safresh1     /* Return a new inversion list that is a copy of the input one, which is
1391*f2a19305Safresh1      * unchanged.  The new list will not be mortal even if the old one was. */
1392*f2a19305Safresh1 
1393*f2a19305Safresh1     const STRLEN nominal_length = _invlist_len(invlist);
1394*f2a19305Safresh1     const STRLEN physical_length = SvCUR(invlist);
1395*f2a19305Safresh1     const bool offset = *(get_invlist_offset_addr(invlist));
1396*f2a19305Safresh1 
1397*f2a19305Safresh1     PERL_ARGS_ASSERT_INVLIST_CLONE;
1398*f2a19305Safresh1 
1399*f2a19305Safresh1     if (new_invlist == NULL) {
1400*f2a19305Safresh1         new_invlist = _new_invlist(nominal_length);
1401*f2a19305Safresh1     }
1402*f2a19305Safresh1     else {
1403*f2a19305Safresh1         sv_upgrade(new_invlist, SVt_INVLIST);
1404*f2a19305Safresh1         initialize_invlist_guts(new_invlist, nominal_length);
1405*f2a19305Safresh1     }
1406*f2a19305Safresh1 
1407*f2a19305Safresh1     *(get_invlist_offset_addr(new_invlist)) = offset;
1408*f2a19305Safresh1     invlist_set_len(new_invlist, nominal_length, offset);
1409*f2a19305Safresh1     Copy(SvPVX(invlist), SvPVX(new_invlist), physical_length, char);
1410*f2a19305Safresh1 
1411*f2a19305Safresh1     return new_invlist;
1412*f2a19305Safresh1 }
1413*f2a19305Safresh1 
1414*f2a19305Safresh1 #endif
1415*f2a19305Safresh1 
1416*f2a19305Safresh1 
1417*f2a19305Safresh1 #ifndef PERL_IN_XSUB_RE
1418*f2a19305Safresh1 void
Perl__invlist_dump(pTHX_ PerlIO * file,I32 level,const char * const indent,SV * const invlist)1419*f2a19305Safresh1 Perl__invlist_dump(pTHX_ PerlIO *file, I32 level,
1420*f2a19305Safresh1                          const char * const indent, SV* const invlist)
1421*f2a19305Safresh1 {
1422*f2a19305Safresh1     /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
1423*f2a19305Safresh1      * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
1424*f2a19305Safresh1      * the string 'indent'.  The output looks like this:
1425*f2a19305Safresh1          [0] 0x000A .. 0x000D
1426*f2a19305Safresh1          [2] 0x0085
1427*f2a19305Safresh1          [4] 0x2028 .. 0x2029
1428*f2a19305Safresh1          [6] 0x3104 .. INFTY
1429*f2a19305Safresh1      * This means that the first range of code points matched by the list are
1430*f2a19305Safresh1      * 0xA through 0xD; the second range contains only the single code point
1431*f2a19305Safresh1      * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
1432*f2a19305Safresh1      * are used to define each range (except if the final range extends to
1433*f2a19305Safresh1      * infinity, only a single element is needed).  The array index of the
1434*f2a19305Safresh1      * first element for the corresponding range is given in brackets. */
1435*f2a19305Safresh1 
1436*f2a19305Safresh1     UV start, end;
1437*f2a19305Safresh1     STRLEN count = 0;
1438*f2a19305Safresh1 
1439*f2a19305Safresh1     PERL_ARGS_ASSERT__INVLIST_DUMP;
1440*f2a19305Safresh1 
1441*f2a19305Safresh1     if (invlist_is_iterating(invlist)) {
1442*f2a19305Safresh1         Perl_dump_indent(aTHX_ level, file,
1443*f2a19305Safresh1              "%sCan't dump inversion list because is in middle of iterating\n",
1444*f2a19305Safresh1              indent);
1445*f2a19305Safresh1         return;
1446*f2a19305Safresh1     }
1447*f2a19305Safresh1 
1448*f2a19305Safresh1     invlist_iterinit(invlist);
1449*f2a19305Safresh1     while (invlist_iternext(invlist, &start, &end)) {
1450*f2a19305Safresh1         if (end == UV_MAX) {
1451*f2a19305Safresh1             Perl_dump_indent(aTHX_ level, file,
1452*f2a19305Safresh1                                        "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
1453*f2a19305Safresh1                                    indent, (UV)count, start);
1454*f2a19305Safresh1         }
1455*f2a19305Safresh1         else if (end != start) {
1456*f2a19305Safresh1             Perl_dump_indent(aTHX_ level, file,
1457*f2a19305Safresh1                                     "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n",
1458*f2a19305Safresh1                                 indent, (UV)count, start,         end);
1459*f2a19305Safresh1         }
1460*f2a19305Safresh1         else {
1461*f2a19305Safresh1             Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n",
1462*f2a19305Safresh1                                             indent, (UV)count, start);
1463*f2a19305Safresh1         }
1464*f2a19305Safresh1         count += 2;
1465*f2a19305Safresh1     }
1466*f2a19305Safresh1 }
1467*f2a19305Safresh1 
1468*f2a19305Safresh1 #endif
1469*f2a19305Safresh1 
1470*f2a19305Safresh1 #if defined(PERL_ARGS_ASSERT__INVLISTEQ) && !defined(PERL_IN_XSUB_RE)
1471*f2a19305Safresh1 bool
Perl__invlistEQ(pTHX_ SV * const a,SV * const b,const bool complement_b)1472*f2a19305Safresh1 Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b)
1473*f2a19305Safresh1 {
1474*f2a19305Safresh1     /* Return a boolean as to if the two passed in inversion lists are
1475*f2a19305Safresh1      * identical.  The final argument, if TRUE, says to take the complement of
1476*f2a19305Safresh1      * the second inversion list before doing the comparison */
1477*f2a19305Safresh1 
1478*f2a19305Safresh1     const UV len_a = _invlist_len(a);
1479*f2a19305Safresh1     UV len_b = _invlist_len(b);
1480*f2a19305Safresh1 
1481*f2a19305Safresh1     const UV* array_a = NULL;
1482*f2a19305Safresh1     const UV* array_b = NULL;
1483*f2a19305Safresh1 
1484*f2a19305Safresh1     PERL_ARGS_ASSERT__INVLISTEQ;
1485*f2a19305Safresh1 
1486*f2a19305Safresh1     /* This code avoids accessing the arrays unless it knows the length is
1487*f2a19305Safresh1      * non-zero */
1488*f2a19305Safresh1 
1489*f2a19305Safresh1     if (len_a == 0) {
1490*f2a19305Safresh1         if (len_b == 0) {
1491*f2a19305Safresh1             return ! complement_b;
1492*f2a19305Safresh1         }
1493*f2a19305Safresh1     }
1494*f2a19305Safresh1     else {
1495*f2a19305Safresh1         array_a = invlist_array(a);
1496*f2a19305Safresh1     }
1497*f2a19305Safresh1 
1498*f2a19305Safresh1     if (len_b != 0) {
1499*f2a19305Safresh1         array_b = invlist_array(b);
1500*f2a19305Safresh1     }
1501*f2a19305Safresh1 
1502*f2a19305Safresh1     /* If are to compare 'a' with the complement of b, set it
1503*f2a19305Safresh1      * up so are looking at b's complement. */
1504*f2a19305Safresh1     if (complement_b) {
1505*f2a19305Safresh1 
1506*f2a19305Safresh1         /* The complement of nothing is everything, so <a> would have to have
1507*f2a19305Safresh1          * just one element, starting at zero (ending at infinity) */
1508*f2a19305Safresh1         if (len_b == 0) {
1509*f2a19305Safresh1             return (len_a == 1 && array_a[0] == 0);
1510*f2a19305Safresh1         }
1511*f2a19305Safresh1         if (array_b[0] == 0) {
1512*f2a19305Safresh1 
1513*f2a19305Safresh1             /* Otherwise, to complement, we invert.  Here, the first element is
1514*f2a19305Safresh1              * 0, just remove it.  To do this, we just pretend the array starts
1515*f2a19305Safresh1              * one later */
1516*f2a19305Safresh1 
1517*f2a19305Safresh1             array_b++;
1518*f2a19305Safresh1             len_b--;
1519*f2a19305Safresh1         }
1520*f2a19305Safresh1         else {
1521*f2a19305Safresh1 
1522*f2a19305Safresh1             /* But if the first element is not zero, we pretend the list starts
1523*f2a19305Safresh1              * at the 0 that is always stored immediately before the array. */
1524*f2a19305Safresh1             array_b--;
1525*f2a19305Safresh1             len_b++;
1526*f2a19305Safresh1         }
1527*f2a19305Safresh1     }
1528*f2a19305Safresh1 
1529*f2a19305Safresh1     return    len_a == len_b
1530*f2a19305Safresh1            && memEQ(array_a, array_b, len_a * sizeof(array_a[0]));
1531*f2a19305Safresh1 
1532*f2a19305Safresh1 }
1533*f2a19305Safresh1 #endif
1534*f2a19305Safresh1 
1535*f2a19305Safresh1 #undef HEADER_LENGTH
1536*f2a19305Safresh1 #undef TO_INTERNAL_SIZE
1537*f2a19305Safresh1 #undef FROM_INTERNAL_SIZE
1538*f2a19305Safresh1 #undef INVLIST_VERSION_ID
1539*f2a19305Safresh1 
1540*f2a19305Safresh1 /* End of inversion list object */
1541