1 /*
2  * Amalgamated copy of CRoaring 0.2.66, modified for GTK to reduce compiler
3  * warnings.
4  *
5  * Copyright 2016-2020 The CRoaring authors
6  * Copyright 2020 Benjamin Otte
7  *
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *    http://www.apache.org/licenses/LICENSE-2.0
13  *
14  * Unless required by applicable law or agreed to in writing, software
15  * distributed under the License is distributed on an "AS IS" BASIS,
16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  * See the License for the specific language governing permissions and
18  * limitations under the License.
19  *
20  * SPDX-License-Identifier: Apache-2.0
21  */
22 
23 #include "roaring.h"
24 
25 /* used for http://dmalloc.com/ Dmalloc - Debug Malloc Library */
26 #ifdef DMALLOC
27 #include "dmalloc.h"
28 #endif
29 
30 /* begin file src/array_util.c */
31 #include <assert.h>
32 #include <stdbool.h>
33 #include <stdint.h>
34 #include <stdio.h>
35 #include <stdlib.h>
36 #include <string.h>
37 
38 
39 #ifdef USESSE4
// Lookup table used by intersect_vector16.
//
// Layout: 256 entries of 16 bytes each (4096 bytes in total). Entry r is read
// as `(const __m128i *)shuffle_mask16 + r`, where r is the 8-bit match mask
// extracted from the string-compare result, and is then passed to
// _mm_shuffle_epi8 as the shuffle control.
//
// For every bit k set in r (lowest first) the entry lists the byte pair
// 2k, 2k+1, i.e. the two bytes of 16-bit lane k, so the selected lanes are
// packed at the front of the shuffled vector. The remaining control bytes are
// 0xFF; _mm_shuffle_epi8 writes zero for control bytes with the high bit set.
//
// The table is read with _mm_load_si128, which requires 16-byte alignment.
// NOTE(review): ALIGNED is defined outside this chunk; presumably it requests
// 0x1000-byte alignment for the array -- confirm against its definition.
ALIGNED(0x1000)
static const uint8_t shuffle_mask16[] = {
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    4,    5,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
    6,    7,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    6,    7,    0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    4,    5,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    4,    5,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    6,    7,    8,    9,    0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
    8,    9,    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 10,   11,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    4,    5,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    10,   11,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    10,   11,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    4,    5,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    10,   11,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    4,    5,    6,    7,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    10,   11,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    10,   11,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,
    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    10,   11,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    4,    5,    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
    6,    7,    8,    9,    10,   11,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,
    0xFF, 0xFF, 0xFF, 0xFF, 12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    12,   13,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    12,   13,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    4,    5,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    4,    5,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    6,    7,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    12,   13,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,    12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    4,    5,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    8,    9,    12,   13,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    8,    9,    12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,    12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    8,    9,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    4,    5,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    8,    9,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    4,    5,    6,    7,    8,    9,    12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    10,   11,   12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    10,   11,   12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    10,   11,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    10,   11,   12,   13,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    4,    5,    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    10,   11,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
    6,    7,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    6,    7,    10,   11,   12,   13,
    0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    10,   11,   12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    10,   11,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    4,    5,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    10,   11,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    4,    5,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    6,    7,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    10,   11,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    6,    7,    8,    9,    10,   11,   12,   13,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    10,   11,
    12,   13,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    6,    7,    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
    8,    9,    10,   11,   12,   13,   0xFF, 0xFF, 14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    4,    5,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    4,    5,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    4,    5,    6,    7,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    4,    5,    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
    6,    7,    8,    9,    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    10,   11,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    10,   11,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    10,   11,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    4,    5,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    10,   11,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    4,    5,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    6,    7,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    10,   11,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    6,    7,    10,   11,   14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    10,   11,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    6,    7,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    6,    7,    10,   11,   14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    10,   11,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,    10,   11,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    4,    5,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    8,    9,    10,   11,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    8,    9,    10,   11,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    8,    9,    10,   11,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    6,    7,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    8,    9,
    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
    8,    9,    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    4,    5,    6,    7,    8,    9,    10,   11,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    8,    9,
    10,   11,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    4,    5,    6,    7,    8,    9,    10,   11,   14,   15,   0xFF, 0xFF,
    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    12,   13,   14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    12,   13,   14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    4,    5,    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
    6,    7,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    6,    7,    12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 8,    9,    12,   13,   14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,
    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    8,    9,    12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,
    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    4,    5,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    8,    9,    12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    4,    5,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    6,    7,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,    8,    9,    12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,
    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    6,    7,    8,    9,    12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,    8,    9,    12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    6,    7,    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    6,    7,    8,    9,    12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    6,    7,
    8,    9,    12,   13,   14,   15,   0xFF, 0xFF, 10,   11,   12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    10,   11,   12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    4,    5,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    10,   11,   12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,
    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    10,   11,   12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 6,    7,    10,   11,   12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    6,    7,
    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    6,    7,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    6,    7,    10,   11,
    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    6,    7,
    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    4,    5,    6,    7,    10,   11,   12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    4,    5,    6,    7,    10,   11,
    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    4,    5,    6,    7,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    8,    9,    10,   11,   12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    8,    9,
    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    2,    3,    8,    9,    10,   11,   12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 4,    5,    8,    9,    10,   11,   12,   13,
    14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,
    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF,
    2,    3,    4,    5,    8,    9,    10,   11,   12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,    4,    5,    8,    9,
    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 6,    7,    8,    9,
    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
    0,    1,    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 2,    3,    6,    7,    8,    9,    10,   11,
    12,   13,   14,   15,   0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    2,    3,
    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
    4,    5,    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,
    0xFF, 0xFF, 0xFF, 0xFF, 0,    1,    4,    5,    6,    7,    8,    9,
    10,   11,   12,   13,   14,   15,   0xFF, 0xFF, 2,    3,    4,    5,
    6,    7,    8,    9,    10,   11,   12,   13,   14,   15,   0xFF, 0xFF,
    0,    1,    2,    3,    4,    5,    6,    7,    8,    9,    10,   11,
    12,   13,   14,   15};
385 
386 /**
387  * From Schlegel et al., Fast Sorted-Set Intersection using SIMD Instructions
388  * Optimized by D. Lemire on May 3rd 2013
389  */
intersect_vector16(const uint16_t * __restrict__ A,size_t s_a,const uint16_t * __restrict__ B,size_t s_b,uint16_t * C)390 int32_t intersect_vector16(const uint16_t *__restrict__ A, size_t s_a,
391                            const uint16_t *__restrict__ B, size_t s_b,
392                            uint16_t *C) {
393     size_t count = 0;
394     size_t i_a = 0, i_b = 0;
395     const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);
396     const size_t st_a = (s_a / vectorlength) * vectorlength;
397     const size_t st_b = (s_b / vectorlength) * vectorlength;
398     __m128i v_a, v_b;
399     if ((i_a < st_a) && (i_b < st_b)) {
400         v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
401         v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
402         while ((A[i_a] == 0) || (B[i_b] == 0)) {
403             const __m128i res_v = _mm_cmpestrm(
404                 v_b, vectorlength, v_a, vectorlength,
405                 _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
406             const int r = _mm_extract_epi32(res_v, 0);
407             __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 + r);
408             __m128i p = _mm_shuffle_epi8(v_a, sm16);
409             _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
410             count += _mm_popcnt_u32(r);
411             const uint16_t a_max = A[i_a + vectorlength - 1];
412             const uint16_t b_max = B[i_b + vectorlength - 1];
413             if (a_max <= b_max) {
414                 i_a += vectorlength;
415                 if (i_a == st_a) break;
416                 v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
417             }
418             if (b_max <= a_max) {
419                 i_b += vectorlength;
420                 if (i_b == st_b) break;
421                 v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
422             }
423         }
424         if ((i_a < st_a) && (i_b < st_b))
425             while (true) {
426                 const __m128i res_v = _mm_cmpistrm(
427                     v_b, v_a,
428                     _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
429                 const int r = _mm_extract_epi32(res_v, 0);
430                 __m128i sm16 =
431                     _mm_load_si128((const __m128i *)shuffle_mask16 + r);
432                 __m128i p = _mm_shuffle_epi8(v_a, sm16);
433                 _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
434                 count += _mm_popcnt_u32(r);
435                 const uint16_t a_max = A[i_a + vectorlength - 1];
436                 const uint16_t b_max = B[i_b + vectorlength - 1];
437                 if (a_max <= b_max) {
438                     i_a += vectorlength;
439                     if (i_a == st_a) break;
440                     v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
441                 }
442                 if (b_max <= a_max) {
443                     i_b += vectorlength;
444                     if (i_b == st_b) break;
445                     v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
446                 }
447             }
448     }
449     // intersect the tail using scalar intersection
450     while (i_a < s_a && i_b < s_b) {
451         uint16_t a = A[i_a];
452         uint16_t b = B[i_b];
453         if (a < b) {
454             i_a++;
455         } else if (b < a) {
456             i_b++;
457         } else {
458             C[count] = a;  //==b;
459             count++;
460             i_a++;
461             i_b++;
462         }
463     }
464     return (int32_t)count;
465 }
466 
/**
 * Counts the values common to A (s_a elements) and B (s_b elements) without
 * writing the intersection anywhere.
 *
 * Whole blocks of 8 x uint16_t are compared with the SSE4.2 string-compare
 * intrinsics; the leftovers are handled by a scalar merge. Both the
 * block-advance rule and the scalar merge compare neighbouring values with
 * < / <=, so the inputs are presumably sorted ascending and duplicate-free
 * (NOTE(review): confirm against callers).
 *
 * Returns the number of matches, narrowed from size_t to int32_t.
 */
int32_t intersect_vector16_cardinality(const uint16_t *__restrict__ A,
                                       size_t s_a,
                                       const uint16_t *__restrict__ B,
                                       size_t s_b) {
    size_t count = 0;
    size_t i_a = 0, i_b = 0;
    // number of 16-bit lanes in one 128-bit register
    const int vectorlength = sizeof(__m128i) / sizeof(uint16_t);
    // st_a / st_b: lengths rounded down to a whole number of vectors
    const size_t st_a = (s_a / vectorlength) * vectorlength;
    const size_t st_b = (s_b / vectorlength) * vectorlength;
    __m128i v_a, v_b;
    if ((i_a < st_a) && (i_b < st_b)) {
        v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
        v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
        // While the current block of A or B starts with the value 0, use the
        // explicit-length compare: the implicit-length _mm_cmpistrm used
        // further down treats a zero lane as the end of its input.
        while ((A[i_a] == 0) || (B[i_b] == 0)) {
            const __m128i res_v = _mm_cmpestrm(
                v_b, vectorlength, v_a, vectorlength,
                _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
            // low bits of r: one bit per lane of v_a that also occurs in v_b
            const int r = _mm_extract_epi32(res_v, 0);
            count += _mm_popcnt_u32(r);
            // the last lane of each block decides which side(s) to advance
            const uint16_t a_max = A[i_a + vectorlength - 1];
            const uint16_t b_max = B[i_b + vectorlength - 1];
            if (a_max <= b_max) {
                i_a += vectorlength;
                if (i_a == st_a) break;
                v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
            }
            if (b_max <= a_max) {
                i_b += vectorlength;
                if (i_b == st_b) break;
                v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
            }
        }
        // Same loop as above, but with the implicit-length compare. It is
        // entered only if neither side ran out of full blocks above.
        if ((i_a < st_a) && (i_b < st_b))
            while (true) {
                const __m128i res_v = _mm_cmpistrm(
                    v_b, v_a,
                    _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
                const int r = _mm_extract_epi32(res_v, 0);
                count += _mm_popcnt_u32(r);
                const uint16_t a_max = A[i_a + vectorlength - 1];
                const uint16_t b_max = B[i_b + vectorlength - 1];
                if (a_max <= b_max) {
                    i_a += vectorlength;
                    if (i_a == st_a) break;
                    v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
                }
                if (b_max <= a_max) {
                    i_b += vectorlength;
                    if (i_b == st_b) break;
                    v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
                }
            }
    }
    // intersect the tail using scalar intersection; this also covers inputs
    // too short to contain a single full vector
    while (i_a < s_a && i_b < s_b) {
        uint16_t a = A[i_a];
        uint16_t b = B[i_b];
        if (a < b) {
            i_a++;
        } else if (b < a) {
            i_b++;
        } else {
            count++;
            i_a++;
            i_b++;
        }
    }
    return (int32_t)count;
}
536 
/////////
// Warning:
// This function may not be safe if A == C or B == C.
/////////
/**
 * Writes to C the values of A (s_a elements) that do not occur in B
 * (s_b elements) and returns how many values were written.
 *
 * Full blocks of 8 x uint16_t are processed with SSE4.2 string compares and
 * a shuffle through the shuffle_mask16 table; the remainder is handled by a
 * scalar merge. The vector stores always write 16 bytes, so C needs slack
 * beyond the returned count (see the "can overflow" notes).
 * The comparisons assume ascending input; the comment inside the body states
 * the arrays consist of increasing values.
 */
int32_t difference_vector16(const uint16_t *__restrict__ A, size_t s_a,
                            const uint16_t *__restrict__ B, size_t s_b,
                            uint16_t *C) {
    // we handle the degenerate case
    if (s_a == 0) return 0;
    if (s_b == 0) {
        // nothing to remove: the result is A itself (copy skipped in place)
        if (A != C) memcpy(C, A, sizeof(uint16_t) * s_a);
        return (int32_t)s_a;
    }
    // handle the leading zeroes, it is messy but it allows us to use the fast
    // _mm_cmpistrm intrinsic safely
    int32_t count = 0;
    if ((A[0] == 0) || (B[0] == 0)) {
        if ((A[0] == 0) && (B[0] == 0)) {
            // 0 is in both sets: drop it from both, emit nothing
            A++;
            s_a--;
            B++;
            s_b--;
        } else if (A[0] == 0) {
            // 0 is only in A: it belongs to the difference
            C[count++] = 0;
            A++;
            s_a--;
        } else {
            // 0 is only in B: irrelevant for the result
            B++;
            s_b--;
        }
    }
    // at this point, we have two non-empty arrays, made of non-zero
    // increasing values.
    // NOTE(review): s_a or s_b can have dropped to 0 just above; the code
    // below still copes because every loop is guarded by the lengths.
    size_t i_a = 0, i_b = 0;
    const size_t vectorlength = sizeof(__m128i) / sizeof(uint16_t);
    // lengths rounded down to a whole number of vectors
    const size_t st_a = (s_a / vectorlength) * vectorlength;
    const size_t st_b = (s_b / vectorlength) * vectorlength;
    if ((i_a < st_a) && (i_b < st_b)) {  // this is the vectorized code path
        __m128i v_a, v_b;
        // we load a vector from A and a vector from B
        v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
        v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
        // we have a runningmask which indicates which values from A have been
        // spotted in B, these don't get written out.
        __m128i runningmask_a_found_in_b = _mm_setzero_si128();
        /****
        * start of the main vectorized loop
        *****/
        while (true) {
            // a_found_in_b holds one bit per lane of v_a, set when that value
            // also occurs in the current v_b
            const __m128i a_found_in_b =
                _mm_cmpistrm(v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY |
                                           _SIDD_BIT_MASK);
            // accumulate: the same v_a may be compared against several v_b
            runningmask_a_found_in_b =
                _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
            // we always compare the last values of A and B
            const uint16_t a_max = A[i_a + vectorlength - 1];
            const uint16_t b_max = B[i_b + vectorlength - 1];
            if (a_max <= b_max) {
                // Ok. In this code path, we are ready to write our v_a
                // because there is no need to read more from B, they will
                // all be large values.
                // Flip the low 8 bits: set bits now mark lanes NOT found in B.
                const int bitmask_belongs_to_difference =
                    _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;
                /*** next few lines are probably expensive *****/
                // The 8-bit mask selects a 16-byte shuffle pattern that packs
                // the surviving lanes to the front of the register.
                __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 +
                                              bitmask_belongs_to_difference);
                __m128i p = _mm_shuffle_epi8(v_a, sm16);
                // Always stores 16 bytes, only popcount(mask) lanes are kept.
                _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
                count += _mm_popcnt_u32(bitmask_belongs_to_difference);
                // we advance a
                i_a += vectorlength;
                if (i_a == st_a)  // no more
                    break;
                // fresh block of A: nothing has been matched against it yet
                runningmask_a_found_in_b = _mm_setzero_si128();
                v_a = _mm_lddqu_si128((__m128i *)&A[i_a]);
            }
            if (b_max <= a_max) {
                // in this code path, the current v_b has become useless
                i_b += vectorlength;
                if (i_b == st_b) break;
                v_b = _mm_lddqu_si128((__m128i *)&B[i_b]);
            }
        }
        // at this point, either we have i_a == st_a, which is the end of the
        // vectorized processing,
        // or we have i_b == st_b,  and we are not done processing the vector...
        // so we need to finish it off.
        if (i_a < st_a) {        // we have unfinished business...
            // Compare the pending v_a against what is left of B (fewer than
            // 8 values since i_b == st_b), zero-padded to a full vector; the
            // zero lanes end the implicit-length compare input.
            uint16_t buffer[8];  // buffer to do a masked load
            memset(buffer, 0, 8 * sizeof(uint16_t));
            memcpy(buffer, B + i_b, (s_b - i_b) * sizeof(uint16_t));
            v_b = _mm_lddqu_si128((__m128i *)buffer);
            const __m128i a_found_in_b =
                _mm_cmpistrm(v_b, v_a, _SIDD_UWORD_OPS | _SIDD_CMP_EQUAL_ANY |
                                           _SIDD_BIT_MASK);
            runningmask_a_found_in_b =
                _mm_or_si128(runningmask_a_found_in_b, a_found_in_b);
            const int bitmask_belongs_to_difference =
                _mm_extract_epi32(runningmask_a_found_in_b, 0) ^ 0xFF;
            __m128i sm16 = _mm_load_si128((const __m128i *)shuffle_mask16 +
                                          bitmask_belongs_to_difference);
            __m128i p = _mm_shuffle_epi8(v_a, sm16);
            _mm_storeu_si128((__m128i *)&C[count], p);  // can overflow
            count += _mm_popcnt_u32(bitmask_belongs_to_difference);
            i_a += vectorlength;
        }
        // at this point we should have i_a == st_a and i_b == st_b
    }
    // do the tail using scalar code
    while (i_a < s_a && i_b < s_b) {
        uint16_t a = A[i_a];
        uint16_t b = B[i_b];
        if (b < a) {
            i_b++;
        } else if (a < b) {
            // a cannot appear later in B: keep it
            C[count] = a;
            count++;
            i_a++;
        } else {  //==
            i_a++;
            i_b++;
        }
    }
    // B is exhausted: everything left in A belongs to the difference
    if (i_a < s_a) {
        // NOTE(review): A may have been advanced by one in the leading-zero
        // handling, so this compares C against the shifted pointer.
        if(C == A) {
          assert((size_t)count <= i_a);
          if((size_t)count < i_a) {
            // overlapping source and destination, hence memmove
            memmove(C + count, A + i_a, sizeof(uint16_t) * (s_a - i_a));
          }
        } else {
           // forward element-by-element copy
           for(size_t i = 0; i < (s_a - i_a); i++) {
                C[count + i] = A[i + i_a];
           }
        }
        count += (int32_t)(s_a - i_a);
    }
    return count;
}
678 
679 #endif  // USESSE4
680 
681 
682 
683 #ifdef USE_OLD_SKEW_INTERSECT
684 // TODO: given enough experience with the new skew intersect, drop the old one from the code base.
685 
686 
/* Computes the intersection between one small and one large set of uint16_t.
 * Stores the result into buffer and returns the number of elements written.
 *
 * The small side is walked one element at a time while the large side is
 * skipped forward with advanceUntil() (defined elsewhere; it is expected to
 * return the first index after idx_l whose value is >= val_s, or size_l when
 * there is none). Every write to buffer is paired with an advance of the
 * small side, so at most size_s values are stored.
 *
 * If either set is empty the result is 0 and neither array is read. */
int32_t intersect_skewed_uint16(const uint16_t *small, size_t size_s,
                                const uint16_t *large, size_t size_l,
                                uint16_t *buffer) {
    size_t pos = 0, idx_l = 0, idx_s = 0;

    // Guard both sides: large[0] and small[0] are read unconditionally below,
    // which would be an out-of-bounds read for an empty array.
    if (0 == size_s || 0 == size_l) {
        return 0;
    }

    uint16_t val_l = large[idx_l], val_s = small[idx_s];

    while (true) {
        if (val_l < val_s) {
            // large is behind: jump ahead to the first candidate for val_s
            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
            if (idx_l == size_l) break;
            val_l = large[idx_l];
        } else if (val_s < val_l) {
            // val_s is missing from large: try the next small value
            idx_s++;
            if (idx_s == size_s) break;
            val_s = small[idx_s];
        } else {
            // match: emit it, then advance both sides
            buffer[pos++] = val_s;
            idx_s++;
            if (idx_s == size_s) break;
            val_s = small[idx_s];
            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
            if (idx_l == size_l) break;
            val_l = large[idx_l];
        }
    }

    return (int32_t)pos;
}
722 #else // USE_OLD_SKEW_INTERSECT
723 
724 
/**
* Branchless binary search going after 4 values at once.
* Assumes that array is sorted.
* You have that array[*index1] >= target1, array[*index2] >= target2, ...
* except when *index1 = n, in which case you know that all values in array are
* smaller than target1, and so forth.
* For an empty array (n <= 0) every index is set to 0, which is the
* "all values are smaller" answer for n = 0, and array is never read.
* It has logarithmic complexity.
*/
static void binarySearch4(const uint16_t *array, int32_t n, uint16_t target1,
                   uint16_t target2, uint16_t target3, uint16_t target4,
                   int32_t *index1, int32_t *index2, int32_t *index3,
                   int32_t *index4) {
  const uint16_t *base1 = array;
  const uint16_t *base2 = array;
  const uint16_t *base3 = array;
  const uint16_t *base4 = array;
  if (n <= 0) {
    // Nothing to search: report "not found" consistently instead of leaving
    // the outputs holding whatever the caller had in them.
    *index1 = 0;
    *index2 = 0;
    *index3 = 0;
    *index4 = 0;
    return;
  }
  // Each round discards half of the remaining window for all four searches;
  // base only moves forward when the probed value is still below the target.
  while (n > 1) {
    int32_t half = n >> 1;
    base1 = (base1[half] < target1) ? &base1[half] : base1;
    base2 = (base2[half] < target2) ? &base2[half] : base2;
    base3 = (base3[half] < target3) ? &base3[half] : base3;
    base4 = (base4[half] < target4) ? &base4[half] : base4;
    n -= half;
  }
  // base now points at the last candidate; step one further if it is still
  // smaller than the target.
  *index1 = (int32_t)((*base1 < target1) + base1 - array);
  *index2 = (int32_t)((*base2 < target2) + base2 - array);
  *index3 = (int32_t)((*base3 < target3) + base3 - array);
  *index4 = (int32_t)((*base4 < target4) + base4 - array);
}
756 
/**
* Branchless binary search going after 2 values at once.
* Assumes that array is sorted.
* You have that array[*index1] >= target1, array[*index2] >= target2.
* except when *index1 = n, in which case you know that all values in array are
* smaller than target1, and so forth.
* For an empty array (n <= 0) both indexes are set to 0, which is the
* "all values are smaller" answer for n = 0, and array is never read.
* It has logarithmic complexity.
*/
static void binarySearch2(const uint16_t *array, int32_t n, uint16_t target1,
                   uint16_t target2, int32_t *index1, int32_t *index2) {
  const uint16_t *base1 = array;
  const uint16_t *base2 = array;
  if (n <= 0) {
    // Nothing to search: give the outputs a defined value instead of leaving
    // them untouched.
    *index1 = 0;
    *index2 = 0;
    return;
  }
  // Halve the window each round; base advances only while the probe is
  // still below its target.
  while (n > 1) {
    int32_t half = n >> 1;
    base1 = (base1[half] < target1) ? &base1[half] : base1;
    base2 = (base2[half] < target2) ? &base2[half] : base2;
    n -= half;
  }
  // Step past the final candidate if it is still smaller than the target.
  *index1 = (int32_t)((*base1 < target1) + base1 - array);
  *index2 = (int32_t)((*base2 < target2) + base2 - array);
}
780 
/* Intersection of a small and a large set of uint16_t.
 * Matching values are appended to buffer; the return value is how many were
 * written.
 * The small set is consumed four values at a time through binarySearch4,
 * then (at most once) two at a time through binarySearch2, and finally a
 * single leftover value through binarySearch. After each batch the search
 * window into large starts at the position found for the batch's last value.
 * This approach can be slightly superior to a conventional galloping search
 * in some instances.
 */
int32_t intersect_skewed_uint16(const uint16_t *small, size_t size_s,
                                         const uint16_t *large, size_t size_l,
                                         uint16_t *buffer) {
  size_t out = 0, at_l = 0, at_s = 0;
  int32_t hit1 = 0, hit2 = 0, hit3 = 0, hit4 = 0;

  if (size_s == 0) {
    return 0;
  }

  /* batches of four */
  for (; (at_s + 4 <= size_s) && (at_l < size_l); at_s += 4, at_l += hit4) {
    const uint16_t t1 = small[at_s];
    const uint16_t t2 = small[at_s + 1];
    const uint16_t t3 = small[at_s + 2];
    const uint16_t t4 = small[at_s + 3];
    binarySearch4(large + at_l, (int32_t)(size_l - at_l), t1, t2, t3, t4,
                  &hit1, &hit2, &hit3, &hit4);
    /* a hit only counts if it is inside large and lands on the exact value */
    if ((hit1 + at_l < size_l) && (large[at_l + hit1] == t1)) {
      buffer[out++] = t1;
    }
    if ((hit2 + at_l < size_l) && (large[at_l + hit2] == t2)) {
      buffer[out++] = t2;
    }
    if ((hit3 + at_l < size_l) && (large[at_l + hit3] == t3)) {
      buffer[out++] = t3;
    }
    if ((hit4 + at_l < size_l) && (large[at_l + hit4] == t4)) {
      buffer[out++] = t4;
    }
  }

  /* one batch of two, if that many values remain */
  if ((at_s + 2 <= size_s) && (at_l < size_l)) {
    const uint16_t t1 = small[at_s];
    const uint16_t t2 = small[at_s + 1];
    binarySearch2(large + at_l, (int32_t)(size_l - at_l), t1, t2, &hit1,
                  &hit2);
    if ((hit1 + at_l < size_l) && (large[at_l + hit1] == t1)) {
      buffer[out++] = t1;
    }
    if ((hit2 + at_l < size_l) && (large[at_l + hit2] == t2)) {
      buffer[out++] = t2;
    }
    at_s += 2;
    at_l += hit2;
  }

  /* last straggler: binarySearch reports a match with a non-negative index */
  if ((at_s < size_s) && (at_l < size_l)) {
    const uint16_t last = small[at_s];
    if (binarySearch(large + at_l, (int32_t)(size_l - at_l), last) >= 0) {
      buffer[out++] = last;
    }
  }
  return (int32_t)out;
}
840 
841 
842 #endif //USE_OLD_SKEW_INTERSECT
843 
844 
// TODO: this could be accelerated, possibly, by using binarySearch4 as above.
/* Counts the values shared by one small and one large set of uint16_t
 * without storing them.
 *
 * The small side advances one element at a time; the large side is skipped
 * forward with advanceUntil() (defined elsewhere; expected to return the
 * first index after idx_l whose value is >= val_s, or size_l when there is
 * none).
 *
 * If either set is empty the result is 0 and neither array is read. */
int32_t intersect_skewed_uint16_cardinality(const uint16_t *small,
                                            size_t size_s,
                                            const uint16_t *large,
                                            size_t size_l) {
    size_t pos = 0, idx_l = 0, idx_s = 0;

    // Guard both sides: large[0] and small[0] are read unconditionally below,
    // which would be an out-of-bounds read for an empty array.
    if (0 == size_s || 0 == size_l) {
        return 0;
    }

    uint16_t val_l = large[idx_l], val_s = small[idx_s];

    while (true) {
        if (val_l < val_s) {
            // large is behind: jump ahead to the first candidate for val_s
            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
            if (idx_l == size_l) break;
            val_l = large[idx_l];
        } else if (val_s < val_l) {
            // val_s is missing from large: try the next small value
            idx_s++;
            if (idx_s == size_s) break;
            val_s = small[idx_s];
        } else {
            // match: count it, then advance both sides
            pos++;
            idx_s++;
            if (idx_s == size_s) break;
            val_s = small[idx_s];
            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
            if (idx_l == size_l) break;
            val_l = large[idx_l];
        }
    }

    return (int32_t)pos;
}
880 
/* Reports whether one small and one large set of uint16_t share at least one
 * value, stopping at the first match.
 *
 * The small side advances one element at a time; the large side is skipped
 * forward with advanceUntil() (defined elsewhere; expected to return the
 * first index after idx_l whose value is >= val_s, or size_l when there is
 * none).
 *
 * If either set is empty the answer is false and neither array is read. */
bool intersect_skewed_uint16_nonempty(const uint16_t *small, size_t size_s,
                                const uint16_t *large, size_t size_l) {
    size_t idx_l = 0, idx_s = 0;

    // Guard both sides: large[0] and small[0] are read unconditionally below,
    // which would be an out-of-bounds read for an empty array.
    if (0 == size_s || 0 == size_l) {
        return false;
    }

    uint16_t val_l = large[idx_l], val_s = small[idx_s];

    while (true) {
        if (val_l < val_s) {
            // large is behind: jump ahead to the first candidate for val_s
            idx_l = advanceUntil(large, (int32_t)idx_l, (int32_t)size_l, val_s);
            if (idx_l == size_l) break;
            val_l = large[idx_l];
        } else if (val_s < val_l) {
            // val_s is missing from large: try the next small value
            idx_s++;
            if (idx_s == size_s) break;
            val_s = small[idx_s];
        } else {
            return true;
        }
    }

    return false;
}
907 
/**
 * Generic intersection function.
 *
 * Merges two ascending uint16_t arrays and writes every value present in
 * both to out. Returns the number of values written.
 */
int32_t intersect_uint16(const uint16_t *A, const size_t lenA,
                         const uint16_t *B, const size_t lenB, uint16_t *out) {
    size_t ia = 0, ib = 0, written = 0;

    while (ia < lenA && ib < lenB) {
        const uint16_t a = A[ia];
        const uint16_t b = B[ib];
        if (a < b) {
            ++ia;  // a cannot be in B any more
        } else if (b < a) {
            ++ib;  // b cannot be in A any more
        } else {
            out[written++] = a;
            ++ia;
            ++ib;
        }
    }
    return (int32_t)written;
}
935 
/**
 * Counts the values present in both ascending uint16_t arrays A and B
 * without storing them.
 */
int32_t intersect_uint16_cardinality(const uint16_t *A, const size_t lenA,
                                     const uint16_t *B, const size_t lenB) {
    int32_t matches = 0;
    size_t ia = 0, ib = 0;

    while (ia < lenA && ib < lenB) {
        const uint16_t a = A[ia];
        const uint16_t b = B[ib];
        if (a < b) {
            ++ia;
        } else if (b < a) {
            ++ib;
        } else {
            ++matches;
            ++ia;
            ++ib;
        }
    }
    return matches;
}
960 
961 
/**
 * Tells whether the ascending uint16_t arrays A and B have at least one
 * value in common; stops at the first match.
 */
bool intersect_uint16_nonempty(const uint16_t *A, const size_t lenA,
                         const uint16_t *B, const size_t lenB) {
    size_t ia = 0, ib = 0;

    while (ia < lenA && ib < lenB) {
        const uint16_t a = A[ia];
        const uint16_t b = B[ib];
        if (a == b) {
            return true;
        }
        if (a < b) {
            ++ia;
        } else {
            ++ib;
        }
    }
    return false;
}
984 
985 
986 
/**
 * Generic intersection function.
 *
 * Merges two ascending uint32_t arrays and writes every value present in
 * both to out. Returns the number of values written.
 */
size_t intersection_uint32(const uint32_t *A, const size_t lenA,
                           const uint32_t *B, const size_t lenB,
                           uint32_t *out) {
    size_t ia = 0, ib = 0, written = 0;

    while (ia < lenA && ib < lenB) {
        const uint32_t a = A[ia];
        const uint32_t b = B[ib];
        if (a < b) {
            ++ia;
        } else if (b < a) {
            ++ib;
        } else {
            out[written++] = a;
            ++ia;
            ++ib;
        }
    }
    return written;
}
1015 
/**
 * Counts the values present in both ascending uint32_t arrays A and B
 * without storing them.
 */
size_t intersection_uint32_card(const uint32_t *A, const size_t lenA,
                                const uint32_t *B, const size_t lenB) {
    size_t matches = 0;
    size_t ia = 0, ib = 0;

    while (ia < lenA && ib < lenB) {
        const uint32_t a = A[ia];
        const uint32_t b = B[ib];
        if (a < b) {
            ++ia;
        } else if (b < a) {
            ++ib;
        } else {
            ++matches;
            ++ia;
            ++ib;
        }
    }
    return matches;
}
1040 
// can one vectorize the computation of the union? (Update: Yes! See
// union_vector16).

/* Merges two ascending uint16_t sets into buffer, emitting each value once
 * even when it appears in both inputs. Returns the number of values written.
 * Bulk copies use memmove, so they tolerate overlap between buffer and the
 * inputs. */
size_t union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
                    size_t size_2, uint16_t *buffer) {
    /* one side empty: the union is simply the other side */
    if (size_2 == 0) {
        memmove(buffer, set_1, size_1 * sizeof(uint16_t));
        return size_1;
    }
    if (size_1 == 0) {
        memmove(buffer, set_2, size_2 * sizeof(uint16_t));
        return size_2;
    }

    size_t written = 0, i1 = 0, i2 = 0;
    /* current heads are cached and only reloaded after their side advances */
    uint16_t head_1 = set_1[i1];
    uint16_t head_2 = set_2[i2];

    for (;;) {
        if (head_1 < head_2) {
            buffer[written++] = head_1;
            if (++i1 >= size_1) break;
            head_1 = set_1[i1];
        } else if (head_2 < head_1) {
            buffer[written++] = head_2;
            if (++i2 >= size_2) break;
            head_2 = set_2[i2];
        } else {
            /* same value on both sides: emit once, advance both */
            buffer[written++] = head_1;
            ++i1;
            ++i2;
            if (i1 >= size_1 || i2 >= size_2) break;
            head_1 = set_1[i1];
            head_2 = set_2[i2];
        }
    }

    /* at most one side still has values; append them unchanged */
    if (i1 < size_1) {
        const size_t rest = size_1 - i1;
        memmove(buffer + written, set_1 + i1, rest * sizeof(uint16_t));
        written += rest;
    } else if (i2 < size_2) {
        const size_t rest = size_2 - i2;
        memmove(buffer + written, set_2 + i2, rest * sizeof(uint16_t));
        written += rest;
    }

    return written;
}
1092 
/* Writes to a_out the values of a1 (length1 elements) that do not occur in
 * a2 (length2 elements) and returns how many were written. Both inputs are
 * walked as ascending sequences. When a2 is empty a1 is copied as is, and the
 * copy is skipped if a_out is a1 itself. */
int difference_uint16(const uint16_t *a1, int length1, const uint16_t *a2,
                      int length2, uint16_t *a_out) {
    if (length1 == 0) return 0;
    if (length2 == 0) {
        if (a1 != a_out) memcpy(a_out, a1, sizeof(uint16_t) * length1);
        return length1;
    }

    int written = 0;
    int i = 0, j = 0;
    /* current heads are cached and only reloaded after their side advances */
    uint16_t lhs = a1[i];
    uint16_t rhs = a2[j];

    for (;;) {
        if (lhs < rhs) {
            /* lhs cannot show up later in a2: keep it */
            a_out[written++] = lhs;
            if (++i >= length1) break;
            lhs = a1[i];
        } else if (lhs == rhs) {
            /* present in both: drop it */
            ++i;
            ++j;
            if (i >= length1 || j >= length2) break;
            lhs = a1[i];
            rhs = a2[j];
        } else {
            /* rhs is below lhs: skip it */
            if (++j >= length2) break;
            rhs = a2[j];
        }
    }

    /* a2 ran out first: everything left in a1 survives */
    if (i < length1) {
        memmove(a_out + written, a1 + i, sizeof(uint16_t) * (length1 - i));
        written += length1 - i;
    }
    return written;
}
1137 
/* Symmetric difference of two ascending uint16_t arrays: values found in
 * exactly one of array_1 (card_1 elements) and array_2 (card_2 elements) are
 * written to out in ascending order. Returns the number of values written. */
int32_t xor_uint16(const uint16_t *array_1, int32_t card_1,
                   const uint16_t *array_2, int32_t card_2, uint16_t *out) {
    int32_t i1 = 0, i2 = 0, n_out = 0;

    while (i1 < card_1 && i2 < card_2) {
        const uint16_t a = array_1[i1];
        const uint16_t b = array_2[i2];
        if (a == b) {
            /* present on both sides: cancels out */
            ++i1;
            ++i2;
        } else if (a < b) {
            out[n_out++] = a;
            ++i1;
        } else {
            out[n_out++] = b;
            ++i2;
        }
    }

    /* whatever is left on one side has no counterpart on the other */
    if (i1 < card_1) {
        const size_t rest = card_1 - i1;
        memcpy(out + n_out, array_1 + i1, rest * sizeof(uint16_t));
        n_out += (int32_t)rest;
    } else if (i2 < card_2) {
        const size_t rest = card_2 - i2;
        memcpy(out + n_out, array_2 + i2, rest * sizeof(uint16_t));
        n_out += (int32_t)rest;
    }
    return n_out;
}
1168 
1169 #ifdef USESSE4
1170 
1171 /***
1172  * start of the SIMD 16-bit union code
1173  *
1174  */
1175 
// Assuming that vInput1 and vInput2 are sorted, produces a sorted output going
// from vecMin all the way to vecMax
// developed originally for merge sort using SIMD instructions.
// Standard merge. See, e.g., Inoue and Taura, SIMD- and Cache-Friendly
// Algorithm for Sorting an Array of Structures
//
// Each register holds eight unsigned 16-bit lanes. The body is eight
// lane-wise min/max steps, each paired with a rotation of the "min" side by
// one lane (_mm_alignr_epi8(x, x, 2) shifts the x:x pair right by 2 bytes).
// Eight rotations in total bring the min lanes back to their starting
// alignment.
// vecMin and vecMax are written several times before the function returns.
static inline void sse_merge(const __m128i *vInput1,
                             const __m128i *vInput2,              // input 1 & 2
                             __m128i *vecMin, __m128i *vecMax) {  // output
    __m128i vecTmp;
    // step 1: lane-wise split of the two inputs into smaller / larger halves
    vecTmp = _mm_min_epu16(*vInput1, *vInput2);
    *vecMax = _mm_max_epu16(*vInput1, *vInput2);
    // step 2: rotate the smaller half by one lane and compare again
    vecTmp = _mm_alignr_epi8(vecTmp, vecTmp, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    // steps 3-8: same rotate + min/max, now fed from *vecMin
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    vecTmp = _mm_alignr_epi8(*vecMin, *vecMin, 2);
    *vecMin = _mm_min_epu16(vecTmp, *vecMax);
    *vecMax = _mm_max_epu16(vecTmp, *vecMax);
    // final (eighth) rotation, no compare
    *vecMin = _mm_alignr_epi8(*vecMin, *vecMin, 2);
}
1210 
1211 // used by store_unique, generated by simdunion.py
1212 static uint8_t uniqshuf[] = {
1213     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
1214     0xc,  0xd,  0xe,  0xf,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
1215     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1216     0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
1217     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
1218     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
1219     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
1220     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1221     0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
1222     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
1223     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1224     0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
1225     0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
1226     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
1227     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
1228     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1229     0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
1230     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
1231     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
1232     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1233     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1234     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
1235     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
1236     0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1237     0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
1238     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
1239     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1240     0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1241     0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
1242     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
1243     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,
1244     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1245     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,
1246     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,
1247     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1248     0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1249     0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1250     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xc,  0xd,
1251     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,
1252     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1253     0x0,  0x1,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1254     0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
1255     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1256     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
1257     0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
1258     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
1259     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
1260     0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1261     0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
1262     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
1263     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
1264     0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1265     0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1266     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
1267     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
1268     0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1269     0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
1270     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
1271     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1272     0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1273     0x2,  0x3,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1274     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xc,  0xd,  0xe,  0xf,
1275     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xc,  0xd,
1276     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1277     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,
1278     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,
1279     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1280     0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1281     0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1282     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xc,  0xd,
1283     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
1284     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1285     0x0,  0x1,  0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1286     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
1287     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1288     0x4,  0x5,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1289     0x2,  0x3,  0x4,  0x5,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1290     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xc,  0xd,  0xe,  0xf,
1291     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xc,  0xd,
1292     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1293     0x0,  0x1,  0x2,  0x3,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1294     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF,
1295     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xc,  0xd,
1296     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1297     0xc,  0xd,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1298     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
1299     0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
1300     0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1301     0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,
1302     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
1303     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1304     0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1305     0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
1306     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
1307     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,
1308     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1309     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,
1310     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
1311     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1312     0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1313     0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1314     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,
1315     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,
1316     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1317     0x0,  0x1,  0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1318     0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
1319     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1320     0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1321     0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
1322     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,
1323     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
1324     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1325     0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
1326     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xe,  0xf,
1327     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
1328     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1329     0x6,  0x7,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1330     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,
1331     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
1332     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1333     0x0,  0x1,  0x4,  0x5,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1334     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
1335     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1336     0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1337     0x2,  0x3,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1338     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xa,  0xb,  0xe,  0xf,  0xFF, 0xFF,
1339     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xe,  0xf,
1340     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1341     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,
1342     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
1343     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1344     0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1345     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1346     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
1347     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
1348     0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1349     0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1350     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF,
1351     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1352     0x4,  0x5,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1353     0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1354     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xe,  0xf,
1355     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
1356     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1357     0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1358     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF,
1359     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
1360     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1361     0x8,  0x9,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1362     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
1363     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
1364     0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1365     0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1366     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF,
1367     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1368     0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1369     0x2,  0x3,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1370     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xe,  0xf,  0xFF, 0xFF,
1371     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xe,  0xf,
1372     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1373     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF,
1374     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF,
1375     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1376     0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1377     0x4,  0x5,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1378     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xe,  0xf,  0xFF, 0xFF,
1379     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xe,  0xf,
1380     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1381     0x0,  0x1,  0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1382     0xFF, 0xFF, 0xFF, 0xFF, 0xe,  0xf,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1383     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1384     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
1385     0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
1386     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
1387     0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
1388     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1389     0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
1390     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
1391     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
1392     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1393     0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1394     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
1395     0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
1396     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1397     0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
1398     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
1399     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1400     0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1401     0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1402     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xa,  0xb,  0xc,  0xd,
1403     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xa,  0xb,
1404     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1405     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,
1406     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,
1407     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1408     0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1409     0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1410     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xa,  0xb,
1411     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
1412     0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1413     0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1414     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
1415     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1416     0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1417     0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1418     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xa,  0xb,  0xc,  0xd,
1419     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xa,  0xb,
1420     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1421     0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1422     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF,
1423     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xa,  0xb,
1424     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1425     0xa,  0xb,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1426     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
1427     0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
1428     0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1429     0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF,
1430     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
1431     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1432     0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1433     0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1434     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xc,  0xd,
1435     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,
1436     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1437     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF,
1438     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xc,  0xd,
1439     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1440     0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1441     0x4,  0x5,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1442     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xc,  0xd,
1443     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,
1444     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1445     0x0,  0x1,  0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1446     0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1447     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1448     0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1449     0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1450     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xc,  0xd,
1451     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
1452     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1453     0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1454     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF,
1455     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
1456     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1457     0x6,  0x7,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1458     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xc,  0xd,
1459     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
1460     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1461     0x0,  0x1,  0x4,  0x5,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1462     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1463     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1464     0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1465     0x2,  0x3,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1466     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xc,  0xd,  0xFF, 0xFF, 0xFF, 0xFF,
1467     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xc,  0xd,  0xFF, 0xFF,
1468     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1469     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,
1470     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
1471     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1472     0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1473     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
1474     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,
1475     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
1476     0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1477     0x0,  0x1,  0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
1478     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF,
1479     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1480     0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1481     0x2,  0x3,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
1482     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xa,  0xb,
1483     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,
1484     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1485     0x0,  0x1,  0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
1486     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF,
1487     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,
1488     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1489     0x8,  0x9,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1490     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,
1491     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
1492     0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1493     0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
1494     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF,
1495     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1496     0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1497     0x2,  0x3,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1498     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,  0xa,  0xb,  0xFF, 0xFF,
1499     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xa,  0xb,
1500     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1501     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF,
1502     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF,
1503     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1504     0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1505     0x4,  0x5,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1506     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0xa,  0xb,  0xFF, 0xFF,
1507     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xa,  0xb,
1508     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1509     0x0,  0x1,  0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1510     0xFF, 0xFF, 0xFF, 0xFF, 0xa,  0xb,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1511     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1512     0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1513     0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
1514     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0x6,  0x7,  0x8,  0x9,
1515     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x6,  0x7,
1516     0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1517     0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
1518     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF,
1519     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x6,  0x7,
1520     0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1521     0x6,  0x7,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1522     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x8,  0x9,
1523     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,
1524     0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1525     0x0,  0x1,  0x4,  0x5,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1526     0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
1527     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1528     0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1529     0x2,  0x3,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1530     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x8,  0x9,  0xFF, 0xFF, 0xFF, 0xFF,
1531     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x8,  0x9,  0xFF, 0xFF,
1532     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1533     0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF,
1534     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF,
1535     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,
1536     0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1537     0x4,  0x5,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1538     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,  0x6,  0x7,  0xFF, 0xFF,
1539     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0x6,  0x7,
1540     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1541     0x0,  0x1,  0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1542     0xFF, 0xFF, 0xFF, 0xFF, 0x6,  0x7,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1543     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x2,  0x3,
1544     0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1545     0x2,  0x3,  0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1546     0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0x4,  0x5,  0xFF, 0xFF, 0xFF, 0xFF,
1547     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x4,  0x5,  0xFF, 0xFF,
1548     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1549     0x0,  0x1,  0x2,  0x3,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1550     0xFF, 0xFF, 0xFF, 0xFF, 0x2,  0x3,  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1551     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x0,  0x1,  0xFF, 0xFF,
1552     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1553     0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1554     0xFF, 0xFF, 0xFF, 0xFF};
1555 
1556 // write vector new, while omitting repeated values assuming that previously
1557 // written vector was "old"
store_unique(__m128i old,__m128i newval,uint16_t * output)1558 static inline int store_unique(__m128i old, __m128i newval, uint16_t *output) {
1559     __m128i vecTmp = _mm_alignr_epi8(newval, old, 16 - 2);
1560     // lots of high latency instructions follow (optimize?)
1561     int M = _mm_movemask_epi8(
1562         _mm_packs_epi16(_mm_cmpeq_epi16(vecTmp, newval), _mm_setzero_si128()));
1563     int numberofnewvalues = 8 - _mm_popcnt_u32(M);
1564     __m128i key = _mm_lddqu_si128((const __m128i *)uniqshuf + M);
1565     __m128i val = _mm_shuffle_epi8(newval, key);
1566     _mm_storeu_si128((__m128i *)output, val);
1567     return numberofnewvalues;
1568 }
1569 
// Removes adjacent repeats from out[0..len) in place, keeping the first value
// of every run; on sorted input this leaves only distinct values.
// Returns the number of values kept. An empty input yields 0.
// could be avoided?
static inline uint32_t unique(uint16_t *out, uint32_t len) {
    if (len == 0) {
        return 0;  // without this guard the count below would start at 1
    }
    uint32_t pos = 1;  // out[0] is always kept
    for (uint32_t i = 1; i < len; ++i) {
        // pos never exceeds i, so out[i - 1] still holds its original value
        if (out[i] != out[i - 1]) {
            out[pos++] = out[i];
        }
    }
    return pos;
}
1581 
// Comparator for qsort over uint16_t values, ascending order.
// The operands are read through const pointers (qsort hands out const void *)
// and widened to int before subtracting, so the sign of the difference is the
// ordering.
// could be avoided
static int uint16_compare(const void *a, const void *b) {
    const uint16_t lhs = *(const uint16_t *)a;
    const uint16_t rhs = *(const uint16_t *)b;
    return (int)lhs - (int)rhs;
}
1586 
1587 // a one-pass SSE union algorithm
1588 // This function may not be safe if array1 == output or array2 == output.
union_vector16(const uint16_t * __restrict__ array1,uint32_t length1,const uint16_t * __restrict__ array2,uint32_t length2,uint16_t * __restrict__ output)1589 uint32_t union_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
1590                         const uint16_t *__restrict__ array2, uint32_t length2,
1591                         uint16_t *__restrict__ output) {
1592     if ((length1 < 8) || (length2 < 8)) {
1593         return (uint32_t)union_uint16(array1, length1, array2, length2, output);
1594     }
1595     __m128i vA, vB, V, vecMin, vecMax;
1596     __m128i laststore;
1597     uint16_t *initoutput = output;
1598     uint32_t len1 = length1 / 8;
1599     uint32_t len2 = length2 / 8;
1600     uint32_t pos1 = 0;
1601     uint32_t pos2 = 0;
1602     // we start the machine
1603     vA = _mm_lddqu_si128((const __m128i *)array1 + pos1);
1604     pos1++;
1605     vB = _mm_lddqu_si128((const __m128i *)array2 + pos2);
1606     pos2++;
1607     sse_merge(&vA, &vB, &vecMin, &vecMax);
1608     laststore = _mm_set1_epi16(-1);
1609     output += store_unique(laststore, vecMin, output);
1610     laststore = vecMin;
1611     if ((pos1 < len1) && (pos2 < len2)) {
1612         uint16_t curA, curB;
1613         curA = array1[8 * pos1];
1614         curB = array2[8 * pos2];
1615         while (true) {
1616             if (curA <= curB) {
1617                 V = _mm_lddqu_si128((const __m128i *)array1 + pos1);
1618                 pos1++;
1619                 if (pos1 < len1) {
1620                     curA = array1[8 * pos1];
1621                 } else {
1622                     break;
1623                 }
1624             } else {
1625                 V = _mm_lddqu_si128((const __m128i *)array2 + pos2);
1626                 pos2++;
1627                 if (pos2 < len2) {
1628                     curB = array2[8 * pos2];
1629                 } else {
1630                     break;
1631                 }
1632             }
1633             sse_merge(&V, &vecMax, &vecMin, &vecMax);
1634             output += store_unique(laststore, vecMin, output);
1635             laststore = vecMin;
1636         }
1637         sse_merge(&V, &vecMax, &vecMin, &vecMax);
1638         output += store_unique(laststore, vecMin, output);
1639         laststore = vecMin;
1640     }
1641     // we finish the rest off using a scalar algorithm
1642     // could be improved?
1643     //
1644     // copy the small end on a tmp buffer
1645     uint32_t len = (uint32_t)(output - initoutput);
1646     uint16_t buffer[16];
1647     uint32_t leftoversize = store_unique(laststore, vecMax, buffer);
1648     if (pos1 == len1) {
1649         memcpy(buffer + leftoversize, array1 + 8 * pos1,
1650                (length1 - 8 * len1) * sizeof(uint16_t));
1651         leftoversize += length1 - 8 * len1;
1652         qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
1653 
1654         leftoversize = unique(buffer, leftoversize);
1655         len += (uint32_t)union_uint16(buffer, leftoversize, array2 + 8 * pos2,
1656                                       length2 - 8 * pos2, output);
1657     } else {
1658         memcpy(buffer + leftoversize, array2 + 8 * pos2,
1659                (length2 - 8 * len2) * sizeof(uint16_t));
1660         leftoversize += length2 - 8 * len2;
1661         qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
1662         leftoversize = unique(buffer, leftoversize);
1663         len += (uint32_t)union_uint16(buffer, leftoversize, array1 + 8 * pos1,
1664                                       length1 - 8 * pos1, output);
1665     }
1666     return len;
1667 }
1668 
1669 /**
1670  * End of the SIMD 16-bit union code
1671  *
1672  */
1673 
1674 /**
1675  * Start of SIMD 16-bit XOR code
1676  */
1677 
// XOR flavour of the "store unique" step. Examines the 8-lane window
//     { old[7], newval[0], ..., newval[6] }
// and keeps only the lanes that differ from BOTH neighbours in the sequence
// old[6], old[7], newval[0], ..., newval[7]. A value that sits next to an
// equal value is dropped entirely instead of being kept once. Lane 7 of
// newval is never part of the window, so it is not emitted by this call.
//
// A full 16 bytes are always stored to output (unaligned store), whatever the
// count. Returns the number of window lanes that passed the filter; callers
// advance output by that amount, so uniqshuf[M] is presumably a shuffle
// control that compacts the surviving lanes to the front -- confirm against
// the uniqshuf table.
static inline int store_unique_xor(__m128i old, __m128i newval,
                                   uint16_t *output) {
    // vecTmp1 = { old[6], old[7], newval[0..5] }: predecessor of each window lane
    __m128i vecTmp1 = _mm_alignr_epi8(newval, old, 16 - 4);
    // vecTmp2 = { old[7], newval[0..6] }: the window itself
    __m128i vecTmp2 = _mm_alignr_epi8(newval, old, 16 - 2);
    // lane == its predecessor
    __m128i equalleft = _mm_cmpeq_epi16(vecTmp2, vecTmp1);
    // lane == its successor (newval holds the successor of every window lane)
    __m128i equalright = _mm_cmpeq_epi16(vecTmp2, newval);
    __m128i equalleftoright = _mm_or_si128(equalleft, equalright);
    // narrow the eight 16-bit compare results to bytes so that the movemask
    // yields one bit per lane: bit i of M is set when window lane i is dropped
    int M = _mm_movemask_epi8(
        _mm_packs_epi16(equalleftoright, _mm_setzero_si128()));
    int numberofnewvalues = 8 - _mm_popcnt_u32(M);
    // uniqshuf is indexed in 16-byte entries, one per possible mask value
    __m128i key = _mm_lddqu_si128((const __m128i *)uniqshuf + M);
    __m128i val = _mm_shuffle_epi8(vecTmp2, key);
    _mm_storeu_si128((__m128i *)output, val);
    return numberofnewvalues;
}
1695 
// In-place XOR-style deduplication of a sorted run: whenever two adjacent
// values are equal, BOTH are removed (they cancel out); every other value is
// kept in its original order at the front of out.
//
// out: sorted values, rewritten in place.
// len: number of values in out; 0 is accepted and yields 0.
// Returns the number of surviving values.
//
// Precondition: a value occurs at most twice. Three or more equal values in a
// row are not supported (the running count would be decremented once per
// extra repeat and could wrap around).
static inline uint32_t unique_xor(uint16_t *out, uint32_t len) {
    if (len == 0) {
        // Without this guard the provisional count for out[0] would be
        // returned even though there is no out[0].
        return 0;
    }
    uint32_t pos = 1;  // out[0] is provisionally kept
    for (uint32_t i = 1; i < len; ++i) {
        if (out[i] != out[i - 1]) {
            out[pos++] = out[i];
        } else {
            pos--;  // identical to the previous value: retract that one too
        }
    }
    return pos;
}
1708 
// A one-pass SSE xor (symmetric difference) of two uint16_t arrays.
//
// array1/length1, array2/length2: the two inputs. Assumed to be sorted in
//   ascending order with no value repeated inside either array -- TODO
//   confirm with callers.
// output: destination for the result. Every call to store_unique_xor stores a
//   full 16 bytes at the current output position, so output needs room beyond
//   the returned count.
// Returns the number of values written to output.
//
// Inputs with fewer than 8 values are handed straight to the scalar
// xor_uint16. Otherwise the arrays are consumed in blocks of 8 values that
// are merged through sse_merge (presumably producing the 8 smallest values in
// vecMin and the 8 largest in vecMax -- confirm against its definition), and
// whatever is left at the end is finished off with scalar code.
uint32_t xor_vector16(const uint16_t *__restrict__ array1, uint32_t length1,
                      const uint16_t *__restrict__ array2, uint32_t length2,
                      uint16_t *__restrict__ output) {
    if ((length1 < 8) || (length2 < 8)) {
        return xor_uint16(array1, length1, array2, length2, output);
    }
    __m128i vA, vB, V, vecMin, vecMax;
    __m128i laststore;
    uint16_t *initoutput = output;
    // number of complete 8-value blocks in each input
    uint32_t len1 = length1 / 8;
    uint32_t len2 = length2 / 8;
    // index of the next unread block in each input
    uint32_t pos1 = 0;
    uint32_t pos2 = 0;
    // we start the machine: load and merge the first block of each input
    vA = _mm_lddqu_si128((const __m128i *)array1 + pos1);
    pos1++;
    vB = _mm_lddqu_si128((const __m128i *)array2 + pos2);
    pos2++;
    sse_merge(&vA, &vB, &vecMin, &vecMax);
    // all lanes equal: the "previous" lane seen by the first store_unique_xor
    // call matches its own predecessor and is therefore dropped
    laststore = _mm_set1_epi16(-1);
    // scratch for the scalar tail: at most 8 lanes from store_unique_xor,
    // 1 hand-appended lane and 7 tail values (length % 8)
    uint16_t buffer[17];
    output += store_unique_xor(laststore, vecMin, output);

    laststore = vecMin;
    if ((pos1 < len1) && (pos2 < len2)) {
        uint16_t curA, curB;
        // first value of the next unread block of each input
        curA = array1[8 * pos1];
        curB = array2[8 * pos2];
        while (true) {
            // take the next block from whichever input has the smaller
            // upcoming value; leave the loop as soon as one input has no
            // complete block left (V is then loaded but not yet merged)
            if (curA <= curB) {
                V = _mm_lddqu_si128((const __m128i *)array1 + pos1);
                pos1++;
                if (pos1 < len1) {
                    curA = array1[8 * pos1];
                } else {
                    break;
                }
            } else {
                V = _mm_lddqu_si128((const __m128i *)array2 + pos2);
                pos2++;
                if (pos2 < len2) {
                    curB = array2[8 * pos2];
                } else {
                    break;
                }
            }
            sse_merge(&V, &vecMax, &vecMin, &vecMax);
            // conditionally stores the last value of laststore as well as all
            // but the
            // last value of vecMin
            output += store_unique_xor(laststore, vecMin, output);
            laststore = vecMin;
        }
        // merge the block that was loaded just before the loop was left
        sse_merge(&V, &vecMax, &vecMin, &vecMax);
        // conditionally stores the last value of laststore as well as all but
        // the
        // last value of vecMin
        output += store_unique_xor(laststore, vecMin, output);
        laststore = vecMin;
    }
    // values emitted so far by the vectorized part
    uint32_t len = (uint32_t)(output - initoutput);

    // we finish the rest off using a scalar algorithm
    // could be improved?
    // conditionally stores the last value of laststore as well as all but the
    // last value of vecMax,
    // we store to "buffer"
    int leftoversize = store_unique_xor(laststore, vecMax, buffer);
    // store_unique_xor never emits lane 7 of its second argument, so the last
    // lane of vecMax is appended by hand unless it equals lane 6 (in which
    // case lane 6 was already dropped and the pair cancels)
    uint16_t vec7 = _mm_extract_epi16(vecMax, 7);
    uint16_t vec6 = _mm_extract_epi16(vecMax, 6);
    if (vec7 != vec6) buffer[leftoversize++] = vec7;
    if (pos1 == len1) {
        // array1 has no complete block left: add its tail (fewer than 8
        // values) to the buffer and xor the buffer with the rest of array2
        memcpy(buffer + leftoversize, array1 + 8 * pos1,
               (length1 - 8 * len1) * sizeof(uint16_t));
        leftoversize += length1 - 8 * len1;
        if (leftoversize == 0) {  // trivial case
            // nothing pending: the rest of array2 is copied verbatim
            memcpy(output, array2 + 8 * pos2,
                   (length2 - 8 * pos2) * sizeof(uint16_t));
            len += (length2 - 8 * pos2);
        } else {
            // the buffer mixes vector leftovers and tail values, so it is
            // sorted and values occurring twice are cancelled before the
            // final scalar xor
            qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
            leftoversize = unique_xor(buffer, leftoversize);
            len += xor_uint16(buffer, leftoversize, array2 + 8 * pos2,
                              length2 - 8 * pos2, output);
        }
    } else {
        // symmetric case: array2 has no complete block left
        memcpy(buffer + leftoversize, array2 + 8 * pos2,
               (length2 - 8 * len2) * sizeof(uint16_t));
        leftoversize += length2 - 8 * len2;
        if (leftoversize == 0) {  // trivial case
            memcpy(output, array1 + 8 * pos1,
                   (length1 - 8 * pos1) * sizeof(uint16_t));
            len += (length1 - 8 * pos1);
        } else {
            qsort(buffer, leftoversize, sizeof(uint16_t), uint16_compare);
            leftoversize = unique_xor(buffer, leftoversize);
            len += xor_uint16(buffer, leftoversize, array1 + 8 * pos1,
                              length1 - 8 * pos1, output);
        }
    }
    return len;
}
1812 
1813 /**
1814  * End of SIMD 16-bit XOR code
1815  */
1816 
1817 #endif  // USESSE4
1818 
/**
 * Merge two ascending uint32_t sequences into buffer, writing each value that
 * occurs in both inputs only once.
 *
 * set_1/size_1, set_2/size_2: the inputs and their element counts.
 * buffer: destination; must have room for up to size_1 + size_2 values.
 * Returns the number of values written to buffer.
 */
size_t union_uint32(const uint32_t *set_1, size_t size_1, const uint32_t *set_2,
                    size_t size_2, uint32_t *buffer) {
    /* An empty side makes the union a plain copy of the other side. */
    if (size_2 == 0) {
        memmove(buffer, set_1, size_1 * sizeof(uint32_t));
        return size_1;
    }
    if (size_1 == 0) {
        memmove(buffer, set_2, size_2 * sizeof(uint32_t));
        return size_2;
    }

    size_t out = 0;
    size_t i = 0;
    size_t j = 0;
    /* Current head of each input, reloaded only when its cursor advances. */
    uint32_t head_1 = set_1[0];
    uint32_t head_2 = set_2[0];

    for (;;) {
        if (head_1 == head_2) {
            /* Shared value: emit once, step both cursors. */
            buffer[out++] = head_1;
            ++i;
            ++j;
            if (i >= size_1 || j >= size_2) {
                break;
            }
            head_1 = set_1[i];
            head_2 = set_2[j];
        } else if (head_1 < head_2) {
            buffer[out++] = head_1;
            if (++i >= size_1) {
                break;
            }
            head_1 = set_1[i];
        } else {
            buffer[out++] = head_2;
            if (++j >= size_2) {
                break;
            }
            head_2 = set_2[j];
        }
    }

    /* At most one input still has values left; append them unchanged. */
    if (i < size_1) {
        memmove(buffer + out, set_1 + i, (size_1 - i) * sizeof(uint32_t));
        out += size_1 - i;
    } else if (j < size_2) {
        memmove(buffer + out, set_2 + j, (size_2 - j) * sizeof(uint32_t));
        out += size_2 - j;
    }

    return out;
}
1867 
/**
 * Count the values a merge of two ascending uint32_t sequences would produce,
 * counting a value found in both inputs once. Nothing is written anywhere.
 *
 * set_1/size_1, set_2/size_2: the inputs and their element counts.
 * Returns the cardinality of the union.
 */
size_t union_uint32_card(const uint32_t *set_1, size_t size_1,
                         const uint32_t *set_2, size_t size_2) {
    size_t i = 0;
    size_t j = 0;
    size_t distinct = 0;

    /* Each round accounts for exactly one output value. */
    while (i < size_1 && j < size_2) {
        const uint32_t a = set_1[i];
        const uint32_t b = set_2[j];
        i += (a <= b); /* consume from set_1 unless set_2 is strictly ahead */
        j += (b <= a); /* consume from set_2 unless set_1 is strictly ahead */
        ++distinct;
    }

    /* One cursor is at its end, so at most one of these terms is non-zero. */
    return distinct + (size_1 - i) + (size_2 - j);
}
1911 
1912 
1913 
/**
 * Union of two uint16_t arrays into buffer, dispatching to union_vector16
 * when ROARING_VECTOR_OPERATIONS_ENABLED is defined and to union_uint16
 * otherwise. The shorter array is always passed as the first operand; on
 * equal sizes set_2 goes first.
 *
 * Returns whatever the selected union routine returns.
 */
size_t fast_union_uint16(const uint16_t *set_1, size_t size_1, const uint16_t *set_2,
                    size_t size_2, uint16_t *buffer) {
    /* Order the operands once: (shorter, longer). */
    const uint16_t *shorter = set_2;
    const uint16_t *longer = set_1;
    size_t n_shorter = size_2;
    size_t n_longer = size_1;
    if (size_1 < size_2) {
        shorter = set_1;
        longer = set_2;
        n_shorter = size_1;
        n_longer = size_2;
    }

#ifdef ROARING_VECTOR_OPERATIONS_ENABLED
    return union_vector16(shorter, (uint32_t)n_shorter,
                          longer, (uint32_t)n_longer, buffer);
#else
    return union_uint16(shorter, n_shorter, longer, n_longer, buffer);
#endif
}
1936 
/**
 * Report whether the first n bytes at s1 and s2 are identical.
 *
 * s1, s2: the two memory regions; neither is dereferenced when n is 0.
 * n: number of bytes to compare.
 * Returns true when all n bytes match (always true for n == 0).
 *
 * With USEAVX the comparison runs 32 bytes at a time, then 8 bytes at a time,
 * then byte by byte; otherwise it is a memcmp.
 */
bool memequals(const void *s1, const void *s2, size_t n) {
    if (n == 0) {
        return true;
    }
#ifdef USEAVX
    const uint8_t *ptr1 = (const uint8_t *)s1;
    const uint8_t *ptr2 = (const uint8_t *)s2;
    const uint8_t *end1 = ptr1 + n;
    const uint8_t *end8 = ptr1 + n / 8 * 8;     // last multiple of 8 bytes
    const uint8_t *end32 = ptr1 + n / 32 * 32;  // last multiple of 32 bytes

    while (ptr1 < end32) {
        __m256i r1 = _mm256_loadu_si256((const __m256i*)ptr1);
        __m256i r2 = _mm256_loadu_si256((const __m256i*)ptr2);
        // one mask bit per byte; every bit set means all 32 bytes are equal
        int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2));
        if ((uint32_t)mask != UINT32_MAX) {
            return false;
        }
        ptr1 += 32;
        ptr2 += 32;
    }

    while (ptr1 < end8) {
        // memcpy rather than a uint64_t pointer cast: the inputs carry no
        // alignment guarantee and may not be uint64_t objects, so a direct
        // dereference would be undefined behaviour
        uint64_t v1;
        uint64_t v2;
        memcpy(&v1, ptr1, sizeof v1);
        memcpy(&v2, ptr2, sizeof v2);
        if (v1 != v2) {
            return false;
        }
        ptr1 += 8;
        ptr2 += 8;
    }

    while (ptr1 < end1) {
        if (*ptr1 != *ptr2) {
            return false;
        }
        ptr1++;
        ptr2++;
    }

    return true;
#else
    return memcmp(s1, s2, n) == 0;
#endif
}
1982 /* end file src/array_util.c */
1983 /* begin file src/bitset_util.c */
1984 #include <assert.h>
1985 #include <stdint.h>
1986 #include <stdio.h>
1987 #include <stdlib.h>
1988 #include <string.h>
1989 
1990 
1991 #ifdef IS_X64
// Per-byte bit-count lookup table: lengthTable[b] is the number of 1 bits in
// the byte value b (0 for 0x00, up to 8 for 0xFF).
static uint8_t lengthTable[256] = {
    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4,
    2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4,
    2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
    4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5,
    3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,
    4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
2004 #endif
2005 
2006 #ifdef USEAVX
// Bit-position decode table indexed by a byte value. Row b lists, in
// increasing order, the 1-based positions of the bits set in b (bit 0 -> 1,
// ..., bit 7 -> 8); the unused trailing slots are 0. Each row is
// 8 x uint32_t = 32 bytes, presumably so that a whole row can be loaded as
// one 256-bit vector -- confirm at the use sites.
ALIGNED(32)
static uint32_t vecDecodeTable[256][8] = {
    {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
    {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
    {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
    {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
    {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
    {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
    {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
    {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
    {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
    {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
    {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
    {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
    {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
    {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
    {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
    {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
    {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
    {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
    {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
    {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
    {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
    {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
    {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
    {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
    {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
    {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
    {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
    {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
    {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
    {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
    {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
    {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
    {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
    {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
    {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
    {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
    {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
    {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
    {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
    {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
    {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
    {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
    {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
    {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
    {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
    {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
    {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
    {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
    {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
    {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
    {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
    {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
    {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
    {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
    {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
    {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
    {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
    {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
    {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
    {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
    {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
    {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
    {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
    {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
    {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
    {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
    {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
    {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
    {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
    {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
    {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
    {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
    {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
    {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
    {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
    {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
    {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
    {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
    {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
    {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
    {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
    {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
    {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
    {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
    {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
    {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
    {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
    {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
    {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
    {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
    {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
    {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
    {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
    {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
    {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
    {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
    {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
    {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
    {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
    {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
    {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
    {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
    {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
    {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
    {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
    {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
    {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
    {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
    {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
    {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
    {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
    {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
    {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
    {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
    {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
    {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
    {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
    {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
    {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
    {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
    {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
    {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
    {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
    {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
    {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
    {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
    {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
    {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
    {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
    {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
    {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
    {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
    {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
    {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
    {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
    {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
    {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
    {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
    {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
    {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
    {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
    {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
    {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
    {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
    {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
    {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
    {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
    {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
    {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
    {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
    {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
    {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
    {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
    {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
    {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
    {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
    {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
    {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
    {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
    {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
    {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
    {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
    {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
    {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
    {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
    {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
    {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
    {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
    {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
    {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
    {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
    {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
    {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
    {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
    {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
    {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
    {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
    {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
    {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
    {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
    {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
    {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
    {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
    {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
    {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
    {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
    {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
    {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
    {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
    {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
    {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
    {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
    {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
    {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
    {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
    {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
    {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
    {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
    {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
    {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
    {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
    {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
    {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
    {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
    {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
    {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
    {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
    {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
    {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
    {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
    {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
    {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
    {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
    {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
    {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
    {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
    {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
    {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
    {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
    {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
    {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
    {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
    {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
    {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
    {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
    {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
    {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
    {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
    {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
    {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
    {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
    {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
    {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
    {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
    {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
    {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
    {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
    {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
    {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
    {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
    {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
    {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
    {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
    {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
    {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
    {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
    {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
    {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
    {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
    {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
    {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
    {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
    {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
    {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
    {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
    {1, 2, 3, 4, 5, 6, 7, 8}  /* 0xFF (11111111) */
};
2266 
2267 #endif  // #ifdef USEAVX
2268 
2269 #ifdef IS_X64
// Same contents as vecDecodeTable, but stored as 16-bit entries.
2271 ALIGNED(32)
2272 static uint16_t vecDecodeTable_uint16[256][8] = {
2273     {0, 0, 0, 0, 0, 0, 0, 0}, /* 0x00 (00000000) */
2274     {1, 0, 0, 0, 0, 0, 0, 0}, /* 0x01 (00000001) */
2275     {2, 0, 0, 0, 0, 0, 0, 0}, /* 0x02 (00000010) */
2276     {1, 2, 0, 0, 0, 0, 0, 0}, /* 0x03 (00000011) */
2277     {3, 0, 0, 0, 0, 0, 0, 0}, /* 0x04 (00000100) */
2278     {1, 3, 0, 0, 0, 0, 0, 0}, /* 0x05 (00000101) */
2279     {2, 3, 0, 0, 0, 0, 0, 0}, /* 0x06 (00000110) */
2280     {1, 2, 3, 0, 0, 0, 0, 0}, /* 0x07 (00000111) */
2281     {4, 0, 0, 0, 0, 0, 0, 0}, /* 0x08 (00001000) */
2282     {1, 4, 0, 0, 0, 0, 0, 0}, /* 0x09 (00001001) */
2283     {2, 4, 0, 0, 0, 0, 0, 0}, /* 0x0A (00001010) */
2284     {1, 2, 4, 0, 0, 0, 0, 0}, /* 0x0B (00001011) */
2285     {3, 4, 0, 0, 0, 0, 0, 0}, /* 0x0C (00001100) */
2286     {1, 3, 4, 0, 0, 0, 0, 0}, /* 0x0D (00001101) */
2287     {2, 3, 4, 0, 0, 0, 0, 0}, /* 0x0E (00001110) */
2288     {1, 2, 3, 4, 0, 0, 0, 0}, /* 0x0F (00001111) */
2289     {5, 0, 0, 0, 0, 0, 0, 0}, /* 0x10 (00010000) */
2290     {1, 5, 0, 0, 0, 0, 0, 0}, /* 0x11 (00010001) */
2291     {2, 5, 0, 0, 0, 0, 0, 0}, /* 0x12 (00010010) */
2292     {1, 2, 5, 0, 0, 0, 0, 0}, /* 0x13 (00010011) */
2293     {3, 5, 0, 0, 0, 0, 0, 0}, /* 0x14 (00010100) */
2294     {1, 3, 5, 0, 0, 0, 0, 0}, /* 0x15 (00010101) */
2295     {2, 3, 5, 0, 0, 0, 0, 0}, /* 0x16 (00010110) */
2296     {1, 2, 3, 5, 0, 0, 0, 0}, /* 0x17 (00010111) */
2297     {4, 5, 0, 0, 0, 0, 0, 0}, /* 0x18 (00011000) */
2298     {1, 4, 5, 0, 0, 0, 0, 0}, /* 0x19 (00011001) */
2299     {2, 4, 5, 0, 0, 0, 0, 0}, /* 0x1A (00011010) */
2300     {1, 2, 4, 5, 0, 0, 0, 0}, /* 0x1B (00011011) */
2301     {3, 4, 5, 0, 0, 0, 0, 0}, /* 0x1C (00011100) */
2302     {1, 3, 4, 5, 0, 0, 0, 0}, /* 0x1D (00011101) */
2303     {2, 3, 4, 5, 0, 0, 0, 0}, /* 0x1E (00011110) */
2304     {1, 2, 3, 4, 5, 0, 0, 0}, /* 0x1F (00011111) */
2305     {6, 0, 0, 0, 0, 0, 0, 0}, /* 0x20 (00100000) */
2306     {1, 6, 0, 0, 0, 0, 0, 0}, /* 0x21 (00100001) */
2307     {2, 6, 0, 0, 0, 0, 0, 0}, /* 0x22 (00100010) */
2308     {1, 2, 6, 0, 0, 0, 0, 0}, /* 0x23 (00100011) */
2309     {3, 6, 0, 0, 0, 0, 0, 0}, /* 0x24 (00100100) */
2310     {1, 3, 6, 0, 0, 0, 0, 0}, /* 0x25 (00100101) */
2311     {2, 3, 6, 0, 0, 0, 0, 0}, /* 0x26 (00100110) */
2312     {1, 2, 3, 6, 0, 0, 0, 0}, /* 0x27 (00100111) */
2313     {4, 6, 0, 0, 0, 0, 0, 0}, /* 0x28 (00101000) */
2314     {1, 4, 6, 0, 0, 0, 0, 0}, /* 0x29 (00101001) */
2315     {2, 4, 6, 0, 0, 0, 0, 0}, /* 0x2A (00101010) */
2316     {1, 2, 4, 6, 0, 0, 0, 0}, /* 0x2B (00101011) */
2317     {3, 4, 6, 0, 0, 0, 0, 0}, /* 0x2C (00101100) */
2318     {1, 3, 4, 6, 0, 0, 0, 0}, /* 0x2D (00101101) */
2319     {2, 3, 4, 6, 0, 0, 0, 0}, /* 0x2E (00101110) */
2320     {1, 2, 3, 4, 6, 0, 0, 0}, /* 0x2F (00101111) */
2321     {5, 6, 0, 0, 0, 0, 0, 0}, /* 0x30 (00110000) */
2322     {1, 5, 6, 0, 0, 0, 0, 0}, /* 0x31 (00110001) */
2323     {2, 5, 6, 0, 0, 0, 0, 0}, /* 0x32 (00110010) */
2324     {1, 2, 5, 6, 0, 0, 0, 0}, /* 0x33 (00110011) */
2325     {3, 5, 6, 0, 0, 0, 0, 0}, /* 0x34 (00110100) */
2326     {1, 3, 5, 6, 0, 0, 0, 0}, /* 0x35 (00110101) */
2327     {2, 3, 5, 6, 0, 0, 0, 0}, /* 0x36 (00110110) */
2328     {1, 2, 3, 5, 6, 0, 0, 0}, /* 0x37 (00110111) */
2329     {4, 5, 6, 0, 0, 0, 0, 0}, /* 0x38 (00111000) */
2330     {1, 4, 5, 6, 0, 0, 0, 0}, /* 0x39 (00111001) */
2331     {2, 4, 5, 6, 0, 0, 0, 0}, /* 0x3A (00111010) */
2332     {1, 2, 4, 5, 6, 0, 0, 0}, /* 0x3B (00111011) */
2333     {3, 4, 5, 6, 0, 0, 0, 0}, /* 0x3C (00111100) */
2334     {1, 3, 4, 5, 6, 0, 0, 0}, /* 0x3D (00111101) */
2335     {2, 3, 4, 5, 6, 0, 0, 0}, /* 0x3E (00111110) */
2336     {1, 2, 3, 4, 5, 6, 0, 0}, /* 0x3F (00111111) */
2337     {7, 0, 0, 0, 0, 0, 0, 0}, /* 0x40 (01000000) */
2338     {1, 7, 0, 0, 0, 0, 0, 0}, /* 0x41 (01000001) */
2339     {2, 7, 0, 0, 0, 0, 0, 0}, /* 0x42 (01000010) */
2340     {1, 2, 7, 0, 0, 0, 0, 0}, /* 0x43 (01000011) */
2341     {3, 7, 0, 0, 0, 0, 0, 0}, /* 0x44 (01000100) */
2342     {1, 3, 7, 0, 0, 0, 0, 0}, /* 0x45 (01000101) */
2343     {2, 3, 7, 0, 0, 0, 0, 0}, /* 0x46 (01000110) */
2344     {1, 2, 3, 7, 0, 0, 0, 0}, /* 0x47 (01000111) */
2345     {4, 7, 0, 0, 0, 0, 0, 0}, /* 0x48 (01001000) */
2346     {1, 4, 7, 0, 0, 0, 0, 0}, /* 0x49 (01001001) */
2347     {2, 4, 7, 0, 0, 0, 0, 0}, /* 0x4A (01001010) */
2348     {1, 2, 4, 7, 0, 0, 0, 0}, /* 0x4B (01001011) */
2349     {3, 4, 7, 0, 0, 0, 0, 0}, /* 0x4C (01001100) */
2350     {1, 3, 4, 7, 0, 0, 0, 0}, /* 0x4D (01001101) */
2351     {2, 3, 4, 7, 0, 0, 0, 0}, /* 0x4E (01001110) */
2352     {1, 2, 3, 4, 7, 0, 0, 0}, /* 0x4F (01001111) */
2353     {5, 7, 0, 0, 0, 0, 0, 0}, /* 0x50 (01010000) */
2354     {1, 5, 7, 0, 0, 0, 0, 0}, /* 0x51 (01010001) */
2355     {2, 5, 7, 0, 0, 0, 0, 0}, /* 0x52 (01010010) */
2356     {1, 2, 5, 7, 0, 0, 0, 0}, /* 0x53 (01010011) */
2357     {3, 5, 7, 0, 0, 0, 0, 0}, /* 0x54 (01010100) */
2358     {1, 3, 5, 7, 0, 0, 0, 0}, /* 0x55 (01010101) */
2359     {2, 3, 5, 7, 0, 0, 0, 0}, /* 0x56 (01010110) */
2360     {1, 2, 3, 5, 7, 0, 0, 0}, /* 0x57 (01010111) */
2361     {4, 5, 7, 0, 0, 0, 0, 0}, /* 0x58 (01011000) */
2362     {1, 4, 5, 7, 0, 0, 0, 0}, /* 0x59 (01011001) */
2363     {2, 4, 5, 7, 0, 0, 0, 0}, /* 0x5A (01011010) */
2364     {1, 2, 4, 5, 7, 0, 0, 0}, /* 0x5B (01011011) */
2365     {3, 4, 5, 7, 0, 0, 0, 0}, /* 0x5C (01011100) */
2366     {1, 3, 4, 5, 7, 0, 0, 0}, /* 0x5D (01011101) */
2367     {2, 3, 4, 5, 7, 0, 0, 0}, /* 0x5E (01011110) */
2368     {1, 2, 3, 4, 5, 7, 0, 0}, /* 0x5F (01011111) */
2369     {6, 7, 0, 0, 0, 0, 0, 0}, /* 0x60 (01100000) */
2370     {1, 6, 7, 0, 0, 0, 0, 0}, /* 0x61 (01100001) */
2371     {2, 6, 7, 0, 0, 0, 0, 0}, /* 0x62 (01100010) */
2372     {1, 2, 6, 7, 0, 0, 0, 0}, /* 0x63 (01100011) */
2373     {3, 6, 7, 0, 0, 0, 0, 0}, /* 0x64 (01100100) */
2374     {1, 3, 6, 7, 0, 0, 0, 0}, /* 0x65 (01100101) */
2375     {2, 3, 6, 7, 0, 0, 0, 0}, /* 0x66 (01100110) */
2376     {1, 2, 3, 6, 7, 0, 0, 0}, /* 0x67 (01100111) */
2377     {4, 6, 7, 0, 0, 0, 0, 0}, /* 0x68 (01101000) */
2378     {1, 4, 6, 7, 0, 0, 0, 0}, /* 0x69 (01101001) */
2379     {2, 4, 6, 7, 0, 0, 0, 0}, /* 0x6A (01101010) */
2380     {1, 2, 4, 6, 7, 0, 0, 0}, /* 0x6B (01101011) */
2381     {3, 4, 6, 7, 0, 0, 0, 0}, /* 0x6C (01101100) */
2382     {1, 3, 4, 6, 7, 0, 0, 0}, /* 0x6D (01101101) */
2383     {2, 3, 4, 6, 7, 0, 0, 0}, /* 0x6E (01101110) */
2384     {1, 2, 3, 4, 6, 7, 0, 0}, /* 0x6F (01101111) */
2385     {5, 6, 7, 0, 0, 0, 0, 0}, /* 0x70 (01110000) */
2386     {1, 5, 6, 7, 0, 0, 0, 0}, /* 0x71 (01110001) */
2387     {2, 5, 6, 7, 0, 0, 0, 0}, /* 0x72 (01110010) */
2388     {1, 2, 5, 6, 7, 0, 0, 0}, /* 0x73 (01110011) */
2389     {3, 5, 6, 7, 0, 0, 0, 0}, /* 0x74 (01110100) */
2390     {1, 3, 5, 6, 7, 0, 0, 0}, /* 0x75 (01110101) */
2391     {2, 3, 5, 6, 7, 0, 0, 0}, /* 0x76 (01110110) */
2392     {1, 2, 3, 5, 6, 7, 0, 0}, /* 0x77 (01110111) */
2393     {4, 5, 6, 7, 0, 0, 0, 0}, /* 0x78 (01111000) */
2394     {1, 4, 5, 6, 7, 0, 0, 0}, /* 0x79 (01111001) */
2395     {2, 4, 5, 6, 7, 0, 0, 0}, /* 0x7A (01111010) */
2396     {1, 2, 4, 5, 6, 7, 0, 0}, /* 0x7B (01111011) */
2397     {3, 4, 5, 6, 7, 0, 0, 0}, /* 0x7C (01111100) */
2398     {1, 3, 4, 5, 6, 7, 0, 0}, /* 0x7D (01111101) */
2399     {2, 3, 4, 5, 6, 7, 0, 0}, /* 0x7E (01111110) */
2400     {1, 2, 3, 4, 5, 6, 7, 0}, /* 0x7F (01111111) */
2401     {8, 0, 0, 0, 0, 0, 0, 0}, /* 0x80 (10000000) */
2402     {1, 8, 0, 0, 0, 0, 0, 0}, /* 0x81 (10000001) */
2403     {2, 8, 0, 0, 0, 0, 0, 0}, /* 0x82 (10000010) */
2404     {1, 2, 8, 0, 0, 0, 0, 0}, /* 0x83 (10000011) */
2405     {3, 8, 0, 0, 0, 0, 0, 0}, /* 0x84 (10000100) */
2406     {1, 3, 8, 0, 0, 0, 0, 0}, /* 0x85 (10000101) */
2407     {2, 3, 8, 0, 0, 0, 0, 0}, /* 0x86 (10000110) */
2408     {1, 2, 3, 8, 0, 0, 0, 0}, /* 0x87 (10000111) */
2409     {4, 8, 0, 0, 0, 0, 0, 0}, /* 0x88 (10001000) */
2410     {1, 4, 8, 0, 0, 0, 0, 0}, /* 0x89 (10001001) */
2411     {2, 4, 8, 0, 0, 0, 0, 0}, /* 0x8A (10001010) */
2412     {1, 2, 4, 8, 0, 0, 0, 0}, /* 0x8B (10001011) */
2413     {3, 4, 8, 0, 0, 0, 0, 0}, /* 0x8C (10001100) */
2414     {1, 3, 4, 8, 0, 0, 0, 0}, /* 0x8D (10001101) */
2415     {2, 3, 4, 8, 0, 0, 0, 0}, /* 0x8E (10001110) */
2416     {1, 2, 3, 4, 8, 0, 0, 0}, /* 0x8F (10001111) */
2417     {5, 8, 0, 0, 0, 0, 0, 0}, /* 0x90 (10010000) */
2418     {1, 5, 8, 0, 0, 0, 0, 0}, /* 0x91 (10010001) */
2419     {2, 5, 8, 0, 0, 0, 0, 0}, /* 0x92 (10010010) */
2420     {1, 2, 5, 8, 0, 0, 0, 0}, /* 0x93 (10010011) */
2421     {3, 5, 8, 0, 0, 0, 0, 0}, /* 0x94 (10010100) */
2422     {1, 3, 5, 8, 0, 0, 0, 0}, /* 0x95 (10010101) */
2423     {2, 3, 5, 8, 0, 0, 0, 0}, /* 0x96 (10010110) */
2424     {1, 2, 3, 5, 8, 0, 0, 0}, /* 0x97 (10010111) */
2425     {4, 5, 8, 0, 0, 0, 0, 0}, /* 0x98 (10011000) */
2426     {1, 4, 5, 8, 0, 0, 0, 0}, /* 0x99 (10011001) */
2427     {2, 4, 5, 8, 0, 0, 0, 0}, /* 0x9A (10011010) */
2428     {1, 2, 4, 5, 8, 0, 0, 0}, /* 0x9B (10011011) */
2429     {3, 4, 5, 8, 0, 0, 0, 0}, /* 0x9C (10011100) */
2430     {1, 3, 4, 5, 8, 0, 0, 0}, /* 0x9D (10011101) */
2431     {2, 3, 4, 5, 8, 0, 0, 0}, /* 0x9E (10011110) */
2432     {1, 2, 3, 4, 5, 8, 0, 0}, /* 0x9F (10011111) */
2433     {6, 8, 0, 0, 0, 0, 0, 0}, /* 0xA0 (10100000) */
2434     {1, 6, 8, 0, 0, 0, 0, 0}, /* 0xA1 (10100001) */
2435     {2, 6, 8, 0, 0, 0, 0, 0}, /* 0xA2 (10100010) */
2436     {1, 2, 6, 8, 0, 0, 0, 0}, /* 0xA3 (10100011) */
2437     {3, 6, 8, 0, 0, 0, 0, 0}, /* 0xA4 (10100100) */
2438     {1, 3, 6, 8, 0, 0, 0, 0}, /* 0xA5 (10100101) */
2439     {2, 3, 6, 8, 0, 0, 0, 0}, /* 0xA6 (10100110) */
2440     {1, 2, 3, 6, 8, 0, 0, 0}, /* 0xA7 (10100111) */
2441     {4, 6, 8, 0, 0, 0, 0, 0}, /* 0xA8 (10101000) */
2442     {1, 4, 6, 8, 0, 0, 0, 0}, /* 0xA9 (10101001) */
2443     {2, 4, 6, 8, 0, 0, 0, 0}, /* 0xAA (10101010) */
2444     {1, 2, 4, 6, 8, 0, 0, 0}, /* 0xAB (10101011) */
2445     {3, 4, 6, 8, 0, 0, 0, 0}, /* 0xAC (10101100) */
2446     {1, 3, 4, 6, 8, 0, 0, 0}, /* 0xAD (10101101) */
2447     {2, 3, 4, 6, 8, 0, 0, 0}, /* 0xAE (10101110) */
2448     {1, 2, 3, 4, 6, 8, 0, 0}, /* 0xAF (10101111) */
2449     {5, 6, 8, 0, 0, 0, 0, 0}, /* 0xB0 (10110000) */
2450     {1, 5, 6, 8, 0, 0, 0, 0}, /* 0xB1 (10110001) */
2451     {2, 5, 6, 8, 0, 0, 0, 0}, /* 0xB2 (10110010) */
2452     {1, 2, 5, 6, 8, 0, 0, 0}, /* 0xB3 (10110011) */
2453     {3, 5, 6, 8, 0, 0, 0, 0}, /* 0xB4 (10110100) */
2454     {1, 3, 5, 6, 8, 0, 0, 0}, /* 0xB5 (10110101) */
2455     {2, 3, 5, 6, 8, 0, 0, 0}, /* 0xB6 (10110110) */
2456     {1, 2, 3, 5, 6, 8, 0, 0}, /* 0xB7 (10110111) */
2457     {4, 5, 6, 8, 0, 0, 0, 0}, /* 0xB8 (10111000) */
2458     {1, 4, 5, 6, 8, 0, 0, 0}, /* 0xB9 (10111001) */
2459     {2, 4, 5, 6, 8, 0, 0, 0}, /* 0xBA (10111010) */
2460     {1, 2, 4, 5, 6, 8, 0, 0}, /* 0xBB (10111011) */
2461     {3, 4, 5, 6, 8, 0, 0, 0}, /* 0xBC (10111100) */
2462     {1, 3, 4, 5, 6, 8, 0, 0}, /* 0xBD (10111101) */
2463     {2, 3, 4, 5, 6, 8, 0, 0}, /* 0xBE (10111110) */
2464     {1, 2, 3, 4, 5, 6, 8, 0}, /* 0xBF (10111111) */
2465     {7, 8, 0, 0, 0, 0, 0, 0}, /* 0xC0 (11000000) */
2466     {1, 7, 8, 0, 0, 0, 0, 0}, /* 0xC1 (11000001) */
2467     {2, 7, 8, 0, 0, 0, 0, 0}, /* 0xC2 (11000010) */
2468     {1, 2, 7, 8, 0, 0, 0, 0}, /* 0xC3 (11000011) */
2469     {3, 7, 8, 0, 0, 0, 0, 0}, /* 0xC4 (11000100) */
2470     {1, 3, 7, 8, 0, 0, 0, 0}, /* 0xC5 (11000101) */
2471     {2, 3, 7, 8, 0, 0, 0, 0}, /* 0xC6 (11000110) */
2472     {1, 2, 3, 7, 8, 0, 0, 0}, /* 0xC7 (11000111) */
2473     {4, 7, 8, 0, 0, 0, 0, 0}, /* 0xC8 (11001000) */
2474     {1, 4, 7, 8, 0, 0, 0, 0}, /* 0xC9 (11001001) */
2475     {2, 4, 7, 8, 0, 0, 0, 0}, /* 0xCA (11001010) */
2476     {1, 2, 4, 7, 8, 0, 0, 0}, /* 0xCB (11001011) */
2477     {3, 4, 7, 8, 0, 0, 0, 0}, /* 0xCC (11001100) */
2478     {1, 3, 4, 7, 8, 0, 0, 0}, /* 0xCD (11001101) */
2479     {2, 3, 4, 7, 8, 0, 0, 0}, /* 0xCE (11001110) */
2480     {1, 2, 3, 4, 7, 8, 0, 0}, /* 0xCF (11001111) */
2481     {5, 7, 8, 0, 0, 0, 0, 0}, /* 0xD0 (11010000) */
2482     {1, 5, 7, 8, 0, 0, 0, 0}, /* 0xD1 (11010001) */
2483     {2, 5, 7, 8, 0, 0, 0, 0}, /* 0xD2 (11010010) */
2484     {1, 2, 5, 7, 8, 0, 0, 0}, /* 0xD3 (11010011) */
2485     {3, 5, 7, 8, 0, 0, 0, 0}, /* 0xD4 (11010100) */
2486     {1, 3, 5, 7, 8, 0, 0, 0}, /* 0xD5 (11010101) */
2487     {2, 3, 5, 7, 8, 0, 0, 0}, /* 0xD6 (11010110) */
2488     {1, 2, 3, 5, 7, 8, 0, 0}, /* 0xD7 (11010111) */
2489     {4, 5, 7, 8, 0, 0, 0, 0}, /* 0xD8 (11011000) */
2490     {1, 4, 5, 7, 8, 0, 0, 0}, /* 0xD9 (11011001) */
2491     {2, 4, 5, 7, 8, 0, 0, 0}, /* 0xDA (11011010) */
2492     {1, 2, 4, 5, 7, 8, 0, 0}, /* 0xDB (11011011) */
2493     {3, 4, 5, 7, 8, 0, 0, 0}, /* 0xDC (11011100) */
2494     {1, 3, 4, 5, 7, 8, 0, 0}, /* 0xDD (11011101) */
2495     {2, 3, 4, 5, 7, 8, 0, 0}, /* 0xDE (11011110) */
2496     {1, 2, 3, 4, 5, 7, 8, 0}, /* 0xDF (11011111) */
2497     {6, 7, 8, 0, 0, 0, 0, 0}, /* 0xE0 (11100000) */
2498     {1, 6, 7, 8, 0, 0, 0, 0}, /* 0xE1 (11100001) */
2499     {2, 6, 7, 8, 0, 0, 0, 0}, /* 0xE2 (11100010) */
2500     {1, 2, 6, 7, 8, 0, 0, 0}, /* 0xE3 (11100011) */
2501     {3, 6, 7, 8, 0, 0, 0, 0}, /* 0xE4 (11100100) */
2502     {1, 3, 6, 7, 8, 0, 0, 0}, /* 0xE5 (11100101) */
2503     {2, 3, 6, 7, 8, 0, 0, 0}, /* 0xE6 (11100110) */
2504     {1, 2, 3, 6, 7, 8, 0, 0}, /* 0xE7 (11100111) */
2505     {4, 6, 7, 8, 0, 0, 0, 0}, /* 0xE8 (11101000) */
2506     {1, 4, 6, 7, 8, 0, 0, 0}, /* 0xE9 (11101001) */
2507     {2, 4, 6, 7, 8, 0, 0, 0}, /* 0xEA (11101010) */
2508     {1, 2, 4, 6, 7, 8, 0, 0}, /* 0xEB (11101011) */
2509     {3, 4, 6, 7, 8, 0, 0, 0}, /* 0xEC (11101100) */
2510     {1, 3, 4, 6, 7, 8, 0, 0}, /* 0xED (11101101) */
2511     {2, 3, 4, 6, 7, 8, 0, 0}, /* 0xEE (11101110) */
2512     {1, 2, 3, 4, 6, 7, 8, 0}, /* 0xEF (11101111) */
2513     {5, 6, 7, 8, 0, 0, 0, 0}, /* 0xF0 (11110000) */
2514     {1, 5, 6, 7, 8, 0, 0, 0}, /* 0xF1 (11110001) */
2515     {2, 5, 6, 7, 8, 0, 0, 0}, /* 0xF2 (11110010) */
2516     {1, 2, 5, 6, 7, 8, 0, 0}, /* 0xF3 (11110011) */
2517     {3, 5, 6, 7, 8, 0, 0, 0}, /* 0xF4 (11110100) */
2518     {1, 3, 5, 6, 7, 8, 0, 0}, /* 0xF5 (11110101) */
2519     {2, 3, 5, 6, 7, 8, 0, 0}, /* 0xF6 (11110110) */
2520     {1, 2, 3, 5, 6, 7, 8, 0}, /* 0xF7 (11110111) */
2521     {4, 5, 6, 7, 8, 0, 0, 0}, /* 0xF8 (11111000) */
2522     {1, 4, 5, 6, 7, 8, 0, 0}, /* 0xF9 (11111001) */
2523     {2, 4, 5, 6, 7, 8, 0, 0}, /* 0xFA (11111010) */
2524     {1, 2, 4, 5, 6, 7, 8, 0}, /* 0xFB (11111011) */
2525     {3, 4, 5, 6, 7, 8, 0, 0}, /* 0xFC (11111100) */
2526     {1, 3, 4, 5, 6, 7, 8, 0}, /* 0xFD (11111101) */
2527     {2, 3, 4, 5, 6, 7, 8, 0}, /* 0xFE (11111110) */
2528     {1, 2, 3, 4, 5, 6, 7, 8}  /* 0xFF (11111111) */
2529 };
2530 
2531 #endif
2532 
2533 #ifdef USEAVX
2534 
/*
 * Write the positions of the set bits found in "array" ("length" 64-bit
 * words) to "vout" as 32-bit integers, each position offset by "base".
 *
 * "outcapacity" is the number of 32-bit values "vout" can hold. A vectorized
 * loop handles words for as long as at least 64 output slots remain; the
 * remaining words go through a scalar loop that checks the capacity before
 * every single write.
 *
 * Returns the number of values written.
 */
size_t bitset_extract_setbits_avx2(uint64_t *array, size_t length, void *vout,
                                   size_t outcapacity, uint32_t base) {
    uint32_t *out = (uint32_t *)vout;
    uint32_t *initout = out;
    // Start at base - 1: presumably vecDecodeTable stores 1-based bit
    // positions (the table is defined outside this function) - TODO confirm.
    __m256i baseVec = _mm256_set1_epi32(base - 1);
    __m256i incVec = _mm256_set1_epi32(64);
    __m256i add8 = _mm256_set1_epi32(8);
    uint32_t *safeout = out + outcapacity;
    size_t i = 0;
    // Vector path: only entered while a full word's worth (64 values) fits.
    for (; (i < length) && (out + 64 <= safeout); ++i) {
        uint64_t w = array[i];
        if (w == 0) {
            // Empty word: just move the running base past its 64 bits.
            baseVec = _mm256_add_epi32(baseVec, incVec);
        } else {
            // Four rounds of two bytes each cover the 64-bit word.
            for (int k = 0; k < 4; ++k) {
                uint8_t byteA = (uint8_t)w;
                uint8_t byteB = (uint8_t)(w >> 8);
                w >>= 16;
                __m256i vecA =
                    _mm256_load_si256((const __m256i *)vecDecodeTable[byteA]);
                __m256i vecB =
                    _mm256_load_si256((const __m256i *)vecDecodeTable[byteB]);
                // NOTE(review): lengthTable[b] looks like the number of set
                // bits in b - confirm against its definition.
                uint8_t advanceA = lengthTable[byteA];
                uint8_t advanceB = lengthTable[byteB];
                vecA = _mm256_add_epi32(baseVec, vecA);
                baseVec = _mm256_add_epi32(baseVec, add8);
                vecB = _mm256_add_epi32(baseVec, vecB);
                baseVec = _mm256_add_epi32(baseVec, add8);
                // Each store writes 8 lanes but "out" only advances by the
                // table-provided count, so the next store overwrites the
                // surplus lanes.
                _mm256_storeu_si256((__m256i *)out, vecA);
                out += advanceA;
                _mm256_storeu_si256((__m256i *)out, vecB);
                out += advanceB;
            }
        }
    }
    // Bring the scalar base in line with the words consumed above.
    base += i * 64;
    // Scalar tail: one value at a time, never writing at or past safeout.
    for (; (i < length) && (out < safeout); ++i) {
        uint64_t w = array[i];
        while ((w != 0) && (out < safeout)) {
            uint64_t t = w & (~w + 1); // on x64, should compile to BLSI (careful: the Intel compiler seems to fail)
            int r = __builtin_ctzll(w); // on x64, should compile to TZCNT
            uint32_t val = r + base;
            memcpy(out, &val,
                   sizeof(uint32_t));  // should be compiled as a MOV on x64
            out++;
            w ^= t;
        }
        base += 64;
    }
    return out - initout;
}
2586 #endif  // USEAVX
2587 
/*
 * Write the position of every set bit in "bitset" ("length" 64-bit words)
 * to "vout" as 32-bit integers, each offset by "base". Values are stored
 * with memcpy. No capacity is checked: the caller must provide room for
 * every set bit.
 *
 * Returns the number of values written.
 */
size_t bitset_extract_setbits(uint64_t *bitset, size_t length, void *vout,
                              uint32_t base) {
    uint32_t *dest = (uint32_t *)vout;
    int written = 0;
    const uint64_t *const word_end = bitset + length;

    for (const uint64_t *word = bitset; word != word_end; ++word, base += 64) {
        // Peel off the lowest set bit until the word is exhausted.
        for (uint64_t remaining = *word; remaining != 0;
             remaining &= remaining - 1) {
            const uint32_t position =
                (uint32_t)__builtin_ctzll(remaining) + base;
            memcpy(dest + written, &position, sizeof(position));
            ++written;
        }
    }
    return written;
}
2607 
/*
 * Write the position of every bit set in both "bitset1" and "bitset2"
 * ("length" 64-bit words each) to "out" as 16-bit integers, each offset by
 * "base". No capacity is checked: the caller must provide enough room.
 *
 * Returns the number of values written.
 */
size_t bitset_extract_intersection_setbits_uint16(const uint64_t * __restrict__ bitset1,
                                                  const uint64_t * __restrict__ bitset2,
                                                  size_t length, uint16_t *out,
                                                  uint16_t base) {
    int count = 0;
    for (size_t k = 0; k < length; ++k, base += 64) {
        uint64_t common = bitset1[k] & bitset2[k];
        while (common != 0) {
            out[count] = (uint16_t)(__builtin_ctzll(common) + base);
            ++count;
            common &= common - 1;  // drop the bit just emitted
        }
    }
    return count;
}
2625 
2626 #ifdef IS_X64
2627 /*
2628  * Given a bitset containing "length" 64-bit words, write out the position
2629  * of all the set bits to "out" as 16-bit integers, values start at "base" (can
2630  *be set to zero).
2631  *
2632  * The "out" pointer should be sufficient to store the actual number of bits
2633  *set.
2634  *
2635  * Returns how many values were actually decoded.
2636  *
2637  * This function uses SSE decoding.
2638  */
bitset_extract_setbits_sse_uint16(const uint64_t * bitset,size_t length,uint16_t * out,size_t outcapacity,uint16_t base)2639 size_t bitset_extract_setbits_sse_uint16(const uint64_t *bitset, size_t length,
2640                                          uint16_t *out, size_t outcapacity,
2641                                          uint16_t base) {
2642     uint16_t *initout = out;
2643     __m128i baseVec = _mm_set1_epi16(base - 1);
2644     __m128i incVec = _mm_set1_epi16(64);
2645     __m128i add8 = _mm_set1_epi16(8);
2646     uint16_t *safeout = out + outcapacity;
2647     const int numberofbytes = 2;  // process two bytes at a time
2648     size_t i = 0;
2649     for (; (i < length) && (out + numberofbytes * 8 <= safeout); ++i) {
2650         uint64_t w = bitset[i];
2651         if (w == 0) {
2652             baseVec = _mm_add_epi16(baseVec, incVec);
2653         } else {
2654             for (int k = 0; k < 4; ++k) {
2655                 uint8_t byteA = (uint8_t)w;
2656                 uint8_t byteB = (uint8_t)(w >> 8);
2657                 w >>= 16;
2658                 __m128i vecA = _mm_load_si128(
2659                     (const __m128i *)vecDecodeTable_uint16[byteA]);
2660                 __m128i vecB = _mm_load_si128(
2661                     (const __m128i *)vecDecodeTable_uint16[byteB]);
2662                 uint8_t advanceA = lengthTable[byteA];
2663                 uint8_t advanceB = lengthTable[byteB];
2664                 vecA = _mm_add_epi16(baseVec, vecA);
2665                 baseVec = _mm_add_epi16(baseVec, add8);
2666                 vecB = _mm_add_epi16(baseVec, vecB);
2667                 baseVec = _mm_add_epi16(baseVec, add8);
2668                 _mm_storeu_si128((__m128i *)out, vecA);
2669                 out += advanceA;
2670                 _mm_storeu_si128((__m128i *)out, vecB);
2671                 out += advanceB;
2672             }
2673         }
2674     }
2675     base += (uint16_t)(i * 64);
2676     for (; (i < length) && (out < safeout); ++i) {
2677         uint64_t w = bitset[i];
2678         while ((w != 0) && (out < safeout)) {
2679             uint64_t t = w & (~w + 1);
2680             int r = __builtin_ctzll(w);
2681             *out = r + base;
2682             out++;
2683             w ^= t;
2684         }
2685         base += 64;
2686     }
2687     return out - initout;
2688 }
2689 #endif
2690 
/*
 * Scalar decoder: for a bitset of "length" 64-bit words, store the position
 * of each set bit into "out" as a 16-bit integer, offset by "base" (which
 * may be zero).
 *
 * No capacity is checked, so "out" must be able to hold one value per set
 * bit.
 *
 * Returns the number of values stored.
 */
size_t bitset_extract_setbits_uint16(const uint64_t *bitset, size_t length,
                                     uint16_t *out, uint16_t base) {
    int count = 0;
    for (size_t k = 0; k < length; ++k, base += 64) {
        uint64_t remaining = bitset[k];
        while (remaining != 0) {
            out[count] = (uint16_t)(__builtin_ctzll(remaining) + base);
            ++count;
            remaining &= remaining - 1;  // drop the bit just emitted
        }
    }
    return count;
}
2715 
2716 #if defined(ASMBITMANIPOPTIMIZATION)
2717 
/*
 * Set, in "bitset", every bit whose index appears in "list" ("length"
 * 16-bit entries) and keep a running cardinality: "card" is incremented for
 * each bit that was clear before being set.
 *
 * Returns the updated cardinality ("card" unchanged when "length" is zero).
 *
 * x86-64 inline assembly version (uses shrx, i.e. requires BMI2).
 */
uint64_t bitset_set_list_withcard(void *bitset, uint64_t card,
                                  const uint16_t *list, uint64_t length) {
    uint64_t offset, load, pos;
    uint64_t shift = 6;
    const uint16_t *end = list + length;
    // The asm loop tests for the end only after processing an entry.
    if (!length) return card;
    // TODO: could unroll for performance, see bitset_set_list
    // bts is not available as an intrinsic in GCC
    //
    // Per entry: load the 16-bit index, shift it right by 6 to get the word
    // index, set the bit with bts (CF receives the previous bit value), and
    // "sbb $-1" then adds 1 - CF to card.
    //
    // The asm reads "list" and writes the bitset through pointers that are
    // not listed as memory operands, hence the "memory" clobber: without it
    // the compiler may keep stale bitset values in registers or reorder
    // surrounding memory accesses across the statement.
    __asm volatile(
        "1:\n"
        "movzwq (%[list]), %[pos]\n"
        "shrx %[shift], %[pos], %[offset]\n"
        "mov (%[bitset],%[offset],8), %[load]\n"
        "bts %[pos], %[load]\n"
        "mov %[load], (%[bitset],%[offset],8)\n"
        "sbb $-1, %[card]\n"
        "add $2, %[list]\n"
        "cmp %[list], %[end]\n"
        "jnz 1b"
        : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load),
          [pos] "=&r"(pos), [offset] "=&r"(offset)
        : [end] "r"(end), [bitset] "r"(bitset), [shift] "r"(shift)
        :
        /* clobbers */ "memory");
    return card;
}
2742 
/*
 * Set, in "bitset", every bit whose index appears in "list" ("length"
 * 16-bit entries). No cardinality is tracked.
 *
 * x86-64 inline assembly version (uses shrx, i.e. requires BMI2). Each asm
 * statement computes the word index (pos >> 6), loads that word, sets the
 * bit with bts and stores the word back.
 *
 * Every statement writes the bitset through a pointer that is not listed as
 * a memory operand, hence the "memory" clobber: without it the compiler may
 * keep stale bitset values in registers or reorder surrounding memory
 * accesses across the statement.
 */
void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length) {
    uint64_t pos;
    const uint16_t *end = list + length;

    uint64_t shift = 6;
    uint64_t offset;
    uint64_t load;
    // Main loop, unrolled by four. The remaining-entry count is compared
    // instead of forming "list + 3", which could point beyond one past the
    // end of the list.
    for (; end - list >= 4; list += 4) {
        pos = list[0];
        __asm volatile(
            "shrx %[shift], %[pos], %[offset]\n"
            "mov (%[bitset],%[offset],8), %[load]\n"
            "bts %[pos], %[load]\n"
            "mov %[load], (%[bitset],%[offset],8)"
            : [load] "=&r"(load), [offset] "=&r"(offset)
            : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos)
            : "memory");
        pos = list[1];
        __asm volatile(
            "shrx %[shift], %[pos], %[offset]\n"
            "mov (%[bitset],%[offset],8), %[load]\n"
            "bts %[pos], %[load]\n"
            "mov %[load], (%[bitset],%[offset],8)"
            : [load] "=&r"(load), [offset] "=&r"(offset)
            : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos)
            : "memory");
        pos = list[2];
        __asm volatile(
            "shrx %[shift], %[pos], %[offset]\n"
            "mov (%[bitset],%[offset],8), %[load]\n"
            "bts %[pos], %[load]\n"
            "mov %[load], (%[bitset],%[offset],8)"
            : [load] "=&r"(load), [offset] "=&r"(offset)
            : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos)
            : "memory");
        pos = list[3];
        __asm volatile(
            "shrx %[shift], %[pos], %[offset]\n"
            "mov (%[bitset],%[offset],8), %[load]\n"
            "bts %[pos], %[load]\n"
            "mov %[load], (%[bitset],%[offset],8)"
            : [load] "=&r"(load), [offset] "=&r"(offset)
            : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos)
            : "memory");
    }

    // Tail: at most three entries left.
    while (list != end) {
        pos = list[0];
        __asm volatile(
            "shrx %[shift], %[pos], %[offset]\n"
            "mov (%[bitset],%[offset],8), %[load]\n"
            "bts %[pos], %[load]\n"
            "mov %[load], (%[bitset],%[offset],8)"
            : [load] "=&r"(load), [offset] "=&r"(offset)
            : [bitset] "r"(bitset), [shift] "r"(shift), [pos] "r"(pos)
            : "memory");
        list++;
    }
}
2797 
/*
 * Clear, in "bitset", every bit whose index appears in "list" ("length"
 * 16-bit entries) and keep a running cardinality: "card" is decremented for
 * each bit that was set before being cleared.
 *
 * Returns the updated cardinality ("card" unchanged when "length" is zero).
 *
 * x86-64 inline assembly version (uses shrx, i.e. requires BMI2).
 */
uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list,
                           uint64_t length) {
    uint64_t offset, load, pos;
    uint64_t shift = 6;
    const uint16_t *end = list + length;
    // The asm loop tests for the end only after processing an entry.
    if (!length) return card;
    // btr is not available as an intrinsic in GCC
    //
    // Per entry: load the 16-bit index, shift it right by 6 to get the word
    // index, clear the bit with btr (CF receives the previous bit value), and
    // "sbb $0" then subtracts CF from card.
    __asm volatile(
        "1:\n"
        "movzwq (%[list]), %[pos]\n"
        "shrx %[shift], %[pos], %[offset]\n"
        "mov (%[bitset],%[offset],8), %[load]\n"
        "btr %[pos], %[load]\n"
        "mov %[load], (%[bitset],%[offset],8)\n"
        "sbb $0, %[card]\n"
        "add $2, %[list]\n"
        "cmp %[list], %[end]\n"
        "jnz 1b"
        : [card] "+&r"(card), [list] "+&r"(list), [load] "=&r"(load),
          [pos] "=&r"(pos), [offset] "=&r"(offset)
        : [end] "r"(end), [bitset] "r"(bitset), [shift] "r"(shift)
        :
        /* clobbers */ "memory");
    return card;
}
2823 
2824 #else
/*
 * Clear, in "bitset", every bit whose index appears in "list" ("length"
 * 16-bit entries) and keep a running cardinality: "card" is decremented for
 * each bit that was set before being cleared.
 *
 * Returns the updated cardinality.
 */
uint64_t bitset_clear_list(void *bitset, uint64_t card, const uint16_t *list,
                           uint64_t length) {
    uint64_t *words = (uint64_t *)bitset;
    for (uint64_t k = 0; k < length; ++k) {
        const uint64_t value = list[k];
        const uint64_t mask = UINT64_C(1) << (value & 63);
        uint64_t *word = &words[value >> 6];
        const uint64_t before = *word;
        card -= (before & mask) != 0;  // branch-free: 1 only if the bit was set
        *word = before & ~mask;
    }
    return card;
}
2841 
/*
 * Set, in "bitset", every bit whose index appears in "list" ("length"
 * 16-bit entries) and keep a running cardinality: "card" is incremented for
 * each bit that was clear before being set.
 *
 * Returns the updated cardinality.
 */
uint64_t bitset_set_list_withcard(void *bitset, uint64_t card,
                                  const uint16_t *list, uint64_t length) {
    uint64_t *words = (uint64_t *)bitset;
    for (uint64_t k = 0; k < length; ++k) {
        const uint64_t value = list[k];
        const uint64_t mask = UINT64_C(1) << (value & 63);
        uint64_t *word = &words[value >> 6];
        const uint64_t before = *word;
        card += (before & mask) == 0;  // branch-free: 1 only if the bit was clear
        *word = before | mask;
    }
    return card;
}
2858 
/*
 * Set, in "bitset", every bit whose index appears in "list" ("length"
 * 16-bit entries). No cardinality is tracked.
 */
void bitset_set_list(void *bitset, const uint16_t *list, uint64_t length) {
    uint64_t *words = (uint64_t *)bitset;
    for (uint64_t k = 0; k < length; ++k) {
        const uint64_t value = list[k];
        // Word index is value / 64, bit index within the word is value % 64.
        words[value >> 6] |= UINT64_C(1) << (value & 63);
    }
}
2872 
2873 #endif
2874 
2875 /* flip specified bits */
2876 /* TODO: consider whether worthwhile to make an asm version */
2877 
/*
 * Flip, in "bitset", every bit whose index appears in "list" ("length"
 * 16-bit entries) and keep a running cardinality: "card" goes up by one when
 * a clear bit becomes set and down by one when a set bit becomes clear.
 *
 * Returns the updated cardinality.
 */
uint64_t bitset_flip_list_withcard(void *bitset, uint64_t card,
                                   const uint16_t *list, uint64_t length) {
    uint64_t *words = (uint64_t *)bitset;
    for (uint64_t k = 0; k < length; ++k) {
        const uint64_t value = list[k];
        const uint64_t bit = value & 63;
        uint64_t *word = &words[value >> 6];
        const uint64_t before = *word;
        const uint64_t was_set = (before >> bit) & 1;
        // Branch-free +1 / -1: the unsigned wrap-around of 1 - 2 acts as -1.
        card += UINT64_C(1) - 2 * was_set;
        *word = before ^ (UINT64_C(1) << bit);
    }
    return card;
}
2896 
/*
 * Flip, in "bitset", every bit whose index appears in "list" ("length"
 * 16-bit entries). An index listed twice is flipped twice. No cardinality is
 * tracked.
 */
void bitset_flip_list(void *bitset, const uint16_t *list, uint64_t length) {
    uint64_t *words = (uint64_t *)bitset;
    for (uint64_t k = 0; k < length; ++k) {
        const uint64_t value = list[k];
        // Word index is value / 64, bit index within the word is value % 64.
        words[value >> 6] ^= UINT64_C(1) << (value & 63);
    }
}
2910 /* end file src/bitset_util.c */
2911 /* begin file src/containers/array.c */
2912 /*
2913  * array.c
2914  *
2915  */
2916 
2917 #include <assert.h>
2918 #include <stdio.h>
2919 #include <stdlib.h>
2920 
2921 /* Create a new array with capacity size. Return NULL in case of failure. */
array_container_create_given_capacity(int32_t size)2922 array_container_t *array_container_create_given_capacity(int32_t size) {
2923     array_container_t *container;
2924 
2925     container = (array_container_t *)malloc(sizeof(array_container_t));
2926     assert (container);
2927 
2928     if( size <= 0 ) { // we don't want to rely on malloc(0)
2929         container->array = NULL;
2930     } else {
2931         container->array = (uint16_t *)malloc(sizeof(uint16_t) * size);
2932         assert (container->array);
2933     }
2934 
2935     container->capacity = size;
2936     container->cardinality = 0;
2937 
2938     return container;
2939 }
2940 
/*
 * Create a new array container with the default initial capacity
 * (ARRAY_DEFAULT_INIT_SIZE). The result is whatever
 * array_container_create_given_capacity() returns for that capacity.
 */
array_container_t *array_container_create(void) {
    return array_container_create_given_capacity(ARRAY_DEFAULT_INIT_SIZE);
}
2945 
2946 /* Create a new array containing all values in [min,max). */
array_container_create_range(uint32_t min,uint32_t max)2947 array_container_t * array_container_create_range(uint32_t min, uint32_t max) {
2948     array_container_t * answer = array_container_create_given_capacity(max - min + 1);
2949     if(answer == NULL) return answer;
2950     answer->cardinality = 0;
2951     for(uint32_t k = min; k < max; k++) {
2952       answer->array[answer->cardinality++] = k;
2953     }
2954     return answer;
2955 }
2956 
2957 /* Duplicate container */
array_container_clone(const array_container_t * src)2958 array_container_t *array_container_clone(const array_container_t *src) {
2959     array_container_t *newcontainer =
2960         array_container_create_given_capacity(src->capacity);
2961     if (newcontainer == NULL) return NULL;
2962 
2963     newcontainer->cardinality = src->cardinality;
2964 
2965     memcpy(newcontainer->array, src->array,
2966            src->cardinality * sizeof(uint16_t));
2967 
2968     return newcontainer;
2969 }
2970 
/*
 * Release the unused part of the value buffer so that the capacity equals
 * the cardinality. An empty container ends up with no buffer at all (NULL
 * array, capacity 0).
 *
 * Returns the number of value slots given back. Returns 0 when there was
 * nothing to release, or when the reallocation failed; in the failure case
 * the container is left untouched and keeps its values.
 */
int array_container_shrink_to_fit(array_container_t *src) {
    if (src->cardinality == src->capacity) return 0;  // nothing to do
    int savings = src->capacity - src->cardinality;
    if (src->cardinality == 0) { // we do not want to rely on realloc for zero allocs
        free(src->array);
        src->array = NULL;
        src->capacity = 0;
    } else {
        // A failed realloc leaves the original block valid: keep it rather
        // than dropping the values the container still claims to hold.
        uint16_t *shrunk =
            (uint16_t *)realloc(src->array, src->cardinality * sizeof(uint16_t));
        if (shrunk == NULL) return 0;
        src->array = shrunk;
        src->capacity = src->cardinality;
    }
    return savings;
}
2986 
2987 /* Free memory. */
array_container_free(array_container_t * arr)2988 void array_container_free(array_container_t *arr) {
2989     if(arr->array != NULL) {// Jon Strabala reports that some tools complain otherwise
2990       free(arr->array);
2991       arr->array = NULL; // pedantic
2992     }
2993     free(arr);
2994 }
2995 
grow_capacity(int32_t capacity)2996 static inline int32_t grow_capacity(int32_t capacity) {
2997     return (capacity <= 0) ? ARRAY_DEFAULT_INIT_SIZE
2998                            : capacity < 64 ? capacity * 2
2999                                            : capacity < 1024 ? capacity * 3 / 2
3000                                                              : capacity * 5 / 4;
3001 }
3002 
/*
 * Limit "val" to the interval [min, max]. The lower bound is tested first,
 * so it wins if the bounds are given in the wrong order and "val" is below
 * "min".
 */
static inline int32_t clamp(int32_t val, int32_t min, int32_t max) {
    if (val < min) {
        return min;
    }
    if (val > max) {
        return max;
    }
    return val;
}
3006 
/*
 * Enlarge the value buffer of "container" so that it holds at least "min"
 * values. The new capacity is the next growth step, clamped between "min"
 * and an upper bound (DEFAULT_MAX_SIZE, or 65536 when "min" itself exceeds
 * DEFAULT_MAX_SIZE).
 *
 * With "preserve" set the existing values are kept (realloc); otherwise the
 * old buffer is discarded and a fresh one is allocated. An allocation failure
 * is reported on stderr and trips an assert.
 */
void array_container_grow(array_container_t *container, int32_t min,
                          bool preserve) {

    const int32_t upper = (min > DEFAULT_MAX_SIZE) ? 65536 : DEFAULT_MAX_SIZE;
    const int32_t wanted =
        clamp(grow_capacity(container->capacity), min, upper);
    uint16_t *previous = container->array;
    uint16_t *fresh;

    container->capacity = wanted;

    if (!preserve) {
        // Jon Strabala reports that some tools complain otherwise
        if (previous != NULL) {
            free(previous);
        }
        fresh = (uint16_t *)malloc(wanted * sizeof(uint16_t));
    } else {
        fresh = (uint16_t *)realloc(previous, wanted * sizeof(uint16_t));
        if (fresh == NULL) {
            free(previous);
        }
    }
    container->array = fresh;

    // handle the case where the allocation failed
    if (fresh == NULL) {
        fprintf(stderr, "could not allocate memory\n");
    }
    assert(container->array != NULL);
}
3034 
3035 /* Copy one container into another. We assume that they are distinct. */
array_container_copy(const array_container_t * src,array_container_t * dst)3036 void array_container_copy(const array_container_t *src,
3037                           array_container_t *dst) {
3038     const int32_t cardinality = src->cardinality;
3039     if (cardinality > dst->capacity) {
3040         array_container_grow(dst, cardinality, false);
3041     }
3042 
3043     dst->cardinality = cardinality;
3044     memcpy(dst->array, src->array, cardinality * sizeof(uint16_t));
3045 }
3046 
/*
 * Append min, min + step, min + 2 * step, ... to "arr" for as long as the
 * value stays below "max".
 */
void array_container_add_from_range(array_container_t *arr, uint32_t min,
                                    uint32_t max, uint16_t step) {
    uint32_t value = min;
    while (value < max) {
        array_container_append(arr, value);
        value += step;
    }
}
3053 
3054 /* Computes the union of array1 and array2 and write the result to arrayout.
3055  * It is assumed that arrayout is distinct from both array1 and array2.
3056  */
array_container_union(const array_container_t * array_1,const array_container_t * array_2,array_container_t * out)3057 void array_container_union(const array_container_t *array_1,
3058                            const array_container_t *array_2,
3059                            array_container_t *out) {
3060     const int32_t card_1 = array_1->cardinality, card_2 = array_2->cardinality;
3061     const int32_t max_cardinality = card_1 + card_2;
3062 
3063     if (out->capacity < max_cardinality) {
3064       array_container_grow(out, max_cardinality, false);
3065     }
3066     out->cardinality = (int32_t)fast_union_uint16(array_1->array, card_1,
3067                                       array_2->array, card_2, out->array);
3068 
3069 }
3070 
3071 /* Computes the  difference of array1 and array2 and write the result
3072  * to array out.
3073  * Array out does not need to be distinct from array_1
3074  */
array_container_andnot(const array_container_t * array_1,const array_container_t * array_2,array_container_t * out)3075 void array_container_andnot(const array_container_t *array_1,
3076                             const array_container_t *array_2,
3077                             array_container_t *out) {
3078     if (out->capacity < array_1->cardinality)
3079         array_container_grow(out, array_1->cardinality, false);
3080 #ifdef ROARING_VECTOR_OPERATIONS_ENABLED
3081     if((out != array_1) && (out != array_2)) {
3082       out->cardinality =
3083           difference_vector16(array_1->array, array_1->cardinality,
3084                             array_2->array, array_2->cardinality, out->array);
3085      } else {
3086       out->cardinality =
3087         difference_uint16(array_1->array, array_1->cardinality, array_2->array,
3088                           array_2->cardinality, out->array);
3089      }
3090 #else
3091     out->cardinality =
3092         difference_uint16(array_1->array, array_1->cardinality, array_2->array,
3093                           array_2->cardinality, out->array);
3094 #endif
3095 }
3096 
3097 /* Computes the symmetric difference of array1 and array2 and write the
3098  * result
3099  * to arrayout.
3100  * It is assumed that arrayout is distinct from both array1 and array2.
3101  */
array_container_xor(const array_container_t * array_1,const array_container_t * array_2,array_container_t * out)3102 void array_container_xor(const array_container_t *array_1,
3103                          const array_container_t *array_2,
3104                          array_container_t *out) {
3105     const int32_t card_1 = array_1->cardinality, card_2 = array_2->cardinality;
3106     const int32_t max_cardinality = card_1 + card_2;
3107     if (out->capacity < max_cardinality) {
3108         array_container_grow(out, max_cardinality, false);
3109     }
3110 
3111 #ifdef ROARING_VECTOR_OPERATIONS_ENABLED
3112     out->cardinality =
3113         xor_vector16(array_1->array, array_1->cardinality, array_2->array,
3114                      array_2->cardinality, out->array);
3115 #else
3116     out->cardinality =
3117         xor_uint16(array_1->array, array_1->cardinality, array_2->array,
3118                    array_2->cardinality, out->array);
3119 #endif
3120 }
3121 
/* Returns the smaller of a and b (b when they are equal). */
static inline int32_t minimum_int32(int32_t a, int32_t b) {
    if (a < b) {
        return a;
    }
    return b;
}
3125 
3126 /* computes the intersection of array1 and array2 and write the result to
3127  * arrayout.
3128  * It is assumed that arrayout is distinct from both array1 and array2.
3129  * */
array_container_intersection(const array_container_t * array1,const array_container_t * array2,array_container_t * out)3130 void array_container_intersection(const array_container_t *array1,
3131                                   const array_container_t *array2,
3132                                   array_container_t *out) {
3133     int32_t card_1 = array1->cardinality, card_2 = array2->cardinality,
3134             min_card = minimum_int32(card_1, card_2);
3135     const int threshold = 64;  // subject to tuning
3136 #ifdef USEAVX
3137     if (out->capacity < min_card) {
3138       array_container_grow(out, min_card + sizeof(__m128i) / sizeof(uint16_t),
3139         false);
3140     }
3141 #else
3142     if (out->capacity < min_card) {
3143       array_container_grow(out, min_card, false);
3144     }
3145 #endif
3146 
3147     if (card_1 * threshold < card_2) {
3148         out->cardinality = intersect_skewed_uint16(
3149             array1->array, card_1, array2->array, card_2, out->array);
3150     } else if (card_2 * threshold < card_1) {
3151         out->cardinality = intersect_skewed_uint16(
3152             array2->array, card_2, array1->array, card_1, out->array);
3153     } else {
3154 #ifdef USEAVX
3155         out->cardinality = intersect_vector16(
3156             array1->array, card_1, array2->array, card_2, out->array);
3157 #else
3158         out->cardinality = intersect_uint16(array1->array, card_1,
3159                                             array2->array, card_2, out->array);
3160 #endif
3161     }
3162 }
3163 
3164 /* computes the size of the intersection of array1 and array2
3165  * */
array_container_intersection_cardinality(const array_container_t * array1,const array_container_t * array2)3166 int array_container_intersection_cardinality(const array_container_t *array1,
3167                                              const array_container_t *array2) {
3168     int32_t card_1 = array1->cardinality, card_2 = array2->cardinality;
3169     const int threshold = 64;  // subject to tuning
3170     if (card_1 * threshold < card_2) {
3171         return intersect_skewed_uint16_cardinality(array1->array, card_1,
3172                                                    array2->array, card_2);
3173     } else if (card_2 * threshold < card_1) {
3174         return intersect_skewed_uint16_cardinality(array2->array, card_2,
3175                                                    array1->array, card_1);
3176     } else {
3177 #ifdef USEAVX
3178         return intersect_vector16_cardinality(array1->array, card_1,
3179                                               array2->array, card_2);
3180 #else
3181         return intersect_uint16_cardinality(array1->array, card_1,
3182                                             array2->array, card_2);
3183 #endif
3184     }
3185 }
3186 
array_container_intersect(const array_container_t * array1,const array_container_t * array2)3187 bool array_container_intersect(const array_container_t *array1,
3188                                   const array_container_t *array2) {
3189     int32_t card_1 = array1->cardinality, card_2 = array2->cardinality;
3190     const int threshold = 64;  // subject to tuning
3191     if (card_1 * threshold < card_2) {
3192         return intersect_skewed_uint16_nonempty(
3193             array1->array, card_1, array2->array, card_2);
3194     } else if (card_2 * threshold < card_1) {
3195     	return intersect_skewed_uint16_nonempty(
3196             array2->array, card_2, array1->array, card_1);
3197     } else {
3198     	// we do not bother vectorizing
3199         return intersect_uint16_nonempty(array1->array, card_1,
3200                                             array2->array, card_2);
3201     }
3202 }
3203 
3204 /* computes the intersection of array1 and array2 and write the result to
3205  * array1.
3206  * */
array_container_intersection_inplace(array_container_t * src_1,const array_container_t * src_2)3207 void array_container_intersection_inplace(array_container_t *src_1,
3208                                           const array_container_t *src_2) {
3209     // todo: can any of this be vectorized?
3210     int32_t card_1 = src_1->cardinality, card_2 = src_2->cardinality;
3211     const int threshold = 64;  // subject to tuning
3212     if (card_1 * threshold < card_2) {
3213         src_1->cardinality = intersect_skewed_uint16(
3214             src_1->array, card_1, src_2->array, card_2, src_1->array);
3215     } else if (card_2 * threshold < card_1) {
3216         src_1->cardinality = intersect_skewed_uint16(
3217             src_2->array, card_2, src_1->array, card_1, src_1->array);
3218     } else {
3219         src_1->cardinality = intersect_uint16(
3220             src_1->array, card_1, src_2->array, card_2, src_1->array);
3221     }
3222 }
3223 
array_container_to_uint32_array(void * vout,const array_container_t * cont,uint32_t base)3224 int array_container_to_uint32_array(void *vout, const array_container_t *cont,
3225                                     uint32_t base) {
3226     int outpos = 0;
3227     uint32_t *out = (uint32_t *)vout;
3228     for (int i = 0; i < cont->cardinality; ++i) {
3229         const uint32_t val = base + cont->array[i];
3230         memcpy(out + outpos, &val,
3231                sizeof(uint32_t));  // should be compiled as a MOV on x64
3232         outpos++;
3233     }
3234     return outpos;
3235 }
3236 
array_container_printf(const array_container_t * v)3237 void array_container_printf(const array_container_t *v) {
3238     if (v->cardinality == 0) {
3239         printf("{}");
3240         return;
3241     }
3242     printf("{");
3243     printf("%d", v->array[0]);
3244     for (int i = 1; i < v->cardinality; ++i) {
3245         printf(",%d", v->array[i]);
3246     }
3247     printf("}");
3248 }
3249 
array_container_printf_as_uint32_array(const array_container_t * v,uint32_t base)3250 void array_container_printf_as_uint32_array(const array_container_t *v,
3251                                             uint32_t base) {
3252     if (v->cardinality == 0) {
3253         return;
3254     }
3255     printf("%u", v->array[0] + base);
3256     for (int i = 1; i < v->cardinality; ++i) {
3257         printf(",%u", v->array[i] + base);
3258     }
3259 }
3260 
3261 /* Compute the number of runs */
array_container_number_of_runs(const array_container_t * a)3262 int32_t array_container_number_of_runs(const array_container_t *a) {
3263     // Can SIMD work here?
3264     int32_t nr_runs = 0;
3265     int32_t prev = -2;
3266     for (const uint16_t *p = a->array; p != a->array + a->cardinality; ++p) {
3267         if (*p != prev + 1) nr_runs++;
3268         prev = *p;
3269     }
3270     return nr_runs;
3271 }
3272 
array_container_serialize(const array_container_t * container,char * buf)3273 int32_t array_container_serialize(const array_container_t *container, char *buf) {
3274     int32_t l, off;
3275     uint16_t cardinality = (uint16_t)container->cardinality;
3276 
3277     memcpy(buf, &cardinality, off = sizeof(cardinality));
3278     l = sizeof(uint16_t) * container->cardinality;
3279     if (l) memcpy(&buf[off], container->array, l);
3280 
3281     return (off + l);
3282 }
3283 
3284 /**
3285  * Writes the underlying array to buf, outputs how many bytes were written.
3286  * The number of bytes written should be
3287  * array_container_size_in_bytes(container).
3288  *
3289  */
array_container_write(const array_container_t * container,char * buf)3290 int32_t array_container_write(const array_container_t *container, char *buf) {
3291     memcpy(buf, container->array, container->cardinality * sizeof(uint16_t));
3292     return array_container_size_in_bytes(container);
3293 }
3294 
array_container_is_subset(const array_container_t * container1,const array_container_t * container2)3295 bool array_container_is_subset(const array_container_t *container1,
3296                                const array_container_t *container2) {
3297     if (container1->cardinality > container2->cardinality) {
3298         return false;
3299     }
3300     int i1 = 0, i2 = 0;
3301     while (i1 < container1->cardinality && i2 < container2->cardinality) {
3302         if (container1->array[i1] == container2->array[i2]) {
3303             i1++;
3304             i2++;
3305         } else if (container1->array[i1] > container2->array[i2]) {
3306             i2++;
3307         } else {  // container1->array[i1] < container2->array[i2]
3308             return false;
3309         }
3310     }
3311     if (i1 == container1->cardinality) {
3312         return true;
3313     } else {
3314         return false;
3315     }
3316 }
3317 
/*
 * Loads `cardinality` raw uint16_t values from buf into the container,
 * growing it first when needed, and returns
 * array_container_size_in_bytes(container).
 */
int32_t array_container_read(int32_t cardinality, array_container_t *container,
                             const char *buf) {
    if (cardinality > container->capacity) {
        // previous contents are overwritten anyway, no need to preserve them
        array_container_grow(container, cardinality, false);
    }

    container->cardinality = cardinality;
    const size_t payload_bytes = container->cardinality * sizeof(uint16_t);
    memcpy(container->array, buf, payload_bytes);

    return array_container_size_in_bytes(container);
}
3328 
array_container_serialization_len(const array_container_t * container)3329 uint32_t array_container_serialization_len(const array_container_t *container) {
3330     return (sizeof(uint16_t) /* container->cardinality converted to 16 bit */ +
3331             (sizeof(uint16_t) * container->cardinality));
3332 }
3333 
array_container_deserialize(const char * buf,size_t buf_len)3334 void *array_container_deserialize(const char *buf, size_t buf_len) {
3335     array_container_t *ptr;
3336 
3337     if (buf_len < 2) /* capacity converted to 16 bit */
3338         return (NULL);
3339     else
3340         buf_len -= 2;
3341 
3342     if ((ptr = (array_container_t *)malloc(sizeof(array_container_t))) !=
3343         NULL) {
3344         size_t len;
3345         int32_t off;
3346         uint16_t cardinality;
3347 
3348         memcpy(&cardinality, buf, off = sizeof(cardinality));
3349 
3350         ptr->capacity = ptr->cardinality = (uint32_t)cardinality;
3351         len = sizeof(uint16_t) * ptr->cardinality;
3352 
3353         if (len != buf_len) {
3354             free(ptr);
3355             return (NULL);
3356         }
3357 
3358         if ((ptr->array = (uint16_t *)malloc(sizeof(uint16_t) *
3359                                              ptr->capacity)) == NULL) {
3360             free(ptr);
3361             return (NULL);
3362         }
3363 
3364         if (len) memcpy(ptr->array, &buf[off], len);
3365 
3366         /* Check if returned values are monotonically increasing */
3367         for (int32_t i = 0, j = 0; i < ptr->cardinality; i++) {
3368             if (ptr->array[i] < j) {
3369                 free(ptr->array);
3370                 free(ptr);
3371                 return (NULL);
3372             } else
3373                 j = ptr->array[i];
3374         }
3375     }
3376 
3377     return (ptr);
3378 }
3379 
/*
 * Calls iterator(value + base, ptr) for each value of cont in storage order.
 * Stops and returns false as soon as the callback returns false; returns true
 * when every value was visited.
 */
bool array_container_iterate(const array_container_t *cont, uint32_t base,
                             roaring_iterator iterator, void *ptr) {
    int idx = 0;
    while (idx < cont->cardinality) {
        if (!iterator(cont->array[idx] + base, ptr)) {
            return false;
        }
        ++idx;
    }
    return true;
}
3386 
/*
 * 64-bit variant of array_container_iterate(): calls
 * iterator(high_bits | (value + base), ptr) for each value of cont in storage
 * order. Stops and returns false as soon as the callback returns false;
 * returns true when every value was visited.
 */
bool array_container_iterate64(const array_container_t *cont, uint32_t base,
                               roaring_iterator64 iterator, uint64_t high_bits,
                               void *ptr) {
    int idx = 0;
    while (idx < cont->cardinality) {
        const uint64_t low_bits = (uint64_t)(cont->array[idx] + base);
        if (!iterator(high_bits | low_bits, ptr)) {
            return false;
        }
        ++idx;
    }
    return true;
}
3395 /* end file src/containers/array.c */
3396 /* begin file src/containers/bitset.c */
3397 /*
3398  * bitset.c
3399  *
3400  */
3401 #ifndef _POSIX_C_SOURCE
3402 #define _POSIX_C_SOURCE 200809L
3403 #endif
3404 #include <assert.h>
3405 #include <stdio.h>
3406 #include <stdlib.h>
3407 #include <string.h>
3408 
3409 
/* Clears every bit of the bitset and resets its cardinality to zero. */
void bitset_container_clear(bitset_container_t *bitset) {
    const size_t total_bytes =
        sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;

    bitset->cardinality = 0;
    memset(bitset->array, 0, total_bytes);
}
3414 
/* Sets every bit of the bitset and records the full cardinality (1 << 16). */
void bitset_container_set_all(bitset_container_t *bitset) {
    const size_t total_bytes =
        sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;

    // memset writes its value as an unsigned char, so 0xFF fills all bits
    memset(bitset->array, 0xFF, total_bytes);
    bitset->cardinality = (1 << 16);
}
3420 
3421 
3422 
3423 /* Create a new bitset. Return NULL in case of failure. */
bitset_container_create(void)3424 bitset_container_t *bitset_container_create(void) {
3425     bitset_container_t *bitset =
3426         (bitset_container_t *)malloc(sizeof(bitset_container_t));
3427 
3428     if (!bitset) {
3429         return NULL;
3430     }
3431     // sizeof(__m256i) == 32
3432     bitset->array = (uint64_t *)roaring_bitmap_aligned_malloc(
3433         32, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
3434     if (!bitset->array) {
3435         free(bitset);
3436         return NULL;
3437     }
3438     bitset_container_clear(bitset);
3439     return bitset;
3440 }
3441 
3442 /* Copy one container into another. We assume that they are distinct. */
bitset_container_copy(const bitset_container_t * source,bitset_container_t * dest)3443 void bitset_container_copy(const bitset_container_t *source,
3444                            bitset_container_t *dest) {
3445     dest->cardinality = source->cardinality;
3446     memcpy(dest->array, source->array,
3447            sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
3448 }
3449 
/*
 * Sets the bits min, min + step, min + 2 * step, ... that lie below max.
 *
 * When step divides 64 the bit pattern is the same in every word, so whole
 * words are written at once. On that path the cardinality is overwritten with
 * the number of values in the range, the single-word case ORs into the word,
 * and the multi-word case overwrites the words it touches. For any other step
 * the bits are added one at a time through bitset_container_add().
 */
void bitset_container_add_from_range(bitset_container_t *bitset, uint32_t min,
                                     uint32_t max, uint16_t step) {
    if (step == 0) return;  // refuse to crash

    if ((64 % step) != 0) {
        // general case: one bit at a time
        for (uint32_t value = min; value < max; value += step) {
            bitset_container_add(bitset, value);
        }
        return;
    }

    // step divides 64: construct the repeated mask
    uint64_t pattern = 0;
    for (uint32_t bit = (min % step); bit < 64; bit += step) {
        pattern |= ((uint64_t)1 << bit);
    }

    // keeps bits at or above min within the first word
    const uint64_t from_min = (~UINT64_C(0)) << (min % 64);
    // keeps bits below max within the last word
    const uint64_t below_max = (~UINT64_C(0)) >> ((~max + 1) % 64);
    const uint32_t first_word = min / 64;
    const uint32_t last_word = (max - 1) / 64;

    bitset->cardinality = (max - min + step - 1) / step;

    if (first_word == last_word) {
        bitset->array[first_word] |= pattern & (from_min & below_max);
        return;
    }

    bitset->array[first_word] = pattern & from_min;
    for (uint32_t word = first_word + 1; word < last_word; word++) {
        bitset->array[word] = pattern;
    }
    bitset->array[last_word] = pattern & below_max;
}
3477 
3478 /* Free memory. */
bitset_container_free(bitset_container_t * bitset)3479 void bitset_container_free(bitset_container_t *bitset) {
3480     if(bitset->array != NULL) {// Jon Strabala reports that some tools complain otherwise
3481       roaring_bitmap_aligned_free(bitset->array);
3482       bitset->array = NULL; // pedantic
3483     }
3484     free(bitset);
3485 }
3486 
3487 /* duplicate container. */
bitset_container_clone(const bitset_container_t * src)3488 bitset_container_t *bitset_container_clone(const bitset_container_t *src) {
3489     bitset_container_t *bitset =
3490         (bitset_container_t *)malloc(sizeof(bitset_container_t));
3491     assert(bitset);
3492 
3493     // sizeof(__m256i) == 32
3494     bitset->array = (uint64_t *)roaring_bitmap_aligned_malloc(
3495         32, sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
3496     assert(bitset->array);
3497     bitset->cardinality = src->cardinality;
3498     memcpy(bitset->array, src->array,
3499            sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
3500     return bitset;
3501 }
3502 
/*
 * Applies bitset_set_range(begin, end) to the container's words and then
 * recomputes the cardinality from scratch.
 */
void bitset_container_set_range(bitset_container_t *bitset, uint32_t begin,
                                uint32_t end) {
    uint64_t *words = bitset->array;

    bitset_set_range(words, begin, end);
    // could be smarter than a full recount
    bitset->cardinality = bitset_container_compute_cardinality(bitset);
}
3509 
3510 
bitset_container_intersect(const bitset_container_t * src_1,const bitset_container_t * src_2)3511 bool bitset_container_intersect(const bitset_container_t *src_1,
3512                                   const bitset_container_t *src_2) {
3513 	// could vectorize, but this is probably already quite fast in practice
3514     const uint64_t * __restrict__ array_1 = src_1->array;
3515     const uint64_t * __restrict__ array_2 = src_2->array;
3516 	for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i ++) {
3517         if((array_1[i] & array_2[i]) != 0) return true;
3518     }
3519     return false;
3520 }
3521 
3522 
3523 #ifdef USEAVX
3524 #ifndef WORDS_IN_AVX2_REG
3525 #define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
3526 #endif
3527 /* Get the number of bits set (force computation) */
bitset_container_compute_cardinality(const bitset_container_t * bitset)3528 int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
3529     return (int) avx2_harley_seal_popcount256(
3530         (const __m256i *)bitset->array,
3531         BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));
3532 }
3533 
3534 #elif defined(USENEON)
bitset_container_compute_cardinality(const bitset_container_t * bitset)3535 int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
3536     uint16x8_t n0 = vdupq_n_u16(0);
3537     uint16x8_t n1 = vdupq_n_u16(0);
3538     uint16x8_t n2 = vdupq_n_u16(0);
3539     uint16x8_t n3 = vdupq_n_u16(0);
3540     for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 8) {
3541         uint64x2_t c0 = vld1q_u64(&bitset->array[i + 0]);
3542         n0 = vaddq_u16(n0, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c0))));
3543         uint64x2_t c1 = vld1q_u64(&bitset->array[i + 2]);
3544         n1 = vaddq_u16(n1, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c1))));
3545         uint64x2_t c2 = vld1q_u64(&bitset->array[i + 4]);
3546         n2 = vaddq_u16(n2, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c2))));
3547         uint64x2_t c3 = vld1q_u64(&bitset->array[i + 6]);
3548         n3 = vaddq_u16(n3, vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(c3))));
3549     }
3550     uint64x2_t n = vdupq_n_u64(0);
3551     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n0)));
3552     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n1)));
3553     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n2)));
3554     n = vaddq_u64(n, vpaddlq_u32(vpaddlq_u16(n3)));
3555     return vgetq_lane_u64(n, 0) + vgetq_lane_u64(n, 1);
3556 }
3557 
3558 #else
3559 
3560 /* Get the number of bits set (force computation) */
bitset_container_compute_cardinality(const bitset_container_t * bitset)3561 int bitset_container_compute_cardinality(const bitset_container_t *bitset) {
3562     const uint64_t *array = bitset->array;
3563     int32_t sum = 0;
3564     for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 4) {
3565         sum += hamming(array[i]);
3566         sum += hamming(array[i + 1]);
3567         sum += hamming(array[i + 2]);
3568         sum += hamming(array[i + 3]);
3569     }
3570     return sum;
3571 }
3572 
3573 #endif
3574 
3575 #ifdef USEAVX
3576 
/* Unroll factor that enters the LOOP_SIZE computation below. */
#define BITSET_CONTAINER_FN_REPEAT 8
/* Number of 64-bit words in one 256-bit AVX2 register. The expansion is not
   parenthesized, so uses wrap it in parentheses themselves. */
#ifndef WORDS_IN_AVX2_REG
#define WORDS_IN_AVX2_REG sizeof(__m256i) / sizeof(uint64_t)
#endif
/* Bitset size expressed in groups of BITSET_CONTAINER_FN_REPEAT registers. */
#define LOOP_SIZE                    \
    BITSET_CONTAINER_SIZE_IN_WORDS / \
        ((WORDS_IN_AVX2_REG)*BITSET_CONTAINER_FN_REPEAT)

/* AVX2 flavour of BITSET_CONTAINER_FN: generates three functions that apply a
   binary operation (eg union) to the bitsets src_1 and src_2.

   bitset_container_<opname>_nocard(src_1, src_2, dst)
       Writes avx_intrinsic(src_2 word, src_1 word) into dst for the whole
       bitset, eight 32-byte vectors (256 bytes) per loop iteration, using the
       lddqu/storeu load and store intrinsics. dst->cardinality is set to
       BITSET_UNKNOWN_CARDINALITY, which is also the return value.

   bitset_container_<opname>(src_1, src_2, dst)
       Calls avx2_harley_seal_popcount256andstore_<opname>() with dst's array
       as output, stores its return value in dst->cardinality and returns it.

   bitset_container_<opname>_justcard(src_1, src_2)
       Returns the result of avx2_harley_seal_popcount256_<opname>(); it takes
       no destination container.

   The opsymbol and neon_intrinsic parameters are not used by this expansion. */
// clang-format off
#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)  \
int bitset_container_##opname##_nocard(const bitset_container_t *src_1, \
                                       const bitset_container_t *src_2, \
                                       bitset_container_t *dst) {       \
    const uint8_t * __restrict__ array_1 = (const uint8_t *)src_1->array; \
    const uint8_t * __restrict__ array_2 = (const uint8_t *)src_2->array; \
    /* not using the blocking optimization for some reason*/            \
    uint8_t *out = (uint8_t*)dst->array;                                \
    const int innerloop = 8;                                            \
    for (size_t i = 0;                                                  \
        i < BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG);       \
                                                         i+=innerloop) {\
        __m256i A1, A2, AO;                                             \
        A1 = _mm256_lddqu_si256((const __m256i *)(array_1));                  \
        A2 = _mm256_lddqu_si256((const __m256i *)(array_2));                  \
        AO = avx_intrinsic(A2, A1);                                     \
        _mm256_storeu_si256((__m256i *)out, AO);                        \
        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 32));             \
        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 32));             \
        AO = avx_intrinsic(A2, A1);                                     \
        _mm256_storeu_si256((__m256i *)(out+32), AO);                   \
        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 64));             \
        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 64));             \
        AO = avx_intrinsic(A2, A1);                                     \
        _mm256_storeu_si256((__m256i *)(out+64), AO);                   \
        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 96));             \
        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 96));             \
        AO = avx_intrinsic(A2, A1);                                     \
        _mm256_storeu_si256((__m256i *)(out+96), AO);                   \
        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 128));            \
        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 128));            \
        AO = avx_intrinsic(A2, A1);                                     \
        _mm256_storeu_si256((__m256i *)(out+128), AO);                  \
        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 160));            \
        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 160));            \
        AO = avx_intrinsic(A2, A1);                                     \
        _mm256_storeu_si256((__m256i *)(out+160), AO);                  \
        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 192));            \
        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 192));            \
        AO = avx_intrinsic(A2, A1);                                     \
        _mm256_storeu_si256((__m256i *)(out+192), AO);                  \
        A1 = _mm256_lddqu_si256((const __m256i *)(array_1 + 224));            \
        A2 = _mm256_lddqu_si256((const __m256i *)(array_2 + 224));            \
        AO = avx_intrinsic(A2, A1);                                     \
        _mm256_storeu_si256((__m256i *)(out+224), AO);                  \
        out+=256;                                                       \
        array_1 += 256;                                                 \
        array_2 += 256;                                                 \
    }                                                                   \
    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                      \
    return dst->cardinality;                                            \
}                                                                       \
/* next, a version that updates cardinality*/                           \
int bitset_container_##opname(const bitset_container_t *src_1,          \
                              const bitset_container_t *src_2,          \
                              bitset_container_t *dst) {                \
    const __m256i * __restrict__ array_1 = (const __m256i *) src_1->array; \
    const __m256i * __restrict__ array_2 = (const __m256i *) src_2->array; \
    __m256i *out = (__m256i *) dst->array;                              \
    dst->cardinality = (int32_t)avx2_harley_seal_popcount256andstore_##opname(array_2,\
    		array_1, out,BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));\
    return dst->cardinality;                                            \
}                                                                       \
/* next, a version that just computes the cardinality*/                 \
int bitset_container_##opname##_justcard(const bitset_container_t *src_1, \
                              const bitset_container_t *src_2) {        \
    const __m256i * __restrict__ data1 = (const __m256i *) src_1->array; \
    const __m256i * __restrict__ data2 = (const __m256i *) src_2->array; \
    return (int)avx2_harley_seal_popcount256_##opname(data2,                \
    		data1, BITSET_CONTAINER_SIZE_IN_WORDS / (WORDS_IN_AVX2_REG));\
}
3659 
3660 #elif defined(USENEON)
3661 
/*
 * NEON flavour of BITSET_CONTAINER_FN.
 *
 * Each use of the macro defines three functions:
 *   bitset_container_<opname>          - stores neon_intrinsic(src_1, src_2)
 *                                        into dst->array, records the number
 *                                        of set bits in dst->cardinality and
 *                                        returns it;
 *   bitset_container_<opname>_nocard   - stores the result only and marks the
 *                                        cardinality as
 *                                        BITSET_UNKNOWN_CARDINALITY;
 *   bitset_container_<opname>_justcard - returns the number of set bits of
 *                                        the result without storing it.
 *
 * Eight 64-bit words (four 128-bit vectors) are handled per iteration.  Bits
 * are counted per byte with vcntq_u8, widened pairwise to 16-bit lanes and
 * accumulated in four independent 16-bit accumulators which are only widened
 * to 64 bits after the loop.
 * NOTE(review): this relies on BITSET_CONTAINER_SIZE_IN_WORDS being a multiple
 * of 8 and small enough that the 16-bit lanes cannot overflow - confirm
 * against its definition.
 *
 * opsymbol and avx_intrinsic are not used by this variant.
 */
#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)  \
int bitset_container_##opname(const bitset_container_t *src_1,                \
                              const bitset_container_t *src_2,                \
                              bitset_container_t *dst) {                      \
    const uint64_t * __restrict__ lhs = src_1->array;                         \
    const uint64_t * __restrict__ rhs = src_2->array;                         \
    uint64_t *result = dst->array;                                            \
    uint16x8_t acc0 = vdupq_n_u16(0);                                         \
    uint16x8_t acc1 = vdupq_n_u16(0);                                         \
    uint16x8_t acc2 = vdupq_n_u16(0);                                         \
    uint16x8_t acc3 = vdupq_n_u16(0);                                         \
    for (size_t k = 0; k < BITSET_CONTAINER_SIZE_IN_WORDS; k += 8) {          \
        uint64x2_t v0 = neon_intrinsic(vld1q_u64(&lhs[k + 0]),                \
                                       vld1q_u64(&rhs[k + 0]));               \
        acc0 = vaddq_u16(acc0,                                                \
                         vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(v0))));     \
        vst1q_u64(&result[k + 0], v0);                                        \
        uint64x2_t v1 = neon_intrinsic(vld1q_u64(&lhs[k + 2]),                \
                                       vld1q_u64(&rhs[k + 2]));               \
        acc1 = vaddq_u16(acc1,                                                \
                         vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(v1))));     \
        vst1q_u64(&result[k + 2], v1);                                        \
        uint64x2_t v2 = neon_intrinsic(vld1q_u64(&lhs[k + 4]),                \
                                       vld1q_u64(&rhs[k + 4]));               \
        acc2 = vaddq_u16(acc2,                                                \
                         vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(v2))));     \
        vst1q_u64(&result[k + 4], v2);                                        \
        uint64x2_t v3 = neon_intrinsic(vld1q_u64(&lhs[k + 6]),                \
                                       vld1q_u64(&rhs[k + 6]));               \
        acc3 = vaddq_u16(acc3,                                                \
                         vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(v3))));     \
        vst1q_u64(&result[k + 6], v3);                                        \
    }                                                                         \
    uint64x2_t total = vpaddlq_u32(vpaddlq_u16(acc0));                        \
    total = vaddq_u64(total, vpaddlq_u32(vpaddlq_u16(acc1)));                 \
    total = vaddq_u64(total, vpaddlq_u32(vpaddlq_u16(acc2)));                 \
    total = vaddq_u64(total, vpaddlq_u32(vpaddlq_u16(acc3)));                 \
    dst->cardinality = vgetq_lane_u64(total, 0) + vgetq_lane_u64(total, 1);   \
    return dst->cardinality;                                                  \
}                                                                             \
int bitset_container_##opname##_nocard(const bitset_container_t *src_1,       \
                                       const bitset_container_t *src_2,       \
                                             bitset_container_t *dst) {       \
    const uint64_t * __restrict__ lhs = src_1->array;                         \
    const uint64_t * __restrict__ rhs = src_2->array;                         \
    uint64_t *result = dst->array;                                            \
    for (size_t k = 0; k < BITSET_CONTAINER_SIZE_IN_WORDS; k += 8) {          \
        vst1q_u64(&result[k + 0], neon_intrinsic(vld1q_u64(&lhs[k + 0]),      \
                                                 vld1q_u64(&rhs[k + 0])));    \
        vst1q_u64(&result[k + 2], neon_intrinsic(vld1q_u64(&lhs[k + 2]),      \
                                                 vld1q_u64(&rhs[k + 2])));    \
        vst1q_u64(&result[k + 4], neon_intrinsic(vld1q_u64(&lhs[k + 4]),      \
                                                 vld1q_u64(&rhs[k + 4])));    \
        vst1q_u64(&result[k + 6], neon_intrinsic(vld1q_u64(&lhs[k + 6]),      \
                                                 vld1q_u64(&rhs[k + 6])));    \
    }                                                                         \
    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                            \
    return dst->cardinality;                                                  \
}                                                                             \
int bitset_container_##opname##_justcard(const bitset_container_t *src_1,     \
                                         const bitset_container_t *src_2) {   \
    const uint64_t * __restrict__ lhs = src_1->array;                         \
    const uint64_t * __restrict__ rhs = src_2->array;                         \
    uint16x8_t acc0 = vdupq_n_u16(0);                                         \
    uint16x8_t acc1 = vdupq_n_u16(0);                                         \
    uint16x8_t acc2 = vdupq_n_u16(0);                                         \
    uint16x8_t acc3 = vdupq_n_u16(0);                                         \
    for (size_t k = 0; k < BITSET_CONTAINER_SIZE_IN_WORDS; k += 8) {          \
        uint64x2_t v0 = neon_intrinsic(vld1q_u64(&lhs[k + 0]),                \
                                       vld1q_u64(&rhs[k + 0]));               \
        acc0 = vaddq_u16(acc0,                                                \
                         vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(v0))));     \
        uint64x2_t v1 = neon_intrinsic(vld1q_u64(&lhs[k + 2]),                \
                                       vld1q_u64(&rhs[k + 2]));               \
        acc1 = vaddq_u16(acc1,                                                \
                         vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(v1))));     \
        uint64x2_t v2 = neon_intrinsic(vld1q_u64(&lhs[k + 4]),                \
                                       vld1q_u64(&rhs[k + 4]));               \
        acc2 = vaddq_u16(acc2,                                                \
                         vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(v2))));     \
        uint64x2_t v3 = neon_intrinsic(vld1q_u64(&lhs[k + 6]),                \
                                       vld1q_u64(&rhs[k + 6]));               \
        acc3 = vaddq_u16(acc3,                                                \
                         vpaddlq_u8(vcntq_u8(vreinterpretq_u8_u64(v3))));     \
    }                                                                         \
    uint64x2_t total = vpaddlq_u32(vpaddlq_u16(acc0));                        \
    total = vaddq_u64(total, vpaddlq_u32(vpaddlq_u16(acc1)));                 \
    total = vaddq_u64(total, vpaddlq_u32(vpaddlq_u16(acc2)));                 \
    total = vaddq_u64(total, vpaddlq_u32(vpaddlq_u16(acc3)));                 \
    return vgetq_lane_u64(total, 0) + vgetq_lane_u64(total, 1);               \
}
3747 
3748 #else /* not USEAVX  */
3749 
/*
 * Portable (scalar) flavour of BITSET_CONTAINER_FN.
 *
 * Each use of the macro defines three functions operating word by word with
 * the C operator passed as opsymbol:
 *   bitset_container_<opname>          - writes the result to dst->array,
 *                                        stores the number of set bits in
 *                                        dst->cardinality and returns it;
 *   bitset_container_<opname>_nocard   - writes the result only and marks the
 *                                        cardinality as
 *                                        BITSET_UNKNOWN_CARDINALITY;
 *   bitset_container_<opname>_justcard - returns the number of set bits of
 *                                        the result without storing it.
 *
 * avx_intrinsic and neon_intrinsic are not used by this variant.
 */
#define BITSET_CONTAINER_FN(opname, opsymbol, avx_intrinsic, neon_intrinsic)  \
int bitset_container_##opname(const bitset_container_t *src_1,                \
                              const bitset_container_t *src_2,                \
                              bitset_container_t *dst) {                      \
    const uint64_t * __restrict__ lhs = src_1->array;                         \
    const uint64_t * __restrict__ rhs = src_2->array;                         \
    uint64_t *result = dst->array;                                            \
    int32_t total = 0;                                                        \
    size_t k = 0;                                                             \
    while (k < BITSET_CONTAINER_SIZE_IN_WORDS) {                              \
        const uint64_t even = (lhs[k])opsymbol(rhs[k]);                       \
        const uint64_t odd = (lhs[k + 1])opsymbol(rhs[k + 1]);                \
        result[k] = even;                                                     \
        result[k + 1] = odd;                                                  \
        total += hamming(even);                                               \
        total += hamming(odd);                                                \
        k += 2;                                                               \
    }                                                                         \
    dst->cardinality = total;                                                 \
    return dst->cardinality;                                                  \
}                                                                             \
int bitset_container_##opname##_nocard(const bitset_container_t *src_1,       \
                                       const bitset_container_t *src_2,       \
                                       bitset_container_t *dst) {             \
    const uint64_t * __restrict__ lhs = src_1->array;                         \
    const uint64_t * __restrict__ rhs = src_2->array;                         \
    uint64_t *result = dst->array;                                            \
    size_t k = 0;                                                             \
    while (k < BITSET_CONTAINER_SIZE_IN_WORDS) {                              \
        result[k] = (lhs[k])opsymbol(rhs[k]);                                 \
        k++;                                                                  \
    }                                                                         \
    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;                            \
    return dst->cardinality;                                                  \
}                                                                             \
int bitset_container_##opname##_justcard(const bitset_container_t *src_1,     \
                              const bitset_container_t *src_2) {              \
    const uint64_t * __restrict__ lhs = src_1->array;                         \
    const uint64_t * __restrict__ rhs = src_2->array;                         \
    int32_t total = 0;                                                        \
    size_t k = 0;                                                             \
    while (k < BITSET_CONTAINER_SIZE_IN_WORDS) {                              \
        const uint64_t even = (lhs[k])opsymbol(rhs[k]);                       \
        const uint64_t odd = (lhs[k + 1])opsymbol(rhs[k + 1]);                \
        total += hamming(even);                                               \
        total += hamming(odd);                                                \
        k += 2;                                                               \
    }                                                                         \
    return total;                                                             \
}
3794 
3795 #endif
3796 
// Instantiate the binary bitset operations. Each line below expands to three
// functions: bitset_container_<name>, bitset_container_<name>_nocard and
// bitset_container_<name>_justcard.
//
// "or" and "union" are the same operation; both names are generated because
// other containers use the "or" term, which keeps the API consistent.
BITSET_CONTAINER_FN(or,    |, _mm256_or_si256, vorrq_u64)
BITSET_CONTAINER_FN(union, |, _mm256_or_si256, vorrq_u64)

// "and" and "intersection" are the same operation; both names are generated
// because other containers use the "intersection" term, which keeps the API
// consistent.
BITSET_CONTAINER_FN(and,          &, _mm256_and_si256, vandq_u64)
BITSET_CONTAINER_FN(intersection, &, _mm256_and_si256, vandq_u64)

// Symmetric difference, and difference (bits of the first operand that are
// not set in the second, per the "&~" operator used by the scalar variant).
BITSET_CONTAINER_FN(xor,    ^,  _mm256_xor_si256,    veorq_u64)
BITSET_CONTAINER_FN(andnot, &~, _mm256_andnot_si256, vbicq_u64)
3807 // clang-format On
3808 
3809 
3810 
bitset_container_to_uint32_array(void * vout,const bitset_container_t * cont,uint32_t base)3811 int bitset_container_to_uint32_array( void *vout, const bitset_container_t *cont, uint32_t base) {
3812 #ifdef USEAVX2FORDECODING
3813 	if(cont->cardinality >= 8192)// heuristic
3814 		return (int) bitset_extract_setbits_avx2(cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, vout,cont->cardinality,base);
3815 	else
3816 		return (int) bitset_extract_setbits(cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, vout,base);
3817 #else
3818 	return (int) bitset_extract_setbits(cont->array, BITSET_CONTAINER_SIZE_IN_WORDS, vout,base);
3819 #endif
3820 }
3821 
3822 /*
3823  * Print this container using printf (useful for debugging).
3824  */
bitset_container_printf(const bitset_container_t * v)3825 void bitset_container_printf(const bitset_container_t * v) {
3826 	printf("{");
3827 	uint32_t base = 0;
3828 	bool iamfirst = true;// TODO: rework so that this is not necessary yet still readable
3829 	for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) {
3830 		uint64_t w = v->array[i];
3831 		while (w != 0) {
3832 			uint64_t t = w & (~w + 1);
3833 			int r = __builtin_ctzll(w);
3834 			if(iamfirst) {// predicted to be false
3835 				printf("%u",base + r);
3836 				iamfirst = false;
3837 			} else {
3838 				printf(",%u",base + r);
3839 			}
3840 			w ^= t;
3841 		}
3842 		base += 64;
3843 	}
3844 	printf("}");
3845 }
3846 
3847 
3848 /*
3849  * Print this container using printf as a comma-separated list of 32-bit integers starting at base.
3850  */
bitset_container_printf_as_uint32_array(const bitset_container_t * v,uint32_t base)3851 void bitset_container_printf_as_uint32_array(const bitset_container_t * v, uint32_t base) {
3852 	bool iamfirst = true;// TODO: rework so that this is not necessary yet still readable
3853 	for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) {
3854 		uint64_t w = v->array[i];
3855 		while (w != 0) {
3856 			uint64_t t = w & (~w + 1);
3857 			int r = __builtin_ctzll(w);
3858 			if(iamfirst) {// predicted to be false
3859 				printf("%u", r + base);
3860 				iamfirst = false;
3861 			} else {
3862 				printf(",%u",r + base);
3863 			}
3864 			w ^= t;
3865 		}
3866 		base += 64;
3867 	}
3868 }
3869 
3870 
3871 // TODO: use the fast lower bound, also
bitset_container_number_of_runs(bitset_container_t * b)3872 int bitset_container_number_of_runs(bitset_container_t *b) {
3873   int num_runs = 0;
3874   uint64_t next_word = b->array[0];
3875 
3876   for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS-1; ++i) {
3877     uint64_t word = next_word;
3878     next_word = b->array[i+1];
3879     num_runs += hamming((~word) & (word << 1)) + ( (word >> 63) & ~next_word);
3880   }
3881 
3882   uint64_t word = next_word;
3883   num_runs += hamming((~word) & (word << 1));
3884   if((word & 0x8000000000000000ULL) != 0)
3885     num_runs++;
3886   return num_runs;
3887 }
3888 
bitset_container_serialize(const bitset_container_t * container,char * buf)3889 int32_t bitset_container_serialize(const bitset_container_t *container, char *buf) {
3890   int32_t l = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
3891   memcpy(buf, container->array, l);
3892   return(l);
3893 }
3894 
3895 
3896 
bitset_container_write(const bitset_container_t * container,char * buf)3897 int32_t bitset_container_write(const bitset_container_t *container,
3898                                   char *buf) {
3899 	memcpy(buf, container->array, BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t));
3900 	return bitset_container_size_in_bytes(container);
3901 }
3902 
3903 
/*
 * Fills the container from buf: records the given cardinality, copies the raw
 * word array out of buf and returns bitset_container_size_in_bytes(container).
 * The cardinality is taken on trust; it is not recomputed from the data.
 */
int32_t bitset_container_read(int32_t cardinality, bitset_container_t *container,
		const char *buf)  {
    const size_t payload = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;

    container->cardinality = cardinality;
    memcpy(container->array, buf, payload);
    return bitset_container_size_in_bytes(container);
}
3910 
bitset_container_serialization_len(void)3911 uint32_t bitset_container_serialization_len(void) {
3912   return(sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS);
3913 }
3914 
bitset_container_deserialize(const char * buf,size_t buf_len)3915 void* bitset_container_deserialize(const char *buf, size_t buf_len) {
3916   bitset_container_t *ptr;
3917   size_t l = sizeof(uint64_t) * BITSET_CONTAINER_SIZE_IN_WORDS;
3918 
3919   if(l != buf_len)
3920     return(NULL);
3921 
3922   if((ptr = (bitset_container_t *)malloc(sizeof(bitset_container_t))) != NULL) {
3923     memcpy(ptr, buf, sizeof(bitset_container_t));
3924     // sizeof(__m256i) == 32
3925     ptr->array = (uint64_t *) roaring_bitmap_aligned_malloc(32, l);
3926     if (! ptr->array) {
3927         free(ptr);
3928         return NULL;
3929     }
3930     memcpy(ptr->array, buf, l);
3931     ptr->cardinality = bitset_container_compute_cardinality(ptr);
3932   }
3933 
3934   return((void*)ptr);
3935 }
3936 
/*
 * Calls iterator(value, ptr) for every set bit in increasing order, where
 * value is the bit position plus base. Stops and returns false as soon as the
 * callback returns false; returns true once every set bit has been visited.
 */
bool bitset_container_iterate(const bitset_container_t *cont, uint32_t base, roaring_iterator iterator, void *ptr) {
    for (int32_t idx = 0; idx < BITSET_CONTAINER_SIZE_IN_WORDS; ++idx, base += 64) {
        // bits &= bits - 1 clears the lowest set bit on every pass
        for (uint64_t bits = cont->array[idx]; bits != 0; bits &= bits - 1) {
            const int bit = __builtin_ctzll(bits);
            if (!iterator(bit + base, ptr)) {
                return false;
            }
        }
    }
    return true;
}
3950 
/*
 * 64-bit variant of the iteration: calls iterator(high_bits | value, ptr) for
 * every set bit in increasing order, where value is the bit position plus
 * base. Stops and returns false as soon as the callback returns false;
 * returns true once every set bit has been visited.
 */
bool bitset_container_iterate64(const bitset_container_t *cont, uint32_t base, roaring_iterator64 iterator, uint64_t high_bits, void *ptr) {
    for (int32_t idx = 0; idx < BITSET_CONTAINER_SIZE_IN_WORDS; ++idx, base += 64) {
        // bits &= bits - 1 clears the lowest set bit on every pass
        for (uint64_t bits = cont->array[idx]; bits != 0; bits &= bits - 1) {
            const int bit = __builtin_ctzll(bits);
            const uint64_t value = high_bits | (uint64_t)(bit + base);
            if (!iterator(value, ptr)) {
                return false;
            }
        }
    }
    return true;
}
3964 
3965 
bitset_container_equals(const bitset_container_t * container1,const bitset_container_t * container2)3966 bool bitset_container_equals(const bitset_container_t *container1, const bitset_container_t *container2) {
3967 	if((container1->cardinality != BITSET_UNKNOWN_CARDINALITY) && (container2->cardinality != BITSET_UNKNOWN_CARDINALITY)) {
3968 		if(container1->cardinality != container2->cardinality) {
3969 			return false;
3970 		}
3971     if (container1->cardinality == INT32_C(0x10000)) {
3972         return true;
3973     }
3974 	}
3975 #ifdef USEAVX
3976   const __m256i *ptr1 = (const __m256i*)container1->array;
3977   const __m256i *ptr2 = (const __m256i*)container2->array;
3978   for (size_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)/32; i++) {
3979       __m256i r1 = _mm256_load_si256(ptr1+i);
3980       __m256i r2 = _mm256_load_si256(ptr2+i);
3981       int mask = _mm256_movemask_epi8(_mm256_cmpeq_epi8(r1, r2));
3982       if ((uint32_t)mask != UINT32_MAX) {
3983           return false;
3984       }
3985   }
3986 #else
3987   return memcmp(container1->array,
3988                 container2->array,
3989                 BITSET_CONTAINER_SIZE_IN_WORDS*sizeof(uint64_t)) == 0;
3990 #endif
3991 	return true;
3992 }
3993 
bitset_container_is_subset(const bitset_container_t * container1,const bitset_container_t * container2)3994 bool bitset_container_is_subset(const bitset_container_t *container1,
3995                           const bitset_container_t *container2) {
3996     if((container1->cardinality != BITSET_UNKNOWN_CARDINALITY) && (container2->cardinality != BITSET_UNKNOWN_CARDINALITY)) {
3997         if(container1->cardinality > container2->cardinality) {
3998             return false;
3999         }
4000     }
4001     for(int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
4002 		if((container1->array[i] & container2->array[i]) != container1->array[i]) {
4003 			return false;
4004 		}
4005 	}
4006 	return true;
4007 }
4008 
bitset_container_select(const bitset_container_t * container,uint32_t * start_rank,uint32_t rank,uint32_t * element)4009 bool bitset_container_select(const bitset_container_t *container, uint32_t *start_rank, uint32_t rank, uint32_t *element) {
4010     int card = bitset_container_cardinality(container);
4011     if(rank >= *start_rank + card) {
4012         *start_rank += card;
4013         return false;
4014     }
4015     const uint64_t *array = container->array;
4016     int32_t size;
4017     for (int i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; i += 1) {
4018         size = hamming(array[i]);
4019         if(rank <= *start_rank + size) {
4020             uint64_t w = container->array[i];
4021             uint16_t base = i*64;
4022             while (w != 0) {
4023                 uint64_t t = w & (~w + 1);
4024                 int r = __builtin_ctzll(w);
4025                 if(*start_rank == rank) {
4026                     *element = r+base;
4027                     return true;
4028                 }
4029                 w ^= t;
4030                 *start_rank += 1;
4031             }
4032         }
4033         else
4034             *start_rank += size;
4035     }
4036     assert(false);
4037     __builtin_unreachable();
4038 }
4039 
4040 
4041 /* Returns the smallest value (assumes not empty) */
bitset_container_minimum(const bitset_container_t * container)4042 uint16_t bitset_container_minimum(const bitset_container_t *container) {
4043   for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i ) {
4044     uint64_t w = container->array[i];
4045     if (w != 0) {
4046       int r = __builtin_ctzll(w);
4047       return r + i * 64;
4048     }
4049   }
4050   return UINT16_MAX;
4051 }
4052 
4053 /* Returns the largest value (assumes not empty) */
bitset_container_maximum(const bitset_container_t * container)4054 uint16_t bitset_container_maximum(const bitset_container_t *container) {
4055   for (int32_t i = BITSET_CONTAINER_SIZE_IN_WORDS - 1; i > 0; --i ) {
4056     uint64_t w = container->array[i];
4057     if (w != 0) {
4058       int r = __builtin_clzll(w);
4059       return i * 64 + 63  - r;
4060     }
4061   }
4062   return 0;
4063 }
4064 
4065 /* Returns the number of values equal or smaller than x */
bitset_container_rank(const bitset_container_t * container,uint16_t x)4066 int bitset_container_rank(const bitset_container_t *container, uint16_t x) {
4067   // credit: aqrit
4068   int sum = 0;
4069   int i = 0;
4070   for (int end = x / 64; i < end; i++){
4071     sum += hamming(container->array[i]);
4072   }
4073   uint64_t lastword = container->array[i];
4074   uint64_t lastpos = UINT64_C(1) << (x % 64);
4075   uint64_t mask = lastpos + lastpos - 1; // smear right
4076   sum += hamming(lastword & mask);
4077   return sum;
4078 }
4079 
4080 /* Returns the index of the first value equal or larger than x, or -1 */
bitset_container_index_equalorlarger(const bitset_container_t * container,uint16_t x)4081 int bitset_container_index_equalorlarger(const bitset_container_t *container, uint16_t x) {
4082   uint32_t x32 = x;
4083   uint32_t k = x32 / 64;
4084   uint64_t word = container->array[k];
4085   const int diff = x32 - k * 64; // in [0,64)
4086   word = (word >> diff) << diff; // a mask is faster, but we don't care
4087   while(word == 0) {
4088     k++;
4089     if(k == BITSET_CONTAINER_SIZE_IN_WORDS) return -1;
4090     word = container->array[k];
4091   }
4092   return k * 64 + __builtin_ctzll(word);
4093 }
4094 /* end file src/containers/bitset.c */
4095 /* begin file src/containers/containers.c */
4096 
4097 
container_free(void * container,uint8_t typecode)4098 void container_free(void *container, uint8_t typecode) {
4099     switch (typecode) {
4100         case BITSET_CONTAINER_TYPE_CODE:
4101             bitset_container_free((bitset_container_t *)container);
4102             break;
4103         case ARRAY_CONTAINER_TYPE_CODE:
4104             array_container_free((array_container_t *)container);
4105             break;
4106         case RUN_CONTAINER_TYPE_CODE:
4107             run_container_free((run_container_t *)container);
4108             break;
4109         case SHARED_CONTAINER_TYPE_CODE:
4110             shared_container_free((shared_container_t *)container);
4111             break;
4112         default:
4113             assert(false);
4114             __builtin_unreachable();
4115     }
4116 }
4117 
container_printf(const void * container,uint8_t typecode)4118 void container_printf(const void *container, uint8_t typecode) {
4119     container = container_unwrap_shared(container, &typecode);
4120     switch (typecode) {
4121         case BITSET_CONTAINER_TYPE_CODE:
4122             bitset_container_printf((const bitset_container_t *)container);
4123             return;
4124         case ARRAY_CONTAINER_TYPE_CODE:
4125             array_container_printf((const array_container_t *)container);
4126             return;
4127         case RUN_CONTAINER_TYPE_CODE:
4128             run_container_printf((const run_container_t *)container);
4129             return;
4130         default:
4131             __builtin_unreachable();
4132     }
4133 }
4134 
container_printf_as_uint32_array(const void * container,uint8_t typecode,uint32_t base)4135 void container_printf_as_uint32_array(const void *container, uint8_t typecode,
4136                                       uint32_t base) {
4137     container = container_unwrap_shared(container, &typecode);
4138     switch (typecode) {
4139         case BITSET_CONTAINER_TYPE_CODE:
4140             bitset_container_printf_as_uint32_array(
4141                 (const bitset_container_t *)container, base);
4142             return;
4143         case ARRAY_CONTAINER_TYPE_CODE:
4144             array_container_printf_as_uint32_array(
4145                 (const array_container_t *)container, base);
4146             return;
4147         case RUN_CONTAINER_TYPE_CODE:
4148             run_container_printf_as_uint32_array(
4149                 (const run_container_t *)container, base);
4150             return;
4151             return;
4152         default:
4153             __builtin_unreachable();
4154     }
4155 }
4156 
container_serialize(const void * container,uint8_t typecode,char * buf)4157 int32_t container_serialize(const void *container, uint8_t typecode,
4158                             char *buf) {
4159     container = container_unwrap_shared(container, &typecode);
4160     switch (typecode) {
4161         case BITSET_CONTAINER_TYPE_CODE:
4162             return (bitset_container_serialize((const bitset_container_t *)container,
4163                                                buf));
4164         case ARRAY_CONTAINER_TYPE_CODE:
4165             return (
4166                 array_container_serialize((const array_container_t *)container, buf));
4167         case RUN_CONTAINER_TYPE_CODE:
4168             return (run_container_serialize((const run_container_t *)container, buf));
4169         default:
4170             assert(0);
4171             __builtin_unreachable();
4172             return (-1);
4173     }
4174 }
4175 
container_serialization_len(const void * container,uint8_t typecode)4176 uint32_t container_serialization_len(const void *container, uint8_t typecode) {
4177     container = container_unwrap_shared(container, &typecode);
4178     switch (typecode) {
4179         case BITSET_CONTAINER_TYPE_CODE:
4180             return bitset_container_serialization_len();
4181         case ARRAY_CONTAINER_TYPE_CODE:
4182             return array_container_serialization_len(
4183                 (const array_container_t *)container);
4184         case RUN_CONTAINER_TYPE_CODE:
4185             return run_container_serialization_len(
4186                 (const run_container_t *)container);
4187         default:
4188             assert(0);
4189             __builtin_unreachable();
4190             return (0);
4191     }
4192 }
4193 
container_deserialize(uint8_t typecode,const char * buf,size_t buf_len)4194 void *container_deserialize(uint8_t typecode, const char *buf, size_t buf_len) {
4195     switch (typecode) {
4196         case BITSET_CONTAINER_TYPE_CODE:
4197             return (bitset_container_deserialize(buf, buf_len));
4198         case ARRAY_CONTAINER_TYPE_CODE:
4199             return (array_container_deserialize(buf, buf_len));
4200         case RUN_CONTAINER_TYPE_CODE:
4201             return (run_container_deserialize(buf, buf_len));
4202         case SHARED_CONTAINER_TYPE_CODE:
4203             printf("this should never happen.\n");
4204             assert(0);
4205             __builtin_unreachable();
4206             return (NULL);
4207         default:
4208             assert(0);
4209             __builtin_unreachable();
4210             return (NULL);
4211     }
4212 }
4213 
get_copy_of_container(void * container,uint8_t * typecode,bool copy_on_write)4214 void *get_copy_of_container(void *container, uint8_t *typecode,
4215                             bool copy_on_write) {
4216     if (copy_on_write) {
4217         shared_container_t *shared_container;
4218         if (*typecode == SHARED_CONTAINER_TYPE_CODE) {
4219             shared_container = (shared_container_t *)container;
4220             shared_container->counter += 1;
4221             return shared_container;
4222         }
4223         assert(*typecode != SHARED_CONTAINER_TYPE_CODE);
4224 
4225         if ((shared_container = (shared_container_t *)malloc(
4226                  sizeof(shared_container_t))) == NULL) {
4227             return NULL;
4228         }
4229 
4230         shared_container->container = container;
4231         shared_container->typecode = *typecode;
4232 
4233         shared_container->counter = 2;
4234         *typecode = SHARED_CONTAINER_TYPE_CODE;
4235 
4236         return shared_container;
4237     }  // copy_on_write
4238     // otherwise, no copy on write...
4239     const void *actualcontainer =
4240         container_unwrap_shared((const void *)container, typecode);
4241     assert(*typecode != SHARED_CONTAINER_TYPE_CODE);
4242     return container_clone(actualcontainer, *typecode);
4243 }
4244 /**
4245  * Copies a container, requires a typecode. This allocates new memory, caller
4246  * is responsible for deallocation.
4247  */
container_clone(const void * container,uint8_t typecode)4248 void *container_clone(const void *container, uint8_t typecode) {
4249     container = container_unwrap_shared(container, &typecode);
4250     switch (typecode) {
4251         case BITSET_CONTAINER_TYPE_CODE:
4252             return bitset_container_clone((const bitset_container_t *)container);
4253         case ARRAY_CONTAINER_TYPE_CODE:
4254             return array_container_clone((const array_container_t *)container);
4255         case RUN_CONTAINER_TYPE_CODE:
4256             return run_container_clone((const run_container_t *)container);
4257         case SHARED_CONTAINER_TYPE_CODE:
4258             printf("shared containers are not clonable\n");
4259             assert(false);
4260             return NULL;
4261         default:
4262             assert(false);
4263             __builtin_unreachable();
4264             return NULL;
4265     }
4266 }
4267 
/*
 * Gives up one reference to a shared container and returns a private copy of
 * the container it wraps, storing the wrapped type in *typecode.
 *
 * When this was the last reference, the wrapped container itself is handed
 * over and the wrapper is freed; otherwise the wrapped container is cloned.
 */
void *shared_container_extract_copy(shared_container_t *container,
                                    uint8_t *typecode) {
    assert(container->counter > 0);
    assert(container->typecode != SHARED_CONTAINER_TYPE_CODE);

    container->counter--;
    *typecode = container->typecode;

    void *extracted;
    if (container->counter != 0) {
        // still referenced elsewhere: hand out a clone
        extracted = container_clone(container->container, *typecode);
    } else {
        // last reference: take the wrapped container and drop the wrapper
        extracted = container->container;
        container->container = NULL;  // paranoid
        free(container);
    }
    assert(*typecode != SHARED_CONTAINER_TYPE_CODE);
    return extracted;
}
4285 
/*
 * Gives up one reference to a shared container. When the last reference goes
 * away, the wrapped container and the wrapper itself are freed.
 */
void shared_container_free(shared_container_t *container) {
    assert(container->counter > 0);
    container->counter--;
    if (container->counter != 0) {
        return;  // still referenced elsewhere
    }
    assert(container->typecode != SHARED_CONTAINER_TYPE_CODE);
    container_free(container->container, container->typecode);
    container->container = NULL;  // paranoid
    free(container);
}
4296 
4297 /* end file src/containers/containers.c */
4298 /* begin file src/containers/convert.c */
4299 #include <stdio.h>
4300 
4301 
4302 // file contains grubby stuff that must know impl. details of all container
4303 // types.
bitset_container_from_array(const array_container_t * a)4304 bitset_container_t *bitset_container_from_array(const array_container_t *a) {
4305     bitset_container_t *ans = bitset_container_create();
4306     int limit = array_container_cardinality(a);
4307     for (int i = 0; i < limit; ++i) bitset_container_set(ans, a->array[i]);
4308     return ans;
4309 }
4310 
bitset_container_from_run(const run_container_t * arr)4311 bitset_container_t *bitset_container_from_run(const run_container_t *arr) {
4312     int card = run_container_cardinality(arr);
4313     bitset_container_t *answer = bitset_container_create();
4314     for (int rlepos = 0; rlepos < arr->n_runs; ++rlepos) {
4315         rle16_t vl = arr->runs[rlepos];
4316         bitset_set_lenrange(answer->array, vl.value, vl.length);
4317     }
4318     answer->cardinality = card;
4319     return answer;
4320 }
4321 
array_container_from_run(const run_container_t * arr)4322 array_container_t *array_container_from_run(const run_container_t *arr) {
4323     array_container_t *answer =
4324         array_container_create_given_capacity(run_container_cardinality(arr));
4325     answer->cardinality = 0;
4326     for (int rlepos = 0; rlepos < arr->n_runs; ++rlepos) {
4327         int run_start = arr->runs[rlepos].value;
4328         int run_end = run_start + arr->runs[rlepos].length;
4329 
4330         for (int run_value = run_start; run_value <= run_end; ++run_value) {
4331             answer->array[answer->cardinality++] = (uint16_t)run_value;
4332         }
4333     }
4334     return answer;
4335 }
4336 
array_container_from_bitset(const bitset_container_t * bits)4337 array_container_t *array_container_from_bitset(const bitset_container_t *bits) {
4338     array_container_t *result =
4339         array_container_create_given_capacity(bits->cardinality);
4340     result->cardinality = bits->cardinality;
4341     //  sse version ends up being slower here
4342     // (bitset_extract_setbits_sse_uint16)
4343     // because of the sparsity of the data
4344     bitset_extract_setbits_uint16(bits->array, BITSET_CONTAINER_SIZE_IN_WORDS,
4345                                   result->array, 0);
4346     return result;
4347 }
4348 
4349 /* assumes that container has adequate space.  Run from [s,e] (inclusive) */
add_run(run_container_t * r,int s,int e)4350 static void add_run(run_container_t *r, int s, int e) {
4351     r->runs[r->n_runs].value = s;
4352     r->runs[r->n_runs].length = e - s;
4353     r->n_runs++;
4354 }
4355 
run_container_from_array(const array_container_t * c)4356 run_container_t *run_container_from_array(const array_container_t *c) {
4357     int32_t n_runs = array_container_number_of_runs(c);
4358     run_container_t *answer = run_container_create_given_capacity(n_runs);
4359     int prev = -2;
4360     int run_start = -1;
4361     int32_t card = c->cardinality;
4362     if (card == 0) return answer;
4363     for (int i = 0; i < card; ++i) {
4364         const uint16_t cur_val = c->array[i];
4365         if (cur_val != prev + 1) {
4366             // new run starts; flush old one, if any
4367             if (run_start != -1) add_run(answer, run_start, prev);
4368             run_start = cur_val;
4369         }
4370         prev = c->array[i];
4371     }
4372     // now prev is the last seen value
4373     add_run(answer, run_start, prev);
4374     // assert(run_container_cardinality(answer) == c->cardinality);
4375     return answer;
4376 }
4377 
4378 /**
4379  * Convert the runcontainer to either a Bitmap or an Array Container, depending
4380  * on the cardinality.  Frees the container.
4381  * Allocates and returns new container, which caller is responsible for freeing.
4382  * It does not free the run container.
4383  */
4384 
convert_to_bitset_or_array_container(run_container_t * r,int32_t card,uint8_t * resulttype)4385 void *convert_to_bitset_or_array_container(run_container_t *r, int32_t card,
4386                                            uint8_t *resulttype) {
4387     if (card <= DEFAULT_MAX_SIZE) {
4388         array_container_t *answer = array_container_create_given_capacity(card);
4389         answer->cardinality = 0;
4390         for (int rlepos = 0; rlepos < r->n_runs; ++rlepos) {
4391             uint16_t run_start = r->runs[rlepos].value;
4392             uint16_t run_end = run_start + r->runs[rlepos].length;
4393             for (uint16_t run_value = run_start; run_value <= run_end;
4394                  ++run_value) {
4395                 answer->array[answer->cardinality++] = run_value;
4396             }
4397         }
4398         assert(card == answer->cardinality);
4399         *resulttype = ARRAY_CONTAINER_TYPE_CODE;
4400         //run_container_free(r);
4401         return answer;
4402     }
4403     bitset_container_t *answer = bitset_container_create();
4404     for (int rlepos = 0; rlepos < r->n_runs; ++rlepos) {
4405         uint16_t run_start = r->runs[rlepos].value;
4406         bitset_set_lenrange(answer->array, run_start, r->runs[rlepos].length);
4407     }
4408     answer->cardinality = card;
4409     *resulttype = BITSET_CONTAINER_TYPE_CODE;
4410     //run_container_free(r);
4411     return answer;
4412 }
4413 
4414 /* Converts a run container to either an array or a bitset, IF it saves space.
4415  */
4416 /* If a conversion occurs, the caller is responsible to free the original
4417  * container and
4418  * he becomes responsible to free the new one. */
convert_run_to_efficient_container(run_container_t * c,uint8_t * typecode_after)4419 void *convert_run_to_efficient_container(run_container_t *c,
4420                                          uint8_t *typecode_after) {
4421     int32_t size_as_run_container =
4422         run_container_serialized_size_in_bytes(c->n_runs);
4423 
4424     int32_t size_as_bitset_container =
4425         bitset_container_serialized_size_in_bytes();
4426     int32_t card = run_container_cardinality(c);
4427     int32_t size_as_array_container =
4428         array_container_serialized_size_in_bytes(card);
4429 
4430     int32_t min_size_non_run =
4431         size_as_bitset_container < size_as_array_container
4432             ? size_as_bitset_container
4433             : size_as_array_container;
4434     if (size_as_run_container <= min_size_non_run) {  // no conversion
4435         *typecode_after = RUN_CONTAINER_TYPE_CODE;
4436         return c;
4437     }
4438     if (card <= DEFAULT_MAX_SIZE) {
4439         // to array
4440         array_container_t *answer = array_container_create_given_capacity(card);
4441         answer->cardinality = 0;
4442         for (int rlepos = 0; rlepos < c->n_runs; ++rlepos) {
4443             int run_start = c->runs[rlepos].value;
4444             int run_end = run_start + c->runs[rlepos].length;
4445 
4446             for (int run_value = run_start; run_value <= run_end; ++run_value) {
4447                 answer->array[answer->cardinality++] = (uint16_t)run_value;
4448             }
4449         }
4450         *typecode_after = ARRAY_CONTAINER_TYPE_CODE;
4451         return answer;
4452     }
4453 
4454     // else to bitset
4455     bitset_container_t *answer = bitset_container_create();
4456 
4457     for (int rlepos = 0; rlepos < c->n_runs; ++rlepos) {
4458         int start = c->runs[rlepos].value;
4459         int end = start + c->runs[rlepos].length;
4460         bitset_set_range(answer->array, start, end + 1);
4461     }
4462     answer->cardinality = card;
4463     *typecode_after = BITSET_CONTAINER_TYPE_CODE;
4464     return answer;
4465 }
4466 
4467 // like convert_run_to_efficient_container but frees the old result if needed
convert_run_to_efficient_container_and_free(run_container_t * c,uint8_t * typecode_after)4468 void *convert_run_to_efficient_container_and_free(run_container_t *c,
4469                                                   uint8_t *typecode_after) {
4470     void *answer = convert_run_to_efficient_container(c, typecode_after);
4471     if (answer != c) run_container_free(c);
4472     return answer;
4473 }
4474 
4475 /* once converted, the original container is disposed here, rather than
4476    in roaring_array
4477 */
4478 
4479 // TODO: split into run-  array-  and bitset-  subfunctions for sanity;
4480 // a few function calls won't really matter.
4481 
convert_run_optimize(void * c,uint8_t typecode_original,uint8_t * typecode_after)4482 void *convert_run_optimize(void *c, uint8_t typecode_original,
4483                            uint8_t *typecode_after) {
4484     if (typecode_original == RUN_CONTAINER_TYPE_CODE) {
4485         void *newc = convert_run_to_efficient_container((run_container_t *)c,
4486                                                         typecode_after);
4487         if (newc != c) {
4488             container_free(c, typecode_original);
4489         }
4490         return newc;
4491     } else if (typecode_original == ARRAY_CONTAINER_TYPE_CODE) {
4492         // it might need to be converted to a run container.
4493         array_container_t *c_qua_array = (array_container_t *)c;
4494         int32_t n_runs = array_container_number_of_runs(c_qua_array);
4495         int32_t size_as_run_container =
4496             run_container_serialized_size_in_bytes(n_runs);
4497         int32_t card = array_container_cardinality(c_qua_array);
4498         int32_t size_as_array_container =
4499             array_container_serialized_size_in_bytes(card);
4500 
4501         if (size_as_run_container >= size_as_array_container) {
4502             *typecode_after = ARRAY_CONTAINER_TYPE_CODE;
4503             return c;
4504         }
4505         // else convert array to run container
4506         run_container_t *answer = run_container_create_given_capacity(n_runs);
4507         int prev = -2;
4508         int run_start = -1;
4509 
4510         assert(card > 0);
4511         for (int i = 0; i < card; ++i) {
4512             uint16_t cur_val = c_qua_array->array[i];
4513             if (cur_val != prev + 1) {
4514                 // new run starts; flush old one, if any
4515                 if (run_start != -1) add_run(answer, run_start, prev);
4516                 run_start = cur_val;
4517             }
4518             prev = c_qua_array->array[i];
4519         }
4520         assert(run_start >= 0);
4521         // now prev is the last seen value
4522         add_run(answer, run_start, prev);
4523         *typecode_after = RUN_CONTAINER_TYPE_CODE;
4524         array_container_free(c_qua_array);
4525         return answer;
4526     } else if (typecode_original ==
4527                BITSET_CONTAINER_TYPE_CODE) {  // run conversions on bitset
4528         // does bitset need conversion to run?
4529         bitset_container_t *c_qua_bitset = (bitset_container_t *)c;
4530         int32_t n_runs = bitset_container_number_of_runs(c_qua_bitset);
4531         int32_t size_as_run_container =
4532             run_container_serialized_size_in_bytes(n_runs);
4533         int32_t size_as_bitset_container =
4534             bitset_container_serialized_size_in_bytes();
4535 
4536         if (size_as_bitset_container <= size_as_run_container) {
4537             // no conversion needed.
4538             *typecode_after = BITSET_CONTAINER_TYPE_CODE;
4539             return c;
4540         }
4541         // bitset to runcontainer (ported from Java  RunContainer(
4542         // BitmapContainer bc, int nbrRuns))
4543         assert(n_runs > 0);  // no empty bitmaps
4544         run_container_t *answer = run_container_create_given_capacity(n_runs);
4545 
4546         int long_ctr = 0;
4547         uint64_t cur_word = c_qua_bitset->array[0];
4548         int run_count = 0;
4549         while (true) {
4550             while (cur_word == UINT64_C(0) &&
4551                    long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
4552                 cur_word = c_qua_bitset->array[++long_ctr];
4553 
4554             if (cur_word == UINT64_C(0)) {
4555                 bitset_container_free(c_qua_bitset);
4556                 *typecode_after = RUN_CONTAINER_TYPE_CODE;
4557                 return answer;
4558             }
4559 
4560             int local_run_start = __builtin_ctzll(cur_word);
4561             int run_start = local_run_start + 64 * long_ctr;
4562             uint64_t cur_word_with_1s = cur_word | (cur_word - 1);
4563 
4564             int run_end = 0;
4565             while (cur_word_with_1s == UINT64_C(0xFFFFFFFFFFFFFFFF) &&
4566                    long_ctr < BITSET_CONTAINER_SIZE_IN_WORDS - 1)
4567                 cur_word_with_1s = c_qua_bitset->array[++long_ctr];
4568 
4569             if (cur_word_with_1s == UINT64_C(0xFFFFFFFFFFFFFFFF)) {
4570                 run_end = 64 + long_ctr * 64;  // exclusive, I guess
4571                 add_run(answer, run_start, run_end - 1);
4572                 bitset_container_free(c_qua_bitset);
4573                 *typecode_after = RUN_CONTAINER_TYPE_CODE;
4574                 return answer;
4575             }
4576             int local_run_end = __builtin_ctzll(~cur_word_with_1s);
4577             run_end = local_run_end + long_ctr * 64;
4578             add_run(answer, run_start, run_end - 1);
4579             run_count++;
4580             cur_word = cur_word_with_1s & (cur_word_with_1s + 1);
4581         }
4582         return answer;
4583     } else {
4584         assert(false);
4585         __builtin_unreachable();
4586         return NULL;
4587     }
4588 }
4589 
bitset_container_from_run_range(const run_container_t * run,uint32_t min,uint32_t max)4590 bitset_container_t *bitset_container_from_run_range(const run_container_t *run,
4591                                                     uint32_t min, uint32_t max) {
4592     bitset_container_t *bitset = bitset_container_create();
4593     int32_t union_cardinality = 0;
4594     for (int32_t i = 0; i < run->n_runs; ++i) {
4595         uint32_t rle_min = run->runs[i].value;
4596         uint32_t rle_max = rle_min + run->runs[i].length;
4597         bitset_set_lenrange(bitset->array, rle_min, rle_max - rle_min);
4598         union_cardinality += run->runs[i].length + 1;
4599     }
4600     union_cardinality += max - min + 1;
4601     union_cardinality -= bitset_lenrange_cardinality(bitset->array, min, max-min);
4602     bitset_set_lenrange(bitset->array, min, max - min);
4603     bitset->cardinality = union_cardinality;
4604     return bitset;
4605 }
4606 /* end file src/containers/convert.c */
4607 /* begin file src/containers/mixed_andnot.c */
4608 /*
4609  * mixed_andnot.c.  More methods since operation is not symmetric,
4610  * except no "wide" andnot , so no lazy options motivated.
4611  */
4612 
4613 #include <assert.h>
4614 #include <string.h>
4615 
4616 
4617 /* Compute the andnot of src_1 and src_2 and write the result to
4618  * dst, a valid array container that could be the same as dst.*/
array_bitset_container_andnot(const array_container_t * src_1,const bitset_container_t * src_2,array_container_t * dst)4619 void array_bitset_container_andnot(const array_container_t *src_1,
4620                                    const bitset_container_t *src_2,
4621                                    array_container_t *dst) {
4622     // follows Java implementation as of June 2016
4623     if (dst->capacity < src_1->cardinality) {
4624         array_container_grow(dst, src_1->cardinality, false);
4625     }
4626     int32_t newcard = 0;
4627     const int32_t origcard = src_1->cardinality;
4628     for (int i = 0; i < origcard; ++i) {
4629         uint16_t key = src_1->array[i];
4630         dst->array[newcard] = key;
4631         newcard += 1 - bitset_container_contains(src_2, key);
4632     }
4633     dst->cardinality = newcard;
4634 }
4635 
/* Compute the andnot of src_1 and src_2 and write the result to
 * src_1 (in-place variant: forwards to array_bitset_container_andnot with
 * src_1 as both the first operand and the destination). */

void array_bitset_container_iandnot(array_container_t *src_1,
                                    const bitset_container_t *src_2) {
    array_bitset_container_andnot(src_1, src_2, src_1);
}
4643 
4644 /* Compute the andnot of src_1 and src_2 and write the result to
4645  * dst, which does not initially have a valid container.
4646  * Return true for a bitset result; false for array
4647  */
4648 
bitset_array_container_andnot(const bitset_container_t * src_1,const array_container_t * src_2,void ** dst)4649 bool bitset_array_container_andnot(const bitset_container_t *src_1,
4650                                    const array_container_t *src_2, void **dst) {
4651     // Java did this directly, but we have option of asm or avx
4652     bitset_container_t *result = bitset_container_create();
4653     bitset_container_copy(src_1, result);
4654     result->cardinality =
4655         (int32_t)bitset_clear_list(result->array, (uint64_t)result->cardinality,
4656                                    src_2->array, (uint64_t)src_2->cardinality);
4657 
4658     // do required type conversions.
4659     if (result->cardinality <= DEFAULT_MAX_SIZE) {
4660         *dst = array_container_from_bitset(result);
4661         bitset_container_free(result);
4662         return false;
4663     }
4664     *dst = result;
4665     return true;
4666 }
4667 
4668 /* Compute the andnot of src_1 and src_2 and write the result to
4669  * dst (which has no container initially).  It will modify src_1
4670  * to be dst if the result is a bitset.  Otherwise, it will
4671  * free src_1 and dst will be a new array container.  In both
4672  * cases, the caller is responsible for deallocating dst.
4673  * Returns true iff dst is a bitset  */
4674 
bitset_array_container_iandnot(bitset_container_t * src_1,const array_container_t * src_2,void ** dst)4675 bool bitset_array_container_iandnot(bitset_container_t *src_1,
4676                                     const array_container_t *src_2,
4677                                     void **dst) {
4678     *dst = src_1;
4679     src_1->cardinality =
4680         (int32_t)bitset_clear_list(src_1->array, (uint64_t)src_1->cardinality,
4681                                    src_2->array, (uint64_t)src_2->cardinality);
4682 
4683     if (src_1->cardinality <= DEFAULT_MAX_SIZE) {
4684         *dst = array_container_from_bitset(src_1);
4685         bitset_container_free(src_1);
4686         return false;  // not bitset
4687     } else
4688         return true;
4689 }
4690 
4691 /* Compute the andnot of src_1 and src_2 and write the result to
4692  * dst. Result may be either a bitset or an array container
4693  * (returns "result is bitset"). dst does not initially have
4694  * any container, but becomes either a bitset container (return
4695  * result true) or an array container.
4696  */
4697 
run_bitset_container_andnot(const run_container_t * src_1,const bitset_container_t * src_2,void ** dst)4698 bool run_bitset_container_andnot(const run_container_t *src_1,
4699                                  const bitset_container_t *src_2, void **dst) {
4700     // follows the Java implementation as of June 2016
4701     int card = run_container_cardinality(src_1);
4702     if (card <= DEFAULT_MAX_SIZE) {
4703         // must be an array
4704         array_container_t *answer = array_container_create_given_capacity(card);
4705         answer->cardinality = 0;
4706         for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
4707             rle16_t rle = src_1->runs[rlepos];
4708             for (int run_value = rle.value; run_value <= rle.value + rle.length;
4709                  ++run_value) {
4710                 if (!bitset_container_get(src_2, (uint16_t)run_value)) {
4711                     answer->array[answer->cardinality++] = (uint16_t)run_value;
4712                 }
4713             }
4714         }
4715         *dst = answer;
4716         return false;
4717     } else {  // we guess it will be a bitset, though have to check guess when
4718               // done
4719         bitset_container_t *answer = bitset_container_clone(src_2);
4720 
4721         uint32_t last_pos = 0;
4722         for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
4723             rle16_t rle = src_1->runs[rlepos];
4724 
4725             uint32_t start = rle.value;
4726             uint32_t end = start + rle.length + 1;
4727             bitset_reset_range(answer->array, last_pos, start);
4728             bitset_flip_range(answer->array, start, end);
4729             last_pos = end;
4730         }
4731         bitset_reset_range(answer->array, last_pos, (uint32_t)(1 << 16));
4732 
4733         answer->cardinality = bitset_container_compute_cardinality(answer);
4734 
4735         if (answer->cardinality <= DEFAULT_MAX_SIZE) {
4736             *dst = array_container_from_bitset(answer);
4737             bitset_container_free(answer);
4738             return false;  // not bitset
4739         }
4740         *dst = answer;
4741         return true;  // bitset
4742     }
4743 }
4744 
4745 /* Compute the andnot of src_1 and src_2 and write the result to
4746  * dst. Result may be either a bitset or an array container
4747  * (returns "result is bitset"). dst does not initially have
4748  * any container, but becomes either a bitset container (return
4749  * result true) or an array container.
4750  */
4751 
run_bitset_container_iandnot(run_container_t * src_1,const bitset_container_t * src_2,void ** dst)4752 bool run_bitset_container_iandnot(run_container_t *src_1,
4753                                   const bitset_container_t *src_2, void **dst) {
4754     // dummy implementation
4755     bool ans = run_bitset_container_andnot(src_1, src_2, dst);
4756     run_container_free(src_1);
4757     return ans;
4758 }
4759 
4760 /* Compute the andnot of src_1 and src_2 and write the result to
4761  * dst. Result may be either a bitset or an array container
4762  * (returns "result is bitset").  dst does not initially have
4763  * any container, but becomes either a bitset container (return
4764  * result true) or an array container.
4765  */
4766 
bitset_run_container_andnot(const bitset_container_t * src_1,const run_container_t * src_2,void ** dst)4767 bool bitset_run_container_andnot(const bitset_container_t *src_1,
4768                                  const run_container_t *src_2, void **dst) {
4769     // follows Java implementation
4770     bitset_container_t *result = bitset_container_create();
4771 
4772     bitset_container_copy(src_1, result);
4773     for (int32_t rlepos = 0; rlepos < src_2->n_runs; ++rlepos) {
4774         rle16_t rle = src_2->runs[rlepos];
4775         bitset_reset_range(result->array, rle.value,
4776                            rle.value + rle.length + UINT32_C(1));
4777     }
4778     result->cardinality = bitset_container_compute_cardinality(result);
4779 
4780     if (result->cardinality <= DEFAULT_MAX_SIZE) {
4781         *dst = array_container_from_bitset(result);
4782         bitset_container_free(result);
4783         return false;  // not bitset
4784     }
4785     *dst = result;
4786     return true;  // bitset
4787 }
4788 
4789 /* Compute the andnot of src_1 and src_2 and write the result to
4790  * dst (which has no container initially).  It will modify src_1
4791  * to be dst if the result is a bitset.  Otherwise, it will
4792  * free src_1 and dst will be a new array container.  In both
4793  * cases, the caller is responsible for deallocating dst.
4794  * Returns true iff dst is a bitset  */
4795 
bitset_run_container_iandnot(bitset_container_t * src_1,const run_container_t * src_2,void ** dst)4796 bool bitset_run_container_iandnot(bitset_container_t *src_1,
4797                                   const run_container_t *src_2, void **dst) {
4798     *dst = src_1;
4799 
4800     for (int32_t rlepos = 0; rlepos < src_2->n_runs; ++rlepos) {
4801         rle16_t rle = src_2->runs[rlepos];
4802         bitset_reset_range(src_1->array, rle.value,
4803                            rle.value + rle.length + UINT32_C(1));
4804     }
4805     src_1->cardinality = bitset_container_compute_cardinality(src_1);
4806 
4807     if (src_1->cardinality <= DEFAULT_MAX_SIZE) {
4808         *dst = array_container_from_bitset(src_1);
4809         bitset_container_free(src_1);
4810         return false;  // not bitset
4811     } else
4812         return true;
4813 }
4814 
/* helper. Writes to a_out every value covered by the runs of r, except the
 * values that appear in a_in. a_out must be a valid array container with
 * adequate capacity; only a_out->array is written here (the caller stores
 * the returned count as the cardinality).
 * Returns the cardinality of the output container. Partly Based on Java
 * implementation Util.unsignedDifference.
 *
 * TODO: Util.unsignedDifference does not use advanceUntil.  Is it cheaper
 * to avoid advanceUntil?
 */

static int run_array_array_subtract(const run_container_t *r,
                                    const array_container_t *a_in,
                                    array_container_t *a_out) {
    int out_card = 0;
    int32_t in_array_pos =
        -1;  // since advanceUntil always assumes we start the search AFTER this

    for (int rlepos = 0; rlepos < r->n_runs; rlepos++) {
        // current run as the half-open interval [start, end)
        int32_t start = r->runs[rlepos].value;
        int32_t end = start + r->runs[rlepos].length + 1;

        // NOTE(review): presumably returns the first index after in_array_pos
        // whose value is >= start, or a_in->cardinality if there is none;
        // confirm against advanceUntil
        in_array_pos = advanceUntil(a_in->array, in_array_pos,
                                    a_in->cardinality, (uint16_t)start);

        if (in_array_pos >= a_in->cardinality) {  // run has no items subtracted
            for (int32_t i = start; i < end; ++i)
                a_out->array[out_card++] = (uint16_t)i;
        } else {
            // first candidate of a_in that could fall inside this run
            uint16_t next_nonincluded = a_in->array[in_array_pos];
            if (next_nonincluded >= end) {
                // another case when run goes unaltered
                for (int32_t i = start; i < end; ++i)
                    a_out->array[out_card++] = (uint16_t)i;
                in_array_pos--;  // ensure we see this item again if necessary
            } else {
                // copy the run, skipping each value that matches the current
                // a_in candidate and then moving on to the next candidate
                for (int32_t i = start; i < end; ++i)
                    if (i != next_nonincluded)
                        a_out->array[out_card++] = (uint16_t)i;
                    else  // 0 should ensure  we don't match
                        next_nonincluded =
                            (in_array_pos + 1 >= a_in->cardinality)
                                ? 0
                                : a_in->array[++in_array_pos];
                in_array_pos--;  // see again
            }
        }
    }
    return out_card;
}
4862 
4863 /* dst does not indicate a valid container initially.  Eventually it
4864  * can become any type of container.
4865  */
4866 
run_array_container_andnot(const run_container_t * src_1,const array_container_t * src_2,void ** dst)4867 int run_array_container_andnot(const run_container_t *src_1,
4868                                const array_container_t *src_2, void **dst) {
4869     // follows the Java impl as of June 2016
4870 
4871     int card = run_container_cardinality(src_1);
4872     const int arbitrary_threshold = 32;
4873 
4874     if (card <= arbitrary_threshold) {
4875         if (src_2->cardinality == 0) {
4876             *dst = run_container_clone(src_1);
4877             return RUN_CONTAINER_TYPE_CODE;
4878         }
4879         // Java's "lazyandNot.toEfficientContainer" thing
4880         run_container_t *answer = run_container_create_given_capacity(
4881             card + array_container_cardinality(src_2));
4882 
4883         int rlepos = 0;
4884         int xrlepos = 0;  // "x" is src_2
4885         rle16_t rle = src_1->runs[rlepos];
4886         int32_t start = rle.value;
4887         int32_t end = start + rle.length + 1;
4888         int32_t xstart = src_2->array[xrlepos];
4889 
4890         while ((rlepos < src_1->n_runs) && (xrlepos < src_2->cardinality)) {
4891             if (end <= xstart) {
4892                 // output the first run
4893                 answer->runs[answer->n_runs++] =
4894                     (rle16_t){.value = (uint16_t)start,
4895                               .length = (uint16_t)(end - start - 1)};
4896                 rlepos++;
4897                 if (rlepos < src_1->n_runs) {
4898                     start = src_1->runs[rlepos].value;
4899                     end = start + src_1->runs[rlepos].length + 1;
4900                 }
4901             } else if (xstart + 1 <= start) {
4902                 // exit the second run
4903                 xrlepos++;
4904                 if (xrlepos < src_2->cardinality) {
4905                     xstart = src_2->array[xrlepos];
4906                 }
4907             } else {
4908                 if (start < xstart) {
4909                     answer->runs[answer->n_runs++] =
4910                         (rle16_t){.value = (uint16_t)start,
4911                                   .length = (uint16_t)(xstart - start - 1)};
4912                 }
4913                 if (xstart + 1 < end) {
4914                     start = xstart + 1;
4915                 } else {
4916                     rlepos++;
4917                     if (rlepos < src_1->n_runs) {
4918                         start = src_1->runs[rlepos].value;
4919                         end = start + src_1->runs[rlepos].length + 1;
4920                     }
4921                 }
4922             }
4923         }
4924         if (rlepos < src_1->n_runs) {
4925             answer->runs[answer->n_runs++] =
4926                 (rle16_t){.value = (uint16_t)start,
4927                           .length = (uint16_t)(end - start - 1)};
4928             rlepos++;
4929             if (rlepos < src_1->n_runs) {
4930                 memcpy(answer->runs + answer->n_runs, src_1->runs + rlepos,
4931                        (src_1->n_runs - rlepos) * sizeof(rle16_t));
4932                 answer->n_runs += (src_1->n_runs - rlepos);
4933             }
4934         }
4935         uint8_t return_type;
4936         *dst = convert_run_to_efficient_container(answer, &return_type);
4937         if (answer != *dst) run_container_free(answer);
4938         return return_type;
4939     }
4940     // else it's a bitmap or array
4941 
4942     if (card <= DEFAULT_MAX_SIZE) {
4943         array_container_t *ac = array_container_create_given_capacity(card);
4944         // nb Java code used a generic iterator-based merge to compute
4945         // difference
4946         ac->cardinality = run_array_array_subtract(src_1, src_2, ac);
4947         *dst = ac;
4948         return ARRAY_CONTAINER_TYPE_CODE;
4949     }
4950     bitset_container_t *ans = bitset_container_from_run(src_1);
4951     bool result_is_bitset = bitset_array_container_iandnot(ans, src_2, dst);
4952     return (result_is_bitset ? BITSET_CONTAINER_TYPE_CODE
4953                              : ARRAY_CONTAINER_TYPE_CODE);
4954 }
4955 
4956 /* Compute the andnot of src_1 and src_2 and write the result to
4957  * dst (which has no container initially).  It will modify src_1
4958  * to be dst if the result is a bitset.  Otherwise, it will
4959  * free src_1 and dst will be a new array container.  In both
4960  * cases, the caller is responsible for deallocating dst.
4961  * Returns true iff dst is a bitset  */
4962 
run_array_container_iandnot(run_container_t * src_1,const array_container_t * src_2,void ** dst)4963 int run_array_container_iandnot(run_container_t *src_1,
4964                                 const array_container_t *src_2, void **dst) {
4965     // dummy implementation same as June 2016 Java
4966     int ans = run_array_container_andnot(src_1, src_2, dst);
4967     run_container_free(src_1);
4968     return ans;
4969 }
4970 
4971 /* dst must be a valid array container, allowed to be src_1 */
4972 
/*
 * Stores in dst every value of src_1 that is not covered by a run of src_2.
 * dst must already be a valid array container; it may be src_1 itself.
 */
void array_run_container_andnot(const array_container_t *src_1,
                                const run_container_t *src_2,
                                array_container_t *dst) {
    if (dst->capacity < src_1->cardinality) {
        array_container_grow(dst, src_1->cardinality, false);
    }

    // Nothing to subtract: the result is a plain copy of src_1.
    if (src_2->n_runs == 0) {
        memmove(dst->array, src_1->array,
                sizeof(uint16_t) * src_1->cardinality);
        dst->cardinality = src_1->cardinality;
        return;
    }

    // Bounds larger than any 16-bit value; installed once the runs are
    // exhausted so that every remaining array value is kept.
    const int32_t beyond_all_values = (1 << 16) + 1;

    int32_t cur_first = src_2->runs[0].value;
    int32_t cur_last = cur_first + src_2->runs[0].length;
    int cur_run = 0;
    int kept = 0;

    for (int k = 0; k < src_1->cardinality; ++k) {
        const uint16_t item = src_1->array[k];

        // Move forward to the first run that does not end before item.
        while (item > cur_last) {
            if (cur_run + 1 < src_2->n_runs) {
                ++cur_run;
                cur_first = src_2->runs[cur_run].value;
                cur_last = cur_first + src_2->runs[cur_run].length;
            } else {
                cur_first = cur_last = beyond_all_values;
            }
        }

        // Here item <= cur_last, so it survives only if it precedes the run.
        if (item < cur_first) {
            dst->array[kept++] = item;
        }
    }
    dst->cardinality = kept;
}
5014 
/* In-place variant: src_1 is overwritten with the andnot of src_1 and
 * src_2. The result stays in the array container src_1.
 */
5018 
/*
 * In-place andnot: removes from src_1 the values covered by src_2 by calling
 * array_run_container_andnot with src_1 as both input and destination.
 */
void array_run_container_iandnot(array_container_t *src_1,
                                 const run_container_t *src_2) {
    array_run_container_andnot(src_1, src_2, src_1);
}
5023 
5024 /* dst does not indicate a valid container initially.  Eventually it
5025  * can become any kind of container.
5026  */
5027 
/*
 * Computes src_1 AND NOT src_2 into a freshly created run container, then
 * hands it to convert_run_to_efficient_container_and_free.  The container
 * returned by that call is stored in *dst and its typecode is returned.
 */
int run_run_container_andnot(const run_container_t *src_1,
                             const run_container_t *src_2, void **dst) {
    uint8_t result_typecode;
    run_container_t *difference = run_container_create();

    run_container_andnot(src_1, src_2, difference);
    *dst = convert_run_to_efficient_container_and_free(difference,
                                                       &result_typecode);
    return result_typecode;
}
5036 
/* Compute the andnot of src_1 and src_2 and write the result to
 * dst (which has no container initially).  src_1 is always freed;
 * the result is produced by run_run_container_andnot.  The caller
 * is responsible for deallocating dst.
 * Returns the typecode of the container stored in dst  */
5043 
/*
 * "In-place" andnot of two run containers.  There is no real in-place work:
 * the result is built by run_run_container_andnot and stored in *dst, then
 * src_1 is freed unconditionally.  Returns whatever
 * run_run_container_andnot returned.
 */
int run_run_container_iandnot(run_container_t *src_1,
                              const run_container_t *src_2, void **dst) {
    // following Java impl as of June 2016 (dummy)
    int ans = run_run_container_andnot(src_1, src_2, dst);
    // src_1 is no longer needed once the result has been produced in *dst
    run_container_free(src_1);
    return ans;
}
5051 
5052 /*
5053  * dst is a valid array container and may be the same as src_1
5054  */
5055 
/*
 * Andnot of two array containers; forwards all three arguments unchanged to
 * array_container_andnot.
 */
void array_array_container_andnot(const array_container_t *src_1,
                                  const array_container_t *src_2,
                                  array_container_t *dst) {
    array_container_andnot(src_1, src_2, dst);
}
5061 
5062 /* inplace array-array andnot will always be able to reuse the space of
5063  * src_1 */
/*
 * In-place andnot of two array containers: calls array_container_andnot with
 * src_1 as both the first input and the destination.
 */
void array_array_container_iandnot(array_container_t *src_1,
                                   const array_container_t *src_2) {
    array_container_andnot(src_1, src_2, src_1);
}
5068 
5069 /* Compute the andnot of src_1 and src_2 and write the result to
5070  * dst (which has no container initially). Return value is
5071  * "dst is a bitset"
5072  */
5073 
bitset_bitset_container_andnot(const bitset_container_t * src_1,const bitset_container_t * src_2,void ** dst)5074 bool bitset_bitset_container_andnot(const bitset_container_t *src_1,
5075                                     const bitset_container_t *src_2,
5076                                     void **dst) {
5077     bitset_container_t *ans = bitset_container_create();
5078     int card = bitset_container_andnot(src_1, src_2, ans);
5079     if (card <= DEFAULT_MAX_SIZE) {
5080         *dst = array_container_from_bitset(ans);
5081         bitset_container_free(ans);
5082         return false;  // not bitset
5083     } else {
5084         *dst = ans;
5085         return true;
5086     }
5087 }
5088 
5089 /* Compute the andnot of src_1 and src_2 and write the result to
5090  * dst (which has no container initially).  It will modify src_1
5091  * to be dst if the result is a bitset.  Otherwise, it will
5092  * free src_1 and dst will be a new array container.  In both
5093  * cases, the caller is responsible for deallocating dst.
5094  * Returns true iff dst is a bitset  */
5095 
/*
 * In-place andnot: src_1 is overwritten with src_1 AND NOT src_2.  If the
 * resulting cardinality exceeds DEFAULT_MAX_SIZE, *dst is set to src_1 and
 * true is returned.  Otherwise *dst receives an array container built from
 * src_1, src_1 is freed, and false is returned.
 */
bool bitset_bitset_container_iandnot(bitset_container_t *src_1,
                                     const bitset_container_t *src_2,
                                     void **dst) {
    const int cardinality = bitset_container_andnot(src_1, src_2, src_1);

    if (cardinality > DEFAULT_MAX_SIZE) {
        *dst = src_1;
        return true;  // result stays a bitset
    }

    *dst = array_container_from_bitset(src_1);
    bitset_container_free(src_1);
    return false;  // result is an array
}
5109 /* end file src/containers/mixed_andnot.c */
5110 /* begin file src/containers/mixed_equal.c */
5111 
array_container_equal_bitset(const array_container_t * container1,const bitset_container_t * container2)5112 bool array_container_equal_bitset(const array_container_t* container1,
5113                                   const bitset_container_t* container2) {
5114     if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) {
5115         if (container2->cardinality != container1->cardinality) {
5116             return false;
5117         }
5118     }
5119     int32_t pos = 0;
5120     for (int32_t i = 0; i < BITSET_CONTAINER_SIZE_IN_WORDS; ++i) {
5121         uint64_t w = container2->array[i];
5122         while (w != 0) {
5123             uint64_t t = w & (~w + 1);
5124             uint16_t r = i * 64 + __builtin_ctzll(w);
5125             if (pos >= container1->cardinality) {
5126                 return false;
5127             }
5128             if (container1->array[pos] != r) {
5129                 return false;
5130             }
5131             ++pos;
5132             w ^= t;
5133         }
5134     }
5135     return (pos == container1->cardinality);
5136 }
5137 
run_container_equals_array(const run_container_t * container1,const array_container_t * container2)5138 bool run_container_equals_array(const run_container_t* container1,
5139                                 const array_container_t* container2) {
5140     if (run_container_cardinality(container1) != container2->cardinality)
5141         return false;
5142     int32_t pos = 0;
5143     for (int i = 0; i < container1->n_runs; ++i) {
5144         const uint32_t run_start = container1->runs[i].value;
5145         const uint32_t le = container1->runs[i].length;
5146 
5147         if (container2->array[pos] != run_start) {
5148             return false;
5149         }
5150 
5151         if (container2->array[pos + le] != run_start + le) {
5152             return false;
5153         }
5154 
5155         pos += le + 1;
5156     }
5157     return true;
5158 }
5159 
run_container_equals_bitset(const run_container_t * container1,const bitset_container_t * container2)5160 bool run_container_equals_bitset(const run_container_t* container1,
5161                                  const bitset_container_t* container2) {
5162 
5163     int run_card = run_container_cardinality(container1);
5164     int bitset_card = (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) ?
5165                       container2->cardinality :
5166                       bitset_container_compute_cardinality(container2);
5167     if (bitset_card != run_card) {
5168         return false;
5169     }
5170 
5171     for (int32_t i = 0; i < container1->n_runs; i++) {
5172         uint32_t begin = container1->runs[i].value;
5173         if (container1->runs[i].length) {
5174             uint32_t end = begin + container1->runs[i].length + 1;
5175             if (!bitset_container_contains_range(container2, begin, end)) {
5176                 return false;
5177             }
5178         } else {
5179             if (!bitset_container_contains(container2, begin)) {
5180                 return false;
5181             }
5182         }
5183     }
5184 
5185     return true;
5186 }
5187 /* end file src/containers/mixed_equal.c */
5188 /* begin file src/containers/mixed_intersection.c */
5189 /*
5190  * mixed_intersection.c
5191  *
5192  */
5193 
5194 
5195 /* Compute the intersection of src_1 and src_2 and write the result to
5196  * dst.  */
/*
 * Stores in dst the values of src_1 that are also set in src_2.
 * dst is grown (without preserving its contents) when its capacity is below
 * the cardinality of src_1.
 */
void array_bitset_container_intersection(const array_container_t *src_1,
                                         const bitset_container_t *src_2,
                                         array_container_t *dst) {
    if (src_1->cardinality > dst->capacity) {
        array_container_grow(dst, src_1->cardinality, false);
    }

    // Read the input size up front: dst could be src_1.
    const int32_t input_count = src_1->cardinality;
    int32_t kept = 0;
    int32_t idx = 0;
    while (idx < input_count) {
        const uint16_t candidate = src_1->array[idx++];
        // Branchless on purpose: always store the candidate, then advance
        // the output cursor only when the bitset contains it.  A conditional
        // store suffers from branch mispredictions on unpredictable data.
        // See
        // https://github.com/lemire/Code-used-on-Daniel-Lemire-s-blog/blob/master/extra/bitset/c/arraybitsetintersection.c
        dst->array[kept] = candidate;
        kept += bitset_container_contains(src_2, candidate);
    }
    dst->cardinality = kept;
}
5225 
5226 /* Compute the size of the intersection of src_1 and src_2. */
array_bitset_container_intersection_cardinality(const array_container_t * src_1,const bitset_container_t * src_2)5227 int array_bitset_container_intersection_cardinality(
5228     const array_container_t *src_1, const bitset_container_t *src_2) {
5229     int32_t newcard = 0;
5230     const int32_t origcard = src_1->cardinality;
5231     for (int i = 0; i < origcard; ++i) {
5232         uint16_t key = src_1->array[i];
5233         newcard += bitset_container_contains(src_2, key);
5234     }
5235     return newcard;
5236 }
5237 
5238 
array_bitset_container_intersect(const array_container_t * src_1,const bitset_container_t * src_2)5239 bool array_bitset_container_intersect(const array_container_t *src_1,
5240                                          const bitset_container_t *src_2) {
5241 	const int32_t origcard = src_1->cardinality;
5242 	for (int i = 0; i < origcard; ++i) {
5243 	        uint16_t key = src_1->array[i];
5244 	        if(bitset_container_contains(src_2, key)) return true;
5245 	}
5246 	return false;
5247 }
5248 
5249 /* Compute the intersection of src_1 and src_2 and write the result to
5250  * dst. It is allowed for dst to be equal to src_1. We assume that dst is a
5251  * valid container. */
/*
 * Computes the intersection of src_1 and src_2 and stores it in dst.
 * dst must be a valid array container and may be the same object as src_1.
 */
void array_run_container_intersection(const array_container_t *src_1,
                                      const run_container_t *src_2,
                                      array_container_t *dst) {
    // A full run container keeps every value of src_1.
    if (run_container_is_full(src_2)) {
        if (dst != src_1) array_container_copy(src_1, dst);
        return;
    }
    // An empty run container intersects with nothing.  The cardinality has
    // to be reset explicitly: when dst is src_1 (or still holds earlier
    // content) it would otherwise keep reporting its old values.
    if (src_2->n_runs == 0) {
        dst->cardinality = 0;
        return;
    }
    if (dst->capacity < src_1->cardinality) {
        array_container_grow(dst, src_1->cardinality, false);
    }
    int32_t rlepos = 0;
    int32_t arraypos = 0;
    rle16_t rle = src_2->runs[rlepos];
    int32_t newcard = 0;
    while (arraypos < src_1->cardinality) {
        const uint16_t arrayval = src_1->array[arraypos];
        // Skip the runs that end before arrayval.
        while (rle.value + rle.length <
               arrayval) {  // this will frequently be false
            ++rlepos;
            if (rlepos == src_2->n_runs) {
                dst->cardinality = newcard;
                return;  // no run left: nothing more can match
            }
            rle = src_2->runs[rlepos];
        }
        if (rle.value > arrayval) {
            // arrayval lies in the gap before the current run; jump ahead
            // in the array.  NOTE(review): advanceUntil presumably returns
            // the first later index whose value is >= rle.value -- confirm.
            arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality,
                                    rle.value);
        } else {
            // rle.value <= arrayval <= rle.value + rle.length: keep it.
            // newcard <= arraypos, so this is safe when dst is src_1.
            dst->array[newcard] = arrayval;
            newcard++;
            arraypos++;
        }
    }
    dst->cardinality = newcard;
}
5291 
5292 /* Compute the intersection of src_1 and src_2 and write the result to
5293  * *dst. If the result is true then the result is a bitset_container_t
5294  * otherwise is a array_container_t. If *dst ==  src_2, an in-place processing
5295  * is attempted.*/
run_bitset_container_intersection(const run_container_t * src_1,const bitset_container_t * src_2,void ** dst)5296 bool run_bitset_container_intersection(const run_container_t *src_1,
5297                                        const bitset_container_t *src_2,
5298                                        void **dst) {
5299     if (run_container_is_full(src_1)) {
5300         if (*dst != src_2) *dst = bitset_container_clone(src_2);
5301         return true;
5302     }
5303     int32_t card = run_container_cardinality(src_1);
5304     if (card <= DEFAULT_MAX_SIZE) {
5305         // result can only be an array (assuming that we never make a
5306         // RunContainer)
5307         if (card > src_2->cardinality) {
5308             card = src_2->cardinality;
5309         }
5310         array_container_t *answer = array_container_create_given_capacity(card);
5311         *dst = answer;
5312         if (*dst == NULL) {
5313             return false;
5314         }
5315         for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
5316             rle16_t rle = src_1->runs[rlepos];
5317             uint32_t endofrun = (uint32_t)rle.value + rle.length;
5318             for (uint32_t runValue = rle.value; runValue <= endofrun;
5319                  ++runValue) {
5320                 answer->array[answer->cardinality] = (uint16_t)runValue;
5321                 answer->cardinality +=
5322                     bitset_container_contains(src_2, runValue);
5323             }
5324         }
5325         return false;
5326     }
5327     if (*dst == src_2) {  // we attempt in-place
5328         bitset_container_t *answer = (bitset_container_t *)*dst;
5329         uint32_t start = 0;
5330         for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
5331             const rle16_t rle = src_1->runs[rlepos];
5332             uint32_t end = rle.value;
5333             bitset_reset_range(src_2->array, start, end);
5334 
5335             start = end + rle.length + 1;
5336         }
5337         bitset_reset_range(src_2->array, start, UINT32_C(1) << 16);
5338         answer->cardinality = bitset_container_compute_cardinality(answer);
5339         if (src_2->cardinality > DEFAULT_MAX_SIZE) {
5340             return true;
5341         } else {
5342             array_container_t *newanswer = array_container_from_bitset(src_2);
5343             if (newanswer == NULL) {
5344                 *dst = NULL;
5345                 return false;
5346             }
5347             *dst = newanswer;
5348             return false;
5349         }
5350     } else {  // no inplace
5351         // we expect the answer to be a bitmap (if we are lucky)
5352         bitset_container_t *answer = bitset_container_clone(src_2);
5353 
5354         *dst = answer;
5355         if (answer == NULL) {
5356             return true;
5357         }
5358         uint32_t start = 0;
5359         for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
5360             const rle16_t rle = src_1->runs[rlepos];
5361             uint32_t end = rle.value;
5362             bitset_reset_range(answer->array, start, end);
5363             start = end + rle.length + 1;
5364         }
5365         bitset_reset_range(answer->array, start, UINT32_C(1) << 16);
5366         answer->cardinality = bitset_container_compute_cardinality(answer);
5367 
5368         if (answer->cardinality > DEFAULT_MAX_SIZE) {
5369             return true;
5370         } else {
5371             array_container_t *newanswer = array_container_from_bitset(answer);
5372             bitset_container_free((bitset_container_t *)*dst);
5373             if (newanswer == NULL) {
5374                 *dst = NULL;
5375                 return false;
5376             }
5377             *dst = newanswer;
5378             return false;
5379         }
5380     }
5381 }
5382 
5383 /* Compute the size of the intersection between src_1 and src_2 . */
array_run_container_intersection_cardinality(const array_container_t * src_1,const run_container_t * src_2)5384 int array_run_container_intersection_cardinality(const array_container_t *src_1,
5385                                                  const run_container_t *src_2) {
5386     if (run_container_is_full(src_2)) {
5387         return src_1->cardinality;
5388     }
5389     if (src_2->n_runs == 0) {
5390         return 0;
5391     }
5392     int32_t rlepos = 0;
5393     int32_t arraypos = 0;
5394     rle16_t rle = src_2->runs[rlepos];
5395     int32_t newcard = 0;
5396     while (arraypos < src_1->cardinality) {
5397         const uint16_t arrayval = src_1->array[arraypos];
5398         while (rle.value + rle.length <
5399                arrayval) {  // this will frequently be false
5400             ++rlepos;
5401             if (rlepos == src_2->n_runs) {
5402                 return newcard;  // we are done
5403             }
5404             rle = src_2->runs[rlepos];
5405         }
5406         if (rle.value > arrayval) {
5407             arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality,
5408                                     rle.value);
5409         } else {
5410             newcard++;
5411             arraypos++;
5412         }
5413     }
5414     return newcard;
5415 }
5416 
/* Compute the size of the intersection between src_1 and src_2.
 **/
run_bitset_container_intersection_cardinality(const run_container_t * src_1,const bitset_container_t * src_2)5419 int run_bitset_container_intersection_cardinality(
5420     const run_container_t *src_1, const bitset_container_t *src_2) {
5421     if (run_container_is_full(src_1)) {
5422         return bitset_container_cardinality(src_2);
5423     }
5424     int answer = 0;
5425     for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
5426         rle16_t rle = src_1->runs[rlepos];
5427         answer +=
5428             bitset_lenrange_cardinality(src_2->array, rle.value, rle.length);
5429     }
5430     return answer;
5431 }
5432 
5433 
array_run_container_intersect(const array_container_t * src_1,const run_container_t * src_2)5434 bool array_run_container_intersect(const array_container_t *src_1,
5435                                       const run_container_t *src_2) {
5436 	if( run_container_is_full(src_2) ) {
5437 	    return !array_container_empty(src_1);
5438 	}
5439 	if (src_2->n_runs == 0) {
5440         return false;
5441     }
5442     int32_t rlepos = 0;
5443     int32_t arraypos = 0;
5444     rle16_t rle = src_2->runs[rlepos];
5445     while (arraypos < src_1->cardinality) {
5446         const uint16_t arrayval = src_1->array[arraypos];
5447         while (rle.value + rle.length <
5448                arrayval) {  // this will frequently be false
5449             ++rlepos;
5450             if (rlepos == src_2->n_runs) {
5451                 return false;  // we are done
5452             }
5453             rle = src_2->runs[rlepos];
5454         }
5455         if (rle.value > arrayval) {
5456             arraypos = advanceUntil(src_1->array, arraypos, src_1->cardinality,
5457                                     rle.value);
5458         } else {
5459             return true;
5460         }
5461     }
5462     return false;
5463 }
5464 
/* Check whether src_1 and src_2 have at least one value in common.
 **/
run_bitset_container_intersect(const run_container_t * src_1,const bitset_container_t * src_2)5467 bool run_bitset_container_intersect(const run_container_t *src_1,
5468                                        const bitset_container_t *src_2) {
5469 	   if( run_container_is_full(src_1) ) {
5470 		   return !bitset_container_empty(src_2);
5471 	   }
5472        for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
5473            rle16_t rle = src_1->runs[rlepos];
5474            if(!bitset_lenrange_empty(src_2->array, rle.value,rle.length)) return true;
5475        }
5476        return false;
5477 }
5478 
/*
 * Compute the intersection between src_1 and src_2 and write the result
 * to *dst. If the return value is true, the result is a bitset_container_t;
 * otherwise it is an array_container_t.
 */
bitset_bitset_container_intersection(const bitset_container_t * src_1,const bitset_container_t * src_2,void ** dst)5484 bool bitset_bitset_container_intersection(const bitset_container_t *src_1,
5485                                           const bitset_container_t *src_2,
5486                                           void **dst) {
5487     const int newCardinality = bitset_container_and_justcard(src_1, src_2);
5488     if (newCardinality > DEFAULT_MAX_SIZE) {
5489         *dst = bitset_container_create();
5490         if (*dst != NULL) {
5491             bitset_container_and_nocard(src_1, src_2,
5492                                         (bitset_container_t *)*dst);
5493             ((bitset_container_t *)*dst)->cardinality = newCardinality;
5494         }
5495         return true;  // it is a bitset
5496     }
5497     *dst = array_container_create_given_capacity(newCardinality);
5498     if (*dst != NULL) {
5499         ((array_container_t *)*dst)->cardinality = newCardinality;
5500         bitset_extract_intersection_setbits_uint16(
5501             ((const bitset_container_t *)src_1)->array,
5502             ((const bitset_container_t *)src_2)->array,
5503             BITSET_CONTAINER_SIZE_IN_WORDS, ((array_container_t *)*dst)->array,
5504             0);
5505     }
5506     return false;  // not a bitset
5507 }
5508 
/*
 * In-place flavour of the bitset/bitset intersection.  When the result
 * cardinality exceeds DEFAULT_MAX_SIZE, src_1 is overwritten with the
 * intersection, *dst is set to src_1 and true is returned.  Otherwise a new
 * array container holding the intersection is stored in *dst (NULL on
 * allocation failure), src_1 is left untouched, and false is returned.
 */
bool bitset_bitset_container_intersection_inplace(
    bitset_container_t *src_1, const bitset_container_t *src_2, void **dst) {
    const int result_card = bitset_container_and_justcard(src_1, src_2);

    if (result_card <= DEFAULT_MAX_SIZE) {
        array_container_t *as_array =
            array_container_create_given_capacity(result_card);
        *dst = as_array;
        if (as_array != NULL) {
            as_array->cardinality = result_card;
            bitset_extract_intersection_setbits_uint16(
                src_1->array, src_2->array, BITSET_CONTAINER_SIZE_IN_WORDS,
                as_array->array, 0);
        }
        return false;  // not a bitset
    }

    *dst = src_1;
    bitset_container_and_nocard(src_1, src_2, src_1);
    src_1->cardinality = result_card;
    return true;  // it is a bitset
}
5529 /* end file src/containers/mixed_intersection.c */
5530 /* begin file src/containers/mixed_negation.c */
5531 /*
5532  * mixed_negation.c
5533  *
5534  */
5535 
5536 #include <assert.h>
5537 #include <string.h>
5538 
5539 
5540 // TODO: make simplified and optimized negation code across
5541 // the full range.
5542 
/* Negation across the entire range of the container.
 * Compute the negation of src and write the result
 * to *dst. The complement of a
 * sufficiently sparse set will always be dense and hence a bitmap.
 * We assume that dst is pre-allocated and a valid bitset container.
 * There can be no in-place version.
 */
/*
 * Writes the complement of src into the pre-allocated bitset dst: all bits
 * are set first, then the values listed in src are cleared.  The cardinality
 * returned by bitset_clear_list is stored in dst.
 */
void array_container_negation(const array_container_t *src,
                              bitset_container_t *dst) {
    bitset_container_set_all(dst);

    // cardinality of the bitset right after set_all: 1 << 16
    const uint64_t full_card = UINT64_C(1) << 16;
    const uint64_t remaining = bitset_clear_list(
        dst->array, full_card, src->array, (uint64_t)src->cardinality);
    dst->cardinality = (int32_t)remaining;
}
5558 
5559 /* Negation across the entire range of the container
5560  * Compute the  negation of src  and write the result
5561  * to *dst.  A true return value indicates a bitset result,
5562  * otherwise the result is an array container.
5563  *  We assume that dst is not pre-allocated. In
5564  * case of failure, *dst will be NULL.
5565  */
/*
 * Full-range negation of a bitset container: forwards to
 * bitset_container_negation_range with range bounds 0 and (1 << 16) and
 * returns its result.
 */
bool bitset_container_negation(const bitset_container_t *src, void **dst) {
    return bitset_container_negation_range(src, 0, (1 << 16), dst);
}
5569 
5570 /* inplace version */
5571 /*
5572  * Same as bitset_container_negation except that if the output is to
5573  * be a
5574  * bitset_container_t, then src is modified and no allocation is made.
5575  * If the output is to be an array_container_t, then caller is responsible
5576  * to free the container.
5577  * In all cases, the result is in *dst.
5578  */
/*
 * Full-range in-place negation of a bitset container: forwards to
 * bitset_container_negation_range_inplace with range bounds 0 and (1 << 16)
 * and returns its result.
 */
bool bitset_container_negation_inplace(bitset_container_t *src, void **dst) {
    return bitset_container_negation_range_inplace(src, 0, (1 << 16), dst);
}
5582 
/* Negation across the entire range of container
 * Compute the  negation of src  and write the result
 * to *dst.  Return values are the *_TYPECODES as defined in containers.h
 *  We assume that dst is not pre-allocated. In
 * case of failure, *dst will be NULL.
 */
/*
 * Full-range negation of a run container: forwards to
 * run_container_negation_range with range bounds 0 and (1 << 16) and returns
 * its result.
 */
int run_container_negation(const run_container_t *src, void **dst) {
    return run_container_negation_range(src, 0, (1 << 16), dst);
}
5592 
5593 /*
5594  * Same as run_container_negation except that if the output is to
5595  * be a
5596  * run_container_t, and has the capacity to hold the result,
5597  * then src is modified and no allocation is made.
5598  * In all cases, the result is in *dst.
5599  */
/*
 * Full-range in-place negation of a run container: forwards to
 * run_container_negation_range_inplace with range bounds 0 and (1 << 16) and
 * returns its result.
 */
int run_container_negation_inplace(run_container_t *src, void **dst) {
    return run_container_negation_range_inplace(src, 0, (1 << 16), dst);
}
5603 
5604 /* Negation across a range of the container.
5605  * Compute the  negation of src  and write the result
5606  * to *dst. Returns true if the result is a bitset container
5607  * and false for an array container.  *dst is not preallocated.
5608  */
/*
 * Flips membership of every value in [range_start, range_end) of src and
 * stores the result in a newly allocated container written to *dst.
 *
 * Returns true when *dst is a bitset container (new cardinality above
 * DEFAULT_MAX_SIZE) and false when it is an array container.  An empty range
 * (range_start >= range_end) yields a clone of src.
 *
 * NOTE(review): the results of array_container_clone,
 * bitset_container_from_array and array_container_create_given_capacity are
 * used without a NULL check here.
 */
bool array_container_negation_range(const array_container_t *src,
                                    const int range_start, const int range_end,
                                    void **dst) {
    /* close port of the Java implementation */
    if (range_start >= range_end) {
        *dst = array_container_clone(src);
        return false;
    }

    // Index of the first src value inside the range.
    // NOTE(review): relies on binarySearch returning -(insertion point) - 1
    // when the key is absent -- confirm against its definition.
    int32_t start_index =
        binarySearch(src->array, src->cardinality, (uint16_t)range_start);
    if (start_index < 0) start_index = -start_index - 1;

    // Index of the last src value inside the range (insertion point - 1 when
    // range_end - 1 itself is absent; may be start_index - 1 if the range
    // holds no src value).
    int32_t last_index =
        binarySearch(src->array, src->cardinality, (uint16_t)(range_end - 1));
    if (last_index < 0) last_index = -last_index - 2;

    // Values currently in the range disappear, all other values of the range
    // appear; derive the cardinality of the result from that.
    const int32_t current_values_in_range = last_index - start_index + 1;
    const int32_t span_to_be_flipped = range_end - range_start;
    const int32_t new_values_in_range =
        span_to_be_flipped - current_values_in_range;
    const int32_t cardinality_change =
        new_values_in_range - current_values_in_range;
    const int32_t new_cardinality = src->cardinality + cardinality_change;

    // Too many values for an array: convert to a bitset and flip there.
    if (new_cardinality > DEFAULT_MAX_SIZE) {
        bitset_container_t *temp = bitset_container_from_array(src);
        bitset_flip_range(temp->array, (uint32_t)range_start,
                          (uint32_t)range_end);
        temp->cardinality = new_cardinality;
        *dst = temp;
        return true;
    }

    array_container_t *arr =
        array_container_create_given_capacity(new_cardinality);
    *dst = (void *)arr;
    if(new_cardinality == 0) {
      arr->cardinality = new_cardinality;
      return false; // we are done.
    }
    // copy stuff before the active area
    memcpy(arr->array, src->array, start_index * sizeof(uint16_t));

    // work on the range
    // Walk the range and src in lockstep: a range value equal to the current
    // src value is dropped (and src advances), any other value is emitted.
    int32_t out_pos = start_index, in_pos = start_index;
    int32_t val_in_range = range_start;
    for (; val_in_range < range_end && in_pos <= last_index; ++val_in_range) {
        if ((uint16_t)val_in_range != src->array[in_pos]) {
            arr->array[out_pos++] = (uint16_t)val_in_range;
        } else {
            ++in_pos;
        }
    }
    // src has no more values in the range: emit the rest of the range as is.
    for (; val_in_range < range_end; ++val_in_range)
        arr->array[out_pos++] = (uint16_t)val_in_range;

    // content after the active range
    memcpy(arr->array + out_pos, src->array + (last_index + 1),
           (src->cardinality - (last_index + 1)) * sizeof(uint16_t));
    arr->cardinality = new_cardinality;
    return false;
}
5672 
5673 /* Even when the result would fit, it is unclear how to make an
5674  * inplace version without inefficient copying.
5675  */
5676 
/*
 * "In-place" range negation of an array container.  The result is built by
 * array_container_negation_range into *dst, after which src is freed
 * unconditionally.  Returns what array_container_negation_range returned.
 */
bool array_container_negation_range_inplace(array_container_t *src,
                                            const int range_start,
                                            const int range_end, void **dst) {
    bool ans = array_container_negation_range(src, range_start, range_end, dst);
    // TODO : try a real inplace version
    // src has been fully consumed into *dst and is released here
    array_container_free(src);
    return ans;
}
5685 
5686 /* Negation across a range of the container
5687  * Compute the  negation of src  and write the result
5688  * to *dst.  A true return value indicates a bitset result,
5689  * otherwise the result is an array container.
5690  *  We assume that dst is not pre-allocated. In
5691  * case of failure, *dst will be NULL.
5692  */
bitset_container_negation_range(const bitset_container_t * src,const int range_start,const int range_end,void ** dst)5693 bool bitset_container_negation_range(const bitset_container_t *src,
5694                                      const int range_start, const int range_end,
5695                                      void **dst) {
5696     // TODO maybe consider density-based estimate
5697     // and sometimes build result directly as array, with
5698     // conversion back to bitset if wrong.  Or determine
5699     // actual result cardinality, then go directly for the known final cont.
5700 
5701     // keep computation using bitsets as long as possible.
5702     bitset_container_t *t = bitset_container_clone(src);
5703     bitset_flip_range(t->array, (uint32_t)range_start, (uint32_t)range_end);
5704     t->cardinality = bitset_container_compute_cardinality(t);
5705 
5706     if (t->cardinality > DEFAULT_MAX_SIZE) {
5707         *dst = t;
5708         return true;
5709     } else {
5710         *dst = array_container_from_bitset(t);
5711         bitset_container_free(t);
5712         return false;
5713     }
5714 }
5715 
5716 /* inplace version */
5717 /*
5718  * Same as bitset_container_negation except that if the output is to
5719  * be a
5720  * bitset_container_t, then src is modified and no allocation is made.
5721  * If the output is to be an array_container_t, then caller is responsible
5722  * to free the container.
5723  * In all cases, the result is in *dst.
5724  */
bitset_container_negation_range_inplace(bitset_container_t * src,const int range_start,const int range_end,void ** dst)5725 bool bitset_container_negation_range_inplace(bitset_container_t *src,
5726                                              const int range_start,
5727                                              const int range_end, void **dst) {
5728     bitset_flip_range(src->array, (uint32_t)range_start, (uint32_t)range_end);
5729     src->cardinality = bitset_container_compute_cardinality(src);
5730     if (src->cardinality > DEFAULT_MAX_SIZE) {
5731         *dst = src;
5732         return true;
5733     }
5734     *dst = array_container_from_bitset(src);
5735     bitset_container_free(src);
5736     return false;
5737 }
5738 
5739 /* Negation across a range of container
5740  * Compute the  negation of src  and write the result
5741  * to *dst. Return values are the *_TYPECODES as defined * in containers.h
5742  *  We assume that dst is not pre-allocated. In
5743  * case of failure, *dst will be NULL.
5744  */
run_container_negation_range(const run_container_t * src,const int range_start,const int range_end,void ** dst)5745 int run_container_negation_range(const run_container_t *src,
5746                                  const int range_start, const int range_end,
5747                                  void **dst) {
5748     uint8_t return_typecode;
5749 
5750     // follows the Java implementation
5751     if (range_end <= range_start) {
5752         *dst = run_container_clone(src);
5753         return RUN_CONTAINER_TYPE_CODE;
5754     }
5755 
5756     run_container_t *ans = run_container_create_given_capacity(
5757         src->n_runs + 1);  // src->n_runs + 1);
5758     int k = 0;
5759     for (; k < src->n_runs && src->runs[k].value < range_start; ++k) {
5760         ans->runs[k] = src->runs[k];
5761         ans->n_runs++;
5762     }
5763 
5764     run_container_smart_append_exclusive(
5765         ans, (uint16_t)range_start, (uint16_t)(range_end - range_start - 1));
5766 
5767     for (; k < src->n_runs; ++k) {
5768         run_container_smart_append_exclusive(ans, src->runs[k].value,
5769                                              src->runs[k].length);
5770     }
5771 
5772     *dst = convert_run_to_efficient_container(ans, &return_typecode);
5773     if (return_typecode != RUN_CONTAINER_TYPE_CODE) run_container_free(ans);
5774 
5775     return return_typecode;
5776 }
5777 
5778 /*
5779  * Same as run_container_negation except that if the output is to
5780  * be a
5781  * run_container_t, and has the capacity to hold the result,
5782  * then src is modified and no allocation is made.
5783  * In all cases, the result is in *dst.
5784  */
run_container_negation_range_inplace(run_container_t * src,const int range_start,const int range_end,void ** dst)5785 int run_container_negation_range_inplace(run_container_t *src,
5786                                          const int range_start,
5787                                          const int range_end, void **dst) {
5788     uint8_t return_typecode;
5789 
5790     if (range_end <= range_start) {
5791         *dst = src;
5792         return RUN_CONTAINER_TYPE_CODE;
5793     }
5794 
5795     // TODO: efficient special case when range is 0 to 65535 inclusive
5796 
5797     if (src->capacity == src->n_runs) {
5798         // no excess room.  More checking to see if result can fit
5799         bool last_val_before_range = false;
5800         bool first_val_in_range = false;
5801         bool last_val_in_range = false;
5802         bool first_val_past_range = false;
5803 
5804         if (range_start > 0)
5805             last_val_before_range =
5806                 run_container_contains(src, (uint16_t)(range_start - 1));
5807         first_val_in_range = run_container_contains(src, (uint16_t)range_start);
5808 
5809         if (last_val_before_range == first_val_in_range) {
5810             last_val_in_range =
5811                 run_container_contains(src, (uint16_t)(range_end - 1));
5812             if (range_end != 0x10000)
5813                 first_val_past_range =
5814                     run_container_contains(src, (uint16_t)range_end);
5815 
5816             if (last_val_in_range ==
5817                 first_val_past_range) {  // no space for inplace
5818                 int ans = run_container_negation_range(src, range_start,
5819                                                        range_end, dst);
5820                 run_container_free(src);
5821                 return ans;
5822             }
5823         }
5824     }
5825     // all other cases: result will fit
5826 
5827     run_container_t *ans = src;
5828     int my_nbr_runs = src->n_runs;
5829 
5830     ans->n_runs = 0;
5831     int k = 0;
5832     for (; (k < my_nbr_runs) && (src->runs[k].value < range_start); ++k) {
5833         // ans->runs[k] = src->runs[k]; (would be self-copy)
5834         ans->n_runs++;
5835     }
5836 
5837     // as with Java implementation, use locals to give self a buffer of depth 1
5838     rle16_t buffered = (rle16_t){.value = (uint16_t)0, .length = (uint16_t)0};
5839     rle16_t next = buffered;
5840     if (k < my_nbr_runs) buffered = src->runs[k];
5841 
5842     run_container_smart_append_exclusive(
5843         ans, (uint16_t)range_start, (uint16_t)(range_end - range_start - 1));
5844 
5845     for (; k < my_nbr_runs; ++k) {
5846         if (k + 1 < my_nbr_runs) next = src->runs[k + 1];
5847 
5848         run_container_smart_append_exclusive(ans, buffered.value,
5849                                              buffered.length);
5850         buffered = next;
5851     }
5852 
5853     *dst = convert_run_to_efficient_container(ans, &return_typecode);
5854     if (return_typecode != RUN_CONTAINER_TYPE_CODE) run_container_free(ans);
5855 
5856     return return_typecode;
5857 }
5858 /* end file src/containers/mixed_negation.c */
5859 /* begin file src/containers/mixed_subset.c */
5860 
array_container_is_subset_bitset(const array_container_t * container1,const bitset_container_t * container2)5861 bool array_container_is_subset_bitset(const array_container_t* container1,
5862                                       const bitset_container_t* container2) {
5863     if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) {
5864         if (container2->cardinality < container1->cardinality) {
5865             return false;
5866         }
5867     }
5868     for (int i = 0; i < container1->cardinality; ++i) {
5869         if (!bitset_container_contains(container2, container1->array[i])) {
5870             return false;
5871         }
5872     }
5873     return true;
5874 }
5875 
run_container_is_subset_array(const run_container_t * container1,const array_container_t * container2)5876 bool run_container_is_subset_array(const run_container_t* container1,
5877                                    const array_container_t* container2) {
5878     if (run_container_cardinality(container1) > container2->cardinality)
5879         return false;
5880     int32_t start_pos = -1, stop_pos = -1;
5881     for (int i = 0; i < container1->n_runs; ++i) {
5882         int32_t start = container1->runs[i].value;
5883         int32_t stop = start + container1->runs[i].length;
5884         start_pos = advanceUntil(container2->array, stop_pos,
5885                                  container2->cardinality, start);
5886         stop_pos = advanceUntil(container2->array, stop_pos,
5887                                 container2->cardinality, stop);
5888         if (start_pos == container2->cardinality) {
5889             return false;
5890         } else if (stop_pos - start_pos != stop - start ||
5891                    container2->array[start_pos] != start ||
5892                    container2->array[stop_pos] != stop) {
5893             return false;
5894         }
5895     }
5896     return true;
5897 }
5898 
array_container_is_subset_run(const array_container_t * container1,const run_container_t * container2)5899 bool array_container_is_subset_run(const array_container_t* container1,
5900                                    const run_container_t* container2) {
5901     if (container1->cardinality > run_container_cardinality(container2))
5902         return false;
5903     int i_array = 0, i_run = 0;
5904     while (i_array < container1->cardinality && i_run < container2->n_runs) {
5905         uint32_t start = container2->runs[i_run].value;
5906         uint32_t stop = start + container2->runs[i_run].length;
5907         if (container1->array[i_array] < start) {
5908             return false;
5909         } else if (container1->array[i_array] > stop) {
5910             i_run++;
5911         } else {  // the value of the array is in the run
5912             i_array++;
5913         }
5914     }
5915     if (i_array == container1->cardinality) {
5916         return true;
5917     } else {
5918         return false;
5919     }
5920 }
5921 
run_container_is_subset_bitset(const run_container_t * container1,const bitset_container_t * container2)5922 bool run_container_is_subset_bitset(const run_container_t* container1,
5923                                     const bitset_container_t* container2) {
5924     // todo: this code could be much faster
5925     if (container2->cardinality != BITSET_UNKNOWN_CARDINALITY) {
5926         if (container2->cardinality < run_container_cardinality(container1)) {
5927             return false;
5928         }
5929     } else {
5930         int32_t card = bitset_container_compute_cardinality(
5931             container2);  // modify container2?
5932         if (card < run_container_cardinality(container1)) {
5933             return false;
5934         }
5935     }
5936     for (int i = 0; i < container1->n_runs; ++i) {
5937         uint32_t run_start = container1->runs[i].value;
5938         uint32_t le = container1->runs[i].length;
5939         for (uint32_t j = run_start; j <= run_start + le; ++j) {
5940             if (!bitset_container_contains(container2, j)) {
5941                 return false;
5942             }
5943         }
5944     }
5945     return true;
5946 }
5947 
bitset_container_is_subset_run(const bitset_container_t * container1,const run_container_t * container2)5948 bool bitset_container_is_subset_run(const bitset_container_t* container1,
5949                                     const run_container_t* container2) {
5950     // todo: this code could be much faster
5951     if (container1->cardinality != BITSET_UNKNOWN_CARDINALITY) {
5952         if (container1->cardinality > run_container_cardinality(container2)) {
5953             return false;
5954         }
5955     }
5956     int32_t i_bitset = 0, i_run = 0;
5957     while (i_bitset < BITSET_CONTAINER_SIZE_IN_WORDS &&
5958            i_run < container2->n_runs) {
5959         uint64_t w = container1->array[i_bitset];
5960         while (w != 0 && i_run < container2->n_runs) {
5961             uint32_t start = container2->runs[i_run].value;
5962             uint32_t stop = start + container2->runs[i_run].length;
5963             uint64_t t = w & (~w + 1);
5964             uint16_t r = i_bitset * 64 + __builtin_ctzll(w);
5965             if (r < start) {
5966                 return false;
5967             } else if (r > stop) {
5968                 i_run++;
5969                 continue;
5970             } else {
5971                 w ^= t;
5972             }
5973         }
5974         if (w == 0) {
5975             i_bitset++;
5976         } else {
5977             return false;
5978         }
5979     }
5980     if (i_bitset < BITSET_CONTAINER_SIZE_IN_WORDS) {
5981         // terminated iterating on the run containers, check that rest of bitset
5982         // is empty
5983         for (; i_bitset < BITSET_CONTAINER_SIZE_IN_WORDS; i_bitset++) {
5984             if (container1->array[i_bitset] != 0) {
5985                 return false;
5986             }
5987         }
5988     }
5989     return true;
5990 }
5991 /* end file src/containers/mixed_subset.c */
5992 /* begin file src/containers/mixed_union.c */
5993 /*
5994  * mixed_union.c
5995  *
5996  */
5997 
5998 #include <assert.h>
5999 #include <string.h>
6000 
6001 
6002 /* Compute the union of src_1 and src_2 and write the result to
6003  * dst.  */
array_bitset_container_union(const array_container_t * src_1,const bitset_container_t * src_2,bitset_container_t * dst)6004 void array_bitset_container_union(const array_container_t *src_1,
6005                                   const bitset_container_t *src_2,
6006                                   bitset_container_t *dst) {
6007     if (src_2 != dst) bitset_container_copy(src_2, dst);
6008     dst->cardinality = (int32_t)bitset_set_list_withcard(
6009         dst->array, dst->cardinality, src_1->array, src_1->cardinality);
6010 }
6011 
6012 /* Compute the union of src_1 and src_2 and write the result to
6013  * dst. It is allowed for src_2 to be dst.  This version does not
6014  * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY). */
array_bitset_container_lazy_union(const array_container_t * src_1,const bitset_container_t * src_2,bitset_container_t * dst)6015 void array_bitset_container_lazy_union(const array_container_t *src_1,
6016                                        const bitset_container_t *src_2,
6017                                        bitset_container_t *dst) {
6018     if (src_2 != dst) bitset_container_copy(src_2, dst);
6019     bitset_set_list(dst->array, src_1->array, src_1->cardinality);
6020     dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
6021 }
6022 
/* Compute the union of the run container src_1 and the bitset src_2 and
 * write the result to dst (src_2 is copied into dst unless they are the
 * same object).  The cardinality of dst is recomputed at the end.
 * src_1 must not be a full run container. */
void run_bitset_container_union(const run_container_t *src_1,
                                const bitset_container_t *src_2,
                                bitset_container_t *dst) {
    assert(!run_container_is_full(src_1));  // catch this case upstream
    if (src_2 != dst) {
        bitset_container_copy(src_2, dst);
    }
    int32_t run_idx = 0;
    while (run_idx < src_1->n_runs) {
        bitset_set_lenrange(dst->array, src_1->runs[run_idx].value,
                            src_1->runs[run_idx].length);
        run_idx++;
    }
    dst->cardinality = bitset_container_compute_cardinality(dst);
}
6034 
/* Lazy union of the run container src_1 and the bitset src_2, written to
 * dst (src_2 is copied into dst unless they are the same object).  The
 * cardinality of dst is left as BITSET_UNKNOWN_CARDINALITY.
 * src_1 must not be a full run container. */
void run_bitset_container_lazy_union(const run_container_t *src_1,
                                     const bitset_container_t *src_2,
                                     bitset_container_t *dst) {
    assert(!run_container_is_full(src_1));  // catch this case upstream
    if (src_2 != dst) {
        bitset_container_copy(src_2, dst);
    }
    int32_t run_idx = 0;
    while (run_idx < src_1->n_runs) {
        bitset_set_lenrange(dst->array, src_1->runs[run_idx].value,
                            src_1->runs[run_idx].length);
        run_idx++;
    }
    dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
}
6046 
6047 // why do we leave the result as a run container??
array_run_container_union(const array_container_t * src_1,const run_container_t * src_2,run_container_t * dst)6048 void array_run_container_union(const array_container_t *src_1,
6049                                const run_container_t *src_2,
6050                                run_container_t *dst) {
6051     if (run_container_is_full(src_2)) {
6052         run_container_copy(src_2, dst);
6053         return;
6054     }
6055     // TODO: see whether the "2*" is spurious
6056     run_container_grow(dst, 2 * (src_1->cardinality + src_2->n_runs), false);
6057     int32_t rlepos = 0;
6058     int32_t arraypos = 0;
6059     rle16_t previousrle;
6060     if (src_2->runs[rlepos].value <= src_1->array[arraypos]) {
6061         previousrle = run_container_append_first(dst, src_2->runs[rlepos]);
6062         rlepos++;
6063     } else {
6064         previousrle =
6065             run_container_append_value_first(dst, src_1->array[arraypos]);
6066         arraypos++;
6067     }
6068     while ((rlepos < src_2->n_runs) && (arraypos < src_1->cardinality)) {
6069         if (src_2->runs[rlepos].value <= src_1->array[arraypos]) {
6070             run_container_append(dst, src_2->runs[rlepos], &previousrle);
6071             rlepos++;
6072         } else {
6073             run_container_append_value(dst, src_1->array[arraypos],
6074                                        &previousrle);
6075             arraypos++;
6076         }
6077     }
6078     if (arraypos < src_1->cardinality) {
6079         while (arraypos < src_1->cardinality) {
6080             run_container_append_value(dst, src_1->array[arraypos],
6081                                        &previousrle);
6082             arraypos++;
6083         }
6084     } else {
6085         while (rlepos < src_2->n_runs) {
6086             run_container_append(dst, src_2->runs[rlepos], &previousrle);
6087             rlepos++;
6088         }
6089     }
6090 }
6091 
/* In-place union: merges the array src_1 into the run container src_2.
 * Nothing is done when src_2 is already full.  Otherwise the existing runs
 * of src_2 are moved to the tail of its (possibly grown) buffer so that the
 * merged output can be written from the front without overwriting runs
 * that have not been consumed yet.
 * The first element of both src_1->array and the saved runs is read
 * unconditionally.
 */
void array_run_container_inplace_union(const array_container_t *src_1,
                                       run_container_t *src_2) {
    if (run_container_is_full(src_2)) {
        return;
    }

    // Upper bound on the number of output runs, and the room needed to hold
    // that output plus the relocated input runs.
    const int32_t output_bound = src_1->cardinality + src_2->n_runs;
    const int32_t required_capacity = output_bound + src_2->n_runs;
    if (src_2->capacity < required_capacity) {
        run_container_grow(src_2, required_capacity, true);
    }

    // Park the current runs past the output area.
    memmove(src_2->runs + output_bound, src_2->runs,
            src_2->n_runs * sizeof(rle16_t));
    rle16_t *saved_runs = src_2->runs + output_bound;
    const int saved_count = src_2->n_runs;
    src_2->n_runs = 0;

    int32_t run_idx = 0;
    int32_t value_idx = 0;
    rle16_t last_written;

    // Seed the output with whichever input starts first.
    if (saved_runs[run_idx].value <= src_1->array[value_idx]) {
        last_written = run_container_append_first(src_2, saved_runs[run_idx]);
        run_idx++;
    } else {
        last_written =
            run_container_append_value_first(src_2, src_1->array[value_idx]);
        value_idx++;
    }

    // Merge while both inputs still have elements.
    while ((run_idx < saved_count) && (value_idx < src_1->cardinality)) {
        if (saved_runs[run_idx].value <= src_1->array[value_idx]) {
            run_container_append(src_2, saved_runs[run_idx], &last_written);
            run_idx++;
        } else {
            run_container_append_value(src_2, src_1->array[value_idx],
                                       &last_written);
            value_idx++;
        }
    }

    // At most one of the two inputs has leftovers; drain it.
    while (value_idx < src_1->cardinality) {
        run_container_append_value(src_2, src_1->array[value_idx],
                                   &last_written);
        value_idx++;
    }
    while (run_idx < saved_count) {
        run_container_append(src_2, saved_runs[run_idx], &last_written);
        run_idx++;
    }
}
6143 
array_array_container_union(const array_container_t * src_1,const array_container_t * src_2,void ** dst)6144 bool array_array_container_union(const array_container_t *src_1,
6145                                  const array_container_t *src_2, void **dst) {
6146     int totalCardinality = src_1->cardinality + src_2->cardinality;
6147     if (totalCardinality <= DEFAULT_MAX_SIZE) {
6148         *dst = array_container_create_given_capacity(totalCardinality);
6149         if (*dst != NULL) {
6150             array_container_union(src_1, src_2, (array_container_t *)*dst);
6151         } else {
6152             return true; // otherwise failure won't be caught
6153         }
6154         return false;  // not a bitset
6155     }
6156     *dst = bitset_container_create();
6157     bool returnval = true;  // expect a bitset
6158     if (*dst != NULL) {
6159         bitset_container_t *ourbitset = (bitset_container_t *)*dst;
6160         bitset_set_list(ourbitset->array, src_1->array, src_1->cardinality);
6161         ourbitset->cardinality = (int32_t)bitset_set_list_withcard(
6162             ourbitset->array, src_1->cardinality, src_2->array,
6163             src_2->cardinality);
6164         if (ourbitset->cardinality <= DEFAULT_MAX_SIZE) {
6165             // need to convert!
6166             *dst = array_container_from_bitset(ourbitset);
6167             bitset_container_free(ourbitset);
6168             returnval = false;  // not going to be a bitset
6169         }
6170     }
6171     return returnval;
6172 }
6173 
/* Union of two array containers that reuses src_1 when possible.
 * Returns true when *dst is a bitset container, false otherwise.
 *   - *dst == NULL with a false return: the union was computed inside
 *     src_1's own buffer.
 *   - *dst == src_1: the union was computed through a temporary bitset and
 *     written back into src_1.
 *   - otherwise *dst is a newly created array or bitset container.
 * If a container cannot be created, *dst is NULL and true is returned.
 */
bool array_array_container_inplace_union(array_container_t *src_1,
                                 const array_container_t *src_2, void **dst) {
    const int upper_bound = src_1->cardinality + src_2->cardinality;
    *dst = NULL;

    if (upper_bound <= DEFAULT_MAX_SIZE) {
        if (src_1->capacity >= upper_bound) {
            // Enough room: shift src_1's values up by src_2's cardinality,
            // then merge both inputs back to the front of the same buffer.
            uint16_t *shifted = src_1->array + src_2->cardinality;
            memmove(shifted, src_1->array,
                    src_1->cardinality * sizeof(uint16_t));
            src_1->cardinality = (int32_t)union_uint16(
                shifted, src_1->cardinality, src_2->array, src_2->cardinality,
                src_1->array);
            return false;  // not a bitset
        }
        // be purposefully generous
        array_container_t *merged =
            array_container_create_given_capacity(2 * upper_bound);
        *dst = merged;
        if (merged == NULL) {
            return true;  // otherwise failure won't be caught
        }
        array_container_union(src_1, src_2, merged);
        return false;  // not a bitset
    }

    // expect a bitset
    bitset_container_t *bits = bitset_container_create();
    *dst = bits;
    if (bits == NULL) {
        return true;
    }
    bitset_set_list(bits->array, src_1->array, src_1->cardinality);
    bits->cardinality = (int32_t)bitset_set_list_withcard(
        bits->array, src_1->cardinality, src_2->array, src_2->cardinality);
    if (bits->cardinality > DEFAULT_MAX_SIZE) {
        return true;
    }

    // need to convert! write the values back into src_1 (its old content is
    // already captured in the bitset, so it need not be preserved on grow)
    if (src_1->capacity < bits->cardinality) {
        array_container_grow(src_1, bits->cardinality, false);
    }
    bitset_extract_setbits_uint16(bits->array, BITSET_CONTAINER_SIZE_IN_WORDS,
                                  src_1->array, 0);
    src_1->cardinality = bits->cardinality;
    *dst = src_1;
    bitset_container_free(bits);
    return false;  // not going to be a bitset
}
6218 
6219 
array_array_container_lazy_union(const array_container_t * src_1,const array_container_t * src_2,void ** dst)6220 bool array_array_container_lazy_union(const array_container_t *src_1,
6221                                       const array_container_t *src_2,
6222                                       void **dst) {
6223     int totalCardinality = src_1->cardinality + src_2->cardinality;
6224     if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
6225         *dst = array_container_create_given_capacity(totalCardinality);
6226         if (*dst != NULL) {
6227             array_container_union(src_1, src_2, (array_container_t *)*dst);
6228         } else {
6229               return true; // otherwise failure won't be caught
6230         }
6231         return false;  // not a bitset
6232     }
6233     *dst = bitset_container_create();
6234     bool returnval = true;  // expect a bitset
6235     if (*dst != NULL) {
6236         bitset_container_t *ourbitset = (bitset_container_t *)*dst;
6237         bitset_set_list(ourbitset->array, src_1->array, src_1->cardinality);
6238         bitset_set_list(ourbitset->array, src_2->array, src_2->cardinality);
6239         ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY;
6240     }
6241     return returnval;
6242 }
6243 
6244 
/* Lazy union of two array containers that reuses src_1 when possible.
 * Returns true when *dst is a bitset container (cardinality left as
 * BITSET_UNKNOWN_CARDINALITY), false otherwise.  A false return with
 * *dst == NULL means the union was computed inside src_1's own buffer;
 * otherwise *dst is a newly created container.  If a container cannot be
 * created, *dst is NULL and true is returned.
 */
bool array_array_container_lazy_inplace_union(array_container_t *src_1,
                                      const array_container_t *src_2,
                                      void **dst) {
    const int upper_bound = src_1->cardinality + src_2->cardinality;
    *dst = NULL;

    if (upper_bound <= ARRAY_LAZY_LOWERBOUND) {
        if (src_1->capacity >= upper_bound) {
            // Enough room: shift src_1's values up by src_2's cardinality,
            // then merge both inputs back to the front of the same buffer.
            uint16_t *shifted = src_1->array + src_2->cardinality;
            memmove(shifted, src_1->array,
                    src_1->cardinality * sizeof(uint16_t));
            src_1->cardinality = (int32_t)union_uint16(
                shifted, src_1->cardinality, src_2->array, src_2->cardinality,
                src_1->array);
            return false;  // not a bitset
        }
        // be purposefully generous
        array_container_t *merged =
            array_container_create_given_capacity(2 * upper_bound);
        *dst = merged;
        if (merged == NULL) {
            return true;  // otherwise failure won't be caught
        }
        array_container_union(src_1, src_2, merged);
        return false;  // not a bitset
    }

    // expect a bitset
    bitset_container_t *bits = bitset_container_create();
    *dst = bits;
    if (bits != NULL) {
        bitset_set_list(bits->array, src_1->array, src_1->cardinality);
        bitset_set_list(bits->array, src_2->array, src_2->cardinality);
        bits->cardinality = BITSET_UNKNOWN_CARDINALITY;
    }
    return true;
}
6276 /* end file src/containers/mixed_union.c */
6277 /* begin file src/containers/mixed_xor.c */
6278 /*
6279  * mixed_xor.c
6280  */
6281 
6282 #include <assert.h>
6283 #include <string.h>
6284 
6285 
6286 /* Compute the xor of src_1 and src_2 and write the result to
6287  * dst (which has no container initially).
6288  * Result is true iff dst is a bitset  */
array_bitset_container_xor(const array_container_t * src_1,const bitset_container_t * src_2,void ** dst)6289 bool array_bitset_container_xor(const array_container_t *src_1,
6290                                 const bitset_container_t *src_2, void **dst) {
6291     bitset_container_t *result = bitset_container_create();
6292     bitset_container_copy(src_2, result);
6293     result->cardinality = (int32_t)bitset_flip_list_withcard(
6294         result->array, result->cardinality, src_1->array, src_1->cardinality);
6295 
6296     // do required type conversions.
6297     if (result->cardinality <= DEFAULT_MAX_SIZE) {
6298         *dst = array_container_from_bitset(result);
6299         bitset_container_free(result);
6300         return false;  // not bitset
6301     }
6302     *dst = result;
6303     return true;  // bitset
6304 }
6305 
6306 /* Compute the xor of src_1 and src_2 and write the result to
6307  * dst. It is allowed for src_2 to be dst.  This version does not
6308  * update the cardinality of dst (it is set to BITSET_UNKNOWN_CARDINALITY).
6309  */
6310 
array_bitset_container_lazy_xor(const array_container_t * src_1,const bitset_container_t * src_2,bitset_container_t * dst)6311 void array_bitset_container_lazy_xor(const array_container_t *src_1,
6312                                      const bitset_container_t *src_2,
6313                                      bitset_container_t *dst) {
6314     if (src_2 != dst) bitset_container_copy(src_2, dst);
6315     bitset_flip_list(dst->array, src_1->array, src_1->cardinality);
6316     dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
6317 }
6318 
6319 /* Compute the xor of src_1 and src_2 and write the result to
6320  * dst. Result may be either a bitset or an array container
6321  * (returns "result is bitset"). dst does not initially have
6322  * any container, but becomes either a bitset container (return
6323  * result true) or an array container.
6324  */
6325 
run_bitset_container_xor(const run_container_t * src_1,const bitset_container_t * src_2,void ** dst)6326 bool run_bitset_container_xor(const run_container_t *src_1,
6327                               const bitset_container_t *src_2, void **dst) {
6328     bitset_container_t *result = bitset_container_create();
6329 
6330     bitset_container_copy(src_2, result);
6331     for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
6332         rle16_t rle = src_1->runs[rlepos];
6333         bitset_flip_range(result->array, rle.value,
6334                           rle.value + rle.length + UINT32_C(1));
6335     }
6336     result->cardinality = bitset_container_compute_cardinality(result);
6337 
6338     if (result->cardinality <= DEFAULT_MAX_SIZE) {
6339         *dst = array_container_from_bitset(result);
6340         bitset_container_free(result);
6341         return false;  // not bitset
6342     }
6343     *dst = result;
6344     return true;  // bitset
6345 }
6346 
6347 /* lazy xor.  Dst is initialized and may be equal to src_2.
6348  *  Result is left as a bitset container, even if actual
6349  *  cardinality would dictate an array container.
6350  */
6351 
run_bitset_container_lazy_xor(const run_container_t * src_1,const bitset_container_t * src_2,bitset_container_t * dst)6352 void run_bitset_container_lazy_xor(const run_container_t *src_1,
6353                                    const bitset_container_t *src_2,
6354                                    bitset_container_t *dst) {
6355     if (src_2 != dst) bitset_container_copy(src_2, dst);
6356     for (int32_t rlepos = 0; rlepos < src_1->n_runs; ++rlepos) {
6357         rle16_t rle = src_1->runs[rlepos];
6358         bitset_flip_range(dst->array, rle.value,
6359                           rle.value + rle.length + UINT32_C(1));
6360     }
6361     dst->cardinality = BITSET_UNKNOWN_CARDINALITY;
6362 }
6363 
6364 /* dst does not indicate a valid container initially.  Eventually it
6365  * can become any kind of container.
6366  */
6367 
array_run_container_xor(const array_container_t * src_1,const run_container_t * src_2,void ** dst)6368 int array_run_container_xor(const array_container_t *src_1,
6369                             const run_container_t *src_2, void **dst) {
6370     // semi following Java XOR implementation as of May 2016
6371     // the C OR implementation works quite differently and can return a run
6372     // container
6373     // TODO could optimize for full run containers.
6374 
6375     // use of lazy following Java impl.
6376     const int arbitrary_threshold = 32;
6377     if (src_1->cardinality < arbitrary_threshold) {
6378         run_container_t *ans = run_container_create();
6379         array_run_container_lazy_xor(src_1, src_2, ans);  // keeps runs.
6380         uint8_t typecode_after;
6381         *dst =
6382             convert_run_to_efficient_container_and_free(ans, &typecode_after);
6383         return typecode_after;
6384     }
6385 
6386     int card = run_container_cardinality(src_2);
6387     if (card <= DEFAULT_MAX_SIZE) {
6388         // Java implementation works with the array, xoring the run elements via
6389         // iterator
6390         array_container_t *temp = array_container_from_run(src_2);
6391         bool ret_is_bitset = array_array_container_xor(temp, src_1, dst);
6392         array_container_free(temp);
6393         return ret_is_bitset ? BITSET_CONTAINER_TYPE_CODE
6394                              : ARRAY_CONTAINER_TYPE_CODE;
6395 
6396     } else {  // guess that it will end up as a bitset
6397         bitset_container_t *result = bitset_container_from_run(src_2);
6398         bool is_bitset = bitset_array_container_ixor(result, src_1, dst);
6399         // any necessary type conversion has been done by the ixor
6400         int retval = (is_bitset ? BITSET_CONTAINER_TYPE_CODE
6401                                 : ARRAY_CONTAINER_TYPE_CODE);
6402         return retval;
6403     }
6404 }
6405 
6406 /* Dst is a valid run container. (Can it be src_2? Let's say not.)
6407  * Leaves result as run container, even if other options are
6408  * smaller.
6409  */
6410 
array_run_container_lazy_xor(const array_container_t * src_1,const run_container_t * src_2,run_container_t * dst)6411 void array_run_container_lazy_xor(const array_container_t *src_1,
6412                                   const run_container_t *src_2,
6413                                   run_container_t *dst) {
6414     run_container_grow(dst, src_1->cardinality + src_2->n_runs, false);
6415     int32_t rlepos = 0;
6416     int32_t arraypos = 0;
6417     dst->n_runs = 0;
6418 
6419     while ((rlepos < src_2->n_runs) && (arraypos < src_1->cardinality)) {
6420         if (src_2->runs[rlepos].value <= src_1->array[arraypos]) {
6421             run_container_smart_append_exclusive(dst, src_2->runs[rlepos].value,
6422                                                  src_2->runs[rlepos].length);
6423             rlepos++;
6424         } else {
6425             run_container_smart_append_exclusive(dst, src_1->array[arraypos],
6426                                                  0);
6427             arraypos++;
6428         }
6429     }
6430     while (arraypos < src_1->cardinality) {
6431         run_container_smart_append_exclusive(dst, src_1->array[arraypos], 0);
6432         arraypos++;
6433     }
6434     while (rlepos < src_2->n_runs) {
6435         run_container_smart_append_exclusive(dst, src_2->runs[rlepos].value,
6436                                              src_2->runs[rlepos].length);
6437         rlepos++;
6438     }
6439 }
6440 
6441 /* dst does not indicate a valid container initially.  Eventually it
6442  * can become any kind of container.
6443  */
6444 
run_run_container_xor(const run_container_t * src_1,const run_container_t * src_2,void ** dst)6445 int run_run_container_xor(const run_container_t *src_1,
6446                           const run_container_t *src_2, void **dst) {
6447     run_container_t *ans = run_container_create();
6448     run_container_xor(src_1, src_2, ans);
6449     uint8_t typecode_after;
6450     *dst = convert_run_to_efficient_container_and_free(ans, &typecode_after);
6451     return typecode_after;
6452 }
6453 
/*
 * The Java implementation (as of May 2016) of array_run, run_run
 * and bitset_run does not do anything different for the in-place
 * variants.  We could adopt the mixed_union.c approach instead
 * (i.e., using smart_append_exclusive).
 */
6461 
array_array_container_xor(const array_container_t * src_1,const array_container_t * src_2,void ** dst)6462 bool array_array_container_xor(const array_container_t *src_1,
6463                                const array_container_t *src_2, void **dst) {
6464     int totalCardinality =
6465         src_1->cardinality + src_2->cardinality;  // upper bound
6466     if (totalCardinality <= DEFAULT_MAX_SIZE) {
6467         *dst = array_container_create_given_capacity(totalCardinality);
6468         array_container_xor(src_1, src_2, (array_container_t *)*dst);
6469         return false;  // not a bitset
6470     }
6471     *dst = bitset_container_from_array(src_1);
6472     bool returnval = true;  // expect a bitset
6473     bitset_container_t *ourbitset = (bitset_container_t *)*dst;
6474     ourbitset->cardinality = (uint32_t)bitset_flip_list_withcard(
6475         ourbitset->array, src_1->cardinality, src_2->array, src_2->cardinality);
6476     if (ourbitset->cardinality <= DEFAULT_MAX_SIZE) {
6477         // need to convert!
6478         *dst = array_container_from_bitset(ourbitset);
6479         bitset_container_free(ourbitset);
6480         returnval = false;  // not going to be a bitset
6481     }
6482 
6483     return returnval;
6484 }
6485 
array_array_container_lazy_xor(const array_container_t * src_1,const array_container_t * src_2,void ** dst)6486 bool array_array_container_lazy_xor(const array_container_t *src_1,
6487                                     const array_container_t *src_2,
6488                                     void **dst) {
6489     int totalCardinality = src_1->cardinality + src_2->cardinality;
6490     // upper bound, but probably poor estimate for xor
6491     if (totalCardinality <= ARRAY_LAZY_LOWERBOUND) {
6492         *dst = array_container_create_given_capacity(totalCardinality);
6493         if (*dst != NULL)
6494             array_container_xor(src_1, src_2, (array_container_t *)*dst);
6495         return false;  // not a bitset
6496     }
6497     *dst = bitset_container_from_array(src_1);
6498     bool returnval = true;  // expect a bitset (maybe, for XOR??)
6499     if (*dst != NULL) {
6500         bitset_container_t *ourbitset = (bitset_container_t *)*dst;
6501         bitset_flip_list(ourbitset->array, src_2->array, src_2->cardinality);
6502         ourbitset->cardinality = BITSET_UNKNOWN_CARDINALITY;
6503     }
6504     return returnval;
6505 }
6506 
6507 /* Compute the xor of src_1 and src_2 and write the result to
6508  * dst (which has no container initially). Return value is
6509  * "dst is a bitset"
6510  */
6511 
bitset_bitset_container_xor(const bitset_container_t * src_1,const bitset_container_t * src_2,void ** dst)6512 bool bitset_bitset_container_xor(const bitset_container_t *src_1,
6513                                  const bitset_container_t *src_2, void **dst) {
6514     bitset_container_t *ans = bitset_container_create();
6515     int card = bitset_container_xor(src_1, src_2, ans);
6516     if (card <= DEFAULT_MAX_SIZE) {
6517         *dst = array_container_from_bitset(ans);
6518         bitset_container_free(ans);
6519         return false;  // not bitset
6520     } else {
6521         *dst = ans;
6522         return true;
6523     }
6524 }
6525 
6526 /* Compute the xor of src_1 and src_2 and write the result to
6527  * dst (which has no container initially).  It will modify src_1
6528  * to be dst if the result is a bitset.  Otherwise, it will
6529  * free src_1 and dst will be a new array container.  In both
6530  * cases, the caller is responsible for deallocating dst.
6531  * Returns true iff dst is a bitset  */
6532 
bitset_array_container_ixor(bitset_container_t * src_1,const array_container_t * src_2,void ** dst)6533 bool bitset_array_container_ixor(bitset_container_t *src_1,
6534                                  const array_container_t *src_2, void **dst) {
6535     *dst = src_1;
6536     src_1->cardinality = (uint32_t)bitset_flip_list_withcard(
6537         src_1->array, src_1->cardinality, src_2->array, src_2->cardinality);
6538 
6539     if (src_1->cardinality <= DEFAULT_MAX_SIZE) {
6540         *dst = array_container_from_bitset(src_1);
6541         bitset_container_free(src_1);
6542         return false;  // not bitset
6543     } else
6544         return true;
6545 }
6546 
/* A bunch of in-place operations, some of which may not *really* be
 * in place.
 * TODO: write actual in-place routines if efficiency warrants it.
 * Anything in-place with a bitset is a good candidate.
 */
6551 
/* "In-place" XOR of two bitset containers.  Not really in place: the
 * result is produced into *dst by bitset_bitset_container_xor(), after
 * which src_1 is freed.  Returns that call's result. */
bool bitset_bitset_container_ixor(bitset_container_t *src_1,
                                  const bitset_container_t *src_2, void **dst) {
    const bool dst_is_bitset = bitset_bitset_container_xor(src_1, src_2, dst);
    bitset_container_free(src_1);
    return dst_is_bitset;
}
6558 
/* "In-place" XOR of an array container with a bitset container.  The
 * result is produced into *dst by array_bitset_container_xor(), after which
 * src_1 is freed.  Returns that call's result. */
bool array_bitset_container_ixor(array_container_t *src_1,
                                 const bitset_container_t *src_2, void **dst) {
    const bool xor_result = array_bitset_container_xor(src_1, src_2, dst);
    array_container_free(src_1);
    return xor_result;
}
6565 
6566 /* Compute the xor of src_1 and src_2 and write the result to
6567  * dst. Result may be either a bitset or an array container
6568  * (returns "result is bitset"). dst does not initially have
6569  * any container, but becomes either a bitset container (return
6570  * result true) or an array container.
6571  */
6572 
run_bitset_container_ixor(run_container_t * src_1,const bitset_container_t * src_2,void ** dst)6573 bool run_bitset_container_ixor(run_container_t *src_1,
6574                                const bitset_container_t *src_2, void **dst) {
6575     bool ans = run_bitset_container_xor(src_1, src_2, dst);
6576     run_container_free(src_1);
6577     return ans;
6578 }
6579 
/* "In-place" XOR of a bitset container with a run container.  Delegates to
 * run_bitset_container_xor() with the operands swapped, then frees src_1.
 * Returns that call's result. */
bool bitset_run_container_ixor(bitset_container_t *src_1,
                               const run_container_t *src_2, void **dst) {
    const bool xor_result = run_bitset_container_xor(src_2, src_1, dst);
    bitset_container_free(src_1);
    return xor_result;
}
6586 
6587 /* dst does not indicate a valid container initially.  Eventually it
6588  * can become any kind of container.
6589  */
6590 
array_run_container_ixor(array_container_t * src_1,const run_container_t * src_2,void ** dst)6591 int array_run_container_ixor(array_container_t *src_1,
6592                              const run_container_t *src_2, void **dst) {
6593     int ans = array_run_container_xor(src_1, src_2, dst);
6594     array_container_free(src_1);
6595     return ans;
6596 }
6597 
/* "In-place" XOR of a run container with an array container.  Delegates to
 * array_run_container_xor() with the operands swapped, then frees src_1.
 * Returns that call's result (the type code of *dst). */
int run_array_container_ixor(run_container_t *src_1,
                             const array_container_t *src_2, void **dst) {
    const int typecode = array_run_container_xor(src_2, src_1, dst);
    run_container_free(src_1);
    return typecode;
}
6604 
/* "In-place" XOR of two array containers.  The result is produced into
 * *dst by array_array_container_xor(), after which src_1 is freed.
 * Returns that call's result (true iff *dst is a bitset). */
bool array_array_container_ixor(array_container_t *src_1,
                                const array_container_t *src_2, void **dst) {
    const bool dst_is_bitset = array_array_container_xor(src_1, src_2, dst);
    array_container_free(src_1);
    return dst_is_bitset;
}
6611 
/* "In-place" XOR of two run containers.  The result is produced into *dst
 * by run_run_container_xor(), after which src_1 is freed.  Returns that
 * call's result (the type code of *dst). */
int run_run_container_ixor(run_container_t *src_1, const run_container_t *src_2,
                           void **dst) {
    const int typecode = run_run_container_xor(src_1, src_2, dst);
    run_container_free(src_1);
    return typecode;
}
6618 /* end file src/containers/mixed_xor.c */
6619 /* begin file src/containers/run.c */
6620 #include <stdio.h>
6621 #include <stdlib.h>
6622 
6623 
/* Add `pos` to the run container.
 * Returns false if the value was already there, true if the container was
 * modified.  Depending on the neighbouring runs the value either extends an
 * existing run, fuses two runs into one, or becomes a new run of length 0.
 */
bool run_container_add(run_container_t *run, uint16_t pos) {
    int32_t index = interleavedBinarySearch(run->runs, run->n_runs, pos);
    if (index >= 0) {
        return false;  // already there
    }
    index = -index - 2;  // points to preceding run, possibly -1
    const int32_t next = index + 1;

    if (index >= 0) {  // possible match with the preceding run
        rle16_t *prev = &run->runs[index];
        const int32_t offset = pos - prev->value;
        const int32_t le = prev->length;
        if (offset <= le) {
            return false;  // already there
        }

        // Does the following run (if any) start right after pos?
        const bool next_starts_after_pos =
            (next < run->n_runs) && (run->runs[next].value == pos + 1);

        if (offset == le + 1) {
            // pos sits right behind the preceding run
            if (next_starts_after_pos) {
                // fusion is needed: the preceding run swallows the next one
                prev->length = run->runs[next].value + run->runs[next].length -
                               prev->value;
                recoverRoomAtIndex(run, (uint16_t)next);
                return true;
            }
            prev->length++;
            return true;
        }

        if (next_starts_after_pos) {
            // extend the following run one step to the left
            run->runs[next].value = pos;
            run->runs[next].length = run->runs[next].length + 1;
            return true;
        }
    } else if (index == -1) {
        // no preceding run: we may need to extend the first run
        if ((0 < run->n_runs) && (run->runs[0].value == pos + 1)) {
            run->runs[0].length++;
            run->runs[0].value--;
            return true;
        }
    }

    // No neighbour to extend: insert a fresh single-value run.
    makeRoomAtIndex(run, (uint16_t)next);
    run->runs[next].value = pos;
    run->runs[next].length = 0;
    return true;
}
6672 
6673 /* Create a new run container. Return NULL in case of failure. */
run_container_create_given_capacity(int32_t size)6674 run_container_t *run_container_create_given_capacity(int32_t size) {
6675     run_container_t *run;
6676     /* Allocate the run container itself. */
6677     run = (run_container_t *)malloc(sizeof(run_container_t));
6678     assert (run);
6679     if (size <= 0) // we don't want to rely on malloc(0)
6680         run->runs = NULL;
6681     run->runs = (rle16_t *)malloc(sizeof(rle16_t) * size);
6682     assert (run->runs);
6683     run->capacity = size;
6684     run->n_runs = 0;
6685     return run;
6686 }
6687 
/* Shrink the run buffer of `src` so that its capacity equals n_runs.
 *
 * Returns the number of run slots saved (old capacity minus n_runs), or 0
 * when nothing was changed.
 *
 * An empty container simply releases its buffer: realloc() with a size of
 * zero has implementation-defined behaviour and may already free the
 * block, so it is not used for that case.  If realloc() fails, the
 * original buffer is still valid and is kept together with the old
 * capacity.
 */
int run_container_shrink_to_fit(run_container_t *src) {
    if (src->n_runs == src->capacity) return 0;  // nothing to do
    const int savings = src->capacity - src->n_runs;

    if (src->n_runs == 0) {
        // Nothing to keep: drop the buffer instead of calling realloc(p, 0).
        free(src->runs);
        src->runs = NULL;
        src->capacity = 0;
        return savings;
    }

    rle16_t *shrunk =
        (rle16_t *)realloc(src->runs, (size_t)src->n_runs * sizeof(rle16_t));
    if (shrunk == NULL) {
        // Should never happen for a shrink; the old block is untouched, so
        // leave the container exactly as it was.
        return 0;
    }
    src->runs = shrunk;
    src->capacity = src->n_runs;
    return savings;
}
6697 /* Create a new run container. Return NULL in case of failure. */
run_container_create(void)6698 run_container_t *run_container_create(void) {
6699     return run_container_create_given_capacity(RUN_DEFAULT_INIT_SIZE);
6700 }
6701 
run_container_clone(const run_container_t * src)6702 run_container_t *run_container_clone(const run_container_t *src) {
6703     run_container_t *run = run_container_create_given_capacity(src->capacity);
6704     if (run == NULL) return NULL;
6705     run->capacity = src->capacity;
6706     run->n_runs = src->n_runs;
6707     memcpy(run->runs, src->runs, src->n_runs * sizeof(rle16_t));
6708     return run;
6709 }
6710 
6711 /* Free memory. */
run_container_free(run_container_t * run)6712 void run_container_free(run_container_t *run) {
6713     if(run->runs != NULL) {// Jon Strabala reports that some tools complain otherwise
6714       free(run->runs);
6715       run->runs = NULL;  // pedantic
6716     }
6717     free(run);
6718 }
6719 
/* Increase the capacity of `run` to at least `min` runs.
 *
 * Growth policy, based on the current capacity:
 *   0            -> RUN_DEFAULT_INIT_SIZE
 *   below 64     -> doubled
 *   below 1024   -> multiplied by 3/2
 *   otherwise    -> multiplied by 5/4
 * and the result is raised to `min` if it is still smaller.
 *
 * When `copy` is true the buffer is resized with realloc(); when it is
 * false the old buffer is freed and a new one is obtained with malloc().
 * On allocation failure a message is printed to stderr and the final
 * assert() fires.
 */
void run_container_grow(run_container_t *run, int32_t min, bool copy) {
    const int32_t current = run->capacity;
    int32_t newCapacity;
    if (current == 0) {
        newCapacity = RUN_DEFAULT_INIT_SIZE;
    } else if (current < 64) {
        newCapacity = current * 2;
    } else if (current < 1024) {
        newCapacity = current * 3 / 2;
    } else {
        newCapacity = current * 5 / 4;
    }
    if (newCapacity < min) {
        newCapacity = min;
    }
    run->capacity = newCapacity;
    assert(run->capacity >= min);

    rle16_t *const previous = run->runs;
    if (copy) {
        run->runs =
            (rle16_t *)realloc(previous, run->capacity * sizeof(rle16_t));
        if (run->runs == NULL) {
            free(previous);
        }
    } else {
        // Jon Strabala reports that some tools complain otherwise
        if (previous != NULL) {
            free(previous);
        }
        run->runs = (rle16_t *)malloc(run->capacity * sizeof(rle16_t));
    }

    // handle the case where the (re)allocation fails
    if (run->runs == NULL) {
        fprintf(stderr, "could not allocate memory\n");
    }
    assert(run->runs != NULL);
}
6748 
6749 /* copy one container into another */
run_container_copy(const run_container_t * src,run_container_t * dst)6750 void run_container_copy(const run_container_t *src, run_container_t *dst) {
6751     const int32_t n_runs = src->n_runs;
6752     if (src->n_runs > dst->capacity) {
6753         run_container_grow(dst, n_runs, false);
6754     }
6755     dst->n_runs = n_runs;
6756     memcpy(dst->runs, src->runs, sizeof(rle16_t) * n_runs);
6757 }
6758 
6759 /* Compute the union of `src_1' and `src_2' and write the result to `dst'
6760  * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
run_container_union(const run_container_t * src_1,const run_container_t * src_2,run_container_t * dst)6761 void run_container_union(const run_container_t *src_1,
6762                          const run_container_t *src_2, run_container_t *dst) {
6763     // TODO: this could be a lot more efficient
6764 
6765     // we start out with inexpensive checks
6766     const bool if1 = run_container_is_full(src_1);
6767     const bool if2 = run_container_is_full(src_2);
6768     if (if1 || if2) {
6769         if (if1) {
6770             run_container_copy(src_1, dst);
6771             return;
6772         }
6773         if (if2) {
6774             run_container_copy(src_2, dst);
6775             return;
6776         }
6777     }
6778     const int32_t neededcapacity = src_1->n_runs + src_2->n_runs;
6779     if (dst->capacity < neededcapacity)
6780         run_container_grow(dst, neededcapacity, false);
6781     dst->n_runs = 0;
6782     int32_t rlepos = 0;
6783     int32_t xrlepos = 0;
6784 
6785     rle16_t previousrle;
6786     if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) {
6787         previousrle = run_container_append_first(dst, src_1->runs[rlepos]);
6788         rlepos++;
6789     } else {
6790         previousrle = run_container_append_first(dst, src_2->runs[xrlepos]);
6791         xrlepos++;
6792     }
6793 
6794     while ((xrlepos < src_2->n_runs) && (rlepos < src_1->n_runs)) {
6795         rle16_t newrl;
6796         if (src_1->runs[rlepos].value <= src_2->runs[xrlepos].value) {
6797             newrl = src_1->runs[rlepos];
6798             rlepos++;
6799         } else {
6800             newrl = src_2->runs[xrlepos];
6801             xrlepos++;
6802         }
6803         run_container_append(dst, newrl, &previousrle);
6804     }
6805     while (xrlepos < src_2->n_runs) {
6806         run_container_append(dst, src_2->runs[xrlepos], &previousrle);
6807         xrlepos++;
6808     }
6809     while (rlepos < src_1->n_runs) {
6810         run_container_append(dst, src_1->runs[rlepos], &previousrle);
6811         rlepos++;
6812     }
6813 }
6814 
6815 /* Compute the union of `src_1' and `src_2' and write the result to `src_1'
6816  */
run_container_union_inplace(run_container_t * src_1,const run_container_t * src_2)6817 void run_container_union_inplace(run_container_t *src_1,
6818                                  const run_container_t *src_2) {
6819     // TODO: this could be a lot more efficient
6820 
6821     // we start out with inexpensive checks
6822     const bool if1 = run_container_is_full(src_1);
6823     const bool if2 = run_container_is_full(src_2);
6824     if (if1 || if2) {
6825         if (if1) {
6826             return;
6827         }
6828         if (if2) {
6829             run_container_copy(src_2, src_1);
6830             return;
6831         }
6832     }
6833     // we move the data to the end of the current array
6834     const int32_t maxoutput = src_1->n_runs + src_2->n_runs;
6835     const int32_t neededcapacity = maxoutput + src_1->n_runs;
6836     if (src_1->capacity < neededcapacity)
6837         run_container_grow(src_1, neededcapacity, true);
6838     memmove(src_1->runs + maxoutput, src_1->runs,
6839             src_1->n_runs * sizeof(rle16_t));
6840     rle16_t *inputsrc1 = src_1->runs + maxoutput;
6841     const int32_t input1nruns = src_1->n_runs;
6842     src_1->n_runs = 0;
6843     int32_t rlepos = 0;
6844     int32_t xrlepos = 0;
6845 
6846     rle16_t previousrle;
6847     if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) {
6848         previousrle = run_container_append_first(src_1, inputsrc1[rlepos]);
6849         rlepos++;
6850     } else {
6851         previousrle = run_container_append_first(src_1, src_2->runs[xrlepos]);
6852         xrlepos++;
6853     }
6854     while ((xrlepos < src_2->n_runs) && (rlepos < input1nruns)) {
6855         rle16_t newrl;
6856         if (inputsrc1[rlepos].value <= src_2->runs[xrlepos].value) {
6857             newrl = inputsrc1[rlepos];
6858             rlepos++;
6859         } else {
6860             newrl = src_2->runs[xrlepos];
6861             xrlepos++;
6862         }
6863         run_container_append(src_1, newrl, &previousrle);
6864     }
6865     while (xrlepos < src_2->n_runs) {
6866         run_container_append(src_1, src_2->runs[xrlepos], &previousrle);
6867         xrlepos++;
6868     }
6869     while (rlepos < input1nruns) {
6870         run_container_append(src_1, inputsrc1[rlepos], &previousrle);
6871         rlepos++;
6872     }
6873 }
6874 
6875 /* Compute the symmetric difference of `src_1' and `src_2' and write the result
6876  * to `dst'
6877  * It is assumed that `dst' is distinct from both `src_1' and `src_2'. */
run_container_xor(const run_container_t * src_1,const run_container_t * src_2,run_container_t * dst)6878 void run_container_xor(const run_container_t *src_1,
6879                        const run_container_t *src_2, run_container_t *dst) {
6880     // don't bother to convert xor with full range into negation
6881     // since negation is implemented similarly
6882 
6883     const int32_t neededcapacity = src_1->n_runs + src_2->n_runs;
6884     if (dst->capacity < neededcapacity)
6885         run_container_grow(dst, neededcapacity, false);
6886 
6887     int32_t pos1 = 0;
6888     int32_t pos2 = 0;
6889     dst->n_runs = 0;
6890 
6891     while ((pos1 < src_1->n_runs) && (pos2 < src_2->n_runs)) {
6892         if (src_1->runs[pos1].value <= src_2->runs[pos2].value) {
6893             run_container_smart_append_exclusive(dst, src_1->runs[pos1].value,
6894                                                  src_1->runs[pos1].length);
6895             pos1++;
6896         } else {
6897             run_container_smart_append_exclusive(dst, src_2->runs[pos2].value,
6898                                                  src_2->runs[pos2].length);
6899             pos2++;
6900         }
6901     }
6902     while (pos1 < src_1->n_runs) {
6903         run_container_smart_append_exclusive(dst, src_1->runs[pos1].value,
6904                                              src_1->runs[pos1].length);
6905         pos1++;
6906     }
6907 
6908     while (pos2 < src_2->n_runs) {
6909         run_container_smart_append_exclusive(dst, src_2->runs[pos2].value,
6910                                              src_2->runs[pos2].length);
6911         pos2++;
6912     }
6913 }
6914 
6915 /* Compute the intersection of src_1 and src_2 and write the result to
6916  * dst. It is assumed that dst is distinct from both src_1 and src_2. */
run_container_intersection(const run_container_t * src_1,const run_container_t * src_2,run_container_t * dst)6917 void run_container_intersection(const run_container_t *src_1,
6918                                 const run_container_t *src_2,
6919                                 run_container_t *dst) {
6920     const bool if1 = run_container_is_full(src_1);
6921     const bool if2 = run_container_is_full(src_2);
6922     if (if1 || if2) {
6923         if (if1) {
6924             run_container_copy(src_2, dst);
6925             return;
6926         }
6927         if (if2) {
6928             run_container_copy(src_1, dst);
6929             return;
6930         }
6931     }
6932     // TODO: this could be a lot more efficient, could use SIMD optimizations
6933     const int32_t neededcapacity = src_1->n_runs + src_2->n_runs;
6934     if (dst->capacity < neededcapacity)
6935         run_container_grow(dst, neededcapacity, false);
6936     dst->n_runs = 0;
6937     int32_t rlepos = 0;
6938     int32_t xrlepos = 0;
6939     int32_t start = src_1->runs[rlepos].value;
6940     int32_t end = start + src_1->runs[rlepos].length + 1;
6941     int32_t xstart = src_2->runs[xrlepos].value;
6942     int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
6943     while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
6944         if (end <= xstart) {
6945             ++rlepos;
6946             if (rlepos < src_1->n_runs) {
6947                 start = src_1->runs[rlepos].value;
6948                 end = start + src_1->runs[rlepos].length + 1;
6949             }
6950         } else if (xend <= start) {
6951             ++xrlepos;
6952             if (xrlepos < src_2->n_runs) {
6953                 xstart = src_2->runs[xrlepos].value;
6954                 xend = xstart + src_2->runs[xrlepos].length + 1;
6955             }
6956         } else {  // they overlap
6957             const int32_t lateststart = start > xstart ? start : xstart;
6958             int32_t earliestend;
6959             if (end == xend) {  // improbable
6960                 earliestend = end;
6961                 rlepos++;
6962                 xrlepos++;
6963                 if (rlepos < src_1->n_runs) {
6964                     start = src_1->runs[rlepos].value;
6965                     end = start + src_1->runs[rlepos].length + 1;
6966                 }
6967                 if (xrlepos < src_2->n_runs) {
6968                     xstart = src_2->runs[xrlepos].value;
6969                     xend = xstart + src_2->runs[xrlepos].length + 1;
6970                 }
6971             } else if (end < xend) {
6972                 earliestend = end;
6973                 rlepos++;
6974                 if (rlepos < src_1->n_runs) {
6975                     start = src_1->runs[rlepos].value;
6976                     end = start + src_1->runs[rlepos].length + 1;
6977                 }
6978 
6979             } else {  // end > xend
6980                 earliestend = xend;
6981                 xrlepos++;
6982                 if (xrlepos < src_2->n_runs) {
6983                     xstart = src_2->runs[xrlepos].value;
6984                     xend = xstart + src_2->runs[xrlepos].length + 1;
6985                 }
6986             }
6987             dst->runs[dst->n_runs].value = (uint16_t)lateststart;
6988             dst->runs[dst->n_runs].length =
6989                 (uint16_t)(earliestend - lateststart - 1);
6990             dst->n_runs++;
6991         }
6992     }
6993 }
6994 
6995 /* Compute the size of the intersection of src_1 and src_2 . */
run_container_intersection_cardinality(const run_container_t * src_1,const run_container_t * src_2)6996 int run_container_intersection_cardinality(const run_container_t *src_1,
6997                                            const run_container_t *src_2) {
6998     const bool if1 = run_container_is_full(src_1);
6999     const bool if2 = run_container_is_full(src_2);
7000     if (if1 || if2) {
7001         if (if1) {
7002             return run_container_cardinality(src_2);
7003         }
7004         if (if2) {
7005             return run_container_cardinality(src_1);
7006         }
7007     }
7008     int answer = 0;
7009     int32_t rlepos = 0;
7010     int32_t xrlepos = 0;
7011     int32_t start = src_1->runs[rlepos].value;
7012     int32_t end = start + src_1->runs[rlepos].length + 1;
7013     int32_t xstart = src_2->runs[xrlepos].value;
7014     int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
7015     while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
7016         if (end <= xstart) {
7017             ++rlepos;
7018             if (rlepos < src_1->n_runs) {
7019                 start = src_1->runs[rlepos].value;
7020                 end = start + src_1->runs[rlepos].length + 1;
7021             }
7022         } else if (xend <= start) {
7023             ++xrlepos;
7024             if (xrlepos < src_2->n_runs) {
7025                 xstart = src_2->runs[xrlepos].value;
7026                 xend = xstart + src_2->runs[xrlepos].length + 1;
7027             }
7028         } else {  // they overlap
7029             const int32_t lateststart = start > xstart ? start : xstart;
7030             int32_t earliestend;
7031             if (end == xend) {  // improbable
7032                 earliestend = end;
7033                 rlepos++;
7034                 xrlepos++;
7035                 if (rlepos < src_1->n_runs) {
7036                     start = src_1->runs[rlepos].value;
7037                     end = start + src_1->runs[rlepos].length + 1;
7038                 }
7039                 if (xrlepos < src_2->n_runs) {
7040                     xstart = src_2->runs[xrlepos].value;
7041                     xend = xstart + src_2->runs[xrlepos].length + 1;
7042                 }
7043             } else if (end < xend) {
7044                 earliestend = end;
7045                 rlepos++;
7046                 if (rlepos < src_1->n_runs) {
7047                     start = src_1->runs[rlepos].value;
7048                     end = start + src_1->runs[rlepos].length + 1;
7049                 }
7050 
7051             } else {  // end > xend
7052                 earliestend = xend;
7053                 xrlepos++;
7054                 if (xrlepos < src_2->n_runs) {
7055                     xstart = src_2->runs[xrlepos].value;
7056                     xend = xstart + src_2->runs[xrlepos].length + 1;
7057                 }
7058             }
7059             answer += earliestend - lateststart;
7060         }
7061     }
7062     return answer;
7063 }
7064 
run_container_intersect(const run_container_t * src_1,const run_container_t * src_2)7065 bool run_container_intersect(const run_container_t *src_1,
7066                                 const run_container_t *src_2) {
7067     const bool if1 = run_container_is_full(src_1);
7068     const bool if2 = run_container_is_full(src_2);
7069     if (if1 || if2) {
7070         if (if1) {
7071             return !run_container_empty(src_2);
7072         }
7073         if (if2) {
7074         	return !run_container_empty(src_1);
7075         }
7076     }
7077     int32_t rlepos = 0;
7078     int32_t xrlepos = 0;
7079     int32_t start = src_1->runs[rlepos].value;
7080     int32_t end = start + src_1->runs[rlepos].length + 1;
7081     int32_t xstart = src_2->runs[xrlepos].value;
7082     int32_t xend = xstart + src_2->runs[xrlepos].length + 1;
7083     while ((rlepos < src_1->n_runs) && (xrlepos < src_2->n_runs)) {
7084         if (end <= xstart) {
7085             ++rlepos;
7086             if (rlepos < src_1->n_runs) {
7087                 start = src_1->runs[rlepos].value;
7088                 end = start + src_1->runs[rlepos].length + 1;
7089             }
7090         } else if (xend <= start) {
7091             ++xrlepos;
7092             if (xrlepos < src_2->n_runs) {
7093                 xstart = src_2->runs[xrlepos].value;
7094                 xend = xstart + src_2->runs[xrlepos].length + 1;
7095             }
7096         } else {  // they overlap
7097             return true;
7098         }
7099     }
7100     return false;
7101 }
7102 
7103 
7104 /* Compute the difference of src_1 and src_2 and write the result to
7105  * dst. It is assumed that dst is distinct from both src_1 and src_2. */
run_container_andnot(const run_container_t * src_1,const run_container_t * src_2,run_container_t * dst)7106 void run_container_andnot(const run_container_t *src_1,
7107                           const run_container_t *src_2, run_container_t *dst) {
7108     // following Java implementation as of June 2016
7109 
7110     if (dst->capacity < src_1->n_runs + src_2->n_runs)
7111         run_container_grow(dst, src_1->n_runs + src_2->n_runs, false);
7112 
7113     dst->n_runs = 0;
7114 
7115     int rlepos1 = 0;
7116     int rlepos2 = 0;
7117     int32_t start = src_1->runs[rlepos1].value;
7118     int32_t end = start + src_1->runs[rlepos1].length + 1;
7119     int32_t start2 = src_2->runs[rlepos2].value;
7120     int32_t end2 = start2 + src_2->runs[rlepos2].length + 1;
7121 
7122     while ((rlepos1 < src_1->n_runs) && (rlepos2 < src_2->n_runs)) {
7123         if (end <= start2) {
7124             // output the first run
7125             dst->runs[dst->n_runs++] =
7126                 (rle16_t){.value = (uint16_t)start,
7127                           .length = (uint16_t)(end - start - 1)};
7128             rlepos1++;
7129             if (rlepos1 < src_1->n_runs) {
7130                 start = src_1->runs[rlepos1].value;
7131                 end = start + src_1->runs[rlepos1].length + 1;
7132             }
7133         } else if (end2 <= start) {
7134             // exit the second run
7135             rlepos2++;
7136             if (rlepos2 < src_2->n_runs) {
7137                 start2 = src_2->runs[rlepos2].value;
7138                 end2 = start2 + src_2->runs[rlepos2].length + 1;
7139             }
7140         } else {
7141             if (start < start2) {
7142                 dst->runs[dst->n_runs++] =
7143                     (rle16_t){.value = (uint16_t)start,
7144                               .length = (uint16_t)(start2 - start - 1)};
7145             }
7146             if (end2 < end) {
7147                 start = end2;
7148             } else {
7149                 rlepos1++;
7150                 if (rlepos1 < src_1->n_runs) {
7151                     start = src_1->runs[rlepos1].value;
7152                     end = start + src_1->runs[rlepos1].length + 1;
7153                 }
7154             }
7155         }
7156     }
7157     if (rlepos1 < src_1->n_runs) {
7158         dst->runs[dst->n_runs++] = (rle16_t){
7159             .value = (uint16_t)start, .length = (uint16_t)(end - start - 1)};
7160         rlepos1++;
7161         if (rlepos1 < src_1->n_runs) {
7162             memcpy(dst->runs + dst->n_runs, src_1->runs + rlepos1,
7163                    sizeof(rle16_t) * (src_1->n_runs - rlepos1));
7164             dst->n_runs += src_1->n_runs - rlepos1;
7165         }
7166     }
7167 }
7168 
run_container_to_uint32_array(void * vout,const run_container_t * cont,uint32_t base)7169 int run_container_to_uint32_array(void *vout, const run_container_t *cont,
7170                                   uint32_t base) {
7171     int outpos = 0;
7172     uint32_t *out = (uint32_t *)vout;
7173     for (int i = 0; i < cont->n_runs; ++i) {
7174         uint32_t run_start = base + cont->runs[i].value;
7175         uint16_t le = cont->runs[i].length;
7176         for (int j = 0; j <= le; ++j) {
7177             uint32_t val = run_start + j;
7178             memcpy(out + outpos, &val,
7179                    sizeof(uint32_t));  // should be compiled as a MOV on x64
7180             outpos++;
7181         }
7182     }
7183     return outpos;
7184 }
7185 
7186 /*
7187  * Print this container using printf (useful for debugging).
7188  */
run_container_printf(const run_container_t * cont)7189 void run_container_printf(const run_container_t *cont) {
7190     for (int i = 0; i < cont->n_runs; ++i) {
7191         uint16_t run_start = cont->runs[i].value;
7192         uint16_t le = cont->runs[i].length;
7193         printf("[%d,%d]", run_start, run_start + le);
7194     }
7195 }
7196 
7197 /*
7198  * Print this container using printf as a comma-separated list of 32-bit
7199  * integers starting at base.
7200  */
run_container_printf_as_uint32_array(const run_container_t * cont,uint32_t base)7201 void run_container_printf_as_uint32_array(const run_container_t *cont,
7202                                           uint32_t base) {
7203     if (cont->n_runs == 0) return;
7204     {
7205         uint32_t run_start = base + cont->runs[0].value;
7206         uint16_t le = cont->runs[0].length;
7207         printf("%u", run_start);
7208         for (uint32_t j = 1; j <= le; ++j) printf(",%u", run_start + j);
7209     }
7210     for (int32_t i = 1; i < cont->n_runs; ++i) {
7211         uint32_t run_start = base + cont->runs[i].value;
7212         uint16_t le = cont->runs[i].length;
7213         for (uint32_t j = 0; j <= le; ++j) printf(",%u", run_start + j);
7214     }
7215 }
7216 
run_container_serialize(const run_container_t * container,char * buf)7217 int32_t run_container_serialize(const run_container_t *container, char *buf) {
7218     int32_t l, off;
7219 
7220     memcpy(buf, &container->n_runs, off = sizeof(container->n_runs));
7221     memcpy(&buf[off], &container->capacity, sizeof(container->capacity));
7222     off += sizeof(container->capacity);
7223 
7224     l = sizeof(rle16_t) * container->n_runs;
7225     memcpy(&buf[off], container->runs, l);
7226     return (off + l);
7227 }
7228 
run_container_write(const run_container_t * container,char * buf)7229 int32_t run_container_write(const run_container_t *container, char *buf) {
7230     memcpy(buf, &container->n_runs, sizeof(uint16_t));
7231     memcpy(buf + sizeof(uint16_t), container->runs,
7232            container->n_runs * sizeof(rle16_t));
7233     return run_container_size_in_bytes(container);
7234 }
7235 
/*
 * Fill the container from buf, which holds a 16-bit run count followed by
 * that many run entries. The container is grown first when its capacity is
 * too small. cardinality is unused. Returns the value of
 * run_container_size_in_bytes() for the filled container.
 */
int32_t run_container_read(int32_t cardinality, run_container_t *container,
                           const char *buf) {
    (void)cardinality;
    // Read the 16-bit count into a temporary and assign it, so the whole
    // n_runs field is overwritten. Copying two bytes straight into the wider
    // field would leave its other half with whatever it held before.
    uint16_t n_runs16;
    memcpy(&n_runs16, buf, sizeof(n_runs16));
    container->n_runs = n_runs16;
    if (container->n_runs > container->capacity)
        run_container_grow(container, container->n_runs, false);
    if (container->n_runs > 0) {
        memcpy(container->runs, buf + sizeof(n_runs16),
               container->n_runs * sizeof(rle16_t));
    }
    return run_container_size_in_bytes(container);
}
7248 
run_container_serialization_len(const run_container_t * container)7249 uint32_t run_container_serialization_len(const run_container_t *container) {
7250     return (sizeof(container->n_runs) + sizeof(container->capacity) +
7251             sizeof(rle16_t) * container->n_runs);
7252 }
7253 
run_container_deserialize(const char * buf,size_t buf_len)7254 void *run_container_deserialize(const char *buf, size_t buf_len) {
7255     run_container_t *ptr;
7256 
7257     if (buf_len < 8 /* n_runs + capacity */)
7258         return (NULL);
7259     else
7260         buf_len -= 8;
7261 
7262     if ((ptr = (run_container_t *)malloc(sizeof(run_container_t))) != NULL) {
7263         size_t len;
7264         int32_t off;
7265 
7266         memcpy(&ptr->n_runs, buf, off = 4);
7267         memcpy(&ptr->capacity, &buf[off], 4);
7268         off += 4;
7269 
7270         len = sizeof(rle16_t) * ptr->n_runs;
7271 
7272         if (len != buf_len) {
7273             free(ptr);
7274             return (NULL);
7275         }
7276 
7277         if ((ptr->runs = (rle16_t *)malloc(len)) == NULL) {
7278             free(ptr);
7279             return (NULL);
7280         }
7281 
7282         memcpy(ptr->runs, &buf[off], len);
7283 
7284         /* Check if returned values are monotonically increasing */
7285         for (int32_t i = 0, j = 0; i < ptr->n_runs; i++) {
7286             if (ptr->runs[i].value < j) {
7287                 free(ptr->runs);
7288                 free(ptr);
7289                 return (NULL);
7290             } else
7291                 j = ptr->runs[i].value;
7292         }
7293     }
7294 
7295     return (ptr);
7296 }
7297 
/*
 * Call iterator(base + value, ptr) for every value of the container, run by
 * run in stored order. Stops and returns false as soon as the callback
 * returns false; returns true when every value has been visited.
 */
bool run_container_iterate(const run_container_t *cont, uint32_t base,
                           roaring_iterator iterator, void *ptr) {
    for (int r = 0; r < cont->n_runs; ++r) {
        const uint32_t first = base + cont->runs[r].value;
        const int extra = cont->runs[r].length;  // run holds extra + 1 values
        int k = 0;
        while (k <= extra) {
            if (!iterator(first + k, ptr)) {
                return false;
            }
            ++k;
        }
    }
    return true;
}
7309 
/*
 * Same traversal as run_container_iterate(), for a 64-bit callback: each
 * 32-bit value (base + value) is widened to 64 bits and OR-ed with high_bits
 * before being handed to iterator. Returns false as soon as the callback
 * returns false, true otherwise.
 */
bool run_container_iterate64(const run_container_t *cont, uint32_t base,
                             roaring_iterator64 iterator, uint64_t high_bits,
                             void *ptr) {
    for (int r = 0; r < cont->n_runs; ++r) {
        const uint32_t first = base + cont->runs[r].value;
        const int extra = cont->runs[r].length;  // run holds extra + 1 values
        int k = 0;
        while (k <= extra) {
            const uint64_t widened = high_bits | (uint64_t)(first + k);
            if (!iterator(widened, ptr)) {
                return false;
            }
            ++k;
        }
    }
    return true;
}
7323 
run_container_is_subset(const run_container_t * container1,const run_container_t * container2)7324 bool run_container_is_subset(const run_container_t *container1,
7325                              const run_container_t *container2) {
7326     int i1 = 0, i2 = 0;
7327     while (i1 < container1->n_runs && i2 < container2->n_runs) {
7328         int start1 = container1->runs[i1].value;
7329         int stop1 = start1 + container1->runs[i1].length;
7330         int start2 = container2->runs[i2].value;
7331         int stop2 = start2 + container2->runs[i2].length;
7332         if (start1 < start2) {
7333             return false;
7334         } else {  // start1 >= start2
7335             if (stop1 < stop2) {
7336                 i1++;
7337             } else if (stop1 == stop2) {
7338                 i1++;
7339                 i2++;
7340             } else {  // stop1 > stop2
7341                 i2++;
7342             }
7343         }
7344     }
7345     if (i1 == container1->n_runs) {
7346         return true;
7347     } else {
7348         return false;
7349     }
7350 }
7351 
// TODO: write a smart_append_exclusive version that matches the overloaded
// one-parameter Java version (or is it even used?)
7354 
7355 // follows the Java implementation closely
7356 // length is the rle-value.  Ie, run [10,12) uses a length value 1.
run_container_smart_append_exclusive(run_container_t * src,const uint16_t start,const uint16_t length)7357 void run_container_smart_append_exclusive(run_container_t *src,
7358                                           const uint16_t start,
7359                                           const uint16_t length) {
7360     int old_end;
7361     rle16_t *last_run = src->n_runs ? src->runs + (src->n_runs - 1) : NULL;
7362     rle16_t *appended_last_run = src->runs + src->n_runs;
7363 
7364     if (!src->n_runs ||
7365         (start > (old_end = last_run->value + last_run->length + 1))) {
7366         *appended_last_run = (rle16_t){.value = start, .length = length};
7367         src->n_runs++;
7368         return;
7369     }
7370     if (old_end == start) {
7371         // we merge
7372         last_run->length += (length + 1);
7373         return;
7374     }
7375     int new_end = start + length + 1;
7376 
7377     if (start == last_run->value) {
7378         // wipe out previous
7379         if (new_end < old_end) {
7380             *last_run = (rle16_t){.value = (uint16_t)new_end,
7381                                   .length = (uint16_t)(old_end - new_end - 1)};
7382             return;
7383         } else if (new_end > old_end) {
7384             *last_run = (rle16_t){.value = (uint16_t)old_end,
7385                                   .length = (uint16_t)(new_end - old_end - 1)};
7386             return;
7387         } else {
7388             src->n_runs--;
7389             return;
7390         }
7391     }
7392     last_run->length = start - last_run->value - 1;
7393     if (new_end < old_end) {
7394         *appended_last_run =
7395             (rle16_t){.value = (uint16_t)new_end,
7396                       .length = (uint16_t)(old_end - new_end - 1)};
7397         src->n_runs++;
7398     } else if (new_end > old_end) {
7399         *appended_last_run =
7400             (rle16_t){.value = (uint16_t)old_end,
7401                       .length = (uint16_t)(new_end - old_end - 1)};
7402         src->n_runs++;
7403     }
7404 }
7405 
run_container_select(const run_container_t * container,uint32_t * start_rank,uint32_t rank,uint32_t * element)7406 bool run_container_select(const run_container_t *container,
7407                           uint32_t *start_rank, uint32_t rank,
7408                           uint32_t *element) {
7409     for (int i = 0; i < container->n_runs; i++) {
7410         uint16_t length = container->runs[i].length;
7411         if (rank <= *start_rank + length) {
7412             uint16_t value = container->runs[i].value;
7413             *element = value + rank - (*start_rank);
7414             return true;
7415         } else
7416             *start_rank += length + 1;
7417     }
7418     return false;
7419 }
7420 
run_container_rank(const run_container_t * container,uint16_t x)7421 int run_container_rank(const run_container_t *container, uint16_t x) {
7422     int sum = 0;
7423     uint32_t x32 = x;
7424     for (int i = 0; i < container->n_runs; i++) {
7425         uint32_t startpoint = container->runs[i].value;
7426         uint32_t length = container->runs[i].length;
7427         uint32_t endpoint = length + startpoint;
7428         if (x <= endpoint) {
7429             if (x < startpoint) break;
7430             return sum + (x32 - startpoint) + 1;
7431         } else {
7432             sum += length + 1;
7433         }
7434     }
7435     return sum;
7436 }
7437 /* end file src/containers/run.c */
7438 /* begin file src/roaring.c */
7439 #include <assert.h>
7440 #include <stdarg.h>
7441 #include <stdint.h>
7442 #include <stdio.h>
7443 #include <string.h>
7444 #include <inttypes.h>
7445 
is_cow(const roaring_bitmap_t * r)7446 static inline bool is_cow(const roaring_bitmap_t *r) {
7447     return r->high_low_container.flags & ROARING_FLAG_COW;
7448 }
is_frozen(const roaring_bitmap_t * r)7449 static inline bool is_frozen(const roaring_bitmap_t *r) {
7450     return r->high_low_container.flags & ROARING_FLAG_FROZEN;
7451 }
7452 
7453 // this is like roaring_bitmap_add, but it populates pointer arguments in such a
7454 // way
7455 // that we can recover the container touched, which, in turn can be used to
7456 // accelerate some functions (when you repeatedly need to add to the same
7457 // container)
containerptr_roaring_bitmap_add(roaring_bitmap_t * r,uint32_t val,uint8_t * typecode,int * index)7458 static inline void *containerptr_roaring_bitmap_add(roaring_bitmap_t *r,
7459                                                     uint32_t val,
7460                                                     uint8_t *typecode,
7461                                                     int *index) {
7462     uint16_t hb = val >> 16;
7463     const int i = ra_get_index(&r->high_low_container, hb);
7464     if (i >= 0) {
7465         ra_unshare_container_at_index(&r->high_low_container, i);
7466         void *container =
7467             ra_get_container_at_index(&r->high_low_container, i, typecode);
7468         uint8_t newtypecode = *typecode;
7469         void *container2 =
7470             container_add(container, val & 0xFFFF, *typecode, &newtypecode);
7471         *index = i;
7472         if (container2 != container) {
7473             container_free(container, *typecode);
7474             ra_set_container_at_index(&r->high_low_container, i, container2,
7475                                       newtypecode);
7476             *typecode = newtypecode;
7477             return container2;
7478         } else {
7479             return container;
7480         }
7481     } else {
7482         array_container_t *newac = array_container_create();
7483         void *container = container_add(newac, val & 0xFFFF,
7484                                         ARRAY_CONTAINER_TYPE_CODE, typecode);
7485         // we could just assume that it stays an array container
7486         ra_insert_new_key_value_at(&r->high_low_container, -i - 1, hb,
7487                                    container, *typecode);
7488         *index = -i - 1;
7489         return container;
7490     }
7491 }
7492 
roaring_bitmap_create(void)7493 roaring_bitmap_t *roaring_bitmap_create(void) {
7494     roaring_bitmap_t *ans =
7495         (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
7496     if (!ans) {
7497         return NULL;
7498     }
7499     ra_init(&ans->high_low_container);
7500     return ans;
7501 }
7502 
roaring_bitmap_create_with_capacity(uint32_t cap)7503 roaring_bitmap_t *roaring_bitmap_create_with_capacity(uint32_t cap) {
7504     roaring_bitmap_t *ans =
7505         (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
7506     if (!ans) {
7507         return NULL;
7508     }
7509     bool is_ok = ra_init_with_capacity(&ans->high_low_container, cap);
7510     if (!is_ok) {
7511         free(ans);
7512         return NULL;
7513     }
7514     return ans;
7515 }
7516 
/*
 * Add the n_args values stored at vals to the bitmap.
 *
 * The container touched by the previous value is remembered: while
 * consecutive values share their upper 16 bits they are added to that
 * container directly, without looking it up again.
 */
void roaring_bitmap_add_many(roaring_bitmap_t *r, size_t n_args,
                             const uint32_t *vals) {
    if (n_args == 0) return;

    uint8_t typecode = 0;    // typecode of last container touched
    int containerindex = 0;  // its position in the high-low array

    // Values are fetched with memcpy, as in the original implementation.
    uint32_t prev;           // previous value inserted
    memcpy(&prev, vals, sizeof(prev));
    void *container =        // last container touched
        containerptr_roaring_bitmap_add(r, prev, &typecode, &containerindex);

    for (size_t k = 1; k < n_args; k++) {
        uint32_t val;
        memcpy(&val, vals + k, sizeof(val));
        const bool same_container = (prev >> 16) == (val >> 16);
        prev = val;

        if (!same_container) {
            container = containerptr_roaring_bitmap_add(r, val, &typecode,
                                                        &containerindex);
            continue;
        }

        // No need to seek the container, it is at hand: do the insertion
        // directly, bypassing the roaring_bitmap_add call.
        uint8_t newtypecode = typecode;
        void *container2 =
            container_add(container, val & 0xFFFF, typecode, &newtypecode);
        if (container2 == container) {
            continue;
        }
        // rare instance when we need to change the container type
        container_free(container, typecode);
        ra_set_container_at_index(&r->high_low_container, containerindex,
                                  container2, newtypecode);
        typecode = newtypecode;
        container = container2;
    }
}
7557 
roaring_bitmap_of_ptr(size_t n_args,const uint32_t * vals)7558 roaring_bitmap_t *roaring_bitmap_of_ptr(size_t n_args, const uint32_t *vals) {
7559     roaring_bitmap_t *answer = roaring_bitmap_create();
7560     roaring_bitmap_add_many(answer, n_args, vals);
7561     return answer;
7562 }
7563 
roaring_bitmap_of(size_t n_args,...)7564 roaring_bitmap_t *roaring_bitmap_of(size_t n_args, ...) {
7565     // todo: could be greatly optimized but we do not expect this call to ever
7566     // include long lists
7567     roaring_bitmap_t *answer = roaring_bitmap_create();
7568     va_list ap;
7569     va_start(ap, n_args);
7570     for (size_t i = 1; i <= n_args; i++) {
7571         uint32_t val = va_arg(ap, uint32_t);
7572         roaring_bitmap_add(answer, val);
7573     }
7574     va_end(ap);
7575     return answer;
7576 }
7577 
/* Return the smaller of two 32-bit unsigned integers. */
static inline uint32_t minimum_uint32(uint32_t a, uint32_t b) {
    if (b < a) {
        return b;
    }
    return a;
}
7581 
/* Return the smaller of two 64-bit unsigned integers. */
static inline uint64_t minimum_uint64(uint64_t a, uint64_t b) {
    if (b < a) {
        return b;
    }
    return a;
}
7585 
roaring_bitmap_from_range(uint64_t min,uint64_t max,uint32_t step)7586 roaring_bitmap_t *roaring_bitmap_from_range(uint64_t min, uint64_t max,
7587                                             uint32_t step) {
7588     if(max >= UINT64_C(0x100000000)) {
7589         max = UINT64_C(0x100000000);
7590     }
7591     if (step == 0) return NULL;
7592     if (max <= min) return NULL;
7593     roaring_bitmap_t *answer = roaring_bitmap_create();
7594     if (step >= (1 << 16)) {
7595         for (uint32_t value = (uint32_t)min; value < max; value += step) {
7596             roaring_bitmap_add(answer, value);
7597         }
7598         return answer;
7599     }
7600     uint64_t min_tmp = min;
7601     do {
7602         uint32_t key = (uint32_t)min_tmp >> 16;
7603         uint32_t container_min = min_tmp & 0xFFFF;
7604         uint32_t container_max = (uint32_t)minimum_uint64(max - (key << 16), 1 << 16);
7605         uint8_t type;
7606         void *container = container_from_range(&type, container_min,
7607                                                container_max, (uint16_t)step);
7608         ra_append(&answer->high_low_container, key, container, type);
7609         uint32_t gap = container_max - container_min + step - 1;
7610         min_tmp += gap - (gap % step);
7611     } while (min_tmp < max);
7612     // cardinality of bitmap will be ((uint64_t) max - min + step - 1 ) / step
7613     return answer;
7614 }
7615 
/*
 * Add every value of the closed interval [min, max] to the bitmap. Nothing
 * happens when min > max.
 *
 * One container is needed per 16-bit key between min >> 16 and max >> 16.
 * The tail of the high-low array is shifted to make room for the missing
 * ones, then the keys are processed from the highest to the lowest: a
 * container that already exists is extended, a missing one is created.
 */
void roaring_bitmap_add_range_closed(roaring_bitmap_t *ra, uint32_t min, uint32_t max) {
    if (min > max) {
        return;
    }

    roaring_array_t *hlc = &ra->high_low_container;
    const uint32_t min_key = min >> 16;
    const uint32_t max_key = max >> 16;

    const int32_t num_required_containers = max_key - min_key + 1;
    // containers with a key above max_key / below min_key / in between
    const int32_t suffix_length = count_greater(hlc->keys, hlc->size, max_key);
    const int32_t prefix_length =
        count_less(hlc->keys, hlc->size - suffix_length, min_key);
    const int32_t common_length = hlc->size - prefix_length - suffix_length;

    if (num_required_containers > common_length) {
        ra_shift_tail(hlc, suffix_length,
                      num_required_containers - common_length);
    }

    // src walks the pre-existing containers, dst the slots to fill; both are
    // computed after the shift above and move towards the front.
    int32_t src = prefix_length + common_length - 1;
    int32_t dst = hlc->size - suffix_length - 1;
    uint32_t key = max_key;
    do {
        const uint32_t container_min = (key == min_key) ? (min & 0xffff) : 0;
        const uint32_t container_max = (key == max_key) ? (max & 0xffff) : 0xffff;
        void *new_container;
        uint8_t new_type;

        const bool key_present = (src >= 0) && (hlc->keys[src] == key);
        if (!key_present) {
            new_container = container_from_range(&new_type, container_min,
                                                 container_max+1, 1);
        } else {
            ra_unshare_container_at_index(hlc, src);
            void *existing = hlc->containers[src];
            const uint8_t existing_type = hlc->typecodes[src];
            new_container = container_add_range(existing, existing_type,
                                                container_min, container_max,
                                                &new_type);
            if (new_container != existing) {
                container_free(existing, existing_type);
            }
            src--;
        }
        ra_replace_key_and_container_at_index(hlc, dst, key, new_container,
                                              new_type);
        dst--;
        // Comparing before the decrement stops after min_key was handled,
        // which also covers min_key == 0.
    } while (key-- != min_key);
}
7665 
/*
 * Remove every value of the closed interval [min, max] from the bitmap.
 * Nothing happens when min > max.
 *
 * Containers whose key lies between min >> 16 and max >> 16 are visited in
 * order. Those that container_remove_range() leaves non-NULL are compacted
 * towards the front; the tail of the array is then shifted over the slots
 * that became free.
 */
void roaring_bitmap_remove_range_closed(roaring_bitmap_t *ra, uint32_t min, uint32_t max) {
    if (min > max) {
        return;
    }

    roaring_array_t *hlc = &ra->high_low_container;
    const uint32_t min_key = min >> 16;
    const uint32_t max_key = max >> 16;

    // src: container being examined, dst: next slot for a surviving one
    int32_t src = count_less(hlc->keys, hlc->size, min_key);
    int32_t dst = src;
    for (; src < hlc->size && hlc->keys[src] <= max_key; src++) {
        const uint32_t key = hlc->keys[src];
        const uint32_t container_min = (min_key == key) ? (min & 0xffff) : 0;
        const uint32_t container_max = (max_key == key) ? (max & 0xffff) : 0xffff;

        ra_unshare_container_at_index(hlc, src);
        void *old_container = hlc->containers[src];
        const uint8_t old_type = hlc->typecodes[src];

        uint8_t new_type;
        void *new_container =
            container_remove_range(old_container, old_type, container_min,
                                   container_max, &new_type);
        if (new_container != old_container) {
            container_free(old_container, old_type);
        }
        if (new_container) {
            ra_replace_key_and_container_at_index(hlc, dst, key, new_container,
                                                  new_type);
            dst++;
        }
    }
    if (src > dst) {
        // dst - src is negative: the tail moves towards the front
        ra_shift_tail(hlc, hlc->size - src, dst - src);
    }
}
7702 
roaring_bitmap_printf(const roaring_bitmap_t * ra)7703 void roaring_bitmap_printf(const roaring_bitmap_t *ra) {
7704     printf("{");
7705     for (int i = 0; i < ra->high_low_container.size; ++i) {
7706         container_printf_as_uint32_array(
7707             ra->high_low_container.containers[i],
7708             ra->high_low_container.typecodes[i],
7709             ((uint32_t)ra->high_low_container.keys[i]) << 16);
7710         if (i + 1 < ra->high_low_container.size) printf(",");
7711     }
7712     printf("}");
7713 }
7714 
roaring_bitmap_printf_describe(const roaring_bitmap_t * ra)7715 void roaring_bitmap_printf_describe(const roaring_bitmap_t *ra) {
7716     printf("{");
7717     for (int i = 0; i < ra->high_low_container.size; ++i) {
7718         printf("%d: %s (%d)", ra->high_low_container.keys[i],
7719                get_full_container_name(ra->high_low_container.containers[i],
7720                                        ra->high_low_container.typecodes[i]),
7721                container_get_cardinality(ra->high_low_container.containers[i],
7722                                          ra->high_low_container.typecodes[i]));
7723         if (ra->high_low_container.typecodes[i] == SHARED_CONTAINER_TYPE_CODE) {
7724             printf(
7725                 "(shared count = %" PRIu32 " )",
7726                 ((shared_container_t *)(ra->high_low_container.containers[i]))
7727                     ->counter);
7728         }
7729 
7730         if (i + 1 < ra->high_low_container.size) printf(", ");
7731     }
7732     printf("}");
7733 }
7734 
/* Accumulator for the smallest value, the largest value and the sum of all
 * values handed to min_max_sum_fnc(). */
typedef struct min_max_sum_s {
    uint32_t min;
    uint32_t max;
    uint64_t sum;
} min_max_sum_t;

/*
 * Iteration callback: fold `value` into the min_max_sum_t that `param`
 * points to. Always returns true, so the iteration is never cut short.
 */
static bool min_max_sum_fnc(uint32_t value, void *param) {
    min_max_sum_t *acc = (min_max_sum_t *)param;
    acc->sum += value;
    if (acc->min > value) {
        acc->min = value;
    }
    if (acc->max < value) {
        acc->max = value;
    }
    return true;  // we always process all data points
}
7748 
7749 /**
7750 *  (For advanced users.)
7751 * Collect statistics about the bitmap
7752 */
roaring_bitmap_statistics(const roaring_bitmap_t * ra,roaring_statistics_t * stat)7753 void roaring_bitmap_statistics(const roaring_bitmap_t *ra,
7754                                roaring_statistics_t *stat) {
7755     memset(stat, 0, sizeof(*stat));
7756     stat->n_containers = ra->high_low_container.size;
7757     stat->cardinality = roaring_bitmap_get_cardinality(ra);
7758     min_max_sum_t mms;
7759     mms.min = UINT32_C(0xFFFFFFFF);
7760     mms.max = UINT32_C(0);
7761     mms.sum = 0;
7762     roaring_iterate(ra, &min_max_sum_fnc, &mms);
7763     stat->min_value = mms.min;
7764     stat->max_value = mms.max;
7765     stat->sum_value = mms.sum;
7766 
7767     for (int i = 0; i < ra->high_low_container.size; ++i) {
7768         uint8_t truetype =
7769             get_container_type(ra->high_low_container.containers[i],
7770                                ra->high_low_container.typecodes[i]);
7771         uint32_t card =
7772             container_get_cardinality(ra->high_low_container.containers[i],
7773                                       ra->high_low_container.typecodes[i]);
7774         uint32_t sbytes =
7775             container_size_in_bytes(ra->high_low_container.containers[i],
7776                                     ra->high_low_container.typecodes[i]);
7777         switch (truetype) {
7778             case BITSET_CONTAINER_TYPE_CODE:
7779                 stat->n_bitset_containers++;
7780                 stat->n_values_bitset_containers += card;
7781                 stat->n_bytes_bitset_containers += sbytes;
7782                 break;
7783             case ARRAY_CONTAINER_TYPE_CODE:
7784                 stat->n_array_containers++;
7785                 stat->n_values_array_containers += card;
7786                 stat->n_bytes_array_containers += sbytes;
7787                 break;
7788             case RUN_CONTAINER_TYPE_CODE:
7789                 stat->n_run_containers++;
7790                 stat->n_values_run_containers += card;
7791                 stat->n_bytes_run_containers += sbytes;
7792                 break;
7793             default:
7794                 assert(false);
7795                 __builtin_unreachable();
7796         }
7797     }
7798 }
7799 
roaring_bitmap_copy(const roaring_bitmap_t * r)7800 roaring_bitmap_t *roaring_bitmap_copy(const roaring_bitmap_t *r) {
7801     roaring_bitmap_t *ans =
7802         (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
7803     if (!ans) {
7804         return NULL;
7805     }
7806     bool is_ok = ra_copy(&r->high_low_container, &ans->high_low_container,
7807                          is_cow(r));
7808     if (!is_ok) {
7809         free(ans);
7810         return NULL;
7811     }
7812     roaring_bitmap_set_copy_on_write(ans, is_cow(r));
7813     return ans;
7814 }
7815 
/**
 * Delegates to ra_overwrite() with the container array of `src` as the
 * source and the container array of `dest` as the destination, passing
 * the copy-on-write setting of `src`.
 *
 * Returns whatever ra_overwrite() returns.
 */
bool roaring_bitmap_overwrite(roaring_bitmap_t *dest,
                                     const roaring_bitmap_t *src) {
    const roaring_array_t *from = &src->high_low_container;
    roaring_array_t *to = &dest->high_low_container;
    return ra_overwrite(from, to, is_cow(src));
}
7821 
roaring_bitmap_free(const roaring_bitmap_t * r)7822 void roaring_bitmap_free(const roaring_bitmap_t *r) {
7823     if (!is_frozen(r)) {
7824       ra_clear((roaring_array_t*)&r->high_low_container);
7825     }
7826     free((roaring_bitmap_t*)r);
7827 }
7828 
/**
 * Resets the container array of `r` by calling ra_reset() on it.
 * The bitmap struct itself stays allocated.
 */
void roaring_bitmap_clear(roaring_bitmap_t *r) {
    roaring_array_t *containers = &r->high_low_container;
    ra_reset(containers);
}
7832 
/**
 * Adds `val` to the bitmap.
 *
 * The upper 16 bits of `val` select the container (key); the lower
 * 16 bits are the value handed to container_add().
 */
void roaring_bitmap_add(roaring_bitmap_t *r, uint32_t val) {
    roaring_array_t *ra = &r->high_low_container;
    const uint16_t high = val >> 16;
    const uint16_t low = val & 0xFFFF;
    const int idx = ra_get_index(ra, high);
    uint8_t type;

    if (idx < 0) {
        // No container for this key: start from an empty array container
        // and insert it at the position encoded in the negative index.
        array_container_t *fresh = array_container_create();
        void *filled =
            container_add(fresh, low, ARRAY_CONTAINER_TYPE_CODE, &type);
        // we could just assume that it stays an array container
        ra_insert_new_key_value_at(ra, -idx - 1, high, filled, type);
        return;
    }

    ra_unshare_container_at_index(ra, idx);
    void *before = ra_get_container_at_index(ra, idx, &type);
    uint8_t type_after = type;
    void *after = container_add(before, low, type, &type_after);
    if (after != before) {
        // container_add handed back a different container: drop the old
        // one and store the replacement in its slot.
        container_free(before, type);
        ra_set_container_at_index(ra, idx, after, type_after);
    }
}
7858 
/**
 * Adds `val` to the bitmap and reports whether anything changed.
 *
 * Returns true when a new container had to be created for the key, when
 * container_add() returned a different container than it was given, or
 * when the cardinality of the (unchanged) container differs after the
 * call. Returns false otherwise.
 */
bool roaring_bitmap_add_checked(roaring_bitmap_t *r, uint32_t val) {
    roaring_array_t *ra = &r->high_low_container;
    const uint16_t high = val >> 16;
    const uint16_t low = val & 0xFFFF;
    const int idx = ra_get_index(ra, high);
    uint8_t type;

    if (idx < 0) {
        // Key not present yet: create a container holding just this value.
        array_container_t *fresh = array_container_create();
        void *filled =
            container_add(fresh, low, ARRAY_CONTAINER_TYPE_CODE, &type);
        // we could just assume that it stays an array container
        ra_insert_new_key_value_at(ra, -idx - 1, high, filled, type);
        return true;
    }

    ra_unshare_container_at_index(ra, idx);
    void *before = ra_get_container_at_index(ra, idx, &type);
    const int card_before = container_get_cardinality(before, type);

    uint8_t type_after = type;
    void *after = container_add(before, low, type, &type_after);
    if (after != before) {
        // A replacement container is treated as "value was added".
        container_free(before, type);
        ra_set_container_at_index(ra, idx, after, type_after);
        return true;
    }

    // Same container object: compare cardinalities to detect a change.
    const int card_after = container_get_cardinality(before, type_after);
    return card_before != card_after;
}
7898 
/**
 * Removes `val` from the bitmap.
 *
 * Nothing happens when no container exists for the upper 16 bits of
 * `val`. When the container ends up with cardinality zero it is removed
 * from the container array and freed.
 */
void roaring_bitmap_remove(roaring_bitmap_t *r, uint32_t val) {
    roaring_array_t *ra = &r->high_low_container;
    const uint16_t high = val >> 16;
    const int idx = ra_get_index(ra, high);
    if (idx < 0) {
        return;  // key absent, so the value cannot be present
    }

    uint8_t type;
    ra_unshare_container_at_index(ra, idx);
    void *before = ra_get_container_at_index(ra, idx, &type);
    uint8_t type_after = type;
    void *after = container_remove(before, val & 0xFFFF, type, &type_after);
    if (after != before) {
        // Store the replacement before anything else so that the slot
        // never refers to the container that was just freed.
        container_free(before, type);
        ra_set_container_at_index(ra, idx, after, type_after);
    }

    if (container_get_cardinality(after, type_after) == 0) {
        ra_remove_at_index_and_free(ra, idx);
    } else {
        ra_set_container_at_index(ra, idx, after, type_after);
    }
}
7923 
/**
 * Removes `val` from the bitmap and reports whether anything changed.
 *
 * Returns false when no container exists for the upper 16 bits of `val`;
 * otherwise returns whether the container's cardinality differs before
 * and after container_remove(). A container left with cardinality zero
 * is removed from the container array and freed.
 */
bool roaring_bitmap_remove_checked(roaring_bitmap_t *r, uint32_t val) {
    roaring_array_t *ra = &r->high_low_container;
    const uint16_t high = val >> 16;
    const int idx = ra_get_index(ra, high);
    if (idx < 0) {
        return false;
    }

    uint8_t type;
    ra_unshare_container_at_index(ra, idx);
    void *before = ra_get_container_at_index(ra, idx, &type);
    const int card_before = container_get_cardinality(before, type);

    uint8_t type_after = type;
    void *after = container_remove(before, val & 0xFFFF, type, &type_after);
    if (after != before) {
        // Swap in the replacement so the slot never points at freed memory.
        container_free(before, type);
        ra_set_container_at_index(ra, idx, after, type_after);
    }

    const int card_after = container_get_cardinality(after, type_after);
    if (card_after == 0) {
        ra_remove_at_index_and_free(ra, idx);
    } else {
        ra_set_container_at_index(ra, idx, after, type_after);
    }

    return card_before != card_after;
}
7960 
/**
 * Removes the `n_args` values in `vals` from the bitmap.
 *
 * The index of the container used for the previous value is remembered
 * so that consecutive values sharing the same upper 16 bits skip the
 * ra_get_index() lookup. Values whose key has no container are ignored.
 * A container that no longer has any element is freed and removed from
 * the container array.
 */
void roaring_bitmap_remove_many(roaring_bitmap_t *r, size_t n_args,
                                const uint32_t *vals) {
    roaring_array_t *ra = &r->high_low_container;
    if (n_args == 0 || ra->size == 0) {
        return;
    }
    int32_t pos = -1; // position of the container used in the previous iteration
    for (size_t i = 0; i < n_args; i++) {
        const uint16_t key = (uint16_t)(vals[i] >> 16);
        if (pos < 0 || key != ra->keys[pos]) {
            pos = ra_get_index(ra, key);
        }
        if (pos < 0) {
            continue;  // no container for this key
        }

        // Same protocol as roaring_bitmap_remove(): make sure the slot
        // holds a container that is not shared before modifying it, so
        // the bookkeeping below never frees a shared handle that
        // container_remove() may already have released.
        ra_unshare_container_at_index(ra, pos);

        void *old_container = ra->containers[pos];
        const uint8_t old_typecode = ra->typecodes[pos];
        // Initialised defensively, as roaring_bitmap_remove() does.
        uint8_t new_typecode = old_typecode;
        void *new_container = container_remove(
            old_container, vals[i] & 0xffff, old_typecode, &new_typecode);

        if (new_container != old_container) {
            container_free(old_container, old_typecode);
            ra_replace_key_and_container_at_index(ra, pos, key, new_container,
                                                  new_typecode);
        }
        if (!container_nonzero_cardinality(new_container, new_typecode)) {
            container_free(new_container, new_typecode);
            ra_remove_at_index(ra, pos);
            pos = -1;  // indices shifted; force a fresh lookup next time
        }
    }
}
7994 
7995 // there should be some SIMD optimizations possible here
roaring_bitmap_and(const roaring_bitmap_t * x1,const roaring_bitmap_t * x2)7996 roaring_bitmap_t *roaring_bitmap_and(const roaring_bitmap_t *x1,
7997                                      const roaring_bitmap_t *x2) {
7998     uint8_t container_result_type = 0;
7999     const int length1 = x1->high_low_container.size,
8000               length2 = x2->high_low_container.size;
8001     uint32_t neededcap = length1 > length2 ? length2 : length1;
8002     roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(neededcap);
8003     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));
8004 
8005     int pos1 = 0, pos2 = 0;
8006 
8007     while (pos1 < length1 && pos2 < length2) {
8008         const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8009         const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8010 
8011         if (s1 == s2) {
8012             uint8_t container_type_1, container_type_2;
8013             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
8014                                                  &container_type_1);
8015             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
8016                                                  &container_type_2);
8017             void *c = container_and(c1, container_type_1, c2, container_type_2,
8018                                     &container_result_type);
8019             if (container_nonzero_cardinality(c, container_result_type)) {
8020                 ra_append(&answer->high_low_container, s1, c,
8021                           container_result_type);
8022             } else {
8023                 container_free(
8024                     c, container_result_type);  // otherwise:memory leak!
8025             }
8026             ++pos1;
8027             ++pos2;
8028         } else if (s1 < s2) {  // s1 < s2
8029             pos1 = ra_advance_until(&x1->high_low_container, s2, pos1);
8030         } else {  // s1 > s2
8031             pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
8032         }
8033     }
8034     return answer;
8035 }
8036 
8037 /**
8038  * Compute the union of 'number' bitmaps.
8039  */
roaring_bitmap_or_many(size_t number,const roaring_bitmap_t ** x)8040 roaring_bitmap_t *roaring_bitmap_or_many(size_t number,
8041                                          const roaring_bitmap_t **x) {
8042     if (number == 0) {
8043         return roaring_bitmap_create();
8044     }
8045     if (number == 1) {
8046         return roaring_bitmap_copy(x[0]);
8047     }
8048     roaring_bitmap_t *answer =
8049         roaring_bitmap_lazy_or(x[0], x[1], LAZY_OR_BITSET_CONVERSION);
8050     for (size_t i = 2; i < number; i++) {
8051         roaring_bitmap_lazy_or_inplace(answer, x[i], LAZY_OR_BITSET_CONVERSION);
8052     }
8053     roaring_bitmap_repair_after_lazy(answer);
8054     return answer;
8055 }
8056 
8057 /**
8058  * Compute the xor of 'number' bitmaps.
8059  */
roaring_bitmap_xor_many(size_t number,const roaring_bitmap_t ** x)8060 roaring_bitmap_t *roaring_bitmap_xor_many(size_t number,
8061                                           const roaring_bitmap_t **x) {
8062     if (number == 0) {
8063         return roaring_bitmap_create();
8064     }
8065     if (number == 1) {
8066         return roaring_bitmap_copy(x[0]);
8067     }
8068     roaring_bitmap_t *answer = roaring_bitmap_lazy_xor(x[0], x[1]);
8069     for (size_t i = 2; i < number; i++) {
8070         roaring_bitmap_lazy_xor_inplace(answer, x[i]);
8071     }
8072     roaring_bitmap_repair_after_lazy(answer);
8073     return answer;
8074 }
8075 
8076 // inplace and (modifies its first argument).
roaring_bitmap_and_inplace(roaring_bitmap_t * x1,const roaring_bitmap_t * x2)8077 void roaring_bitmap_and_inplace(roaring_bitmap_t *x1,
8078                                 const roaring_bitmap_t *x2) {
8079     if (x1 == x2) return;
8080     int pos1 = 0, pos2 = 0, intersection_size = 0;
8081     const int length1 = ra_get_size(&x1->high_low_container);
8082     const int length2 = ra_get_size(&x2->high_low_container);
8083 
8084     // any skipped-over or newly emptied containers in x1
8085     // have to be freed.
8086     while (pos1 < length1 && pos2 < length2) {
8087         const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8088         const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8089 
8090         if (s1 == s2) {
8091             uint8_t typecode1, typecode2, typecode_result;
8092             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
8093                                                  &typecode1);
8094             c1 = get_writable_copy_if_shared(c1, &typecode1);
8095             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
8096                                                  &typecode2);
8097             void *c =
8098                 container_iand(c1, typecode1, c2, typecode2, &typecode_result);
8099             if (c != c1) {  // in this instance a new container was created, and
8100                             // we need to free the old one
8101                 container_free(c1, typecode1);
8102             }
8103             if (container_nonzero_cardinality(c, typecode_result)) {
8104                 ra_replace_key_and_container_at_index(&x1->high_low_container,
8105                                                       intersection_size, s1, c,
8106                                                       typecode_result);
8107                 intersection_size++;
8108             } else {
8109                 container_free(c, typecode_result);
8110             }
8111             ++pos1;
8112             ++pos2;
8113         } else if (s1 < s2) {
8114             pos1 = ra_advance_until_freeing(&x1->high_low_container, s2, pos1);
8115         } else {  // s1 > s2
8116             pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
8117         }
8118     }
8119 
8120     // if we ended early because x2 ran out, then all remaining in x1 should be
8121     // freed
8122     while (pos1 < length1) {
8123         container_free(x1->high_low_container.containers[pos1],
8124                        x1->high_low_container.typecodes[pos1]);
8125         ++pos1;
8126     }
8127 
8128     // all containers after this have either been copied or freed
8129     ra_downsize(&x1->high_low_container, intersection_size);
8130 }
8131 
roaring_bitmap_or(const roaring_bitmap_t * x1,const roaring_bitmap_t * x2)8132 roaring_bitmap_t *roaring_bitmap_or(const roaring_bitmap_t *x1,
8133                                     const roaring_bitmap_t *x2) {
8134     uint8_t container_result_type = 0;
8135     const int length1 = x1->high_low_container.size,
8136               length2 = x2->high_low_container.size;
8137     if (0 == length1) {
8138         return roaring_bitmap_copy(x2);
8139     }
8140     if (0 == length2) {
8141         return roaring_bitmap_copy(x1);
8142     }
8143     roaring_bitmap_t *answer =
8144         roaring_bitmap_create_with_capacity(length1 + length2);
8145     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));
8146     int pos1 = 0, pos2 = 0;
8147     uint8_t container_type_1, container_type_2;
8148     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8149     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8150     while (true) {
8151         if (s1 == s2) {
8152             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
8153                                                  &container_type_1);
8154             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
8155                                                  &container_type_2);
8156             void *c = container_or(c1, container_type_1, c2, container_type_2,
8157                                    &container_result_type);
8158             // since we assume that the initial containers are non-empty, the
8159             // result here
8160             // can only be non-empty
8161             ra_append(&answer->high_low_container, s1, c,
8162                       container_result_type);
8163             ++pos1;
8164             ++pos2;
8165             if (pos1 == length1) break;
8166             if (pos2 == length2) break;
8167             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8168             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8169 
8170         } else if (s1 < s2) {  // s1 < s2
8171             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
8172                                                  &container_type_1);
8173             // c1 = container_clone(c1, container_type_1);
8174             c1 =
8175                 get_copy_of_container(c1, &container_type_1, is_cow(x1));
8176             if (is_cow(x1)) {
8177                 ra_set_container_at_index(&x1->high_low_container, pos1, c1,
8178                                           container_type_1);
8179             }
8180             ra_append(&answer->high_low_container, s1, c1, container_type_1);
8181             pos1++;
8182             if (pos1 == length1) break;
8183             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8184 
8185         } else {  // s1 > s2
8186             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
8187                                                  &container_type_2);
8188             // c2 = container_clone(c2, container_type_2);
8189             c2 =
8190                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
8191             if (is_cow(x2)) {
8192                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
8193                                           container_type_2);
8194             }
8195             ra_append(&answer->high_low_container, s2, c2, container_type_2);
8196             pos2++;
8197             if (pos2 == length2) break;
8198             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8199         }
8200     }
8201     if (pos1 == length1) {
8202         ra_append_copy_range(&answer->high_low_container,
8203                              &x2->high_low_container, pos2, length2,
8204                              is_cow(x2));
8205     } else if (pos2 == length2) {
8206         ra_append_copy_range(&answer->high_low_container,
8207                              &x1->high_low_container, pos1, length1,
8208                              is_cow(x1));
8209     }
8210     return answer;
8211 }
8212 
8213 // inplace or (modifies its first argument).
roaring_bitmap_or_inplace(roaring_bitmap_t * x1,const roaring_bitmap_t * x2)8214 void roaring_bitmap_or_inplace(roaring_bitmap_t *x1,
8215                                const roaring_bitmap_t *x2) {
8216     uint8_t container_result_type = 0;
8217     int length1 = x1->high_low_container.size;
8218     const int length2 = x2->high_low_container.size;
8219 
8220     if (0 == length2) return;
8221 
8222     if (0 == length1) {
8223         roaring_bitmap_overwrite(x1, x2);
8224         return;
8225     }
8226     int pos1 = 0, pos2 = 0;
8227     uint8_t container_type_1, container_type_2;
8228     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8229     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8230     while (true) {
8231         if (s1 == s2) {
8232             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
8233                                                  &container_type_1);
8234             if (!container_is_full(c1, container_type_1)) {
8235                 c1 = get_writable_copy_if_shared(c1, &container_type_1);
8236 
8237                 void *c2 = ra_get_container_at_index(&x2->high_low_container,
8238                                                      pos2, &container_type_2);
8239                 void *c =
8240                     container_ior(c1, container_type_1, c2, container_type_2,
8241                                   &container_result_type);
8242                 if (c !=
8243                     c1) {  // in this instance a new container was created, and
8244                            // we need to free the old one
8245                     container_free(c1, container_type_1);
8246                 }
8247 
8248                 ra_set_container_at_index(&x1->high_low_container, pos1, c,
8249                                           container_result_type);
8250             }
8251             ++pos1;
8252             ++pos2;
8253             if (pos1 == length1) break;
8254             if (pos2 == length2) break;
8255             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8256             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8257 
8258         } else if (s1 < s2) {  // s1 < s2
8259             pos1++;
8260             if (pos1 == length1) break;
8261             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8262 
8263         } else {  // s1 > s2
8264             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
8265                                                  &container_type_2);
8266             c2 =
8267                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
8268             if (is_cow(x2)) {
8269                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
8270                                           container_type_2);
8271             }
8272 
8273             // void *c2_clone = container_clone(c2, container_type_2);
8274             ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
8275                                        container_type_2);
8276             pos1++;
8277             length1++;
8278             pos2++;
8279             if (pos2 == length2) break;
8280             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8281         }
8282     }
8283     if (pos1 == length1) {
8284         ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
8285                              pos2, length2, is_cow(x2));
8286     }
8287 }
8288 
roaring_bitmap_xor(const roaring_bitmap_t * x1,const roaring_bitmap_t * x2)8289 roaring_bitmap_t *roaring_bitmap_xor(const roaring_bitmap_t *x1,
8290                                      const roaring_bitmap_t *x2) {
8291     uint8_t container_result_type = 0;
8292     const int length1 = x1->high_low_container.size,
8293               length2 = x2->high_low_container.size;
8294     if (0 == length1) {
8295         return roaring_bitmap_copy(x2);
8296     }
8297     if (0 == length2) {
8298         return roaring_bitmap_copy(x1);
8299     }
8300     roaring_bitmap_t *answer =
8301         roaring_bitmap_create_with_capacity(length1 + length2);
8302     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));
8303     int pos1 = 0, pos2 = 0;
8304     uint8_t container_type_1, container_type_2;
8305     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8306     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8307     while (true) {
8308         if (s1 == s2) {
8309             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
8310                                                  &container_type_1);
8311             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
8312                                                  &container_type_2);
8313             void *c = container_xor(c1, container_type_1, c2, container_type_2,
8314                                     &container_result_type);
8315 
8316             if (container_nonzero_cardinality(c, container_result_type)) {
8317                 ra_append(&answer->high_low_container, s1, c,
8318                           container_result_type);
8319             } else {
8320                 container_free(c, container_result_type);
8321             }
8322             ++pos1;
8323             ++pos2;
8324             if (pos1 == length1) break;
8325             if (pos2 == length2) break;
8326             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8327             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8328 
8329         } else if (s1 < s2) {  // s1 < s2
8330             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
8331                                                  &container_type_1);
8332             c1 =
8333                 get_copy_of_container(c1, &container_type_1, is_cow(x1));
8334             if (is_cow(x1)) {
8335                 ra_set_container_at_index(&x1->high_low_container, pos1, c1,
8336                                           container_type_1);
8337             }
8338             ra_append(&answer->high_low_container, s1, c1, container_type_1);
8339             pos1++;
8340             if (pos1 == length1) break;
8341             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8342 
8343         } else {  // s1 > s2
8344             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
8345                                                  &container_type_2);
8346             c2 =
8347                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
8348             if (is_cow(x2)) {
8349                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
8350                                           container_type_2);
8351             }
8352             ra_append(&answer->high_low_container, s2, c2, container_type_2);
8353             pos2++;
8354             if (pos2 == length2) break;
8355             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8356         }
8357     }
8358     if (pos1 == length1) {
8359         ra_append_copy_range(&answer->high_low_container,
8360                              &x2->high_low_container, pos2, length2,
8361                              is_cow(x2));
8362     } else if (pos2 == length2) {
8363         ra_append_copy_range(&answer->high_low_container,
8364                              &x1->high_low_container, pos1, length1,
8365                              is_cow(x1));
8366     }
8367     return answer;
8368 }
8369 
8370 // inplace xor (modifies its first argument).
8371 
roaring_bitmap_xor_inplace(roaring_bitmap_t * x1,const roaring_bitmap_t * x2)8372 void roaring_bitmap_xor_inplace(roaring_bitmap_t *x1,
8373                                 const roaring_bitmap_t *x2) {
8374     assert(x1 != x2);
8375     uint8_t container_result_type = 0;
8376     int length1 = x1->high_low_container.size;
8377     const int length2 = x2->high_low_container.size;
8378 
8379     if (0 == length2) return;
8380 
8381     if (0 == length1) {
8382         roaring_bitmap_overwrite(x1, x2);
8383         return;
8384     }
8385 
8386     // XOR can have new containers inserted from x2, but can also
8387     // lose containers when x1 and x2 are nonempty and identical.
8388 
8389     int pos1 = 0, pos2 = 0;
8390     uint8_t container_type_1, container_type_2;
8391     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8392     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8393     while (true) {
8394         if (s1 == s2) {
8395             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
8396                                                  &container_type_1);
8397             c1 = get_writable_copy_if_shared(c1, &container_type_1);
8398 
8399             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
8400                                                  &container_type_2);
8401             void *c = container_ixor(c1, container_type_1, c2, container_type_2,
8402                                      &container_result_type);
8403 
8404             if (container_nonzero_cardinality(c, container_result_type)) {
8405                 ra_set_container_at_index(&x1->high_low_container, pos1, c,
8406                                           container_result_type);
8407                 ++pos1;
8408             } else {
8409                 container_free(c, container_result_type);
8410                 ra_remove_at_index(&x1->high_low_container, pos1);
8411                 --length1;
8412             }
8413 
8414             ++pos2;
8415             if (pos1 == length1) break;
8416             if (pos2 == length2) break;
8417             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8418             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8419 
8420         } else if (s1 < s2) {  // s1 < s2
8421             pos1++;
8422             if (pos1 == length1) break;
8423             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8424 
8425         } else {  // s1 > s2
8426             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
8427                                                  &container_type_2);
8428             c2 =
8429                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
8430             if (is_cow(x2)) {
8431                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
8432                                           container_type_2);
8433             }
8434 
8435             ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
8436                                        container_type_2);
8437             pos1++;
8438             length1++;
8439             pos2++;
8440             if (pos2 == length2) break;
8441             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8442         }
8443     }
8444     if (pos1 == length1) {
8445         ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
8446                              pos2, length2, is_cow(x2));
8447     }
8448 }
8449 
roaring_bitmap_andnot(const roaring_bitmap_t * x1,const roaring_bitmap_t * x2)8450 roaring_bitmap_t *roaring_bitmap_andnot(const roaring_bitmap_t *x1,
8451                                         const roaring_bitmap_t *x2) {
8452     uint8_t container_result_type = 0;
8453     const int length1 = x1->high_low_container.size,
8454               length2 = x2->high_low_container.size;
8455     if (0 == length1) {
8456         roaring_bitmap_t *empty_bitmap = roaring_bitmap_create();
8457         roaring_bitmap_set_copy_on_write(empty_bitmap, is_cow(x1) && is_cow(x2));
8458         return empty_bitmap;
8459     }
8460     if (0 == length2) {
8461         return roaring_bitmap_copy(x1);
8462     }
8463     roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(length1);
8464     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));
8465 
8466     int pos1 = 0, pos2 = 0;
8467     uint8_t container_type_1, container_type_2;
8468     uint16_t s1 = 0;
8469     uint16_t s2 = 0;
8470     while (true) {
8471         s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
8472         s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
8473 
8474         if (s1 == s2) {
8475             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
8476                                                  &container_type_1);
8477             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
8478                                                  &container_type_2);
8479             void *c =
8480                 container_andnot(c1, container_type_1, c2, container_type_2,
8481                                  &container_result_type);
8482 
8483             if (container_nonzero_cardinality(c, container_result_type)) {
8484                 ra_append(&answer->high_low_container, s1, c,
8485                           container_result_type);
8486             } else {
8487                 container_free(c, container_result_type);
8488             }
8489             ++pos1;
8490             ++pos2;
8491             if (pos1 == length1) break;
8492             if (pos2 == length2) break;
8493         } else if (s1 < s2) {  // s1 < s2
8494             const int next_pos1 =
8495                 ra_advance_until(&x1->high_low_container, s2, pos1);
8496             ra_append_copy_range(&answer->high_low_container,
8497                                  &x1->high_low_container, pos1, next_pos1,
8498                                  is_cow(x1));
8499             // TODO : perhaps some of the copy_on_write should be based on
8500             // answer rather than x1 (more stringent?).  Many similar cases
8501             pos1 = next_pos1;
8502             if (pos1 == length1) break;
8503         } else {  // s1 > s2
8504             pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
8505             if (pos2 == length2) break;
8506         }
8507     }
8508     if (pos2 == length2) {
8509         ra_append_copy_range(&answer->high_low_container,
8510                              &x1->high_low_container, pos1, length1,
8511                              is_cow(x1));
8512     }
8513     return answer;
8514 }
8515 
/**
 * In-place andnot: updates x1 (its first argument) against x2.
 *
 * x1 and x2 must be distinct objects (asserted).  Both key arrays are walked
 * in increasing key order.  Containers of x1 that survive are compacted
 * towards the front of x1's array: `intersection_size` is the write cursor
 * (number of entries kept so far) and `pos1` is the read cursor, so
 * intersection_size <= pos1 at all times.  At the end the array is trimmed
 * to intersection_size entries with ra_downsize().
 */
void roaring_bitmap_andnot_inplace(roaring_bitmap_t *x1,
                                   const roaring_bitmap_t *x2) {
    assert(x1 != x2);

    uint8_t container_result_type = 0;
    int length1 = x1->high_low_container.size;
    const int length2 = x2->high_low_container.size;
    int intersection_size = 0;  // write cursor into x1's container array

    // x2 has no containers: x1 is left as it is.
    if (0 == length2) return;

    if (0 == length1) {
        roaring_bitmap_clear(x1);
        return;
    }

    int pos1 = 0, pos2 = 0;
    uint8_t container_type_1, container_type_2;
    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
    while (true) {
        if (s1 == s2) {
            // Same key in both bitmaps: combine the two containers.
            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                 &container_type_1);
            // NOTE(review): presumably yields a privately owned container so
            // the in-place operation below cannot affect other bitmaps
            // sharing it -- confirm against get_writable_copy_if_shared.
            c1 = get_writable_copy_if_shared(c1, &container_type_1);

            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                 &container_type_2);
            void *c =
                container_iandnot(c1, container_type_1, c2, container_type_2,
                                  &container_result_type);

            if (container_nonzero_cardinality(c, container_result_type)) {
                // Keep the result at the write cursor and advance it.
                ra_replace_key_and_container_at_index(&x1->high_low_container,
                                                      intersection_size++, s1,
                                                      c, container_result_type);
            } else {
                // Nothing left under this key: drop the container.
                container_free(c, container_result_type);
            }

            ++pos1;
            ++pos2;
            if (pos1 == length1) break;
            if (pos2 == length2) break;
            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

        } else if (s1 < s2) {  // s1 < s2
            // Key only in x1: keep the container, sliding it down to the
            // write cursor when earlier entries have been dropped.
            if (pos1 != intersection_size) {
                void *c1 = ra_get_container_at_index(&x1->high_low_container,
                                                     pos1, &container_type_1);

                ra_replace_key_and_container_at_index(&x1->high_low_container,
                                                      intersection_size, s1, c1,
                                                      container_type_1);
            }
            intersection_size++;
            pos1++;
            if (pos1 == length1) break;
            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

        } else {  // s1 > s2
            // Key only in x2: skip ahead in x2 using s1 as the target key.
            pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
            if (pos2 == length2) break;
            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
        }
    }

    if (pos1 < length1) {
        // all containers between intersection_size and
        // pos1 are junk.  However, they have either been moved
        // (thus still referenced) or involved in an iandnot
        // that will clean up all containers that could not be reused.
        // Thus we should not free the junk containers between
        // intersection_size and pos1.
        if (pos1 > intersection_size) {
            // left slide of remaining items
            ra_copy_range(&x1->high_low_container, pos1, length1,
                          intersection_size);
        }
        // else current placement is fine
        intersection_size += (length1 - pos1);
    }
    // Trim x1's array to the number of entries that were kept.
    ra_downsize(&x1->high_low_container, intersection_size);
}
8603 
roaring_bitmap_get_cardinality(const roaring_bitmap_t * ra)8604 uint64_t roaring_bitmap_get_cardinality(const roaring_bitmap_t *ra) {
8605     uint64_t card = 0;
8606     for (int i = 0; i < ra->high_low_container.size; ++i)
8607         card += container_get_cardinality(ra->high_low_container.containers[i],
8608                                           ra->high_low_container.typecodes[i]);
8609     return card;
8610 }
8611 
roaring_bitmap_range_cardinality(const roaring_bitmap_t * ra,uint64_t range_start,uint64_t range_end)8612 uint64_t roaring_bitmap_range_cardinality(const roaring_bitmap_t *ra,
8613                                           uint64_t range_start,
8614                                           uint64_t range_end) {
8615     if (range_end > UINT32_MAX) {
8616         range_end = UINT32_MAX + UINT64_C(1);
8617     }
8618     if (range_start >= range_end) {
8619         return 0;
8620     }
8621     range_end--; // make range_end inclusive
8622     // now we have: 0 <= range_start <= range_end <= UINT32_MAX
8623 
8624     uint16_t minhb = range_start >> 16;
8625     uint16_t maxhb = range_end >> 16;
8626 
8627     uint64_t card = 0;
8628 
8629     int i = ra_get_index(&ra->high_low_container, minhb);
8630     if (i >= 0) {
8631         if (minhb == maxhb) {
8632             card += container_rank(ra->high_low_container.containers[i],
8633                                    ra->high_low_container.typecodes[i],
8634                                    range_end & 0xffff);
8635         } else {
8636             card += container_get_cardinality(ra->high_low_container.containers[i],
8637                                               ra->high_low_container.typecodes[i]);
8638         }
8639         if ((range_start & 0xffff) != 0) {
8640             card -= container_rank(ra->high_low_container.containers[i],
8641                                    ra->high_low_container.typecodes[i],
8642                                    (range_start & 0xffff) - 1);
8643         }
8644         i++;
8645     } else {
8646         i = -i - 1;
8647     }
8648 
8649     for (; i < ra->high_low_container.size; i++) {
8650         uint16_t key = ra->high_low_container.keys[i];
8651         if (key < maxhb) {
8652             card += container_get_cardinality(ra->high_low_container.containers[i],
8653                                               ra->high_low_container.typecodes[i]);
8654         } else if (key == maxhb) {
8655             card += container_rank(ra->high_low_container.containers[i],
8656                                    ra->high_low_container.typecodes[i],
8657                                    range_end & 0xffff);
8658             break;
8659         } else {
8660             break;
8661         }
8662     }
8663 
8664     return card;
8665 }
8666 
8667 
roaring_bitmap_is_empty(const roaring_bitmap_t * ra)8668 bool roaring_bitmap_is_empty(const roaring_bitmap_t *ra) {
8669     return ra->high_low_container.size == 0;
8670 }
8671 
roaring_bitmap_to_uint32_array(const roaring_bitmap_t * ra,uint32_t * ans)8672 void roaring_bitmap_to_uint32_array(const roaring_bitmap_t *ra, uint32_t *ans) {
8673     ra_to_uint32_array(&ra->high_low_container, ans);
8674 }
8675 
roaring_bitmap_range_uint32_array(const roaring_bitmap_t * ra,size_t offset,size_t limit,uint32_t * ans)8676 bool roaring_bitmap_range_uint32_array(const roaring_bitmap_t *ra, size_t offset, size_t limit,  uint32_t *ans) {
8677     return ra_range_uint32_array(&ra->high_low_container, offset, limit, ans);
8678 }
8679 
8680 /** convert array and bitmap containers to run containers when it is more
8681  * efficient;
8682  * also convert from run containers when more space efficient.  Returns
8683  * true if the result has at least one run container.
8684 */
roaring_bitmap_run_optimize(roaring_bitmap_t * r)8685 bool roaring_bitmap_run_optimize(roaring_bitmap_t *r) {
8686     bool answer = false;
8687     for (int i = 0; i < r->high_low_container.size; i++) {
8688         uint8_t typecode_original, typecode_after;
8689         ra_unshare_container_at_index(
8690             &r->high_low_container, i);  // TODO: this introduces extra cloning!
8691         void *c = ra_get_container_at_index(&r->high_low_container, i,
8692                                             &typecode_original);
8693         void *c1 = convert_run_optimize(c, typecode_original, &typecode_after);
8694         if (typecode_after == RUN_CONTAINER_TYPE_CODE) answer = true;
8695         ra_set_container_at_index(&r->high_low_container, i, c1,
8696                                   typecode_after);
8697     }
8698     return answer;
8699 }
8700 
/**
 * Calls container_shrink_to_fit() on every container and then
 * ra_shrink_to_fit() on the container array itself.  Returns the sum of the
 * values reported by those calls.
 */
size_t roaring_bitmap_shrink_to_fit(roaring_bitmap_t *r) {
    size_t saved = 0;
    int idx = 0;
    while (idx < r->high_low_container.size) {
        uint8_t typecode;
        void *container =
            ra_get_container_at_index(&r->high_low_container, idx, &typecode);
        saved += container_shrink_to_fit(container, typecode);
        ++idx;
    }
    return saved + ra_shrink_to_fit(&r->high_low_container);
}
8712 
8713 /**
8714  *  Remove run-length encoding even when it is more space efficient
8715  *  return whether a change was applied
8716  */
roaring_bitmap_remove_run_compression(roaring_bitmap_t * r)8717 bool roaring_bitmap_remove_run_compression(roaring_bitmap_t *r) {
8718     bool answer = false;
8719     for (int i = 0; i < r->high_low_container.size; i++) {
8720         uint8_t typecode_original, typecode_after;
8721         void *c = ra_get_container_at_index(&r->high_low_container, i,
8722                                             &typecode_original);
8723         if (get_container_type(c, typecode_original) ==
8724             RUN_CONTAINER_TYPE_CODE) {
8725             answer = true;
8726             if (typecode_original == SHARED_CONTAINER_TYPE_CODE) {
8727                 run_container_t *truec =
8728                     (run_container_t *)((shared_container_t *)c)->container;
8729                 int32_t card = run_container_cardinality(truec);
8730                 void *c1 = convert_to_bitset_or_array_container(
8731                     truec, card, &typecode_after);
8732                 shared_container_free((shared_container_t *)c);// will free the run container as needed
8733                 ra_set_container_at_index(&r->high_low_container, i, c1,
8734                                           typecode_after);
8735 
8736             } else {
8737                 int32_t card = run_container_cardinality((run_container_t *)c);
8738                 void *c1 = convert_to_bitset_or_array_container(
8739                     (run_container_t *)c, card, &typecode_after);
8740                 run_container_free((run_container_t *)c);
8741                 ra_set_container_at_index(&r->high_low_container, i, c1,
8742                                           typecode_after);
8743             }
8744         }
8745     }
8746     return answer;
8747 }
8748 
roaring_bitmap_serialize(const roaring_bitmap_t * ra,char * buf)8749 size_t roaring_bitmap_serialize(const roaring_bitmap_t *ra, char *buf) {
8750     size_t portablesize = roaring_bitmap_portable_size_in_bytes(ra);
8751     uint64_t cardinality = roaring_bitmap_get_cardinality(ra);
8752     uint64_t sizeasarray = cardinality * sizeof(uint32_t) + sizeof(uint32_t);
8753     if (portablesize < sizeasarray) {
8754         buf[0] = SERIALIZATION_CONTAINER;
8755         return roaring_bitmap_portable_serialize(ra, buf + 1) + 1;
8756     } else {
8757         buf[0] = SERIALIZATION_ARRAY_UINT32;
8758         memcpy(buf + 1, &cardinality, sizeof(uint32_t));
8759         roaring_bitmap_to_uint32_array(
8760             ra, (uint32_t *)(buf + 1 + sizeof(uint32_t)));
8761         return 1 + (size_t)sizeasarray;
8762     }
8763 }
8764 
/**
 * Returns the smaller of the portable serialized size and the size of a
 * plain uint32 array (4 bytes per value plus a 4-byte count), plus one byte
 * in either case.
 */
size_t roaring_bitmap_size_in_bytes(const roaring_bitmap_t *ra) {
    const size_t portable = roaring_bitmap_portable_size_in_bytes(ra);
    const uint64_t as_array =
        roaring_bitmap_get_cardinality(ra) * sizeof(uint32_t) +
        sizeof(uint32_t);
    if (portable < as_array) {
        return portable + 1;
    }
    return (size_t)as_array + 1;
}
8771 
roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t * ra)8772 size_t roaring_bitmap_portable_size_in_bytes(const roaring_bitmap_t *ra) {
8773     return ra_portable_size_in_bytes(&ra->high_low_container);
8774 }
8775 
8776 
roaring_bitmap_portable_deserialize_safe(const char * buf,size_t maxbytes)8777 roaring_bitmap_t *roaring_bitmap_portable_deserialize_safe(const char *buf, size_t maxbytes) {
8778     roaring_bitmap_t *ans =
8779         (roaring_bitmap_t *)malloc(sizeof(roaring_bitmap_t));
8780     if (ans == NULL) {
8781         return NULL;
8782     }
8783     size_t bytesread;
8784     bool is_ok = ra_portable_deserialize(&ans->high_low_container, buf, maxbytes, &bytesread);
8785     if(is_ok) assert(bytesread <= maxbytes);
8786     roaring_bitmap_set_copy_on_write(ans, false);
8787     if (!is_ok) {
8788         free(ans);
8789         return NULL;
8790     }
8791     return ans;
8792 }
8793 
roaring_bitmap_portable_deserialize(const char * buf)8794 roaring_bitmap_t *roaring_bitmap_portable_deserialize(const char *buf) {
8795     return roaring_bitmap_portable_deserialize_safe(buf, SIZE_MAX);
8796 }
8797 
8798 
/** Returns ra_portable_deserialize_size(buf, maxbytes) unchanged. */
size_t roaring_bitmap_portable_deserialize_size(const char *buf, size_t maxbytes) {
    const size_t nbytes = ra_portable_deserialize_size(buf, maxbytes);
    return nbytes;
}
8802 
8803 
roaring_bitmap_portable_serialize(const roaring_bitmap_t * ra,char * buf)8804 size_t roaring_bitmap_portable_serialize(const roaring_bitmap_t *ra,
8805                                          char *buf) {
8806     return ra_portable_serialize(&ra->high_low_container, buf);
8807 }
8808 
roaring_bitmap_deserialize(const void * buf)8809 roaring_bitmap_t *roaring_bitmap_deserialize(const void *buf) {
8810     const char *bufaschar = (const char *)buf;
8811     if (*(const unsigned char *)buf == SERIALIZATION_ARRAY_UINT32) {
8812         /* This looks like a compressed set of uint32_t elements */
8813         uint32_t card;
8814         memcpy(&card, bufaschar + 1, sizeof(uint32_t));
8815         const uint32_t *elems =
8816             (const uint32_t *)(bufaschar + 1 + sizeof(uint32_t));
8817 
8818         return roaring_bitmap_of_ptr(card, elems);
8819     } else if (bufaschar[0] == SERIALIZATION_CONTAINER) {
8820         return roaring_bitmap_portable_deserialize(bufaschar + 1);
8821     } else
8822         return (NULL);
8823 }
8824 
/**
 * Runs container_iterate() over every container in key order, passing the
 * container's key shifted into the upper 16 bits as the base value together
 * with `iterator` and `ptr`.
 *
 * Stops and returns false as soon as a container_iterate() call returns
 * false; returns true when all containers were processed.
 */
bool roaring_iterate(const roaring_bitmap_t *ra, roaring_iterator iterator,
                     void *ptr) {
    int idx = 0;
    while (idx < ra->high_low_container.size) {
        const uint32_t base =
            ((uint32_t)ra->high_low_container.keys[idx]) << 16;
        const bool keep_going =
            container_iterate(ra->high_low_container.containers[idx],
                              ra->high_low_container.typecodes[idx], base,
                              iterator, ptr);
        if (!keep_going) {
            return false;
        }
        ++idx;
    }
    return true;
}
8836 
/**
 * 64-bit counterpart of roaring_iterate(): runs container_iterate64() over
 * every container in key order, additionally forwarding `high_bits`.
 *
 * Stops and returns false as soon as a container_iterate64() call returns
 * false; returns true when all containers were processed.
 */
bool roaring_iterate64(const roaring_bitmap_t *ra, roaring_iterator64 iterator,
                       uint64_t high_bits, void *ptr) {
    int idx = 0;
    while (idx < ra->high_low_container.size) {
        const uint32_t base =
            ((uint32_t)ra->high_low_container.keys[idx]) << 16;
        const bool keep_going =
            container_iterate64(ra->high_low_container.containers[idx],
                                ra->high_low_container.typecodes[idx], base,
                                iterator, high_bits, ptr);
        if (!keep_going) {
            return false;
        }
        ++idx;
    }
    return true;
}
8849 
8850 /****
8851 * begin roaring_uint32_iterator_t
8852 *****/
8853 
8854 // Partially initializes the roaring iterator when it begins looking at
8855 // a new container.
iter_new_container_partial_init(roaring_uint32_iterator_t * newit)8856 static bool iter_new_container_partial_init(roaring_uint32_iterator_t *newit) {
8857     newit->in_container_index = 0;
8858     newit->run_index = 0;
8859     newit->current_value = 0;
8860     if (newit->container_index >= newit->parent->high_low_container.size ||
8861         newit->container_index < 0) {
8862         newit->current_value = UINT32_MAX;
8863         return (newit->has_value = false);
8864     }
8865     // assume not empty
8866     newit->has_value = true;
8867     // we precompute container, typecode and highbits so that successive
8868     // iterators do not have to grab them from odd memory locations
8869     // and have to worry about the (easily predicted) container_unwrap_shared
8870     // call.
8871     newit->container =
8872             newit->parent->high_low_container.containers[newit->container_index];
8873     newit->typecode =
8874             newit->parent->high_low_container.typecodes[newit->container_index];
8875     newit->highbits =
8876             ((uint32_t)
8877                     newit->parent->high_low_container.keys[newit->container_index])
8878                     << 16;
8879     newit->container =
8880             container_unwrap_shared(newit->container, &(newit->typecode));
8881     return newit->has_value;
8882 }
8883 
// Positions the iterator on the first value of the container selected by
// newit->container_index.  Returns false (via has_value) when that index is
// out of range, true otherwise.  The container is assumed to be non-empty.
static bool loadfirstvalue(roaring_uint32_iterator_t *newit) {
    if (!iter_new_container_partial_init(newit)) {
        return newit->has_value;
    }

    if (newit->typecode == BITSET_CONTAINER_TYPE_CODE) {
        const bitset_container_t *bits =
            (const bitset_container_t *)(newit->container);
        // Skip leading all-zero words; a non-zero word is assumed to exist.
        uint32_t wordindex = 0;
        while (bits->array[wordindex] == 0) {
            wordindex++;
        }
        const uint64_t word = bits->array[wordindex];
        newit->in_container_index = wordindex * 64 + __builtin_ctzll(word);
        newit->current_value = newit->highbits | newit->in_container_index;
    } else if (newit->typecode == ARRAY_CONTAINER_TYPE_CODE) {
        const array_container_t *values =
            (const array_container_t *)(newit->container);
        newit->current_value = newit->highbits | values->array[0];
    } else if (newit->typecode == RUN_CONTAINER_TYPE_CODE) {
        const run_container_t *runs_c =
            (const run_container_t *)(newit->container);
        newit->current_value = newit->highbits | (runs_c->runs[0].value);
    } else {
        // if this ever happens, bug!
        assert(false);
    }
    return true;
}
8916 
// Positions the iterator on the last value of the container selected by
// newit->container_index.  Returns false (via has_value) when that index is
// out of range, true otherwise.  The container is assumed to be non-empty.
static bool loadlastvalue(roaring_uint32_iterator_t* newit) {
    if (!iter_new_container_partial_init(newit)) {
        return newit->has_value;
    }

    if (newit->typecode == BITSET_CONTAINER_TYPE_CODE) {
        const bitset_container_t* bits =
            (const bitset_container_t*)newit->container;
        // Walk back from the final word to the last non-zero one.
        uint32_t wordindex = BITSET_CONTAINER_SIZE_IN_WORDS - 1;
        while (bits->array[wordindex] == 0) {
            --wordindex;
        }
        const uint64_t word = bits->array[wordindex];
        int num_leading_zeros = __builtin_clzll(word);
        newit->in_container_index =
            (wordindex * 64) + (63 - num_leading_zeros);
        newit->current_value = newit->highbits | newit->in_container_index;
    } else if (newit->typecode == ARRAY_CONTAINER_TYPE_CODE) {
        const array_container_t* values =
            (const array_container_t*)newit->container;
        newit->in_container_index = values->cardinality - 1;
        newit->current_value =
            newit->highbits | values->array[newit->in_container_index];
    } else if (newit->typecode == RUN_CONTAINER_TYPE_CODE) {
        const run_container_t* runs_c =
            (const run_container_t*)newit->container;
        newit->run_index = runs_c->n_runs - 1;
        const rle16_t* tail = &runs_c->runs[newit->run_index];
        // The last value of a run is value + length.
        newit->current_value = newit->highbits | (tail->value + tail->length);
    } else {
        // if this ever happens, bug!
        assert(false);
    }
    return true;
}
8953 
8954 // prerequesite: the value should be in range of the container
loadfirstvalue_largeorequal(roaring_uint32_iterator_t * newit,uint32_t val)8955 static bool loadfirstvalue_largeorequal(roaring_uint32_iterator_t *newit, uint32_t val) {
8956     // Don't have to check return value because of prerequisite
8957     iter_new_container_partial_init(newit);
8958     uint16_t lb = val & 0xFFFF;
8959 
8960     switch (newit->typecode) {
8961         case BITSET_CONTAINER_TYPE_CODE:
8962             newit->in_container_index =  bitset_container_index_equalorlarger((const bitset_container_t *)(newit->container), lb);
8963             newit->current_value = newit->highbits | newit->in_container_index;
8964             break;
8965         case ARRAY_CONTAINER_TYPE_CODE:
8966             newit->in_container_index = array_container_index_equalorlarger((const array_container_t *)(newit->container), lb);
8967             newit->current_value =
8968                 newit->highbits |
8969                 ((const array_container_t *)(newit->container))->array[newit->in_container_index];
8970             break;
8971         case RUN_CONTAINER_TYPE_CODE:
8972             newit->run_index = run_container_index_equalorlarger((const run_container_t *)(newit->container), lb);
8973             if(((const run_container_t *)(newit->container))->runs[newit->run_index].value <= lb) {
8974               newit->current_value = val;
8975             } else {
8976               newit->current_value =
8977                 newit->highbits |
8978                 (((const run_container_t *)(newit->container))->runs[newit->run_index].value);
8979             }
8980             break;
8981         default:
8982             // if this ever happens, bug!
8983             assert(false);
8984     }  // switch (typecode)
8985     return true;
8986 }
8987 
/**
 * Binds `newit` to the bitmap `ra` and positions it on the first value of
 * the first container.  has_value ends up false when the bitmap has no
 * containers.
 */
void roaring_init_iterator(const roaring_bitmap_t *ra,
                           roaring_uint32_iterator_t *newit) {
    newit->container_index = 0;
    newit->parent = ra;
    const bool found = loadfirstvalue(newit);
    newit->has_value = found;
}
8994 
/**
 * Binds `newit` to the bitmap `ra` and positions it on the last value of the
 * last container.  has_value ends up false when the bitmap has no
 * containers.
 */
void roaring_init_iterator_last(const roaring_bitmap_t *ra,
                                roaring_uint32_iterator_t *newit) {
    newit->parent = ra;
    newit->container_index = ra->high_low_container.size - 1;
    const bool found = loadlastvalue(newit);
    newit->has_value = found;
}
9001 
roaring_create_iterator(const roaring_bitmap_t * ra)9002 roaring_uint32_iterator_t *roaring_create_iterator(const roaring_bitmap_t *ra) {
9003     roaring_uint32_iterator_t *newit =
9004         (roaring_uint32_iterator_t *)malloc(sizeof(roaring_uint32_iterator_t));
9005     if (newit == NULL) return NULL;
9006     roaring_init_iterator(ra, newit);
9007     return newit;
9008 }
9009 
roaring_copy_uint32_iterator(const roaring_uint32_iterator_t * it)9010 roaring_uint32_iterator_t *roaring_copy_uint32_iterator(
9011     const roaring_uint32_iterator_t *it) {
9012     roaring_uint32_iterator_t *newit =
9013         (roaring_uint32_iterator_t *)malloc(sizeof(roaring_uint32_iterator_t));
9014     memcpy(newit, it, sizeof(roaring_uint32_iterator_t));
9015     return newit;
9016 }
9017 
/**
 * Moves the iterator to the first value >= val.
 *
 * The container whose key equals the upper 16 bits of val is used when it
 * exists and its maximum is at least the lower 16 bits of val; otherwise the
 * iterator is placed on the first value of the next container.  Returns the
 * resulting has_value.
 */
bool roaring_move_uint32_iterator_equalorlarger(roaring_uint32_iterator_t *it, uint32_t val) {
    const uint16_t high = val >> 16;
    const int found = ra_get_index(&it->parent->high_low_container, high);

    if (found < 0) {
        // No container for this key: continue from the insertion point.
        it->container_index = -found - 1;
    } else {
        const uint32_t container_max =
            container_maximum(it->parent->high_low_container.containers[found],
                              it->parent->high_low_container.typecodes[found]);
        const uint16_t low = val & 0xFFFF;
        if (container_max >= low) {
            // The target necessarily lies within this container.
            it->container_index = found;
            it->has_value = loadfirstvalue_largeorequal(it, val);
            return it->has_value;
        }
        // Everything here is below val: start at the following container.
        it->container_index = found + 1;
    }

    it->has_value = loadfirstvalue(it);
    return it->has_value;
}
9038 
9039 
/**
 * Moves the iterator to the next value and returns the new has_value.
 *
 * An iterator already past the last container stays there and reports false.
 * An iterator positioned before the first container (container_index < 0) is
 * placed on the first value of the bitmap.  Otherwise the next value is
 * looked up inside the current container, falling through to the first value
 * of the following container when the current one is exhausted.
 */
bool roaring_advance_uint32_iterator(roaring_uint32_iterator_t *it) {
    if (it->container_index >= it->parent->high_low_container.size) {
        return (it->has_value = false);
    }
    if (it->container_index < 0) {
        it->container_index = 0;
        return (it->has_value = loadfirstvalue(it));
    }

    uint32_t wordindex;  // used for bitsets
    uint64_t word;       // used for bitsets
    switch (it->typecode) {
        case BITSET_CONTAINER_TYPE_CODE:
            // in_container_index is the bit position of the current value.
            it->in_container_index++;
            wordindex = it->in_container_index / 64;
            if (wordindex >= BITSET_CONTAINER_SIZE_IN_WORDS) break;
            // Mask off the bits below the new position in the current word.
            word = ((const bitset_container_t *)(it->container))
                       ->array[wordindex] &
                   (UINT64_MAX << (it->in_container_index % 64));
            // next part could be optimized/simplified
            while ((word == 0) &&
                   (wordindex + 1 < BITSET_CONTAINER_SIZE_IN_WORDS)) {
                wordindex++;
                word = ((const bitset_container_t *)(it->container))
                           ->array[wordindex];
            }
            if (word != 0) {
                it->in_container_index = wordindex * 64 + __builtin_ctzll(word);
                it->current_value = it->highbits | it->in_container_index;
                return (it->has_value = true);
            }
            break;
        case ARRAY_CONTAINER_TYPE_CODE:
            // in_container_index is the position within the sorted array.
            it->in_container_index++;
            if (it->in_container_index <
                ((const array_container_t *)(it->container))->cardinality) {
                it->current_value = it->highbits |
                                    ((const array_container_t *)(it->container))
                                        ->array[it->in_container_index];
                return (it->has_value = true);
            }
            break;
        case RUN_CONTAINER_TYPE_CODE: {
            if(it->current_value == UINT32_MAX) {
                return (it->has_value = false); // without this, we risk an overflow to zero
            }

            const run_container_t* run_container = (const run_container_t*)it->container;
            // Still inside the current run?  Its last value is value + length.
            if (++it->current_value <= (it->highbits | (run_container->runs[it->run_index].value +
                                                        run_container->runs[it->run_index].length))) {
                return (it->has_value = true);
            }

            // Otherwise jump to the start of the next run, if there is one.
            if (++it->run_index < run_container->n_runs) {
                // Assume the run has a value
                it->current_value = it->highbits | run_container->runs[it->run_index].value;
                return (it->has_value = true);
            }
            break;
        }
        default:
            // if this ever happens, bug!
            assert(false);
    }  // switch (typecode)
    // moving to next container
    it->container_index++;
    return (it->has_value = loadfirstvalue(it));
}
9108 
/**
 * Moves the iterator to the previous value and returns the new has_value.
 *
 * An iterator already before the first container stays there and reports
 * false.  An iterator positioned past the last container is placed on the
 * last value of the bitmap.  Otherwise the previous value is looked up
 * inside the current container, falling through to the last value of the
 * preceding container when the current one is exhausted.
 */
bool roaring_previous_uint32_iterator(roaring_uint32_iterator_t *it) {
    if (it->container_index < 0) {
        return (it->has_value = false);
    }
    if (it->container_index >= it->parent->high_low_container.size) {
        it->container_index = it->parent->high_low_container.size - 1;
        return (it->has_value = loadlastvalue(it));
    }

    switch (it->typecode) {
        case BITSET_CONTAINER_TYPE_CODE: {
            // in_container_index is the bit position of the current value.
            if (--it->in_container_index < 0)
                break;

            const bitset_container_t* bitset_container = (const bitset_container_t*)it->container;
            int32_t wordindex = it->in_container_index / 64;
            // Keep only the bits at or below the new position in this word.
            uint64_t word = bitset_container->array[wordindex] & (UINT64_MAX >> (63 - (it->in_container_index % 64)));

            // Walk back through earlier words until a set bit is found.
            while (word == 0 && --wordindex >= 0) {
                word = bitset_container->array[wordindex];
            }
            if (word == 0)
                break;

            int num_leading_zeros = __builtin_clzll(word);
            it->in_container_index = (wordindex * 64) + (63 - num_leading_zeros);
            it->current_value = it->highbits | it->in_container_index;
            return (it->has_value = true);
        }
        case ARRAY_CONTAINER_TYPE_CODE: {
            // in_container_index is the position within the sorted array.
            if (--it->in_container_index < 0)
                break;

            const array_container_t* array_container = (const array_container_t*)it->container;
            it->current_value = it->highbits | array_container->array[it->in_container_index];
            return (it->has_value = true);
        }
        case RUN_CONTAINER_TYPE_CODE: {
            // Guard the decrement below against wrapping past zero.
            if(it->current_value == 0)
                return (it->has_value = false);

            const run_container_t* run_container = (const run_container_t*)it->container;
            // Still inside the current run (at or above its first value)?
            if (--it->current_value >= (it->highbits | run_container->runs[it->run_index].value)) {
                return (it->has_value = true);
            }

            if (--it->run_index < 0)
                break;

            // Land on the last value (value + length) of the preceding run.
            it->current_value = it->highbits | (run_container->runs[it->run_index].value +
                                                run_container->runs[it->run_index].length);
            return (it->has_value = true);
        }
        default:
            // if this ever happens, bug!
            assert(false);
    }  // switch (typecode)

    // moving to previous container
    it->container_index--;
    return (it->has_value = loadlastvalue(it));
}
9171 
/**
 * Copies up to `count` values, starting at the iterator's current value, into
 * `buf` and advances the iterator past the values that were copied.
 *
 * Returns the number of values written to `buf`. The loop stops either when
 * `count` values have been written or when it->has_value becomes false after
 * loadfirstvalue() on the next container. When the function returns with
 * it->has_value still true, it->current_value is the first value that was not
 * copied.
 */
uint32_t roaring_read_uint32_iterator(roaring_uint32_iterator_t *it, uint32_t* buf, uint32_t count) {
  uint32_t ret = 0;    // number of values written so far
  uint32_t num_values; // size of the batch copied in the current step
  uint32_t wordindex;  // used for bitsets
  uint64_t word;       // used for bitsets
  const array_container_t* acont; //TODO remove
  const run_container_t* rcont; //TODO remove
  const bitset_container_t* bcont; //TODO remove

  while (it->has_value && ret < count) {
    switch (it->typecode) {
      case BITSET_CONTAINER_TYPE_CODE:
        bcont = (const bitset_container_t*)(it->container);
        wordindex = it->in_container_index / 64;
        // drop the bits below the current position in the starting word
        word = bcont->array[wordindex] & (UINT64_MAX << (it->in_container_index % 64));
        do {
          // emit the set bits of the current word, lowest first
          while (word != 0 && ret < count) {
            buf[0] = it->highbits | (wordindex * 64 + __builtin_ctzll(word));
            word = word & (word - 1);  // clear the lowest set bit
            buf++;
            ret++;
          }
          // skip forward to the next non-empty word, if there is one
          while (word == 0 && wordindex+1 < BITSET_CONTAINER_SIZE_IN_WORDS) {
            wordindex++;
            word = bcont->array[wordindex];
          }
        } while (word != 0 && ret < count);
        // a non-zero word means the container still has unread values
        it->has_value = (word != 0);
        if (it->has_value) {
          it->in_container_index = wordindex * 64 + __builtin_ctzll(word);
          it->current_value = it->highbits | it->in_container_index;
        }
        break;
      case ARRAY_CONTAINER_TYPE_CODE:
        acont = (const array_container_t *)(it->container);
        // copy whichever is smaller: what is left in the array or what the
        // caller still wants
        num_values = minimum_uint32(acont->cardinality - it->in_container_index, count - ret);
        for (uint32_t i = 0; i < num_values; i++) {
          buf[i] = it->highbits | acont->array[it->in_container_index + i];
        }
        buf += num_values;
        ret += num_values;
        it->in_container_index += num_values;
        it->has_value = (it->in_container_index < acont->cardinality);
        if (it->has_value) {
          it->current_value = it->highbits | acont->array[it->in_container_index];
        }
        break;
      case RUN_CONTAINER_TYPE_CODE:
        rcont = (const run_container_t*)(it->container);
        // largest_run_value below is the last value of the current run
        // (run start + run length) combined with the container's high bits
        do {
          uint32_t largest_run_value = it->highbits | (rcont->runs[it->run_index].value + rcont->runs[it->run_index].length);
          num_values = minimum_uint32(largest_run_value - it->current_value + 1, count - ret);
          for (uint32_t i = 0; i < num_values; i++) {
            buf[i] = it->current_value + i;
          }
          it->current_value += num_values; // this can overflow to zero: UINT32_MAX+1=0
          buf += num_values;
          ret += num_values;

          // the run is exhausted when we stepped past its last value; the
          // "== 0" test catches the wrap-around mentioned above
          if (it->current_value > largest_run_value || it->current_value == 0) {
            it->run_index++;
            if (it->run_index < rcont->n_runs) {
              it->current_value = it->highbits | rcont->runs[it->run_index].value;
            } else {
              it->has_value = false;
            }
          }
        } while ((ret < count) && it->has_value);
        break;
      default:
        assert(false);
    }
    // the current container still has values, so we stopped because the
    // caller's buffer is full
    if (it->has_value) {
      assert(ret == count);
      return ret;
    }
    // current container exhausted: continue with the next one
    it->container_index++;
    it->has_value = loadfirstvalue(it);
  }
  return ret;
}
9254 
9255 
9256 
/**
 * Releases an iterator by handing it to free().
 */
void roaring_free_uint32_iterator(roaring_uint32_iterator_t *it) {
    free(it);
}
9258 
9259 /****
9260 * end of roaring_uint32_iterator_t
9261 *****/
9262 
roaring_bitmap_equals(const roaring_bitmap_t * ra1,const roaring_bitmap_t * ra2)9263 bool roaring_bitmap_equals(const roaring_bitmap_t *ra1,
9264                            const roaring_bitmap_t *ra2) {
9265     if (ra1->high_low_container.size != ra2->high_low_container.size) {
9266         return false;
9267     }
9268     for (int i = 0; i < ra1->high_low_container.size; ++i) {
9269         if (ra1->high_low_container.keys[i] !=
9270             ra2->high_low_container.keys[i]) {
9271             return false;
9272         }
9273     }
9274     for (int i = 0; i < ra1->high_low_container.size; ++i) {
9275         bool areequal = container_equals(ra1->high_low_container.containers[i],
9276                                          ra1->high_low_container.typecodes[i],
9277                                          ra2->high_low_container.containers[i],
9278                                          ra2->high_low_container.typecodes[i]);
9279         if (!areequal) {
9280             return false;
9281         }
9282     }
9283     return true;
9284 }
9285 
roaring_bitmap_is_subset(const roaring_bitmap_t * ra1,const roaring_bitmap_t * ra2)9286 bool roaring_bitmap_is_subset(const roaring_bitmap_t *ra1,
9287                               const roaring_bitmap_t *ra2) {
9288     const int length1 = ra1->high_low_container.size,
9289               length2 = ra2->high_low_container.size;
9290 
9291     int pos1 = 0, pos2 = 0;
9292 
9293     while (pos1 < length1 && pos2 < length2) {
9294         const uint16_t s1 = ra_get_key_at_index(&ra1->high_low_container, pos1);
9295         const uint16_t s2 = ra_get_key_at_index(&ra2->high_low_container, pos2);
9296 
9297         if (s1 == s2) {
9298             uint8_t container_type_1, container_type_2;
9299             void *c1 = ra_get_container_at_index(&ra1->high_low_container, pos1,
9300                                                  &container_type_1);
9301             void *c2 = ra_get_container_at_index(&ra2->high_low_container, pos2,
9302                                                  &container_type_2);
9303             bool subset =
9304                 container_is_subset(c1, container_type_1, c2, container_type_2);
9305             if (!subset) return false;
9306             ++pos1;
9307             ++pos2;
9308         } else if (s1 < s2) {  // s1 < s2
9309             return false;
9310         } else {  // s1 > s2
9311             pos2 = ra_advance_until(&ra2->high_low_container, s1, pos2);
9312         }
9313     }
9314     if (pos1 == length1)
9315         return true;
9316     else
9317         return false;
9318 }
9319 
/**
 * Inserts into ans_arr, under key hb, the container of x1_arr for that key
 * with the low-bit range [lb_start, lb_end] (both inclusive) negated.
 *
 * When x1_arr has no container for hb, a container holding exactly that range
 * is inserted instead. A negation that ends up empty is freed and nothing is
 * inserted. The insertion index is computed as -j - 1 from the result of
 * ra_get_index(ans_arr, hb) -- this presumably relies on hb not being present
 * in ans_arr yet; confirm against ra_get_index().
 */
static void insert_flipped_container(roaring_array_t *ans_arr,
                                     const roaring_array_t *x1_arr, uint16_t hb,
                                     uint16_t lb_start, uint16_t lb_end) {
    const int src_pos = ra_get_index(x1_arr, hb);
    const int dst_pos = ra_get_index(ans_arr, hb);
    uint8_t in_type, out_type;
    void *result = NULL;

    if (src_pos < 0) {
        // nothing to negate: the result is simply the range itself
        result = container_range_of_ones((uint32_t)lb_start,
                                         (uint32_t)(lb_end + 1), &out_type);
        ra_insert_new_key_value_at(ans_arr, -dst_pos - 1, hb, result,
                                   out_type);
        return;
    }

    void *source = ra_get_container_at_index(x1_arr, src_pos, &in_type);
    result = container_not_range(source, in_type, (uint32_t)lb_start,
                                 (uint32_t)(lb_end + 1), &out_type);

    if (container_get_cardinality(result, out_type) == 0) {
        // an empty container is not stored
        container_free(result, out_type);
        return;
    }
    ra_insert_new_key_value_at(ans_arr, -dst_pos - 1, hb, result, out_type);
}
9347 
/**
 * Negates, inside x1_arr itself, the low-bit range [lb_start, lb_end] (both
 * inclusive) of the container stored under key hb.
 *
 * When no container exists for hb, one holding exactly that range is inserted
 * at index -i - 1, derived from the ra_get_index() result. When the negation
 * leaves the container empty, it is freed and its slot removed.
 */
static void inplace_flip_container(roaring_array_t *x1_arr, uint16_t hb,
                                   uint16_t lb_start, uint16_t lb_end) {
    const int pos = ra_get_index(x1_arr, hb);
    uint8_t in_type, out_type;
    void *result = NULL;

    if (pos < 0) {
        result = container_range_of_ones((uint32_t)lb_start,
                                         (uint32_t)(lb_end + 1), &out_type);
        ra_insert_new_key_value_at(x1_arr, -pos - 1, hb, result, out_type);
        return;
    }

    void *current = ra_get_container_at_index(x1_arr, pos, &in_type);
    result = container_inot_range(current, in_type, (uint32_t)lb_start,
                                  (uint32_t)(lb_end + 1), &out_type);

    // if a new container was created, the old one was already freed
    if (container_get_cardinality(result, out_type) != 0) {
        ra_set_container_at_index(x1_arr, pos, result, out_type);
        return;
    }
    container_free(result, out_type);
    ra_remove_at_index(x1_arr, pos);
}
9374 
/**
 * Inserts into ans_arr, under key hb, the complete negation of the container
 * that x1_arr stores for that key.
 *
 * When x1_arr has no container for hb, a container covering [0, 0x10000) is
 * inserted. A negation that ends up empty is freed and nothing is inserted.
 * The insertion index is -j - 1, derived from ra_get_index(ans_arr, hb).
 */
static void insert_fully_flipped_container(roaring_array_t *ans_arr,
                                           const roaring_array_t *x1_arr,
                                           uint16_t hb) {
    const int src_pos = ra_get_index(x1_arr, hb);
    const int dst_pos = ra_get_index(ans_arr, hb);
    uint8_t in_type, out_type;
    void *result = NULL;

    if (src_pos < 0) {
        // no source container: its negation is the full container
        result = container_range_of_ones(0U, 0x10000U, &out_type);
        ra_insert_new_key_value_at(ans_arr, -dst_pos - 1, hb, result,
                                   out_type);
        return;
    }

    void *source = ra_get_container_at_index(x1_arr, src_pos, &in_type);
    result = container_not(source, in_type, &out_type);

    if (container_get_cardinality(result, out_type) == 0) {
        container_free(result, out_type);
        return;
    }
    ra_insert_new_key_value_at(ans_arr, -dst_pos - 1, hb, result, out_type);
}
9399 
/**
 * Replaces, inside x1_arr itself, the container stored under key hb by its
 * complete negation.
 *
 * When no container exists for hb, one covering [0, 0x10000) is inserted at
 * index -i - 1, derived from the ra_get_index() result. When the negation is
 * empty, the container is freed and its slot removed.
 */
static void inplace_fully_flip_container(roaring_array_t *x1_arr, uint16_t hb) {
    const int pos = ra_get_index(x1_arr, hb);
    uint8_t in_type, out_type;
    void *result = NULL;

    if (pos < 0) {
        result = container_range_of_ones(0U, 0x10000U, &out_type);
        ra_insert_new_key_value_at(x1_arr, -pos - 1, hb, result, out_type);
        return;
    }

    void *current = ra_get_container_at_index(x1_arr, pos, &in_type);
    result = container_inot(current, in_type, &out_type);

    if (container_get_cardinality(result, out_type) != 0) {
        ra_set_container_at_index(x1_arr, pos, result, out_type);
        return;
    }
    container_free(result, out_type);
    ra_remove_at_index(x1_arr, pos);
}
9423 
roaring_bitmap_flip(const roaring_bitmap_t * x1,uint64_t range_start,uint64_t range_end)9424 roaring_bitmap_t *roaring_bitmap_flip(const roaring_bitmap_t *x1,
9425                                       uint64_t range_start,
9426                                       uint64_t range_end) {
9427     if (range_start >= range_end) {
9428         return roaring_bitmap_copy(x1);
9429     }
9430     if(range_end >= UINT64_C(0x100000000)) {
9431         range_end = UINT64_C(0x100000000);
9432     }
9433 
9434     roaring_bitmap_t *ans = roaring_bitmap_create();
9435     roaring_bitmap_set_copy_on_write(ans, is_cow(x1));
9436 
9437     uint16_t hb_start = (uint16_t)(range_start >> 16);
9438     const uint16_t lb_start = (uint16_t)range_start;  // & 0xFFFF;
9439     uint16_t hb_end = (uint16_t)((range_end - 1) >> 16);
9440     const uint16_t lb_end = (uint16_t)(range_end - 1);  // & 0xFFFF;
9441 
9442     ra_append_copies_until(&ans->high_low_container, &x1->high_low_container,
9443                            hb_start, is_cow(x1));
9444     if (hb_start == hb_end) {
9445         insert_flipped_container(&ans->high_low_container,
9446                                  &x1->high_low_container, hb_start, lb_start,
9447                                  lb_end);
9448     } else {
9449         // start and end containers are distinct
9450         if (lb_start > 0) {
9451             // handle first (partial) container
9452             insert_flipped_container(&ans->high_low_container,
9453                                      &x1->high_low_container, hb_start,
9454                                      lb_start, 0xFFFF);
9455             ++hb_start;  // for the full containers.  Can't wrap.
9456         }
9457 
9458         if (lb_end != 0xFFFF) --hb_end;  // later we'll handle the partial block
9459 
9460         for (uint32_t hb = hb_start; hb <= hb_end; ++hb) {
9461             insert_fully_flipped_container(&ans->high_low_container,
9462                                            &x1->high_low_container, hb);
9463         }
9464 
9465         // handle a partial final container
9466         if (lb_end != 0xFFFF) {
9467             insert_flipped_container(&ans->high_low_container,
9468                                      &x1->high_low_container, hb_end + 1, 0,
9469                                      lb_end);
9470             ++hb_end;
9471         }
9472     }
9473     ra_append_copies_after(&ans->high_low_container, &x1->high_low_container,
9474                            hb_end, is_cow(x1));
9475     return ans;
9476 }
9477 
/**
 * Negates, inside x1 itself, every value of the half-open interval
 * [range_start, range_end).
 *
 * range_end is clamped to 2^32 (0x100000000). If the interval is empty after
 * clamping -- which includes any interval that starts at or above 2^32 --
 * x1 is left untouched.
 */
void roaring_bitmap_flip_inplace(roaring_bitmap_t *x1, uint64_t range_start,
                                 uint64_t range_end) {
    // Clamp BEFORE testing for an empty range. Otherwise a range_start at or
    // above 2^32 passes the test and is then truncated to 16+16 bits below,
    // which would flip an unrelated range.
    if (range_end >= UINT64_C(0x100000000)) {
        range_end = UINT64_C(0x100000000);
    }
    if (range_start >= range_end) {
        return;  // empty range
    }

    // split both (inclusive) bounds into container key and low 16 bits
    uint16_t hb_start = (uint16_t)(range_start >> 16);
    const uint16_t lb_start = (uint16_t)range_start;
    uint16_t hb_end = (uint16_t)((range_end - 1) >> 16);
    const uint16_t lb_end = (uint16_t)(range_end - 1);

    if (hb_start == hb_end) {
        // the whole range lives in a single container
        inplace_flip_container(&x1->high_low_container, hb_start, lb_start,
                               lb_end);
    } else {
        // start and end containers are distinct
        if (lb_start > 0) {
            // handle first (partial) container
            inplace_flip_container(&x1->high_low_container, hb_start, lb_start,
                                   0xFFFF);
            ++hb_start;  // for the full containers.  Can't wrap.
        }

        if (lb_end != 0xFFFF) --hb_end;  // the partial last block comes later

        // hb is 32 bits wide so that hb_end == 0xFFFF cannot loop forever
        for (uint32_t hb = hb_start; hb <= hb_end; ++hb) {
            inplace_fully_flip_container(&x1->high_low_container, hb);
        }
        // handle a partial final container
        if (lb_end != 0xFFFF) {
            inplace_flip_container(&x1->high_low_container, hb_end + 1, 0,
                                   lb_end);
            ++hb_end;
        }
    }
}
9517 
roaring_bitmap_lazy_or(const roaring_bitmap_t * x1,const roaring_bitmap_t * x2,const bool bitsetconversion)9518 roaring_bitmap_t *roaring_bitmap_lazy_or(const roaring_bitmap_t *x1,
9519                                          const roaring_bitmap_t *x2,
9520                                          const bool bitsetconversion) {
9521     uint8_t container_result_type = 0;
9522     const int length1 = x1->high_low_container.size,
9523               length2 = x2->high_low_container.size;
9524     if (0 == length1) {
9525         return roaring_bitmap_copy(x2);
9526     }
9527     if (0 == length2) {
9528         return roaring_bitmap_copy(x1);
9529     }
9530     roaring_bitmap_t *answer =
9531         roaring_bitmap_create_with_capacity(length1 + length2);
9532     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));
9533     int pos1 = 0, pos2 = 0;
9534     uint8_t container_type_1, container_type_2;
9535     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
9536     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
9537     while (true) {
9538         if (s1 == s2) {
9539             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
9540                                                  &container_type_1);
9541             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
9542                                                  &container_type_2);
9543             void *c;
9544             if (bitsetconversion && (get_container_type(c1, container_type_1) !=
9545                                      BITSET_CONTAINER_TYPE_CODE) &&
9546                 (get_container_type(c2, container_type_2) !=
9547                  BITSET_CONTAINER_TYPE_CODE)) {
9548                 void *newc1 =
9549                     container_mutable_unwrap_shared(c1, &container_type_1);
9550                 newc1 = container_to_bitset(newc1, container_type_1);
9551                 container_type_1 = BITSET_CONTAINER_TYPE_CODE;
9552                 c = container_lazy_ior(newc1, container_type_1, c2,
9553                                        container_type_2,
9554                                        &container_result_type);
9555                 if (c != newc1) {  // should not happen
9556                     container_free(newc1, container_type_1);
9557                 }
9558             } else {
9559                 c = container_lazy_or(c1, container_type_1, c2,
9560                                       container_type_2, &container_result_type);
9561             }
9562             // since we assume that the initial containers are non-empty,
9563             // the
9564             // result here
9565             // can only be non-empty
9566             ra_append(&answer->high_low_container, s1, c,
9567                       container_result_type);
9568             ++pos1;
9569             ++pos2;
9570             if (pos1 == length1) break;
9571             if (pos2 == length2) break;
9572             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
9573             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
9574 
9575         } else if (s1 < s2) {  // s1 < s2
9576             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
9577                                                  &container_type_1);
9578             c1 =
9579                 get_copy_of_container(c1, &container_type_1, is_cow(x1));
9580             if (is_cow(x1)) {
9581                 ra_set_container_at_index(&x1->high_low_container, pos1, c1,
9582                                           container_type_1);
9583             }
9584             ra_append(&answer->high_low_container, s1, c1, container_type_1);
9585             pos1++;
9586             if (pos1 == length1) break;
9587             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
9588 
9589         } else {  // s1 > s2
9590             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
9591                                                  &container_type_2);
9592             c2 =
9593                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
9594             if (is_cow(x2)) {
9595                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
9596                                           container_type_2);
9597             }
9598             ra_append(&answer->high_low_container, s2, c2, container_type_2);
9599             pos2++;
9600             if (pos2 == length2) break;
9601             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
9602         }
9603     }
9604     if (pos1 == length1) {
9605         ra_append_copy_range(&answer->high_low_container,
9606                              &x2->high_low_container, pos2, length2,
9607                              is_cow(x2));
9608     } else if (pos2 == length2) {
9609         ra_append_copy_range(&answer->high_low_container,
9610                              &x1->high_low_container, pos1, length1,
9611                              is_cow(x1));
9612     }
9613     return answer;
9614 }
9615 
/**
 * Computes the "lazy" union of x1 and x2 and stores it in x1.
 *
 * If x2 has no containers, x1 is left as is; if x1 has none, it is overwritten
 * with x2 through roaring_bitmap_overwrite(). Otherwise both sorted key
 * sequences are walked in parallel:
 *  - same key: unless the x1 container is already full, it is made writable
 *    (or, when bitsetconversion is set and it is not a bitset, replaced by a
 *    bitset conversion) and container_lazy_ior() merges the x2 container
 *    into it;
 *  - key only in x1: nothing to do;
 *  - key only in x2: a copy of the x2 container is inserted into x1.
 * Whatever remains of x2 after x1 is exhausted is appended at the end.
 */
void roaring_bitmap_lazy_or_inplace(roaring_bitmap_t *x1,
                                    const roaring_bitmap_t *x2,
                                    const bool bitsetconversion) {
    uint8_t container_result_type = 0;
    // not const: x1 grows when containers from x2 are inserted below
    int length1 = x1->high_low_container.size;
    const int length2 = x2->high_low_container.size;

    if (0 == length2) return;

    if (0 == length1) {
        roaring_bitmap_overwrite(x1, x2);
        return;
    }
    int pos1 = 0, pos2 = 0;
    uint8_t container_type_1, container_type_2;
    uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
    uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
    while (true) {
        if (s1 == s2) {
            void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
                                                 &container_type_1);
            // a full container cannot gain anything from a union
            if (!container_is_full(c1, container_type_1)) {
                if ((bitsetconversion == false) ||
                    (get_container_type(c1, container_type_1) ==
                     BITSET_CONTAINER_TYPE_CODE)) {
                    c1 = get_writable_copy_if_shared(c1, &container_type_1);
                } else {
                    // convert to bitset
                    // remember the original container so it can be released
                    // once the bitset version exists
                    void *oldc1 = c1;
                    uint8_t oldt1 = container_type_1;
                    c1 = container_mutable_unwrap_shared(c1, &container_type_1);
                    c1 = container_to_bitset(c1, container_type_1);
                    container_free(oldc1, oldt1);
                    container_type_1 = BITSET_CONTAINER_TYPE_CODE;
                }

                void *c2 = ra_get_container_at_index(&x2->high_low_container,
                                                     pos2, &container_type_2);
                void *c = container_lazy_ior(c1, container_type_1, c2,
                                             container_type_2,
                                             &container_result_type);
                if (c !=
                    c1) {  // in this instance a new container was created, and
                           // we need to free the old one
                    container_free(c1, container_type_1);
                }

                // store the merged container back into x1's slot
                ra_set_container_at_index(&x1->high_low_container, pos1, c,
                                          container_result_type);
            }
            ++pos1;
            ++pos2;
            if (pos1 == length1) break;
            if (pos2 == length2) break;
            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);

        } else if (s1 < s2) {  // s1 < s2
            // key only in x1: it is already part of the result
            pos1++;
            if (pos1 == length1) break;
            s1 = ra_get_key_at_index(&x1->high_low_container, pos1);

        } else {  // s1 > s2
            // key only in x2: insert a copy of its container into x1 in front
            // of the current x1 position
            void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
                                                 &container_type_2);
            // void *c2_clone = container_clone(c2, container_type_2);
            c2 =
                get_copy_of_container(c2, &container_type_2, is_cow(x2));
            if (is_cow(x2)) {
                ra_set_container_at_index(&x2->high_low_container, pos2, c2,
                                          container_type_2);
            }
            ra_insert_new_key_value_at(&x1->high_low_container, pos1, s2, c2,
                                       container_type_2);
            // x1 is one element longer now; step over the inserted element so
            // that pos1 designates the same container (key s1) as before
            pos1++;
            length1++;
            pos2++;
            if (pos2 == length2) break;
            s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
        }
    }
    // x1 exhausted first: the remaining x2 containers all go at the end
    if (pos1 == length1) {
        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
                             pos2, length2, is_cow(x2));
    }
}
9702 
roaring_bitmap_lazy_xor(const roaring_bitmap_t * x1,const roaring_bitmap_t * x2)9703 roaring_bitmap_t *roaring_bitmap_lazy_xor(const roaring_bitmap_t *x1,
9704                                           const roaring_bitmap_t *x2) {
9705     uint8_t container_result_type = 0;
9706     const int length1 = x1->high_low_container.size,
9707               length2 = x2->high_low_container.size;
9708     if (0 == length1) {
9709         return roaring_bitmap_copy(x2);
9710     }
9711     if (0 == length2) {
9712         return roaring_bitmap_copy(x1);
9713     }
9714     roaring_bitmap_t *answer =
9715         roaring_bitmap_create_with_capacity(length1 + length2);
9716     roaring_bitmap_set_copy_on_write(answer, is_cow(x1) && is_cow(x2));
9717     int pos1 = 0, pos2 = 0;
9718     uint8_t container_type_1, container_type_2;
9719     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
9720     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
9721     while (true) {
9722         if (s1 == s2) {
9723             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
9724                                                  &container_type_1);
9725             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
9726                                                  &container_type_2);
9727             void *c =
9728                 container_lazy_xor(c1, container_type_1, c2, container_type_2,
9729                                    &container_result_type);
9730 
9731             if (container_nonzero_cardinality(c, container_result_type)) {
9732                 ra_append(&answer->high_low_container, s1, c,
9733                           container_result_type);
9734             } else {
9735                 container_free(c, container_result_type);
9736             }
9737 
9738             ++pos1;
9739             ++pos2;
9740             if (pos1 == length1) break;
9741             if (pos2 == length2) break;
9742             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
9743             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
9744 
9745         } else if (s1 < s2) {  // s1 < s2
9746             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
9747                                                  &container_type_1);
9748             c1 =
9749                 get_copy_of_container(c1, &container_type_1, is_cow(x1));
9750             if (is_cow(x1)) {
9751                 ra_set_container_at_index(&x1->high_low_container, pos1, c1,
9752                                           container_type_1);
9753             }
9754             ra_append(&answer->high_low_container, s1, c1, container_type_1);
9755             pos1++;
9756             if (pos1 == length1) break;
9757             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
9758 
9759         } else {  // s1 > s2
9760             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
9761                                                  &container_type_2);
9762             c2 =
9763                 get_copy_of_container(c2, &container_type_2, is_cow(x2));
9764             if (is_cow(x2)) {
9765                 ra_set_container_at_index(&x2->high_low_container, pos2, c2,
9766                                           container_type_2);
9767             }
9768             ra_append(&answer->high_low_container, s2, c2, container_type_2);
9769             pos2++;
9770             if (pos2 == length2) break;
9771             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
9772         }
9773     }
9774     if (pos1 == length1) {
9775         ra_append_copy_range(&answer->high_low_container,
9776                              &x2->high_low_container, pos2, length2,
9777                              is_cow(x2));
9778     } else if (pos2 == length2) {
9779         ra_append_copy_range(&answer->high_low_container,
9780                              &x1->high_low_container, pos1, length1,
9781                              is_cow(x1));
9782     }
9783     return answer;
9784 }
9785 
/**
 * In-place lazy symmetric difference: updates x1 so that it holds x1 XOR x2.
 *
 * The key arrays of both bitmaps are walked in parallel:
 *  - a key found only in x1 is left untouched;
 *  - a key found only in x2 has its container copied (via
 *    get_copy_of_container) and inserted into x1 at the current position;
 *  - a key found in both has the two containers combined with
 *    container_lazy_ixor; an empty result removes the key from x1.
 * Whatever remains of x2 once x1 is exhausted is appended to x1.
 *
 * x1 and x2 must be different objects (asserted).
 * NOTE(review): the "lazy" result presumably has to be passed through
 * roaring_bitmap_repair_after_lazy before regular use - confirm in roaring.h.
 */
void roaring_bitmap_lazy_xor_inplace(roaring_bitmap_t *x1,
                                     const roaring_bitmap_t *x2) {
    assert(x1 != x2);

    const int n2 = x2->high_low_container.size;
    if (n2 == 0) {
        return;  // nothing in x2: x1 stays as it is
    }

    int n1 = x1->high_low_container.size;
    if (n1 == 0) {
        // x1 is empty, so the result is just the content of x2
        roaring_bitmap_overwrite(x1, x2);
        return;
    }

    uint8_t result_type = 0;
    uint8_t type1, type2;
    int i1 = 0;
    int i2 = 0;
    uint16_t key1 = ra_get_key_at_index(&x1->high_low_container, i1);
    uint16_t key2 = ra_get_key_at_index(&x2->high_low_container, i2);

    for (;;) {
        if (key1 < key2) {
            // key only present in x1: keep its container unchanged
            ++i1;
            if (i1 == n1) break;
            key1 = ra_get_key_at_index(&x1->high_low_container, i1);
        } else if (key1 > key2) {
            // key only present in x2: insert a copy of its container into x1
            void *incoming = ra_get_container_at_index(
                &x2->high_low_container, i2, &type2);
            incoming = get_copy_of_container(incoming, &type2, is_cow(x2));
            if (is_cow(x2)) {
                // write the container returned by the copy back into x2
                ra_set_container_at_index(&x2->high_low_container, i2,
                                          incoming, type2);
            }
            ra_insert_new_key_value_at(&x1->high_low_container, i1, key2,
                                       incoming, type2);
            // x1 grew by one entry located just before the current one
            ++i1;
            ++n1;
            ++i2;
            if (i2 == n2) break;
            key2 = ra_get_key_at_index(&x2->high_low_container, i2);
        } else {
            // key present in both bitmaps: combine the two containers
            void *mine = ra_get_container_at_index(&x1->high_low_container,
                                                   i1, &type1);
            mine = get_writable_copy_if_shared(mine, &type1);
            void *theirs = ra_get_container_at_index(&x2->high_low_container,
                                                     i2, &type2);
            void *merged = container_lazy_ixor(mine, type1, theirs, type2,
                                               &result_type);
            if (!container_nonzero_cardinality(merged, result_type)) {
                // everything cancelled out: drop the key from x1
                container_free(merged, result_type);
                ra_remove_at_index(&x1->high_low_container, i1);
                --n1;
            } else {
                ra_set_container_at_index(&x1->high_low_container, i1, merged,
                                          result_type);
                ++i1;
            }
            ++i2;
            if (i1 == n1 || i2 == n2) break;
            key1 = ra_get_key_at_index(&x1->high_low_container, i1);
            key2 = ra_get_key_at_index(&x2->high_low_container, i2);
        }
    }

    if (i1 == n1) {
        // x1 is exhausted: the rest of x2 goes to the end of x1
        ra_append_copy_range(&x1->high_low_container, &x2->high_low_container,
                             i2, n2, is_cow(x2));
    }
}
9857 
/**
 * Runs container_repair_after_lazy on every container of the bitmap and
 * stores back the container pointer and typecode it reports.
 */
void roaring_bitmap_repair_after_lazy(roaring_bitmap_t *ra) {
    int idx = 0;
    while (idx < ra->high_low_container.size) {
        // the repair may hand back a different container and/or typecode
        uint8_t typecode = ra->high_low_container.typecodes[idx];
        void *repaired = container_repair_after_lazy(
            ra->high_low_container.containers[idx], &typecode);
        ra->high_low_container.containers[idx] = repaired;
        ra->high_low_container.typecodes[idx] = typecode;
        idx++;
    }
}
9869 
9870 
9871 
9872 /**
9873 * roaring_bitmap_rank returns the number of integers that are smaller or equal
9874 * to x.
9875 */
roaring_bitmap_rank(const roaring_bitmap_t * bm,uint32_t x)9876 uint64_t roaring_bitmap_rank(const roaring_bitmap_t *bm, uint32_t x) {
9877     uint64_t size = 0;
9878     uint32_t xhigh = x >> 16;
9879     for (int i = 0; i < bm->high_low_container.size; i++) {
9880         uint32_t key = bm->high_low_container.keys[i];
9881         if (xhigh > key) {
9882             size +=
9883                 container_get_cardinality(bm->high_low_container.containers[i],
9884                                           bm->high_low_container.typecodes[i]);
9885         } else if (xhigh == key) {
9886             return size + container_rank(bm->high_low_container.containers[i],
9887                                          bm->high_low_container.typecodes[i],
9888                                          x & 0xFFFF);
9889         } else {
9890             return size;
9891         }
9892     }
9893     return size;
9894 }
9895 
9896 /**
9897 * roaring_bitmap_smallest returns the smallest value in the set.
9898 * Returns UINT32_MAX if the set is empty.
9899 */
roaring_bitmap_minimum(const roaring_bitmap_t * bm)9900 uint32_t roaring_bitmap_minimum(const roaring_bitmap_t *bm) {
9901     if (bm->high_low_container.size > 0) {
9902         void *container = bm->high_low_container.containers[0];
9903         uint8_t typecode = bm->high_low_container.typecodes[0];
9904         uint32_t key = bm->high_low_container.keys[0];
9905         uint32_t lowvalue = container_minimum(container, typecode);
9906         return lowvalue | (key << 16);
9907     }
9908     return UINT32_MAX;
9909 }
9910 
9911 /**
9912 * roaring_bitmap_smallest returns the greatest value in the set.
9913 * Returns 0 if the set is empty.
9914 */
roaring_bitmap_maximum(const roaring_bitmap_t * bm)9915 uint32_t roaring_bitmap_maximum(const roaring_bitmap_t *bm) {
9916     if (bm->high_low_container.size > 0) {
9917         void *container =
9918             bm->high_low_container.containers[bm->high_low_container.size - 1];
9919         uint8_t typecode =
9920             bm->high_low_container.typecodes[bm->high_low_container.size - 1];
9921         uint32_t key =
9922             bm->high_low_container.keys[bm->high_low_container.size - 1];
9923         uint32_t lowvalue = container_maximum(container, typecode);
9924         return lowvalue | (key << 16);
9925     }
9926     return 0;
9927 }
9928 
roaring_bitmap_select(const roaring_bitmap_t * bm,uint32_t rank,uint32_t * element)9929 bool roaring_bitmap_select(const roaring_bitmap_t *bm, uint32_t rank,
9930                            uint32_t *element) {
9931     void *container;
9932     uint8_t typecode;
9933     uint16_t key;
9934     uint32_t start_rank = 0;
9935     int i = 0;
9936     bool valid = false;
9937     while (!valid && i < bm->high_low_container.size) {
9938         container = bm->high_low_container.containers[i];
9939         typecode = bm->high_low_container.typecodes[i];
9940         valid =
9941             container_select(container, typecode, &start_rank, rank, element);
9942         i++;
9943     }
9944 
9945     if (valid) {
9946         key = bm->high_low_container.keys[i - 1];
9947         *element |= (key << 16);
9948         return true;
9949     } else
9950         return false;
9951 }
9952 
roaring_bitmap_intersect(const roaring_bitmap_t * x1,const roaring_bitmap_t * x2)9953 bool roaring_bitmap_intersect(const roaring_bitmap_t *x1,
9954                                      const roaring_bitmap_t *x2) {
9955     const int length1 = x1->high_low_container.size,
9956               length2 = x2->high_low_container.size;
9957     uint64_t answer = 0;
9958     int pos1 = 0, pos2 = 0;
9959 
9960     while (pos1 < length1 && pos2 < length2) {
9961         const uint16_t s1 = ra_get_key_at_index(& x1->high_low_container, pos1);
9962         const uint16_t s2 = ra_get_key_at_index(& x2->high_low_container, pos2);
9963 
9964         if (s1 == s2) {
9965             uint8_t container_type_1, container_type_2;
9966             void *c1 = ra_get_container_at_index(& x1->high_low_container, pos1,
9967                                                  &container_type_1);
9968             void *c2 = ra_get_container_at_index(& x2->high_low_container, pos2,
9969                                                  &container_type_2);
9970             if( container_intersect(c1, container_type_1, c2, container_type_2) ) return true;
9971             ++pos1;
9972             ++pos2;
9973         } else if (s1 < s2) {  // s1 < s2
9974             pos1 = ra_advance_until(& x1->high_low_container, s2, pos1);
9975         } else {  // s1 > s2
9976             pos2 = ra_advance_until(& x2->high_low_container, s1, pos2);
9977         }
9978     }
9979     return answer;
9980 }
9981 
9982 
roaring_bitmap_and_cardinality(const roaring_bitmap_t * x1,const roaring_bitmap_t * x2)9983 uint64_t roaring_bitmap_and_cardinality(const roaring_bitmap_t *x1,
9984                                         const roaring_bitmap_t *x2) {
9985     const int length1 = x1->high_low_container.size,
9986               length2 = x2->high_low_container.size;
9987     uint64_t answer = 0;
9988     int pos1 = 0, pos2 = 0;
9989 
9990     while (pos1 < length1 && pos2 < length2) {
9991         const uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
9992         const uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
9993 
9994         if (s1 == s2) {
9995             uint8_t container_type_1, container_type_2;
9996             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
9997                                                  &container_type_1);
9998             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
9999                                                  &container_type_2);
10000             answer += container_and_cardinality(c1, container_type_1, c2,
10001                                                 container_type_2);
10002             ++pos1;
10003             ++pos2;
10004         } else if (s1 < s2) {  // s1 < s2
10005             pos1 = ra_advance_until(&x1->high_low_container, s2, pos1);
10006         } else {  // s1 > s2
10007             pos2 = ra_advance_until(&x2->high_low_container, s1, pos2);
10008         }
10009     }
10010     return answer;
10011 }
10012 
/**
 * Jaccard index of x1 and x2: size of the intersection divided by the size
 * of the union (computed as |x1| + |x2| - |intersection|).
 * No special case is made for a zero-sized union; the result is then a
 * floating-point 0/0.
 */
double roaring_bitmap_jaccard_index(const roaring_bitmap_t *x1,
                                    const roaring_bitmap_t *x2) {
    const uint64_t card1 = roaring_bitmap_get_cardinality(x1);
    const uint64_t card2 = roaring_bitmap_get_cardinality(x2);
    const uint64_t common = roaring_bitmap_and_cardinality(x1, x2);
    const uint64_t united = card1 + card2 - common;
    return (double)common / (double)united;
}
10020 
/**
 * Size of the union of x1 and x2, obtained by inclusion-exclusion:
 * |x1| + |x2| - |x1 AND x2|.
 */
uint64_t roaring_bitmap_or_cardinality(const roaring_bitmap_t *x1,
                                       const roaring_bitmap_t *x2) {
    const uint64_t card1 = roaring_bitmap_get_cardinality(x1);
    const uint64_t card2 = roaring_bitmap_get_cardinality(x2);
    const uint64_t common = roaring_bitmap_and_cardinality(x1, x2);
    uint64_t united = card1 + card2;
    united -= common;
    return united;
}
10028 
/**
 * Number of values that are in x1 but not in x2: |x1| - |x1 AND x2|.
 */
uint64_t roaring_bitmap_andnot_cardinality(const roaring_bitmap_t *x1,
                                           const roaring_bitmap_t *x2) {
    const uint64_t card1 = roaring_bitmap_get_cardinality(x1);
    const uint64_t common = roaring_bitmap_and_cardinality(x1, x2);
    const uint64_t only_in_x1 = card1 - common;
    return only_in_x1;
}
10035 
/**
 * Number of values that are in exactly one of x1 and x2:
 * |x1| + |x2| - 2 * |x1 AND x2|.
 */
uint64_t roaring_bitmap_xor_cardinality(const roaring_bitmap_t *x1,
                                        const roaring_bitmap_t *x2) {
    const uint64_t card1 = roaring_bitmap_get_cardinality(x1);
    const uint64_t card2 = roaring_bitmap_get_cardinality(x2);
    const uint64_t common = roaring_bitmap_and_cardinality(x1, x2);
    // values exclusive to x1 plus values exclusive to x2
    return (card1 - common) + (card2 - common);
}
10043 
10044 
10045 /**
10046  * Check whether a range of values from range_start (included) to range_end (excluded) is present
10047  */
roaring_bitmap_contains_range(const roaring_bitmap_t * r,uint64_t range_start,uint64_t range_end)10048 bool roaring_bitmap_contains_range(const roaring_bitmap_t *r, uint64_t range_start, uint64_t range_end) {
10049     if(range_end >= UINT64_C(0x100000000)) {
10050         range_end = UINT64_C(0x100000000);
10051     }
10052     if (range_start >= range_end) return true;  // empty range are always contained!
10053     if (range_end - range_start == 1) return roaring_bitmap_contains(r, (uint32_t)range_start);
10054     uint16_t hb_rs = (uint16_t)(range_start >> 16);
10055     uint16_t hb_re = (uint16_t)((range_end - 1) >> 16);
10056     const int32_t span = hb_re - hb_rs;
10057     const int32_t hlc_sz = ra_get_size(&r->high_low_container);
10058     if (hlc_sz < span + 1) {
10059       return false;
10060     }
10061     int32_t is = ra_get_index(&r->high_low_container, hb_rs);
10062     int32_t ie = ra_get_index(&r->high_low_container, hb_re);
10063     ie = (ie < 0 ? -ie - 1 : ie);
10064     if ((is < 0) || ((ie - is) != span)) {
10065        return false;
10066     }
10067     const uint32_t lb_rs = range_start & 0xFFFF;
10068     const uint32_t lb_re = ((range_end - 1) & 0xFFFF) + 1;
10069     uint8_t typecode;
10070     void *container = ra_get_container_at_index(&r->high_low_container, is, &typecode);
10071     if (hb_rs == hb_re) {
10072       return container_contains_range(container, lb_rs, lb_re, typecode);
10073     }
10074     if (!container_contains_range(container, lb_rs, 1 << 16, typecode)) {
10075       return false;
10076     }
10077     assert(ie < hlc_sz); // would indicate an algorithmic bug
10078     container = ra_get_container_at_index(&r->high_low_container, ie, &typecode);
10079     if (!container_contains_range(container, 0, lb_re, typecode)) {
10080         return false;
10081     }
10082     for (int32_t i = is + 1; i < ie; ++i) {
10083         container = ra_get_container_at_index(&r->high_low_container, i, &typecode);
10084         if (!container_is_full(container, typecode) ) {
10085           return false;
10086         }
10087     }
10088     return true;
10089 }
10090 
10091 
/**
 * Returns true when ra1 is a subset of ra2 and ra2 has strictly more
 * elements than ra1.
 */
bool roaring_bitmap_is_strict_subset(const roaring_bitmap_t *ra1,
                                            const roaring_bitmap_t *ra2) {
    // cheap cardinality comparison first; the subset test runs only if needed
    if (roaring_bitmap_get_cardinality(ra2) <=
        roaring_bitmap_get_cardinality(ra1)) {
        return false;
    }
    return roaring_bitmap_is_subset(ra1, ra2);
}
10098 
10099 
10100 /*
10101  * FROZEN SERIALIZATION FORMAT DESCRIPTION
10102  *
10103  * -- (beginning must be aligned by 32 bytes) --
10104  * <bitset_data> uint64_t[BITSET_CONTAINER_SIZE_IN_WORDS * num_bitset_containers]
10105  * <run_data>    rle16_t[total number of rle elements in all run containers]
10106  * <array_data>  uint16_t[total number of array elements in all array containers]
10107  * <keys>        uint16_t[num_containers]
10108  * <counts>      uint16_t[num_containers]
10109  * <typecodes>   uint8_t[num_containers]
10110  * <header>      uint32_t
10111  *
10112  * <header> is a 4-byte value which is a bit union of FROZEN_COOKIE (15 bits)
10113  * and the number of containers (17 bits).
10114  *
10115  * <counts> stores number of elements for every container.
10116  * Its meaning depends on container type.
10117  * For array and bitset containers, this value is the container cardinality minus one.
10118  * For run container, it is the number of rle_t elements (n_runs).
10119  *
10120  * <bitset_data>,<array_data>,<run_data> are flat arrays of elements of
10121  * all containers of respective type.
10122  *
 * <*_data> and <keys> are kept close together because they are not accessed
 * during deserialization. This may reduce IO in case of large mapped bitmaps.
 * All members have their native alignments during deserialization except <header>,
 * which is not guaranteed to be aligned by 4 bytes.
10127  */
10128 
roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t * rb)10129 size_t roaring_bitmap_frozen_size_in_bytes(const roaring_bitmap_t *rb) {
10130     const roaring_array_t *ra = &rb->high_low_container;
10131     size_t num_bytes = 0;
10132     for (int32_t i = 0; i < ra->size; i++) {
10133         switch (ra->typecodes[i]) {
10134             case BITSET_CONTAINER_TYPE_CODE: {
10135                 num_bytes += BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
10136                 break;
10137             }
10138             case RUN_CONTAINER_TYPE_CODE: {
10139                 const run_container_t *run =
10140                         (const run_container_t *) ra->containers[i];
10141                 num_bytes += run->n_runs * sizeof(rle16_t);
10142                 break;
10143             }
10144             case ARRAY_CONTAINER_TYPE_CODE: {
10145                 const array_container_t *array =
10146                         (const array_container_t *) ra->containers[i];
10147                 num_bytes += array->cardinality * sizeof(uint16_t);
10148                 break;
10149             }
10150             default:
10151                 __builtin_unreachable();
10152         }
10153     }
10154     num_bytes += (2 + 2 + 1) * ra->size; // keys, counts, typecodes
10155     num_bytes += 4; // header
10156     return num_bytes;
10157 }
10158 
/*
 * Bump allocator step: returns the current arena position and advances the
 * arena cursor by num_bytes. No bounds checking is performed.
 */
inline static void *arena_alloc(char **arena, size_t num_bytes) {
    char *const start = *arena;
    *arena = start + num_bytes;
    return start;
}
10164 
roaring_bitmap_frozen_serialize(const roaring_bitmap_t * rb,char * buf)10165 void roaring_bitmap_frozen_serialize(const roaring_bitmap_t *rb, char *buf) {
10166     /*
10167      * Note: we do not require user to supply spicificly aligned buffer.
10168      * Thus we have to use memcpy() everywhere.
10169      */
10170 
10171     const roaring_array_t *ra = &rb->high_low_container;
10172 
10173     size_t bitset_zone_size = 0;
10174     size_t run_zone_size = 0;
10175     size_t array_zone_size = 0;
10176     for (int32_t i = 0; i < ra->size; i++) {
10177         switch (ra->typecodes[i]) {
10178             case BITSET_CONTAINER_TYPE_CODE: {
10179                 bitset_zone_size +=
10180                         BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
10181                 break;
10182             }
10183             case RUN_CONTAINER_TYPE_CODE: {
10184                 const run_container_t *run =
10185                         (const run_container_t *) ra->containers[i];
10186                 run_zone_size += run->n_runs * sizeof(rle16_t);
10187                 break;
10188             }
10189             case ARRAY_CONTAINER_TYPE_CODE: {
10190                 const array_container_t *array =
10191                         (const array_container_t *) ra->containers[i];
10192                 array_zone_size += array->cardinality * sizeof(uint16_t);
10193                 break;
10194             }
10195             default:
10196                 __builtin_unreachable();
10197         }
10198     }
10199 
10200     uint64_t *bitset_zone = (uint64_t *)arena_alloc(&buf, bitset_zone_size);
10201     rle16_t *run_zone = (rle16_t *)arena_alloc(&buf, run_zone_size);
10202     uint16_t *array_zone = (uint16_t *)arena_alloc(&buf, array_zone_size);
10203     uint16_t *key_zone = (uint16_t *)arena_alloc(&buf, 2*ra->size);
10204     uint16_t *count_zone = (uint16_t *)arena_alloc(&buf, 2*ra->size);
10205     uint8_t *typecode_zone = (uint8_t *)arena_alloc(&buf, ra->size);
10206     uint32_t *header_zone = (uint32_t *)arena_alloc(&buf, 4);
10207 
10208     for (int32_t i = 0; i < ra->size; i++) {
10209         uint16_t count;
10210         switch (ra->typecodes[i]) {
10211             case BITSET_CONTAINER_TYPE_CODE: {
10212                 const bitset_container_t *bitset =
10213                         (const bitset_container_t *) ra->containers[i];
10214                 memcpy(bitset_zone, bitset->array,
10215                        BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t));
10216                 bitset_zone += BITSET_CONTAINER_SIZE_IN_WORDS;
10217                 if (bitset->cardinality != BITSET_UNKNOWN_CARDINALITY) {
10218                     count = bitset->cardinality - 1;
10219                 } else {
10220                     count = bitset_container_compute_cardinality(bitset) - 1;
10221                 }
10222                 break;
10223             }
10224             case RUN_CONTAINER_TYPE_CODE: {
10225                 const run_container_t *run =
10226                         (const run_container_t *) ra->containers[i];
10227                 size_t num_bytes = run->n_runs * sizeof(rle16_t);
10228                 memcpy(run_zone, run->runs, num_bytes);
10229                 run_zone += run->n_runs;
10230                 count = run->n_runs;
10231                 break;
10232             }
10233             case ARRAY_CONTAINER_TYPE_CODE: {
10234                 const array_container_t *array =
10235                         (const array_container_t *) ra->containers[i];
10236                 size_t num_bytes = array->cardinality * sizeof(uint16_t);
10237                 memcpy(array_zone, array->array, num_bytes);
10238                 array_zone += array->cardinality;
10239                 count = array->cardinality - 1;
10240                 break;
10241             }
10242             default:
10243                 __builtin_unreachable();
10244         }
10245         memcpy(&count_zone[i], &count, 2);
10246     }
10247     memcpy(key_zone, ra->keys, ra->size * sizeof(uint16_t));
10248     memcpy(typecode_zone, ra->typecodes, ra->size * sizeof(uint8_t));
10249     uint32_t header = ((uint32_t)ra->size << 15) | FROZEN_COOKIE;
10250     memcpy(header_zone, &header, 4);
10251 }
10252 
10253 const roaring_bitmap_t *
roaring_bitmap_frozen_view(const char * buf,size_t length)10254 roaring_bitmap_frozen_view(const char *buf, size_t length) {
10255     if ((uintptr_t)buf % 32 != 0) {
10256         return NULL;
10257     }
10258 
10259     // cookie and num_containers
10260     if (length < 4) {
10261         return NULL;
10262     }
10263     uint32_t header;
10264     memcpy(&header, buf + length - 4, 4); // header may be misaligned
10265     if ((header & 0x7FFF) != FROZEN_COOKIE) {
10266         return NULL;
10267     }
10268     int32_t num_containers = (header >> 15);
10269 
10270     // typecodes, counts and keys
10271     if (length < 4 + (size_t)num_containers * (1 + 2 + 2)) {
10272         return NULL;
10273     }
10274     uint16_t *keys = (uint16_t *)(buf + length - 4 - num_containers * 5);
10275     uint16_t *counts = (uint16_t *)(buf + length - 4 - num_containers * 3);
10276     uint8_t *typecodes = (uint8_t *)(buf + length - 4 - num_containers * 1);
10277 
10278     // {bitset,array,run}_zone
10279     int32_t num_bitset_containers = 0;
10280     int32_t num_run_containers = 0;
10281     int32_t num_array_containers = 0;
10282     size_t bitset_zone_size = 0;
10283     size_t run_zone_size = 0;
10284     size_t array_zone_size = 0;
10285     for (int32_t i = 0; i < num_containers; i++) {
10286         switch (typecodes[i]) {
10287             case BITSET_CONTAINER_TYPE_CODE:
10288                 num_bitset_containers++;
10289                 bitset_zone_size += BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
10290                 break;
10291             case RUN_CONTAINER_TYPE_CODE:
10292                 num_run_containers++;
10293                 run_zone_size += counts[i] * sizeof(rle16_t);
10294                 break;
10295             case ARRAY_CONTAINER_TYPE_CODE:
10296                 num_array_containers++;
10297                 array_zone_size += (counts[i] + UINT32_C(1)) * sizeof(uint16_t);
10298                 break;
10299             default:
10300                 return NULL;
10301         }
10302     }
10303     if (length != bitset_zone_size + run_zone_size + array_zone_size +
10304                   5 * num_containers + 4) {
10305         return NULL;
10306     }
10307     uint64_t *bitset_zone = (uint64_t*) (buf);
10308     rle16_t *run_zone = (rle16_t*) (buf + bitset_zone_size);
10309     uint16_t *array_zone = (uint16_t*) (buf + bitset_zone_size + run_zone_size);
10310 
10311     size_t alloc_size = 0;
10312     alloc_size += sizeof(roaring_bitmap_t);
10313     alloc_size += num_containers * sizeof(void *);
10314     alloc_size += num_bitset_containers * sizeof(bitset_container_t);
10315     alloc_size += num_run_containers * sizeof(run_container_t);
10316     alloc_size += num_array_containers * sizeof(array_container_t);
10317 
10318     char *arena = (char *)malloc(alloc_size);
10319     if (arena == NULL) {
10320         return NULL;
10321     }
10322 
10323     roaring_bitmap_t *rb = (roaring_bitmap_t *)
10324             arena_alloc(&arena, sizeof(roaring_bitmap_t));
10325     rb->high_low_container.flags = ROARING_FLAG_FROZEN;
10326     rb->high_low_container.allocation_size = num_containers;
10327     rb->high_low_container.size = num_containers;
10328     rb->high_low_container.keys = (uint16_t *)keys;
10329     rb->high_low_container.typecodes = (uint8_t *)typecodes;
10330     rb->high_low_container.containers =
10331             (void **)arena_alloc(&arena, sizeof(void*) * num_containers);
10332     for (int32_t i = 0; i < num_containers; i++) {
10333         switch (typecodes[i]) {
10334             case BITSET_CONTAINER_TYPE_CODE: {
10335                 bitset_container_t *bitset = (bitset_container_t *)
10336                         arena_alloc(&arena, sizeof(bitset_container_t));
10337                 bitset->array = bitset_zone;
10338                 bitset->cardinality = counts[i] + UINT32_C(1);
10339                 rb->high_low_container.containers[i] = bitset;
10340                 bitset_zone += BITSET_CONTAINER_SIZE_IN_WORDS;
10341                 break;
10342             }
10343             case RUN_CONTAINER_TYPE_CODE: {
10344                 run_container_t *run = (run_container_t *)
10345                         arena_alloc(&arena, sizeof(run_container_t));
10346                 run->capacity = counts[i];
10347                 run->n_runs = counts[i];
10348                 run->runs = run_zone;
10349                 rb->high_low_container.containers[i] = run;
10350                 run_zone += run->n_runs;
10351                 break;
10352             }
10353             case ARRAY_CONTAINER_TYPE_CODE: {
10354                 array_container_t *array = (array_container_t *)
10355                         arena_alloc(&arena, sizeof(array_container_t));
10356                 array->capacity = counts[i] + UINT32_C(1);
10357                 array->cardinality = counts[i] + UINT32_C(1);
10358                 array->array = array_zone;
10359                 rb->high_low_container.containers[i] = array;
10360                 array_zone += counts[i] + UINT32_C(1);
10361                 break;
10362             }
10363             default:
10364                 free(arena);
10365                 return NULL;
10366         }
10367     }
10368 
10369     return rb;
10370 }
10371 /* end file src/roaring.c */
10372 /* begin file src/roaring_array.c */
10373 #include <assert.h>
10374 #include <stdbool.h>
10375 #include <stdio.h>
10376 #include <stdlib.h>
10377 #include <string.h>
10378 #include <inttypes.h>
10379 
10380 
10381 // Convention: [0,ra->size) all elements are initialized
10382 //  [ra->size, ra->allocation_size) is junk and contains nothing needing freeing
10383 
/*
 * Resizes the storage of `ra` to hold `new_capacity` entries.
 *
 * The containers, keys and typecodes arrays share one malloc() block, laid
 * out in that order, with ra->containers pointing at its start. Because the
 * three arrays are combined, realloc() cannot be used: a new block is
 * allocated, the first ra->size entries of each array are copied over, and
 * the old block is released.
 *
 * A capacity of 0 releases the storage and resets the three pointers.
 * Returns false (leaving `ra` untouched) if the allocation fails.
 */
static bool realloc_array(roaring_array_t *ra, int32_t new_capacity) {
    if (new_capacity == 0) {
        // freeing the containers pointer releases the whole combined block
        free(ra->containers);
        ra->allocation_size = 0;
        ra->typecodes = NULL;
        ra->keys = NULL;
        ra->containers = NULL;
        return true;
    }

    const size_t block_bytes =
        new_capacity * (sizeof(uint16_t) + sizeof(void *) + sizeof(uint8_t));
    void *block = malloc(block_bytes);
    if (block == NULL) {
        return false;
    }

    // carve the three arrays out of the new block
    void **containers = (void **)block;
    uint16_t *keys = (uint16_t *)(containers + new_capacity);
    uint8_t *typecodes = (uint8_t *)(keys + new_capacity);
    assert((char *)(typecodes + new_capacity) == (char *)block + block_bytes);

    if (ra->size > 0) {
        memcpy(containers, ra->containers, sizeof(void *) * ra->size);
        memcpy(keys, ra->keys, sizeof(uint16_t) * ra->size);
        memcpy(typecodes, ra->typecodes, sizeof(uint8_t) * ra->size);
    }

    void *previous_block = ra->containers;
    ra->allocation_size = new_capacity;
    ra->typecodes = typecodes;
    ra->keys = keys;
    ra->containers = containers;
    free(previous_block);
    return true;
}
10429 
/**
 * Initializes `new_ra` as an empty array with room for `cap` entries.
 *
 * The containers, keys and typecodes arrays are carved, in that order, out of
 * a single malloc() block. With cap == 0 nothing is allocated.
 *
 * @param new_ra  array to initialize; NULL is rejected
 * @param cap     number of entries to reserve; must not exceed INT32_MAX
 * @return false if new_ra is NULL, cap is too large, the byte count would
 *         overflow size_t, or the allocation fails; true otherwise. Apart
 *         from the NULL case, new_ra is left in the state set by ra_init()
 *         on failure.
 */
bool ra_init_with_capacity(roaring_array_t *new_ra, uint32_t cap) {
    if (!new_ra) return false;
    ra_init(new_ra);

    if (cap > INT32_MAX) { return false; }

    if (cap > 0) {
      const size_t entry_size =
        sizeof(uint16_t) + sizeof(void *) + sizeof(uint8_t);
      // cap * entry_size may not fit in size_t where size_t is 32 bits wide;
      // refuse rather than allocate a block that is too small.
      if (cap > SIZE_MAX / entry_size) { return false; }
      void *bigalloc = malloc(cap * entry_size);
      if (bigalloc == NULL) return false;
      new_ra->containers = (void **)bigalloc;
      new_ra->keys = (uint16_t *)(new_ra->containers + cap);
      new_ra->typecodes = (uint8_t *)(new_ra->keys + cap);
      // Narrowing is safe because of the INT32_MAX check above
      new_ra->allocation_size = (int32_t)cap;
    }
    return true;
}
10448 
/**
 * Shrinks the storage of `ra` so that its capacity equals its size.
 * Returns the number of bytes saved, or 0 if the reallocation failed.
 */
int ra_shrink_to_fit(roaring_array_t *ra) {
    // bytes currently held by the unused slots (key + pointer + typecode each)
    const int32_t unused_slots = ra->allocation_size - ra->size;
    const int reclaimed =
        unused_slots * (sizeof(uint16_t) + sizeof(void *) + sizeof(uint8_t));

    if (realloc_array(ra, ra->size)) {
        ra->allocation_size = ra->size;
        return reclaimed;
    }
    return 0;
}
10458 
/**
 * Puts `new_ra` in the empty state: no storage, size and capacity of zero,
 * no flags. A NULL argument is ignored.
 */
void ra_init(roaring_array_t *new_ra) {
    if (new_ra != NULL) {
        new_ra->flags = 0;
        new_ra->size = 0;
        new_ra->allocation_size = 0;

        new_ra->typecodes = NULL;
        new_ra->containers = NULL;
        new_ra->keys = NULL;
    }
}
10469 
/**
 * Initializes `dest` as a copy of `source`.
 *
 * With copy_on_write set, every container of `source` is first replaced by
 * what get_copy_of_container() returns for it, then the container pointers
 * and typecodes are copied as they are, so both arrays refer to the same
 * containers. Otherwise every container is duplicated with container_clone().
 *
 * @return false if the storage for `dest` cannot be set up or a container
 *         cannot be cloned (clones made so far are freed and `dest` is
 *         cleared); true otherwise
 */
bool ra_copy(const roaring_array_t *source, roaring_array_t *dest,
             bool copy_on_write) {
    if (!ra_init_with_capacity(dest, source->size)) {
        return false;
    }
    dest->size = source->size;
    dest->allocation_size = source->size;
    if (dest->size > 0) {
        memcpy(dest->keys, source->keys, dest->size * sizeof(uint16_t));
    }

    if (copy_on_write) {
        // let get_copy_of_container() update each source slot for sharing...
        int32_t idx = 0;
        while (idx < dest->size) {
            source->containers[idx] = get_copy_of_container(
                source->containers[idx], &source->typecodes[idx],
                copy_on_write);
            ++idx;
        }
        // ...then copy pointers and typecodes without duplicating containers
        if (dest->size > 0) {
            memcpy(dest->containers, source->containers,
                   dest->size * sizeof(void *));
            memcpy(dest->typecodes, source->typecodes,
                   dest->size * sizeof(uint8_t));
        }
        return true;
    }

    // deep copy: typecodes are copied in bulk, containers cloned one by one
    if (dest->size > 0) {
        memcpy(dest->typecodes, source->typecodes,
               dest->size * sizeof(uint8_t));
    }
    for (int32_t idx = 0; idx < dest->size; idx++) {
        void *clone =
            container_clone(source->containers[idx], source->typecodes[idx]);
        dest->containers[idx] = clone;
        if (clone != NULL) {
            continue;
        }
        // cloning failed: release the clones made so far and give up
        for (int32_t done = 0; done < idx; done++) {
            container_free(dest->containers[done], dest->typecodes[done]);
        }
        ra_clear_without_containers(dest);
        return false;
    }
    return true;
}
10510 
// Replaces the contents of an already-initialized `dest` with those of
// `source`, reusing dest's storage when it is large enough.
//
// The containers currently held by `dest` are freed first. With
// copy_on_write the source containers are passed through
// get_copy_of_container() (the result is stored back into `source`) and then
// shared with `dest` by shallow copy; otherwise each container is cloned.
//
// Returns false if growing the storage fails or a clone comes back NULL.
// On the growth failure `dest` is left with size 0; on the clone failure the
// clones made so far are freed and dest's storage is released.
bool ra_overwrite(const roaring_array_t *source, roaring_array_t *dest,
                  bool copy_on_write) {
    ra_clear_containers(dest);  // we are going to overwrite them
    // The old containers are gone: drop the count right away so that no exit
    // path leaves `dest` advertising freed pointers.
    dest->size = 0;

    if (source->size == 0) {
        // Nothing to copy. Returning here also avoids calling memcpy on
        // arrays that may be NULL when no storage was ever allocated.
        return true;
    }
    if (dest->allocation_size < source->size) {
        if (!realloc_array(dest, source->size)) {
            return false;
        }
    }
    dest->size = source->size;
    memcpy(dest->keys, source->keys, dest->size * sizeof(uint16_t));
    if (copy_on_write) {
        for (int32_t i = 0; i < dest->size; ++i) {
            source->containers[i] = get_copy_of_container(
                source->containers[i], &source->typecodes[i], copy_on_write);
        }
        // we do a shallow copy to the other bitmap
        memcpy(dest->containers, source->containers,
               dest->size * sizeof(void *));
        memcpy(dest->typecodes, source->typecodes,
               dest->size * sizeof(uint8_t));
    } else {
        memcpy(dest->typecodes, source->typecodes,
               dest->size * sizeof(uint8_t));
        for (int32_t i = 0; i < dest->size; i++) {
            dest->containers[i] =
                container_clone(source->containers[i], source->typecodes[i]);
            if (dest->containers[i] == NULL) {
                // Release the clones made so far, then the storage itself.
                for (int32_t j = 0; j < i; j++) {
                    container_free(dest->containers[j], dest->typecodes[j]);
                }
                ra_clear_without_containers(dest);
                return false;
            }
        }
    }
    return true;
}
10549 
// Calls container_free() on each of the first ra->size containers, in index
// order. The arrays themselves and ra->size are left as they are, so the
// stored pointers must not be used afterwards.
void ra_clear_containers(roaring_array_t *ra) {
    const int32_t count = ra->size;
    int32_t idx = 0;
    while (idx < count) {
        container_free(ra->containers[idx], ra->typecodes[idx]);
        idx++;
    }
}
10555 
// Empties the array: frees every container, sets the size to zero and then
// asks ra_shrink_to_fit() to resize the storage accordingly. The value
// returned by ra_shrink_to_fit() is not used.
void ra_reset(roaring_array_t *ra) {
    ra_clear_containers(ra);
    ra->size = 0;
    (void)ra_shrink_to_fit(ra);
}
10561 
// Releases the array's storage without touching the containers it points to,
// and leaves the array empty (NULL arrays, zero size and capacity).
// Only `containers` is passed to free(): keys and typecodes are carved out of
// the same allocation.
void ra_clear_without_containers(roaring_array_t *ra) {
    void *storage = ra->containers;

    ra->containers = NULL;
    ra->keys = NULL;
    ra->typecodes = NULL;
    ra->allocation_size = 0;
    ra->size = 0;

    free(storage);
}
10570 
// Full teardown: frees every container, then the array storage itself.
// Afterwards the array is empty with NULL arrays.
void ra_clear(roaring_array_t *ra) {
    ra_clear_containers(ra);          // the container objects
    ra_clear_without_containers(ra);  // the key/container/typecode storage
}
10575 
// Makes sure the array can hold `k` more entries than it currently does.
//
// When the capacity is already sufficient nothing happens. Otherwise the new
// capacity is twice the required size while the array holds fewer than 1024
// entries and 5/4 of it after that, capped at MAX_CONTAINERS.
// Returns true on success (or when no growth was needed), otherwise the
// result of realloc_array().
bool extend_array(roaring_array_t *ra, int32_t k) {
    const int32_t desired_size = ra->size + k;
    assert(desired_size <= MAX_CONTAINERS);

    if (desired_size <= ra->allocation_size) {
        return true;
    }

    int32_t new_capacity;
    if (ra->size < 1024) {
        new_capacity = 2 * desired_size;
    } else {
        new_capacity = 5 * desired_size / 4;
    }
    if (new_capacity > MAX_CONTAINERS) {
        new_capacity = MAX_CONTAINERS;
    }
    return realloc_array(ra, new_capacity);
}
10590 
// Appends one (key, container, typecode) entry at the end of the array.
// The container pointer is stored as-is; no copy is made.
//
// NOTE(review): the result of extend_array() is not checked, so if growing
// the storage fails the stores below still happen.
void ra_append(roaring_array_t *ra, uint16_t key, void *container,
               uint8_t typecode) {
    extend_array(ra, 1);

    const int32_t slot = ra->size;
    ra->containers[slot] = container;
    ra->typecodes[slot] = typecode;
    ra->keys[slot] = key;
    ra->size = slot + 1;
}
10601 
// Appends a copy of entry `index` of `sa` to the end of `ra`.
//
// With copy_on_write the source container is passed through
// get_copy_of_container(), the result is stored back into `sa`, and `ra`
// receives that same pointer. Otherwise `ra` receives the result of
// container_clone().
//
// NOTE(review): the result of extend_array() is not checked.
void ra_append_copy(roaring_array_t *ra, const roaring_array_t *sa,
                    uint16_t index, bool copy_on_write) {
    extend_array(ra, 1);
    const int32_t slot = ra->size;

    // Whatever was in this slot before is stale and needs no freeing.
    ra->keys[slot] = sa->keys[index];

    if (copy_on_write) {
        // The same container object ends up referenced by both arrays.
        sa->containers[index] = get_copy_of_container(
            sa->containers[index], &sa->typecodes[index], copy_on_write);
        ra->containers[slot] = sa->containers[index];
    } else {
        ra->containers[slot] =
            container_clone(sa->containers[index], sa->typecodes[index]);
    }
    // Read the type code last: get_copy_of_container() is handed a pointer
    // to it and may have updated it.
    ra->typecodes[slot] = sa->typecodes[index];

    ra->size = slot + 1;
}
10622 
// Appends to `ra` copies of the leading entries of `sa`, stopping at the
// first entry whose key is >= `stopping_key` (or at the end of `sa`).
// Each entry is copied with ra_append_copy().
void ra_append_copies_until(roaring_array_t *ra, const roaring_array_t *sa,
                            uint16_t stopping_key, bool copy_on_write) {
    for (int32_t idx = 0;
         idx < sa->size && sa->keys[idx] < stopping_key; ++idx) {
        ra_append_copy(ra, sa, idx, copy_on_write);
    }
}
10630 
// Appends to `ra` copies of the entries of `sa` with indices in
// [start_index, end_index).
//
// With copy_on_write each source container is passed through
// get_copy_of_container(), stored back into `sa`, and shared with `ra`;
// otherwise each container is cloned with container_clone().
//
// NOTE(review): the result of extend_array() is not checked.
void ra_append_copy_range(roaring_array_t *ra, const roaring_array_t *sa,
                          int32_t start_index, int32_t end_index,
                          bool copy_on_write) {
    extend_array(ra, end_index - start_index);

    for (int32_t src = start_index; src < end_index; ++src) {
        const int32_t dst = ra->size;

        ra->keys[dst] = sa->keys[src];
        if (copy_on_write) {
            sa->containers[src] = get_copy_of_container(
                sa->containers[src], &sa->typecodes[src], copy_on_write);
            ra->containers[dst] = sa->containers[src];
        } else {
            ra->containers[dst] =
                container_clone(sa->containers[src], sa->typecodes[src]);
        }
        // Read after the copy step, which may have updated the type code.
        ra->typecodes[dst] = sa->typecodes[src];

        ra->size = dst + 1;
    }
}
10651 
// Appends to `ra` copies of the entries of `sa` that follow the position
// reported by ra_get_index() for key `before_start`.
//
// A non-negative result from ra_get_index() is advanced by one; a negative
// result r is mapped to -r - 1. Everything from that index to the end of
// `sa` is then copied with ra_append_copy_range().
void ra_append_copies_after(roaring_array_t *ra, const roaring_array_t *sa,
                            uint16_t before_start, bool copy_on_write) {
    const int found = ra_get_index(sa, before_start);
    const int first = (found >= 0) ? found + 1 : -found - 1;
    ra_append_copy_range(ra, sa, first, sa->size, copy_on_write);
}
10661 
// Appends the entries of `sa` with indices in [start_index, end_index) to
// `ra` by copying keys, container pointers and type codes verbatim. No
// container is cloned and `sa` is not modified, so both arrays reference the
// same containers afterwards.
//
// NOTE(review): the result of extend_array() is not checked.
void ra_append_move_range(roaring_array_t *ra, roaring_array_t *sa,
                          int32_t start_index, int32_t end_index) {
    extend_array(ra, end_index - start_index);

    for (int32_t src = start_index; src < end_index; ++src) {
        const int32_t dst = ra->size;

        ra->containers[dst] = sa->containers[src];
        ra->typecodes[dst] = sa->typecodes[src];
        ra->keys[dst] = sa->keys[src];
        ra->size = dst + 1;
    }
}
10675 
// Appends to `ra` copies of the entries of `sa` with indices in
// [start_index, end_index).
//
// With copy_on_write each source container is passed through
// get_copy_of_container(), stored back into `sa`, and shared with `ra`;
// otherwise each container is cloned with container_clone().
//
// NOTE(review): the result of extend_array() is not checked.
void ra_append_range(roaring_array_t *ra, roaring_array_t *sa,
                     int32_t start_index, int32_t end_index,
                     bool copy_on_write) {
    extend_array(ra, end_index - start_index);

    for (int32_t src = start_index; src < end_index; ++src) {
        const int32_t dst = ra->size;

        ra->keys[dst] = sa->keys[src];
        if (copy_on_write) {
            sa->containers[src] = get_copy_of_container(
                sa->containers[src], &sa->typecodes[src], copy_on_write);
            ra->containers[dst] = sa->containers[src];
        } else {
            ra->containers[dst] =
                container_clone(sa->containers[src], sa->typecodes[src]);
        }
        // Read after the copy step, which may have updated the type code.
        ra->typecodes[dst] = sa->typecodes[src];

        ra->size = dst + 1;
    }
}
10697 
ra_get_key_at_index(const roaring_array_t * ra,uint16_t i)10698 uint16_t ra_get_key_at_index(const roaring_array_t *ra, uint16_t i) {
10699     return ra->keys[i];
10700 }
10701 
10702 // everything skipped over is freed
ra_advance_until_freeing(roaring_array_t * ra,uint16_t x,int32_t pos)10703 int32_t ra_advance_until_freeing(roaring_array_t *ra, uint16_t x, int32_t pos) {
10704     while (pos < ra->size && ra->keys[pos] < x) {
10705         container_free(ra->containers[pos], ra->typecodes[pos]);
10706         ++pos;
10707     }
10708     return pos;
10709 }
10710 
// Inserts a (key, container, typecode) entry at position `i`, shifting the
// entries currently at i..size-1 one slot towards the end.
// The container pointer is stored as-is.
//
// NOTE(review): the result of extend_array() is not checked.
void ra_insert_new_key_value_at(roaring_array_t *ra, int32_t i, uint16_t key,
                                void *container, uint8_t typecode) {
    extend_array(ra, 1);

    // Number of entries at or after position i; each array moves this many.
    const size_t tail = ra->size - i;
    memmove(ra->containers + i + 1, ra->containers + i,
            sizeof(void *) * tail);
    memmove(ra->typecodes + i + 1, ra->typecodes + i,
            sizeof(uint8_t) * tail);
    memmove(ra->keys + i + 1, ra->keys + i, sizeof(uint16_t) * tail);

    ra->containers[i] = container;
    ra->typecodes[i] = typecode;
    ra->keys[i] = key;
    ra->size++;
}
10726 
10727 // note: Java routine set things to 0, enabling GC.
10728 // Java called it "resize" but it was always used to downsize.
10729 // Allowing upsize would break the conventions about
10730 // valid containers below ra->size.
10731 
ra_downsize(roaring_array_t * ra,int32_t new_length)10732 void ra_downsize(roaring_array_t *ra, int32_t new_length) {
10733     assert(new_length <= ra->size);
10734     ra->size = new_length;
10735 }
10736 
// Removes entry `i` by sliding the entries after it one slot towards the
// front and decrementing the size. The removed container is NOT freed.
void ra_remove_at_index(roaring_array_t *ra, int32_t i) {
    // Entries that follow position i.
    const size_t trailing = ra->size - i - 1;

    memmove(ra->containers + i, ra->containers + i + 1,
            sizeof(void *) * trailing);
    memmove(ra->keys + i, ra->keys + i + 1, sizeof(uint16_t) * trailing);
    memmove(ra->typecodes + i, ra->typecodes + i + 1,
            sizeof(uint8_t) * trailing);
    ra->size--;
}
10746 
// Frees the container stored at position `i`, then removes the entry with
// ra_remove_at_index().
void ra_remove_at_index_and_free(roaring_array_t *ra, int32_t i) {
    void *victim = ra->containers[i];
    const uint8_t victim_type = ra->typecodes[i];

    container_free(victim, victim_type);
    ra_remove_at_index(ra, i);
}
10751 
10752 // used in inplace andNot only, to slide left the containers from
10753 // the mutated RoaringBitmap that are after the largest container of
10754 // the argument RoaringBitmap.  In use it should be followed by a call to
10755 // downsize.
10756 //
ra_copy_range(roaring_array_t * ra,uint32_t begin,uint32_t end,uint32_t new_begin)10757 void ra_copy_range(roaring_array_t *ra, uint32_t begin, uint32_t end,
10758                    uint32_t new_begin) {
10759     assert(begin <= end);
10760     assert(new_begin < begin);
10761 
10762     const int range = end - begin;
10763 
10764     // We ensure to previously have freed overwritten containers
10765     // that are not copied elsewhere
10766 
10767     memmove(&(ra->containers[new_begin]), &(ra->containers[begin]),
10768             sizeof(void *) * range);
10769     memmove(&(ra->keys[new_begin]), &(ra->keys[begin]),
10770             sizeof(uint16_t) * range);
10771     memmove(&(ra->typecodes[new_begin]), &(ra->typecodes[begin]),
10772             sizeof(uint8_t) * range);
10773 }
10774 
// Moves the last `count` entries of the array by `distance` positions and
// adjusts ra->size by `distance`. A positive distance first asks
// extend_array() for room; a negative or zero distance does not.
//
// NOTE(review): the result of extend_array() is not checked.
void ra_shift_tail(roaring_array_t *ra, int32_t count, int32_t distance) {
    if (distance > 0) {
        extend_array(ra, distance);
    }

    const int32_t from = ra->size - count;
    const int32_t to = from + distance;

    memmove(ra->keys + to, ra->keys + from, sizeof(uint16_t) * count);
    memmove(ra->containers + to, ra->containers + from,
            sizeof(void *) * count);
    memmove(ra->typecodes + to, ra->typecodes + from,
            sizeof(uint8_t) * count);

    ra->size += distance;
}
10789 
10790 
ra_to_uint32_array(const roaring_array_t * ra,uint32_t * ans)10791 void ra_to_uint32_array(const roaring_array_t *ra, uint32_t *ans) {
10792     size_t ctr = 0;
10793     for (int32_t i = 0; i < ra->size; ++i) {
10794         int num_added = container_to_uint32_array(
10795             ans + ctr, ra->containers[i], ra->typecodes[i],
10796             ((uint32_t)ra->keys[i]) << 16);
10797         ctr += num_added;
10798     }
10799 }
10800 
ra_range_uint32_array(const roaring_array_t * ra,size_t offset,size_t limit,uint32_t * ans)10801 bool ra_range_uint32_array(const roaring_array_t *ra, size_t offset, size_t limit, uint32_t *ans) {
10802     size_t ctr = 0;
10803     size_t dtr = 0;
10804 
10805     size_t t_limit = 0;
10806 
10807     bool first = false;
10808     size_t first_skip = 0;
10809 
10810     uint32_t *t_ans = NULL;
10811     size_t cur_len = 0;
10812 
10813     for (int i = 0; i < ra->size; ++i) {
10814 
10815         const void *container = container_unwrap_shared(ra->containers[i], &ra->typecodes[i]);
10816         switch (ra->typecodes[i]) {
10817             case BITSET_CONTAINER_TYPE_CODE:
10818                 t_limit = ((const bitset_container_t *)container)->cardinality;
10819                 break;
10820             case ARRAY_CONTAINER_TYPE_CODE:
10821                 t_limit = ((const array_container_t *)container)->cardinality;
10822                 break;
10823             case RUN_CONTAINER_TYPE_CODE:
10824                 t_limit = run_container_cardinality((const run_container_t *)container);
10825                 break;
10826             case SHARED_CONTAINER_TYPE_CODE:
10827             default:
10828                 __builtin_unreachable();
10829         }
10830         if (ctr + t_limit - 1 >= offset && ctr < offset + limit){
10831             if (!first){
10832                 //first_skip = t_limit - (ctr + t_limit - offset);
10833                 first_skip = offset - ctr;
10834                 first = true;
10835                 t_ans = (uint32_t *)malloc(sizeof(*t_ans) * (first_skip + limit));
10836                 if(t_ans == NULL) {
10837                   return false;
10838                 }
10839                 memset(t_ans, 0, sizeof(*t_ans) * (first_skip + limit)) ;
10840                 cur_len = first_skip + limit;
10841             }
10842             if (dtr + t_limit > cur_len){
10843                 uint32_t * append_ans = (uint32_t *)malloc(sizeof(*append_ans) * (cur_len + t_limit));
10844                 if(append_ans == NULL) {
10845                   if(t_ans != NULL) free(t_ans);
10846                   return false;
10847                 }
10848                 memset(append_ans, 0, sizeof(*append_ans) * (cur_len + t_limit));
10849                 cur_len = cur_len + t_limit;
10850                 memcpy(append_ans, t_ans, dtr * sizeof(uint32_t));
10851                 free(t_ans);
10852                 t_ans = append_ans;
10853             }
10854             switch (ra->typecodes[i]) {
10855                 case BITSET_CONTAINER_TYPE_CODE:
10856                     container_to_uint32_array(
10857                         t_ans + dtr, (const bitset_container_t *)container,  ra->typecodes[i],
10858                         ((uint32_t)ra->keys[i]) << 16);
10859                     break;
10860                 case ARRAY_CONTAINER_TYPE_CODE:
10861                     container_to_uint32_array(
10862                         t_ans + dtr, (const array_container_t *)container, ra->typecodes[i],
10863                         ((uint32_t)ra->keys[i]) << 16);
10864                     break;
10865                 case RUN_CONTAINER_TYPE_CODE:
10866                     container_to_uint32_array(
10867                         t_ans + dtr, (const run_container_t *)container, ra->typecodes[i],
10868                         ((uint32_t)ra->keys[i]) << 16);
10869                     break;
10870                 case SHARED_CONTAINER_TYPE_CODE:
10871                 default:
10872                     __builtin_unreachable();
10873             }
10874             dtr += t_limit;
10875         }
10876         ctr += t_limit;
10877         if (dtr-first_skip >= limit) break;
10878     }
10879     if(t_ans != NULL) {
10880       memcpy(ans, t_ans+first_skip, limit * sizeof(uint32_t));
10881       free(t_ans);
10882     }
10883     return true;
10884 }
10885 
ra_has_run_container(const roaring_array_t * ra)10886 bool ra_has_run_container(const roaring_array_t *ra) {
10887     for (int32_t k = 0; k < ra->size; ++k) {
10888         if (get_container_type(ra->containers[k], ra->typecodes[k]) ==
10889             RUN_CONTAINER_TYPE_CODE)
10890             return true;
10891     }
10892     return false;
10893 }
10894 
ra_portable_header_size(const roaring_array_t * ra)10895 uint32_t ra_portable_header_size(const roaring_array_t *ra) {
10896     if (ra_has_run_container(ra)) {
10897         if (ra->size <
10898             NO_OFFSET_THRESHOLD) {  // for small bitmaps, we omit the offsets
10899             return 4 + (ra->size + 7) / 8 + 4 * ra->size;
10900         }
10901         return 4 + (ra->size + 7) / 8 +
10902                8 * ra->size;  // - 4 because we pack the size with the cookie
10903     } else {
10904         return 4 + 4 + 8 * ra->size;
10905     }
10906 }
10907 
ra_portable_size_in_bytes(const roaring_array_t * ra)10908 size_t ra_portable_size_in_bytes(const roaring_array_t *ra) {
10909     size_t count = ra_portable_header_size(ra);
10910 
10911     for (int32_t k = 0; k < ra->size; ++k) {
10912         count += container_size_in_bytes(ra->containers[k], ra->typecodes[k]);
10913     }
10914     return count;
10915 }
10916 
ra_portable_serialize(const roaring_array_t * ra,char * buf)10917 size_t ra_portable_serialize(const roaring_array_t *ra, char *buf) {
10918     char *initbuf = buf;
10919     uint32_t startOffset = 0;
10920     bool hasrun = ra_has_run_container(ra);
10921     if (hasrun) {
10922         uint32_t cookie = SERIAL_COOKIE | ((ra->size - 1) << 16);
10923         memcpy(buf, &cookie, sizeof(cookie));
10924         buf += sizeof(cookie);
10925         uint32_t s = (ra->size + 7) / 8;
10926         uint8_t *bitmapOfRunContainers = (uint8_t *)calloc(s, 1);
10927         assert(bitmapOfRunContainers != NULL);  // todo: handle
10928         for (int32_t i = 0; i < ra->size; ++i) {
10929             if (get_container_type(ra->containers[i], ra->typecodes[i]) ==
10930                 RUN_CONTAINER_TYPE_CODE) {
10931                 bitmapOfRunContainers[i / 8] |= (1 << (i % 8));
10932             }
10933         }
10934         memcpy(buf, bitmapOfRunContainers, s);
10935         buf += s;
10936         free(bitmapOfRunContainers);
10937         if (ra->size < NO_OFFSET_THRESHOLD) {
10938             startOffset = 4 + 4 * ra->size + s;
10939         } else {
10940             startOffset = 4 + 8 * ra->size + s;
10941         }
10942     } else {  // backwards compatibility
10943         uint32_t cookie = SERIAL_COOKIE_NO_RUNCONTAINER;
10944 
10945         memcpy(buf, &cookie, sizeof(cookie));
10946         buf += sizeof(cookie);
10947         memcpy(buf, &ra->size, sizeof(ra->size));
10948         buf += sizeof(ra->size);
10949 
10950         startOffset = 4 + 4 + 4 * ra->size + 4 * ra->size;
10951     }
10952     for (int32_t k = 0; k < ra->size; ++k) {
10953         memcpy(buf, &ra->keys[k], sizeof(ra->keys[k]));
10954         buf += sizeof(ra->keys[k]);
10955         // get_cardinality returns a value in [1,1<<16], subtracting one
10956         // we get [0,1<<16 - 1] which fits in 16 bits
10957         uint16_t card = (uint16_t)(
10958             container_get_cardinality(ra->containers[k], ra->typecodes[k]) - 1);
10959         memcpy(buf, &card, sizeof(card));
10960         buf += sizeof(card);
10961     }
10962     if ((!hasrun) || (ra->size >= NO_OFFSET_THRESHOLD)) {
10963         // writing the containers offsets
10964         for (int32_t k = 0; k < ra->size; k++) {
10965             memcpy(buf, &startOffset, sizeof(startOffset));
10966             buf += sizeof(startOffset);
10967             startOffset =
10968                 startOffset +
10969                 container_size_in_bytes(ra->containers[k], ra->typecodes[k]);
10970         }
10971     }
10972     for (int32_t k = 0; k < ra->size; ++k) {
10973         buf += container_write(ra->containers[k], ra->typecodes[k], buf);
10974     }
10975     return buf - initbuf;
10976 }
10977 
10978 // Quickly checks whether there is a serialized bitmap at the pointer,
10979 // not exceeding size "maxbytes" in bytes. This function does not allocate
10980 // memory dynamically.
10981 //
10982 // This function returns 0 if and only if no valid bitmap is found.
10983 // Otherwise, it returns how many bytes are occupied.
10984 //
ra_portable_deserialize_size(const char * buf,const size_t maxbytes)10985 size_t ra_portable_deserialize_size(const char *buf, const size_t maxbytes) {
10986     size_t bytestotal = sizeof(int32_t);// for cookie
10987     if(bytestotal > maxbytes) return 0;
10988     uint32_t cookie;
10989     memcpy(&cookie, buf, sizeof(int32_t));
10990     buf += sizeof(uint32_t);
10991     if ((cookie & 0xFFFF) != SERIAL_COOKIE &&
10992         cookie != SERIAL_COOKIE_NO_RUNCONTAINER) {
10993         return 0;
10994     }
10995     int32_t size;
10996 
10997     if ((cookie & 0xFFFF) == SERIAL_COOKIE)
10998         size = (cookie >> 16) + 1;
10999     else {
11000         bytestotal += sizeof(int32_t);
11001         if(bytestotal > maxbytes) return 0;
11002         memcpy(&size, buf, sizeof(int32_t));
11003         buf += sizeof(uint32_t);
11004     }
11005     if (size > (1<<16)) {
11006        return 0; // logically impossible
11007     }
11008     char *bitmapOfRunContainers = NULL;
11009     bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE;
11010     if (hasrun) {
11011         int32_t s = (size + 7) / 8;
11012         bytestotal += s;
11013         if(bytestotal > maxbytes) return 0;
11014         bitmapOfRunContainers = (char *)buf;
11015         buf += s;
11016     }
11017     bytestotal += size * 2 * sizeof(uint16_t);
11018     if(bytestotal > maxbytes) return 0;
11019     uint16_t *keyscards = (uint16_t *)buf;
11020     buf += size * 2 * sizeof(uint16_t);
11021     if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) {
11022         // skipping the offsets
11023         bytestotal += size * 4;
11024         if(bytestotal > maxbytes) return 0;
11025         buf += size * 4;
11026     }
11027     // Reading the containers
11028     for (int32_t k = 0; k < size; ++k) {
11029         uint16_t tmp;
11030         memcpy(&tmp, keyscards + 2*k+1, sizeof(tmp));
11031         uint32_t thiscard = tmp + 1;
11032         bool isbitmap = (thiscard > DEFAULT_MAX_SIZE);
11033         bool isrun = false;
11034         if(hasrun) {
11035           if((bitmapOfRunContainers[k / 8] & (1 << (k % 8))) != 0) {
11036             isbitmap = false;
11037             isrun = true;
11038           }
11039         }
11040         if (isbitmap) {
11041             size_t containersize = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
11042             bytestotal += containersize;
11043             if(bytestotal > maxbytes) return 0;
11044             buf += containersize;
11045         } else if (isrun) {
11046             bytestotal += sizeof(uint16_t);
11047             if(bytestotal > maxbytes) return 0;
11048             uint16_t n_runs;
11049             memcpy(&n_runs, buf, sizeof(uint16_t));
11050             buf += sizeof(uint16_t);
11051             size_t containersize = n_runs * sizeof(rle16_t);
11052             bytestotal += containersize;
11053             if(bytestotal > maxbytes) return 0;
11054             buf += containersize;
11055         } else {
11056             size_t containersize = thiscard * sizeof(uint16_t);
11057             bytestotal += containersize;
11058             if(bytestotal > maxbytes) return 0;
11059             buf += containersize;
11060         }
11061     }
11062     return bytestotal;
11063 }
11064 
11065 
11066 // this function populates answer from the content of buf (reading up to maxbytes bytes).
11067 // The function returns false if a properly serialized bitmap cannot be found.
11068 // if it returns true, readbytes is populated by how many bytes were read, we have that *readbytes <= maxbytes.
ra_portable_deserialize(roaring_array_t * answer,const char * buf,const size_t maxbytes,size_t * readbytes)11069 bool ra_portable_deserialize(roaring_array_t *answer, const char *buf, const size_t maxbytes, size_t * readbytes) {
11070     *readbytes = sizeof(int32_t);// for cookie
11071     if(*readbytes > maxbytes) {
11072       fprintf(stderr, "Ran out of bytes while reading first 4 bytes.\n");
11073       return false;
11074     }
11075     uint32_t cookie;
11076     memcpy(&cookie, buf, sizeof(int32_t));
11077     buf += sizeof(uint32_t);
11078     if ((cookie & 0xFFFF) != SERIAL_COOKIE &&
11079         cookie != SERIAL_COOKIE_NO_RUNCONTAINER) {
11080         fprintf(stderr, "I failed to find one of the right cookies. Found %" PRIu32 "\n",
11081                 cookie);
11082         return false;
11083     }
11084     int32_t size;
11085 
11086     if ((cookie & 0xFFFF) == SERIAL_COOKIE)
11087         size = (cookie >> 16) + 1;
11088     else {
11089         *readbytes += sizeof(int32_t);
11090         if(*readbytes > maxbytes) {
11091           fprintf(stderr, "Ran out of bytes while reading second part of the cookie.\n");
11092           return false;
11093         }
11094         memcpy(&size, buf, sizeof(int32_t));
11095         buf += sizeof(uint32_t);
11096     }
11097     if (size > (1<<16)) {
11098        fprintf(stderr, "You cannot have so many containers, the data must be corrupted: %" PRId32 "\n",
11099                 size);
11100        return false; // logically impossible
11101     }
11102     const char *bitmapOfRunContainers = NULL;
11103     bool hasrun = (cookie & 0xFFFF) == SERIAL_COOKIE;
11104     if (hasrun) {
11105         int32_t s = (size + 7) / 8;
11106         *readbytes += s;
11107         if(*readbytes > maxbytes) {// data is corrupted?
11108           fprintf(stderr, "Ran out of bytes while reading run bitmap.\n");
11109           return false;
11110         }
11111         bitmapOfRunContainers = buf;
11112         buf += s;
11113     }
11114     uint16_t *keyscards = (uint16_t *)buf;
11115 
11116     *readbytes += size * 2 * sizeof(uint16_t);
11117     if(*readbytes > maxbytes) {
11118       fprintf(stderr, "Ran out of bytes while reading key-cardinality array.\n");
11119       return false;
11120     }
11121     buf += size * 2 * sizeof(uint16_t);
11122 
11123     bool is_ok = ra_init_with_capacity(answer, size);
11124     if (!is_ok) {
11125         fprintf(stderr, "Failed to allocate memory for roaring array. Bailing out.\n");
11126         return false;
11127     }
11128 
11129     for (int32_t k = 0; k < size; ++k) {
11130         uint16_t tmp;
11131         memcpy(&tmp, keyscards + 2*k, sizeof(tmp));
11132         answer->keys[k] = tmp;
11133     }
11134     if ((!hasrun) || (size >= NO_OFFSET_THRESHOLD)) {
11135         *readbytes += size * 4;
11136         if(*readbytes > maxbytes) {// data is corrupted?
11137           fprintf(stderr, "Ran out of bytes while reading offsets.\n");
11138           ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
11139           return false;
11140         }
11141 
11142         // skipping the offsets
11143         buf += size * 4;
11144     }
11145     // Reading the containers
11146     for (int32_t k = 0; k < size; ++k) {
11147         uint16_t tmp;
11148         memcpy(&tmp, keyscards + 2*k+1, sizeof(tmp));
11149         uint32_t thiscard = tmp + 1;
11150         bool isbitmap = (thiscard > DEFAULT_MAX_SIZE);
11151         bool isrun = false;
11152         if(hasrun) {
11153           if((bitmapOfRunContainers[k / 8] & (1 << (k % 8))) != 0) {
11154             isbitmap = false;
11155             isrun = true;
11156           }
11157         }
11158         if (isbitmap) {
11159             // we check that the read is allowed
11160             size_t containersize = BITSET_CONTAINER_SIZE_IN_WORDS * sizeof(uint64_t);
11161             *readbytes += containersize;
11162             if(*readbytes > maxbytes) {
11163               fprintf(stderr, "Running out of bytes while reading a bitset container.\n");
11164               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
11165               return false;
11166             }
11167             // it is now safe to read
11168             bitset_container_t *c = bitset_container_create();
11169             if(c == NULL) {// memory allocation failure
11170               fprintf(stderr, "Failed to allocate memory for a bitset container.\n");
11171               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
11172               return false;
11173             }
11174             answer->size++;
11175             buf += bitset_container_read(thiscard, c, buf);
11176             answer->containers[k] = c;
11177             answer->typecodes[k] = BITSET_CONTAINER_TYPE_CODE;
11178         } else if (isrun) {
11179             // we check that the read is allowed
11180             *readbytes += sizeof(uint16_t);
11181             if(*readbytes > maxbytes) {
11182               fprintf(stderr, "Running out of bytes while reading a run container (header).\n");
11183               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
11184               return false;
11185             }
11186             uint16_t n_runs;
11187             memcpy(&n_runs, buf, sizeof(uint16_t));
11188             size_t containersize = n_runs * sizeof(rle16_t);
11189             *readbytes += containersize;
11190             if(*readbytes > maxbytes) {// data is corrupted?
11191               fprintf(stderr, "Running out of bytes while reading a run container.\n");
11192               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
11193               return false;
11194             }
11195             // it is now safe to read
11196 
11197             run_container_t *c = run_container_create();
11198             if(c == NULL) {// memory allocation failure
11199               fprintf(stderr, "Failed to allocate memory for a run container.\n");
11200               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
11201               return false;
11202             }
11203             answer->size++;
11204             buf += run_container_read(thiscard, c, buf);
11205             answer->containers[k] = c;
11206             answer->typecodes[k] = RUN_CONTAINER_TYPE_CODE;
11207         } else {
11208             // we check that the read is allowed
11209             size_t containersize = thiscard * sizeof(uint16_t);
11210             *readbytes += containersize;
11211             if(*readbytes > maxbytes) {// data is corrupted?
11212               fprintf(stderr, "Running out of bytes while reading an array container.\n");
11213               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
11214               return false;
11215             }
11216             // it is now safe to read
11217             array_container_t *c =
11218                 array_container_create_given_capacity(thiscard);
11219             if(c == NULL) {// memory allocation failure
11220               fprintf(stderr, "Failed to allocate memory for an array container.\n");
11221               ra_clear(answer);// we need to clear the containers already allocated, and the roaring array
11222               return false;
11223             }
11224             answer->size++;
11225             buf += array_container_read(thiscard, c, buf);
11226             answer->containers[k] = c;
11227             answer->typecodes[k] = ARRAY_CONTAINER_TYPE_CODE;
11228         }
11229     }
11230     return true;
11231 }
11232 /* end file src/roaring_array.c */
11233 /* begin file src/roaring_priority_queue.c */
11234 
11235 struct roaring_pq_element_s {
11236     uint64_t size;
11237     bool is_temporary;
11238     roaring_bitmap_t *bitmap;
11239 };
11240 
11241 typedef struct roaring_pq_element_s roaring_pq_element_t;
11242 
11243 struct roaring_pq_s {
11244     roaring_pq_element_t *elements;
11245     uint64_t size;
11246 };
11247 
11248 typedef struct roaring_pq_s roaring_pq_t;
11249 
/**
 * Heap ordering predicate.
 *
 * Returns true when t1 has a strictly smaller size key than t2, i.e. when
 * t1 has to sit closer to the root of the heap than t2.
 */
static inline bool compare(roaring_pq_element_t *t1, roaring_pq_element_t *t2) {
    return t2->size > t1->size;
}
11253 
/**
 * Inserts a copy of *t into the heap.
 *
 * The entry is placed in the first unused slot and then sifted up: every
 * parent that must come after *t is moved one level down, and *t is finally
 * written into the hole that remains. The backing array must already have
 * room for one more entry; no reallocation happens here.
 */
static void pq_add(roaring_pq_t *pq, roaring_pq_element_t *t) {
    uint64_t hole = pq->size;
    pq->elements[hole] = *t;
    pq->size = hole + 1;
    while (hole != 0) {
        const uint64_t parent = (hole - 1) / 2;
        roaring_pq_element_t above = pq->elements[parent];
        if (!compare(t, &above)) {
            break;  // parent already sorts before (or equal to) *t
        }
        pq->elements[hole] = above;
        hole = parent;
    }
    pq->elements[hole] = *t;
}
11266 
/**
 * Releases the heap's backing array and the queue object itself.
 * The bitmaps referenced by the entries are not touched.
 */
static void pq_free(roaring_pq_t *pq) {
    roaring_pq_element_t *storage = pq->elements;
    pq->elements = NULL;  // defensive: do not leave a dangling pointer behind
    free(storage);
    free(pq);
}
11272 
/**
 * Restores the heap property below index i (sift-down).
 *
 * The entry at i is lifted out, the smaller child is repeatedly pulled up
 * into the hole while it sorts before the lifted entry, and the lifted
 * entry is written back where the descent stops.
 */
static void percolate_down(roaring_pq_t *pq, uint32_t i) {
    const uint32_t count = (uint32_t)pq->size;
    const uint32_t first_leaf = count >> 1;  // indices >= this have no child
    roaring_pq_element_t moving = pq->elements[i];

    while (i < first_leaf) {
        uint32_t child = 2 * i + 1;  // left child, always present here
        const uint32_t right = child + 1;
        if (right < count &&
            compare(pq->elements + right, pq->elements + child)) {
            child = right;
        }
        roaring_pq_element_t best = pq->elements[child];
        if (!compare(&best, &moving)) {
            break;  // neither child sorts before the lifted entry
        }
        pq->elements[i] = best;
        i = child;
    }
    pq->elements[i] = moving;
}
11295 
create_pq(const roaring_bitmap_t ** arr,uint32_t length)11296 static roaring_pq_t *create_pq(const roaring_bitmap_t **arr, uint32_t length) {
11297     roaring_pq_t *answer = (roaring_pq_t *)malloc(sizeof(roaring_pq_t));
11298     answer->elements =
11299         (roaring_pq_element_t *)malloc(sizeof(roaring_pq_element_t) * length);
11300     answer->size = length;
11301     for (uint32_t i = 0; i < length; i++) {
11302         answer->elements[i].bitmap = (roaring_bitmap_t *)arr[i];
11303         answer->elements[i].is_temporary = false;
11304         answer->elements[i].size =
11305             roaring_bitmap_portable_size_in_bytes(arr[i]);
11306     }
11307     for (int32_t i = (length >> 1); i >= 0; i--) {
11308         percolate_down(answer, i);
11309     }
11310     return answer;
11311 }
11312 
pq_poll(roaring_pq_t * pq)11313 static roaring_pq_element_t pq_poll(roaring_pq_t *pq) {
11314     roaring_pq_element_t ans = *pq->elements;
11315     if (pq->size > 1) {
11316         pq->elements[0] = pq->elements[--pq->size];
11317         percolate_down(pq, 0);
11318     } else
11319         --pq->size;
11320     // memmove(pq->elements,pq->elements+1,(pq->size-1)*sizeof(roaring_pq_element_t));--pq->size;
11321     return ans;
11322 }
11323 
11324 // this function consumes and frees the inputs
lazy_or_from_lazy_inputs(roaring_bitmap_t * x1,roaring_bitmap_t * x2)11325 static roaring_bitmap_t *lazy_or_from_lazy_inputs(roaring_bitmap_t *x1,
11326                                                   roaring_bitmap_t *x2) {
11327     uint8_t container_result_type = 0;
11328     const int length1 = ra_get_size(&x1->high_low_container),
11329               length2 = ra_get_size(&x2->high_low_container);
11330     if (0 == length1) {
11331         roaring_bitmap_free(x1);
11332         return x2;
11333     }
11334     if (0 == length2) {
11335         roaring_bitmap_free(x2);
11336         return x1;
11337     }
11338     uint32_t neededcap = length1 > length2 ? length2 : length1;
11339     roaring_bitmap_t *answer = roaring_bitmap_create_with_capacity(neededcap);
11340     int pos1 = 0, pos2 = 0;
11341     uint8_t container_type_1, container_type_2;
11342     uint16_t s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
11343     uint16_t s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
11344     while (true) {
11345         if (s1 == s2) {
11346             // todo: unsharing can be inefficient as it may create a clone where
11347             // none
11348             // is needed, but it has the benefit of being easy to reason about.
11349             ra_unshare_container_at_index(&x1->high_low_container, pos1);
11350             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
11351                                                  &container_type_1);
11352             assert(container_type_1 != SHARED_CONTAINER_TYPE_CODE);
11353             ra_unshare_container_at_index(&x2->high_low_container, pos2);
11354             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
11355                                                  &container_type_2);
11356             assert(container_type_2 != SHARED_CONTAINER_TYPE_CODE);
11357             void *c;
11358 
11359             if ((container_type_2 == BITSET_CONTAINER_TYPE_CODE) &&
11360                 (container_type_1 != BITSET_CONTAINER_TYPE_CODE)) {
11361                 c = container_lazy_ior(c2, container_type_2, c1,
11362                                        container_type_1,
11363                                        &container_result_type);
11364                 container_free(c1, container_type_1);
11365                 if (c != c2) {
11366                     container_free(c2, container_type_2);
11367                 }
11368             } else {
11369                 c = container_lazy_ior(c1, container_type_1, c2,
11370                                        container_type_2,
11371                                        &container_result_type);
11372                 container_free(c2, container_type_2);
11373                 if (c != c1) {
11374                     container_free(c1, container_type_1);
11375                 }
11376             }
11377             // since we assume that the initial containers are non-empty, the
11378             // result here
11379             // can only be non-empty
11380             ra_append(&answer->high_low_container, s1, c,
11381                       container_result_type);
11382             ++pos1;
11383             ++pos2;
11384             if (pos1 == length1) break;
11385             if (pos2 == length2) break;
11386             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
11387             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
11388 
11389         } else if (s1 < s2) {  // s1 < s2
11390             void *c1 = ra_get_container_at_index(&x1->high_low_container, pos1,
11391                                                  &container_type_1);
11392             ra_append(&answer->high_low_container, s1, c1, container_type_1);
11393             pos1++;
11394             if (pos1 == length1) break;
11395             s1 = ra_get_key_at_index(&x1->high_low_container, pos1);
11396 
11397         } else {  // s1 > s2
11398             void *c2 = ra_get_container_at_index(&x2->high_low_container, pos2,
11399                                                  &container_type_2);
11400             ra_append(&answer->high_low_container, s2, c2, container_type_2);
11401             pos2++;
11402             if (pos2 == length2) break;
11403             s2 = ra_get_key_at_index(&x2->high_low_container, pos2);
11404         }
11405     }
11406     if (pos1 == length1) {
11407         ra_append_move_range(&answer->high_low_container,
11408                              &x2->high_low_container, pos2, length2);
11409     } else if (pos2 == length2) {
11410         ra_append_move_range(&answer->high_low_container,
11411                              &x1->high_low_container, pos1, length1);
11412     }
11413     ra_clear_without_containers(&x1->high_low_container);
11414     ra_clear_without_containers(&x2->high_low_container);
11415     free(x1);
11416     free(x2);
11417     return answer;
11418 }
11419 
11420 /**
11421  * Compute the union of 'number' bitmaps using a heap. This can
11422  * sometimes be faster than roaring_bitmap_or_many which uses
11423  * a naive algorithm. Caller is responsible for freeing the
11424  * result.
11425  */
roaring_bitmap_or_many_heap(uint32_t number,const roaring_bitmap_t ** x)11426 roaring_bitmap_t *roaring_bitmap_or_many_heap(uint32_t number,
11427                                               const roaring_bitmap_t **x) {
11428     if (number == 0) {
11429         return roaring_bitmap_create();
11430     }
11431     if (number == 1) {
11432         return roaring_bitmap_copy(x[0]);
11433     }
11434     roaring_pq_t *pq = create_pq(x, number);
11435     while (pq->size > 1) {
11436         roaring_pq_element_t x1 = pq_poll(pq);
11437         roaring_pq_element_t x2 = pq_poll(pq);
11438 
11439         if (x1.is_temporary && x2.is_temporary) {
11440             roaring_bitmap_t *newb =
11441                 lazy_or_from_lazy_inputs(x1.bitmap, x2.bitmap);
11442             // should normally return a fresh new bitmap *except* that
11443             // it can return x1.bitmap or x2.bitmap in degenerate cases
11444             bool temporary = !((newb == x1.bitmap) && (newb == x2.bitmap));
11445             uint64_t bsize = roaring_bitmap_portable_size_in_bytes(newb);
11446             roaring_pq_element_t newelement = {
11447                 .size = bsize, .is_temporary = temporary, .bitmap = newb};
11448             pq_add(pq, &newelement);
11449         } else if (x2.is_temporary) {
11450             roaring_bitmap_lazy_or_inplace(x2.bitmap, x1.bitmap, false);
11451             x2.size = roaring_bitmap_portable_size_in_bytes(x2.bitmap);
11452             pq_add(pq, &x2);
11453         } else if (x1.is_temporary) {
11454             roaring_bitmap_lazy_or_inplace(x1.bitmap, x2.bitmap, false);
11455             x1.size = roaring_bitmap_portable_size_in_bytes(x1.bitmap);
11456 
11457             pq_add(pq, &x1);
11458         } else {
11459             roaring_bitmap_t *newb =
11460                 roaring_bitmap_lazy_or(x1.bitmap, x2.bitmap, false);
11461             uint64_t bsize = roaring_bitmap_portable_size_in_bytes(newb);
11462             roaring_pq_element_t newelement = {
11463                 .size = bsize, .is_temporary = true, .bitmap = newb};
11464 
11465             pq_add(pq, &newelement);
11466         }
11467     }
11468     roaring_pq_element_t X = pq_poll(pq);
11469     roaring_bitmap_t *answer = X.bitmap;
11470     roaring_bitmap_repair_after_lazy(answer);
11471     pq_free(pq);
11472     return answer;
11473 }
11474 /* end file src/roaring_priority_queue.c */
11475