/*
 * Copyright © 2019 Manuel Stoeckl
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial
 * portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "shadow.h"

#include <stdint.h> /* INT32_MIN, INT32_MAX, and int64_t are used below */
#include <stdlib.h>
#include <string.h>

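/* Bottom-up mergesort state: `merge_stack` is one large buffer holding
 * several sorted, sentinel-terminated runs of intervals, and each
 * `merge_stack_elem` records where one run starts and how long it is. */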
struct merge_stack_elem {
	int offset;
	int count;
};
struct merge_stack {
	struct interval *data;
	int size;
	int count;
};

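/* Merge two sorted, sentinel-terminated interval lists into `c_list`,
 * coalescing intervals whose gap is smaller than `margin`. Both inputs
 * must end with an interval starting at INT32_MAX; the output is
 * terminated the same way. Returns the number of intervals written,
 * excluding the sentinel. */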
static int stream_merge(int a_count, const struct interval *__restrict__ a_list,
		int b_count, const struct interval *__restrict__ b_list,
		struct interval *__restrict__ c_list, int margin)
{
	int ia = 0, ib = 0, ic = 0;
	int cursor = INT32_MIN;
	(void)a_count;
	(void)b_count;

	/* This loop exit condition appears to be faster than checking
	 * ia < a_count || ib < b_count */
	while (!(a_list[ia].start == INT32_MAX &&
			b_list[ib].start == INT32_MAX)) {
		/* TODO: write a SIMD-optimized version, in (?) kernel_sse.c,
		 * that selects 4 elements at a time. Sentinels are free,
		 * after all */
		struct interval sel;
		if (a_list[ia].start < b_list[ib].start) {
			sel = a_list[ia++];
		} else {
			sel = b_list[ib++];
		}

		/* Which path is more likely depends on the structure of
		 * the result; branch prediction works very well here */
		int new_cursor = max(cursor, sel.end);
		if (sel.start >= cursor + margin) {
			c_list[ic++] = sel;
		} else {
			c_list[ic - 1].end = new_cursor;
		}
		cursor = new_cursor;
	}

	/* add end sentinel */
	c_list[ic] = (struct interval){.start = INT32_MAX, .end = INT32_MAX};

	return ic;
}
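
/* Illustration (not from the original source): with margin = 4, merging
 * the lists {[0,10), [20,30)} and {[12,16)} yields {[0,16), [20,30)}.
 * The gap between 10 and 12 is smaller than the margin, so the first two
 * intervals coalesce; the gap between 16 and 20 is not, so [20,30)
 * survives unchanged. */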

/* Repeatedly merge the topmost runs on the stack until the top run is at
 * most half the length of the run beneath it (or, if `force_compact` is
 * set, until a single run remains). Merged runs are compacted in place
 * within `base`, using `temp` as scratch space; `*absorbed` is increased
 * by the number of intervals eliminated by merging. Returns the new
 * stack size. */
static int fix_merge_stack_property(int size, struct merge_stack_elem *stack,
		struct merge_stack *base, struct merge_stack *temp,
		int merge_margin, bool force_compact, int *absorbed)
{
	while (size > 1) {
		struct merge_stack_elem top = stack[size - 1];
		struct merge_stack_elem nxt = stack[size - 2];

		if (2 * top.count <= nxt.count && !force_compact) {
			return size;
		}

		if (buf_ensure_size(top.count + nxt.count + 1,
				    sizeof(struct interval), &temp->size,
				    (void **)&temp->data) == -1) {
			wp_error("Failed to resize a merge buffer, some damage intervals may be lost");
			return size;
		}

		int xs = stream_merge(top.count, &base->data[top.offset],
				nxt.count, &base->data[nxt.offset], temp->data,
				merge_margin);
		/* There are more complicated/multi-buffer alternatives with
		 * fewer memory copies, but this is already <20% of stream
		 * merge time */
		memcpy(&base->data[nxt.offset], temp->data,
				(size_t)(xs + 1) * sizeof(struct interval));
		base->count = nxt.offset + xs + 1;

		stack[size - 1] = (struct merge_stack_elem){
				.offset = 0, .count = 0};
		stack[size - 2] = (struct merge_stack_elem){
				.offset = nxt.offset, .count = xs};
		size--;

		*absorbed += (top.count + nxt.count - xs);
	}
	return size;
}

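/* Expand the extended interval `e` (`e.rep` blocks of width `e.width`,
 * spaced `e.stride` apart, starting at `e.start`) into a flat interval
 * vector at `vec`. Each block is widened outward to multiples of
 * (1 << alignment_bits), and blocks that touch or overlap after widening
 * are coalesced. Appends an end sentinel; returns the interval count,
 * excluding the sentinel. */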
static int unpack_ext_interval(struct interval *vec,
		const struct ext_interval e, int alignment_bits)
{
	int iw = 0;
	int last_end = INT32_MIN;
	for (int ir = 0; ir < e.rep; ir++) {
		int start = e.start + ir * e.stride;
		int end = start + e.width;
		/* round outward to alignment boundaries */
		start = (start >> alignment_bits) << alignment_bits;
		end = ((end + (1 << alignment_bits) - 1) >> alignment_bits)
				<< alignment_bits;

		if (start > last_end) {
			/* start a new disjoint interval */
			vec[iw].start = start;
			vec[iw].end = end;
			last_end = end;
			iw++;
		} else {
			/* extend the previous interval */
			vec[iw - 1].end = end;
			last_end = end;
		}
	}
	/* end sentinel */
	vec[iw] = (struct interval){.start = INT32_MAX, .end = INT32_MAX};
	return iw;
}
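
/* Worked example of the alignment rounding above (illustrative): with
 * alignment_bits = 2, a block covering [5, 9) is widened to [4, 12),
 * since 5 rounds down to 4 and 9 rounds up to 12. */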

/* By writing a mergesort by hand, we can detect duplicates early.
 *
 * TODO: optimize output with run-length-encoded segments
 * TODO: explicit time limiting/adaptive margin! */
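/* Illustration of the stack invariant (not from the original source): run
 * lengths like (9, 4, 2) are acceptable, since each run is at most half
 * the length of the one beneath it. Pushing a run of 3 gives (9, 4, 2, 3),
 * which violates the invariant at the top, so the top two runs merge, and
 * merging cascades upward until the invariant holds again. */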
void merge_mergesort(const int old_count, struct interval *old_list,
		const int new_count, const struct ext_interval *const new_list,
		int *dst_count, struct interval **dst_list, int merge_margin,
		int alignment_bits)
{
	/* Stack-based mergesort: the buffer at position `i+1` should be
	 * at most 1/2 the size of the buffer at position `i`; buffers
	 * will be merged to maintain this invariant */
	// TODO: improve memory management!
	struct merge_stack_elem substack[32];
	int substack_size = 0;
	memset(substack, 0, sizeof(substack));
	struct merge_stack base = {.data = NULL, .count = 0, .size = 0};
	struct merge_stack temp = {.data = NULL, .count = 0, .size = 0};
	if (old_count) {
		/* seed the stack with the previous damage interval list,
		 * including its trailing terminator */
		base.data = old_list;
		base.size = old_count + 1;
		base.count = old_count + 1;
		substack[substack_size++] = (struct merge_stack_elem){
				.offset = 0, .count = old_count};
	}

	int src_count = 0, absorbed = 0;

	for (int jn = 0; jn < new_count; jn++) {
		struct ext_interval e = new_list[jn];
		/* Ignore invalid intervals -- also, if e.start is close
		 * to INT32_MIN, the stream merge breaks */
		if (e.width <= 0 || e.rep <= 0 || e.start < 0) {
			continue;
		}

		/* To limit CPU time, if it is very likely that an
		 * interval would be merged anyway, then replace it with
		 * its containing interval. */
		int remaining = src_count - absorbed;
		bool force_combine = (absorbed > 30000) ||
				     10 * remaining < src_count;

		int64_t intv_end = e.start + e.stride * (int64_t)(e.rep - 1) +
				   e.width;
		if (intv_end >= INT32_MAX) {
			/* overflow protection */
			e.width = INT32_MAX - 1 - e.start;
			e.rep = 1;
		}
		/* Remove internal gaps that are smaller than the margin
		 * and hence would be merged away anyway. */
		if (e.width > e.stride - merge_margin || force_combine) {
			e.width = e.stride * (e.rep - 1) + e.width;
			e.rep = 1;
		}

		if (buf_ensure_size(base.count + e.rep + 1,
				    sizeof(struct interval), &base.size,
				    (void **)&base.data) == -1) {
			wp_error("Failed to resize a merge buffer, some damage intervals may be lost");
			continue;
		}

		struct interval *vec = &base.data[base.count];
		int iw = unpack_ext_interval(vec, e, alignment_bits);
		src_count += iw;

		substack[substack_size] = (struct merge_stack_elem){
				.offset = base.count, .count = iw};
		substack_size++;

		base.count += iw + 1;

		/* merge down the stack as far as possible */
		substack_size = fix_merge_stack_property(substack_size,
				substack, &base, &temp, merge_margin, false,
				&absorbed);
	}

	/* collapse the stack into a single final run */
	fix_merge_stack_property(substack_size, substack, &base, &temp,
			merge_margin, true, &absorbed);
	free(temp.data);

	*dst_list = base.data;
	*dst_count = substack[0].count;
}

/* This value must be larger than 8, or diffs will explode */
#define MERGE_MARGIN 256
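
/* Fold `nintervals` new extended intervals into the damage tracker
 * `base`, updating its accumulated statistics. If the tracker already
 * covers everything, or if the interval counts grow large enough to risk
 * overflow in the merge routine, no fine-grained merge is performed. */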
void merge_damage_records(struct damage *base, int nintervals,
		const struct ext_interval *const new_list, int alignment_bits)
{
	for (int i = 0; i < nintervals; i++) {
		base->acc_damage_stat += new_list[i].width * new_list[i].rep;
		base->acc_count++;
	}

	// Fast return if there is nothing to do
	if (base->damage == DAMAGE_EVERYTHING || nintervals <= 0) {
		return;
	}
	if (nintervals >= (1 << 30) || base->ndamage_intvs >= (1 << 30)) {
		/* Avoid overflow in the merge routine; at this scale, it
		 * would also be cheaper to just damage everything */
		damage_everything(base);
		return;
	}

	merge_mergesort(base->ndamage_intvs, base->damage, nintervals, new_list,
			&base->ndamage_intvs, &base->damage, MERGE_MARGIN,
			alignment_bits);
}

/* Free any existing damage interval list and clear accumulated statistics */
void reset_damage(struct damage *base)
{
	if (base->damage != DAMAGE_EVERYTHING) {
		free(base->damage);
	}
	base->damage = NULL;
	base->ndamage_intvs = 0;
	base->acc_damage_stat = 0;
	base->acc_count = 0;
}
/* Mark the entire buffer as damaged, discarding any interval list */
void damage_everything(struct damage *base)
{
	if (base->damage != DAMAGE_EVERYTHING) {
		free(base->damage);
	}
	base->damage = DAMAGE_EVERYTHING;
	base->ndamage_intvs = 0;
}
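
/* Sketch of a typical call sequence (illustrative only; the actual call
 * sites, and any additional `struct damage` fields, live elsewhere in
 * the project):
 *
 *	struct damage d = {0};
 *	struct ext_interval ev = {.start = 4096, .width = 256,
 *			.rep = 16, .stride = 1024};
 *	merge_damage_records(&d, 1, &ev, 0);
 *	// d.damage now holds d.ndamage_intvs merged intervals
 *	reset_damage(&d);
 */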