1 static char rcsid[] = "$Id: merge-diagonals-simd-uint4.c 222629 2020-05-20 00:05:59Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "merge-diagonals-simd-uint4.h"
7 #include "assert.h"
8 #include "mem.h"
9 #include "popcount.h" /* For clz_table */
10 #include "sedgesort.h"
11 #include "merge-uint4.h"
12
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h> /* For memcpy */
16
17
18 #if defined(HAVE_SSE4_1)
19 #include <smmintrin.h>
20 #endif
21 #if defined(HAVE_AVX2)
22 #include <immintrin.h>
23 #endif
24 #if defined(HAVE_AVX512)
25 #include <immintrin.h>
26 #endif
27
28
29 /* #define PYRAMID_SIZE 4 */
30
31 #define CUTOFF 1000
32 #define PYRAMID_SIZE 32
33
34
35 #ifdef DEBUG
36 #define debug(x) x
37 #else
38 #define debug(x)
39 #endif
40
41 /* pyramid_merge */
42 #ifdef DEBUG2
43 #define debug2(x) x
44 #else
45 #define debug2(x)
46 #endif
47
48
49 #ifdef CHECK_INPUTS
50 static void
check_ascending(unsigned int * diagonals,int ndiagonals)51 check_ascending (unsigned int *diagonals, int ndiagonals) {
52 int i;
53
54 for (i = 1; i < ndiagonals; i++) {
55 if (diagonals[i] < diagonals[i-1]) {
56 abort();
57 }
58 }
59 return;
60 }
61 #endif
62
63
64 #define PARENT(i) (i >> 1)
65 #define LEFT(i) (i << 1)
66 #define RIGHT(i) ((i << 1) | 1)
67
68 static int
pyramid_merge(unsigned int ** heap,int nstreams,int heapsize,int * nelts,int pyramid_start,int pyramid_end)69 pyramid_merge (unsigned int **heap, int nstreams, int heapsize, int *nelts,
70 int pyramid_start, int pyramid_end) {
71 int nodei;
72 #ifdef DEBUG
73 int i;
74 #endif
75
76 while (pyramid_end > pyramid_start) {
77 debug(printf("Merging level: %d..%d for heapsize %d\n",pyramid_start,pyramid_end,heapsize));
78
79 if (pyramid_end > heapsize) {
80 nodei = heapsize;
81 } else {
82 nodei = pyramid_end;
83 }
84
85 while (nodei >= pyramid_start) {
86 debug2(printf("Merging nodes %d (%d elts) and %d (%d elts) => %d\n",
87 nodei-1,nelts[nodei-1],nodei,nelts[nodei],PARENT(nodei)));
88 heap[PARENT(nodei)] = Merge_uint4(/*dest*/NULL,heap[nodei-1],heap[nodei],nelts[nodei-1],nelts[nodei]);
89 CHECK_ALIGN(heap[PARENT(nodei)]);
90 nelts[PARENT(nodei)] = nelts[nodei-1] + nelts[nodei];
91 debug2(printf("Created list %p of length %d at node %d\n",
92 heap[PARENT(nodei)],nelts[PARENT(nodei)],PARENT(nodei)));
93
94 #ifdef DEBUG
95 for (i = 0; i < nelts[PARENT(nodei)]; i++) {
96 printf("%u\n",heap[PARENT(nodei)][i]);
97 }
98 #endif
99
100 /* Don't free original lists (when nodei >= nstreams) */
101 debug(printf("Freeing nodes %d and %d\n",nodei-1,nodei));
102 if (nodei < nstreams) {
103 FREE_ALIGN(heap[nodei]);
104 }
105 if (nodei-1 < nstreams) {
106 FREE_ALIGN(heap[nodei-1]);
107 }
108 nodei -= 2;
109 }
110
111 pyramid_end = PARENT(pyramid_end);
112 pyramid_start = PARENT(pyramid_start);
113 }
114
115 debug(printf("Returning ancestor %d\n\n",pyramid_start));
116 return pyramid_start;
117 }
118
119
120 static UINT4 **
make_diagonals_heap(int ** nelts,int * ncopied,UINT4 ** stream_array,int * streamsize_array,int * diagterm_array,int nstreams)121 make_diagonals_heap (int **nelts, int *ncopied, UINT4 **stream_array,
122 int *streamsize_array, int *diagterm_array, int nstreams) {
123 UINT4 **heap, **combined, *out, diagterm;
124 int heapsize, heapi, ncombined;
125 int *order, *totals, total, n, i, j, k, l;
126
127 /* Put smallest streams at the end of the heap, to increase speed */
128 /* Note: Extra space needed for Sedgesort, but querylength > (query_lastpos+1) + 1 */
129 order = Sedgesort_order_int(streamsize_array,nstreams);
130
131 combined = (UINT4 **) MALLOC(nstreams*sizeof(UINT4 *));
132 totals = (int *) MALLOC(nstreams*sizeof(int));
133 ncombined = 0;
134
135 /* Combine the smallest streams, up to CUTOFF, and just use Sedgesort */
136 i = 0;
137 total = 1; /* To start the loop */
138 while (i < nstreams && total > 0) {
139 total = 0;
140 j = i;
141 while (j < nstreams && total + streamsize_array[order[j]] < CUTOFF) {
142 total += streamsize_array[order[j++]];
143 }
144
145 if (total > 0) {
146 debug(printf("Merging from %d to %d with %d total elements\n",i,j-1,total));
147 totals[ncombined] = total;
148 /* Need an extra value for Sedgesort_uint4 */
149 out = combined[ncombined] = (UINT4 *) MALLOC_ALIGN((total+1)*sizeof(UINT4));
150 debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
151 out,out,__FILE__,__LINE__));
152 for (k = i; k < j; k++) {
153 n = streamsize_array[order[k]];
154 memcpy(out,stream_array[order[k]],n*sizeof(UINT4));
155
156 /* Add diagterm (Could use SIMD here) */
157 diagterm = (UINT4) diagterm_array[order[k]];
158 for (l = 0; l < n; l++) {
159 out[l] += diagterm;
160 }
161
162 out += n;
163 }
164 Sedgesort_uint4(combined[ncombined++],total);
165 i = j;
166 }
167 }
168
169 *ncopied = (nstreams - i) + ncombined;
170 heapsize = 2*(*ncopied) - 1;
171
172 heap = (UINT4 **) CALLOC((heapsize + 1),sizeof(UINT4 *));
173 *nelts = (int *) CALLOC((heapsize + 1),sizeof(int));
174
175 heapi = heapsize;
176 /* Handle individual contents: Start with i value from before */
177 while (i < nstreams) {
178 n = (*nelts)[heapi] = streamsize_array[order[i]];
179
180 /* Copy to make the merging process non-destructive */
181 out = heap[heapi] = MALLOC_ALIGN(n*sizeof(UINT4));
182 debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
183 out,out,__FILE__,__LINE__));
184 CHECK_ALIGN(heap[heapi]);
185 memcpy(heap[heapi],stream_array[order[i]],n*sizeof(UINT4));
186
187 /* Add diagterm (Could use SIMD here) */
188 diagterm = (UINT4) diagterm_array[order[i]];
189 for (l = 0; l < n; l++) {
190 out[l] += diagterm;
191 }
192
193 #ifdef DEBUG
194 printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
195 for (k = 0; k < (*nelts)[heapi]; k++) {
196 printf(" %u",heap[heapi][k]);
197 }
198 printf("\n");
199 #endif
200 heapi--;
201 i++;
202 }
203
204 /* Handle combined contents */
205 for (i = 0; i < ncombined; i++) {
206 heap[heapi] = combined[i];
207 (*nelts)[heapi] = totals[i];
208 #ifdef DEBUG
209 printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
210 for (k = 0; k < (*nelts)[heapi]; k++) {
211 printf(" %u",heap[heapi][k]);
212 }
213 printf("\n");
214 #endif
215 heapi--;
216 }
217
218 FREE(totals);
219 FREE(combined);
220 FREE(order);
221
222 return heap;
223 }
224
225
226 /* Input: streams might be aligned or not. Caller will have to free appropriately */
227 /* Output: aligned */
228 UINT4 *
Merge_diagonals(int * nelts1,UINT4 ** stream_array,int * streamsize_array,int * diagterm_array,int nstreams)229 Merge_diagonals (int *nelts1, UINT4 **stream_array, int *streamsize_array,
230 int *diagterm_array, int nstreams) {
231 UINT4 *result, **heap, *stream, diagterm;
232 int *nelts, l;
233 int ncopied, heapi, heapsize, base, ancestori, pyramid_start, pyramid_end;
234 int bits;
235 #ifdef DEBUG
236 int i;
237 #endif
238
239
240 if (nstreams == 0) {
241 *nelts1 = 0;
242 return (UINT4 *) NULL;
243
244 } else if (nstreams == 1) {
245 *nelts1 = streamsize_array[0];
246 stream = stream_array[0];
247 diagterm = diagterm_array[0];
248
249 result = MALLOC_ALIGN((*nelts1)*sizeof(UINT4)); /* Output must be aligned */
250 debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
251 result,result,__FILE__,__LINE__));
252 memcpy(result,stream,(*nelts1)*sizeof(UINT4));
253
254 /* Add diagterm (Could use SIMD here) */
255 for (l = 0; l < *nelts1; l++) {
256 result[l] += diagterm;
257 }
258
259 return result;
260
261 } else {
262 heap = make_diagonals_heap(&nelts,&ncopied,stream_array,streamsize_array,diagterm_array,nstreams);
263 if (ncopied == 1) {
264 *nelts1 = nelts[1];
265 result = heap[1];
266 } else {
267 heapsize = 2*ncopied - 1; /* also index of last node */
268 #ifdef HAVE_BUILTIN_CLZ
269 bits = 31 - __builtin_clz((unsigned int) heapsize);
270 #elif defined(HAVE_ASM_BSR)
271 asm("bsr %1,%0" : "=r"(bits) : "r"(heapsize));
272 #else
273 bits = 31 - ((heapsize >> 16) ? clz_table[heapsize >> 16] : 16 + clz_table[heapsize]);
274 #endif
275
276 base = (1 << bits);
277 debug(printf("nstreams %d, ncopied %d, heapsize %d, clz %d, bits %d, base %d\n",
278 nstreams,ncopied,heapsize,__builtin_clz(heapsize),bits,base));
279
280 /* Middle pyramids */
281 while (base > PYRAMID_SIZE) {
282 for (pyramid_start = 2*base - PYRAMID_SIZE, pyramid_end = 2*base - 1; pyramid_start >= base;
283 pyramid_start -= PYRAMID_SIZE, pyramid_end -= PYRAMID_SIZE) {
284 debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
285 ancestori = pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
286 }
287 base = ancestori;
288 }
289
290 /* Last pyramid */
291 pyramid_start = base;
292 pyramid_end = 2*base - 1;
293 debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
294 /* base = */ pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
295
296 *nelts1 = nelts[1];
297 result = heap[1];
298
299 for (heapi = heapsize; heapi > heapsize - ncopied; heapi--) {
300 FREE_ALIGN(heap[heapi]);
301 }
302 }
303
304 FREE(heap);
305 FREE(nelts);
306 }
307
308
309 #ifdef DEBUG
310 printf("Merge_diagonals returning result of length %d\n",*nelts1);
311 for (i = 0; i < *nelts1; i++) {
312 printf("%u\n",result[i]);
313 }
314 #endif
315
316 return result;
317 }
318
319
320 static UINT4 **
make_univdiagonals_heap(int ** nelts,int * ncopied,UINT4 ** stream_array,int * streamsize_array,int nstreams)321 make_univdiagonals_heap (int **nelts, int *ncopied, UINT4 **stream_array,
322 int *streamsize_array, int nstreams) {
323 UINT4 **heap, **combined, *out;
324 int heapsize, heapi, ncombined;
325 int *order, *totals, total, n, i, j, k;
326
327 /* Put smallest streams at the end of the heap, to increase speed */
328 /* Note: Extra space needed for Sedgesort, but querylength > (query_lastpos+1) + 1 */
329 order = Sedgesort_order_int(streamsize_array,nstreams);
330
331 combined = (UINT4 **) MALLOC(nstreams*sizeof(UINT4 *));
332 totals = (int *) MALLOC(nstreams*sizeof(int));
333 ncombined = 0;
334
335 /* Combine the smallest streams, up to CUTOFF, and just use Sedgesort */
336 i = 0;
337 total = 1; /* To start the loop */
338 while (i < nstreams && total > 0) {
339 total = 0;
340 j = i;
341 while (j < nstreams && total + streamsize_array[order[j]] < CUTOFF) {
342 total += streamsize_array[order[j++]];
343 }
344
345 if (total > 0) {
346 debug(printf("Merging from %d to %d with %d total elements\n",i,j-1,total));
347 totals[ncombined] = total;
348 /* Need an extra value for Sedgesort_uint4 */
349 out = combined[ncombined] = (UINT4 *) MALLOC_ALIGN((total+1)*sizeof(UINT4));
350 debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
351 out,out,__FILE__,__LINE__));
352 for (k = i; k < j; k++) {
353 n = streamsize_array[order[k]];
354 memcpy(out,stream_array[order[k]],n*sizeof(UINT4));
355 out += n;
356 }
357 Sedgesort_uint4(combined[ncombined++],total);
358 #ifdef CHECK_INPUTS
359 check_ascending(combined[ncombined-1],total);
360 #endif
361 i = j;
362 }
363 }
364
365 *ncopied = (nstreams - i) + ncombined;
366 heapsize = 2*(*ncopied) - 1;
367
368 heap = (UINT4 **) CALLOC((heapsize + 1),sizeof(UINT4 *));
369 *nelts = (int *) CALLOC((heapsize + 1),sizeof(int));
370
371 heapi = heapsize;
372 /* Handle individual contents: Start with i value from before */
373 while (i < nstreams) {
374 n = (*nelts)[heapi] = streamsize_array[order[i]];
375 #ifdef CHECK_INPUTS
376 check_ascending(stream_array[order[i]],n);
377 #endif
378
379 /* Copy to make the merging process non-destructive */
380 out = heap[heapi] = MALLOC_ALIGN(n*sizeof(UINT4));
381 debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
382 out,out,__FILE__,__LINE__));
383 CHECK_ALIGN(heap[heapi]);
384 memcpy(heap[heapi],stream_array[order[i]],n*sizeof(UINT4));
385
386 #ifdef DEBUG
387 printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
388 for (k = 0; k < (*nelts)[heapi]; k++) {
389 printf(" %u",heap[heapi][k]);
390 }
391 printf("\n");
392 #endif
393 heapi--;
394 i++;
395 }
396
397 /* Handle combined contents */
398 for (i = 0; i < ncombined; i++) {
399 heap[heapi] = combined[i];
400 (*nelts)[heapi] = totals[i];
401 #ifdef DEBUG
402 printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
403 for (k = 0; k < (*nelts)[heapi]; k++) {
404 printf(" %u",heap[heapi][k]);
405 }
406 printf("\n");
407 #endif
408 heapi--;
409 }
410
411 FREE(totals);
412 FREE(combined);
413 FREE(order);
414
415 return heap;
416 }
417
418
419 /* Used for localdb */
420 UINT4 *
Merge_diagonals_uint4(int * nelts1,UINT4 ** stream_array,int * streamsize_array,int nstreams)421 Merge_diagonals_uint4 (int *nelts1, UINT4 **stream_array, int *streamsize_array,
422 int nstreams) {
423 UINT4 *result, **heap, *stream;
424 int *nelts;
425 int ncopied, heapi, heapsize, base, ancestori, pyramid_start, pyramid_end;
426 int bits;
427 #ifdef DEBUG
428 int i;
429 #endif
430
431
432 if (nstreams == 0) {
433 *nelts1 = 0;
434 return (UINT4 *) NULL;
435
436 } else if (nstreams == 1) {
437 *nelts1 = streamsize_array[0];
438 stream = stream_array[0];
439 #ifdef CHECK_INPUTS
440 check_ascending(stream,*nelts1);
441 #endif
442
443 result = MALLOC_ALIGN((*nelts1)*sizeof(UINT4)); /* Output must be aligned */
444 debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
445 result,result,__FILE__,__LINE__));
446 memcpy(result,stream,(*nelts1)*sizeof(UINT4));
447 return result;
448
449 } else {
450 heap = make_univdiagonals_heap(&nelts,&ncopied,stream_array,streamsize_array,nstreams);
451 if (ncopied == 1) {
452 *nelts1 = nelts[1];
453 result = heap[1];
454 } else {
455 heapsize = 2*ncopied - 1; /* also index of last node */
456 #ifdef HAVE_BUILTIN_CLZ
457 bits = 31 - __builtin_clz((unsigned int) heapsize);
458 #elif defined(HAVE_ASM_BSR)
459 asm("bsr %1,%0" : "=r"(bits) : "r"(heapsize));
460 #else
461 bits = 31 - ((heapsize >> 16) ? clz_table[heapsize >> 16] : 16 + clz_table[heapsize]);
462 #endif
463
464 base = (1 << bits);
465 debug(printf("nstreams %d, ncopied %d, heapsize %d, clz %d, bits %d, base %d\n",
466 nstreams,ncopied,heapsize,__builtin_clz(heapsize),bits,base));
467
468 /* Middle pyramids */
469 while (base > PYRAMID_SIZE) {
470 for (pyramid_start = 2*base - PYRAMID_SIZE, pyramid_end = 2*base - 1; pyramid_start >= base;
471 pyramid_start -= PYRAMID_SIZE, pyramid_end -= PYRAMID_SIZE) {
472 debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
473 ancestori = pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
474 }
475 base = ancestori;
476 }
477
478 /* Last pyramid */
479 pyramid_start = base;
480 pyramid_end = 2*base - 1;
481 debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
482 /* base = */ pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
483
484 *nelts1 = nelts[1];
485 result = heap[1];
486
487 for (heapi = heapsize; heapi > heapsize - ncopied; heapi--) {
488 FREE_ALIGN(heap[heapi]);
489 }
490 }
491
492 FREE(heap);
493 FREE(nelts);
494 }
495
496
497 #ifdef DEBUG
498 printf("Merge_diagonals returning result of length %d\n",*nelts1);
499 for (i = 0; i < *nelts1; i++) {
500 printf("%u\n",result[i]);
501 }
502 #endif
503
504 return result;
505 }
506
507
508