1 static char rcsid[] = "$Id: merge-diagonals-simd-uint8.c 222445 2020-04-21 17:53:32Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "merge-diagonals-simd-uint8.h"
7 #include "assert.h"
8 #include "mem.h"
9 #include "popcount.h" /* For clz_table */
10 #include "sedgesort.h"
11 #include "merge-uint8.h"
12
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h> /* For memcpy */
16
17
18 #if defined(HAVE_SSE4_1)
19 #include <smmintrin.h>
20 #endif
21 #if defined(HAVE_AVX2)
22 #include <immintrin.h>
23 #endif
24 #if defined(HAVE_AVX512)
25 #include <immintrin.h>
26 #endif
27
28
29 /* #define PYRAMID_SIZE 4 */
30
31 #define CUTOFF 1000
32 #define PYRAMID_SIZE 32
33
34 #define GETPOS(high,low) (((Univcoord_T) high << 32) + low)
35
36
37 #ifdef DEBUG0
38 #define debug0(x) x
39 #else
40 #define debug0(x)
41 #endif
42
43 #ifdef DEBUG
44 #define debug(x) x
45 #else
46 #define debug(x)
47 #endif
48
49 #ifdef DEBUG2
50 #define debug2(x) x
51 #else
52 #define debug2(x)
53 #endif
54
55
56
57 #define PARENT(i) (i >> 1)
58 #define LEFT(i) (i << 1)
59 #define RIGHT(i) ((i << 1) | 1)
60
61 #if defined(HAVE_AVX512) || defined(HAVE_AVX2)
62 static int
pyramid_merge(UINT8 ** heap,int nstreams,int heapsize,int * nelts,int pyramid_start,int pyramid_end)63 pyramid_merge (UINT8 **heap, int nstreams, int heapsize, int *nelts,
64 int pyramid_start, int pyramid_end) {
65 int nodei;
66 #ifdef DEBUG
67 int i;
68 #endif
69
70 while (pyramid_end > pyramid_start) {
71 debug(printf("Merging level: %d..%d for heapsize %d\n",pyramid_start,pyramid_end,heapsize));
72
73 if (pyramid_end > heapsize) {
74 nodei = heapsize;
75 } else {
76 nodei = pyramid_end;
77 }
78
79 while (nodei >= pyramid_start) {
80 debug2(printf("Merging nodes %d (%d elts) and %d (%d elts) => %d\n",
81 nodei-1,nelts[nodei-1],nodei,nelts[nodei],PARENT(nodei)));
82 heap[PARENT(nodei)] = Merge_uint8(/*dest*/NULL,heap[nodei-1],heap[nodei],nelts[nodei-1],nelts[nodei]);
83 CHECK_ALIGN(heap[PARENT(nodei)]);
84 nelts[PARENT(nodei)] = nelts[nodei-1] + nelts[nodei];
85 debug2(printf("Created list %p of length %d at node %d\n",
86 heap[PARENT(nodei)],nelts[PARENT(nodei)],PARENT(nodei)));
87
88 #ifdef DEBUG
89 for (i = 0; i < nelts[PARENT(nodei)]; i++) {
90 printf("%llu\n",heap[PARENT(nodei)][i]);
91 }
92 #endif
93
94 /* Don't free original lists (when nodei >= nstreams) */
95 debug(printf("Freeing nodes %d and %d\n",nodei-1,nodei));
96 if (nodei < nstreams) {
97 FREE_ALIGN(heap[nodei]);
98 }
99 if (nodei-1 < nstreams) {
100 FREE_ALIGN(heap[nodei-1]);
101 }
102 nodei -= 2;
103 }
104
105 pyramid_end = PARENT(pyramid_end);
106 pyramid_start = PARENT(pyramid_start);
107 }
108
109 debug(printf("Returning ancestor %d\n\n",pyramid_start));
110 return pyramid_start;
111 }
112 #endif
113
114
115 #if defined(HAVE_AVX512) || defined(HAVE_AVX2)
116 static UINT8 **
make_diagonals_heap(int ** nelts,int * ncopied,unsigned char ** stream_high_array,UINT4 ** stream_low_array,int * streamsize_array,int * diagterm_array,int nstreams)117 make_diagonals_heap (int **nelts, int *ncopied,
118 unsigned char **stream_high_array, UINT4 **stream_low_array,
119 int *streamsize_array, int *diagterm_array, int nstreams) {
120 UINT8 **heap, **combined, *out, diagterm;
121 int heapsize, heapi, ncombined;
122 int *order, *totals, total, n, i, j, k, l;
123
124 debug(printf("Entered make_diagonals_heap\n"));
125
126 /* Put smallest streams at the end of the heap, to increase speed */
127 /* Note: Extra space needed for Sedgesort, but querylength > (query_lastpos+1) + 1 */
128 order = Sedgesort_order_int(streamsize_array,nstreams);
129
130 combined = (UINT8 **) MALLOC(nstreams*sizeof(UINT8 *));
131 totals = (int *) MALLOC(nstreams*sizeof(int));
132 ncombined = 0;
133
134 /* Combine the smallest streams, up to CUTOFF, and just use Sedgesort */
135 i = 0;
136 total = 1; /* To start the loop */
137 while (i < nstreams && total > 0) {
138 total = 0;
139 j = i;
140 while (j < nstreams && total + streamsize_array[order[j]] < CUTOFF) {
141 total += streamsize_array[order[j++]];
142 }
143
144 if (total > 0) {
145 debug(printf("Merging from %d to %d with %d total elements\n",i,j-1,total));
146 totals[ncombined] = total;
147 /* Need an extra value for Sedgesort_uint8 */
148 out = combined[ncombined] = (UINT8 *) MALLOC_ALIGN((total+1)*sizeof(UINT8));
149 for (k = i; k < j; k++) {
150 n = streamsize_array[order[k]];
151
152 /* Combine stream_high and stream_low and add diagterm (Could use SIMD here) */
153 diagterm = (UINT8) diagterm_array[order[k]];
154 for (l = 0; l < n; l++) {
155 out[l] = GETPOS(stream_high_array[order[k]][l],stream_low_array[order[k]][l]) + diagterm;
156 }
157
158 out += n;
159 }
160 Sedgesort_uint8(combined[ncombined++],total);
161 i = j;
162 }
163 }
164
165 *ncopied = (nstreams - i) + ncombined;
166 heapsize = 2*(*ncopied) - 1;
167
168 heap = (UINT8 **) CALLOC((heapsize + 1),sizeof(UINT8 *));
169 *nelts = (int *) CALLOC((heapsize + 1),sizeof(int));
170
171 heapi = heapsize;
172 /* Handle individual contents: Start with i value from before */
173 while (i < nstreams) {
174 n = (*nelts)[heapi] = streamsize_array[order[i]];
175
176 /* Copy to make the merging process non-destructive */
177 out = heap[heapi] = MALLOC_ALIGN(n*sizeof(UINT8));
178 CHECK_ALIGN(heap[heapi]);
179
180 /* Combine stream_high and stream_low and add diagterm (Could use SIMD here) */
181 diagterm = (UINT8) diagterm_array[order[i]];
182 for (l = 0; l < n; l++) {
183 out[l] = GETPOS(stream_high_array[order[i]][l],stream_low_array[order[i]][l]) + diagterm;
184 }
185
186 #ifdef DEBUG
187 printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
188 for (k = 0; k < (*nelts)[heapi]; k++) {
189 printf(" %llu",heap[heapi][k]);
190 }
191 printf("\n");
192 #endif
193 heapi--;
194 i++;
195 }
196
197 /* Handle combined contents */
198 for (i = 0; i < ncombined; i++) {
199 heap[heapi] = combined[i];
200 (*nelts)[heapi] = totals[i];
201 #ifdef DEBUG
202 printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
203 for (k = 0; k < (*nelts)[heapi]; k++) {
204 printf(" %llu",heap[heapi][k]);
205 }
206 printf("\n");
207 #endif
208 heapi--;
209 }
210
211 FREE(totals);
212 FREE(combined);
213 FREE(order);
214
215 return heap;
216 }
217 #endif
218
219
220 /* For non-AVX2, non-AVX512 code, see merge-diagonals-heap.c */
221 #if defined(HAVE_AVX512) || defined(HAVE_AVX2)
222 /* Input: streams might be aligned or not. Caller will have to free appropriately */
223 /* Output: aligned */
224 Univcoord_T *
Merge_diagonals_large(int * nelts1,unsigned char ** stream_high_array,UINT4 ** stream_low_array,int * streamsize_array,int * diagterm_array,int nstreams)225 Merge_diagonals_large (int *nelts1, unsigned char **stream_high_array, UINT4 **stream_low_array,
226 int *streamsize_array, int *diagterm_array, int nstreams) {
227 Univcoord_T *result, **heap, diagterm;
228 unsigned char *stream_high;
229 UINT4 *stream_low;
230 int *nelts, l;
231 int ncopied, heapi, heapsize, base, ancestori, pyramid_start, pyramid_end;
232 int bits;
233 #ifdef DEBUG
234 int i;
235 #endif
236
237
238 if (nstreams == 0) {
239 *nelts1 = 0;
240 return (Univcoord_T *) NULL;
241
242 } else if (nstreams == 1) {
243 *nelts1 = streamsize_array[0];
244 stream_high = stream_high_array[0];
245 stream_low = stream_low_array[0];
246 diagterm = diagterm_array[0];
247
248 result = MALLOC_ALIGN((*nelts1)*sizeof(UINT8)); /* Output must be aligned */
249 /* Combine stream_high and stream_low and add diagterm (Could use SIMD here) */
250 for (l = 0; l < *nelts1; l++) {
251 result[l] = GETPOS(stream_high[l],stream_low[l]) + diagterm;
252 }
253
254 return result;
255
256 } else {
257 heap = make_diagonals_heap(&nelts,&ncopied,stream_high_array,stream_low_array,
258 streamsize_array,diagterm_array,nstreams);
259 if (ncopied == 1) {
260 *nelts1 = nelts[1];
261 result = heap[1];
262 } else {
263 heapsize = 2*ncopied - 1; /* also index of last node */
264 #ifdef HAVE_BUILTIN_CLZ
265 bits = 31 - __builtin_clz((unsigned int) heapsize);
266 #elif defined(HAVE_ASM_BSR)
267 asm("bsr %1,%0" : "=r"(bits) : "r"(heapsize));
268 #else
269 bits = 31 - ((heapsize >> 16) ? clz_table[heapsize >> 16] : 16 + clz_table[heapsize]);
270 #endif
271
272 base = (1 << bits);
273 debug(printf("nstreams %d, ncopied %d, heapsize %d, clz %d, bits %d, base %d\n",
274 nstreams,ncopied,heapsize,__builtin_clz(heapsize),bits,base));
275
276 /* Middle pyramids */
277 while (base > PYRAMID_SIZE) {
278 for (pyramid_start = 2*base - PYRAMID_SIZE, pyramid_end = 2*base - 1; pyramid_start >= base;
279 pyramid_start -= PYRAMID_SIZE, pyramid_end -= PYRAMID_SIZE) {
280 debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
281 ancestori = pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
282 }
283 base = ancestori;
284 }
285
286 /* Last pyramid */
287 pyramid_start = base;
288 pyramid_end = 2*base - 1;
289 debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
290 /* base = */ pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
291
292 *nelts1 = nelts[1];
293 result = heap[1];
294
295 for (heapi = heapsize; heapi > heapsize - ncopied; heapi--) {
296 FREE_ALIGN(heap[heapi]);
297 }
298 }
299
300 FREE(heap);
301 FREE(nelts);
302 }
303
304
305 #ifdef DEBUG
306 printf("Merge_diagonals_large returning result of length %d\n",*nelts1);
307 for (i = 0; i < *nelts1; i++) {
308 printf("%llu\n",result[i]);
309 }
310 #endif
311
312 return result;
313 }
314 #endif
315
316
317 #if defined(HAVE_AVX512) || defined(HAVE_AVX2)
318 static Univcoord_T **
make_univdiagonals_heap(int ** nelts,int * ncopied,Univcoord_T ** stream_array,int * streamsize_array,int nstreams)319 make_univdiagonals_heap (int **nelts, int *ncopied, Univcoord_T **stream_array,
320 int *streamsize_array, int nstreams) {
321 Univcoord_T **heap, **combined, *out;
322 int heapsize, heapi, ncombined;
323 int *order, *totals, total, n, i, j, k;
324
325 debug(printf("Entered make_univdiagonals_heap\n"));
326 #ifdef DEBUG
327 for (i = 0; i < nstreams; i++) {
328 printf("Stream %d:",i);
329 for (j = 0; j < streamsize_array[i]; j++) {
330 printf(" %llu",stream_array[i][j]);
331 }
332 printf("\n");
333 }
334 #endif
335
336
337 /* Put smallest streams at the end of the heap, to increase speed */
338 /* Note: Extra space needed for Sedgesort, but querylength > (query_lastpos+1) + 1 */
339 order = Sedgesort_order_int(streamsize_array,nstreams);
340
341 combined = (Univcoord_T **) MALLOC(nstreams*sizeof(Univcoord_T *));
342 totals = (int *) MALLOC(nstreams*sizeof(int));
343 ncombined = 0;
344
345 /* Combine the smallest streams, up to CUTOFF, and just use Sedgesort */
346 i = 0;
347 total = 1; /* To start the loop */
348 while (i < nstreams && total > 0) {
349 total = 0;
350 j = i;
351 while (j < nstreams && total + streamsize_array[order[j]] < CUTOFF) {
352 total += streamsize_array[order[j++]];
353 }
354
355 if (total > 0) {
356 debug(printf("Merging from %d to %d with %d total elements\n",i,j-1,total));
357 totals[ncombined] = total;
358 /* Need an extra value for Sedgesort_uint8 */
359 out = combined[ncombined] = (Univcoord_T *) MALLOC_ALIGN((total+1)*sizeof(Univcoord_T));
360 for (k = i; k < j; k++) {
361 n = streamsize_array[order[k]];
362 memcpy(out,stream_array[order[k]],n*sizeof(Univcoord_T));
363 out += n;
364 }
365 Sedgesort_uint8(combined[ncombined++],total);
366 i = j;
367 }
368 }
369
370 *ncopied = (nstreams - i) + ncombined;
371 heapsize = 2*(*ncopied) - 1;
372
373 heap = (Univcoord_T **) CALLOC((heapsize + 1),sizeof(Univcoord_T *));
374 *nelts = (int *) CALLOC((heapsize + 1),sizeof(int));
375
376 heapi = heapsize;
377 /* Handle individual contents: Start with i value from before */
378 while (i < nstreams) {
379 n = (*nelts)[heapi] = streamsize_array[order[i]];
380
381 /* Copy to make the merging process non-destructive */
382 out = heap[heapi] = MALLOC_ALIGN(n*sizeof(Univcoord_T));
383 CHECK_ALIGN(heap[heapi]);
384 memcpy(heap[heapi],stream_array[order[i]],n*sizeof(Univcoord_T));
385
386 #ifdef DEBUG
387 printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
388 for (k = 0; k < (*nelts)[heapi]; k++) {
389 printf(" %llu",heap[heapi][k]);
390 }
391 printf("\n");
392 #endif
393 heapi--;
394 i++;
395 }
396
397 /* Handle combined contents */
398 for (i = 0; i < ncombined; i++) {
399 heap[heapi] = combined[i];
400 (*nelts)[heapi] = totals[i];
401 #ifdef DEBUG
402 printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
403 for (k = 0; k < (*nelts)[heapi]; k++) {
404 printf(" %llu",heap[heapi][k]);
405 }
406 printf("\n");
407 #endif
408 heapi--;
409 }
410
411 FREE(totals);
412 FREE(combined);
413 FREE(order);
414
415 return heap;
416 }
417 #endif
418
419
420 #if defined(HAVE_AVX512) || defined(HAVE_AVX2)
421 /* kmer-search.c calls Merge_diagonals_large with stream_high_list and
422 stream_low_list. localdb.c calls Merge_diagonals_uint4.
423 No procedure calls Merge_diagonals_uint8 with stream_list. */
424 Univcoord_T *
Merge_diagonals_uint8(int * nelts1,Univcoord_T ** stream_array,int * streamsize_array,int nstreams)425 Merge_diagonals_uint8 (int *nelts1, Univcoord_T **stream_array, int *streamsize_array,
426 int nstreams) {
427 Univcoord_T *result, **heap, *stream;
428 int *nelts;
429 int ncopied, heapi, heapsize, base, ancestori, pyramid_start, pyramid_end;
430 int bits;
431 #ifdef DEBUG
432 int i;
433 #endif
434
435
436 if (nstreams == 0) {
437 *nelts1 = 0;
438 return (Univcoord_T *) NULL;
439
440 } else if (nstreams == 1) {
441 *nelts1 = streamsize_array[0];
442 stream = stream_array[0];
443
444 result = MALLOC_ALIGN((*nelts1)*sizeof(Univcoord_T)); /* Output must be aligned */
445 memcpy(result,stream,(*nelts1)*sizeof(Univcoord_T));
446 return result;
447
448 } else {
449 heap = make_univdiagonals_heap(&nelts,&ncopied,stream_array,streamsize_array,nstreams);
450 if (ncopied == 1) {
451 *nelts1 = nelts[1];
452 result = heap[1];
453 } else {
454 heapsize = 2*ncopied - 1; /* also index of last node */
455 #ifdef HAVE_BUILTIN_CLZ
456 bits = 31 - __builtin_clz((unsigned int) heapsize);
457 #elif defined(HAVE_ASM_BSR)
458 asm("bsr %1,%0" : "=r"(bits) : "r"(heapsize));
459 #else
460 bits = 31 - ((heapsize >> 16) ? clz_table[heapsize >> 16] : 16 + clz_table[heapsize]);
461 #endif
462
463 base = (1 << bits);
464 debug(printf("nstreams %d, ncopied %d, heapsize %d, clz %d, bits %d, base %d\n",
465 nstreams,ncopied,heapsize,__builtin_clz(heapsize),bits,base));
466
467 /* Middle pyramids */
468 while (base > PYRAMID_SIZE) {
469 for (pyramid_start = 2*base - PYRAMID_SIZE, pyramid_end = 2*base - 1; pyramid_start >= base;
470 pyramid_start -= PYRAMID_SIZE, pyramid_end -= PYRAMID_SIZE) {
471 debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
472 ancestori = pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
473 }
474 base = ancestori;
475 }
476
477 /* Last pyramid */
478 pyramid_start = base;
479 pyramid_end = 2*base - 1;
480 debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
481 /* base = */ pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
482
483 *nelts1 = nelts[1];
484 result = heap[1];
485
486 for (heapi = heapsize; heapi > heapsize - ncopied; heapi--) {
487 FREE_ALIGN(heap[heapi]);
488 }
489 }
490
491 FREE(heap);
492 FREE(nelts);
493 }
494
495
496 #ifdef DEBUG
497 printf("Merge_diagonals_uint8 returning result of length %d\n",*nelts1);
498 for (i = 0; i < *nelts1; i++) {
499 printf("%llu\n",result[i]);
500 }
501 #endif
502
503 return result;
504 }
505 #endif
506
507
508