1 static char rcsid[] = "$Id: merge-diagonals-simd-uint8.c 222445 2020-04-21 17:53:32Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "merge-diagonals-simd-uint8.h"
7 #include "assert.h"
8 #include "mem.h"
9 #include "popcount.h"		/* For clz_table */
10 #include "sedgesort.h"
11 #include "merge-uint8.h"
12 
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>		/* For memcpy */
16 
17 
18 #if defined(HAVE_SSE4_1)
19 #include <smmintrin.h>
20 #endif
21 #if defined(HAVE_AVX2)
22 #include <immintrin.h>
23 #endif
24 #if defined(HAVE_AVX512)
25 #include <immintrin.h>
26 #endif
27 
28 
29 /* #define PYRAMID_SIZE 4 */
30 
31 #define CUTOFF 1000
32 #define PYRAMID_SIZE 32
33 
34 #define GETPOS(high,low) (((Univcoord_T) high << 32) + low)
35 
36 
37 #ifdef DEBUG0
38 #define debug0(x) x
39 #else
40 #define debug0(x)
41 #endif
42 
43 #ifdef DEBUG
44 #define debug(x) x
45 #else
46 #define debug(x)
47 #endif
48 
49 #ifdef DEBUG2
50 #define debug2(x) x
51 #else
52 #define debug2(x)
53 #endif
54 
55 
56 
57 #define PARENT(i) (i >> 1)
58 #define LEFT(i) (i << 1)
59 #define RIGHT(i) ((i << 1) | 1)
60 
61 #if defined(HAVE_AVX512) || defined(HAVE_AVX2)
62 static int
pyramid_merge(UINT8 ** heap,int nstreams,int heapsize,int * nelts,int pyramid_start,int pyramid_end)63 pyramid_merge (UINT8 **heap, int nstreams, int heapsize, int *nelts,
64 	       int pyramid_start, int pyramid_end) {
65   int nodei;
66 #ifdef DEBUG
67   int i;
68 #endif
69 
70   while (pyramid_end > pyramid_start) {
71     debug(printf("Merging level: %d..%d for heapsize %d\n",pyramid_start,pyramid_end,heapsize));
72 
73     if (pyramid_end > heapsize) {
74       nodei = heapsize;
75     } else {
76       nodei = pyramid_end;
77     }
78 
79     while (nodei >= pyramid_start) {
80       debug2(printf("Merging nodes %d (%d elts) and %d (%d elts) => %d\n",
81 		    nodei-1,nelts[nodei-1],nodei,nelts[nodei],PARENT(nodei)));
82       heap[PARENT(nodei)] = Merge_uint8(/*dest*/NULL,heap[nodei-1],heap[nodei],nelts[nodei-1],nelts[nodei]);
83       CHECK_ALIGN(heap[PARENT(nodei)]);
84       nelts[PARENT(nodei)] = nelts[nodei-1] + nelts[nodei];
85       debug2(printf("Created list %p of length %d at node %d\n",
86 		    heap[PARENT(nodei)],nelts[PARENT(nodei)],PARENT(nodei)));
87 
88 #ifdef DEBUG
89       for (i = 0; i < nelts[PARENT(nodei)]; i++) {
90 	printf("%llu\n",heap[PARENT(nodei)][i]);
91       }
92 #endif
93 
94       /* Don't free original lists (when nodei >= nstreams) */
95       debug(printf("Freeing nodes %d and %d\n",nodei-1,nodei));
96       if (nodei < nstreams) {
97 	FREE_ALIGN(heap[nodei]);
98       }
99       if (nodei-1 < nstreams) {
100 	FREE_ALIGN(heap[nodei-1]);
101       }
102       nodei -= 2;
103     }
104 
105     pyramid_end = PARENT(pyramid_end);
106     pyramid_start = PARENT(pyramid_start);
107   }
108 
109   debug(printf("Returning ancestor %d\n\n",pyramid_start));
110   return pyramid_start;
111 }
112 #endif
113 
114 
115 #if defined(HAVE_AVX512) || defined(HAVE_AVX2)
116 static UINT8 **
make_diagonals_heap(int ** nelts,int * ncopied,unsigned char ** stream_high_array,UINT4 ** stream_low_array,int * streamsize_array,int * diagterm_array,int nstreams)117 make_diagonals_heap (int **nelts, int *ncopied,
118 		     unsigned char **stream_high_array, UINT4 **stream_low_array,
119 		     int *streamsize_array, int *diagterm_array, int nstreams) {
120   UINT8 **heap, **combined, *out, diagterm;
121   int heapsize, heapi, ncombined;
122   int *order, *totals, total, n, i, j, k, l;
123 
124   debug(printf("Entered make_diagonals_heap\n"));
125 
126   /* Put smallest streams at the end of the heap, to increase speed */
127   /* Note: Extra space needed for Sedgesort, but querylength > (query_lastpos+1) + 1 */
128   order = Sedgesort_order_int(streamsize_array,nstreams);
129 
130   combined = (UINT8 **) MALLOC(nstreams*sizeof(UINT8 *));
131   totals = (int *) MALLOC(nstreams*sizeof(int));
132   ncombined = 0;
133 
134   /* Combine the smallest streams, up to CUTOFF, and just use Sedgesort */
135   i = 0;
136   total = 1;			/* To start the loop */
137   while (i < nstreams && total > 0) {
138     total = 0;
139     j = i;
140     while (j < nstreams && total + streamsize_array[order[j]] < CUTOFF) {
141       total += streamsize_array[order[j++]];
142     }
143 
144     if (total > 0) {
145       debug(printf("Merging from %d to %d with %d total elements\n",i,j-1,total));
146       totals[ncombined] = total;
147       /* Need an extra value for Sedgesort_uint8 */
148       out = combined[ncombined] = (UINT8 *) MALLOC_ALIGN((total+1)*sizeof(UINT8));
149       for (k = i; k < j; k++) {
150 	n = streamsize_array[order[k]];
151 
152 	/* Combine stream_high and stream_low and add diagterm (Could use SIMD here) */
153 	diagterm = (UINT8) diagterm_array[order[k]];
154 	for (l = 0; l < n; l++) {
155 	  out[l] = GETPOS(stream_high_array[order[k]][l],stream_low_array[order[k]][l]) + diagterm;
156 	}
157 
158 	out += n;
159       }
160       Sedgesort_uint8(combined[ncombined++],total);
161       i = j;
162     }
163   }
164 
165   *ncopied = (nstreams - i) + ncombined;
166   heapsize = 2*(*ncopied) - 1;
167 
168   heap = (UINT8 **) CALLOC((heapsize + 1),sizeof(UINT8 *));
169   *nelts = (int *) CALLOC((heapsize + 1),sizeof(int));
170 
171   heapi = heapsize;
172   /* Handle individual contents: Start with i value from before */
173   while (i < nstreams) {
174     n = (*nelts)[heapi] = streamsize_array[order[i]];
175 
176     /* Copy to make the merging process non-destructive */
177     out = heap[heapi] = MALLOC_ALIGN(n*sizeof(UINT8));
178     CHECK_ALIGN(heap[heapi]);
179 
180     /* Combine stream_high and stream_low and add diagterm (Could use SIMD here) */
181     diagterm = (UINT8) diagterm_array[order[i]];
182     for (l = 0; l < n; l++) {
183       out[l] = GETPOS(stream_high_array[order[i]][l],stream_low_array[order[i]][l]) + diagterm;
184     }
185 
186 #ifdef DEBUG
187     printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
188     for (k = 0; k < (*nelts)[heapi]; k++) {
189       printf(" %llu",heap[heapi][k]);
190     }
191     printf("\n");
192 #endif
193     heapi--;
194     i++;
195   }
196 
197   /* Handle combined contents */
198   for (i = 0; i < ncombined; i++) {
199     heap[heapi] = combined[i];
200     (*nelts)[heapi] = totals[i];
201 #ifdef DEBUG
202     printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
203     for (k = 0; k < (*nelts)[heapi]; k++) {
204       printf(" %llu",heap[heapi][k]);
205     }
206     printf("\n");
207 #endif
208     heapi--;
209   }
210 
211   FREE(totals);
212   FREE(combined);
213   FREE(order);
214 
215   return heap;
216 }
217 #endif
218 
219 
220 /* For non-AVX2, non-AVX512 code, see merge-diagonals-heap.c */
221 #if defined(HAVE_AVX512) || defined(HAVE_AVX2)
222 /* Input: streams might be aligned or not.  Caller will have to free appropriately */
223 /* Output: aligned */
224 Univcoord_T *
Merge_diagonals_large(int * nelts1,unsigned char ** stream_high_array,UINT4 ** stream_low_array,int * streamsize_array,int * diagterm_array,int nstreams)225 Merge_diagonals_large (int *nelts1, unsigned char **stream_high_array, UINT4 **stream_low_array,
226 		       int *streamsize_array, int *diagterm_array, int nstreams) {
227   Univcoord_T *result, **heap, diagterm;
228   unsigned char *stream_high;
229   UINT4 *stream_low;
230   int *nelts, l;
231   int ncopied, heapi, heapsize, base, ancestori, pyramid_start, pyramid_end;
232   int bits;
233 #ifdef DEBUG
234   int i;
235 #endif
236 
237 
238   if (nstreams == 0) {
239     *nelts1 = 0;
240     return (Univcoord_T *) NULL;
241 
242   } else if (nstreams == 1) {
243     *nelts1 = streamsize_array[0];
244     stream_high = stream_high_array[0];
245     stream_low = stream_low_array[0];
246     diagterm = diagterm_array[0];
247 
248     result = MALLOC_ALIGN((*nelts1)*sizeof(UINT8)); /* Output must be aligned */
249     /* Combine stream_high and stream_low and add diagterm (Could use SIMD here) */
250     for (l = 0; l < *nelts1; l++) {
251       result[l] = GETPOS(stream_high[l],stream_low[l]) + diagterm;
252     }
253 
254     return result;
255 
256   } else {
257     heap = make_diagonals_heap(&nelts,&ncopied,stream_high_array,stream_low_array,
258 			       streamsize_array,diagterm_array,nstreams);
259     if (ncopied == 1) {
260       *nelts1 = nelts[1];
261       result = heap[1];
262     } else {
263       heapsize = 2*ncopied - 1;	/* also index of last node */
264 #ifdef HAVE_BUILTIN_CLZ
265       bits = 31 - __builtin_clz((unsigned int) heapsize);
266 #elif defined(HAVE_ASM_BSR)
267       asm("bsr %1,%0" : "=r"(bits) : "r"(heapsize));
268 #else
269       bits = 31 - ((heapsize >> 16) ? clz_table[heapsize >> 16] : 16 + clz_table[heapsize]);
270 #endif
271 
272       base = (1 << bits);
273       debug(printf("nstreams %d, ncopied %d, heapsize %d, clz %d, bits %d, base %d\n",
274 		   nstreams,ncopied,heapsize,__builtin_clz(heapsize),bits,base));
275 
276       /* Middle pyramids */
277       while (base > PYRAMID_SIZE) {
278 	for (pyramid_start = 2*base - PYRAMID_SIZE, pyramid_end = 2*base - 1; pyramid_start >= base;
279 	     pyramid_start -= PYRAMID_SIZE, pyramid_end -= PYRAMID_SIZE) {
280 	  debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
281 	  ancestori = pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
282 	}
283 	base = ancestori;
284       }
285 
286       /* Last pyramid */
287       pyramid_start = base;
288       pyramid_end = 2*base - 1;
289       debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
290       /* base = */ pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
291 
292       *nelts1 = nelts[1];
293       result = heap[1];
294 
295       for (heapi = heapsize; heapi > heapsize - ncopied; heapi--) {
296 	FREE_ALIGN(heap[heapi]);
297       }
298     }
299 
300     FREE(heap);
301     FREE(nelts);
302   }
303 
304 
305 #ifdef DEBUG
306   printf("Merge_diagonals_large returning result of length %d\n",*nelts1);
307   for (i = 0; i < *nelts1; i++) {
308     printf("%llu\n",result[i]);
309   }
310 #endif
311 
312   return result;
313 }
314 #endif
315 
316 
317 #if defined(HAVE_AVX512) || defined(HAVE_AVX2)
318 static Univcoord_T **
make_univdiagonals_heap(int ** nelts,int * ncopied,Univcoord_T ** stream_array,int * streamsize_array,int nstreams)319 make_univdiagonals_heap (int **nelts, int *ncopied, Univcoord_T **stream_array,
320 			 int *streamsize_array, int nstreams) {
321   Univcoord_T **heap, **combined, *out;
322   int heapsize, heapi, ncombined;
323   int *order, *totals, total, n, i, j, k;
324 
325   debug(printf("Entered make_univdiagonals_heap\n"));
326 #ifdef DEBUG
327   for (i = 0; i < nstreams; i++) {
328     printf("Stream %d:",i);
329     for (j = 0; j < streamsize_array[i]; j++) {
330       printf(" %llu",stream_array[i][j]);
331     }
332     printf("\n");
333   }
334 #endif
335 
336 
337   /* Put smallest streams at the end of the heap, to increase speed */
338   /* Note: Extra space needed for Sedgesort, but querylength > (query_lastpos+1) + 1 */
339   order = Sedgesort_order_int(streamsize_array,nstreams);
340 
341   combined = (Univcoord_T **) MALLOC(nstreams*sizeof(Univcoord_T *));
342   totals = (int *) MALLOC(nstreams*sizeof(int));
343   ncombined = 0;
344 
345   /* Combine the smallest streams, up to CUTOFF, and just use Sedgesort */
346   i = 0;
347   total = 1;			/* To start the loop */
348   while (i < nstreams && total > 0) {
349     total = 0;
350     j = i;
351     while (j < nstreams && total + streamsize_array[order[j]] < CUTOFF) {
352       total += streamsize_array[order[j++]];
353     }
354 
355     if (total > 0) {
356       debug(printf("Merging from %d to %d with %d total elements\n",i,j-1,total));
357       totals[ncombined] = total;
358       /* Need an extra value for Sedgesort_uint8 */
359       out = combined[ncombined] = (Univcoord_T *) MALLOC_ALIGN((total+1)*sizeof(Univcoord_T));
360       for (k = i; k < j; k++) {
361 	n = streamsize_array[order[k]];
362 	memcpy(out,stream_array[order[k]],n*sizeof(Univcoord_T));
363 	out += n;
364       }
365       Sedgesort_uint8(combined[ncombined++],total);
366       i = j;
367     }
368   }
369 
370   *ncopied = (nstreams - i) + ncombined;
371   heapsize = 2*(*ncopied) - 1;
372 
373   heap = (Univcoord_T **) CALLOC((heapsize + 1),sizeof(Univcoord_T *));
374   *nelts = (int *) CALLOC((heapsize + 1),sizeof(int));
375 
376   heapi = heapsize;
377   /* Handle individual contents: Start with i value from before */
378   while (i < nstreams) {
379     n = (*nelts)[heapi] = streamsize_array[order[i]];
380 
381     /* Copy to make the merging process non-destructive */
382     out = heap[heapi] = MALLOC_ALIGN(n*sizeof(Univcoord_T));
383     CHECK_ALIGN(heap[heapi]);
384     memcpy(heap[heapi],stream_array[order[i]],n*sizeof(Univcoord_T));
385 
386 #ifdef DEBUG
387     printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
388     for (k = 0; k < (*nelts)[heapi]; k++) {
389       printf(" %llu",heap[heapi][k]);
390     }
391     printf("\n");
392 #endif
393     heapi--;
394     i++;
395   }
396 
397   /* Handle combined contents */
398   for (i = 0; i < ncombined; i++) {
399     heap[heapi] = combined[i];
400     (*nelts)[heapi] = totals[i];
401 #ifdef DEBUG
402     printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
403     for (k = 0; k < (*nelts)[heapi]; k++) {
404       printf(" %llu",heap[heapi][k]);
405     }
406     printf("\n");
407 #endif
408     heapi--;
409   }
410 
411   FREE(totals);
412   FREE(combined);
413   FREE(order);
414 
415   return heap;
416 }
417 #endif
418 
419 
420 #if defined(HAVE_AVX512) || defined(HAVE_AVX2)
421 /* kmer-search.c calls Merge_diagonals_large with stream_high_list and
422    stream_low_list.  localdb.c calls Merge_diagonals_uint4.
423    No procedure calls Merge_diagonals_uint8 with stream_list. */
424 Univcoord_T *
Merge_diagonals_uint8(int * nelts1,Univcoord_T ** stream_array,int * streamsize_array,int nstreams)425 Merge_diagonals_uint8 (int *nelts1, Univcoord_T **stream_array, int *streamsize_array,
426 		       int nstreams) {
427   Univcoord_T *result, **heap, *stream;
428   int *nelts;
429   int ncopied, heapi, heapsize, base, ancestori, pyramid_start, pyramid_end;
430   int bits;
431 #ifdef DEBUG
432   int i;
433 #endif
434 
435 
436   if (nstreams == 0) {
437     *nelts1 = 0;
438     return (Univcoord_T *) NULL;
439 
440   } else if (nstreams == 1) {
441     *nelts1 = streamsize_array[0];
442     stream = stream_array[0];
443 
444     result = MALLOC_ALIGN((*nelts1)*sizeof(Univcoord_T)); /* Output must be aligned */
445     memcpy(result,stream,(*nelts1)*sizeof(Univcoord_T));
446     return result;
447 
448   } else {
449     heap = make_univdiagonals_heap(&nelts,&ncopied,stream_array,streamsize_array,nstreams);
450     if (ncopied == 1) {
451       *nelts1 = nelts[1];
452       result = heap[1];
453     } else {
454       heapsize = 2*ncopied - 1;	/* also index of last node */
455 #ifdef HAVE_BUILTIN_CLZ
456       bits = 31 - __builtin_clz((unsigned int) heapsize);
457 #elif defined(HAVE_ASM_BSR)
458       asm("bsr %1,%0" : "=r"(bits) : "r"(heapsize));
459 #else
460       bits = 31 - ((heapsize >> 16) ? clz_table[heapsize >> 16] : 16 + clz_table[heapsize]);
461 #endif
462 
463       base = (1 << bits);
464       debug(printf("nstreams %d, ncopied %d, heapsize %d, clz %d, bits %d, base %d\n",
465 		   nstreams,ncopied,heapsize,__builtin_clz(heapsize),bits,base));
466 
467       /* Middle pyramids */
468       while (base > PYRAMID_SIZE) {
469 	for (pyramid_start = 2*base - PYRAMID_SIZE, pyramid_end = 2*base - 1; pyramid_start >= base;
470 	     pyramid_start -= PYRAMID_SIZE, pyramid_end -= PYRAMID_SIZE) {
471 	  debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
472 	  ancestori = pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
473 	}
474 	base = ancestori;
475       }
476 
477       /* Last pyramid */
478       pyramid_start = base;
479       pyramid_end = 2*base - 1;
480       debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
481       /* base = */ pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
482 
483       *nelts1 = nelts[1];
484       result = heap[1];
485 
486       for (heapi = heapsize; heapi > heapsize - ncopied; heapi--) {
487 	FREE_ALIGN(heap[heapi]);
488       }
489     }
490 
491     FREE(heap);
492     FREE(nelts);
493   }
494 
495 
496 #ifdef DEBUG
497   printf("Merge_diagonals_uint8 returning result of length %d\n",*nelts1);
498   for (i = 0; i < *nelts1; i++) {
499     printf("%llu\n",result[i]);
500   }
501 #endif
502 
503   return result;
504 }
505 #endif
506 
507 
508