1 static char rcsid[] = "$Id: merge-diagonals-simd-uint4.c 222629 2020-05-20 00:05:59Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "merge-diagonals-simd-uint4.h"
7 #include "assert.h"
8 #include "mem.h"
9 #include "popcount.h"		/* For clz_table */
10 #include "sedgesort.h"
11 #include "merge-uint4.h"
12 
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>		/* For memcpy */
16 
17 
18 #if defined(HAVE_SSE4_1)
19 #include <smmintrin.h>
20 #endif
21 #if defined(HAVE_AVX2)
22 #include <immintrin.h>
23 #endif
24 #if defined(HAVE_AVX512)
25 #include <immintrin.h>
26 #endif
27 
28 
29 /* #define PYRAMID_SIZE 4 */
30 
31 #define CUTOFF 1000
32 #define PYRAMID_SIZE 32
33 
34 
35 #ifdef DEBUG
36 #define debug(x) x
37 #else
38 #define debug(x)
39 #endif
40 
41 /* pyramid_merge */
42 #ifdef DEBUG2
43 #define debug2(x) x
44 #else
45 #define debug2(x)
46 #endif
47 
48 
49 #ifdef CHECK_INPUTS
50 static void
check_ascending(unsigned int * diagonals,int ndiagonals)51 check_ascending (unsigned int *diagonals, int ndiagonals) {
52   int i;
53 
54   for (i = 1; i < ndiagonals; i++) {
55     if (diagonals[i] < diagonals[i-1]) {
56       abort();
57     }
58   }
59   return;
60 }
61 #endif
62 
63 
64 #define PARENT(i) (i >> 1)
65 #define LEFT(i) (i << 1)
66 #define RIGHT(i) ((i << 1) | 1)
67 
68 static int
pyramid_merge(unsigned int ** heap,int nstreams,int heapsize,int * nelts,int pyramid_start,int pyramid_end)69 pyramid_merge (unsigned int **heap, int nstreams, int heapsize, int *nelts,
70 	       int pyramid_start, int pyramid_end) {
71   int nodei;
72 #ifdef DEBUG
73   int i;
74 #endif
75 
76   while (pyramid_end > pyramid_start) {
77     debug(printf("Merging level: %d..%d for heapsize %d\n",pyramid_start,pyramid_end,heapsize));
78 
79     if (pyramid_end > heapsize) {
80       nodei = heapsize;
81     } else {
82       nodei = pyramid_end;
83     }
84 
85     while (nodei >= pyramid_start) {
86       debug2(printf("Merging nodes %d (%d elts) and %d (%d elts) => %d\n",
87 		    nodei-1,nelts[nodei-1],nodei,nelts[nodei],PARENT(nodei)));
88       heap[PARENT(nodei)] = Merge_uint4(/*dest*/NULL,heap[nodei-1],heap[nodei],nelts[nodei-1],nelts[nodei]);
89       CHECK_ALIGN(heap[PARENT(nodei)]);
90       nelts[PARENT(nodei)] = nelts[nodei-1] + nelts[nodei];
91       debug2(printf("Created list %p of length %d at node %d\n",
92 		    heap[PARENT(nodei)],nelts[PARENT(nodei)],PARENT(nodei)));
93 
94 #ifdef DEBUG
95       for (i = 0; i < nelts[PARENT(nodei)]; i++) {
96 	printf("%u\n",heap[PARENT(nodei)][i]);
97       }
98 #endif
99 
100       /* Don't free original lists (when nodei >= nstreams) */
101       debug(printf("Freeing nodes %d and %d\n",nodei-1,nodei));
102       if (nodei < nstreams) {
103 	FREE_ALIGN(heap[nodei]);
104       }
105       if (nodei-1 < nstreams) {
106 	FREE_ALIGN(heap[nodei-1]);
107       }
108       nodei -= 2;
109     }
110 
111     pyramid_end = PARENT(pyramid_end);
112     pyramid_start = PARENT(pyramid_start);
113   }
114 
115   debug(printf("Returning ancestor %d\n\n",pyramid_start));
116   return pyramid_start;
117 }
118 
119 
120 static UINT4 **
make_diagonals_heap(int ** nelts,int * ncopied,UINT4 ** stream_array,int * streamsize_array,int * diagterm_array,int nstreams)121 make_diagonals_heap (int **nelts, int *ncopied, UINT4 **stream_array,
122 		     int *streamsize_array, int *diagterm_array, int nstreams) {
123   UINT4 **heap, **combined, *out, diagterm;
124   int heapsize, heapi, ncombined;
125   int *order, *totals, total, n, i, j, k, l;
126 
127   /* Put smallest streams at the end of the heap, to increase speed */
128   /* Note: Extra space needed for Sedgesort, but querylength > (query_lastpos+1) + 1 */
129   order = Sedgesort_order_int(streamsize_array,nstreams);
130 
131   combined = (UINT4 **) MALLOC(nstreams*sizeof(UINT4 *));
132   totals = (int *) MALLOC(nstreams*sizeof(int));
133   ncombined = 0;
134 
135   /* Combine the smallest streams, up to CUTOFF, and just use Sedgesort */
136   i = 0;
137   total = 1;			/* To start the loop */
138   while (i < nstreams && total > 0) {
139     total = 0;
140     j = i;
141     while (j < nstreams && total + streamsize_array[order[j]] < CUTOFF) {
142       total += streamsize_array[order[j++]];
143     }
144 
145     if (total > 0) {
146       debug(printf("Merging from %d to %d with %d total elements\n",i,j-1,total));
147       totals[ncombined] = total;
148       /* Need an extra value for Sedgesort_uint4 */
149       out = combined[ncombined] = (UINT4 *) MALLOC_ALIGN((total+1)*sizeof(UINT4));
150       debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
151 			 out,out,__FILE__,__LINE__));
152       for (k = i; k < j; k++) {
153 	n = streamsize_array[order[k]];
154 	memcpy(out,stream_array[order[k]],n*sizeof(UINT4));
155 
156 	/* Add diagterm (Could use SIMD here) */
157 	diagterm = (UINT4) diagterm_array[order[k]];
158 	for (l = 0; l < n; l++) {
159 	  out[l] += diagterm;
160 	}
161 
162 	out += n;
163       }
164       Sedgesort_uint4(combined[ncombined++],total);
165       i = j;
166     }
167   }
168 
169   *ncopied = (nstreams - i) + ncombined;
170   heapsize = 2*(*ncopied) - 1;
171 
172   heap = (UINT4 **) CALLOC((heapsize + 1),sizeof(UINT4 *));
173   *nelts = (int *) CALLOC((heapsize + 1),sizeof(int));
174 
175   heapi = heapsize;
176   /* Handle individual contents: Start with i value from before */
177   while (i < nstreams) {
178     n = (*nelts)[heapi] = streamsize_array[order[i]];
179 
180     /* Copy to make the merging process non-destructive */
181     out = heap[heapi] = MALLOC_ALIGN(n*sizeof(UINT4));
182     debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
183 		       out,out,__FILE__,__LINE__));
184     CHECK_ALIGN(heap[heapi]);
185     memcpy(heap[heapi],stream_array[order[i]],n*sizeof(UINT4));
186 
187     /* Add diagterm (Could use SIMD here) */
188     diagterm = (UINT4) diagterm_array[order[i]];
189     for (l = 0; l < n; l++) {
190       out[l] += diagterm;
191     }
192 
193 #ifdef DEBUG
194     printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
195     for (k = 0; k < (*nelts)[heapi]; k++) {
196       printf(" %u",heap[heapi][k]);
197     }
198     printf("\n");
199 #endif
200     heapi--;
201     i++;
202   }
203 
204   /* Handle combined contents */
205   for (i = 0; i < ncombined; i++) {
206     heap[heapi] = combined[i];
207     (*nelts)[heapi] = totals[i];
208 #ifdef DEBUG
209     printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
210     for (k = 0; k < (*nelts)[heapi]; k++) {
211       printf(" %u",heap[heapi][k]);
212     }
213     printf("\n");
214 #endif
215     heapi--;
216   }
217 
218   FREE(totals);
219   FREE(combined);
220   FREE(order);
221 
222   return heap;
223 }
224 
225 
226 /* Input: streams might be aligned or not.  Caller will have to free appropriately */
227 /* Output: aligned */
228 UINT4 *
Merge_diagonals(int * nelts1,UINT4 ** stream_array,int * streamsize_array,int * diagterm_array,int nstreams)229 Merge_diagonals (int *nelts1, UINT4 **stream_array, int *streamsize_array,
230 		 int *diagterm_array, int nstreams) {
231   UINT4 *result, **heap, *stream, diagterm;
232   int *nelts, l;
233   int ncopied, heapi, heapsize, base, ancestori, pyramid_start, pyramid_end;
234   int bits;
235 #ifdef DEBUG
236   int i;
237 #endif
238 
239 
240   if (nstreams == 0) {
241     *nelts1 = 0;
242     return (UINT4 *) NULL;
243 
244   } else if (nstreams == 1) {
245     *nelts1 = streamsize_array[0];
246     stream = stream_array[0];
247     diagterm = diagterm_array[0];
248 
249     result = MALLOC_ALIGN((*nelts1)*sizeof(UINT4)); /* Output must be aligned */
250     debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
251 		       result,result,__FILE__,__LINE__));
252     memcpy(result,stream,(*nelts1)*sizeof(UINT4));
253 
254     /* Add diagterm (Could use SIMD here) */
255     for (l = 0; l < *nelts1; l++) {
256       result[l] += diagterm;
257     }
258 
259     return result;
260 
261   } else {
262     heap = make_diagonals_heap(&nelts,&ncopied,stream_array,streamsize_array,diagterm_array,nstreams);
263     if (ncopied == 1) {
264       *nelts1 = nelts[1];
265       result = heap[1];
266     } else {
267       heapsize = 2*ncopied - 1;	/* also index of last node */
268 #ifdef HAVE_BUILTIN_CLZ
269       bits = 31 - __builtin_clz((unsigned int) heapsize);
270 #elif defined(HAVE_ASM_BSR)
271       asm("bsr %1,%0" : "=r"(bits) : "r"(heapsize));
272 #else
273       bits = 31 - ((heapsize >> 16) ? clz_table[heapsize >> 16] : 16 + clz_table[heapsize]);
274 #endif
275 
276       base = (1 << bits);
277       debug(printf("nstreams %d, ncopied %d, heapsize %d, clz %d, bits %d, base %d\n",
278 		   nstreams,ncopied,heapsize,__builtin_clz(heapsize),bits,base));
279 
280       /* Middle pyramids */
281       while (base > PYRAMID_SIZE) {
282 	for (pyramid_start = 2*base - PYRAMID_SIZE, pyramid_end = 2*base - 1; pyramid_start >= base;
283 	     pyramid_start -= PYRAMID_SIZE, pyramid_end -= PYRAMID_SIZE) {
284 	  debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
285 	  ancestori = pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
286 	}
287 	base = ancestori;
288       }
289 
290       /* Last pyramid */
291       pyramid_start = base;
292       pyramid_end = 2*base - 1;
293       debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
294       /* base = */ pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
295 
296       *nelts1 = nelts[1];
297       result = heap[1];
298 
299       for (heapi = heapsize; heapi > heapsize - ncopied; heapi--) {
300 	FREE_ALIGN(heap[heapi]);
301       }
302     }
303 
304     FREE(heap);
305     FREE(nelts);
306   }
307 
308 
309 #ifdef DEBUG
310   printf("Merge_diagonals returning result of length %d\n",*nelts1);
311   for (i = 0; i < *nelts1; i++) {
312     printf("%u\n",result[i]);
313   }
314 #endif
315 
316   return result;
317 }
318 
319 
320 static UINT4 **
make_univdiagonals_heap(int ** nelts,int * ncopied,UINT4 ** stream_array,int * streamsize_array,int nstreams)321 make_univdiagonals_heap (int **nelts, int *ncopied, UINT4 **stream_array,
322 			 int *streamsize_array, int nstreams) {
323   UINT4 **heap, **combined, *out;
324   int heapsize, heapi, ncombined;
325   int *order, *totals, total, n, i, j, k;
326 
327   /* Put smallest streams at the end of the heap, to increase speed */
328   /* Note: Extra space needed for Sedgesort, but querylength > (query_lastpos+1) + 1 */
329   order = Sedgesort_order_int(streamsize_array,nstreams);
330 
331   combined = (UINT4 **) MALLOC(nstreams*sizeof(UINT4 *));
332   totals = (int *) MALLOC(nstreams*sizeof(int));
333   ncombined = 0;
334 
335   /* Combine the smallest streams, up to CUTOFF, and just use Sedgesort */
336   i = 0;
337   total = 1;			/* To start the loop */
338   while (i < nstreams && total > 0) {
339     total = 0;
340     j = i;
341     while (j < nstreams && total + streamsize_array[order[j]] < CUTOFF) {
342       total += streamsize_array[order[j++]];
343     }
344 
345     if (total > 0) {
346       debug(printf("Merging from %d to %d with %d total elements\n",i,j-1,total));
347       totals[ncombined] = total;
348       /* Need an extra value for Sedgesort_uint4 */
349       out = combined[ncombined] = (UINT4 *) MALLOC_ALIGN((total+1)*sizeof(UINT4));
350       debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
351 			 out,out,__FILE__,__LINE__));
352       for (k = i; k < j; k++) {
353 	n = streamsize_array[order[k]];
354 	memcpy(out,stream_array[order[k]],n*sizeof(UINT4));
355 	out += n;
356       }
357       Sedgesort_uint4(combined[ncombined++],total);
358 #ifdef CHECK_INPUTS
359       check_ascending(combined[ncombined-1],total);
360 #endif
361       i = j;
362     }
363   }
364 
365   *ncopied = (nstreams - i) + ncombined;
366   heapsize = 2*(*ncopied) - 1;
367 
368   heap = (UINT4 **) CALLOC((heapsize + 1),sizeof(UINT4 *));
369   *nelts = (int *) CALLOC((heapsize + 1),sizeof(int));
370 
371   heapi = heapsize;
372   /* Handle individual contents: Start with i value from before */
373   while (i < nstreams) {
374     n = (*nelts)[heapi] = streamsize_array[order[i]];
375 #ifdef CHECK_INPUTS
376     check_ascending(stream_array[order[i]],n);
377 #endif
378 
379     /* Copy to make the merging process non-destructive */
380     out = heap[heapi] = MALLOC_ALIGN(n*sizeof(UINT4));
381     debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
382 		       out,out,__FILE__,__LINE__));
383     CHECK_ALIGN(heap[heapi]);
384     memcpy(heap[heapi],stream_array[order[i]],n*sizeof(UINT4));
385 
386 #ifdef DEBUG
387     printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
388     for (k = 0; k < (*nelts)[heapi]; k++) {
389       printf(" %u",heap[heapi][k]);
390     }
391     printf("\n");
392 #endif
393     heapi--;
394     i++;
395   }
396 
397   /* Handle combined contents */
398   for (i = 0; i < ncombined; i++) {
399     heap[heapi] = combined[i];
400     (*nelts)[heapi] = totals[i];
401 #ifdef DEBUG
402     printf("Assigning node %d with %d elts:",heapi,(*nelts)[heapi]);
403     for (k = 0; k < (*nelts)[heapi]; k++) {
404       printf(" %u",heap[heapi][k]);
405     }
406     printf("\n");
407 #endif
408     heapi--;
409   }
410 
411   FREE(totals);
412   FREE(combined);
413   FREE(order);
414 
415   return heap;
416 }
417 
418 
419 /* Used for localdb */
420 UINT4 *
Merge_diagonals_uint4(int * nelts1,UINT4 ** stream_array,int * streamsize_array,int nstreams)421 Merge_diagonals_uint4 (int *nelts1, UINT4 **stream_array, int *streamsize_array,
422 		       int nstreams) {
423   UINT4 *result, **heap, *stream;
424   int *nelts;
425   int ncopied, heapi, heapsize, base, ancestori, pyramid_start, pyramid_end;
426   int bits;
427 #ifdef DEBUG
428   int i;
429 #endif
430 
431 
432   if (nstreams == 0) {
433     *nelts1 = 0;
434     return (UINT4 *) NULL;
435 
436   } else if (nstreams == 1) {
437     *nelts1 = streamsize_array[0];
438     stream = stream_array[0];
439 #ifdef CHECK_INPUTS
440     check_ascending(stream,*nelts1);
441 #endif
442 
443     result = MALLOC_ALIGN((*nelts1)*sizeof(UINT4)); /* Output must be aligned */
444     debug_align(printf("Allocating %p to %p -- Malloc of 1 bytes requested from %s:%d\n",
445 		       result,result,__FILE__,__LINE__));
446     memcpy(result,stream,(*nelts1)*sizeof(UINT4));
447     return result;
448 
449   } else {
450     heap = make_univdiagonals_heap(&nelts,&ncopied,stream_array,streamsize_array,nstreams);
451     if (ncopied == 1) {
452       *nelts1 = nelts[1];
453       result = heap[1];
454     } else {
455       heapsize = 2*ncopied - 1;	/* also index of last node */
456 #ifdef HAVE_BUILTIN_CLZ
457       bits = 31 - __builtin_clz((unsigned int) heapsize);
458 #elif defined(HAVE_ASM_BSR)
459       asm("bsr %1,%0" : "=r"(bits) : "r"(heapsize));
460 #else
461       bits = 31 - ((heapsize >> 16) ? clz_table[heapsize >> 16] : 16 + clz_table[heapsize]);
462 #endif
463 
464       base = (1 << bits);
465       debug(printf("nstreams %d, ncopied %d, heapsize %d, clz %d, bits %d, base %d\n",
466 		   nstreams,ncopied,heapsize,__builtin_clz(heapsize),bits,base));
467 
468       /* Middle pyramids */
469       while (base > PYRAMID_SIZE) {
470 	for (pyramid_start = 2*base - PYRAMID_SIZE, pyramid_end = 2*base - 1; pyramid_start >= base;
471 	     pyramid_start -= PYRAMID_SIZE, pyramid_end -= PYRAMID_SIZE) {
472 	  debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
473 	  ancestori = pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
474 	}
475 	base = ancestori;
476       }
477 
478       /* Last pyramid */
479       pyramid_start = base;
480       pyramid_end = 2*base - 1;
481       debug(printf("diagonals: pyramid_start %d, pyramid_end %d, ncopied %d\n",pyramid_start,pyramid_end,ncopied));
482       /* base = */ pyramid_merge(heap,ncopied,heapsize,nelts,pyramid_start,pyramid_end);
483 
484       *nelts1 = nelts[1];
485       result = heap[1];
486 
487       for (heapi = heapsize; heapi > heapsize - ncopied; heapi--) {
488 	FREE_ALIGN(heap[heapi]);
489       }
490     }
491 
492     FREE(heap);
493     FREE(nelts);
494   }
495 
496 
497 #ifdef DEBUG
498   printf("Merge_diagonals returning result of length %d\n",*nelts1);
499   for (i = 0; i < *nelts1; i++) {
500     printf("%u\n",result[i]);
501   }
502 #endif
503 
504   return result;
505 }
506 
507 
508