1 //------------------------------------------------------------------------------
2 // GB_msort_3b: sort a 3-by-n list of integers, using A[0:2][ ] as the key
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // A parallel mergesort of an array of 3-by-n integers.  Each key
11 // consists of three integers.
12 
13 #include "GB_msort_3b.h"
14 
15 //------------------------------------------------------------------------------
16 // GB_msort_3b_binary_search: binary search for the pivot
17 //------------------------------------------------------------------------------
18 
19 // The Pivot value is Y [pivot], and a binary search for the Pivot is made in
20 // the array X [p_pstart...p_end-1], which is sorted in non-decreasing order on
21 // input.  The return value is pleft, where
22 //
23 //    X [p_start ... pleft-1] <= Pivot and
24 //    X [pleft ... p_end-1] >= Pivot holds.
25 //
26 // pleft is returned in the range p_start to p_end.  If pleft is p_start, then
27 // the Pivot is smaller than all entries in X [p_start...p_end-1], and the left
28 // list X [p_start...pleft-1] is empty.  If pleft is p_end, then the Pivot is
29 // larger than all entries in X [p_start...p_end-1], and the right list X
30 // [pleft...p_end-1] is empty.
31 
GB_msort_3b_binary_search(const int64_t * restrict Y_0,const int64_t * restrict Y_1,const int64_t * restrict Y_2,const int64_t pivot,const int64_t * restrict X_0,const int64_t * restrict X_1,const int64_t * restrict X_2,const int64_t p_start,const int64_t p_end)32 static int64_t GB_msort_3b_binary_search    // return pleft
33 (
34     const int64_t *restrict Y_0,         // Pivot is Y [pivot]
35     const int64_t *restrict Y_1,
36     const int64_t *restrict Y_2,
37     const int64_t pivot,
38     const int64_t *restrict X_0,         // search in X [p_start..p_end_-1]
39     const int64_t *restrict X_1,
40     const int64_t *restrict X_2,
41     const int64_t p_start,
42     const int64_t p_end
43 )
44 {
45 
46     //--------------------------------------------------------------------------
47     // find where the Pivot appears in X
48     //--------------------------------------------------------------------------
49 
50     // binary search of X [p_start...p_end-1] for the Pivot
51     int64_t pleft = p_start ;
52     int64_t pright = p_end - 1 ;
53     while (pleft < pright)
54     {
55         int64_t pmiddle = (pleft + pright) >> 1 ;
56         // less = (X [pmiddle] < Pivot)
57         bool less = GB_lt_3 (X_0, X_1, X_2, pmiddle,
58                              Y_0, Y_1, Y_2, pivot) ;
59         pleft  = less ? (pmiddle+1) : pleft ;
60         pright = less ? pright : pmiddle ;
61     }
62 
63     // binary search is narrowed down to a single item
64     // or it has found the list is empty:
65     ASSERT (pleft == pright || pleft == pright + 1) ;
66 
67     // If found is true then X [pleft == pright] == Pivot.  If duplicates
68     // appear then X [pleft] is any one of the entries equal to the Pivot
69     // in the list.  If found is false then
70     //    X [p_start ... pleft-1] < Pivot and
71     //    X [pleft+1 ... p_end-1] > Pivot holds.
72     //    The value X [pleft] may be either < or > Pivot.
73     bool found = (pleft == pright) && GB_eq_3 (X_0, X_1, X_2, pleft,
74                                                Y_0, Y_1, Y_2, pivot) ;
75 
76     // Modify pleft and pright:
77     if (!found && (pleft == pright))
78     {
79         if (GB_lt_3 (X_0, X_1, X_2, pleft,
80                      Y_0, Y_1, Y_2, pivot))
81         {
82             pleft++ ;
83         }
84         else
85         {
86 //          pright++ ;  // (not needed)
87         }
88     }
89 
90     //--------------------------------------------------------------------------
91     // return result
92     //--------------------------------------------------------------------------
93 
94     // If found is false then
95     //    X [p_start ... pleft-1] < Pivot and
96     //    X [pleft ... p_end-1] > Pivot holds,
97     //    and pleft-1 == pright
98 
99     // If X has no duplicates, then whether or not Pivot is found,
100     //    X [p_start ... pleft-1] < Pivot and
101     //    X [pleft ... p_end-1] >= Pivot holds.
102 
103     // If X has duplicates, then whether or not Pivot is found,
104     //    X [p_start ... pleft-1] <= Pivot and
105     //    X [pleft ... p_end-1] >= Pivot holds.
106 
107     return (pleft) ;
108 }
109 
110 //------------------------------------------------------------------------------
111 // GB_msort_3b_create_merge_tasks
112 //------------------------------------------------------------------------------
113 
114 // Recursively constructs ntasks tasks to merge two arrays, Left and Right,
115 // into Sresult, where Left is L [pL_start...pL_end-1], Right is R
116 // [pR_start...pR_end-1], and Sresult is S [pS_start...pS_start+total_work-1],
117 // and where total_work is the total size of Left and Right.
118 //
119 // Task tid will merge L [L_task [tid] ... L_task [tid] + L_len [tid] - 1] and
120 // R [R_task [tid] ... R_task [tid] + R_len [tid] -1] into the merged output
121 // array S [S_task [tid] ... ].  The task tids created are t0 to
122 // t0+ntasks-1.
123 
GB_msort_3b_create_merge_tasks(int64_t * restrict L_task,int64_t * restrict L_len,int64_t * restrict R_task,int64_t * restrict R_len,int64_t * restrict S_task,const int t0,const int ntasks,const int64_t pS_start,const int64_t * restrict L_0,const int64_t * restrict L_1,const int64_t * restrict L_2,const int64_t pL_start,const int64_t pL_end,const int64_t * restrict R_0,const int64_t * restrict R_1,const int64_t * restrict R_2,const int64_t pR_start,const int64_t pR_end)124 void GB_msort_3b_create_merge_tasks
125 (
126     // output:
127     int64_t *restrict L_task,        // L_task [t0...t0+ntasks-1] computed
128     int64_t *restrict L_len,         // L_len  [t0...t0+ntasks-1] computed
129     int64_t *restrict R_task,        // R_task [t0...t0+ntasks-1] computed
130     int64_t *restrict R_len,         // R_len  [t0...t0+ntasks-1] computed
131     int64_t *restrict S_task,        // S_task [t0...t0+ntasks-1] computed
132     // input:
133     const int t0,                       // first task tid to create
134     const int ntasks,                   // # of tasks to create
135     const int64_t pS_start,             // merge into S [pS_start...]
136     const int64_t *restrict L_0,     // Left = L [pL_start...pL_end-1]
137     const int64_t *restrict L_1,
138     const int64_t *restrict L_2,
139     const int64_t pL_start,
140     const int64_t pL_end,
141     const int64_t *restrict R_0,     // Right = R [pR_start...pR_end-1]
142     const int64_t *restrict R_1,
143     const int64_t *restrict R_2,
144     const int64_t pR_start,
145     const int64_t pR_end
146 )
147 {
148 
149     //--------------------------------------------------------------------------
150     // get problem size
151     //--------------------------------------------------------------------------
152 
153     int64_t nleft  = pL_end - pL_start ;        // size of Left array
154     int64_t nright = pR_end - pR_start ;        // size of Right array
155     int64_t total_work = nleft + nright ;       // total work to do
156     ASSERT (ntasks >= 1) ;
157     ASSERT (total_work > 0) ;
158 
159     //--------------------------------------------------------------------------
160     // create the tasks
161     //--------------------------------------------------------------------------
162 
163     if (ntasks == 1)
164     {
165 
166         //----------------------------------------------------------------------
167         // a single task will merge all of Left and Right into Sresult
168         //----------------------------------------------------------------------
169 
170         L_task [t0] = pL_start ; L_len [t0] = nleft ;
171         R_task [t0] = pR_start ; R_len [t0] = nright ;
172         S_task [t0] = pS_start ;
173 
174     }
175     else
176     {
177 
178         //----------------------------------------------------------------------
179         // partition the Left and Right arrays for multiple merge tasks
180         //----------------------------------------------------------------------
181 
182         int64_t pleft, pright ;
183         if (nleft >= nright)
184         {
185             // split Left in half, and search for its pivot in Right
186             pleft = (pL_end + pL_start) >> 1 ;
187             pright = GB_msort_3b_binary_search (
188                         L_0, L_1, L_2, pleft,
189                         R_0, R_1, R_2, pR_start, pR_end) ;
190         }
191         else
192         {
193             // split Right in half, and search for its pivot in Left
194             pright = (pR_end + pR_start) >> 1 ;
195             pleft = GB_msort_3b_binary_search (
196                         R_0, R_1, R_2, pright,
197                         L_0, L_1, L_2, pL_start, pL_end) ;
198         }
199 
200         //----------------------------------------------------------------------
201         // partition the tasks according to the work of each partition
202         //----------------------------------------------------------------------
203 
204         // work0 is the total work in the first partition
205         int64_t work0 = (pleft - pL_start) + (pright - pR_start) ;
206         int ntasks0 = (int) round ((double) ntasks *
207             (((double) work0) / ((double) total_work))) ;
208 
209         // ensure at least one task is assigned to each partition
210         ntasks0 = GB_IMAX (ntasks0, 1) ;
211         ntasks0 = GB_IMIN (ntasks0, ntasks-1) ;
212         int ntasks1 = ntasks - ntasks0 ;
213 
214         //----------------------------------------------------------------------
215         // assign ntasks0 to the first half
216         //----------------------------------------------------------------------
217 
218         // ntasks0 tasks merge L [pL_start...pleft-1] and R [pR_start..pright-1]
219         // into the result S [pS_start...work0-1].
220 
221         GB_msort_3b_create_merge_tasks (
222             L_task, L_len, R_task, R_len, S_task, t0, ntasks0, pS_start,
223             L_0, L_1, L_2, pL_start, pleft,
224             R_0, R_1, R_2, pR_start, pright) ;
225 
226         //----------------------------------------------------------------------
227         // assign ntasks1 to the second half
228         //----------------------------------------------------------------------
229 
230         // ntasks1 tasks merge L [pleft...pL_end-1] and R [pright...pR_end-1]
231         // into the result S [pS_start+work0...pS_start+total_work].
232 
233         int t1 = t0 + ntasks0 ;     // first task id of the second set of tasks
234         int64_t pS_start1 = pS_start + work0 ;  // 2nd set starts here in S
235         GB_msort_3b_create_merge_tasks (
236             L_task, L_len, R_task, R_len, S_task, t1, ntasks1, pS_start1,
237             L_0, L_1, L_2, pleft,  pL_end,
238             R_0, R_1, R_2, pright, pR_end) ;
239     }
240 }
241 
242 //------------------------------------------------------------------------------
243 // GB_msort_3b_merge: merge two sorted lists via a single thread
244 //------------------------------------------------------------------------------
245 
246 // merge Left [0..nleft-1] and Right [0..nright-1] into S [0..nleft+nright-1] */
247 
GB_msort_3b_merge(int64_t * restrict S_0,int64_t * restrict S_1,int64_t * restrict S_2,const int64_t * restrict Left_0,const int64_t * restrict Left_1,const int64_t * restrict Left_2,const int64_t nleft,const int64_t * restrict Right_0,const int64_t * restrict Right_1,const int64_t * restrict Right_2,const int64_t nright)248 static void GB_msort_3b_merge
249 (
250     int64_t *restrict S_0,              // output of length nleft + nright
251     int64_t *restrict S_1,
252     int64_t *restrict S_2,
253     const int64_t *restrict Left_0,     // left input of length nleft
254     const int64_t *restrict Left_1,
255     const int64_t *restrict Left_2,
256     const int64_t nleft,
257     const int64_t *restrict Right_0,    // right input of length nright
258     const int64_t *restrict Right_1,
259     const int64_t *restrict Right_2,
260     const int64_t nright
261 )
262 {
263     int64_t p, pleft, pright ;
264 
265     // merge the two inputs, Left and Right, while both inputs exist
266     for (p = 0, pleft = 0, pright = 0 ; pleft < nleft && pright < nright ; p++)
267     {
268         if (GB_lt_3 (Left_0,  Left_1,  Left_2,  pleft,
269                      Right_0, Right_1, Right_2, pright))
270         {
271             // S [p] = Left [pleft++]
272             S_0 [p] = Left_0 [pleft] ;
273             S_1 [p] = Left_1 [pleft] ;
274             S_2 [p] = Left_2 [pleft] ;
275             pleft++ ;
276         }
277         else
278         {
279             // S [p] = Right [pright++]
280             S_0 [p] = Right_0 [pright] ;
281             S_1 [p] = Right_1 [pright] ;
282             S_2 [p] = Right_2 [pright] ;
283             pright++ ;
284         }
285     }
286 
287     // either input is exhausted; copy the remaining list into S
288     if (pleft < nleft)
289     {
290         int64_t nremaining = (nleft - pleft) ;
291         memcpy (S_0 + p, Left_0 + pleft, nremaining * sizeof (int64_t)) ;
292         memcpy (S_1 + p, Left_1 + pleft, nremaining * sizeof (int64_t)) ;
293         memcpy (S_2 + p, Left_2 + pleft, nremaining * sizeof (int64_t)) ;
294     }
295     else if (pright < nright)
296     {
297         int64_t nremaining = (nright - pright) ;
298         memcpy (S_0 + p, Right_0 + pright, nremaining * sizeof (int64_t)) ;
299         memcpy (S_1 + p, Right_1 + pright, nremaining * sizeof (int64_t)) ;
300         memcpy (S_2 + p, Right_2 + pright, nremaining * sizeof (int64_t)) ;
301     }
302 }
303 
304 //------------------------------------------------------------------------------
305 // GB_msort_3b: parallel mergesort
306 //------------------------------------------------------------------------------
307 
308 GB_PUBLIC   // accessed by the MATLAB tests in GraphBLAS/Test only
GB_msort_3b(int64_t * restrict A_0,int64_t * restrict A_1,int64_t * restrict A_2,const int64_t n,int nthreads)309 GrB_Info GB_msort_3b    // sort array A of size 3-by-n, using 3 keys (A [0:2][])
310 (
311     int64_t *restrict A_0,   // size n array
312     int64_t *restrict A_1,   // size n array
313     int64_t *restrict A_2,   // size n array
314     const int64_t n,
315     int nthreads                // # of threads to use
316 )
317 {
318 
319     //--------------------------------------------------------------------------
320     // handle small problems with a single thread
321     //--------------------------------------------------------------------------
322 
323     if (nthreads <= 1 || n <= GB_BASECASE)
324     {
325         // sequential quicksort
326         GB_qsort_3 (A_0, A_1, A_2, n) ;
327         return (GrB_SUCCESS) ;
328     }
329 
330     //--------------------------------------------------------------------------
331     // determine # of tasks
332     //--------------------------------------------------------------------------
333 
334     // determine the number of levels to create, which must always be an
335     // even number.  The # of levels is chosen to ensure that the # of leaves
336     // of the task tree is between 4*nthreads and 16*nthreads.
337 
338     //  2 to 4 threads:     4 levels, 16 qsort leaves
339     //  5 to 16 threads:    6 levels, 64 qsort leaves
340     // 17 to 64 threads:    8 levels, 256 qsort leaves
341     // 65 to 256 threads:   10 levels, 1024 qsort leaves
342     // 256 to 1024 threads: 12 levels, 4096 qsort leaves
343     // ...
344 
345     int k = (int) (2 + 2 * ceil (log2 ((double) nthreads) / 2)) ;
346     int ntasks = 1 << k ;
347 
348     //--------------------------------------------------------------------------
349     // allocate workspace
350     //--------------------------------------------------------------------------
351 
352     int64_t *restrict W = NULL ; size_t W_size = 0 ;
353     W = GB_MALLOC_WERK (3*n + 6*ntasks + 1, int64_t, &W_size) ;
354     if (W == NULL)
355     {
356         // out of memory
357         return (GrB_OUT_OF_MEMORY) ;
358     }
359 
360     int64_t *T = W ;
361     int64_t *restrict W_0    = T ; T += n ;
362     int64_t *restrict W_1    = T ; T += n ;
363     int64_t *restrict W_2    = T ; T += n ;
364     int64_t *restrict L_task = T ; T += ntasks ;
365     int64_t *restrict L_len  = T ; T += ntasks ;
366     int64_t *restrict R_task = T ; T += ntasks ;
367     int64_t *restrict R_len  = T ; T += ntasks ;
368     int64_t *restrict S_task = T ; T += ntasks ;
369     int64_t *restrict Slice  = T ; T += (ntasks+1) ;
370 
371     //--------------------------------------------------------------------------
372     // partition and sort the leaves
373     //--------------------------------------------------------------------------
374 
375     GB_eslice (Slice, n, ntasks) ;
376     int tid ;
377     #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
378     for (tid = 0 ; tid < ntasks ; tid++)
379     {
380         int64_t leaf = Slice [tid] ;
381         int64_t leafsize = Slice [tid+1] - leaf ;
382         GB_qsort_3 (A_0 + leaf, A_1 + leaf, A_2 + leaf, leafsize) ;
383     }
384 
385     //--------------------------------------------------------------------------
386     // merge each level
387     //--------------------------------------------------------------------------
388 
389     int nt = 1 ;
390     for ( ; k >= 2 ; k -= 2)
391     {
392 
393         //----------------------------------------------------------------------
394         // merge level k into level k-1, from A into W
395         //----------------------------------------------------------------------
396 
397         // this could be done in parallel if ntasks was large
398         for (int tid = 0 ; tid < ntasks ; tid += 2*nt)
399         {
400             // create 2*nt tasks to merge two A sublists into one W sublist
401             GB_msort_3b_create_merge_tasks (
402                 L_task, L_len, R_task, R_len, S_task, tid, 2*nt, Slice [tid],
403                 A_0, A_1, A_2, Slice [tid],    Slice [tid+nt],
404                 A_0, A_1, A_2, Slice [tid+nt], Slice [tid+2*nt]) ;
405         }
406 
407         #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
408         for (tid = 0 ; tid < ntasks ; tid++)
409         {
410             // merge A [pL...pL+nL-1] and A [pR...pR+nR-1] into W [pS..]
411             int64_t pL = L_task [tid], nL = L_len [tid] ;
412             int64_t pR = R_task [tid], nR = R_len [tid] ;
413             int64_t pS = S_task [tid] ;
414 
415             GB_msort_3b_merge (
416                 W_0 + pS, W_1 + pS, W_2 + pS,
417                 A_0 + pL, A_1 + pL, A_2 + pL, nL,
418                 A_0 + pR, A_1 + pR, A_2 + pR, nR) ;
419         }
420         nt = 2*nt ;
421 
422         //----------------------------------------------------------------------
423         // merge level k-1 into level k-2, from W into A
424         //----------------------------------------------------------------------
425 
426         // this could be done in parallel if ntasks was large
427         for (int tid = 0 ; tid < ntasks ; tid += 2*nt)
428         {
429             // create 2*nt tasks to merge two W sublists into one A sublist
430             GB_msort_3b_create_merge_tasks (
431                 L_task, L_len, R_task, R_len, S_task, tid, 2*nt, Slice [tid],
432                 W_0, W_1, W_2, Slice [tid],    Slice [tid+nt],
433                 W_0, W_1, W_2, Slice [tid+nt], Slice [tid+2*nt]) ;
434         }
435 
436         #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
437         for (tid = 0 ; tid < ntasks ; tid++)
438         {
439             // merge A [pL...pL+nL-1] and A [pR...pR+nR-1] into W [pS..]
440             int64_t pL = L_task [tid], nL = L_len [tid] ;
441             int64_t pR = R_task [tid], nR = R_len [tid] ;
442             int64_t pS = S_task [tid] ;
443             GB_msort_3b_merge (
444                 A_0 + pS, A_1 + pS, A_2 + pS,
445                 W_0 + pL, W_1 + pL, W_2 + pL, nL,
446                 W_0 + pR, W_1 + pR, W_2 + pR, nR) ;
447         }
448         nt = 2*nt ;
449     }
450 
451     //--------------------------------------------------------------------------
452     // free workspace and return result
453     //--------------------------------------------------------------------------
454 
455     GB_FREE_WERK (&W, W_size) ;
456     return (GrB_SUCCESS) ;
457 }
458 
459