1 //------------------------------------------------------------------------------
2 // GB_msort_2b: sort a 2-by-n list of integers, using A[0:1][ ] as the key
3 //------------------------------------------------------------------------------
4
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7
8 //------------------------------------------------------------------------------
9
10 // A parallel mergesort of an array of 2-by-n integers. Each key
11 // consists of two integers.
12
13 #include "GB_msort_2b.h"
14
15 //------------------------------------------------------------------------------
16 // GB_msort_2b_binary_search: binary search for the pivot
17 //------------------------------------------------------------------------------
18
19 // The Pivot value is Y [pivot], and a binary search for the Pivot is made in
20 // the array X [p_pstart...p_end-1], which is sorted in non-decreasing order on
21 // input. The return value is pleft, where
22 //
23 // X [p_start ... pleft-1] <= Pivot and
24 // X [pleft ... p_end-1] >= Pivot holds.
25 //
26 // pleft is returned in the range p_start to p_end. If pleft is p_start, then
27 // the Pivot is smaller than all entries in X [p_start...p_end-1], and the left
28 // list X [p_start...pleft-1] is empty. If pleft is p_end, then the Pivot is
29 // larger than all entries in X [p_start...p_end-1], and the right list X
30 // [pleft...p_end-1] is empty.
31
GB_msort_2b_binary_search(const int64_t * restrict Y_0,const int64_t * restrict Y_1,const int64_t pivot,const int64_t * restrict X_0,const int64_t * restrict X_1,const int64_t p_start,const int64_t p_end)32 static int64_t GB_msort_2b_binary_search // return pleft
33 (
34 const int64_t *restrict Y_0, // Pivot is Y [pivot]
35 const int64_t *restrict Y_1,
36 const int64_t pivot,
37 const int64_t *restrict X_0, // search in X [p_start..p_end_-1]
38 const int64_t *restrict X_1,
39 const int64_t p_start,
40 const int64_t p_end
41 )
42 {
43
44 //--------------------------------------------------------------------------
45 // find where the Pivot appears in X
46 //--------------------------------------------------------------------------
47
48 // binary search of X [p_start...p_end-1] for the Pivot
49 int64_t pleft = p_start ;
50 int64_t pright = p_end - 1 ;
51 while (pleft < pright)
52 {
53 int64_t pmiddle = (pleft + pright) >> 1 ;
54 // less = (X [pmiddle] < Pivot)
55 bool less = GB_lt_2 (X_0, X_1, pmiddle,
56 Y_0, Y_1, pivot) ;
57 pleft = less ? (pmiddle+1) : pleft ;
58 pright = less ? pright : pmiddle ;
59 }
60
61 // binary search is narrowed down to a single item
62 // or it has found the list is empty:
63 ASSERT (pleft == pright || pleft == pright + 1) ;
64
65 // If found is true then X [pleft == pright] == Pivot. If duplicates
66 // appear then X [pleft] is any one of the entries equal to the Pivot
67 // in the list. If found is false then
68 // X [p_start ... pleft-1] < Pivot and
69 // X [pleft+1 ... p_end-1] > Pivot holds.
70 // The value X [pleft] may be either < or > Pivot.
71 bool found = (pleft == pright) && GB_eq_2 (X_0, X_1, pleft,
72 Y_0, Y_1, pivot) ;
73
74 // Modify pleft and pright:
75 if (!found && (pleft == pright))
76 {
77 if (GB_lt_2 (X_0, X_1, pleft,
78 Y_0, Y_1, pivot))
79 {
80 pleft++ ;
81 }
82 else
83 {
84 // pright++ ; // (not needed)
85 }
86 }
87
88 //--------------------------------------------------------------------------
89 // return result
90 //--------------------------------------------------------------------------
91
92 // If found is false then
93 // X [p_start ... pleft-1] < Pivot and
94 // X [pleft ... p_end-1] > Pivot holds,
95 // and pleft-1 == pright
96
97 // If X has no duplicates, then whether or not Pivot is found,
98 // X [p_start ... pleft-1] < Pivot and
99 // X [pleft ... p_end-1] >= Pivot holds.
100
101 // If X has duplicates, then whether or not Pivot is found,
102 // X [p_start ... pleft-1] <= Pivot and
103 // X [pleft ... p_end-1] >= Pivot holds.
104
105 return (pleft) ;
106 }
107
108 //------------------------------------------------------------------------------
109 // GB_msort_2b_create_merge_tasks
110 //------------------------------------------------------------------------------
111
112 // Recursively constructs ntasks tasks to merge two arrays, Left and Right,
113 // into Sresult, where Left is L [pL_start...pL_end-1], Right is R
114 // [pR_start...pR_end-1], and Sresult is S [pS_start...pS_start+total_work-1],
115 // and where total_work is the total size of Left and Right.
116 //
117 // Task tid will merge L [L_task [tid] ... L_task [tid] + L_len [tid] - 1] and
118 // R [R_task [tid] ... R_task [tid] + R_len [tid] -1] into the merged output
119 // array S [S_task [tid] ... ]. The task tids created are t0 to
120 // t0+ntasks-1.
121
GB_msort_2b_create_merge_tasks(int64_t * restrict L_task,int64_t * restrict L_len,int64_t * restrict R_task,int64_t * restrict R_len,int64_t * restrict S_task,const int t0,const int ntasks,const int64_t pS_start,const int64_t * restrict L_0,const int64_t * restrict L_1,const int64_t pL_start,const int64_t pL_end,const int64_t * restrict R_0,const int64_t * restrict R_1,const int64_t pR_start,const int64_t pR_end)122 void GB_msort_2b_create_merge_tasks
123 (
124 // output:
125 int64_t *restrict L_task, // L_task [t0...t0+ntasks-1] computed
126 int64_t *restrict L_len, // L_len [t0...t0+ntasks-1] computed
127 int64_t *restrict R_task, // R_task [t0...t0+ntasks-1] computed
128 int64_t *restrict R_len, // R_len [t0...t0+ntasks-1] computed
129 int64_t *restrict S_task, // S_task [t0...t0+ntasks-1] computed
130 // input:
131 const int t0, // first task tid to create
132 const int ntasks, // # of tasks to create
133 const int64_t pS_start, // merge into S [pS_start...]
134 const int64_t *restrict L_0, // Left = L [pL_start...pL_end-1]
135 const int64_t *restrict L_1,
136 const int64_t pL_start,
137 const int64_t pL_end,
138 const int64_t *restrict R_0, // Right = R [pR_start...pR_end-1]
139 const int64_t *restrict R_1,
140 const int64_t pR_start,
141 const int64_t pR_end
142 )
143 {
144
145 //--------------------------------------------------------------------------
146 // get problem size
147 //--------------------------------------------------------------------------
148
149 int64_t nleft = pL_end - pL_start ; // size of Left array
150 int64_t nright = pR_end - pR_start ; // size of Right array
151 int64_t total_work = nleft + nright ; // total work to do
152 ASSERT (ntasks >= 1) ;
153 ASSERT (total_work > 0) ;
154
155 //--------------------------------------------------------------------------
156 // create the tasks
157 //--------------------------------------------------------------------------
158
159 if (ntasks == 1)
160 {
161
162 //----------------------------------------------------------------------
163 // a single task will merge all of Left and Right into Sresult
164 //----------------------------------------------------------------------
165
166 L_task [t0] = pL_start ; L_len [t0] = nleft ;
167 R_task [t0] = pR_start ; R_len [t0] = nright ;
168 S_task [t0] = pS_start ;
169
170 }
171 else
172 {
173
174 //----------------------------------------------------------------------
175 // partition the Left and Right arrays for multiple merge tasks
176 //----------------------------------------------------------------------
177
178 int64_t pleft, pright ;
179 if (nleft >= nright)
180 {
181 // split Left in half, and search for its pivot in Right
182 pleft = (pL_end + pL_start) >> 1 ;
183 pright = GB_msort_2b_binary_search (
184 L_0, L_1, pleft,
185 R_0, R_1, pR_start, pR_end) ;
186 }
187 else
188 {
189 // split Right in half, and search for its pivot in Left
190 pright = (pR_end + pR_start) >> 1 ;
191 pleft = GB_msort_2b_binary_search (
192 R_0, R_1, pright,
193 L_0, L_1, pL_start, pL_end) ;
194 }
195
196 //----------------------------------------------------------------------
197 // partition the tasks according to the work of each partition
198 //----------------------------------------------------------------------
199
200 // work0 is the total work in the first partition
201 int64_t work0 = (pleft - pL_start) + (pright - pR_start) ;
202 int ntasks0 = (int) round ((double) ntasks *
203 (((double) work0) / ((double) total_work))) ;
204
205 // ensure at least one task is assigned to each partition
206 ntasks0 = GB_IMAX (ntasks0, 1) ;
207 ntasks0 = GB_IMIN (ntasks0, ntasks-1) ;
208 int ntasks1 = ntasks - ntasks0 ;
209
210 //----------------------------------------------------------------------
211 // assign ntasks0 to the first half
212 //----------------------------------------------------------------------
213
214 // ntasks0 tasks merge L [pL_start...pleft-1] and R [pR_start..pright-1]
215 // into the result S [pS_start...work0-1].
216
217 GB_msort_2b_create_merge_tasks (
218 L_task, L_len, R_task, R_len, S_task, t0, ntasks0, pS_start,
219 L_0, L_1, pL_start, pleft,
220 R_0, R_1, pR_start, pright) ;
221
222 //----------------------------------------------------------------------
223 // assign ntasks1 to the second half
224 //----------------------------------------------------------------------
225
226 // ntasks1 tasks merge L [pleft...pL_end-1] and R [pright...pR_end-1]
227 // into the result S [pS_start+work0...pS_start+total_work].
228
229 int t1 = t0 + ntasks0 ; // first task id of the second set of tasks
230 int64_t pS_start1 = pS_start + work0 ; // 2nd set starts here in S
231 GB_msort_2b_create_merge_tasks (
232 L_task, L_len, R_task, R_len, S_task, t1, ntasks1, pS_start1,
233 L_0, L_1, pleft, pL_end,
234 R_0, R_1, pright, pR_end) ;
235 }
236 }
237
238 //------------------------------------------------------------------------------
239 // GB_msort_2b_merge: merge two sorted lists via a single thread
240 //------------------------------------------------------------------------------
241
242 // merge Left [0..nleft-1] and Right [0..nright-1] into S [0..nleft+nright-1] */
243
GB_msort_2b_merge(int64_t * restrict S_0,int64_t * restrict S_1,const int64_t * restrict Left_0,const int64_t * restrict Left_1,const int64_t nleft,const int64_t * restrict Right_0,const int64_t * restrict Right_1,const int64_t nright)244 static void GB_msort_2b_merge
245 (
246 int64_t *restrict S_0, // output of length nleft + nright
247 int64_t *restrict S_1,
248 const int64_t *restrict Left_0, // left input of length nleft
249 const int64_t *restrict Left_1,
250 const int64_t nleft,
251 const int64_t *restrict Right_0, // right input of length nright
252 const int64_t *restrict Right_1,
253 const int64_t nright
254 )
255 {
256 int64_t p, pleft, pright ;
257
258 // merge the two inputs, Left and Right, while both inputs exist
259 for (p = 0, pleft = 0, pright = 0 ; pleft < nleft && pright < nright ; p++)
260 {
261 if (GB_lt_2 (Left_0, Left_1, pleft,
262 Right_0, Right_1, pright))
263 {
264 // S [p] = Left [pleft++]
265 S_0 [p] = Left_0 [pleft] ;
266 S_1 [p] = Left_1 [pleft] ;
267 pleft++ ;
268 }
269 else
270 {
271 // S [p] = Right [pright++]
272 S_0 [p] = Right_0 [pright] ;
273 S_1 [p] = Right_1 [pright] ;
274 pright++ ;
275 }
276 }
277
278 // either input is exhausted; copy the remaining list into S
279 if (pleft < nleft)
280 {
281 int64_t nremaining = (nleft - pleft) ;
282 memcpy (S_0 + p, Left_0 + pleft, nremaining * sizeof (int64_t)) ;
283 memcpy (S_1 + p, Left_1 + pleft, nremaining * sizeof (int64_t)) ;
284 }
285 else if (pright < nright)
286 {
287 int64_t nremaining = (nright - pright) ;
288 memcpy (S_0 + p, Right_0 + pright, nremaining * sizeof (int64_t)) ;
289 memcpy (S_1 + p, Right_1 + pright, nremaining * sizeof (int64_t)) ;
290 }
291 }
292
293 //------------------------------------------------------------------------------
294 // GB_msort_2b: parallel mergesort
295 //------------------------------------------------------------------------------
296
297 GB_PUBLIC // accessed by the MATLAB tests in GraphBLAS/Test only
GB_msort_2b(int64_t * restrict A_0,int64_t * restrict A_1,const int64_t n,int nthreads)298 GrB_Info GB_msort_2b // sort array A of size 2-by-n, using 2 keys (A [0:1][])
299 (
300 int64_t *restrict A_0, // size n array
301 int64_t *restrict A_1, // size n array
302 const int64_t n,
303 int nthreads // # of threads to use
304 )
305 {
306
307 //--------------------------------------------------------------------------
308 // handle small problems with a single thread
309 //--------------------------------------------------------------------------
310
311 if (nthreads <= 1 || n <= GB_BASECASE)
312 {
313 // sequential quicksort
314 GB_qsort_2 (A_0, A_1, n) ;
315 return (GrB_SUCCESS) ;
316 }
317
318 //--------------------------------------------------------------------------
319 // determine # of tasks
320 //--------------------------------------------------------------------------
321
322 // determine the number of levels to create, which must always be an
323 // even number. The # of levels is chosen to ensure that the # of leaves
324 // of the task tree is between 4*nthreads and 16*nthreads.
325
326 // 2 to 4 threads: 4 levels, 16 qsort leaves
327 // 5 to 16 threads: 6 levels, 64 qsort leaves
328 // 17 to 64 threads: 8 levels, 256 qsort leaves
329 // 65 to 256 threads: 10 levels, 1024 qsort leaves
330 // 256 to 1024 threads: 12 levels, 4096 qsort leaves
331 // ...
332
333 int k = (int) (2 + 2 * ceil (log2 ((double) nthreads) / 2)) ;
334 int ntasks = 1 << k ;
335
336 //--------------------------------------------------------------------------
337 // allocate workspace
338 //--------------------------------------------------------------------------
339
340 int64_t *restrict W = NULL ; size_t W_size = 0 ;
341 W = GB_MALLOC_WERK (2*n + 6*ntasks + 1, int64_t, &W_size) ;
342 if (W == NULL)
343 {
344 // out of memory
345 return (GrB_OUT_OF_MEMORY) ;
346 }
347
348 int64_t *T = W ;
349 int64_t *restrict W_0 = T ; T += n ;
350 int64_t *restrict W_1 = T ; T += n ;
351 int64_t *restrict L_task = T ; T += ntasks ;
352 int64_t *restrict L_len = T ; T += ntasks ;
353 int64_t *restrict R_task = T ; T += ntasks ;
354 int64_t *restrict R_len = T ; T += ntasks ;
355 int64_t *restrict S_task = T ; T += ntasks ;
356 int64_t *restrict Slice = T ; T += (ntasks+1) ;
357
358 //--------------------------------------------------------------------------
359 // partition and sort the leaves
360 //--------------------------------------------------------------------------
361
362 GB_eslice (Slice, n, ntasks) ;
363 int tid ;
364 #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
365 for (tid = 0 ; tid < ntasks ; tid++)
366 {
367 int64_t leaf = Slice [tid] ;
368 int64_t leafsize = Slice [tid+1] - leaf ;
369 GB_qsort_2 (A_0 + leaf, A_1 + leaf, leafsize) ;
370 }
371
372 //--------------------------------------------------------------------------
373 // merge each level
374 //--------------------------------------------------------------------------
375
376 int nt = 1 ;
377 for ( ; k >= 2 ; k -= 2)
378 {
379
380 //----------------------------------------------------------------------
381 // merge level k into level k-1, from A into W
382 //----------------------------------------------------------------------
383
384 // TODO: skip k and k-1 for each group of 4 sublists of A if they are
385 // already sorted with respect to each other.
386
387 // this could be done in parallel if ntasks was large
388 for (int tid = 0 ; tid < ntasks ; tid += 2*nt)
389 {
390 // create 2*nt tasks to merge two A sublists into one W sublist
391 GB_msort_2b_create_merge_tasks (
392 L_task, L_len, R_task, R_len, S_task, tid, 2*nt, Slice [tid],
393 A_0, A_1, Slice [tid], Slice [tid+nt],
394 A_0, A_1, Slice [tid+nt], Slice [tid+2*nt]) ;
395 }
396
397 #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
398 for (tid = 0 ; tid < ntasks ; tid++)
399 {
400 // merge A [pL...pL+nL-1] and A [pR...pR+nR-1] into W [pS..]
401 int64_t pL = L_task [tid], nL = L_len [tid] ;
402 int64_t pR = R_task [tid], nR = R_len [tid] ;
403 int64_t pS = S_task [tid] ;
404
405 GB_msort_2b_merge (
406 W_0 + pS, W_1 + pS,
407 A_0 + pL, A_1 + pL, nL,
408 A_0 + pR, A_1 + pR, nR) ;
409 }
410 nt = 2*nt ;
411
412 //----------------------------------------------------------------------
413 // merge level k-1 into level k-2, from W into A
414 //----------------------------------------------------------------------
415
416 // this could be done in parallel if ntasks was large
417 for (int tid = 0 ; tid < ntasks ; tid += 2*nt)
418 {
419 // create 2*nt tasks to merge two W sublists into one A sublist
420 GB_msort_2b_create_merge_tasks (
421 L_task, L_len, R_task, R_len, S_task, tid, 2*nt, Slice [tid],
422 W_0, W_1, Slice [tid], Slice [tid+nt],
423 W_0, W_1, Slice [tid+nt], Slice [tid+2*nt]) ;
424 }
425
426 #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
427 for (tid = 0 ; tid < ntasks ; tid++)
428 {
429 // merge A [pL...pL+nL-1] and A [pR...pR+nR-1] into W [pS..]
430 int64_t pL = L_task [tid], nL = L_len [tid] ;
431 int64_t pR = R_task [tid], nR = R_len [tid] ;
432 int64_t pS = S_task [tid] ;
433 GB_msort_2b_merge (
434 A_0 + pS, A_1 + pS,
435 W_0 + pL, W_1 + pL, nL,
436 W_0 + pR, W_1 + pR, nR) ;
437 }
438 nt = 2*nt ;
439 }
440
441 //--------------------------------------------------------------------------
442 // free workspace and return result
443 //--------------------------------------------------------------------------
444
445 GB_FREE_WERK (&W, W_size) ;
446 return (GrB_SUCCESS) ;
447 }
448
449