//------------------------------------------------------------------------------
// GB_subref_slice: construct coarse/fine tasks for C = A(I,J)
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// Determine the tasks for computing C=A(I,J).  The matrix C has Cnvec vectors,
// and these are divided into coarse and fine tasks.  A coarse task will
// compute one or more whole vectors of C.  A fine task operates on a slice of
// a single vector of C.  The slice can be done by the # of entries in the
// corresponding vector of A, or by the list of indices I, depending on how the
// work is done for that method.

// The (kC)th vector will access A(imin:imax,kA) in Ai,Ax [pA:pA_end-1], where
// pA = Ap_start [kC] and pA_end = Ap_end [kC].

// The computation of each vector C(:,kC) = A(I,kA) is done using one of 12
// different cases, depending on the vector, as determined by GB_subref_method.
// Not all vectors in C are computed using the same method.

// Note that J can have duplicates.  kC is unique (0:Cnvec-1) but the
// corresponding vector kA in A may repeat, if J has duplicates.  Duplicates in
// J are not exploited, since the coarse/fine tasks are constructed by slicing
// the list of vectors Ch of size Cnvec, not the vectors of A.

// Compare this function with GB_ewise_slice, which constructs coarse/fine
// tasks for the eWise operations (C=A+B, C=A.*B, and C<M>=Z).

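// For example (hypothetical numbers, for illustration only): suppose C has
// four vectors whose estimated work is [100, 5, 5, 2000] and the target task
// size is about 500.  The first three vectors are cheap enough to share a
// single coarse task, while the fourth is split into about four fine tasks,
// each computing a slice of that one vector.
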
#define GB_FREE_WORK                            \
{                                               \
    GB_WERK_POP (Coarse, int64_t) ;             \
    GB_FREE_WERK (&Cwork, Cwork_size) ;         \
}

#define GB_FREE_ALL                             \
{                                               \
    GB_FREE_WORK ;                              \
    GB_FREE_WERK (&TaskList, TaskList_size) ;   \
    GB_FREE_WERK (&Mark, Mark_size) ;           \
    GB_FREE_WERK (&Inext, Inext_size) ;         \
}

#include "GB_subref.h"

GrB_Info GB_subref_slice
(
    // output:
    GB_task_struct **p_TaskList,    // array of structs
    size_t *p_TaskList_size,        // size of TaskList
    int *p_ntasks,                  // # of tasks constructed
    int *p_nthreads,                // # of threads for subref operation
    bool *p_post_sort,              // true if a final post-sort is needed
    int64_t *restrict *p_Mark,      // for I inverse, if needed; size avlen
    size_t *p_Mark_size,
    int64_t *restrict *p_Inext,     // for I inverse, if needed; size nI
    size_t *p_Inext_size,
    int64_t *p_nduplicates,         // # of duplicates, if I inverse computed
    // from phase0:
    const int64_t *restrict Ap_start,   // location of A(imin:imax,kA)
    const int64_t *restrict Ap_end,
    const int64_t Cnvec,            // # of vectors of C
    const bool need_qsort,          // true if C must be sorted
    const int Ikind,                // GB_ALL, GB_RANGE, GB_STRIDE or GB_LIST
    const int64_t nI,               // length of I
    const int64_t Icolon [3],       // for GB_RANGE and GB_STRIDE
    // original input:
    const int64_t avlen,            // A->vlen
    const int64_t anz,              // nnz (A)
    const GrB_Index *I,
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (p_TaskList != NULL) ;
    ASSERT (p_TaskList_size != NULL) ;
    ASSERT (p_ntasks != NULL) ;
    ASSERT (p_nthreads != NULL) ;
    ASSERT (p_post_sort != NULL) ;
    ASSERT (p_Mark != NULL) ;
    ASSERT (p_Inext != NULL) ;
    ASSERT (p_nduplicates != NULL) ;

    ASSERT ((Cnvec > 0) == (Ap_start != NULL)) ;
    ASSERT ((Cnvec > 0) == (Ap_end != NULL)) ;

    (*p_TaskList) = NULL ;
    (*p_TaskList_size) = 0 ;
    (*p_Mark ) = NULL ;
    (*p_Inext) = NULL ;

    int64_t *restrict Mark = NULL ; size_t Mark_size = 0 ;
    int64_t *restrict Inext = NULL ; size_t Inext_size = 0 ;

    int64_t *restrict Cwork = NULL ; size_t Cwork_size = 0 ;
    GB_WERK_DECLARE (Coarse, int64_t) ;     // size ntasks1+1
    int ntasks1 = 0 ;

    GrB_Info info ;

    //--------------------------------------------------------------------------
    // determine # of threads to use
    //--------------------------------------------------------------------------

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;

    //--------------------------------------------------------------------------
    // allocate the initial TaskList
    //--------------------------------------------------------------------------

    // Allocate the TaskList to hold at least 2*ntasks0 tasks.  It will grow
    // later, if needed.  Usually, 64*nthreads_max is enough, but in a few
    // cases fine tasks can cause this number to be exceeded.  If that occurs,
    // TaskList is reallocated.

    // When the mask is present, it is often fastest to break the work up
    // into tasks, even when nthreads_max is 1.

    GB_task_struct *restrict TaskList = NULL ; size_t TaskList_size = 0 ;
    int max_ntasks = 0 ;
    int ntasks0 = (nthreads_max == 1) ? 1 : (32 * nthreads_max) ;
    GB_REALLOC_TASK_WERK (TaskList, ntasks0, max_ntasks) ;
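
    // Creating roughly 32 tasks per thread (per the ntasks0 formula above)
    // oversubscribes the threads, which helps balance uneven work when the
    // tasks are scheduled.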

    //--------------------------------------------------------------------------
    // determine if I_inverse can be constructed
    //--------------------------------------------------------------------------

    // I_inverse_ok is true if I might be inverted.  If false, then I will not
    // be inverted.  I can be inverted only if the workspace for the inverse
    // does not exceed nnz (A).  Note that if I was provided on input as an
    // explicit list but consists of a contiguous range imin:imax, then Ikind
    // is now GB_RANGE and the list I is ignored.

    // If I_inverse_ok is true, the inverse of I might still not be needed.
    // need_I_inverse becomes true if any C(:,kC) = A(I,kA) computation
    // requires the I inverse.

    int64_t I_inverse_limit = GB_IMAX (4096, anz) ;
    bool I_inverse_ok = (Ikind == GB_LIST &&
        ((nI > avlen / 256) || ((nI + avlen) < I_inverse_limit))) ;
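
    // The inverse workspace is Mark (size avlen) and Inext (size nI), so the
    // test (nI + avlen) < I_inverse_limit bounds that workspace by
    // max (4096, nnz (A)).  The test nI > avlen/256 also allows inversion
    // when I is long relative to the vector length of A.
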
    bool need_I_inverse = false ;
    bool post_sort = false ;
    int64_t iinc = Icolon [GxB_INC] ;

    //--------------------------------------------------------------------------
    // allocate workspace
    //--------------------------------------------------------------------------

    Cwork = GB_MALLOC_WERK (Cnvec+1, int64_t, &Cwork_size) ;
    if (Cwork == NULL)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //--------------------------------------------------------------------------
    // estimate the work required for each vector of C
    //--------------------------------------------------------------------------

    int nthreads_for_Cwork = GB_nthreads (Cnvec, chunk, nthreads_max) ;

    int64_t kC ;
    #pragma omp parallel for num_threads(nthreads_for_Cwork) schedule(static) \
        reduction(||:need_I_inverse)
    for (kC = 0 ; kC < Cnvec ; kC++)
    {
        // jC is the (kC)th vector of C = A(I,J)
        // int64_t jC = GBH (Ch, kC) ;
        // C(:,kC) = A(I,kA) will be constructed
        int64_t pA     = Ap_start [kC] ;
        int64_t pA_end = Ap_end   [kC] ;
        int64_t alen = pA_end - pA ;    // nnz (A (imin:imax,j))

        int64_t work ;  // amount of work for C(:,kC) = A (I,kA)
        bool this_needs_I_inverse ;     // true if this vector needs I inverse

        // The number of duplicates in I (ndupl) is not yet known; it is found
        // when I is inverted.  For now, assume I has no duplicate entries.
        // All that is needed at this point is the work required for each
        // C(:,kC), and whether or not the I inverse must be created.  The #
        // of duplicates has no impact on the I inverse decision, and only a
        // minor effect on the work estimate (which is ignored).

        GB_subref_method (&work, &this_needs_I_inverse, alen, avlen,
            Ikind, nI, I_inverse_ok, need_qsort, iinc, 0) ;

        // log the result
        need_I_inverse = need_I_inverse || this_needs_I_inverse ;
        Cwork [kC] = work ;
    }

    //--------------------------------------------------------------------------
    // replace Cwork with its cumulative sum
    //--------------------------------------------------------------------------

    GB_cumsum (Cwork, Cnvec, NULL, nthreads_for_Cwork, Context) ;
    double cwork = (double) Cwork [Cnvec] ;
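
    // Cwork [k] now holds the total work for vectors 0 to k-1, and
    // Cwork [Cnvec] is the total work for all of C.  GB_pslice uses these
    // cumulative values below to divide the vectors evenly among the tasks.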

    //--------------------------------------------------------------------------
    // determine # of threads and tasks to use for C=A(I,J)
    //--------------------------------------------------------------------------

    int nthreads = GB_nthreads (cwork, chunk, nthreads_max) ;

    ntasks1 = (nthreads == 1) ? 1 : (32 * nthreads) ;
    double target_task_size = cwork / (double) (ntasks1) ;
    target_task_size = GB_IMAX (target_task_size, chunk) ;

    //--------------------------------------------------------------------------
    // invert I if required
    //--------------------------------------------------------------------------

    int64_t ndupl = 0 ;
    if (need_I_inverse)
    { 
        GB_OK (GB_I_inverse (I, nI, avlen, &Mark, &Mark_size,
            &Inext, &Inext_size, &ndupl, Context)) ;
        ASSERT (Mark != NULL) ;
        ASSERT (Inext != NULL) ;
    }
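
    // A sketch of the inverse (the exact encoding is defined by
    // GB_I_inverse): Mark (size avlen) maps a row index i back to its
    // position in I, Inext (size nI) links any duplicate occurrences of i in
    // I, and ndupl counts the duplicates found.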

    //--------------------------------------------------------------------------
    // check for quick return for a single task
    //--------------------------------------------------------------------------

    if (Cnvec == 0 || ntasks1 == 1)
    { 
        // construct a single coarse task that computes all of C
        TaskList [0].kfirst = 0 ;
        TaskList [0].klast  = Cnvec-1 ;

        // free workspace and return result
        GB_FREE_WORK ;
        (*p_TaskList    ) = TaskList ;
        (*p_TaskList_size) = TaskList_size ;
        (*p_ntasks      ) = (Cnvec == 0) ? 0 : 1 ;
        (*p_nthreads    ) = 1 ;
        (*p_post_sort   ) = false ;
        (*p_Mark        ) = Mark ;
        (*p_Mark_size   ) = Mark_size ;
        (*p_Inext       ) = Inext ;
        (*p_Inext_size  ) = Inext_size ;
        (*p_nduplicates ) = ndupl ;
        return (GrB_SUCCESS) ;
    }

    //--------------------------------------------------------------------------
    // slice the work into coarse tasks
    //--------------------------------------------------------------------------

    GB_WERK_PUSH (Coarse, ntasks1 + 1, int64_t) ;
    if (Coarse == NULL)
    { 
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }
    GB_pslice (Coarse, Cwork, Cnvec, ntasks1, false) ;
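
    // After GB_pslice, coarse task t owns vectors Coarse [t] through
    // Coarse [t+1]-1, chosen so that each task receives a roughly equal share
    // of the cumulative work in Cwork.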

    //--------------------------------------------------------------------------
    // construct all tasks, both coarse and fine
    //--------------------------------------------------------------------------

    int ntasks = 0 ;

    for (int t = 0 ; t < ntasks1 ; t++)
    {

        //----------------------------------------------------------------------
        // coarse task computes C (:,k:klast)
        //----------------------------------------------------------------------

        int64_t k = Coarse [t] ;
        int64_t klast = Coarse [t+1] - 1 ;

        if (k >= Cnvec)
        { 

            //------------------------------------------------------------------
            // all tasks have been constructed
            //------------------------------------------------------------------

            break ;

        }
        else if (k < klast)
        { 

            //------------------------------------------------------------------
            // coarse task has 2 or more vectors
            //------------------------------------------------------------------

            // This is a non-empty coarse-grain task that does two or more
            // entire vectors of C, vectors k:klast, inclusive.
            GB_REALLOC_TASK_WERK (TaskList, ntasks + 1, max_ntasks) ;
            TaskList [ntasks].kfirst = k ;
            TaskList [ntasks].klast  = klast ;
            ntasks++ ;

        }
        else
        {

            //------------------------------------------------------------------
            // coarse task has 0 or 1 vectors
            //------------------------------------------------------------------

            // As a coarse-grain task, this task is empty or does a single
            // vector, k.  Vector k must be removed from the work done by this
            // and any other coarse-grain task, and split into one or more
            // fine-grain tasks.

            for (int tt = t ; tt < ntasks1 ; tt++)
            {
                // remove k from the initial slice tt
                if (Coarse [tt] == k)
                { 
                    // remove k from task tt
                    Coarse [tt] = k+1 ;
                }
                else
                { 
                    // break, k is not in task tt
                    break ;
                }
            }

            //------------------------------------------------------------------
            // determine the # of fine-grain tasks to create for vector k
            //------------------------------------------------------------------

            double ckwork = Cwork [k+1] - Cwork [k] ;
            int nfine = ckwork / target_task_size ;
            nfine = GB_IMAX (nfine, 1) ;
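
            // For example (illustrative numbers only): if vector k accounts
            // for ckwork = 10*target_task_size of the work, it is split into
            // about 10 fine tasks below.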

            // make the TaskList bigger, if needed
            GB_REALLOC_TASK_WERK (TaskList, ntasks + nfine, max_ntasks) ;

            //------------------------------------------------------------------
            // create the fine-grain tasks
            //------------------------------------------------------------------

            if (nfine == 1)
            { 

                //--------------------------------------------------------------
                // this is a single coarse task for all of vector k
                //--------------------------------------------------------------

                TaskList [ntasks].kfirst = k ;
                TaskList [ntasks].klast  = k ;
                ntasks++ ;

            }
            else
            {

                //--------------------------------------------------------------
                // slice vector k into nfine fine tasks
                //--------------------------------------------------------------

                // There are two kinds of fine tasks, depending on the method
                // used to compute C(:,kC) = A(I,kA).  If the method iterates
                // across all entries in A(imin:imax,kA), then those entries
                // are sliced (of size alen).  Three methods (1, 2, and 6)
                // iterate across all entries in I instead (of size nI).

                int64_t pA     = Ap_start [k] ;
                int64_t pA_end = Ap_end   [k] ;
                int64_t alen = pA_end - pA ;    // nnz (A (imin:imax,j))

                int method = GB_subref_method (NULL, NULL, alen, avlen,
                    Ikind, nI, I_inverse_ok, need_qsort, iinc, ndupl) ;

                if (method == 10)
                { 
                    // multiple fine tasks operate on a single vector C(:,kC)
                    // using method 10, and so a post-sort is needed.
                    post_sort = true ;
                }

                if (method == 1 || method == 2 || method == 6)
                {

                    // slice I for this task
                    nfine = GB_IMIN (nfine, nI) ;
                    nfine = GB_IMAX (nfine, 1) ;

                    for (int tfine = 0 ; tfine < nfine ; tfine++)
                    { 
                        // flag this as a fine task, and record the method.
                        // Methods 1, 2, and 6 slice I, not A(:,kA).
                        TaskList [ntasks].kfirst = k ;
                        TaskList [ntasks].klast  = -method ;
                        // do not partition A(:,kA)
                        TaskList [ntasks].pA     = pA ;
                        TaskList [ntasks].pA_end = pA_end ;
                        // partition I for this task
                        GB_PARTITION (TaskList [ntasks].pB,
                            TaskList [ntasks].pB_end, nI, tfine, nfine) ;
                        // unused
                        TaskList [ntasks].pM = -1 ;
                        TaskList [ntasks].pM_end = -1 ;
                        // no post sort
                        TaskList [ntasks].len = 0 ;
                        ntasks++ ;
                    }

                }
                else
                {

                    // slice A(:,kA) for this task
                    nfine = GB_IMIN (nfine, alen) ;
                    nfine = GB_IMAX (nfine, 1) ;

                    bool reverse = (method == 8 || method == 9) ;

                    for (int tfine = 0 ; tfine < nfine ; tfine++)
                    { 
                        // flag this as a fine task, and record the method.
                        // These methods slice A(:,kA).  Methods 8 and 9
                        // must do so in reverse order.
                        TaskList [ntasks].kfirst = k ;
                        TaskList [ntasks].klast  = -method ;
                        // partition the items for this task
                        GB_PARTITION (TaskList [ntasks].pA,
                            TaskList [ntasks].pA_end, alen,
                            (reverse) ? (nfine-tfine-1) : tfine, nfine) ;
                        TaskList [ntasks].pA     += pA ;
                        TaskList [ntasks].pA_end += pA ;
                        // do not partition I
                        TaskList [ntasks].pB     = 0 ;
                        TaskList [ntasks].pB_end = nI ;
                        // unused
                        TaskList [ntasks].pM = -1 ;
                        TaskList [ntasks].pM_end = -1 ;

                        // flag the task that does the post sort
                        TaskList [ntasks].len = (tfine == 0 && method == 10) ;
                        ntasks++ ;
                    }
                }
            }
        }
    }

    ASSERT (ntasks > 0) ;

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORK ;
    (*p_TaskList    ) = TaskList ;
    (*p_TaskList_size) = TaskList_size ;
    (*p_ntasks      ) = ntasks ;
    (*p_nthreads    ) = nthreads ;
    (*p_post_sort   ) = post_sort ;
    (*p_Mark        ) = Mark ;
    (*p_Mark_size   ) = Mark_size ;
    (*p_Inext       ) = Inext ;
    (*p_Inext_size  ) = Inext_size ;
    (*p_nduplicates ) = ndupl ;
    return (GrB_SUCCESS) ;
}