1 //------------------------------------------------------------------------------
2 // GB_subref_slice: construct coarse/fine tasks for C = A(I,J)
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // Determine the tasks for computing C=A(I,J).  The matrix C has Cnvec vectors,
11 // and these are divided into coarse and fine tasks.  A coarse task will
12 // compute one or more whole vectors of C.  A fine task operates on a slice of
13 // a single vector of C.  The slice can be done by the # of entries in the
14 // corresponding vector of A, or by the list of indices I, depending on how the
15 // work is done for that method.
16 
17 // The (kC)th vector will access A(imin:imax,kA) in Ai,Ax [pA:pA_end-1], where
18 // pA = Ap_start [kC] and pA_end = Ap_end [kC].
19 
// The computation of each vector C(:,kC) = A(I,kA) is done using one of 12
// different cases, depending on the vector, as determined by GB_subref_method.
22 // Not all vectors in C are computed using the same method.
23 
24 // Note that J can have duplicates.  kC is unique (0:Cnvec-1) but the
25 // corresponding vector kA in A may repeat, if J has duplicates.  Duplicates in
// J are not exploited, since the coarse/fine tasks are constructed by
// slicing the list of vectors Ch of size Cnvec, not the vectors of A.
28 
29 // Compare this function with GB_ewise_slice, which constructs coarse/fine
30 // tasks for the eWise operations (C=A+B, C=A.*B, and C<M>=Z).
31 
// GB_FREE_WORK: free the internal workspace only (the Coarse partition and
// the Cwork per-vector work counts), leaving the outputs (TaskList, Mark,
// Inext) intact so they can be returned to the caller.
#define GB_FREE_WORK                            \
{                                               \
    GB_WERK_POP (Coarse, int64_t) ;             \
    GB_FREE_WERK (&Cwork, Cwork_size) ;         \
}

// GB_FREE_ALL: free the workspace and all output arrays; used only on the
// error (out-of-memory) paths, where nothing is returned to the caller.
#define GB_FREE_ALL                             \
{                                               \
    GB_FREE_WORK ;                              \
    GB_FREE_WERK (&TaskList, TaskList_size) ;   \
    GB_FREE_WERK (&Mark, Mark_size) ;           \
    GB_FREE_WERK (&Inext, Inext_size) ;         \
}
45 
46 #include "GB_subref.h"
47 
GrB_Info GB_subref_slice
(
    // output:
    GB_task_struct **p_TaskList,    // array of structs
    size_t *p_TaskList_size,        // size of TaskList
    int *p_ntasks,                  // # of tasks constructed
    int *p_nthreads,                // # of threads for subref operation
    bool *p_post_sort,              // true if a final post-sort is needed
    int64_t *restrict *p_Mark,      // for I inverse, if needed; size avlen
    size_t *p_Mark_size,
    int64_t *restrict *p_Inext,     // for I inverse, if needed; size nI
    size_t *p_Inext_size,
    int64_t *p_nduplicates,         // # of duplicates, if I inverse computed
    // from phase0:
    const int64_t *restrict Ap_start,   // location of A(imin:imax,kA)
    const int64_t *restrict Ap_end,
    const int64_t Cnvec,            // # of vectors of C
    const bool need_qsort,          // true if C must be sorted
    const int Ikind,                // GB_ALL, GB_RANGE, GB_STRIDE or GB_LIST
    const int64_t nI,               // length of I
    const int64_t Icolon [3],       // for GB_RANGE and GB_STRIDE
    // original input:
    const int64_t avlen,            // A->vlen
    const int64_t anz,              // nnz (A)
    const GrB_Index *I,
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (p_TaskList != NULL) ;
    ASSERT (p_TaskList_size != NULL) ;
    ASSERT (p_ntasks != NULL) ;
    ASSERT (p_nthreads != NULL) ;
    ASSERT (p_post_sort != NULL) ;
    ASSERT (p_Mark  != NULL) ;
    ASSERT (p_Inext != NULL) ;
    ASSERT (p_nduplicates != NULL) ;

    // Ap_start and Ap_end are NULL if and only if C has no vectors
    ASSERT ((Cnvec > 0) == (Ap_start != NULL)) ;
    ASSERT ((Cnvec > 0) == (Ap_end != NULL)) ;

    // clear the outputs so that GB_FREE_ALL on an error path is safe
    (*p_TaskList) = NULL ;
    (*p_TaskList_size) = 0 ;
    (*p_Mark    ) = NULL ;
    (*p_Inext   ) = NULL ;

    int64_t *restrict Mark  = NULL ; size_t Mark_size = 0 ;
    int64_t *restrict Inext = NULL ; size_t Inext_size = 0 ;

    // Cwork [k] will hold the estimated work for vector k, later replaced by
    // its cumulative sum
    int64_t *restrict Cwork = NULL ; size_t Cwork_size = 0 ;
    GB_WERK_DECLARE (Coarse, int64_t) ;     // size ntasks1+1
    int ntasks1 = 0 ;

    GrB_Info info ;

    //--------------------------------------------------------------------------
    // determine # of threads to use
    //--------------------------------------------------------------------------

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;

    //--------------------------------------------------------------------------
    // allocate the initial TaskList
    //--------------------------------------------------------------------------

    // Allocate the TaskList to hold at least ntasks0 tasks.  It will grow
    // later, if needed.  Usually, 32*nthreads_max is enough, but in a few
    // cases fine tasks can cause this number to be exceeded.  If that occurs,
    // TaskList is reallocated.

    // NOTE(review): no mask is used in subref; an earlier comment here about
    // "when the mask is present" appears to have been copied from a related
    // slicing routine (compare GB_ewise_slice) -- confirm against history.

    GB_task_struct *restrict TaskList = NULL ; size_t TaskList_size = 0 ;
    int max_ntasks = 0 ;
    int ntasks0 = (nthreads_max == 1) ? 1 : (32 * nthreads_max) ;
    GB_REALLOC_TASK_WERK (TaskList, ntasks0, max_ntasks) ;

    //--------------------------------------------------------------------------
    // determine if I_inverse can be constructed
    //--------------------------------------------------------------------------

    // I_inverse_ok is true if I might be inverted.  If false, then I will not
    // be inverted.  I can be inverted only if the workspace for the inverse
    // does not exceed nnz(A).  Note that if I was provided on input as an
    // explicit list, but consists of a contiguous range imin:imax, then Ikind
    // is now GB_LIST and the list I is ignored.

    // If I_inverse_ok is true, the inverse of I might still not be needed.
    // need_I_inverse becomes true if any C(:,kC) = A (I,kA) computation
    // requires I inverse.

    // workspace limit: at least 4096 entries, or up to nnz(A)
    int64_t I_inverse_limit = GB_IMAX (4096, anz) ;
    bool I_inverse_ok = (Ikind == GB_LIST &&
        ((nI > avlen / 256) || ((nI + avlen) < I_inverse_limit))) ;
    bool need_I_inverse = false ;
    bool post_sort = false ;
    int64_t iinc = Icolon [GxB_INC] ;

    //--------------------------------------------------------------------------
    // allocate workspace
    //--------------------------------------------------------------------------

    // Cnvec+1 entries: one per vector of C, plus one for the cumulative sum
    Cwork = GB_MALLOC_WERK (Cnvec+1, int64_t, &Cwork_size) ;
    if (Cwork == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //--------------------------------------------------------------------------
    // estimate the work required for each vector of C
    //--------------------------------------------------------------------------

    int nthreads_for_Cwork = GB_nthreads (Cnvec, chunk, nthreads_max) ;

    int64_t kC ;
    #pragma omp parallel for num_threads(nthreads_for_Cwork) schedule(static) \
        reduction(||:need_I_inverse)
    for (kC = 0 ; kC < Cnvec ; kC++)
    {
        // jC is the (kC)th vector of C = A(I,J)
        // int64_t jC = GBH (Ch, kC) ;
        // C(:,kC) = A(I,kA) will be constructed
        int64_t pA      = Ap_start [kC] ;
        int64_t pA_end  = Ap_end   [kC] ;
        int64_t alen = pA_end - pA ;      // nnz (A (imin:imax,j))

        int64_t work ;              // amount of work for C(:,kC) = A (I,kA)
        bool this_needs_I_inverse ; // true if this vector needs I inverse

        // ndupl in I not yet known; it is found when I is inverted.  For
        // now, assume I has no duplicate entries.  All that is needed for now
        // is the work required for each C(:,kC), and whether or not I inverse
        // must be created.  The # of duplicates has no impact on the I inverse
        // decision, and a minor effect on the work (which is ignored).

        GB_subref_method (&work, &this_needs_I_inverse, alen, avlen,
            Ikind, nI, I_inverse_ok, need_qsort, iinc, 0) ;

        // log the result (OR-reduced across all vectors of C)
        need_I_inverse = need_I_inverse || this_needs_I_inverse ;
        Cwork [kC] = work ;
    }

    //--------------------------------------------------------------------------
    // replace Cwork with its cumulative sum
    //--------------------------------------------------------------------------

    // after this, Cwork [k] is the start of the work for vector k, and
    // Cwork [Cnvec] is the total work for all of C
    GB_cumsum (Cwork, Cnvec, NULL, nthreads_for_Cwork, Context) ;
    double cwork = (double) Cwork [Cnvec] ;

    //--------------------------------------------------------------------------
    // determine # of threads and tasks to use for C=A(I,J)
    //--------------------------------------------------------------------------

    int nthreads = GB_nthreads (cwork, chunk, nthreads_max) ;

    ntasks1 = (nthreads == 1) ? 1 : (32 * nthreads) ;
    // target_task_size: ideal work per task, never smaller than one chunk
    double target_task_size = cwork / (double) (ntasks1) ;
    target_task_size = GB_IMAX (target_task_size, chunk) ;

    //--------------------------------------------------------------------------
    // invert I if required
    //--------------------------------------------------------------------------

    int64_t ndupl = 0 ;
    if (need_I_inverse)
    {
        // constructs Mark (size avlen) and Inext (size nI); both are
        // returned to the caller on success
        GB_OK (GB_I_inverse (I, nI, avlen, &Mark, &Mark_size,
            &Inext, &Inext_size, &ndupl, Context)) ;
        ASSERT (Mark != NULL) ;
        ASSERT (Inext != NULL) ;
    }

    //--------------------------------------------------------------------------
    // check for quick return for a single task
    //--------------------------------------------------------------------------

    if (Cnvec == 0 || ntasks1 == 1)
    {
        // construct a single coarse task that computes all of C
        // (TaskList has room for at least ntasks0 >= 1 tasks)
        TaskList [0].kfirst = 0 ;
        TaskList [0].klast  = Cnvec-1 ;

        // free workspace and return result
        GB_FREE_WORK ;
        (*p_TaskList   ) = TaskList ;
        (*p_TaskList_size) = TaskList_size ;
        (*p_ntasks     ) = (Cnvec == 0) ? 0 : 1 ;
        (*p_nthreads   ) = 1 ;
        (*p_post_sort  ) = false ;
        (*p_Mark       ) = Mark ;
        (*p_Mark_size  ) = Mark_size ;
        (*p_Inext      ) = Inext ;
        (*p_Inext_size ) = Inext_size ;
        (*p_nduplicates) = ndupl ;
        return (GrB_SUCCESS) ;
    }

    //--------------------------------------------------------------------------
    // slice the work into coarse tasks
    //--------------------------------------------------------------------------

    GB_WERK_PUSH (Coarse, ntasks1 + 1, int64_t) ;
    if (Coarse == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }
    // Coarse [t] .. Coarse [t+1]-1 is the initial slice of vectors for task t,
    // balanced by the cumulative work counts in Cwork
    GB_pslice (Coarse, Cwork, Cnvec, ntasks1, false) ;

    //--------------------------------------------------------------------------
    // construct all tasks, both coarse and fine
    //--------------------------------------------------------------------------

    int ntasks = 0 ;

    for (int t = 0 ; t < ntasks1 ; t++)
    {

        //----------------------------------------------------------------------
        // coarse task computes C (:,k:klast)
        //----------------------------------------------------------------------

        int64_t k = Coarse [t] ;
        int64_t klast = Coarse [t+1] - 1 ;

        if (k >= Cnvec)
        {

            //------------------------------------------------------------------
            // all tasks have been constructed
            //------------------------------------------------------------------

            break ;

        }
        else if (k < klast)
        {

            //------------------------------------------------------------------
            // coarse task has 2 or more vectors
            //------------------------------------------------------------------

            // This is a non-empty coarse-grain task that does two or more
            // entire vectors of C, vectors k:klast, inclusive.
            GB_REALLOC_TASK_WERK (TaskList, ntasks + 1, max_ntasks) ;
            TaskList [ntasks].kfirst = k ;
            TaskList [ntasks].klast  = klast ;
            ntasks++ ;

        }
        else
        {

            //------------------------------------------------------------------
            // coarse task has 0 or 1 vectors
            //------------------------------------------------------------------

            // As a coarse-grain task, this task is empty or does a single
            // vector, k.  Vector k must be removed from the work done by this
            // and any other coarse-grain task, and split into one or more
            // fine-grain tasks.

            for (int tt = t ; tt < ntasks1 ; tt++)
            {
                // remove k from the initial slice tt
                if (Coarse [tt] == k)
                {
                    // remove k from task tt
                    Coarse [tt] = k+1 ;
                }
                else
                {
                    // break, k not in task tt
                    break ;
                }
            }

            //------------------------------------------------------------------
            // determine the # of fine-grain tasks to create for vector k
            //------------------------------------------------------------------

            // ckwork: work for vector k alone (difference of cumulative sums);
            // the double-to-int conversion truncates toward zero
            double ckwork = Cwork [k+1] - Cwork [k] ;
            int nfine = ckwork / target_task_size ;
            nfine = GB_IMAX (nfine, 1) ;

            // make the TaskList bigger, if needed
            GB_REALLOC_TASK_WERK (TaskList, ntasks + nfine, max_ntasks) ;

            //------------------------------------------------------------------
            // create the fine-grain tasks
            //------------------------------------------------------------------

            if (nfine == 1)
            {

                //--------------------------------------------------------------
                // this is a single coarse task for all of vector k
                //--------------------------------------------------------------

                TaskList [ntasks].kfirst = k ;
                TaskList [ntasks].klast  = k ;
                ntasks++ ;

            }
            else
            {

                //--------------------------------------------------------------
                // slice vector k into nfine fine tasks
                //--------------------------------------------------------------

                // There are two kinds of fine tasks, depending on the method
                // used to compute C(:,kC) = A(I,kA).  If the method iterates
                // across all entries in A(imin:imax,kA), then those entries
                // are sliced (of size alen).  Three methods (1, 2, and 6)
                // iterate across all entries in I instead (of size nI).

                int64_t pA     = Ap_start [k] ;
                int64_t pA_end = Ap_end   [k] ;
                int64_t alen = pA_end - pA ;      // nnz (A (imin:imax,j))

                // ndupl is now known, so the method choice here may differ
                // from the estimate made in the Cwork pass above
                int method = GB_subref_method (NULL, NULL, alen, avlen,
                    Ikind, nI, I_inverse_ok, need_qsort, iinc, ndupl) ;

                if (method == 10)
                {
                    // multiple fine tasks operate on a single vector C(:,kC)
                    // using method 10, and so a post-sort is needed.
                    post_sort = true ;
                }

                if (method == 1 || method == 2 || method == 6)
                {

                    // slice I for this task
                    nfine = GB_IMIN (nfine, nI) ;
                    nfine = GB_IMAX (nfine, 1) ;

                    for (int tfine = 0 ; tfine < nfine ; tfine++)
                    {
                        // flag this as a fine task, and record the method.
                        // Methods 1, 2, and 6 slice I, not A(:,kA)
                        TaskList [ntasks].kfirst = k ;
                        TaskList [ntasks].klast = -method ;
                        // do not partition A(:,kA)
                        TaskList [ntasks].pA = pA ;
                        TaskList [ntasks].pA_end = pA_end ;
                        // partition I for this task
                        GB_PARTITION (TaskList [ntasks].pB,
                            TaskList [ntasks].pB_end, nI, tfine, nfine) ;
                        // unused
                        TaskList [ntasks].pM = -1 ;
                        TaskList [ntasks].pM_end = -1 ;
                        // no post sort
                        TaskList [ntasks].len = 0 ;
                        ntasks++ ;
                    }

                }
                else
                {

                    // slice A(:,kA) for this task
                    nfine = GB_IMIN (nfine, alen) ;
                    nfine = GB_IMAX (nfine, 1) ;

                    bool reverse = (method == 8 || method == 9) ;

                    for (int tfine = 0 ; tfine < nfine ; tfine++)
                    {
                        // flag this as a fine task, and record the method.
                        // These methods slice A(:,kA).  Methods 8 and 9
                        // must do so in reverse order.
                        TaskList [ntasks].kfirst = k ;
                        TaskList [ntasks].klast = -method ;
                        // partition the items for this task; GB_PARTITION
                        // gives offsets relative to 0, so shift by pA to get
                        // absolute positions in Ai,Ax
                        GB_PARTITION (TaskList [ntasks].pA,
                            TaskList [ntasks].pA_end, alen,
                            (reverse) ? (nfine-tfine-1) : tfine, nfine) ;
                        TaskList [ntasks].pA += pA ;
                        TaskList [ntasks].pA_end += pA  ;
                        // do not partition I
                        TaskList [ntasks].pB = 0 ;
                        TaskList [ntasks].pB_end = nI ;
                        // unused
                        TaskList [ntasks].pM = -1 ;
                        TaskList [ntasks].pM_end = -1 ;

                        // flag the task that does the post sort
                        TaskList [ntasks].len = (tfine == 0 && method == 10) ;
                        ntasks++ ;
                    }
                }
            }
        }
    }

    ASSERT (ntasks > 0) ;

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORK ;
    (*p_TaskList   ) = TaskList ;
    (*p_TaskList_size) = TaskList_size ;
    (*p_ntasks     ) = ntasks ;
    (*p_nthreads   ) = nthreads ;
    (*p_post_sort  ) = post_sort ;
    (*p_Mark       ) = Mark ;
    (*p_Mark_size  ) = Mark_size ;
    (*p_Inext      ) = Inext ;
    (*p_Inext_size ) = Inext_size ;
    (*p_nduplicates) = ndupl ;
    return (GrB_SUCCESS) ;
}
473 
474