1 //------------------------------------------------------------------------------
2 // GB_AxB_saxpy3_slice_balanced: construct balanced tasks for GB_AxB_saxpy3
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // If the mask is present but must be discarded, this function returns
11 // GrB_NO_VALUE, to indicate that the analysis was terminated early.
12 
13 #include "GB_AxB_saxpy3.h"
14 
15 // control parameters for generating parallel tasks
16 #define GB_NTASKS_PER_THREAD 2
17 #define GB_COSTLY 1.2
18 #define GB_FINE_WORK 2
19 #define GB_MWORK_ALPHA 0.01
20 #define GB_MWORK_BETA 0.10
21 
22 #define GB_FREE_WORK                        \
23 {                                           \
24     GB_WERK_POP (Fine_fl, int64_t) ;        \
25     GB_WERK_POP (Fine_slice, int64_t) ;     \
26     GB_WERK_POP (Coarse_Work, int64_t) ;    \
27     GB_WERK_POP (Coarse_initial, int64_t) ; \
28 }
29 
30 #define GB_FREE_ALL                                 \
31 {                                                   \
32     GB_FREE_WORK ;                                  \
33     GB_FREE_WERK (&SaxpyTasks, SaxpyTasks_size) ;   \
34 }
35 
36 //------------------------------------------------------------------------------
37 // GB_hash_table_size
38 //------------------------------------------------------------------------------
39 
40 // flmax is the max flop count for computing A*B(:,j), for any vector j that
41 // this task computes.  If the mask M is present, flmax also includes the
42 // number of entries in M(:,j).  GB_hash_table_size determines the hash table
43 // size for this task, which is twice the smallest power of 2 larger than
44 // flmax.  If flmax is large enough, the hash_size is returned as cvlen, so
45 // that Gustavson's method will be used instead of the Hash method.
46 
47 // By default, Gustavson vs Hash is selected automatically.  AxB_method can be
48 // selected via the descriptor or a global setting, as the non-default
49 // GxB_AxB_GUSTAVSON or GxB_AxB_HASH settings, to enforce the selection of
50 // either of those methods.  However, if Hash is selected but the hash table
51 // equals or exceeds cvlen, then Gustavson's method is used instead.
52 
GB_hash_table_size(int64_t flmax,int64_t cvlen,const GrB_Desc_Value AxB_method)53 static inline int64_t GB_hash_table_size
54 (
55     int64_t flmax,      // max flop count for any vector computed by this task
56     int64_t cvlen,      // vector length of C
57     const GrB_Desc_Value AxB_method     // Default, Gustavson, or Hash
58 )
59 {
60     int64_t hash_size ;
61 
62     if (AxB_method == GxB_AxB_GUSTAVSON || flmax >= cvlen/2)
63     {
64 
65         //----------------------------------------------------------------------
66         // use Gustavson if selected explicitly or if flmax is large
67         //----------------------------------------------------------------------
68 
69         hash_size = cvlen ;
70 
71     }
72     else
73     {
74 
75         //----------------------------------------------------------------------
76         // flmax is small; consider hash vs Gustavson
77         //----------------------------------------------------------------------
78 
79         // hash_size = 2 * (smallest power of 2 >= flmax)
80         hash_size = ((uint64_t) 2) << (GB_FLOOR_LOG2 (flmax) + 1) ;
81         bool use_Gustavson ;
82         if (AxB_method == GxB_AxB_HASH)
83         {
84             // always use Hash method, unless the hash_size >= cvlen
85             use_Gustavson = (hash_size >= cvlen) ;
86         }
87         else
88         {
89             // default: auto selection:
90             // use Gustavson's method if hash_size is too big
91             use_Gustavson = (hash_size >= cvlen/12) ;
92         }
93         if (use_Gustavson)
94         {
95             hash_size = cvlen ;
96         }
97     }
98 
99     //--------------------------------------------------------------------------
100     // return result
101     //--------------------------------------------------------------------------
102 
103     return (hash_size) ;
104 }
105 
106 //------------------------------------------------------------------------------
107 // GB_create_coarse_task: create a single coarse task
108 //------------------------------------------------------------------------------
109 
110 // Compute the max flop count for any vector in a coarse task, determine the
111 // hash table size, and construct the coarse task.
112 
GB_create_coarse_task(int64_t kfirst,int64_t klast,GB_saxpy3task_struct * SaxpyTasks,int taskid,int64_t * Bflops,int64_t cvlen,double chunk,int nthreads_max,int64_t * Coarse_Work,const GrB_Desc_Value AxB_method)113 static inline void GB_create_coarse_task
114 (
115     int64_t kfirst,     // coarse task consists of vectors kfirst:klast
116     int64_t klast,
117     GB_saxpy3task_struct *SaxpyTasks,
118     int taskid,         // taskid for this coarse task
119     int64_t *Bflops,    // size bnvec; cum sum of flop counts for vectors of B
120     int64_t cvlen,      // vector length of B and C
121     double chunk,
122     int nthreads_max,
123     int64_t *Coarse_Work,   // workspace for parallel reduction for flop count
124     const GrB_Desc_Value AxB_method     // Default, Gustavson, or Hash
125 )
126 {
127 
128     //--------------------------------------------------------------------------
129     // find the max # of flops for any vector in this task
130     //--------------------------------------------------------------------------
131 
132     int64_t nk = klast - kfirst + 1 ;
133     int nth = GB_nthreads (nk, chunk, nthreads_max) ;
134 
135     // each thread finds the max flop count for a subset of the vectors
136     int tid ;
137     #pragma omp parallel for num_threads(nth) schedule(static)
138     for (tid = 0 ; tid < nth ; tid++)
139     {
140         int64_t my_flmax = 1, istart, iend ;
141         GB_PARTITION (istart, iend, nk, tid, nth) ;
142         for (int64_t i = istart ; i < iend ; i++)
143         {
144             int64_t kk = kfirst + i ;
145             int64_t fl = Bflops [kk+1] - Bflops [kk] ;
146             my_flmax = GB_IMAX (my_flmax, fl) ;
147         }
148         Coarse_Work [tid] = my_flmax ;
149     }
150 
151     // combine results from each thread
152     int64_t flmax = 1 ;
153     for (tid = 0 ; tid < nth ; tid++)
154     {
155         flmax = GB_IMAX (flmax, Coarse_Work [tid]) ;
156     }
157 
158     // check the parallel computation
159     #ifdef GB_DEBUG
160     int64_t flmax2 = 1 ;
161     for (int64_t kk = kfirst ; kk <= klast ; kk++)
162     {
163         int64_t fl = Bflops [kk+1] - Bflops [kk] ;
164         flmax2 = GB_IMAX (flmax2, fl) ;
165     }
166     ASSERT (flmax == flmax2) ;
167     #endif
168 
169     //--------------------------------------------------------------------------
170     // define the coarse task
171     //--------------------------------------------------------------------------
172 
173     SaxpyTasks [taskid].start  = kfirst ;
174     SaxpyTasks [taskid].end    = klast ;
175     SaxpyTasks [taskid].vector = -1 ;
176     SaxpyTasks [taskid].hsize  = GB_hash_table_size (flmax, cvlen, AxB_method) ;
177     SaxpyTasks [taskid].Hi     = NULL ;      // assigned later
178     SaxpyTasks [taskid].Hf     = NULL ;      // assigned later
179     SaxpyTasks [taskid].Hx     = NULL ;      // assigned later
180     SaxpyTasks [taskid].my_cjnz = 0 ;        // for fine tasks only
181     SaxpyTasks [taskid].leader  = taskid ;
182     SaxpyTasks [taskid].team_size = 1 ;
183 }
184 
185 //------------------------------------------------------------------------------
186 // GB_AxB_saxpy3_slice_balanced: create balanced tasks for saxpy3
187 //------------------------------------------------------------------------------
188 
GB_AxB_saxpy3_slice_balanced(GrB_Matrix C,const GrB_Matrix M,const bool Mask_comp,const GrB_Matrix A,const GrB_Matrix B,GrB_Desc_Value AxB_method,GB_saxpy3task_struct ** SaxpyTasks_handle,size_t * SaxpyTasks_size_handle,bool * apply_mask,bool * M_packed_in_place,int * ntasks,int * nfine,int * nthreads,GB_Context Context)189 GrB_Info GB_AxB_saxpy3_slice_balanced
190 (
191     // inputs
192     GrB_Matrix C,                   // output matrix
193     const GrB_Matrix M,             // optional mask matrix
194     const bool Mask_comp,           // if true, use !M
195     const GrB_Matrix A,             // input matrix A
196     const GrB_Matrix B,             // input matrix B
197     GrB_Desc_Value AxB_method,      // Default, Gustavson, or Hash
198     // outputs
199     GB_saxpy3task_struct **SaxpyTasks_handle,
200     size_t *SaxpyTasks_size_handle,
201     bool *apply_mask,               // if true, apply M during sapxy3
202     bool *M_packed_in_place,        // if true, use M in-place
203     int *ntasks,                    // # of tasks created (coarse and fine)
204     int *nfine,                     // # of fine tasks created
205     int *nthreads,                  // # of threads to use
206     GB_Context Context
207 )
208 {
209 
210     //--------------------------------------------------------------------------
211     // check inputs
212     //--------------------------------------------------------------------------
213 
214     GrB_Info info ;
215 
216     (*apply_mask) = false ;
217     (*M_packed_in_place) = false ;
218     (*ntasks) = 0 ;
219     (*nfine) = 0 ;
220     (*nthreads) = 0 ;
221 
222     ASSERT_MATRIX_OK_OR_NULL (M, "M for saxpy3_slice_balanced A*B", GB0) ;
223     ASSERT (!GB_PENDING (M)) ;
224     ASSERT (GB_JUMBLED_OK (M)) ;
225     ASSERT (!GB_ZOMBIES (M)) ;
226 
227     ASSERT_MATRIX_OK (A, "A for saxpy3_slice_balanced A*B", GB0) ;
228     ASSERT (!GB_PENDING (A)) ;
229     ASSERT (GB_JUMBLED_OK (A)) ;
230     ASSERT (!GB_ZOMBIES (A)) ;
231 
232     ASSERT_MATRIX_OK (B, "B for saxpy3_slice_balanced A*B", GB0) ;
233     ASSERT (!GB_PENDING (B)) ;
234     ASSERT (GB_JUMBLED_OK (B)) ;
235     ASSERT (!GB_ZOMBIES (B)) ;
236 
237     //--------------------------------------------------------------------------
238     // determine the # of threads to use
239     //--------------------------------------------------------------------------
240 
241     GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
242 
243     //--------------------------------------------------------------------------
244     // define result and workspace
245     //--------------------------------------------------------------------------
246 
247     GB_saxpy3task_struct *restrict SaxpyTasks = NULL ;
248     size_t SaxpyTasks_size = 0 ;
249 
250     GB_WERK_DECLARE (Coarse_initial, int64_t) ; // initial coarse tasks
251     GB_WERK_DECLARE (Coarse_Work, int64_t) ;    // workspace for flop counts
252     GB_WERK_DECLARE (Fine_slice, int64_t) ;
253     GB_WERK_DECLARE (Fine_fl, int64_t) ;        // size max(nnz(B(:,j)))
254 
255     //--------------------------------------------------------------------------
256     // get A, and B
257     //--------------------------------------------------------------------------
258 
259     const int64_t *restrict Ap = A->p ;
260     const int64_t *restrict Ah = A->h ;
261     const int64_t avlen = A->vlen ;
262     const int64_t anvec = A->nvec ;
263     const bool A_is_hyper = GB_IS_HYPERSPARSE (A) ;
264 
265     const int64_t *restrict Bp = B->p ;
266     const int64_t *restrict Bh = B->h ;
267     const int8_t  *restrict Bb = B->b ;
268     const int64_t *restrict Bi = B->i ;
269     const int64_t bvdim = B->vdim ;
270     const int64_t bnz = GB_NNZ_HELD (B) ;
271     const int64_t bnvec = B->nvec ;
272     const int64_t bvlen = B->vlen ;
273     const bool B_is_hyper = GB_IS_HYPERSPARSE (B) ;
274 
275     int64_t cvlen = avlen ;
276     int64_t cvdim = bvdim ;
277 
278     //--------------------------------------------------------------------------
279     // compute flop counts for each vector of B and C
280     //--------------------------------------------------------------------------
281 
282     int64_t Mwork = 0 ;
283     int64_t *restrict Bflops = C->p ;    // use C->p as workspace for Bflops
284     GB_OK (GB_AxB_saxpy3_flopcount (&Mwork, Bflops, M, Mask_comp, A, B,
285         Context)) ;
286     int64_t total_flops = Bflops [bnvec] ;
287     double axbflops = total_flops - Mwork ;
288     GBURBLE ("axbwork %g ", axbflops) ;
289     if (Mwork > 0) GBURBLE ("mwork %g ", (double) Mwork) ;
290 
291     //--------------------------------------------------------------------------
292     // determine if the mask M should be applied, or done later
293     //--------------------------------------------------------------------------
294 
295     if (M == NULL)
296     {
297 
298         //----------------------------------------------------------------------
299         // M is not present
300         //----------------------------------------------------------------------
301 
302         (*apply_mask) = false ;
303 
304     }
305     else if (GB_is_packed (M))
306     {
307 
308         //----------------------------------------------------------------------
309         // M is present and full, bitmap, or sparse/hyper with all entries
310         //----------------------------------------------------------------------
311 
312         // Choose all-hash or all-Gustavson tasks, and apply M during saxpy3.
313 
314         (*apply_mask) = true ;
315 
316         // The work for M has not yet been added Bflops.
317         // Each vector M(:,j) has cvlen entries.
318         Mwork = cvlen * cvdim ;
319 
320         if (!(AxB_method == GxB_AxB_HASH || AxB_method == GxB_AxB_GUSTAVSON))
321         {
322             if (axbflops < (double) Mwork * GB_MWORK_BETA)
323             {
324                 // The mask is too costly to scatter into the Hf workspace.
325                 // Leave it in place and use all-hash tasks.
326                 AxB_method = GxB_AxB_HASH ;
327             }
328             else
329             {
330                 // Scatter M into Hf and use all-Gustavson tasks.
331                 AxB_method = GxB_AxB_GUSTAVSON ;
332             }
333         }
334 
335         if (AxB_method == GxB_AxB_HASH)
336         {
337             // Use the hash method for all tasks (except for those tasks which
338             // require a hash table size >= cvlen; those tasks use Gustavson).
339             // Do not scatter the mask into the Hf hash workspace.  The work
340             // for the mask is not accounted for in Bflops, so the hash tables
341             // can be small.
342             (*M_packed_in_place) = true ;
343             GBURBLE ("(use packed mask in-place) ") ;
344         }
345         else
346         {
347             // Use the Gustavson method for all tasks, and scatter M into the
348             // fine Gustavson workspace.  The work for M is not yet in the
349             // Bflops cumulative sum.  Add it now.
350             ASSERT (AxB_method == GxB_AxB_GUSTAVSON)
351             int nth = GB_nthreads (bnvec, chunk, nthreads_max) ;
352             int64_t kk ;
353             #pragma omp parallel for num_threads(nth) schedule(static)
354             for (kk = 0 ; kk <= bnvec ; kk++)
355             {
356                 Bflops [kk] += cvlen * (kk+1) ;
357             }
358             total_flops = Bflops [bnvec] ;
359             GBURBLE ("(use packed mask) ") ;
360         }
361 
362     }
363     else if (axbflops < ((double) Mwork * GB_MWORK_ALPHA))
364     {
365 
366         //----------------------------------------------------------------------
367         // M is costly to use; apply it after C=A*B
368         //----------------------------------------------------------------------
369 
370         // Do not use M during the computation of A*B.  Instead, compute C=A*B
371         // and then apply the mask later.  Tell the caller that the mask should
372         // not be applied, so that it will be applied later in GB_mxm.
373 
374         (*apply_mask) = false ;
375         GBURBLE ("(discard mask) ") ;
376         GB_FREE_ALL ;
377         return (GrB_NO_VALUE) ;
378 
379     }
380     else
381     {
382 
383         //----------------------------------------------------------------------
384         // use M during saxpy3
385         //----------------------------------------------------------------------
386 
387         (*apply_mask) = true ;
388         GBURBLE ("(use mask) ") ;
389     }
390 
391     //--------------------------------------------------------------------------
392     // determine # of threads and # of initial coarse tasks
393     //--------------------------------------------------------------------------
394 
395     (*nthreads) = GB_nthreads ((double) total_flops, chunk, nthreads_max) ;
396     int ntasks_initial = ((*nthreads) == 1) ? 1 :
397         (GB_NTASKS_PER_THREAD * (*nthreads)) ;
398 
399     //--------------------------------------------------------------------------
400     // give preference to Gustavson when using few threads
401     //--------------------------------------------------------------------------
402 
403     if ((*nthreads) <= 8 &&
404         (!(AxB_method == GxB_AxB_HASH || AxB_method == GxB_AxB_GUSTAVSON)))
405     {
406         // Unless a specific method has been explicitly requested, see if
407         // Gustavson should be used with a small number of threads.
408         // Matrix-vector has a maximum intensity of 1, so this heuristic only
409         // applies to GrB_mxm.
410         double abnz = GB_NNZ (A) + GB_NNZ (B) + 1 ;
411         double workspace = (double) ntasks_initial * (double) cvlen ;
412         double intensity = total_flops / abnz ;
413         GBURBLE ("(intensity: %0.3g workspace/(nnz(A)+nnz(B)): %0.3g",
414             intensity, workspace / abnz) ;
415         if (intensity >= 8 && workspace < abnz)
416         {
417             // work intensity is large, and Gustvason workspace is modest;
418             // use Gustavson for all tasks
419             AxB_method = GxB_AxB_GUSTAVSON ;
420             GBURBLE (": select Gustvason) ") ;
421         }
422         else
423         {
424             // use default task creation: mix of Hash and Gustavson
425             GBURBLE (") ") ;
426         }
427     }
428 
429     //--------------------------------------------------------------------------
430     // determine target task size
431     //--------------------------------------------------------------------------
432 
433     double target_task_size = ((double) total_flops) / ntasks_initial ;
434     target_task_size = GB_IMAX (target_task_size, chunk) ;
435     double target_fine_size = target_task_size / GB_FINE_WORK ;
436     target_fine_size = GB_IMAX (target_fine_size, chunk) ;
437 
438     //--------------------------------------------------------------------------
439     // determine # of parallel tasks
440     //--------------------------------------------------------------------------
441 
442     int ncoarse = 0 ;       // # of coarse tasks
443     int max_bjnz = 0 ;      // max (nnz (B (:,j))) of fine tasks
444 
445     // FUTURE: also use ultra-fine tasks that compute A(i1:i2,k)*B(k,j)
446 
447     if (ntasks_initial > 1)
448     {
449 
450         //----------------------------------------------------------------------
451         // construct initial coarse tasks
452         //----------------------------------------------------------------------
453 
454         GB_WERK_PUSH (Coarse_initial, ntasks_initial + 1, int64_t) ;
455         if (Coarse_initial == NULL)
456         {
457             // out of memory
458             GB_FREE_ALL ;
459             return (GrB_OUT_OF_MEMORY) ;
460         }
461         GB_pslice (Coarse_initial, Bflops, bnvec, ntasks_initial, true) ;
462 
463         //----------------------------------------------------------------------
464         // split the work into coarse and fine tasks
465         //----------------------------------------------------------------------
466 
467         for (int taskid = 0 ; taskid < ntasks_initial ; taskid++)
468         {
469             // get the initial coarse task
470             int64_t kfirst = Coarse_initial [taskid] ;
471             int64_t klast  = Coarse_initial [taskid+1] ;
472             int64_t task_ncols = klast - kfirst ;
473             int64_t task_flops = Bflops [klast] - Bflops [kfirst] ;
474 
475             if (task_ncols == 0)
476             {
477                 // This coarse task is empty, having been squeezed out by
478                 // costly vectors in adjacent coarse tasks.
479             }
480             else if (task_flops > 2 * GB_COSTLY * target_task_size)
481             {
482                 // This coarse task is too costly, because it contains one or
483                 // more costly vectors.  Split its vectors into a mixture of
484                 // coarse and fine tasks.
485 
486                 int64_t kcoarse_start = kfirst ;
487 
488                 for (int64_t kk = kfirst ; kk < klast ; kk++)
489                 {
490                     // jflops = # of flops to compute a single vector A*B(:,j)
491                     // where j == GBH (Bh, kk)
492                     double jflops = Bflops [kk+1] - Bflops [kk] ;
493                     // bjnz = nnz (B (:,j))
494                     int64_t bjnz = (Bp == NULL) ? bvlen : (Bp [kk+1] - Bp [kk]);
495 
496                     if (jflops > GB_COSTLY * target_task_size && bjnz > 1)
497                     {
498                         // A*B(:,j) is costly; split it into 2 or more fine
499                         // tasks.  First flush the prior coarse task, if any.
500                         if (kcoarse_start < kk)
501                         {
502                             // vectors kcoarse_start to kk-1 form a single
503                             // coarse task
504                             ncoarse++ ;
505                         }
506 
507                         // next coarse task (if any) starts at kk+1
508                         kcoarse_start = kk+1 ;
509 
510                         // vectors kk will be split into multiple fine tasks
511                         max_bjnz = GB_IMAX (max_bjnz, bjnz) ;
512                         int team_size = ceil (jflops / target_fine_size) ;
513                         (*nfine) += team_size ;
514                     }
515                 }
516 
517                 // flush the last coarse task, if any
518                 if (kcoarse_start < klast)
519                 {
520                     // vectors kcoarse_start to klast-1 form a single
521                     // coarse task
522                     ncoarse++ ;
523                 }
524 
525             }
526             else
527             {
528                 // This coarse task is OK as-is.
529                 ncoarse++ ;
530             }
531         }
532     }
533     else
534     {
535 
536         //----------------------------------------------------------------------
537         // entire computation in a single fine or coarse task
538         //----------------------------------------------------------------------
539 
540         if (bnvec == 1)
541         {
542             // If B is a single vector, and is computed by a single thread,
543             // then a single fine task is used.
544             (*nfine) = 1 ;
545             ncoarse = 0 ;
546         }
547         else
548         {
549             // One thread uses a single coarse task if B is not a vector.
550             (*nfine) = 0 ;
551             ncoarse = 1 ;
552         }
553     }
554 
555     (*ntasks) = ncoarse + (*nfine) ;
556 
557     //--------------------------------------------------------------------------
558     // allocate the tasks, and workspace to construct fine tasks
559     //--------------------------------------------------------------------------
560 
561     SaxpyTasks = GB_MALLOC_WERK ((*ntasks), GB_saxpy3task_struct,
562         &SaxpyTasks_size) ;
563     GB_WERK_PUSH (Coarse_Work, nthreads_max, int64_t) ;
564     if (max_bjnz > 0)
565     {
566         // also allocate workspace to construct fine tasks
567         GB_WERK_PUSH (Fine_slice, (*ntasks)+1, int64_t) ;
568         // Fine_fl will only fit on the Werk stack if max_bjnz is small,
569         // but try anyway, in case it fits.  It is placed at the top of the
570         // Werk stack.
571         GB_WERK_PUSH (Fine_fl, max_bjnz+1, int64_t) ;
572     }
573 
574     if (SaxpyTasks == NULL || Coarse_Work == NULL ||
575         (max_bjnz > 0 && (Fine_slice == NULL || Fine_fl == NULL)))
576     {
577         // out of memory
578         GB_FREE_ALL ;
579         return (GrB_OUT_OF_MEMORY) ;
580     }
581 
582     // clear SaxpyTasks
583     memset (SaxpyTasks, 0, SaxpyTasks_size) ;
584 
585     //--------------------------------------------------------------------------
586     // create the tasks
587     //--------------------------------------------------------------------------
588 
589     if (ntasks_initial > 1)
590     {
591 
592         //----------------------------------------------------------------------
593         // create the coarse and fine tasks
594         //----------------------------------------------------------------------
595 
596         int nf = 0 ;            // fine tasks have task id 0:nfine-1
597         int nc = (*nfine) ;     // coarse task ids are nfine:ntasks-1
598 
599         for (int taskid = 0 ; taskid < ntasks_initial ; taskid++)
600         {
601             // get the initial coarse task
602             int64_t kfirst = Coarse_initial [taskid] ;
603             int64_t klast  = Coarse_initial [taskid+1] ;
604             int64_t task_ncols = klast - kfirst ;
605             int64_t task_flops = Bflops [klast] - Bflops [kfirst] ;
606 
607             if (task_ncols == 0)
608             {
609                 // This coarse task is empty, having been squeezed out by
610                 // costly vectors in adjacent coarse tasks.
611             }
612             else if (task_flops > 2 * GB_COSTLY * target_task_size)
613             {
614                 // This coarse task is too costly, because it contains one or
615                 // more costly vectors.  Split its vectors into a mixture of
616                 // coarse and fine tasks.
617 
618                 int64_t kcoarse_start = kfirst ;
619 
620                 for (int64_t kk = kfirst ; kk < klast ; kk++)
621                 {
622                     // jflops = # of flops to compute a single vector A*B(:,j)
623                     double jflops = Bflops [kk+1] - Bflops [kk] ;
624                     // bjnz = nnz (B (:,j))
625                     int64_t bjnz = (Bp == NULL) ? bvlen : (Bp [kk+1] - Bp [kk]);
626 
627                     if (jflops > GB_COSTLY * target_task_size && bjnz > 1)
628                     {
629                         // A*B(:,j) is costly; split it into 2 or more fine
630                         // tasks.  First flush the prior coarse task, if any.
631                         if (kcoarse_start < kk)
632                         {
633                             // kcoarse_start:kk-1 form a single coarse task
634                             GB_create_coarse_task (kcoarse_start, kk-1,
635                                 SaxpyTasks, nc++, Bflops, cvlen, chunk,
636                                 nthreads_max, Coarse_Work, AxB_method) ;
637                         }
638 
639                         // next coarse task (if any) starts at kk+1
640                         kcoarse_start = kk+1 ;
641 
642                         // count the work for each entry B(k,j).  Do not
643                         // include the work to scan M(:,j), since that will
644                         // be evenly divided between all tasks in this team.
645                         int64_t pB_start = GBP (Bp, kk, bvlen) ;
646                         int nth = GB_nthreads (bjnz, chunk, nthreads_max) ;
647                         int64_t s ;
648                         #pragma omp parallel for num_threads(nth) \
649                             schedule(static)
650                         for (s = 0 ; s < bjnz ; s++)
651                         {
652                             // get B(k,j)
653                             Fine_fl [s] = 1 ;
654                             int64_t pB = pB_start + s ;
655                             if (!GBB (Bb, pB)) continue ;
656                             int64_t k = GBI (Bi, pB, bvlen) ;
657                             // fl = flop count for just A(:,k)*B(k,j)
658                             int64_t pA, pA_end ;
659                             int64_t pleft = 0 ;
660                             GB_lookup (A_is_hyper, Ah, Ap, avlen, &pleft,
661                                 anvec-1, k, &pA, &pA_end) ;
662                             int64_t fl = pA_end - pA ;
663                             Fine_fl [s] = fl ;
664                             ASSERT (fl >= 0) ;
665                         }
666 
667                         // cumulative sum of flops to compute A*B(:,j)
668                         GB_cumsum (Fine_fl, bjnz, NULL, nth, Context) ;
669 
670                         // slice B(:,j) into fine tasks
671                         int team_size = ceil (jflops / target_fine_size) ;
672                         ASSERT (Fine_slice != NULL) ;
673                         GB_pslice (Fine_slice, Fine_fl, bjnz, team_size, false);
674 
675                         // shared hash table for all fine tasks for A*B(:,j)
676                         int64_t hsize =
677                             GB_hash_table_size (jflops, cvlen, AxB_method) ;
678 
679                         // construct the fine tasks for C(:,j)=A*B(:,j)
680                         int leader = nf ;
681                         for (int fid = 0 ; fid < team_size ; fid++)
682                         {
683                             int64_t pstart = Fine_slice [fid] ;
684                             int64_t pend   = Fine_slice [fid+1] ;
685                             int64_t fl = Fine_fl [pend] - Fine_fl [pstart] ;
686                             SaxpyTasks [nf].start  = pB_start + pstart ;
687                             SaxpyTasks [nf].end    = pB_start + pend - 1 ;
688                             SaxpyTasks [nf].vector = kk ;
689                             SaxpyTasks [nf].hsize  = hsize ;
690                             SaxpyTasks [nf].Hi = NULL ;   // assigned later
691                             SaxpyTasks [nf].Hf = NULL ;   // assigned later
692                             SaxpyTasks [nf].Hx = NULL ;   // assigned later
693                             SaxpyTasks [nf].my_cjnz = 0 ;
694                             SaxpyTasks [nf].leader = leader ;
695                             SaxpyTasks [nf].team_size = team_size ;
696                             nf++ ;
697                         }
698                     }
699                 }
700 
701                 // flush the last coarse task, if any
702                 if (kcoarse_start < klast)
703                 {
704                     // kcoarse_start:klast-1 form a single coarse task
705                     GB_create_coarse_task (kcoarse_start, klast-1, SaxpyTasks,
706                         nc++, Bflops, cvlen, chunk, nthreads_max,
707                         Coarse_Work, AxB_method) ;
708                 }
709 
710             }
711             else
712             {
713                 // This coarse task is OK as-is.
714                 GB_create_coarse_task (kfirst, klast-1, SaxpyTasks,
715                     nc++, Bflops, cvlen, chunk, nthreads_max,
716                     Coarse_Work, AxB_method) ;
717             }
718         }
719 
720     }
721     else
722     {
723 
724         //----------------------------------------------------------------------
725         // entire computation in a single fine or coarse task
726         //----------------------------------------------------------------------
727 
728         // create a single coarse task: hash or Gustavson
729         GB_create_coarse_task (0, bnvec-1, SaxpyTasks, 0, Bflops, cvlen, 1, 1,
730             Coarse_Work, AxB_method) ;
731 
732         if (bnvec == 1)
733         {
734             // convert the single coarse task into a single fine task
735             SaxpyTasks [0].start  = 0 ;           // first entry in B(:,0)
736             SaxpyTasks [0].end = bnz - 1 ;        // last entry in B(:,0)
737             SaxpyTasks [0].vector = 0 ;
738         }
739     }
740 
741     //--------------------------------------------------------------------------
742     // free workspace and return result
743     //--------------------------------------------------------------------------
744 
745     GB_FREE_WORK ;
746     (*SaxpyTasks_handle) = SaxpyTasks ;
747     (*SaxpyTasks_size_handle) = SaxpyTasks_size ;
748     return (GrB_SUCCESS) ;
749 }
750 
751