1 //------------------------------------------------------------------------------
2 // GB_AxB_saxpy3_slice_balanced: construct balanced tasks for GB_AxB_saxpy3
3 //------------------------------------------------------------------------------
4
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7
8 //------------------------------------------------------------------------------
9
10 // If the mask is present but must be discarded, this function returns
11 // GrB_NO_VALUE, to indicate that the analysis was terminated early.
12
13 #include "GB_AxB_saxpy3.h"
14
15 // control parameters for generating parallel tasks
16 #define GB_NTASKS_PER_THREAD 2
17 #define GB_COSTLY 1.2
18 #define GB_FINE_WORK 2
19 #define GB_MWORK_ALPHA 0.01
20 #define GB_MWORK_BETA 0.10
21
22 #define GB_FREE_WORK \
23 { \
24 GB_WERK_POP (Fine_fl, int64_t) ; \
25 GB_WERK_POP (Fine_slice, int64_t) ; \
26 GB_WERK_POP (Coarse_Work, int64_t) ; \
27 GB_WERK_POP (Coarse_initial, int64_t) ; \
28 }
29
30 #define GB_FREE_ALL \
31 { \
32 GB_FREE_WORK ; \
33 GB_FREE_WERK (&SaxpyTasks, SaxpyTasks_size) ; \
34 }
35
36 //------------------------------------------------------------------------------
37 // GB_hash_table_size
38 //------------------------------------------------------------------------------
39
40 // flmax is the max flop count for computing A*B(:,j), for any vector j that
41 // this task computes. If the mask M is present, flmax also includes the
42 // number of entries in M(:,j). GB_hash_table_size determines the hash table
43 // size for this task, which is twice the smallest power of 2 larger than
44 // flmax. If flmax is large enough, the hash_size is returned as cvlen, so
45 // that Gustavson's method will be used instead of the Hash method.
46
47 // By default, Gustavson vs Hash is selected automatically. AxB_method can be
48 // selected via the descriptor or a global setting, as the non-default
49 // GxB_AxB_GUSTAVSON or GxB_AxB_HASH settings, to enforce the selection of
50 // either of those methods. However, if Hash is selected but the hash table
51 // equals or exceeds cvlen, then Gustavson's method is used instead.
52
GB_hash_table_size(int64_t flmax,int64_t cvlen,const GrB_Desc_Value AxB_method)53 static inline int64_t GB_hash_table_size
54 (
55 int64_t flmax, // max flop count for any vector computed by this task
56 int64_t cvlen, // vector length of C
57 const GrB_Desc_Value AxB_method // Default, Gustavson, or Hash
58 )
59 {
60 int64_t hash_size ;
61
62 if (AxB_method == GxB_AxB_GUSTAVSON || flmax >= cvlen/2)
63 {
64
65 //----------------------------------------------------------------------
66 // use Gustavson if selected explicitly or if flmax is large
67 //----------------------------------------------------------------------
68
69 hash_size = cvlen ;
70
71 }
72 else
73 {
74
75 //----------------------------------------------------------------------
76 // flmax is small; consider hash vs Gustavson
77 //----------------------------------------------------------------------
78
79 // hash_size = 2 * (smallest power of 2 >= flmax)
80 hash_size = ((uint64_t) 2) << (GB_FLOOR_LOG2 (flmax) + 1) ;
81 bool use_Gustavson ;
82 if (AxB_method == GxB_AxB_HASH)
83 {
84 // always use Hash method, unless the hash_size >= cvlen
85 use_Gustavson = (hash_size >= cvlen) ;
86 }
87 else
88 {
89 // default: auto selection:
90 // use Gustavson's method if hash_size is too big
91 use_Gustavson = (hash_size >= cvlen/12) ;
92 }
93 if (use_Gustavson)
94 {
95 hash_size = cvlen ;
96 }
97 }
98
99 //--------------------------------------------------------------------------
100 // return result
101 //--------------------------------------------------------------------------
102
103 return (hash_size) ;
104 }
105
106 //------------------------------------------------------------------------------
107 // GB_create_coarse_task: create a single coarse task
108 //------------------------------------------------------------------------------
109
110 // Compute the max flop count for any vector in a coarse task, determine the
111 // hash table size, and construct the coarse task.
112
GB_create_coarse_task(int64_t kfirst,int64_t klast,GB_saxpy3task_struct * SaxpyTasks,int taskid,int64_t * Bflops,int64_t cvlen,double chunk,int nthreads_max,int64_t * Coarse_Work,const GrB_Desc_Value AxB_method)113 static inline void GB_create_coarse_task
114 (
115 int64_t kfirst, // coarse task consists of vectors kfirst:klast
116 int64_t klast,
117 GB_saxpy3task_struct *SaxpyTasks,
118 int taskid, // taskid for this coarse task
119 int64_t *Bflops, // size bnvec; cum sum of flop counts for vectors of B
120 int64_t cvlen, // vector length of B and C
121 double chunk,
122 int nthreads_max,
123 int64_t *Coarse_Work, // workspace for parallel reduction for flop count
124 const GrB_Desc_Value AxB_method // Default, Gustavson, or Hash
125 )
126 {
127
128 //--------------------------------------------------------------------------
129 // find the max # of flops for any vector in this task
130 //--------------------------------------------------------------------------
131
132 int64_t nk = klast - kfirst + 1 ;
133 int nth = GB_nthreads (nk, chunk, nthreads_max) ;
134
135 // each thread finds the max flop count for a subset of the vectors
136 int tid ;
137 #pragma omp parallel for num_threads(nth) schedule(static)
138 for (tid = 0 ; tid < nth ; tid++)
139 {
140 int64_t my_flmax = 1, istart, iend ;
141 GB_PARTITION (istart, iend, nk, tid, nth) ;
142 for (int64_t i = istart ; i < iend ; i++)
143 {
144 int64_t kk = kfirst + i ;
145 int64_t fl = Bflops [kk+1] - Bflops [kk] ;
146 my_flmax = GB_IMAX (my_flmax, fl) ;
147 }
148 Coarse_Work [tid] = my_flmax ;
149 }
150
151 // combine results from each thread
152 int64_t flmax = 1 ;
153 for (tid = 0 ; tid < nth ; tid++)
154 {
155 flmax = GB_IMAX (flmax, Coarse_Work [tid]) ;
156 }
157
158 // check the parallel computation
159 #ifdef GB_DEBUG
160 int64_t flmax2 = 1 ;
161 for (int64_t kk = kfirst ; kk <= klast ; kk++)
162 {
163 int64_t fl = Bflops [kk+1] - Bflops [kk] ;
164 flmax2 = GB_IMAX (flmax2, fl) ;
165 }
166 ASSERT (flmax == flmax2) ;
167 #endif
168
169 //--------------------------------------------------------------------------
170 // define the coarse task
171 //--------------------------------------------------------------------------
172
173 SaxpyTasks [taskid].start = kfirst ;
174 SaxpyTasks [taskid].end = klast ;
175 SaxpyTasks [taskid].vector = -1 ;
176 SaxpyTasks [taskid].hsize = GB_hash_table_size (flmax, cvlen, AxB_method) ;
177 SaxpyTasks [taskid].Hi = NULL ; // assigned later
178 SaxpyTasks [taskid].Hf = NULL ; // assigned later
179 SaxpyTasks [taskid].Hx = NULL ; // assigned later
180 SaxpyTasks [taskid].my_cjnz = 0 ; // for fine tasks only
181 SaxpyTasks [taskid].leader = taskid ;
182 SaxpyTasks [taskid].team_size = 1 ;
183 }
184
185 //------------------------------------------------------------------------------
186 // GB_AxB_saxpy3_slice_balanced: create balanced tasks for saxpy3
187 //------------------------------------------------------------------------------
188
GB_AxB_saxpy3_slice_balanced(GrB_Matrix C,const GrB_Matrix M,const bool Mask_comp,const GrB_Matrix A,const GrB_Matrix B,GrB_Desc_Value AxB_method,GB_saxpy3task_struct ** SaxpyTasks_handle,size_t * SaxpyTasks_size_handle,bool * apply_mask,bool * M_packed_in_place,int * ntasks,int * nfine,int * nthreads,GB_Context Context)189 GrB_Info GB_AxB_saxpy3_slice_balanced
190 (
191 // inputs
192 GrB_Matrix C, // output matrix
193 const GrB_Matrix M, // optional mask matrix
194 const bool Mask_comp, // if true, use !M
195 const GrB_Matrix A, // input matrix A
196 const GrB_Matrix B, // input matrix B
197 GrB_Desc_Value AxB_method, // Default, Gustavson, or Hash
198 // outputs
199 GB_saxpy3task_struct **SaxpyTasks_handle,
200 size_t *SaxpyTasks_size_handle,
201 bool *apply_mask, // if true, apply M during sapxy3
202 bool *M_packed_in_place, // if true, use M in-place
203 int *ntasks, // # of tasks created (coarse and fine)
204 int *nfine, // # of fine tasks created
205 int *nthreads, // # of threads to use
206 GB_Context Context
207 )
208 {
209
210 //--------------------------------------------------------------------------
211 // check inputs
212 //--------------------------------------------------------------------------
213
214 GrB_Info info ;
215
216 (*apply_mask) = false ;
217 (*M_packed_in_place) = false ;
218 (*ntasks) = 0 ;
219 (*nfine) = 0 ;
220 (*nthreads) = 0 ;
221
222 ASSERT_MATRIX_OK_OR_NULL (M, "M for saxpy3_slice_balanced A*B", GB0) ;
223 ASSERT (!GB_PENDING (M)) ;
224 ASSERT (GB_JUMBLED_OK (M)) ;
225 ASSERT (!GB_ZOMBIES (M)) ;
226
227 ASSERT_MATRIX_OK (A, "A for saxpy3_slice_balanced A*B", GB0) ;
228 ASSERT (!GB_PENDING (A)) ;
229 ASSERT (GB_JUMBLED_OK (A)) ;
230 ASSERT (!GB_ZOMBIES (A)) ;
231
232 ASSERT_MATRIX_OK (B, "B for saxpy3_slice_balanced A*B", GB0) ;
233 ASSERT (!GB_PENDING (B)) ;
234 ASSERT (GB_JUMBLED_OK (B)) ;
235 ASSERT (!GB_ZOMBIES (B)) ;
236
237 //--------------------------------------------------------------------------
238 // determine the # of threads to use
239 //--------------------------------------------------------------------------
240
241 GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
242
243 //--------------------------------------------------------------------------
244 // define result and workspace
245 //--------------------------------------------------------------------------
246
247 GB_saxpy3task_struct *restrict SaxpyTasks = NULL ;
248 size_t SaxpyTasks_size = 0 ;
249
250 GB_WERK_DECLARE (Coarse_initial, int64_t) ; // initial coarse tasks
251 GB_WERK_DECLARE (Coarse_Work, int64_t) ; // workspace for flop counts
252 GB_WERK_DECLARE (Fine_slice, int64_t) ;
253 GB_WERK_DECLARE (Fine_fl, int64_t) ; // size max(nnz(B(:,j)))
254
255 //--------------------------------------------------------------------------
256 // get A, and B
257 //--------------------------------------------------------------------------
258
259 const int64_t *restrict Ap = A->p ;
260 const int64_t *restrict Ah = A->h ;
261 const int64_t avlen = A->vlen ;
262 const int64_t anvec = A->nvec ;
263 const bool A_is_hyper = GB_IS_HYPERSPARSE (A) ;
264
265 const int64_t *restrict Bp = B->p ;
266 const int64_t *restrict Bh = B->h ;
267 const int8_t *restrict Bb = B->b ;
268 const int64_t *restrict Bi = B->i ;
269 const int64_t bvdim = B->vdim ;
270 const int64_t bnz = GB_NNZ_HELD (B) ;
271 const int64_t bnvec = B->nvec ;
272 const int64_t bvlen = B->vlen ;
273 const bool B_is_hyper = GB_IS_HYPERSPARSE (B) ;
274
275 int64_t cvlen = avlen ;
276 int64_t cvdim = bvdim ;
277
278 //--------------------------------------------------------------------------
279 // compute flop counts for each vector of B and C
280 //--------------------------------------------------------------------------
281
282 int64_t Mwork = 0 ;
283 int64_t *restrict Bflops = C->p ; // use C->p as workspace for Bflops
284 GB_OK (GB_AxB_saxpy3_flopcount (&Mwork, Bflops, M, Mask_comp, A, B,
285 Context)) ;
286 int64_t total_flops = Bflops [bnvec] ;
287 double axbflops = total_flops - Mwork ;
288 GBURBLE ("axbwork %g ", axbflops) ;
289 if (Mwork > 0) GBURBLE ("mwork %g ", (double) Mwork) ;
290
291 //--------------------------------------------------------------------------
292 // determine if the mask M should be applied, or done later
293 //--------------------------------------------------------------------------
294
295 if (M == NULL)
296 {
297
298 //----------------------------------------------------------------------
299 // M is not present
300 //----------------------------------------------------------------------
301
302 (*apply_mask) = false ;
303
304 }
305 else if (GB_is_packed (M))
306 {
307
308 //----------------------------------------------------------------------
309 // M is present and full, bitmap, or sparse/hyper with all entries
310 //----------------------------------------------------------------------
311
312 // Choose all-hash or all-Gustavson tasks, and apply M during saxpy3.
313
314 (*apply_mask) = true ;
315
316 // The work for M has not yet been added Bflops.
317 // Each vector M(:,j) has cvlen entries.
318 Mwork = cvlen * cvdim ;
319
320 if (!(AxB_method == GxB_AxB_HASH || AxB_method == GxB_AxB_GUSTAVSON))
321 {
322 if (axbflops < (double) Mwork * GB_MWORK_BETA)
323 {
324 // The mask is too costly to scatter into the Hf workspace.
325 // Leave it in place and use all-hash tasks.
326 AxB_method = GxB_AxB_HASH ;
327 }
328 else
329 {
330 // Scatter M into Hf and use all-Gustavson tasks.
331 AxB_method = GxB_AxB_GUSTAVSON ;
332 }
333 }
334
335 if (AxB_method == GxB_AxB_HASH)
336 {
337 // Use the hash method for all tasks (except for those tasks which
338 // require a hash table size >= cvlen; those tasks use Gustavson).
339 // Do not scatter the mask into the Hf hash workspace. The work
340 // for the mask is not accounted for in Bflops, so the hash tables
341 // can be small.
342 (*M_packed_in_place) = true ;
343 GBURBLE ("(use packed mask in-place) ") ;
344 }
345 else
346 {
347 // Use the Gustavson method for all tasks, and scatter M into the
348 // fine Gustavson workspace. The work for M is not yet in the
349 // Bflops cumulative sum. Add it now.
350 ASSERT (AxB_method == GxB_AxB_GUSTAVSON)
351 int nth = GB_nthreads (bnvec, chunk, nthreads_max) ;
352 int64_t kk ;
353 #pragma omp parallel for num_threads(nth) schedule(static)
354 for (kk = 0 ; kk <= bnvec ; kk++)
355 {
356 Bflops [kk] += cvlen * (kk+1) ;
357 }
358 total_flops = Bflops [bnvec] ;
359 GBURBLE ("(use packed mask) ") ;
360 }
361
362 }
363 else if (axbflops < ((double) Mwork * GB_MWORK_ALPHA))
364 {
365
366 //----------------------------------------------------------------------
367 // M is costly to use; apply it after C=A*B
368 //----------------------------------------------------------------------
369
370 // Do not use M during the computation of A*B. Instead, compute C=A*B
371 // and then apply the mask later. Tell the caller that the mask should
372 // not be applied, so that it will be applied later in GB_mxm.
373
374 (*apply_mask) = false ;
375 GBURBLE ("(discard mask) ") ;
376 GB_FREE_ALL ;
377 return (GrB_NO_VALUE) ;
378
379 }
380 else
381 {
382
383 //----------------------------------------------------------------------
384 // use M during saxpy3
385 //----------------------------------------------------------------------
386
387 (*apply_mask) = true ;
388 GBURBLE ("(use mask) ") ;
389 }
390
391 //--------------------------------------------------------------------------
392 // determine # of threads and # of initial coarse tasks
393 //--------------------------------------------------------------------------
394
395 (*nthreads) = GB_nthreads ((double) total_flops, chunk, nthreads_max) ;
396 int ntasks_initial = ((*nthreads) == 1) ? 1 :
397 (GB_NTASKS_PER_THREAD * (*nthreads)) ;
398
399 //--------------------------------------------------------------------------
400 // give preference to Gustavson when using few threads
401 //--------------------------------------------------------------------------
402
403 if ((*nthreads) <= 8 &&
404 (!(AxB_method == GxB_AxB_HASH || AxB_method == GxB_AxB_GUSTAVSON)))
405 {
406 // Unless a specific method has been explicitly requested, see if
407 // Gustavson should be used with a small number of threads.
408 // Matrix-vector has a maximum intensity of 1, so this heuristic only
409 // applies to GrB_mxm.
410 double abnz = GB_NNZ (A) + GB_NNZ (B) + 1 ;
411 double workspace = (double) ntasks_initial * (double) cvlen ;
412 double intensity = total_flops / abnz ;
413 GBURBLE ("(intensity: %0.3g workspace/(nnz(A)+nnz(B)): %0.3g",
414 intensity, workspace / abnz) ;
415 if (intensity >= 8 && workspace < abnz)
416 {
417 // work intensity is large, and Gustvason workspace is modest;
418 // use Gustavson for all tasks
419 AxB_method = GxB_AxB_GUSTAVSON ;
420 GBURBLE (": select Gustvason) ") ;
421 }
422 else
423 {
424 // use default task creation: mix of Hash and Gustavson
425 GBURBLE (") ") ;
426 }
427 }
428
429 //--------------------------------------------------------------------------
430 // determine target task size
431 //--------------------------------------------------------------------------
432
433 double target_task_size = ((double) total_flops) / ntasks_initial ;
434 target_task_size = GB_IMAX (target_task_size, chunk) ;
435 double target_fine_size = target_task_size / GB_FINE_WORK ;
436 target_fine_size = GB_IMAX (target_fine_size, chunk) ;
437
438 //--------------------------------------------------------------------------
439 // determine # of parallel tasks
440 //--------------------------------------------------------------------------
441
442 int ncoarse = 0 ; // # of coarse tasks
443 int max_bjnz = 0 ; // max (nnz (B (:,j))) of fine tasks
444
445 // FUTURE: also use ultra-fine tasks that compute A(i1:i2,k)*B(k,j)
446
447 if (ntasks_initial > 1)
448 {
449
450 //----------------------------------------------------------------------
451 // construct initial coarse tasks
452 //----------------------------------------------------------------------
453
454 GB_WERK_PUSH (Coarse_initial, ntasks_initial + 1, int64_t) ;
455 if (Coarse_initial == NULL)
456 {
457 // out of memory
458 GB_FREE_ALL ;
459 return (GrB_OUT_OF_MEMORY) ;
460 }
461 GB_pslice (Coarse_initial, Bflops, bnvec, ntasks_initial, true) ;
462
463 //----------------------------------------------------------------------
464 // split the work into coarse and fine tasks
465 //----------------------------------------------------------------------
466
467 for (int taskid = 0 ; taskid < ntasks_initial ; taskid++)
468 {
469 // get the initial coarse task
470 int64_t kfirst = Coarse_initial [taskid] ;
471 int64_t klast = Coarse_initial [taskid+1] ;
472 int64_t task_ncols = klast - kfirst ;
473 int64_t task_flops = Bflops [klast] - Bflops [kfirst] ;
474
475 if (task_ncols == 0)
476 {
477 // This coarse task is empty, having been squeezed out by
478 // costly vectors in adjacent coarse tasks.
479 }
480 else if (task_flops > 2 * GB_COSTLY * target_task_size)
481 {
482 // This coarse task is too costly, because it contains one or
483 // more costly vectors. Split its vectors into a mixture of
484 // coarse and fine tasks.
485
486 int64_t kcoarse_start = kfirst ;
487
488 for (int64_t kk = kfirst ; kk < klast ; kk++)
489 {
490 // jflops = # of flops to compute a single vector A*B(:,j)
491 // where j == GBH (Bh, kk)
492 double jflops = Bflops [kk+1] - Bflops [kk] ;
493 // bjnz = nnz (B (:,j))
494 int64_t bjnz = (Bp == NULL) ? bvlen : (Bp [kk+1] - Bp [kk]);
495
496 if (jflops > GB_COSTLY * target_task_size && bjnz > 1)
497 {
498 // A*B(:,j) is costly; split it into 2 or more fine
499 // tasks. First flush the prior coarse task, if any.
500 if (kcoarse_start < kk)
501 {
502 // vectors kcoarse_start to kk-1 form a single
503 // coarse task
504 ncoarse++ ;
505 }
506
507 // next coarse task (if any) starts at kk+1
508 kcoarse_start = kk+1 ;
509
510 // vectors kk will be split into multiple fine tasks
511 max_bjnz = GB_IMAX (max_bjnz, bjnz) ;
512 int team_size = ceil (jflops / target_fine_size) ;
513 (*nfine) += team_size ;
514 }
515 }
516
517 // flush the last coarse task, if any
518 if (kcoarse_start < klast)
519 {
520 // vectors kcoarse_start to klast-1 form a single
521 // coarse task
522 ncoarse++ ;
523 }
524
525 }
526 else
527 {
528 // This coarse task is OK as-is.
529 ncoarse++ ;
530 }
531 }
532 }
533 else
534 {
535
536 //----------------------------------------------------------------------
537 // entire computation in a single fine or coarse task
538 //----------------------------------------------------------------------
539
540 if (bnvec == 1)
541 {
542 // If B is a single vector, and is computed by a single thread,
543 // then a single fine task is used.
544 (*nfine) = 1 ;
545 ncoarse = 0 ;
546 }
547 else
548 {
549 // One thread uses a single coarse task if B is not a vector.
550 (*nfine) = 0 ;
551 ncoarse = 1 ;
552 }
553 }
554
555 (*ntasks) = ncoarse + (*nfine) ;
556
557 //--------------------------------------------------------------------------
558 // allocate the tasks, and workspace to construct fine tasks
559 //--------------------------------------------------------------------------
560
561 SaxpyTasks = GB_MALLOC_WERK ((*ntasks), GB_saxpy3task_struct,
562 &SaxpyTasks_size) ;
563 GB_WERK_PUSH (Coarse_Work, nthreads_max, int64_t) ;
564 if (max_bjnz > 0)
565 {
566 // also allocate workspace to construct fine tasks
567 GB_WERK_PUSH (Fine_slice, (*ntasks)+1, int64_t) ;
568 // Fine_fl will only fit on the Werk stack if max_bjnz is small,
569 // but try anyway, in case it fits. It is placed at the top of the
570 // Werk stack.
571 GB_WERK_PUSH (Fine_fl, max_bjnz+1, int64_t) ;
572 }
573
574 if (SaxpyTasks == NULL || Coarse_Work == NULL ||
575 (max_bjnz > 0 && (Fine_slice == NULL || Fine_fl == NULL)))
576 {
577 // out of memory
578 GB_FREE_ALL ;
579 return (GrB_OUT_OF_MEMORY) ;
580 }
581
582 // clear SaxpyTasks
583 memset (SaxpyTasks, 0, SaxpyTasks_size) ;
584
585 //--------------------------------------------------------------------------
586 // create the tasks
587 //--------------------------------------------------------------------------
588
589 if (ntasks_initial > 1)
590 {
591
592 //----------------------------------------------------------------------
593 // create the coarse and fine tasks
594 //----------------------------------------------------------------------
595
596 int nf = 0 ; // fine tasks have task id 0:nfine-1
597 int nc = (*nfine) ; // coarse task ids are nfine:ntasks-1
598
599 for (int taskid = 0 ; taskid < ntasks_initial ; taskid++)
600 {
601 // get the initial coarse task
602 int64_t kfirst = Coarse_initial [taskid] ;
603 int64_t klast = Coarse_initial [taskid+1] ;
604 int64_t task_ncols = klast - kfirst ;
605 int64_t task_flops = Bflops [klast] - Bflops [kfirst] ;
606
607 if (task_ncols == 0)
608 {
609 // This coarse task is empty, having been squeezed out by
610 // costly vectors in adjacent coarse tasks.
611 }
612 else if (task_flops > 2 * GB_COSTLY * target_task_size)
613 {
614 // This coarse task is too costly, because it contains one or
615 // more costly vectors. Split its vectors into a mixture of
616 // coarse and fine tasks.
617
618 int64_t kcoarse_start = kfirst ;
619
620 for (int64_t kk = kfirst ; kk < klast ; kk++)
621 {
622 // jflops = # of flops to compute a single vector A*B(:,j)
623 double jflops = Bflops [kk+1] - Bflops [kk] ;
624 // bjnz = nnz (B (:,j))
625 int64_t bjnz = (Bp == NULL) ? bvlen : (Bp [kk+1] - Bp [kk]);
626
627 if (jflops > GB_COSTLY * target_task_size && bjnz > 1)
628 {
629 // A*B(:,j) is costly; split it into 2 or more fine
630 // tasks. First flush the prior coarse task, if any.
631 if (kcoarse_start < kk)
632 {
633 // kcoarse_start:kk-1 form a single coarse task
634 GB_create_coarse_task (kcoarse_start, kk-1,
635 SaxpyTasks, nc++, Bflops, cvlen, chunk,
636 nthreads_max, Coarse_Work, AxB_method) ;
637 }
638
639 // next coarse task (if any) starts at kk+1
640 kcoarse_start = kk+1 ;
641
642 // count the work for each entry B(k,j). Do not
643 // include the work to scan M(:,j), since that will
644 // be evenly divided between all tasks in this team.
645 int64_t pB_start = GBP (Bp, kk, bvlen) ;
646 int nth = GB_nthreads (bjnz, chunk, nthreads_max) ;
647 int64_t s ;
648 #pragma omp parallel for num_threads(nth) \
649 schedule(static)
650 for (s = 0 ; s < bjnz ; s++)
651 {
652 // get B(k,j)
653 Fine_fl [s] = 1 ;
654 int64_t pB = pB_start + s ;
655 if (!GBB (Bb, pB)) continue ;
656 int64_t k = GBI (Bi, pB, bvlen) ;
657 // fl = flop count for just A(:,k)*B(k,j)
658 int64_t pA, pA_end ;
659 int64_t pleft = 0 ;
660 GB_lookup (A_is_hyper, Ah, Ap, avlen, &pleft,
661 anvec-1, k, &pA, &pA_end) ;
662 int64_t fl = pA_end - pA ;
663 Fine_fl [s] = fl ;
664 ASSERT (fl >= 0) ;
665 }
666
667 // cumulative sum of flops to compute A*B(:,j)
668 GB_cumsum (Fine_fl, bjnz, NULL, nth, Context) ;
669
670 // slice B(:,j) into fine tasks
671 int team_size = ceil (jflops / target_fine_size) ;
672 ASSERT (Fine_slice != NULL) ;
673 GB_pslice (Fine_slice, Fine_fl, bjnz, team_size, false);
674
675 // shared hash table for all fine tasks for A*B(:,j)
676 int64_t hsize =
677 GB_hash_table_size (jflops, cvlen, AxB_method) ;
678
679 // construct the fine tasks for C(:,j)=A*B(:,j)
680 int leader = nf ;
681 for (int fid = 0 ; fid < team_size ; fid++)
682 {
683 int64_t pstart = Fine_slice [fid] ;
684 int64_t pend = Fine_slice [fid+1] ;
685 int64_t fl = Fine_fl [pend] - Fine_fl [pstart] ;
686 SaxpyTasks [nf].start = pB_start + pstart ;
687 SaxpyTasks [nf].end = pB_start + pend - 1 ;
688 SaxpyTasks [nf].vector = kk ;
689 SaxpyTasks [nf].hsize = hsize ;
690 SaxpyTasks [nf].Hi = NULL ; // assigned later
691 SaxpyTasks [nf].Hf = NULL ; // assigned later
692 SaxpyTasks [nf].Hx = NULL ; // assigned later
693 SaxpyTasks [nf].my_cjnz = 0 ;
694 SaxpyTasks [nf].leader = leader ;
695 SaxpyTasks [nf].team_size = team_size ;
696 nf++ ;
697 }
698 }
699 }
700
701 // flush the last coarse task, if any
702 if (kcoarse_start < klast)
703 {
704 // kcoarse_start:klast-1 form a single coarse task
705 GB_create_coarse_task (kcoarse_start, klast-1, SaxpyTasks,
706 nc++, Bflops, cvlen, chunk, nthreads_max,
707 Coarse_Work, AxB_method) ;
708 }
709
710 }
711 else
712 {
713 // This coarse task is OK as-is.
714 GB_create_coarse_task (kfirst, klast-1, SaxpyTasks,
715 nc++, Bflops, cvlen, chunk, nthreads_max,
716 Coarse_Work, AxB_method) ;
717 }
718 }
719
720 }
721 else
722 {
723
724 //----------------------------------------------------------------------
725 // entire computation in a single fine or coarse task
726 //----------------------------------------------------------------------
727
728 // create a single coarse task: hash or Gustavson
729 GB_create_coarse_task (0, bnvec-1, SaxpyTasks, 0, Bflops, cvlen, 1, 1,
730 Coarse_Work, AxB_method) ;
731
732 if (bnvec == 1)
733 {
734 // convert the single coarse task into a single fine task
735 SaxpyTasks [0].start = 0 ; // first entry in B(:,0)
736 SaxpyTasks [0].end = bnz - 1 ; // last entry in B(:,0)
737 SaxpyTasks [0].vector = 0 ;
738 }
739 }
740
741 //--------------------------------------------------------------------------
742 // free workspace and return result
743 //--------------------------------------------------------------------------
744
745 GB_FREE_WORK ;
746 (*SaxpyTasks_handle) = SaxpyTasks ;
747 (*SaxpyTasks_size_handle) = SaxpyTasks_size ;
748 return (GrB_SUCCESS) ;
749 }
750
751