1 //------------------------------------------------------------------------------
2 // GB_bitmap_AxB_saxpy_A_sparse_B_bitmap: C<#M>+=A*B, C bitmap, M any format
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 {
11 
    if (use_coarse_tasks)
    {

        //----------------------------------------------------------------------
        // C<#M> += A*B using coarse tasks
        //----------------------------------------------------------------------

        // number of columns in the workspace for each task
        #define GB_PANEL_SIZE 4

        //----------------------------------------------------------------------
        // allocate workspace for each task
        //----------------------------------------------------------------------

        // GH_slice holds two ntasks-sized arrays: G_slice and H_slice, the
        // cumulative offsets (in panel columns) of each task's G workspace
        // (panel of B) and H workspace (panel of C).
        GB_WERK_PUSH (GH_slice, 2*ntasks, int64_t) ;
        if (GH_slice == NULL)
        {
            // out of memory
            GB_FREE_ALL ;
            return (GrB_OUT_OF_MEMORY) ;
        }

        int64_t *restrict G_slice = GH_slice ;
        int64_t *restrict H_slice = GH_slice + ntasks ;

        int64_t gwork = 0 ;     // total # of G panel columns over all tasks
        int64_t hwork = 0 ;     // total # of H panel columns over all tasks
        int tid ;
        for (tid = 0 ; tid < ntasks ; tid++)
        {
            // task tid computes C(:,jstart:jend-1), in panels of at most
            // GB_PANEL_SIZE vectors at a time
            int64_t jstart, jend ;
            GB_PARTITION (jstart, jend, bvdim, tid, ntasks) ;
            int64_t jtask = jend - jstart ;
            int64_t jpanel = GB_IMIN (jtask, GB_PANEL_SIZE) ;
            G_slice [tid] = gwork ;
            H_slice [tid] = hwork ;
            if (jpanel > 1)
            {
                // no need to allocate workspace for Gb and Gx if jpanel == 1
                gwork += jpanel ;
            }
            hwork += jpanel ;
        }

        // sizes (in bytes) of one workspace vector:
        // bvlenx: values of G; zero if B is pattern-only
        // cvlenx: values of H; zero for the ANY_PAIR semiring
        // bvlenb: bitmap of G; zero unless B is bitmap
        int64_t bvlenx = (B_is_pattern ? 0 : bvlen) * GB_BSIZE ;
        int64_t cvlenx = (GB_IS_ANY_PAIR_SEMIRING ? 0 : cvlen) * GB_CSIZE ;
        int64_t bvlenb = (GB_B_IS_BITMAP ? bvlen : 0) ;
        // Wf holds all Gb bitmaps (the first gfspace bytes), then all Hf
        // bitmaps; Wbx holds the G values; Wcx holds the H values
        size_t gfspace = gwork * bvlenb ;
        size_t wfspace = gfspace + hwork * cvlen ;
        size_t wbxspace = gwork * bvlenx ;
        size_t wcxspace = hwork * cvlenx ;
        Wf  = GB_MALLOC_WERK (wfspace, int8_t, &Wf_size) ;
        Wbx = GB_MALLOC_WERK (wbxspace, GB_void, &Wbx_size) ;
        Wcx = GB_MALLOC_WERK (wcxspace, GB_void, &Wcx_size) ;
        if (Wf == NULL || Wcx == NULL || Wbx == NULL)
        {
            // out of memory
            GB_FREE_ALL ;
            return (GrB_OUT_OF_MEMORY) ;
        }

        //----------------------------------------------------------------------
        // C<#M> += A*B
        //----------------------------------------------------------------------

        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) \
            reduction(+:cnvals)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the vectors of B and C for this coarse task
            //------------------------------------------------------------------

            int64_t jstart, jend ;
            GB_PARTITION (jstart, jend, bvdim, tid, ntasks) ;
            int64_t jtask = jend - jstart ;
            int64_t jpanel = GB_IMIN (jtask, GB_PANEL_SIZE) ;
            int64_t task_cnvals = 0 ;   // # of entries this task adds to C

            //------------------------------------------------------------------
            // get the workspace for this task
            //------------------------------------------------------------------

            // Gb and Gx workspace to load the panel of B
            int8_t   *restrict Gb = Wf + G_slice [tid] * bvlenb ;
            GB_BTYPE *restrict Gx = (GB_BTYPE *)
                (Wbx + G_slice [tid] * bvlenx) ;

            // Hf and Hx workspace to compute the panel of C
            int8_t   *restrict Hf = Wf + (H_slice [tid] * cvlen) + gfspace ;
            GB_CTYPE *restrict Hx = (GB_CTYPE *)
                (Wcx +  H_slice [tid] * cvlenx) ;
            // for complex PLUS monoids: real/imag views of the interleaved
            // Hx values, so the accumulation can act on the two parts
            #if GB_IS_PLUS_FC32_MONOID
            float  *restrict Hx_real = (float *) Hx ;
            float  *restrict Hx_imag = Hx_real + 1 ;
            #elif GB_IS_PLUS_FC64_MONOID
            double *restrict Hx_real = (double *) Hx ;
            double *restrict Hx_imag = Hx_real + 1 ;
            #endif

            //------------------------------------------------------------------
            // clear the panel
            //------------------------------------------------------------------

            // Hf [pH] == 0 means H(i,jj) is not yet present; 1 means present
            memset (Hf, 0, jpanel * cvlen) ;

            //------------------------------------------------------------------
            // C<#M>(:,jstart:jend-1) += A * B(:,jstart:jend-1) by panel
            //------------------------------------------------------------------

            for (int64_t j1 = jstart ; j1 < jend ; j1 += jpanel)
            {

                //--------------------------------------------------------------
                // get the panel of np vectors j1:j2-1
                //--------------------------------------------------------------

                int64_t j2 = GB_IMIN (jend, j1 + jpanel) ;
                int64_t np = j2 - j1 ;

                //--------------------------------------------------------------
                // load and transpose B(:,j1:j2-1) for one panel
                //--------------------------------------------------------------

                // G is held by row (np entries per row), so that the update
                // H += A(:,k)*G(k,:) accesses G(k,0:np-1) contiguously

                #if GB_B_IS_BITMAP
                {
                    if (np == 1)
                    {
                        // no need to load a single vector of B
                        Gb = (int8_t *) (Bb + (j1 * bvlen)) ;
                    }
                    else
                    {
                        // load and transpose the bitmap of B(:,j1:j2-1)
                        for (int64_t jj = 0 ; jj < np ; jj++)
                        {
                            int64_t j = j1 + jj ;
                            for (int64_t i = 0 ; i < bvlen ; i++)
                            {
                                Gb [i*np + jj] = Bb [i + j * bvlen] ;
                            }
                        }
                    }
                }
                #endif

                if (!B_is_pattern)
                {
                    if (np == 1)
                    {
                        // no need to load a single vector of B
                        GB_void *restrict Bx = (GB_void *) (B->x) ;
                        Gx = (GB_BTYPE *) (Bx + (j1 * bvlen) * GB_BSIZE) ;
                    }
                    else
                    {
                        // load and transpose the values of B(:,j1:j2-1)
                        // NOTE(review): the Bx used by GB_LOADB below cannot
                        // be the local Bx above (it is scoped to the np == 1
                        // case), so it must be supplied by the enclosing
                        // template -- confirm against the including file.
                        for (int64_t jj = 0 ; jj < np ; jj++)
                        {
                            int64_t j = j1 + jj ;
                            for (int64_t i = 0 ; i < bvlen ; i++)
                            {
                                // G(i,jj) = B(i,j), and change storage order
                                int64_t pG = i*np + jj ;
                                int64_t pB = i + j * bvlen ;
                                GB_LOADB (Gx, pG, Bx, pB) ;
                            }
                        }
                    }
                }

                //--------------------------------------------------------------
                // H = A*G for one panel
                //--------------------------------------------------------------

                for (int64_t kA = 0 ; kA < anvec ; kA++)
                {

                    //----------------------------------------------------------
                    // get A(:,k)
                    //----------------------------------------------------------

                    int64_t k = GBH (Ah, kA) ;
                    int64_t pA = Ap [kA] ;
                    int64_t pA_end = Ap [kA+1] ;
                    int64_t pG = k * np ;       // pointer to row k of G

                    #undef  GB_MULT_A_ik_G_kjj
                    #if GB_IS_PAIR_MULTIPLIER
                        // t = A(i,k) * G (k,jj) is always equal to 1
                        #define GB_MULT_A_ik_G_kjj(jj)
                    #else
                        // t = A(i,k) * G (k,jj)
                        GB_CIJ_DECLARE (t) ;
                        #define GB_MULT_A_ik_G_kjj(jj)                      \
                            GB_GETB (gkj, Gx, pG+jj) ;                      \
                            GB_MULT (t, aik, gkj, i, k, j1 + jj) ;
                    #endif

                    #undef  GB_HX_COMPUTE
                    #define GB_HX_COMPUTE(jj)                               \
                    {                                                       \
                        /* H (i,jj) += A(i,k)*G(k,jj) */                    \
                        if (!GB_B_IS_BITMAP || Gb [pG+jj])                  \
                        {                                                   \
                            GB_MULT_A_ik_G_kjj (jj) ;                       \
                            if (Hf [pH+jj] == 0)                            \
                            {                                               \
                                /* H(i,jj) is a new entry */                \
                                GB_HX_WRITE (pH+jj, t) ; /* Hx(i,jj)=t */   \
                                Hf [pH+jj] = 1 ;                            \
                            }                                               \
                            else                                            \
                            {                                               \
                                /* H(i,jj) is already present */            \
                                GB_HX_UPDATE (pH+jj, t) ; /* Hx(i,jj)+=t */ \
                            }                                               \
                        }                                                   \
                    }

                    #undef  GB_LOAD_A_ij
                    #define GB_LOAD_A_ij                                    \
                        int64_t i = Ai [pA] ;                               \
                        GB_GETA (aik, Ax, pA) ;                             \
                        int64_t pH = i * np ;

                    //----------------------------------------------------------
                    // H += A(:,k)*G(k,:)
                    //----------------------------------------------------------

                    #if GB_B_IS_BITMAP
                    // gb = (G(k,0) || G(k,1) || ... || G(k,np-1)), using
                    // deliberate switch fallthrough; if row k of G has no
                    // entries, all of A(:,k) can be skipped
                    bool gb = false ;
                    switch (np)
                    {
                        case 4 : gb  = Gb [pG+3] ;      // fall through
                        case 3 : gb |= Gb [pG+2] ;      // fall through
                        case 2 : gb |= Gb [pG+1] ;      // fall through
                        case 1 : gb |= Gb [pG  ] ;
                        default: ;
                    }
                    if (gb)
                    #endif
                    {
                        // the switch on np fully unrolls the panel update
                        switch (np)
                        {

                            case 4 :
                                for ( ; pA < pA_end ; pA++)
                                {
                                    GB_LOAD_A_ij ;
                                    GB_HX_COMPUTE (0) ;
                                    GB_HX_COMPUTE (1) ;
                                    GB_HX_COMPUTE (2) ;
                                    GB_HX_COMPUTE (3) ;
                                }
                                break ;

                            case 3 :
                                for ( ; pA < pA_end ; pA++)
                                {
                                    GB_LOAD_A_ij ;
                                    GB_HX_COMPUTE (0) ;
                                    GB_HX_COMPUTE (1) ;
                                    GB_HX_COMPUTE (2) ;
                                }
                                break ;

                            case 2 :
                                for ( ; pA < pA_end ; pA++)
                                {
                                    GB_LOAD_A_ij ;
                                    GB_HX_COMPUTE (0) ;
                                    GB_HX_COMPUTE (1) ;
                                }
                                break ;

                            case 1 :
                                for ( ; pA < pA_end ; pA++)
                                {
                                    GB_LOAD_A_ij ;
                                    GB_HX_COMPUTE (0) ;
                                }
                                break ;
                            default:;
                        }
                    }

                    #undef  GB_MULT_A_ik_G_kjj
                    #undef  GB_HX_COMPUTE
                    #undef  GB_LOAD_A_ij
                }

                //--------------------------------------------------------------
                // C<#M>(:,j1:j2-1) += H
                //--------------------------------------------------------------

                for (int64_t jj = 0 ; jj < np ; jj++)
                {

                    //----------------------------------------------------------
                    // C<#M>(:,j) += H (:,jj)
                    //----------------------------------------------------------

                    int64_t j = j1 + jj ;
                    int64_t pC_start = j * cvlen ;  // get pointer to C(:,j)

                    for (int64_t i = 0 ; i < cvlen ; i++)
                    {
                        int64_t pC = pC_start + i ;     // pointer to C(i,j)
                        int64_t pH = i * np + jj ;      // pointer to H(i,jj)
                        if (!Hf [pH]) continue ;
                        Hf [pH] = 0 ;                   // clear the panel
                        int8_t cb = Cb [pC] ;

                        //------------------------------------------------------
                        // check M(i,j)
                        //------------------------------------------------------

                        #if GB_MASK_IS_SPARSE_OR_HYPER

                            // M is sparse or hypersparse, scattered into the
                            // C bitmap: bit 0 of Cb is the pattern of C,
                            // bit 1 holds M(i,j)
                            bool mij = ((cb & 2) != 0) ^ Mask_comp ;
                            if (!mij) continue ;
                            cb = (cb & 1) ;

                        #elif GB_MASK_IS_BITMAP_OR_FULL

                            // M is bitmap or full
                            GB_GET_M_ij (pC) ;
                            mij = mij ^ Mask_comp ;
                            if (!mij) continue ;

                        #endif

                        //------------------------------------------------------
                        // C(i,j) += H(i,jj)
                        //------------------------------------------------------

                        if (cb == 0)
                        {
                            // C(i,j) = H(i,jj)
                            #if GB_IS_ANY_PAIR_SEMIRING
                            Cx [pC] = GB_CTYPE_CAST (1, 0) ;    // C(i,j) = 1
                            #else
                            GB_CIJ_GATHER (pC, pH) ;
                            #endif
                            Cb [pC] = keep ;
                            task_cnvals++ ;
                        }
                        else
                        {
                            // Currently, the matrix C is a newly allocated
                            // matrix, not the C_in input matrix to GrB_mxm.
                            // As a result, this condition is not used.  It
                            // will be in the future when this method is
                            // modified to modify C in-place.
                            ASSERT (GB_DEAD_CODE) ;
                            // C(i,j) += H(i,jj)
                            GB_CIJ_GATHER_UPDATE (pC, pH) ;
                        }
                    }
                }
            }
            cnvals += task_cnvals ;     // reduced into the global cnvals
        }

        #undef GB_PANEL_SIZE

    }
    else if (use_atomics)
    {

        //----------------------------------------------------------------------
        // C<#M> += A*B using fine tasks and atomics
        //----------------------------------------------------------------------

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) \
            reduction(+:cnvals)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the vector of B and C for this fine task
            //------------------------------------------------------------------

            // The fine task operates on C(:,j) and B(:,j).  Its fine task
            // id ranges from 0 to nfine_tasks_per_vector-1, and determines
            // which slice of A to operate on.

            int64_t j    = tid / nfine_tasks_per_vector ;
            int fine_tid = tid % nfine_tasks_per_vector ;
            int64_t kfirst = A_slice [fine_tid] ;
            int64_t klast = A_slice [fine_tid + 1] ;
            int64_t pB_start = j * bvlen ;      // pointer to B(:,j)
            int64_t pC_start = j * cvlen ;      // pointer to C(:,j)
            GB_GET_T_FOR_SECONDJ ;              // t = j or j+1 for SECONDJ*
            int64_t task_cnvals = 0 ;           // # entries this task adds to C

            // for Hx Gustavson workspace: use C(:,j) in-place:
            GB_CTYPE *restrict Hx = (GB_CTYPE *)
                (((GB_void *) Cx) + (pC_start * GB_CSIZE)) ;
            // for complex monoids: real/imag views of the interleaved Hx
            // values, so the update can act on the two real parts
            #if GB_IS_PLUS_FC32_MONOID || GB_IS_ANY_FC32_MONOID
            float  *restrict Hx_real = (float *) Hx ;
            float  *restrict Hx_imag = Hx_real + 1 ;
            #elif GB_IS_PLUS_FC64_MONOID || GB_IS_ANY_FC64_MONOID
            double *restrict Hx_real = (double *) Hx ;
            double *restrict Hx_imag = Hx_real + 1 ;
            #endif

            //------------------------------------------------------------------
            // C<#M>(:,j) += A(:,k1:k2) * B(k1:k2,j)
            //------------------------------------------------------------------

            for (int64_t kk = kfirst ; kk < klast ; kk++)
            {

                //--------------------------------------------------------------
                // C<#M>(:,j) += A(:,k) * B(k,j)
                //--------------------------------------------------------------

                int64_t k = GBH (Ah, kk) ;      // k in range k1:k2
                int64_t pB = pB_start + k ;     // get pointer to B(k,j)
                if (!GBB (Bb, pB)) continue ;   // skip if B(k,j) not present
                int64_t pA = Ap [kk] ;
                int64_t pA_end = Ap [kk+1] ;
                GB_GET_B_kj ;                   // bkj = B(k,j)

                for ( ; pA < pA_end ; pA++)
                {

                    //----------------------------------------------------------
                    // get A(i,k) and C(i,j)
                    //----------------------------------------------------------

                    int64_t i = Ai [pA] ;       // get A(i,k) index
                    int64_t pC = pC_start + i ; // get C(i,j) pointer
                    int8_t cb ;

                    //----------------------------------------------------------
                    // C<#M>(i,j) += A(i,k) * B(k,j)
                    //----------------------------------------------------------

                    #if GB_MASK_IS_SPARSE_OR_HYPER
                    {

                        //------------------------------------------------------
                        // M is sparse, and scattered into the C bitmap
                        //------------------------------------------------------

                        // finite-state machine in Cb [pC]:
                        // 0:   cij not present, mij zero
                        // 1:   cij present, mij zero (keep==1 for !M)
                        // 2:   cij not present, mij one
                        // 3:   cij present, mij one (keep==3 for M)
                        // 7:   cij is locked

                        #if GB_HAS_ATOMIC
                        {
                            // if C(i,j) is already present and can be modified
                            // (cb==keep), and the monoid can be done
                            // atomically, then do the atomic update.  No need
                            // to modify Cb [pC].
                            GB_ATOMIC_READ
                            cb = Cb [pC] ;          // grab the entry
                            if (cb == keep)
                            {
                                #if !GB_IS_ANY_MONOID
                                GB_MULT_A_ik_B_kj ;     // t = A(i,k) * B(k,j)
                                GB_ATOMIC_UPDATE_HX (i, t) ;    // C(i,j) += t
                                #endif
                                continue ;          // C(i,j) has been updated
                            }
                        }
                        #endif

                        // otherwise: spin until this thread owns the per-entry
                        // lock on C(i,j)
                        do  // lock the entry
                        {
                            // do this atomically:
                            // { cb = Cb [pC] ;  Cb [pC] = 7 ; }
                            GB_ATOMIC_CAPTURE_INT8 (cb, Cb [pC], 7) ;
                        } while (cb == 7) ; // lock owner gets 0, 1, 2, or 3
                        if (cb == keep-1)
                        {
                            // C(i,j) is a new entry, permitted by the mask
                            GB_MULT_A_ik_B_kj ;             // t = A(i,k)*B(k,j)
                            #if GB_IS_ANY_PAIR_SEMIRING
                            GB_ATOMIC_SET_HX_ONE (i) ;      // C(i,j) = 1
                            #else
                            GB_ATOMIC_WRITE_HX (i, t) ;     // C(i,j) = t
                            #endif
                            task_cnvals++ ;
                            cb = keep ;                     // keep the entry
                        }
                        else if (cb == keep)
                        {
                            // C(i,j) is already present
                            #if !GB_IS_ANY_MONOID
                            GB_MULT_A_ik_B_kj ;             // t = A(i,k)*B(k,j)
                            GB_ATOMIC_UPDATE_HX (i, t) ;    // C(i,j) += t
                            #endif
                        }
                        // release the lock, restoring the (possibly updated)
                        // state of the entry
                        GB_ATOMIC_WRITE
                        Cb [pC] = cb ;                  // unlock the entry

                    }
                    #else
                    {

                        //------------------------------------------------------
                        // M is not present, or bitmap/full
                        //------------------------------------------------------

                        // finite-state machine in Cb [pC]:
                        // 0:   cij not present; can be written
                        // 1:   cij present; can be updated
                        // 7:   cij is locked

                        #if GB_MASK_IS_BITMAP_OR_FULL
                        {
                            // M is bitmap or full, and not in C bitmap.
                            // Do not modify C(i,j) if not permitted by the mask
                            GB_GET_M_ij (pC) ;
                            mij = mij ^ Mask_comp ;
                            if (!mij) continue ;
                        }
                        #endif

                        //------------------------------------------------------
                        // C(i,j) += A(i,j) * B(k,j)
                        //------------------------------------------------------

                        #if GB_HAS_ATOMIC
                        {
                            // if C(i,j) is already present (cb==1), and the
                            // monoid can be done atomically, then do the
                            // atomic update.  No need to modify Cb [pC].
                            GB_ATOMIC_READ
                            cb = Cb [pC] ;          // grab the entry
                            if (cb == 1)
                            {
                                #if !GB_IS_ANY_MONOID
                                GB_MULT_A_ik_B_kj ;     // t = A(i,k) * B(k,j)
                                GB_ATOMIC_UPDATE_HX (i, t) ;    // C(i,j) += t
                                #endif
                                continue ;          // C(i,j) has been updated
                            }
                        }
                        #endif

                        // otherwise: spin until this thread owns the per-entry
                        // lock on C(i,j)
                        do  // lock the entry
                        {
                            // do this atomically:
                            // { cb = Cb [pC] ;  Cb [pC] = 7 ; }
                            GB_ATOMIC_CAPTURE_INT8 (cb, Cb [pC], 7) ;
                        } while (cb == 7) ; // lock owner gets 0 or 1
                        if (cb == 0)
                        {
                            // C(i,j) is a new entry
                            GB_MULT_A_ik_B_kj ;             // t = A(i,k)*B(k,j)
                            #if GB_IS_ANY_PAIR_SEMIRING
                            GB_ATOMIC_SET_HX_ONE (i) ;      // C(i,j) = 1
                            #else
                            GB_ATOMIC_WRITE_HX (i, t) ;     // C(i,j) = t
                            #endif
                            task_cnvals++ ;
                        }
                        else // cb == 1
                        {
                            // C(i,j) is already present
                            #if !GB_IS_ANY_MONOID
                            GB_MULT_A_ik_B_kj ;             // t = A(i,k)*B(k,j)
                            GB_ATOMIC_UPDATE_HX (i, t) ;    // C(i,j) += t
                            #endif
                        }
                        // the entry is now present in either case: mark it
                        // present and release the lock
                        GB_ATOMIC_WRITE
                        Cb [pC] = 1 ;               // unlock the entry

                    }
                    #endif

                }
            }
            cnvals += task_cnvals ;     // reduced into the global cnvals
        }

    }
600     else
601     {
602 
603         //----------------------------------------------------------------------
604         // C<#M> += A*B using fine tasks and workspace, with no atomics
605         //----------------------------------------------------------------------
606 
607         // Each fine task is given size-cvlen workspace to compute its result
608         // in the first phase, W(:,tid) = A(:,k1:k2) * B(k1:k2,j), where k1:k2
609         // is defined by the fine_tid of the task.  The workspaces are then
610         // summed into C in the second phase.
611 
612         //----------------------------------------------------------------------
613         // allocate workspace
614         //----------------------------------------------------------------------
615 
616         size_t workspace = cvlen * ntasks ;
617         size_t cxsize = (GB_IS_ANY_PAIR_SEMIRING) ? 0 : GB_CSIZE ;
618         Wf  = GB_MALLOC_WERK (workspace, int8_t, &Wf_size) ;
619         Wcx = GB_MALLOC_WERK (workspace * cxsize, GB_void, &Wcx_size) ;
620         if (Wf == NULL || Wcx == NULL)
621         {
622             // out of memory
623             GB_FREE_ALL ;
624             return (GrB_OUT_OF_MEMORY) ;
625         }
626 
627         //----------------------------------------------------------------------
628         // first phase: W (:,tid) = A (:,k1:k2) * B (k2:k2,j) for each fine task
629         //----------------------------------------------------------------------
630 
        // First phase: each fine task computes one slice of the product into
        // its private workspace column, W(:,tid) = A(:,kfirst:klast-1) *
        // B(kfirst:klast-1,j), applying the mask #M.  Wf holds the per-task
        // bitmap ("is W(i,tid) present?") and Wcx the numeric values; both
        // are assumed to be cvlen-by-ntasks, allocated by the caller of this
        // template (not visible here).  The second phase below folds the W
        // columns into C, so cnvals is not updated in this phase.
        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the vector of B and C for this fine task
            //------------------------------------------------------------------

            // The fine task operates on C(:,j) and B(:,j).  Its fine task
            // id ranges from 0 to nfine_tasks_per_vector-1, and determines
            // which slice of A to operate on.

            int64_t j    = tid / nfine_tasks_per_vector ;
            int fine_tid = tid % nfine_tasks_per_vector ;
            int64_t kfirst = A_slice [fine_tid] ;   // slice of A for this task
            int64_t klast = A_slice [fine_tid + 1] ;
            int64_t pB_start = j * bvlen ;      // pointer to B(:,j)
            int64_t pC_start = j * cvlen ;      // pointer to C(:,j), for bitmap
            int64_t pW_start = tid * cvlen ;    // pointer to W(:,tid)
            GB_GET_T_FOR_SECONDJ ;              // t = j or j+1 for SECONDJ*
            int64_t task_cnvals = 0 ;           // not used in this phase; the
                                                // 2nd phase counts new entries

            // for Hf and Hx Gustavson workspace: use W(:,tid):
            int8_t   *restrict Hf = Wf + pW_start ;
            GB_CTYPE *restrict Hx = (GB_CTYPE *)
                (Wcx + (pW_start * cxsize)) ;
            // For complex PLUS monoids the real and imaginary parts are
            // updated separately (interleaved in Hx, hence the +1 offset).
            #if GB_IS_PLUS_FC32_MONOID
            float  *restrict Hx_real = (float *) Hx ;
            float  *restrict Hx_imag = Hx_real + 1 ;
            #elif GB_IS_PLUS_FC64_MONOID
            double *restrict Hx_real = (double *) Hx ;
            double *restrict Hx_imag = Hx_real + 1 ;
            #endif

            //------------------------------------------------------------------
            // clear Hf
            //------------------------------------------------------------------

            // mark all cvlen entries of W(:,tid) as empty; Hx need not be
            // cleared since Hx(i) is only read after Hf [i] has been set
            memset (Hf, 0, cvlen) ;

            //------------------------------------------------------------------
            // W<#M> = A(:,k1:k2) * B(k1:k2,j)
            //------------------------------------------------------------------

            for (int64_t kk = kfirst ; kk < klast ; kk++)
            {

                //--------------------------------------------------------------
                // W<#M>(:,tid) += A(:,k) * B(k,j)
                //--------------------------------------------------------------

                int64_t k = GBH (Ah, kk) ;      // k in range k1:k2
                int64_t pB = pB_start + k ;     // get pointer to B(k,j)
                if (!GBB (Bb, pB)) continue ;   // skip if B(k,j) not present
                int64_t pA = Ap [kk] ;          // A is sparse/hyper: scan the
                int64_t pA_end = Ap [kk+1] ;    // entries of A(:,k)
                GB_GET_B_kj ;                   // bkj = B(k,j)

                for ( ; pA < pA_end ; pA++)
                {

                    //----------------------------------------------------------
                    // get A(i,k)
                    //----------------------------------------------------------

                    int64_t i = Ai [pA] ;       // get A(i,k) index

                    //----------------------------------------------------------
                    // check M(i,j)
                    //----------------------------------------------------------

                    #if GB_MASK_IS_SPARSE_OR_HYPER
                    {
                        // M is sparse or hypersparse: bit 2 of Cb holds the
                        // mask value M(i,j) (presumably scattered into the C
                        // bitmap by an earlier step of this method -- confirm
                        // in the enclosing file)
                        int64_t pC = pC_start + i ;
                        int8_t cb = Cb [pC] ;
                        bool mij = ((cb & 2) != 0) ^ Mask_comp ;
                        if (!mij) continue ;
                    }
                    #elif GB_MASK_IS_BITMAP_OR_FULL
                    {
                        // M is bitmap or full: read M(i,j) directly
                        int64_t pC = pC_start + i ;
                        GB_GET_M_ij (pC) ;          // mij = M(i,j)
                        mij = mij ^ Mask_comp ;     // flip if complemented
                        if (!mij) continue ;
                    }
                    #endif

                    //----------------------------------------------------------
                    // W<#M>(i) += A(i,k) * B(k,j)
                    //----------------------------------------------------------

                    #if GB_IS_ANY_PAIR_SEMIRING
                    {
                        // Hx is not used; Cx [...] = 1 is done below
                        Hf [i] = 1 ;
                    }
                    #else
                    {
                        GB_MULT_A_ik_B_kj ;         // t = A(i,k)*B(k,j)
                        if (Hf [i] == 0)
                        {
                            // W(i,j) is a new entry
                            GB_HX_WRITE (i, t) ;    // Hx(i) = t
                            Hf [i] = 1 ;
                        }
                        else
                        {
                            // W(i) is already present
                            GB_HX_UPDATE (i, t) ;   // Hx(i) += t
                        }
                    }
                    #endif
                }
            }
        }
749 
750         //----------------------------------------------------------------------
751         // second phase: C<#M> += reduce (W)
752         //----------------------------------------------------------------------
753 
754         #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) \
755             reduction(+:cnvals)
756         for (tid = 0 ; tid < ntasks ; tid++)
757         {
758 
759             //------------------------------------------------------------------
760             // determine the W and C for this fine task
761             //------------------------------------------------------------------
762 
763             // The fine task operates on C(i1:i2,j) and W(i1:i2,w1:w2), where
764             // i1:i2 is defined by the fine task id.  Its fine task id ranges
765             // from 0 to nfine_tasks_per_vector-1.
766 
767             // w1:w2 are the updates to C(:,j), where w1:w2 =
768             // [j*nfine_tasks_per_vector : (j+1)*nfine_tasks_per_vector-1].
769 
770             int64_t j    = tid / nfine_tasks_per_vector ;
771             int fine_tid = tid % nfine_tasks_per_vector ;
772             int64_t istart, iend ;
773             GB_PARTITION (istart, iend, cvlen, fine_tid,
774                 nfine_tasks_per_vector) ;
775             int64_t pC_start = j * cvlen ;          // pointer to C(:,j)
776             int64_t wstart = j * nfine_tasks_per_vector ;
777             int64_t wend = (j + 1) * nfine_tasks_per_vector ;
778             int64_t task_cnvals = 0 ;
779 
780             // Hx = (typecasted) Wcx workspace, use Wf as-is
781             GB_CTYPE *restrict Hx = ((GB_CTYPE *) Wcx) ;
782             #if GB_IS_PLUS_FC32_MONOID
783             float  *restrict Hx_real = (float *) Hx ;
784             float  *restrict Hx_imag = Hx_real + 1 ;
785             #elif GB_IS_PLUS_FC64_MONOID
786             double *restrict Hx_real = (double *) Hx ;
787             double *restrict Hx_imag = Hx_real + 1 ;
788             #endif
789 
790             //------------------------------------------------------------------
791             // C<#M>(i1:i2,j) += reduce (W (i2:i2, wstart:wend))
792             //------------------------------------------------------------------
793 
794             for (int64_t w = wstart ; w < wend ; w++)
795             {
796 
797                 //--------------------------------------------------------------
798                 // C<#M>(i1:i2,j) += W (i1:i2,w)
799                 //--------------------------------------------------------------
800 
801                 int64_t pW_start = w * cvlen ;      // pointer to W (:,w)
802 
803                 for (int64_t i = istart ; i < iend ; i++)
804                 {
805 
806                     //----------------------------------------------------------
807                     // get pointer and bitmap C(i,j) and W(i,w)
808                     //----------------------------------------------------------
809 
810                     int64_t pW = pW_start + i ;     // pointer to W(i,w)
811                     if (Wf [pW] == 0) continue ;    // skip if not present
812                     int64_t pC = pC_start + i ;     // pointer to C(i,j)
813                     int8_t cb = Cb [pC] ;           // bitmap status of C(i,j)
814 
815                     //----------------------------------------------------------
816                     // M(i,j) already checked, but adjust Cb if M is sparse
817                     //----------------------------------------------------------
818 
819                     #if GB_MASK_IS_SPARSE_OR_HYPER
820                     {
821                         // M is sparse or hypersparse
822                         cb = (cb & 1) ;
823                     }
824                     #endif
825 
826                     //----------------------------------------------------------
827                     // C(i,j) += W (i,w)
828                     //----------------------------------------------------------
829 
830                     if (cb == 0)
831                     {
832                         // C(i,j) = W(i,w)
833                         #if GB_IS_ANY_PAIR_SEMIRING
834                         Cx [pC] = GB_CTYPE_CAST (1, 0) ;        // C(i,j) = 1
835                         #else
836                         GB_CIJ_GATHER (pC, pW) ;
837                         #endif
838                         Cb [pC] = keep ;
839                         task_cnvals++ ;
840                     }
841                     else
842                     {
843                         // C(i,j) += W(i,w)
844                         GB_CIJ_GATHER_UPDATE (pC, pW) ;
845                     }
846                 }
847             }
848             cnvals += task_cnvals ;
849         }
850     }
851 }
852 
853