1 //------------------------------------------------------------------------------
2 // GB_AxB_saxpy3_coarseGus_notM_phase5: C<!M>=A*B, coarse Gustavson, phase5
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 {
11 
12     //--------------------------------------------------------------------------
13     // phase5: coarse Gustavson task, C<!M>=A*B
14     //--------------------------------------------------------------------------
15 
16     // Since the mask is !M:
17     // Hf [i] < mark    : M(i,j)=0, C(i,j) is not yet seen.
18     // Hf [i] == mark   : M(i,j)=1, so C(i,j) is ignored.
19     // Hf [i] == mark+1 : M(i,j)=0, and C(i,j) has been seen.
20 
21     for (int64_t kk = kfirst ; kk <= klast ; kk++)
22     {
23         int64_t pC = Cp [kk] ;
24         int64_t cjnz = Cp [kk+1] - pC ;
25         if (cjnz == 0) continue ;   // nothing to do
26         GB_GET_B_j ;                // get B(:,j)
27         #ifndef GB_GENERIC
28         if (cjnz == cvlen)          // C(:,j) is dense
29         {
30             // This is not used for the generic saxpy3.
31             GB_COMPUTE_DENSE_C_j ;  // C(:,j) = A*B(:,j)
32             continue ;
33         }
34         #endif
35         GB_GET_M_j ;            // get M(:,j)
36         mark += 2 ;
37         int64_t mark1 = mark+1 ;
38 
39         // scatter M(:,j) into the Gustavson workspace
40         GB_SCATTER_M_j (pM_start, pM_end, mark) ;
41 
42         if (16 * cjnz > cvlen)
43         {
44 
45             //------------------------------------------------------------------
46             // C(:,j) is not very sparse
47             //------------------------------------------------------------------
48 
49             for ( ; pB < pB_end ; pB++)     // scan B(:,j)
50             {
51                 GB_GET_B_kj_INDEX ;         // get k of B(k,j)
52                 GB_GET_A_k ;                // get A(:,k)
53                 if (aknz == 0) continue ;
54                 GB_GET_B_kj ;               // bkj = B(k,j)
55                 // scan A(:,k)
56                 for (int64_t pA = pA_start ; pA < pA_end ; pA++)
57                 {
58                     GB_GET_A_ik_INDEX ;     // get i of A(i,k)
59                     int64_t hf = Hf [i] ;
60                     if (hf < mark)
61                     {
62                         // C(i,j) = A(i,k) * B(k,j)
63                         Hf [i] = mark1 ;     // mark as seen
64                         GB_MULT_A_ik_B_kj ;  // t =A(i,k)*B(k,j)
65                         GB_HX_WRITE (i, t) ; // Hx [i] = t
66                     }
67                     else if (hf == mark1)
68                     {
69                         // C(i,j) += A(i,k) * B(k,j)
70                         GB_MULT_A_ik_B_kj ;  // t =A(i,k)*B(k,j)
71                         GB_HX_UPDATE (i, t) ;// Hx [i] += t
72                     }
73                 }
74             }
75             GB_GATHER_ALL_C_j(mark1) ;  // gather into C(:,j)
76 
77         }
78         else
79         {
80 
81             //------------------------------------------------------------------
82             // C(:,j) is very sparse
83             //------------------------------------------------------------------
84 
85             for ( ; pB < pB_end ; pB++)     // scan B(:,j)
86             {
87                 GB_GET_B_kj_INDEX ;         // get k of B(k,j)
88                 GB_GET_A_k ;                // get A(:,k)
89                 if (aknz == 0) continue ;
90                 GB_GET_B_kj ;               // bkj = B(k,j)
91                 // scan A(:,k)
92                 for (int64_t pA = pA_start ; pA < pA_end ; pA++)
93                 {
94                     GB_GET_A_ik_INDEX ;     // get i of A(i,k)
95                     int64_t hf = Hf [i] ;
96                     if (hf < mark)
97                     {
98                         // C(i,j) = A(i,k) * B(k,j)
99                         Hf [i] = mark1 ;        // mark as seen
100                         GB_MULT_A_ik_B_kj ;  // t =A(i,k)*B(k,j)
101                         GB_HX_WRITE (i, t) ;    // Hx [i] = t
102                         Ci [pC++] = i ; // create C(:,j) pattern
103                     }
104                     else if (hf == mark1)
105                     {
106                         // C(i,j) += A(i,k) * B(k,j)
107                         GB_MULT_A_ik_B_kj ;  // t =A(i,k)*B(k,j)
108                         GB_HX_UPDATE (i, t) ;   // Hx [i] += t
109                     }
110                 }
111             }
112             GB_SORT_AND_GATHER_C_j ;    // gather into C(:,j)
113         }
114     }
115 }
116 
117