1 //------------------------------------------------------------------------------ 2 // GB_AxB_saxpy3_coarseGus_notM_phase5: C<!M>=A*B, coarse Gustavson, phase5 3 //------------------------------------------------------------------------------ 4 5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved. 6 // SPDX-License-Identifier: Apache-2.0 7 8 //------------------------------------------------------------------------------ 9 10 { 11 12 //-------------------------------------------------------------------------- 13 // phase5: coarse Gustavson task, C<!M>=A*B 14 //-------------------------------------------------------------------------- 15 16 // Since the mask is !M: 17 // Hf [i] < mark : M(i,j)=0, C(i,j) is not yet seen. 18 // Hf [i] == mark : M(i,j)=1, so C(i,j) is ignored. 19 // Hf [i] == mark+1 : M(i,j)=0, and C(i,j) has been seen. 20 21 for (int64_t kk = kfirst ; kk <= klast ; kk++) 22 { 23 int64_t pC = Cp [kk] ; 24 int64_t cjnz = Cp [kk+1] - pC ; 25 if (cjnz == 0) continue ; // nothing to do 26 GB_GET_B_j ; // get B(:,j) 27 #ifndef GB_GENERIC 28 if (cjnz == cvlen) // C(:,j) is dense 29 { 30 // This is not used for the generic saxpy3. 31 GB_COMPUTE_DENSE_C_j ; // C(:,j) = A*B(:,j) 32 continue ; 33 } 34 #endif 35 GB_GET_M_j ; // get M(:,j) 36 mark += 2 ; 37 int64_t mark1 = mark+1 ; 38 39 // scatter M(:,j) into the Gustavson workspace 40 GB_SCATTER_M_j (pM_start, pM_end, mark) ; 41 42 if (16 * cjnz > cvlen) 43 { 44 45 //------------------------------------------------------------------ 46 // C(:,j) is not very sparse 47 //------------------------------------------------------------------ 48 49 for ( ; pB < pB_end ; pB++) // scan B(:,j) 50 { 51 GB_GET_B_kj_INDEX ; // get k of B(k,j) 52 GB_GET_A_k ; // get A(:,k) 53 if (aknz == 0) continue ; 54 GB_GET_B_kj ; // bkj = B(k,j) 55 // scan A(:,k) 56 for (int64_t pA = pA_start ; pA < pA_end ; pA++) 57 { 58 GB_GET_A_ik_INDEX ; // get i of A(i,k) 59 int64_t hf = Hf [i] ; 60 if (hf < mark) 61 { 62 // C(i,j) = A(i,k) * B(k,j) 63 Hf [i] = mark1 ; // mark as seen 64 GB_MULT_A_ik_B_kj ; // t =A(i,k)*B(k,j) 65 GB_HX_WRITE (i, t) ; // Hx [i] = t 66 } 67 else if (hf == mark1) 68 { 69 // C(i,j) += A(i,k) * B(k,j) 70 GB_MULT_A_ik_B_kj ; // t =A(i,k)*B(k,j) 71 GB_HX_UPDATE (i, t) ;// Hx [i] += t 72 } 73 } 74 } 75 GB_GATHER_ALL_C_j(mark1) ; // gather into C(:,j) 76 77 } 78 else 79 { 80 81 //------------------------------------------------------------------ 82 // C(:,j) is very sparse 83 //------------------------------------------------------------------ 84 85 for ( ; pB < pB_end ; pB++) // scan B(:,j) 86 { 87 GB_GET_B_kj_INDEX ; // get k of B(k,j) 88 GB_GET_A_k ; // get A(:,k) 89 if (aknz == 0) continue ; 90 GB_GET_B_kj ; // bkj = B(k,j) 91 // scan A(:,k) 92 for (int64_t pA = pA_start ; pA < pA_end ; pA++) 93 { 94 GB_GET_A_ik_INDEX ; // get i of A(i,k) 95 int64_t hf = Hf [i] ; 96 if (hf < mark) 97 { 98 // C(i,j) = A(i,k) * B(k,j) 99 Hf [i] = mark1 ; // mark as seen 100 GB_MULT_A_ik_B_kj ; // t =A(i,k)*B(k,j) 101 GB_HX_WRITE (i, t) ; // Hx [i] = t 102 Ci [pC++] = i ; // create C(:,j) pattern 103 } 104 else if (hf == mark1) 105 { 106 // C(i,j) += A(i,k) * B(k,j) 107 GB_MULT_A_ik_B_kj ; // t =A(i,k)*B(k,j) 108 GB_HX_UPDATE (i, t) ; // Hx [i] += t 109 } 110 } 111 } 112 GB_SORT_AND_GATHER_C_j ; // gather into C(:,j) 113 } 114 } 115 } 116 117