//------------------------------------------------------------------------------
// GB_AxB_dot: C<M>=A'*B using dot products
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// Parallel matrix-matrix multiply, A'*B, with optional mask M.  This
// method is used by GrB_mxm, GrB_vxm, and GrB_mxv.  For both of the latter
// two methods, B on input will be an nrows-by-1 column vector.

// This function, and the matrices C, M, A, and B are all CSR/CSC agnostic.
// For this discussion, suppose they are CSC, with vlen = # of rows, and vdim =
// # of columns.

// C=A'*B, C<M>=A'*B or C<!M>=A'*B is being computed.  A has not been
// transposed yet (and will not be).  A and B must have the same vector length,
// vlen (as if both A and B are CSC matrices with the same number of rows, for
// example).  GB_AxB_dot2 and GB_AxB_dot3 operate on A' without forming it.
// GB_AxB_dot2 computes C=A'*B and C<!M>=A'*B, and it takes Omega(m*n) time,
// if C is m-by-n.  It is thus only suitable for cases when A and B are large,
// and C is small.  GB_AxB_dot3 computes C<M>=A'*B, and it only needs to
// examine entries in M, taking Omega(nnz(M)) time.  It can thus be used for
// very large matrices C.  GB_AxB_dot4 computes C+=A'*B when C is dense.

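// For illustration only (a hedged sketch, not compiled here): when GB_mxm
// selects a dot-product method, a user-level call such as
//
//      // C<M> = A'*B over the built-in plus-times semiring on double:
//      GrB_mxm (C, M, NULL, GrB_PLUS_TIMES_SEMIRING_FP64, A, B, GrB_DESC_T0) ;
//
// can reach this function: GrB_DESC_T0 selects the transpose of the first
// input.  Whether dot2, dot3, or dot4 is then used is decided by the
// GB_AxB_dot*_control heuristics below.
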
// The output matrix C has not been allocated.  It is an uninitialized static
// header on input.  The mask M is optional.

// If the result is computed in-place, then the C parameter is ignored, and the
// result is computed in C_in instead.  This case requires the accum operator
// to match the monoid of the semiring.

// The semiring defines C=A*B.  flipxy modifies how the semiring multiply
// operator is applied.  If false, then fmult(aik,bkj) is computed.  If true,
// then the operands are swapped, and fmult(bkj,aik) is done instead.

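// A hedged sketch of the flipxy convention, in the notation above:
//
//      flipxy == false:  t = fmult (aik, bkj)
//      flipxy == true :  t = fmult (bkj, aik)
//
// For a commutative multiplier such as GrB_TIMES_FP64 the two cases agree,
// but for GrB_DIV_FP64, say, flipxy false computes aik/bkj for each term of
// the dot product, while flipxy true computes bkj/aik.
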
// Context: the GB_Context containing the # of threads to use, a string of the
// user-callable function that is calling this function (GrB_mxm, GrB_mxv, or
// GrB_vxm) and detailed error reports.

#include "GB_mxm.h"
#define GB_FREE_ALL ;
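// (GB_FREE_ALL is empty here: this method allocates no workspace of its own
// that would need to be freed on error)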

GrB_Info GB_AxB_dot                 // dot product (multiple methods)
(
    GrB_Matrix C,                   // output matrix, static header
    GrB_Matrix C_in,                // input/output matrix, if done in-place
    GrB_Matrix M,                   // optional mask matrix
    const bool Mask_comp,           // if true, use !M
    const bool Mask_struct,         // if true, use only the structure of M
    const GrB_Matrix A,             // input matrix A
    const GrB_Matrix B,             // input matrix B
    const GrB_Semiring semiring,    // semiring that defines C=A*B
    const bool flipxy,              // if true, do z=fmult(b,a) vs fmult(a,b)
    bool *mask_applied,             // if true, mask was applied
    bool *done_in_place,            // if true, C_in was computed in-place
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (C != NULL && C->static_header) ;

    ASSERT_MATRIX_OK_OR_NULL (M, "M for dot A'*B", GB0) ;
    ASSERT (!GB_PENDING (M)) ;
    ASSERT (GB_JUMBLED_OK (M)) ;
    ASSERT (!GB_ZOMBIES (M)) ;

    ASSERT_MATRIX_OK (A, "A for dot A'*B", GB0) ;
    GB_MATRIX_WAIT (A) ;
    ASSERT (!GB_PENDING (A)) ;
    ASSERT (!GB_JUMBLED (A)) ;
    ASSERT (!GB_ZOMBIES (A)) ;

    ASSERT_MATRIX_OK (B, "B for dot A'*B", GB0) ;
    GB_MATRIX_WAIT (B) ;
    ASSERT (!GB_PENDING (B)) ;
    ASSERT (!GB_JUMBLED (B)) ;
    ASSERT (!GB_ZOMBIES (B)) ;

    ASSERT_SEMIRING_OK (semiring, "semiring for dot A'*B", GB0) ;

    //--------------------------------------------------------------------------
    // in-place C+=A'*B. mask is not present (and not applied)
    //--------------------------------------------------------------------------

    if (GB_AxB_dot4_control (C_in, M, Mask_comp))
    {
        (*done_in_place) = true ;
        (*mask_applied) = false ;   // no mask to apply
        return (GB_AxB_dot4 (C_in, A, B, semiring, flipxy, Context)) ;
    }
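
    // A hedged usage sketch (not compiled here): the in-place case above can
    // be reached by a call such as
    //
    //      // C += A'*B with C dense and no mask, where the accum operator
    //      // (GrB_PLUS_FP64) matches the PLUS monoid of the semiring:
    //      GrB_mxm (C, NULL, GrB_PLUS_FP64, GrB_PLUS_TIMES_SEMIRING_FP64,
    //          A, B, GrB_DESC_T0) ;
    //
    // in which case GB_mxm can pass its output as C_in and GB_AxB_dot4
    // updates it in-place.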

    //--------------------------------------------------------------------------
    // check the empty case
    //--------------------------------------------------------------------------

    if (A->vlen == 0)
    {
        // no work to do; C is an empty matrix, normally hypersparse
        if (C_in != NULL) return (GrB_SUCCESS) ;
        return (GB_new (&C, true,   // auto sparsity, static header
            semiring->add->op->ztype, A->vdim, B->vdim, GB_Ap_calloc, true,
            GxB_AUTO_SPARSITY, GB_Global_hyper_switch_get ( ), 1, Context)) ;
    }
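
    // For example (a hedged illustration): if A is 0-by-5 and B is 0-by-7
    // (both CSC), the inner dimension of A'*B is zero, so every dot product
    // is empty and the case above returns a 5-by-7 matrix C with no entries.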

    //--------------------------------------------------------------------------
    // C<M>=A'*B: general case
    //--------------------------------------------------------------------------

    if (GB_AxB_dot3_control (M, Mask_comp))
    {

        // use dot3 if M is present and not complemented, and either sparse or
        // hypersparse
        GBURBLE ("dot3 ") ;
        (*mask_applied) = true ;    // mask is always applied
        (*done_in_place) = false ;

        #if defined ( GBCUDA )

        // [ replace this with:
        // if (GB_AxB_dot3_cuda_branch (M, Mask_struct, A, B, semiring, flipxy, Context)

        // very rough estimate of the work to do
        int64_t anz = GB_IS_FULL (A) ? GB_NNZ_FULL (A) : GB_NNZ (A) ;
        int64_t bnz = GB_IS_FULL (B) ? GB_NNZ_FULL (B) : GB_NNZ (B) ;
        int64_t mnz = GB_NNZ (M) ;

        double adeg = ((double) anz) / ((double) GB_IMAX (1, A->nvec)) ;
        double bdeg = ((double) bnz) / ((double) GB_IMAX (1, B->nvec)) ;
        double work = mnz * GB_IMIN (adeg, bdeg) ;
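
        // For example (a hedged illustration): anz = 1e6 over A->nvec = 1e4
        // vectors gives adeg = 100; bnz = 1e5 over B->nvec = 1e4 vectors
        // gives bdeg = 10; with mnz = 1e6, work = 1e6 * min (100, 10) = 1e7,
        // a rough count of the multiply-adds the masked dot product needs.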

        // TODO for GPU: if A or B are not accessed (first, 2nd, or pair
        // ops) then the type of A can be user-defined here, for CUDA.

        int ngpus_to_use = GB_ngpus_to_use (work) ;
        GBURBLE (" work:%g gpus:%d ", work, ngpus_to_use) ;
        if (ngpus_to_use > 0
            && (semiring->header_size == 0)     // semiring is built-in
            && (A->type->code != GB_UDT_code)
            && (B->type->code != GB_UDT_code)
            && !GB_IS_BITMAP (A) && !GB_IS_BITMAP (B))
        // to here ... ]
        {
            // use "the" GPU (TODO for GPU: could use multiple GPUs too)
            return (GB_AxB_dot3_cuda (C, M, Mask_struct, A, B, semiring,
                flipxy, Context)) ;
        }
        else
        #endif
        {
            // use the CPU
            return (GB_AxB_dot3 (C, M, Mask_struct, A, B, semiring,
                flipxy, Context)) ;
        }
    }

    //--------------------------------------------------------------------------
    // general case: C<M>=A'*B, C<!M>=A'*B, or C=A'*B, not in-place
    //--------------------------------------------------------------------------

    (*mask_applied) = (M != NULL) ; // mask applied if present
    (*done_in_place) = false ;      // TODO: allow dot2 to work in-place
    return (GB_AxB_dot2 (C, M, Mask_comp, Mask_struct, A, B, semiring,
        flipxy, Context)) ;
}
