1 //------------------------------------------------------------------------------
2 // GB_AxB_dot2: compute C=A'*B or C<!M>=A'*B in parallel, in-place
3 //------------------------------------------------------------------------------
4
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7
8 //------------------------------------------------------------------------------
9
10 // This method always constructs C as bitmap; it then converts C to sparse or
11 // hyper if A or B are hypersparse. The C<M>=A'*B dot product when C is sparse
12 // is computed by GB_AxB_dot3. This method handles the case when C is bitmap.
13
14 // TODO: this is slower than it could be if A and B are both bitmap, when
15 // A->vlen is large, and likely if A and B are both either bitmap or full.
16 // This is because the inner loop is a simple full/bitmap dot product, across
17 // the entire input vectors. No tiling is used, so cache performance is not
18 // as good as it could be. For large problems, C=(A')*B is faster with
19 // the saxpy3 method, as compared to this method with C=A'*B.
20
21 #include "GB_mxm.h"
22 #include "GB_subref.h"
23 #include "GB_binop.h"
24 #include "GB_ek_slice.h"
25 #include "GB_bitmap_assign_methods.h"
26 #ifndef GBCOMPACT
27 #include "GB_AxB__include.h"
28 #endif
29
//------------------------------------------------------------------------------
// GB_FREE_ALL: free all workspace allocated by GB_AxB_dot2
//------------------------------------------------------------------------------

// Frees the extracted submask M2 and pops the three workspace arrays, in
// reverse order of their acquisition.  Safe to invoke at any point: M2 is
// NULL until the submask is extracted, and GB_WERK_POP on a never-pushed
// (or already-popped) workspace is a no-op.  Used both on error paths (via
// GB_OK) and on normal completion.

#define GB_FREE_ALL                             \
{                                               \
    GB_phbix_free (M2) ;                        \
    GB_WERK_POP (M_ek_slicing, int64_t) ;       \
    GB_WERK_POP (B_slice, int64_t) ;            \
    GB_WERK_POP (A_slice, int64_t) ;            \
}
37
// Preconditions: C is an uninitialized static header; A_in and B_in are fully
// finished (no zombies, jumbles, or pending tuples) with A_in->vlen ==
// B_in->vlen > 0; M_in may be NULL, and may be jumbled but not unfinished.

GB_PUBLIC                           // accessed by the MATLAB tests in GraphBLAS/Test only
GrB_Info GB_AxB_dot2                // C=A'*B or C<!M>=A'*B, dot product method
(
    GrB_Matrix C,                   // output matrix, static header
    const GrB_Matrix M_in,          // mask matrix for C<!M>=A'*B, may be NULL
    const bool Mask_comp,           // if true, use !M
    const bool Mask_struct,         // if true, use only the structure of M
    const GrB_Matrix A_in,          // input matrix
    const GrB_Matrix B_in,          // input matrix
    const GrB_Semiring semiring,    // semiring that defines C=A*B
    const bool flipxy,              // if true, do z=fmult(b,a) vs fmult(a,b)
    GB_Context Context
)
{
    // double ttt = omp_get_wtime ( ) ;

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    GrB_Info info ;

    ASSERT (C != NULL && C->static_header) ;
    ASSERT_MATRIX_OK_OR_NULL (M_in, "M for dot A'*B", GB0) ;
    ASSERT_MATRIX_OK (A_in, "A for dot A'*B", GB0) ;
    ASSERT_MATRIX_OK (B_in, "B for dot A'*B", GB0) ;

    // M_in may be jumbled, but A_in and B_in must be fully finished
    ASSERT (!GB_ZOMBIES (M_in)) ;
    ASSERT (GB_JUMBLED_OK (M_in)) ;
    ASSERT (!GB_PENDING (M_in)) ;
    ASSERT (!GB_ZOMBIES (A_in)) ;
    ASSERT (!GB_JUMBLED (A_in)) ;
    ASSERT (!GB_PENDING (A_in)) ;
    ASSERT (!GB_ZOMBIES (B_in)) ;
    ASSERT (!GB_JUMBLED (B_in)) ;
    ASSERT (!GB_PENDING (B_in)) ;

    ASSERT_SEMIRING_OK (semiring, "semiring for numeric A'*B", GB0) ;

    // M is the mask actually used below: either M_in as-is, or the extracted
    // submask M2 = M_in (Ah,Bh) when A or B are hypersparse
    GrB_Matrix M = NULL ;

    struct GB_Matrix_opaque M2_header ;
    GrB_Matrix M2 = NULL ;
    GB_WERK_DECLARE (A_slice, int64_t) ;
    GB_WERK_DECLARE (B_slice, int64_t) ;
    GB_WERK_DECLARE (M_ek_slicing, int64_t) ;
    ASSERT (A_in->vlen == B_in->vlen) ;
    ASSERT (A_in->vlen > 0) ;

    // burble a description of the operation, e.g. "(b{s}=s'*b)"
    if (M_in == NULL)
    {
        GBURBLE ("(%s=%s'*%s) ",
            GB_sparsity_char (GxB_BITMAP),
            GB_sparsity_char_matrix (A_in),
            GB_sparsity_char_matrix (B_in)) ;
    }
    else
    {
        GBURBLE ("(%s%s%s%s%s=%s'*%s) ",
            GB_sparsity_char (GxB_BITMAP),
            Mask_struct ? "{" : "<",
            Mask_comp ? "!" : "",
            GB_sparsity_char_matrix (M_in),
            Mask_struct ? "}" : ">",
            GB_sparsity_char_matrix (A_in),
            GB_sparsity_char_matrix (B_in)) ;
    }

    //--------------------------------------------------------------------------
    // construct shallow copies of A and B, if hypersparse
    //--------------------------------------------------------------------------

    // If A_in is hypersparse, a new sparse matrix A is constructed with
    // A->vdim = A_in->nvec and the same vlen as A_in, and then the packed
    // C->vlen will equal A->vdim < cvlen_final.

    // If B_in is hypersparse, a new sparse matrix B is constructed with
    // B->vdim = B_in->nvec and the same vlen as B_in, and then the packed
    // C->vdim will equal B->vdim < cvdim_final.

    int64_t cvlen_final = A_in->vdim ;      // final C->vlen, after unpacking
    int64_t cvdim_final = B_in->vdim ;      // final C->vdim, after unpacking
    bool A_is_hyper = GB_IS_HYPERSPARSE (A_in) ;
    bool B_is_hyper = GB_IS_HYPERSPARSE (B_in) ;
    bool A_or_B_hyper = A_is_hyper || B_is_hyper ;
    GrB_Index *restrict Ah = (GrB_Index *) A_in->h ;
    GrB_Index *restrict Bh = (GrB_Index *) B_in->h ;
    // A_header and B_header hold the shallow packed copies; GB_hyper_pack
    // does not allocate, so A and B need no explicit free below
    struct GB_Matrix_opaque A_header, B_header ;
    GrB_Matrix A = (A_is_hyper) ? GB_hyper_pack (&A_header, A_in) : A_in ;
    GrB_Matrix B = (B_is_hyper) ? GB_hyper_pack (&B_header, B_in) : B_in ;
    ASSERT (!GB_IS_HYPERSPARSE (A)) ;
    ASSERT (!GB_IS_HYPERSPARSE (B)) ;

    //--------------------------------------------------------------------------
    // determine the size of C
    //--------------------------------------------------------------------------

    int64_t cnvec = B->nvec ;
    int64_t cvlen = A->vdim ;
    int64_t cvdim = B->vdim ;

    // cnz = cvlen*cvdim, the size of the C bitmap, with overflow check
    int64_t cnz ;
    if (!GB_Index_multiply ((GrB_Index *) (&cnz), cvlen, cvdim))
    {
        // problem too large: no workspace has been allocated yet, so
        // nothing needs to be freed here
        return (GrB_OUT_OF_MEMORY) ;
    }

    //--------------------------------------------------------------------------
    // extract the submask if A or B are hypersparse
    //--------------------------------------------------------------------------

    if (A_or_B_hyper && M_in != NULL)
    {
        // M2 = M_in (Ah, Bh), where M2 has a static header
        M2 = GB_clear_static_header (&M2_header) ;
        GB_OK (GB_subref (M2, M_in->is_csc, M_in,
            (A_is_hyper) ? Ah : GrB_ALL, cvlen,
            (B_is_hyper) ? Bh : GrB_ALL, cvdim, false, Context)) ;
        // TODO: if Mask_struct is true, only extract the pattern of M_in
        M = M2 ;
        ASSERT_MATRIX_OK_OR_NULL (M, "M submask dot A'*B", GB0) ;
    }
    else
    {
        // use the mask as-is
        M = M_in ;
    }

    //--------------------------------------------------------------------------
    // determine the number of threads to use
    //--------------------------------------------------------------------------

    int64_t naslice = 0 ;       // # of slices of A' (tasks along C rows)
    int64_t nbslice = 0 ;       // # of slices of B  (tasks along C columns)

    int64_t anvec = A->nvec ;
    int64_t anz = GB_NNZ_HELD (A) ;

    int64_t bnvec = B->nvec ;
    int64_t bnz = GB_NNZ_HELD (B) ;

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
    int nthreads = GB_nthreads (anz + bnz, chunk, nthreads_max) ;

    #define GB_NTASKS_PER_THREAD 32

    if (nthreads == 1)
    {
        // do the entire computation with a single thread
        naslice = 1 ;
        nbslice = 1 ;
    }
    else
    {
        // determine number of slices for A' and B
        if (bnvec == 1)
        {
            // C and B are single vectors
            naslice = GB_NTASKS_PER_THREAD * nthreads ;
            nbslice = 1 ;
        }
        else if (anvec == 1 || bnvec == 0
            || bnvec > GB_NTASKS_PER_THREAD * nthreads)
        {
            // A is a single vector, or B is empty, or B is large: just slice B
            naslice = 1 ;
            nbslice = GB_NTASKS_PER_THREAD * nthreads ;
        }
        else
        {
            // slice B into individual vectors
            nbslice = bnvec ;

            // slice A' to get about GB_NTASKS_PER_THREAD*nthreads total tasks
            naslice = (GB_NTASKS_PER_THREAD * nthreads) / nbslice ;

            // but do not slice A too finely
            naslice = GB_IMIN (naslice, anvec/4) ;
            naslice = GB_IMAX (naslice, nthreads) ;
        }
    }

    //--------------------------------------------------------------------------
    // get the semiring operators
    //--------------------------------------------------------------------------

    GrB_BinaryOp mult = semiring->multiply ;
    GrB_Monoid add = semiring->add ;
    ASSERT (mult->ztype == add->op->ztype) ;
    // determine whether the multiply operator ignores the values of A and/or
    // B, in which case only their pattern is needed
    bool A_is_pattern, B_is_pattern ;
    GB_AxB_pattern (&A_is_pattern, &B_is_pattern, flipxy, mult->opcode) ;

    //--------------------------------------------------------------------------
    // allocate workspace and slice A and B
    //--------------------------------------------------------------------------

    // A and B can have any sparsity: full, bitmap, sparse, or hypersparse.
    // C is always created as bitmap

    GB_WERK_PUSH (A_slice, naslice + 1, int64_t) ;
    GB_WERK_PUSH (B_slice, nbslice + 1, int64_t) ;
    if (A_slice == NULL || B_slice == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }
    // A_slice [0..naslice] and B_slice [0..nbslice] are boundaries of
    // roughly equal-work partitions of the vectors of A and B
    GB_pslice (A_slice, A->p, A->nvec, naslice, false) ;
    GB_pslice (B_slice, B->p, B->nvec, nbslice, false) ;

    // ttt = omp_get_wtime ( ) - ttt ;
    // GB_Global_timing_add (17, ttt) ;
    // ttt = omp_get_wtime ( ) ;

    //--------------------------------------------------------------------------
    // allocate C
    //--------------------------------------------------------------------------

    // if M is sparse/hyper, then calloc C->b; otherwise use malloc
    bool M_is_sparse_or_hyper = (M != NULL) &&
        (GB_IS_SPARSE (M) || GB_IS_HYPERSPARSE (M)) ;
    GrB_Type ctype = add->op->ztype ;
    GB_OK (GB_new_bix (&C, true,    // bitmap, static header
        ctype, cvlen, cvdim, GB_Ap_malloc, true,
        GxB_BITMAP, M_is_sparse_or_hyper, B->hyper_switch, cnvec, cnz, true,
        Context)) ;

    // ttt = omp_get_wtime ( ) - ttt ;
    // GB_Global_timing_add (18, ttt) ;
    // ttt = omp_get_wtime ( ) ;

    //--------------------------------------------------------------------------
    // if M is sparse/hyper, scatter it into the C bitmap
    //--------------------------------------------------------------------------

    if (M_is_sparse_or_hyper)
    {
        // FUTURE:: could just set Cb [pC] = 2 since Cb has just been calloc'd.
        // However, in the future, this method might be able to modify C on
        // input, in which case C->b will not be all zero.

        int M_ntasks, M_nthreads ;
        GB_SLICE_MATRIX (M, 8, chunk) ;

        // Cb [pC] += 2 for each entry M(i,j) in the mask
        GB_bitmap_M_scatter (C,
            NULL, 0, GB_ALL, NULL, NULL, 0, GB_ALL, NULL,
            M, Mask_struct, GB_ASSIGN, GB_BITMAP_M_SCATTER_PLUS_2,
            M_ek_slicing, M_ntasks, M_nthreads, Context) ;
        // the bitmap of C now contains:
        //      Cb (i,j) = 0:   cij not present, mij zero
        //      Cb (i,j) = 1:   cij present, mij zero (not used yet)
        //      Cb (i,j) = 2:   cij not present, mij 1
        //      Cb (i,j) = 3:   cij present, mij 1 (not used yet)
        GB_WERK_POP (M_ek_slicing, int64_t) ;
    }

    //--------------------------------------------------------------------------
    // C<#>=A'*B, computing each entry with a dot product, via builtin semiring
    //--------------------------------------------------------------------------

    bool done = false ;

    #ifndef GBCOMPACT

        //----------------------------------------------------------------------
        // define the worker for the switch factory
        //----------------------------------------------------------------------

        #define GB_Adot2B(add,mult,xname) GB (_Adot2B_ ## add ## mult ## xname)

        // each worker returns GrB_NO_VALUE if no type-specialized kernel
        // exists for this semiring, in which case the generic method is used
        #define GB_AxB_WORKER(add,mult,xname)                               \
        {                                                                   \
            info = GB_Adot2B (add,mult,xname) (C, M, Mask_comp, Mask_struct,\
                A, A_is_pattern, A_slice, B, B_is_pattern, B_slice,         \
                nthreads, naslice, nbslice) ;                               \
            done = (info != GrB_NO_VALUE) ;                                 \
        }                                                                   \
        break ;

        //----------------------------------------------------------------------
        // launch the switch factory
        //----------------------------------------------------------------------

        GB_Opcode mult_opcode, add_opcode ;
        GB_Type_code xcode, ycode, zcode ;

        if (GB_AxB_semiring_builtin (A, A_is_pattern, B, B_is_pattern, semiring,
            flipxy, &mult_opcode, &add_opcode, &xcode, &ycode, &zcode))
        {
            #include "GB_AxB_factory.c"
        }
        ASSERT (info == GrB_SUCCESS || info == GrB_NO_VALUE) ;

    #endif

    //--------------------------------------------------------------------------
    // C = A'*B, computing each entry with a dot product, with typecasting
    //--------------------------------------------------------------------------

    // generic fallback: handles any semiring, with typecasting
    if (!done)
    {
        #define GB_DOT2_GENERIC
        GB_BURBLE_MATRIX (C, "(generic C%s=A'*B) ", (M == NULL) ? "" :
            (Mask_comp ? "<!M>" : "<M>")) ;
        #include "GB_AxB_dot_generic.c"
    }

    //--------------------------------------------------------------------------
    // free workspace
    //--------------------------------------------------------------------------

    GB_FREE_ALL ;
    C->magic = GB_MAGIC ;       // C is now initialized
    ASSERT_MATRIX_OK (C, "dot2: C = A'*B output", GB0) ;
    ASSERT (!GB_ZOMBIES (C)) ;

    //--------------------------------------------------------------------------
    // unpack C if A or B are hypersparse
    //--------------------------------------------------------------------------

    if (A_or_B_hyper)
    {

        //----------------------------------------------------------------------
        // unpack C from bitmap to sparse/hyper
        //----------------------------------------------------------------------

        // C is currently A_in->nvec by B_in->nvec, in bitmap form.  It must be
        // unpacked into sparse/hypersparse form, with zombies.

        //----------------------------------------------------------------------
        // allocate the sparse/hypersparse structure of the final C
        //----------------------------------------------------------------------

        int64_t *restrict Cp = NULL ; size_t Cp_size = 0 ;
        int64_t *restrict Ch = NULL ; size_t Ch_size = 0 ;
        int64_t *restrict Ci = NULL ; size_t Ci_size = 0 ;

        Cp = GB_MALLOC (cvdim+1, int64_t, &Cp_size) ;
        Ch = NULL ;
        if (B_is_hyper)
        {
            // Ch is needed only if the final C is hypersparse
            Ch = GB_MALLOC (cvdim, int64_t, &Ch_size) ;
        }
        Ci = GB_MALLOC (cnz, int64_t, &Ci_size) ;
        if (Cp == NULL || (B_is_hyper && Ch == NULL) || Ci == NULL)
        {
            // out of memory: all other workspace was already freed above
            GB_phbix_free (C) ;
            GB_FREE (&Cp, Cp_size) ;
            GB_FREE (&Ch, Ch_size) ;
            GB_FREE (&Ci, Ci_size) ;
            return (GrB_OUT_OF_MEMORY) ;
        }

        //----------------------------------------------------------------------
        // construct the hyperlist of C, if B is hypersparse
        //----------------------------------------------------------------------

        nthreads = GB_nthreads (cvdim, chunk, nthreads_max) ;
        if (B_is_hyper)
        {
            // C becomes hypersparse, with the same hyperlist as B_in
            ASSERT (cvdim == B_in->nvec) ;
            GB_memcpy (Ch, B_in->h, cvdim * sizeof (int64_t), nthreads) ;
        }

        //----------------------------------------------------------------------
        // construct the vector pointers of C
        //----------------------------------------------------------------------

        // every vector of C holds exactly cvlen entries (some are zombies)
        int64_t pC ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (pC = 0 ; pC < cvdim+1 ; pC++)
        {
            Cp [pC] = pC * cvlen ;
        }

        //----------------------------------------------------------------------
        // construct the pattern of C from its bitmap
        //----------------------------------------------------------------------

        // C(i,j) becomes a zombie (its row index i flipped) if not present
        // in the bitmap
        nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;

        int8_t *restrict Cb = C->b ;
        if (A_is_hyper)
        {
            // row indices of C are taken from the hyperlist of A_in
            ASSERT (cvlen == A_in->nvec) ;
            #pragma omp parallel for num_threads(nthreads) schedule(static)
            for (pC = 0 ; pC < cnz ; pC++)
            {
                int64_t i = Ah [pC % cvlen] ;
                Ci [pC] = (Cb [pC]) ? i : GB_FLIP (i) ;
            }
        }
        else
        {
            // row indices of C are 0:cvlen-1, repeated for each vector
            ASSERT (cvlen == cvlen_final && cvlen == A->vdim) ;
            #pragma omp parallel for num_threads(nthreads) schedule(static)
            for (pC = 0 ; pC < cnz ; pC++)
            {
                int64_t i = pC % cvlen ;
                Ci [pC] = (Cb [pC]) ? i : GB_FLIP (i) ;
            }
        }

        //----------------------------------------------------------------------
        // transplant the new content and finalize C
        //----------------------------------------------------------------------

        C->p = Cp ; Cp = NULL ; C->p_size = Cp_size ;
        C->h = Ch ; Ch = NULL ; C->h_size = Ch_size ;
        C->i = Ci ; Ci = NULL ; C->i_size = Ci_size ;
        // nzombies must be computed from the bitmap count C->nvals before
        // nvals is cleared below; do not reorder these statements
        C->nzombies = cnz - C->nvals ;
        C->vdim = cvdim_final ;
        C->vlen = cvlen_final ;
        C->nvals = -1 ;             // bitmap count no longer applies
        C->nvec = cvdim ;
        C->plen = cvdim ;
        C->nvec_nonempty = (cvlen == 0) ? 0 : cvdim ;

        // free the bitmap
        GB_FREE ((&C->b), C->b_size) ;

        // C is now sparse or hypersparse
        ASSERT_MATRIX_OK (C, "dot2: unpacked C", GB0) ;
        ASSERT (GB_ZOMBIES_OK (C)) ;
    }

    //--------------------------------------------------------------------------
    // return result
    //--------------------------------------------------------------------------

    ASSERT (GB_ZOMBIES_OK (C)) ;
    ASSERT (!GB_JUMBLED (C)) ;
    ASSERT (!GB_PENDING (C)) ;

    // ttt = omp_get_wtime ( ) - ttt ;
    // GB_Global_timing_add (19, ttt) ;
    // ttt = omp_get_wtime ( ) ;

    return (GrB_SUCCESS) ;
}
484
485