//------------------------------------------------------------------------------
// GB_AxB_dot2: compute C=A'*B or C<!M>=A'*B in parallel, in-place
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// This method always constructs C as bitmap; it then converts C to sparse or
// hyper if A or B are hypersparse.  The C<M>=A'*B dot product when C is sparse
// is computed by GB_AxB_dot3.  This method handles the case when C is bitmap.

// TODO:  this is slower than it could be if A and B are both bitmap, when
// A->vlen is large, and likely if A and B are both either bitmap or full.
// This is because the inner loop is a simple full/bitmap dot product, across
// the entire input vectors.  No tiling is used, so cache performance is not
// as good as it could be.  For large problems, C=(A')*B is faster with
// the saxpy3 method, as compared to this method with C=A'*B.
20 
21 #include "GB_mxm.h"
22 #include "GB_subref.h"
23 #include "GB_binop.h"
24 #include "GB_ek_slice.h"
25 #include "GB_bitmap_assign_methods.h"
26 #ifndef GBCOMPACT
27 #include "GB_AxB__include.h"
28 #endif
29 
// GB_FREE_ALL: free all workspace held by this method.  The method below
// invokes it with some (or all) of M2, M_ek_slicing, B_slice, and A_slice
// still NULL, so each of these free/pop operations must tolerate NULL.
#define GB_FREE_ALL                         \
{                                           \
    GB_phbix_free (M2) ;                    \
    GB_WERK_POP (M_ek_slicing, int64_t) ;   \
    GB_WERK_POP (B_slice, int64_t) ;        \
    GB_WERK_POP (A_slice, int64_t) ;        \
}
37 
38 GB_PUBLIC   // accessed by the MATLAB tests in GraphBLAS/Test only
GB_AxB_dot2(GrB_Matrix C,const GrB_Matrix M_in,const bool Mask_comp,const bool Mask_struct,const GrB_Matrix A_in,const GrB_Matrix B_in,const GrB_Semiring semiring,const bool flipxy,GB_Context Context)39 GrB_Info GB_AxB_dot2                // C=A'*B or C<!M>=A'*B, dot product method
40 (
41     GrB_Matrix C,                   // output matrix, static header
42     const GrB_Matrix M_in,          // mask matrix for C<!M>=A'*B, may be NULL
43     const bool Mask_comp,           // if true, use !M
44     const bool Mask_struct,         // if true, use the only structure of M
45     const GrB_Matrix A_in,          // input matrix
46     const GrB_Matrix B_in,          // input matrix
47     const GrB_Semiring semiring,    // semiring that defines C=A*B
48     const bool flipxy,              // if true, do z=fmult(b,a) vs fmult(a,b)
49     GB_Context Context
50 )
51 {
52 // double ttt = omp_get_wtime ( ) ;
53 
54     //--------------------------------------------------------------------------
55     // check inputs
56     //--------------------------------------------------------------------------
57 
58     GrB_Info info ;
59 
60     ASSERT (C != NULL && C->static_header) ;
61     ASSERT_MATRIX_OK_OR_NULL (M_in, "M for dot A'*B", GB0) ;
62     ASSERT_MATRIX_OK (A_in, "A for dot A'*B", GB0) ;
63     ASSERT_MATRIX_OK (B_in, "B for dot A'*B", GB0) ;
64 
65     ASSERT (!GB_ZOMBIES (M_in)) ;
66     ASSERT (GB_JUMBLED_OK (M_in)) ;
67     ASSERT (!GB_PENDING (M_in)) ;
68     ASSERT (!GB_ZOMBIES (A_in)) ;
69     ASSERT (!GB_JUMBLED (A_in)) ;
70     ASSERT (!GB_PENDING (A_in)) ;
71     ASSERT (!GB_ZOMBIES (B_in)) ;
72     ASSERT (!GB_JUMBLED (B_in)) ;
73     ASSERT (!GB_PENDING (B_in)) ;
74 
75     ASSERT_SEMIRING_OK (semiring, "semiring for numeric A'*B", GB0) ;
76 
77     GrB_Matrix M = NULL ;
78 
79     struct GB_Matrix_opaque M2_header ;
80     GrB_Matrix M2 = NULL ;
81     GB_WERK_DECLARE (A_slice, int64_t) ;
82     GB_WERK_DECLARE (B_slice, int64_t) ;
83     GB_WERK_DECLARE (M_ek_slicing, int64_t) ;
84     ASSERT (A_in->vlen == B_in->vlen) ;
85     ASSERT (A_in->vlen > 0) ;
86 
87     if (M_in == NULL)
88     {
89         GBURBLE ("(%s=%s'*%s) ",
90             GB_sparsity_char (GxB_BITMAP),
91             GB_sparsity_char_matrix (A_in),
92             GB_sparsity_char_matrix (B_in)) ;
93     }
94     else
95     {
96         GBURBLE ("(%s%s%s%s%s=%s'*%s) ",
97             GB_sparsity_char (GxB_BITMAP),
98             Mask_struct ? "{" : "<",
99             Mask_comp ? "!" : "",
100             GB_sparsity_char_matrix (M_in),
101             Mask_struct ? "}" : ">",
102             GB_sparsity_char_matrix (A_in),
103             GB_sparsity_char_matrix (B_in)) ;
104     }
105 
106     //--------------------------------------------------------------------------
107     // construct shallow copies of A and B, if hypersparse
108     //--------------------------------------------------------------------------
109 
110     // If A_in is hypersparse, a new sparse matrix A is constructed with
111     // A->vdim = A_in->nvec and the same vlen as A_in, and then the packed
112     // C->vlen will equal A->vdim < cvlen_final.
113 
114     // If B_in is hypersparse, a new sparse matrix B is constructed with
115     // B->vdim = B_in->nvec and the same vlen as B_in, and then the packed
116     // C->vdim will equal B->vdim < cvdim_final.
117 
118     int64_t cvlen_final = A_in->vdim ;
119     int64_t cvdim_final = B_in->vdim ;
120     bool A_is_hyper = GB_IS_HYPERSPARSE (A_in) ;
121     bool B_is_hyper = GB_IS_HYPERSPARSE (B_in) ;
122     bool A_or_B_hyper = A_is_hyper || B_is_hyper ;
123     GrB_Index *restrict Ah = (GrB_Index *) A_in->h ;
124     GrB_Index *restrict Bh = (GrB_Index *) B_in->h ;
125     struct GB_Matrix_opaque A_header, B_header ;
126     GrB_Matrix A = (A_is_hyper) ? GB_hyper_pack (&A_header, A_in) : A_in ;
127     GrB_Matrix B = (B_is_hyper) ? GB_hyper_pack (&B_header, B_in) : B_in ;
128     ASSERT (!GB_IS_HYPERSPARSE (A)) ;
129     ASSERT (!GB_IS_HYPERSPARSE (B)) ;
130 
131     //--------------------------------------------------------------------------
132     // determine the size of C
133     //--------------------------------------------------------------------------
134 
135     int64_t cnvec = B->nvec ;
136     int64_t cvlen = A->vdim ;
137     int64_t cvdim = B->vdim ;
138 
139     int64_t cnz ;
140     if (!GB_Index_multiply ((GrB_Index *) (&cnz), cvlen, cvdim))
141     {
142         // problem too large
143         return (GrB_OUT_OF_MEMORY) ;
144     }
145 
146     //--------------------------------------------------------------------------
147     // extract the submask if A or B are hypersparse
148     //--------------------------------------------------------------------------
149 
150     if (A_or_B_hyper && M_in != NULL)
151     {
152         // M2 = M_in (Ah, Bh), where M2 has a static header
153         M2 = GB_clear_static_header (&M2_header) ;
154         GB_OK (GB_subref (M2, M_in->is_csc, M_in,
155             (A_is_hyper) ? Ah : GrB_ALL, cvlen,
156             (B_is_hyper) ? Bh : GrB_ALL, cvdim, false, Context)) ;
157         // TODO: if Mask_struct is true, only extract the pattern of M_in
158         M = M2 ;
159         ASSERT_MATRIX_OK_OR_NULL (M, "M submask dot A'*B", GB0) ;
160     }
161     else
162     {
163         // use the mask as-is
164         M = M_in ;
165     }
166 
167     //--------------------------------------------------------------------------
168     // determine the number of threads to use
169     //--------------------------------------------------------------------------
170 
171     int64_t naslice = 0 ;
172     int64_t nbslice = 0 ;
173 
174     int64_t anvec = A->nvec ;
175     int64_t anz   = GB_NNZ_HELD (A) ;
176 
177     int64_t bnvec = B->nvec ;
178     int64_t bnz   = GB_NNZ_HELD (B) ;
179 
180     GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
181     int nthreads = GB_nthreads (anz + bnz, chunk, nthreads_max) ;
182 
183     #define GB_NTASKS_PER_THREAD 32
184 
185     if (nthreads == 1)
186     {
187         // do the entire computation with a single thread
188         naslice = 1 ;
189         nbslice = 1 ;
190     }
191     else
192     {
193         // determine number of slices for A' and B
194         if (bnvec == 1)
195         {
196             // C and B are single vectors
197             naslice = GB_NTASKS_PER_THREAD * nthreads ;
198             nbslice = 1 ;
199         }
200         else if (anvec == 1 || bnvec == 0
201             || bnvec > GB_NTASKS_PER_THREAD * nthreads)
202         {
203             // A is a single vector, or B is empty, or B is large: just slice B
204             naslice = 1 ;
205             nbslice = GB_NTASKS_PER_THREAD * nthreads ;
206         }
207         else
208         {
209             // slice B into individual vectors
210             nbslice = bnvec ;
211 
212             // slice A' to get a total of about 16*nthreads tasks
213             naslice = (GB_NTASKS_PER_THREAD * nthreads) / nbslice ;
214 
215             // but do not slice A too finely
216             naslice = GB_IMIN (naslice, anvec/4) ;
217             naslice = GB_IMAX (naslice, nthreads) ;
218         }
219     }
220 
221     //--------------------------------------------------------------------------
222     // get the semiring operators
223     //--------------------------------------------------------------------------
224 
225     GrB_BinaryOp mult = semiring->multiply ;
226     GrB_Monoid add = semiring->add ;
227     ASSERT (mult->ztype == add->op->ztype) ;
228     bool A_is_pattern, B_is_pattern ;
229     GB_AxB_pattern (&A_is_pattern, &B_is_pattern, flipxy, mult->opcode) ;
230 
231     //--------------------------------------------------------------------------
232     // allocate workspace and slice A and B
233     //--------------------------------------------------------------------------
234 
235     // A and B can have any sparsity: full, bitmap, sparse, or hypersparse.
236     // C is always created as bitmap
237 
238     GB_WERK_PUSH (A_slice, naslice + 1, int64_t) ;
239     GB_WERK_PUSH (B_slice, nbslice + 1, int64_t) ;
240     if (A_slice == NULL || B_slice == NULL)
241     {
242         // out of memory
243         GB_FREE_ALL ;
244         return (GrB_OUT_OF_MEMORY) ;
245     }
246     GB_pslice (A_slice, A->p, A->nvec, naslice, false) ;
247     GB_pslice (B_slice, B->p, B->nvec, nbslice, false) ;
248 
249 // ttt = omp_get_wtime ( ) - ttt ;
250 // GB_Global_timing_add (17, ttt) ;
251 // ttt = omp_get_wtime ( ) ;
252 
253     //--------------------------------------------------------------------------
254     // allocate C
255     //--------------------------------------------------------------------------
256 
257     // if M is sparse/hyper, then calloc C->b; otherwise use malloc
258     bool M_is_sparse_or_hyper = (M != NULL) &&
259         (GB_IS_SPARSE (M) || GB_IS_HYPERSPARSE (M)) ;
260     GrB_Type ctype = add->op->ztype ;
261     GB_OK (GB_new_bix (&C, true, // bitmap, static header
262         ctype, cvlen, cvdim, GB_Ap_malloc, true,
263         GxB_BITMAP, M_is_sparse_or_hyper, B->hyper_switch, cnvec, cnz, true,
264         Context)) ;
265 
266 // ttt = omp_get_wtime ( ) - ttt ;
267 // GB_Global_timing_add (18, ttt) ;
268 // ttt = omp_get_wtime ( ) ;
269 
270     //--------------------------------------------------------------------------
271     // if M is sparse/hyper, scatter it into the C bitmap
272     //--------------------------------------------------------------------------
273 
274     if (M_is_sparse_or_hyper)
275     {
276         // FUTURE:: could just set Cb [pC] = 2 since Cb has just been calloc'd.
277         // However, in the future, this method might be able to modify C on
278         // input, in which case C->b will not be all zero.
279 
280         int M_ntasks, M_nthreads ;
281         GB_SLICE_MATRIX (M, 8, chunk) ;
282 
283         // Cb [pC] += 2 for each entry M(i,j) in the mask
284         GB_bitmap_M_scatter (C,
285             NULL, 0, GB_ALL, NULL, NULL, 0, GB_ALL, NULL,
286             M, Mask_struct, GB_ASSIGN, GB_BITMAP_M_SCATTER_PLUS_2,
287             M_ek_slicing, M_ntasks, M_nthreads, Context) ;
288         // the bitmap of C now contains:
289         //  Cb (i,j) = 0:   cij not present, mij zero
290         //  Cb (i,j) = 1:   cij present, mij zero           (not used yet)
291         //  Cb (i,j) = 2:   cij not present, mij 1
292         //  Cb (i,j) = 3:   cij present, mij 1              (not used yet)
293         GB_WERK_POP (M_ek_slicing, int64_t) ;
294     }
295 
296     //--------------------------------------------------------------------------
297     // C<#>=A'*B, computing each entry with a dot product, via builtin semiring
298     //--------------------------------------------------------------------------
299 
300     bool done = false ;
301 
302     #ifndef GBCOMPACT
303 
304         //----------------------------------------------------------------------
305         // define the worker for the switch factory
306         //----------------------------------------------------------------------
307 
308         #define GB_Adot2B(add,mult,xname) GB (_Adot2B_ ## add ## mult ## xname)
309 
310         #define GB_AxB_WORKER(add,mult,xname)                                \
311         {                                                                    \
312             info = GB_Adot2B (add,mult,xname) (C, M, Mask_comp, Mask_struct, \
313                 A, A_is_pattern, A_slice, B, B_is_pattern, B_slice,          \
314                 nthreads, naslice, nbslice) ;                                \
315             done = (info != GrB_NO_VALUE) ;                                  \
316         }                                                                    \
317         break ;
318 
319         //----------------------------------------------------------------------
320         // launch the switch factory
321         //----------------------------------------------------------------------
322 
323         GB_Opcode mult_opcode, add_opcode ;
324         GB_Type_code xcode, ycode, zcode ;
325 
326         if (GB_AxB_semiring_builtin (A, A_is_pattern, B, B_is_pattern, semiring,
327             flipxy, &mult_opcode, &add_opcode, &xcode, &ycode, &zcode))
328         {
329             #include "GB_AxB_factory.c"
330         }
331         ASSERT (info == GrB_SUCCESS || info == GrB_NO_VALUE) ;
332 
333     #endif
334 
335     //--------------------------------------------------------------------------
336     // C = A'*B, computing each entry with a dot product, with typecasting
337     //--------------------------------------------------------------------------
338 
339     if (!done)
340     {
341         #define GB_DOT2_GENERIC
342         GB_BURBLE_MATRIX (C, "(generic C%s=A'*B) ", (M == NULL) ? "" :
343             (Mask_comp ? "<!M>" : "<M>")) ;
344         #include "GB_AxB_dot_generic.c"
345     }
346 
347     //--------------------------------------------------------------------------
348     // free workspace
349     //--------------------------------------------------------------------------
350 
351     GB_FREE_ALL ;
352     C->magic = GB_MAGIC ;
353     ASSERT_MATRIX_OK (C, "dot2: C = A'*B output", GB0) ;
354     ASSERT (!GB_ZOMBIES (C)) ;
355 
356     //--------------------------------------------------------------------------
357     // unpack C if A or B are hypersparse
358     //--------------------------------------------------------------------------
359 
360     if (A_or_B_hyper)
361     {
362 
363         //----------------------------------------------------------------------
364         // unpack C from bitmap to sparse/hyper
365         //----------------------------------------------------------------------
366 
367         // C is currently A_in->nvec by B_in->nvec, in bitmap form.  It must be
368         // unpacked into sparse/hypersparse form, with zombies.
369 
370         //----------------------------------------------------------------------
371         // allocate the sparse/hypersparse structure of the final C
372         //----------------------------------------------------------------------
373 
374         int64_t *restrict Cp = NULL ; size_t Cp_size = 0 ;
375         int64_t *restrict Ch = NULL ; size_t Ch_size = 0 ;
376         int64_t *restrict Ci = NULL ; size_t Ci_size = 0 ;
377 
378         Cp = GB_MALLOC (cvdim+1, int64_t, &Cp_size) ;
379         Ch = NULL ;
380         if (B_is_hyper)
381         {
382             Ch = GB_MALLOC (cvdim, int64_t, &Ch_size) ;
383         }
384         Ci = GB_MALLOC (cnz, int64_t, &Ci_size) ;
385         if (Cp == NULL || (B_is_hyper && Ch == NULL) || Ci == NULL)
386         {
387             // out of memory
388             GB_phbix_free (C) ;
389             GB_FREE (&Cp, Cp_size) ;
390             GB_FREE (&Ch, Ch_size) ;
391             GB_FREE (&Ci, Ci_size) ;
392             return (GrB_OUT_OF_MEMORY) ;
393         }
394 
395         //----------------------------------------------------------------------
396         // construct the hyperlist of C, if B is hypersparse
397         //----------------------------------------------------------------------
398 
399         nthreads = GB_nthreads (cvdim, chunk, nthreads_max) ;
400         if (B_is_hyper)
401         {
402             // C becomes hypersparse
403             ASSERT (cvdim == B_in->nvec) ;
404             GB_memcpy (Ch, B_in->h, cvdim * sizeof (int64_t), nthreads) ;
405         }
406 
407         //----------------------------------------------------------------------
408         // construct the vector pointers of C
409         //----------------------------------------------------------------------
410 
411         int64_t pC ;
412         #pragma omp parallel for num_threads(nthreads) schedule(static)
413         for (pC = 0 ; pC < cvdim+1 ; pC++)
414         {
415             Cp [pC] = pC * cvlen ;
416         }
417 
418         //----------------------------------------------------------------------
419         // construct the pattern of C from its bitmap
420         //----------------------------------------------------------------------
421 
422         // C(i,j) becomes a zombie if not present in the bitmap
423         nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
424 
425         int8_t *restrict Cb = C->b ;
426         if (A_is_hyper)
427         {
428             ASSERT (cvlen == A_in->nvec) ;
429             #pragma omp parallel for num_threads(nthreads) schedule(static)
430             for (pC = 0 ; pC < cnz ; pC++)
431             {
432                 int64_t i = Ah [pC % cvlen] ;
433                 Ci [pC] = (Cb [pC]) ? i : GB_FLIP (i) ;
434             }
435         }
436         else
437         {
438             ASSERT (cvlen == cvlen_final && cvlen == A->vdim) ;
439             #pragma omp parallel for num_threads(nthreads) schedule(static)
440             for (pC = 0 ; pC < cnz ; pC++)
441             {
442                 int64_t i = pC % cvlen ;
443                 Ci [pC] = (Cb [pC]) ? i : GB_FLIP (i) ;
444             }
445         }
446 
447         //----------------------------------------------------------------------
448         // transplant the new content and finalize C
449         //----------------------------------------------------------------------
450 
451         C->p = Cp ; Cp = NULL ; C->p_size = Cp_size ;
452         C->h = Ch ; Ch = NULL ; C->h_size = Ch_size ;
453         C->i = Ci ; Ci = NULL ; C->i_size = Ci_size ;
454         C->nzombies = cnz - C->nvals ;
455         C->vdim = cvdim_final ;
456         C->vlen = cvlen_final ;
457         C->nvals = -1 ;
458         C->nvec = cvdim ;
459         C->plen = cvdim ;
460         C->nvec_nonempty = (cvlen == 0) ? 0 : cvdim ;
461 
462         // free the bitmap
463         GB_FREE ((&C->b), C->b_size) ;
464 
465         // C is now sparse or hypersparse
466         ASSERT_MATRIX_OK (C, "dot2: unpacked C", GB0) ;
467         ASSERT (GB_ZOMBIES_OK (C)) ;
468     }
469 
470     //--------------------------------------------------------------------------
471     // return result
472     //--------------------------------------------------------------------------
473 
474     ASSERT (GB_ZOMBIES_OK (C)) ;
475     ASSERT (!GB_JUMBLED (C)) ;
476     ASSERT (!GB_PENDING (C)) ;
477 
478 // ttt = omp_get_wtime ( ) - ttt ;
479 // GB_Global_timing_add (19, ttt) ;
480 // ttt = omp_get_wtime ( ) ;
481 
482     return (GrB_SUCCESS) ;
483 }