1 //------------------------------------------------------------------------------
2 // GB_bitmap_AxB_saxpy_A_bitmap_B_sparse: C<#M>+=A*B, C bitmap, M any format
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // C is bitmap, A is bitmap or full, B is sparse or hypersparse.
11 // M has any format.
12 
13 {
14 
15     //--------------------------------------------------------------------------
16     // allocate workspace for each task
17     //--------------------------------------------------------------------------
18 
19     // imeta = total number of rows of A and H in all panels
20     int64_t imeta = naslice * GB_PANEL_SIZE ;
21 
22     // number of entries in one panel of G for A.
23     #if GB_HAS_BITMAP_MULTADD && !GB_IS_ANY_PAIR_SEMIRING
24     // Always load the A panel into G, since Ax [pA] has uninitialized values
25     // where Ab [pA] == 0.  The GB_BITMAP_MULTADD update will access these
26     // values, and they must be initialized.
27     const bool load_apanel = true ;
28     #else
29     // only load the A panel into G if it consists of more than one panel
30     const bool load_apanel = (avlen > GB_PANEL_SIZE) ;
31     #endif
32     // Each panel of G is GB_PANEL_SIZE-by-avdim, held by column.
33     int64_t apanel_size = load_apanel ? (GB_PANEL_SIZE * avdim) : 0 ;
34     int64_t afpanel_size = GB_A_IS_BITMAP  ? (apanel_size) : 0 ;
35     int64_t axpanel_size = A_is_pattern ? 0 : (apanel_size * GB_ASIZE) ;
36 
37     // each panel of H is GB_PANEL_SIZE-by-bnvec, held by column; note that
38     // H has bnvec vectors, not bvdim.  The C bitmap has bvdim vectors,
39     // and bnvec <= bvdim if B is hypersparse.
40     int64_t hpanel_size = GB_PANEL_SIZE * bnvec ;
41 
42     //--------------------------------------------------------------------------
43     // allocate the panels
44     //--------------------------------------------------------------------------
45 
46     // The G panels are not needed if A would fit into a single panel.
47     // In that case A is used in place and not copied into G.
48 
49     int64_t wafsize = naslice * afpanel_size ;
50     int64_t waxsize = naslice * axpanel_size ;
51     int64_t wcsize  = naslice * hpanel_size ;
52     int64_t wcxsize = GB_IS_ANY_PAIR_SEMIRING ? 0 : (wcsize * GB_CSIZE) ;
53     Wf  = GB_MALLOC_WERK (wafsize + wcsize, int8_t, &Wf_size) ;
54     Wax = GB_MALLOC_WERK (waxsize, GB_void, &Wax_size) ;
55     Wcx = GB_MALLOC_WERK (wcxsize, GB_void, &Wcx_size) ;
56     if (Wf == NULL || Wax == NULL || Wcx == NULL)
57     {
58         // out of memory
59         GB_FREE_ALL ;
60         return (GrB_OUT_OF_MEMORY) ;
61     }
62 
63     //--------------------------------------------------------------------------
64     // initialize the panels
65     //--------------------------------------------------------------------------
66 
67     // for all semirings: set the bitmaps Gb and Hf to zero
68     GB_memset (Wf,  0, wafsize + wcsize, nthreads_max) ;
69 
70     #if GB_HAS_BITMAP_MULTADD && !GB_IS_ANY_PAIR_SEMIRING
71     {
72         // Initialize the Hx workspace to identity, if this semiring has a
73         // concise bitmap multiply-add expression.  For the any_pair semiring,
74         // the numerical values are not needed so Hx is not allocated.
75         #if GB_HAS_IDENTITY_BYTE
76             // the identity value can be assigned via memset
77             GB_memset (Wcx, GB_IDENTITY_BYTE, wcxsize, nthreads_max) ;
78         #else
79             // an explicit loop is required to set Hx to identity
80             // TODO: should each task initialize its own Hf and Hx,
81             // and use a static schedule here and for H=G*B?
82             GB_CTYPE *restrict Hx = (GB_CTYPE *) Wcx ;
83             int64_t pH ;
84             #pragma omp parallel for num_threads(nthreads) schedule(static)
85             for (pH = 0 ; pH < wcsize ; pH++)
86             {
87                 Hx [pH] = GB_IDENTITY ;
88             }
89         #endif
90     }
91     #endif
92 
93     //--------------------------------------------------------------------------
94     // C<#M>=A*B, one metapanel at a time
95     //--------------------------------------------------------------------------
96 
97     int tid ;
98 
99     for (int64_t iouter = 0 ; iouter < avlen ; iouter += imeta)
100     {
101 
102         //----------------------------------------------------------------------
103         // C<#M>(metapanel,:) += A (metapanel,:)*B
104         //----------------------------------------------------------------------
105 
106         // The rows in this metapanel are iouter:iouter+imeta-1.
107 
108         //----------------------------------------------------------------------
109         // load the metapanel: G = A (iouter:iouter+imeta-1,:)
110         //----------------------------------------------------------------------
111 
112         if ((GB_A_IS_BITMAP || !A_is_pattern) && load_apanel)
113         {
114 
115             // Loading the panel into G keeps its storage order.  A is not
116             // transposed when loaded into the G panels.  However, the leading
117             // dimension is reduced.  A is avlen-by-avdim with a leading
118             // dimension of avlen, which can be large.  G is np-by-avdim, with
119             // np <= GB_PANEL_SIZE.  The loading of A into G can be skipped
120             // if all of A can be used in-place.
121 
122             #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
123             for (tid = 0 ; tid < ntasks ; tid++)
124             {
125 
126                 //--------------------------------------------------------------
127                 // get the panel for this task
128                 //--------------------------------------------------------------
129 
130                 int a_tid = tid / nbslice ;
131                 int b_tid = tid % nbslice ;
132                 int64_t istart = iouter + a_tid     * GB_PANEL_SIZE ;
133                 int64_t iend   = iouter + (a_tid+1) * GB_PANEL_SIZE ;
134                 iend = GB_IMIN (iend, avlen) ;
135                 int64_t np = iend - istart ;
136                 if (np <= 0) continue ;
137                 int64_t kstart, kend ;
138                 GB_PARTITION (kstart, kend, avdim, b_tid, nbslice) ;
139                 int8_t   *restrict Gb = Wf  + (a_tid * afpanel_size) ;
140                 GB_ATYPE *restrict Gx = (GB_ATYPE *)
141                     (Wax + (a_tid * axpanel_size)) ;
142 
143                 //--------------------------------------------------------------
144                 // load A for this panel
145                 //--------------------------------------------------------------
146 
147                 #if ( GB_A_IS_BITMAP )
148                 {
149 
150                     //----------------------------------------------------------
151                     // A is bitmap
152                     //----------------------------------------------------------
153 
154                     if (!A_is_pattern)
155                     {
156                         // load Ab and Ax into Gb and Gx
157                         for (int64_t k = kstart ; k < kend ; k++)
158                         {
159                             for (int64_t ii = 0 ; ii < np ; ii++)
160                             {
161                                 // Gb (ii,k) = Ab (istart+ii,k)
162                                 const int64_t pG = ii + k*np ;
163                                 const int64_t pA = istart + ii + k*avlen ;
164                                 const int8_t gb = Ab [pA] ;
165                                 Gb [pG] = gb ;
166                                 if (gb)
167                                 {
168                                     // Gx (ii,k) = Ax (istart+ii,k)
169                                     GB_LOADA (Gx, pG, Ax, pA) ;
170                                 }
171                                 #if GB_HAS_BITMAP_MULTADD
172                                 else
173                                 {
174                                     // Gx (ii,k) = 0
175                                     Gx [pG] = GB_ATYPE_CAST (0, 0) ;
176                                 }
177                                 #endif
178                             }
179                         }
180                     }
181                     else
182                     {
183                         // just load the Ab bitmap into Gb, not the values
184                         for (int64_t k = kstart ; k < kend ; k++)
185                         {
186                             for (int64_t ii = 0 ; ii < np ; ii++)
187                             {
188                                 // Gb (ii,k) = Ab (istart+ii,k)
189                                 const int64_t pG = ii + k*np ;
190                                 const int64_t pA = istart + ii + k*avlen ;
191                                 Gb [pG] = Ab [pA] ;
192                             }
193                         }
194                     }
195 
196                 }
197                 #else
198                 {
199 
200                     //----------------------------------------------------------
201                     // A is full
202                     //----------------------------------------------------------
203 
204                     if (!A_is_pattern)
205                     {
206                         for (int64_t k = kstart ; k < kend ; k++)
207                         {
208                             for (int64_t ii = 0 ; ii < np ; ii++)
209                             {
210                                 // Gx (ii,k) = Ax (istart+ii,k)
211                                 const int64_t pG = ii + k*np ;
212                                 const int64_t pA = istart + ii + k*avlen ;
213                                 GB_LOADA (Gx, pG, Ax, pA) ;
214                             }
215                         }
216                     }
217                 }
218                 #endif
219             }
220         }
221 
222         //----------------------------------------------------------------------
223         // H = G*B
224         //----------------------------------------------------------------------
225 
226         #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
227         for (tid = 0 ; tid < ntasks ; tid++)
228         {
229 
230             //------------------------------------------------------------------
231             // get the panel of H and G for this task
232             //------------------------------------------------------------------
233 
234             int a_tid = tid / nbslice ;
235             int b_tid = tid % nbslice ;
236             int64_t istart = iouter + a_tid     * GB_PANEL_SIZE ;
237             int64_t iend   = iouter + (a_tid+1) * GB_PANEL_SIZE ;
238             iend = GB_IMIN (iend, avlen) ;
239             int64_t np = iend - istart ;
240             if (np <= 0) continue ;
241 
242             const int8_t   *restrict Gb ;
243             const GB_ATYPE *restrict Gx ;
244 
245             if (load_apanel)
246             {
247                 // A has been loaded into the G panel
248                 Gb = Wf  + (a_tid * afpanel_size) ;
249                 Gx = (GB_ATYPE *) (Wax + (a_tid * axpanel_size)) ;
250             }
251             else
252             {
253                 // use A in-place
254                 Gb = Ab ;
255                 Gx = (GB_ATYPE *) Ax ;
256             }
257 
258             int8_t   *restrict Hf = Wf  + (a_tid * hpanel_size) + wafsize ;
259             GB_CTYPE *restrict Hx = (GB_CTYPE *)
260                 (Wcx + (a_tid * hpanel_size) * GB_CSIZE) ;
261             GB_XINIT ;  // for plus, bor, band, and bxor monoids only
262 
263             //------------------------------------------------------------------
264             // H_panel (:,kfirst:klast-1) = G_panel * B (:, kfirst:klast-1)
265             //------------------------------------------------------------------
266 
267             int64_t kfirst = B_slice [b_tid] ;
268             int64_t klast = B_slice [b_tid + 1] ;
269             for (int64_t kk = kfirst ; kk < klast ; kk++)
270             {
271 
272                 //--------------------------------------------------------------
273                 // H_panel (:,kk) = G_panel * B (:,kk)
274                 //--------------------------------------------------------------
275 
276                 // H and B are indexed in the compact space kk = 0:bnvec-1,
277                 // not by the names j = 0:bvdim-1.  When B is sparse, these are
278                 // the same.  If B is hypersparse, j is Bh [kk].  However, j is
279                 // needed for the SECONDJ and SECONDJ1 multipliers.
280 
281                 int64_t j = GBH (Bh, kk) ;
282                 int64_t pB = Bp [kk] ;
283                 int64_t pB_end = Bp [kk+1] ;
284                 int64_t pH = kk * np ;
285                 #if GB_IS_SECONDJ_MULTIPLIER
286                     // t = j or j+1 for SECONDJ and SECONDJ1 multipliers
287                     GB_CIJ_DECLARE (t) ;
288                     GB_MULT (t, ignore, ignore, ignore, ignore, j) ;
289                 #endif
290 
291                 #undef GB_MULT_G_iik_B_kj
292                 #if GB_IS_PAIR_MULTIPLIER
293                     // t = G(ii,k) * B(k,j) is always equal to 1
294                     #define GB_MULT_G_iik_B_kj(ii)
295                 #elif ( GB_IS_FIRSTJ_MULTIPLIER || GB_IS_SECONDJ_MULTIPLIER )
296                     // t is already defined for these multipliers
297                     #define GB_MULT_G_iik_B_kj(ii)
298                 #else
299                     // t = G(ii,k) * B(k,j)
300                     #define GB_MULT_G_iik_B_kj(ii)                          \
301                         GB_GETA (giik, Gx, pG + ii) ;                       \
302                         GB_CIJ_DECLARE (t) ;                                \
303                         GB_MULT (t, giik, bkj, istart + ii, k, j)
304                 #endif
305 
306                 for ( ; pB < pB_end ; pB++)
307                 {
308                     int64_t k = Bi [pB] ;       // get B(k,j)
309                     int64_t pG = k * np ;       // get G(:,k)
310                     GB_GET_B_kj ;               // bkj = B(k,j)
311                     GB_XLOAD (bkj) ;            // X [1] = bkj (plus_times only)
312                     // H_panel (:,j) = G_panel (:,k) * B(k,j)
313                     for (int64_t ii = 0 ; ii < np ; ii++)
314                     {
315                         #if GB_HAS_BITMAP_MULTADD
316                         {
317                             // if (Gb (ii,k))
318                             //      if (Hf (ii,j) == 0)
319                             //          Hx (ii,j) = G (ii,k) * B(k,j) ;
320                             //          Hf (ii,j) = 1
321                             //      else
322                             //          Hx (ii,j) += G (ii,k) * B(k,j) ;
323                             #if GB_IS_FIRSTI_MULTIPLIER
324                             int64_t i = istart + ii ;
325                             #endif
326                             #if GB_A_IS_BITMAP
327                                 GB_BITMAP_MULTADD (
328                                     Hf [pH+ii], Hx [pH+ii],
329                                     Gb [pG+ii], Gx [pG+ii], bkj) ;
330                             #else
331                                 GB_BITMAP_MULTADD (
332                                     Hf [pH+ii], Hx [pH+ii],
333                                     1,          Gx [pG+ii], bkj) ;
334                             #endif
335                         }
336                         #else
337                         {
338                             #if GB_A_IS_BITMAP
339                             if (Gb [pG+ii])
340                             #endif
341                             {
342                                 // t = G(ii,k) * B(k,j)
343                                 GB_MULT_G_iik_B_kj (ii) ;
344                                 if (Hf [pH+ii] == 0)
345                                 {
346                                     // H (ii,j) is a new entry
347                                     GB_HX_WRITE (pH+ii, t) ;    // Hx (ii,j)=t
348                                     Hf [pH+ii] = 1 ;
349                                 }
350                                 else
351                                 {
352                                     // H (ii,j) is already present
353                                     GB_HX_UPDATE (pH+ii, t) ;   // Hx (ii,j)+=t
354                                 }
355                             }
356                         }
357                         #endif
358                     }
359                 }
360                 #undef GB_MULT_G_iik_B_kj
361             }
362         }
363 
364         //----------------------------------------------------------------------
365         // C (metapanel,:) += H
366         //----------------------------------------------------------------------
367 
368         #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) \
369             reduction(+:cnvals)
370         for (tid = 0 ; tid < ntasks ; tid++)
371         {
372 
373             //------------------------------------------------------------------
374             // get the panel of H and G for this task
375             //------------------------------------------------------------------
376 
377             int a_tid = tid / nbslice ;
378             int b_tid = tid % nbslice ;
379             int64_t istart = iouter + a_tid     * GB_PANEL_SIZE ;
380             int64_t iend   = iouter + (a_tid+1) * GB_PANEL_SIZE ;
381             iend = GB_IMIN (iend, avlen) ;
382             int64_t np = iend - istart ;
383             if (np <= 0) continue ;
384             int64_t task_cnvals = 0 ;
385 
386             int64_t kstart, kend ;
387             GB_PARTITION (kstart, kend, bnvec, b_tid, nbslice) ;
388 
389             int8_t   *restrict Hf = Wf  + (a_tid * hpanel_size) + wafsize ;
390             GB_CTYPE *restrict Hx = (GB_CTYPE *)
391                 (Wcx + (a_tid * hpanel_size) * GB_CSIZE) ;
392 
393             //------------------------------------------------------------------
394             // C<#M>(metapanel,j1:j2-1) += H (:,kstart:kend-1)
395             //------------------------------------------------------------------
396 
397             // If B is hypersparse, the kk-th vector of H is the jth vector
398             // of C, where j = Bh [kk].
399 
400             for (int64_t kk = kstart ; kk < kend ; kk++)
401             {
402                 int64_t j = GBH (Bh, kk) ;      // j is the range j1:j2-1
403                 int64_t pC_start = istart + j * avlen ; // get C(istart,j)
404                 int64_t pH_start = kk * np ;            // get H(:,kk)
405 
406                 for (int64_t ii = 0 ; ii < np ; ii++)
407                 {
408                     int64_t pC = pC_start + ii ;    // get C(i,j)
409                     int64_t pH = pH_start + ii ;    // get H(ii,kk)
410                     if (!Hf [pH]) continue ;
411                     Hf [pH] = 0 ;                   // clear the panel
412                     int8_t cb = Cb [pC] ;
413 
414                     //----------------------------------------------------------
415                     // check M(i,j)
416                     //----------------------------------------------------------
417 
418                     #undef GB_IF_MIJ
419                     #if GB_MASK_IS_SPARSE_OR_HYPER
420 
421                         // M is sparse or hypersparse
422                         bool mij = ((cb & 2) != 0) ^ Mask_comp ;
423                         cb = (cb & 1) ;
424                         #define GB_IF_MIJ if (mij)
425 
426                     #elif GB_MASK_IS_BITMAP_OR_FULL
427 
428                         // M is bitmap or full
429                         GB_GET_M_ij (pC) ;
430                         mij = mij ^ Mask_comp ;
431                         #define GB_IF_MIJ if (mij)
432 
433                     #else
434 
435                         #define GB_IF_MIJ
436 
437                     #endif
438 
439                     //----------------------------------------------------------
440                     // C(i,j) += H(ii,kk)
441                     //----------------------------------------------------------
442 
443                     GB_IF_MIJ
444                     {
445                         if (cb == 0)
446                         {
447                             // C(i,j) = H(ii,kk)
448                             #if GB_IS_ANY_PAIR_SEMIRING
449                             Cx [pC] = GB_CTYPE_CAST (1,0) ; // C(i,j) = 1
450                             #else
451                             GB_CIJ_GATHER (pC, pH) ;
452                             #endif
453                             Cb [pC] = keep ;
454                             task_cnvals++ ;
455                         }
456                         else
457                         {
458                             // Currently, the matrix C is a newly allocated
459                             // matrix, not the C_in input matrix to GrB_mxm.
460                             // As a result, this condition is not used.  It
461                             // will be in the future when this method is
462                             // modified to modify C in-place.
463                             ASSERT (GB_DEAD_CODE) ;
464                             // C(i,j) += H(ii,kk)
465                             GB_CIJ_GATHER_UPDATE (pC, pH) ;
466                         }
467                     }
468 
469                     //----------------------------------------------------------
470                     // clear the panel
471                     //----------------------------------------------------------
472 
473                     #if GB_HAS_BITMAP_MULTADD && !GB_IS_ANY_PAIR_SEMIRING
474                     {
475                         // H(ii,kk) = identity
476                         Hx [pH] = GB_IDENTITY ;
477                     }
478                     #endif
479                 }
480             }
481             cnvals += task_cnvals ;
482         }
483     }
484 }
485 
486 #undef GB_IF_MIJ
487 
488