1 //------------------------------------------------------------------------------ 2 // GB_bitmap_AxB_saxpy_A_bitmap_B_sparse: C<#M>+=A*B, C bitmap, M any format 3 //------------------------------------------------------------------------------ 4 5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved. 6 // SPDX-License-Identifier: Apache-2.0 7 8 //------------------------------------------------------------------------------ 9 10 // C is bitmap, A is bitmap or full, B is sparse or hypersparse. 11 // M has any format. 12 13 { 14 15 //-------------------------------------------------------------------------- 16 // allocate workspace for each task 17 //-------------------------------------------------------------------------- 18 19 // imeta = total number of rows of A and H in all panels 20 int64_t imeta = naslice * GB_PANEL_SIZE ; 21 22 // number of entries in one panel of G for A. 23 #if GB_HAS_BITMAP_MULTADD && !GB_IS_ANY_PAIR_SEMIRING 24 // Always load the A panel into G, since Ax [pA] has uninitialized values 25 // where Ab [pA] == 0. The GB_BITMAP_MULTADD update will access these 26 // values, and they must be initialized. 27 const bool load_apanel = true ; 28 #else 29 // only load the A panel into G if it consists of more than one panel 30 const bool load_apanel = (avlen > GB_PANEL_SIZE) ; 31 #endif 32 // Each panel of G is GB_PANEL_SIZE-by-avdim, held by column. 33 int64_t apanel_size = load_apanel ? (GB_PANEL_SIZE * avdim) : 0 ; 34 int64_t afpanel_size = GB_A_IS_BITMAP ? (apanel_size) : 0 ; 35 int64_t axpanel_size = A_is_pattern ? 0 : (apanel_size * GB_ASIZE) ; 36 37 // each panel of H is GB_PANEL_SIZE-by-bnvec, held by column; note that 38 // H has bnvec vectors, not bvdim. The C bitmap has bvdim vectors, 39 // and bnvec <= bvdim if B is hypersparse. 40 int64_t hpanel_size = GB_PANEL_SIZE * bnvec ; 41 42 //-------------------------------------------------------------------------- 43 // allocate the panels 44 //-------------------------------------------------------------------------- 45 46 // The G panels are not needed if A would fit into a single panel. 47 // In that case A is used in place and not copied into G. 48 49 int64_t wafsize = naslice * afpanel_size ; 50 int64_t waxsize = naslice * axpanel_size ; 51 int64_t wcsize = naslice * hpanel_size ; 52 int64_t wcxsize = GB_IS_ANY_PAIR_SEMIRING ? 0 : (wcsize * GB_CSIZE) ; 53 Wf = GB_MALLOC_WERK (wafsize + wcsize, int8_t, &Wf_size) ; 54 Wax = GB_MALLOC_WERK (waxsize, GB_void, &Wax_size) ; 55 Wcx = GB_MALLOC_WERK (wcxsize, GB_void, &Wcx_size) ; 56 if (Wf == NULL || Wax == NULL || Wcx == NULL) 57 { 58 // out of memory 59 GB_FREE_ALL ; 60 return (GrB_OUT_OF_MEMORY) ; 61 } 62 63 //-------------------------------------------------------------------------- 64 // initialize the panels 65 //-------------------------------------------------------------------------- 66 67 // for all semirings: set the bitmaps Gb and Hf to zero 68 GB_memset (Wf, 0, wafsize + wcsize, nthreads_max) ; 69 70 #if GB_HAS_BITMAP_MULTADD && !GB_IS_ANY_PAIR_SEMIRING 71 { 72 // Initialize the Hx workspace to identity, if this semiring has a 73 // concise bitmap multiply-add expression. For the any_pair semiring, 74 // the numerical values are not needed so Hx is not allocated. 75 #if GB_HAS_IDENTITY_BYTE 76 // the identity value can be assigned via memset 77 GB_memset (Wcx, GB_IDENTITY_BYTE, wcxsize, nthreads_max) ; 78 #else 79 // an explicit loop is required to set Hx to identity 80 // TODO: should each task initialize its own Hf and Hx, 81 // and use a static schedule here and for H=G*B? 82 GB_CTYPE *restrict Hx = (GB_CTYPE *) Wcx ; 83 int64_t pH ; 84 #pragma omp parallel for num_threads(nthreads) schedule(static) 85 for (pH = 0 ; pH < wcsize ; pH++) 86 { 87 Hx [pH] = GB_IDENTITY ; 88 } 89 #endif 90 } 91 #endif 92 93 //-------------------------------------------------------------------------- 94 // C<#M>=A*B, one metapanel at a time 95 //-------------------------------------------------------------------------- 96 97 int tid ; 98 99 for (int64_t iouter = 0 ; iouter < avlen ; iouter += imeta) 100 { 101 102 //---------------------------------------------------------------------- 103 // C<#M>(metapanel,:) += A (metapanel,:)*B 104 //---------------------------------------------------------------------- 105 106 // The rows in this metapanel are iouter:iouter+imeta-1. 107 108 //---------------------------------------------------------------------- 109 // load the metapanel: G = A (iouter:iouter+imeta-1,:) 110 //---------------------------------------------------------------------- 111 112 if ((GB_A_IS_BITMAP || !A_is_pattern) && load_apanel) 113 { 114 115 // Loading the panel into G keeps its storage order. A is not 116 // transposed when loaded into the G panels. However, the leading 117 // dimension is reduced. A is avlen-by-avdim with a leading 118 // dimension of avlen, which can be large. G is np-by-avdim, with 119 // np <= GB_PANEL_SIZE. The loading of A into G can be skipped 120 // if all of A can be used in-place. 121 122 #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) 123 for (tid = 0 ; tid < ntasks ; tid++) 124 { 125 126 //-------------------------------------------------------------- 127 // get the panel for this task 128 //-------------------------------------------------------------- 129 130 int a_tid = tid / nbslice ; 131 int b_tid = tid % nbslice ; 132 int64_t istart = iouter + a_tid * GB_PANEL_SIZE ; 133 int64_t iend = iouter + (a_tid+1) * GB_PANEL_SIZE ; 134 iend = GB_IMIN (iend, avlen) ; 135 int64_t np = iend - istart ; 136 if (np <= 0) continue ; 137 int64_t kstart, kend ; 138 GB_PARTITION (kstart, kend, avdim, b_tid, nbslice) ; 139 int8_t *restrict Gb = Wf + (a_tid * afpanel_size) ; 140 GB_ATYPE *restrict Gx = (GB_ATYPE *) 141 (Wax + (a_tid * axpanel_size)) ; 142 143 //-------------------------------------------------------------- 144 // load A for this panel 145 //-------------------------------------------------------------- 146 147 #if ( GB_A_IS_BITMAP ) 148 { 149 150 //---------------------------------------------------------- 151 // A is bitmap 152 //---------------------------------------------------------- 153 154 if (!A_is_pattern) 155 { 156 // load Ab and Ax into Gb and Gx 157 for (int64_t k = kstart ; k < kend ; k++) 158 { 159 for (int64_t ii = 0 ; ii < np ; ii++) 160 { 161 // Gb (ii,k) = Ab (istart+ii,k) 162 const int64_t pG = ii + k*np ; 163 const int64_t pA = istart + ii + k*avlen ; 164 const int8_t gb = Ab [pA] ; 165 Gb [pG] = gb ; 166 if (gb) 167 { 168 // Gx (ii,k) = Ax (istart+ii,k) 169 GB_LOADA (Gx, pG, Ax, pA) ; 170 } 171 #if GB_HAS_BITMAP_MULTADD 172 else 173 { 174 // Gx (ii,k) = 0 175 Gx [pG] = GB_ATYPE_CAST (0, 0) ; 176 } 177 #endif 178 } 179 } 180 } 181 else 182 { 183 // just load the Ab bitmap into Gb, not the values 184 for (int64_t k = kstart ; k < kend ; k++) 185 { 186 for (int64_t ii = 0 ; ii < np ; ii++) 187 { 188 // Gb (ii,k) = Ab (istart+ii,k) 189 const int64_t pG = ii + k*np ; 190 const int64_t pA = istart + ii + k*avlen ; 191 Gb [pG] = Ab [pA] ; 192 } 193 } 194 } 195 196 } 197 #else 198 { 199 200 //---------------------------------------------------------- 201 // A is full 202 //---------------------------------------------------------- 203 204 if (!A_is_pattern) 205 { 206 for (int64_t k = kstart ; k < kend ; k++) 207 { 208 for (int64_t ii = 0 ; ii < np ; ii++) 209 { 210 // Gx (ii,k) = Ax (istart+ii,k) 211 const int64_t pG = ii + k*np ; 212 const int64_t pA = istart + ii + k*avlen ; 213 GB_LOADA (Gx, pG, Ax, pA) ; 214 } 215 } 216 } 217 } 218 #endif 219 } 220 } 221 222 //---------------------------------------------------------------------- 223 // H = G*B 224 //---------------------------------------------------------------------- 225 226 #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) 227 for (tid = 0 ; tid < ntasks ; tid++) 228 { 229 230 //------------------------------------------------------------------ 231 // get the panel of H and G for this task 232 //------------------------------------------------------------------ 233 234 int a_tid = tid / nbslice ; 235 int b_tid = tid % nbslice ; 236 int64_t istart = iouter + a_tid * GB_PANEL_SIZE ; 237 int64_t iend = iouter + (a_tid+1) * GB_PANEL_SIZE ; 238 iend = GB_IMIN (iend, avlen) ; 239 int64_t np = iend - istart ; 240 if (np <= 0) continue ; 241 242 const int8_t *restrict Gb ; 243 const GB_ATYPE *restrict Gx ; 244 245 if (load_apanel) 246 { 247 // A has been loaded into the G panel 248 Gb = Wf + (a_tid * afpanel_size) ; 249 Gx = (GB_ATYPE *) (Wax + (a_tid * axpanel_size)) ; 250 } 251 else 252 { 253 // use A in-place 254 Gb = Ab ; 255 Gx = (GB_ATYPE *) Ax ; 256 } 257 258 int8_t *restrict Hf = Wf + (a_tid * hpanel_size) + wafsize ; 259 GB_CTYPE *restrict Hx = (GB_CTYPE *) 260 (Wcx + (a_tid * hpanel_size) * GB_CSIZE) ; 261 GB_XINIT ; // for plus, bor, band, and bxor monoids only 262 263 //------------------------------------------------------------------ 264 // H_panel (:,kfirst:klast-1) = G_panel * B (:, kfirst:klast-1) 265 //------------------------------------------------------------------ 266 267 int64_t kfirst = B_slice [b_tid] ; 268 int64_t klast = B_slice [b_tid + 1] ; 269 for (int64_t kk = kfirst ; kk < klast ; kk++) 270 { 271 272 //-------------------------------------------------------------- 273 // H_panel (:,kk) = G_panel * B (:,kk) 274 //-------------------------------------------------------------- 275 276 // H and B are indexed in the compact space kk = 0:bnvec-1, 277 // not by the names j = 0:bvdim-1. When B is sparse, these are 278 // the same. If B is hypersparse, j is Bh [kk]. However, j is 279 // needed for the SECONDJ and SECONDJ1 multipliers. 280 281 int64_t j = GBH (Bh, kk) ; 282 int64_t pB = Bp [kk] ; 283 int64_t pB_end = Bp [kk+1] ; 284 int64_t pH = kk * np ; 285 #if GB_IS_SECONDJ_MULTIPLIER 286 // t = j or j+1 for SECONDJ and SECONDJ1 multipliers 287 GB_CIJ_DECLARE (t) ; 288 GB_MULT (t, ignore, ignore, ignore, ignore, j) ; 289 #endif 290 291 #undef GB_MULT_G_iik_B_kj 292 #if GB_IS_PAIR_MULTIPLIER 293 // t = G(ii,k) * B(k,j) is always equal to 1 294 #define GB_MULT_G_iik_B_kj(ii) 295 #elif ( GB_IS_FIRSTJ_MULTIPLIER || GB_IS_SECONDJ_MULTIPLIER ) 296 // t is already defined for these multipliers 297 #define GB_MULT_G_iik_B_kj(ii) 298 #else 299 // t = G(ii,k) * B(k,j) 300 #define GB_MULT_G_iik_B_kj(ii) \ 301 GB_GETA (giik, Gx, pG + ii) ; \ 302 GB_CIJ_DECLARE (t) ; \ 303 GB_MULT (t, giik, bkj, istart + ii, k, j) 304 #endif 305 306 for ( ; pB < pB_end ; pB++) 307 { 308 int64_t k = Bi [pB] ; // get B(k,j) 309 int64_t pG = k * np ; // get G(:,k) 310 GB_GET_B_kj ; // bkj = B(k,j) 311 GB_XLOAD (bkj) ; // X [1] = bkj (plus_times only) 312 // H_panel (:,j) = G_panel (:,k) * B(k,j) 313 for (int64_t ii = 0 ; ii < np ; ii++) 314 { 315 #if GB_HAS_BITMAP_MULTADD 316 { 317 // if (Gb (ii,k)) 318 // if (Hf (ii,j) == 0) 319 // Hx (ii,j) = G (ii,k) * B(k,j) ; 320 // Hf (ii,j) = 1 321 // else 322 // Hx (ii,j) += G (ii,k) * B(k,j) ; 323 #if GB_IS_FIRSTI_MULTIPLIER 324 int64_t i = istart + ii ; 325 #endif 326 #if GB_A_IS_BITMAP 327 GB_BITMAP_MULTADD ( 328 Hf [pH+ii], Hx [pH+ii], 329 Gb [pG+ii], Gx [pG+ii], bkj) ; 330 #else 331 GB_BITMAP_MULTADD ( 332 Hf [pH+ii], Hx [pH+ii], 333 1, Gx [pG+ii], bkj) ; 334 #endif 335 } 336 #else 337 { 338 #if GB_A_IS_BITMAP 339 if (Gb [pG+ii]) 340 #endif 341 { 342 // t = G(ii,k) * B(k,j) 343 GB_MULT_G_iik_B_kj (ii) ; 344 if (Hf [pH+ii] == 0) 345 { 346 // H (ii,j) is a new entry 347 GB_HX_WRITE (pH+ii, t) ; // Hx (ii,j)=t 348 Hf [pH+ii] = 1 ; 349 } 350 else 351 { 352 // H (ii,j) is already present 353 GB_HX_UPDATE (pH+ii, t) ; // Hx (ii,j)+=t 354 } 355 } 356 } 357 #endif 358 } 359 } 360 #undef GB_MULT_G_iik_B_kj 361 } 362 } 363 364 //---------------------------------------------------------------------- 365 // C (metapanel,:) += H 366 //---------------------------------------------------------------------- 367 368 #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) \ 369 reduction(+:cnvals) 370 for (tid = 0 ; tid < ntasks ; tid++) 371 { 372 373 //------------------------------------------------------------------ 374 // get the panel of H and G for this task 375 //------------------------------------------------------------------ 376 377 int a_tid = tid / nbslice ; 378 int b_tid = tid % nbslice ; 379 int64_t istart = iouter + a_tid * GB_PANEL_SIZE ; 380 int64_t iend = iouter + (a_tid+1) * GB_PANEL_SIZE ; 381 iend = GB_IMIN (iend, avlen) ; 382 int64_t np = iend - istart ; 383 if (np <= 0) continue ; 384 int64_t task_cnvals = 0 ; 385 386 int64_t kstart, kend ; 387 GB_PARTITION (kstart, kend, bnvec, b_tid, nbslice) ; 388 389 int8_t *restrict Hf = Wf + (a_tid * hpanel_size) + wafsize ; 390 GB_CTYPE *restrict Hx = (GB_CTYPE *) 391 (Wcx + (a_tid * hpanel_size) * GB_CSIZE) ; 392 393 //------------------------------------------------------------------ 394 // C<#M>(metapanel,j1:j2-1) += H (:,kstart:kend-1) 395 //------------------------------------------------------------------ 396 397 // If B is hypersparse, the kk-th vector of H is the jth vector 398 // of C, where j = Bh [kk]. 399 400 for (int64_t kk = kstart ; kk < kend ; kk++) 401 { 402 int64_t j = GBH (Bh, kk) ; // j is the range j1:j2-1 403 int64_t pC_start = istart + j * avlen ; // get C(istart,j) 404 int64_t pH_start = kk * np ; // get H(:,kk) 405 406 for (int64_t ii = 0 ; ii < np ; ii++) 407 { 408 int64_t pC = pC_start + ii ; // get C(i,j) 409 int64_t pH = pH_start + ii ; // get H(ii,kk) 410 if (!Hf [pH]) continue ; 411 Hf [pH] = 0 ; // clear the panel 412 int8_t cb = Cb [pC] ; 413 414 //---------------------------------------------------------- 415 // check M(i,j) 416 //---------------------------------------------------------- 417 418 #undef GB_IF_MIJ 419 #if GB_MASK_IS_SPARSE_OR_HYPER 420 421 // M is sparse or hypersparse 422 bool mij = ((cb & 2) != 0) ^ Mask_comp ; 423 cb = (cb & 1) ; 424 #define GB_IF_MIJ if (mij) 425 426 #elif GB_MASK_IS_BITMAP_OR_FULL 427 428 // M is bitmap or full 429 GB_GET_M_ij (pC) ; 430 mij = mij ^ Mask_comp ; 431 #define GB_IF_MIJ if (mij) 432 433 #else 434 435 #define GB_IF_MIJ 436 437 #endif 438 439 //---------------------------------------------------------- 440 // C(i,j) += H(ii,kk) 441 //---------------------------------------------------------- 442 443 GB_IF_MIJ 444 { 445 if (cb == 0) 446 { 447 // C(i,j) = H(ii,kk) 448 #if GB_IS_ANY_PAIR_SEMIRING 449 Cx [pC] = GB_CTYPE_CAST (1,0) ; // C(i,j) = 1 450 #else 451 GB_CIJ_GATHER (pC, pH) ; 452 #endif 453 Cb [pC] = keep ; 454 task_cnvals++ ; 455 } 456 else 457 { 458 // Currently, the matrix C is a newly allocated 459 // matrix, not the C_in input matrix to GrB_mxm. 460 // As a result, this condition is not used. It 461 // will be in the future when this method is 462 // modified to modify C in-place. 463 ASSERT (GB_DEAD_CODE) ; 464 // C(i,j) += H(ii,kk) 465 GB_CIJ_GATHER_UPDATE (pC, pH) ; 466 } 467 } 468 469 //---------------------------------------------------------- 470 // clear the panel 471 //---------------------------------------------------------- 472 473 #if GB_HAS_BITMAP_MULTADD && !GB_IS_ANY_PAIR_SEMIRING 474 { 475 // H(ii,kk) = identity 476 Hx [pH] = GB_IDENTITY ; 477 } 478 #endif 479 } 480 } 481 cnvals += task_cnvals ; 482 } 483 } 484 } 485 486 #undef GB_IF_MIJ 487 488