//------------------------------------------------------------------------------
// GB_bitmap_AxB_saxpy_A_sparse_B_bitmap: C<#M>+=A*B, C bitmap, M any format
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// Template body: computes C<#M> += A*B where C is bitmap, A is sparse or
// hypersparse, and B is bitmap or full.  Included with many macros defined by
// the enclosing file (GB_MULT, GB_GETA/GB_GETB, GB_ATOMIC_*, GB_GET_M_ij,
// keep, cnvals, ntasks, ...).  Three strategies are selected by the caller:
//
//  (1) use_coarse_tasks: each task owns a contiguous range of whole vectors
//      of B and C, and works on panels of up to GB_PANEL_SIZE columns at a
//      time, using private Gb/Gx (transposed panel of B) and Hf/Hx
//      (accumulator panel for C) workspaces.
//
//  (2) use_atomics: fine tasks; several tasks cooperate on a single vector
//      C(:,j), each handling a slice A(:,k1:k2), updating C(:,j) in place
//      with atomics and a per-entry lock encoded in Cb.
//
//  (3) otherwise: fine tasks with no atomics; phase 1 computes each task's
//      partial product into private workspace W(:,tid), and phase 2 reduces
//      the workspaces into C.

{

    if (use_coarse_tasks)
    {

        //----------------------------------------------------------------------
        // C<#M> += A*B using coarse tasks
        //----------------------------------------------------------------------

        // number of columns in the workspace for each task
        #define GB_PANEL_SIZE 4

        //----------------------------------------------------------------------
        // allocate workspace for each task
        //----------------------------------------------------------------------

        // G_slice [tid] and H_slice [tid] are the cumulative offsets (in
        // panel columns) of task tid's Gb/Gx and Hf/Hx workspaces.
        GB_WERK_PUSH (GH_slice, 2*ntasks, int64_t) ;
        if (GH_slice == NULL)
        {
            // out of memory
            GB_FREE_ALL ;
            return (GrB_OUT_OF_MEMORY) ;
        }

        int64_t *restrict G_slice = GH_slice ;
        int64_t *restrict H_slice = GH_slice + ntasks ;

        int64_t gwork = 0 ;
        int64_t hwork = 0 ;
        int tid ;
        for (tid = 0 ; tid < ntasks ; tid++)
        {
            int64_t jstart, jend ;
            GB_PARTITION (jstart, jend, bvdim, tid, ntasks) ;
            int64_t jtask = jend - jstart ;
            int64_t jpanel = GB_IMIN (jtask, GB_PANEL_SIZE) ;
            G_slice [tid] = gwork ;
            H_slice [tid] = hwork ;
            if (jpanel > 1)
            {
                // no need to allocate workspace for Gb and Gx if jpanel == 1
                // (the task aliases B's bitmap and values directly instead)
                gwork += jpanel ;
            }
            hwork += jpanel ;
        }

        // sizes (in bytes per panel column) of the value workspaces; zero
        // when the values are not needed (pattern-only B, ANY_PAIR semiring)
        int64_t bvlenx = (B_is_pattern ? 0 : bvlen) * GB_BSIZE ;
        int64_t cvlenx = (GB_IS_ANY_PAIR_SEMIRING ? 0 : cvlen) * GB_CSIZE ;
        int64_t bvlenb = (GB_B_IS_BITMAP ? bvlen : 0) ;
        size_t gfspace = gwork * bvlenb ;
        size_t wfspace = gfspace + hwork * cvlen ;
        size_t wbxspace = gwork * bvlenx ;
        size_t wcxspace = hwork * cvlenx ;
        // Wf holds all Gb bitmaps (first gfspace bytes) then all Hf bitmaps
        Wf  = GB_MALLOC_WERK (wfspace, int8_t, &Wf_size) ;
        Wbx = GB_MALLOC_WERK (wbxspace, GB_void, &Wbx_size) ;
        Wcx = GB_MALLOC_WERK (wcxspace, GB_void, &Wcx_size) ;
        if (Wf == NULL || Wcx == NULL || Wbx == NULL)
        {
            // out of memory
            GB_FREE_ALL ;
            return (GrB_OUT_OF_MEMORY) ;
        }

        //----------------------------------------------------------------------
        // C<#M> += A*B
        //----------------------------------------------------------------------

        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) \
            reduction(+:cnvals)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the vectors of B and C for this coarse task
            //------------------------------------------------------------------

            int64_t jstart, jend ;
            GB_PARTITION (jstart, jend, bvdim, tid, ntasks) ;
            int64_t jtask = jend - jstart ;
            int64_t jpanel = GB_IMIN (jtask, GB_PANEL_SIZE) ;
            int64_t task_cnvals = 0 ;

            //------------------------------------------------------------------
            // get the workspace for this task
            //------------------------------------------------------------------

            // Gb and Gx workspace to load the panel of B
            int8_t   *restrict Gb = Wf + G_slice [tid] * bvlenb ;
            GB_BTYPE *restrict Gx = (GB_BTYPE *)
                (Wbx + G_slice [tid] * bvlenx) ;

            // Hf and Hx workspace to compute the panel of C
            int8_t   *restrict Hf = Wf + (H_slice [tid] * cvlen) + gfspace ;
            GB_CTYPE *restrict Hx = (GB_CTYPE *)
                (Wcx + H_slice [tid] * cvlenx) ;
            // complex PLUS monoids update the real and imaginary parts as two
            // interleaved real arrays (stride 2, set up by the macros)
            #if GB_IS_PLUS_FC32_MONOID
            float  *restrict Hx_real = (float *) Hx ;
            float  *restrict Hx_imag = Hx_real + 1 ;
            #elif GB_IS_PLUS_FC64_MONOID
            double *restrict Hx_real = (double *) Hx ;
            double *restrict Hx_imag = Hx_real + 1 ;
            #endif

            //------------------------------------------------------------------
            // clear the panel
            //------------------------------------------------------------------

            memset (Hf, 0, jpanel * cvlen) ;

            //------------------------------------------------------------------
            // C<#M>(:,jstart:jend-1) += A * B(:,jstart:jend-1) by panel
            //------------------------------------------------------------------

            for (int64_t j1 = jstart ; j1 < jend ; j1 += jpanel)
            {

                //--------------------------------------------------------------
                // get the panel of np vectors j1:j2-1
                //--------------------------------------------------------------

                int64_t j2 = GB_IMIN (jend, j1 + jpanel) ;
                int64_t np = j2 - j1 ;

                //--------------------------------------------------------------
                // load and transpose B(:,j1:j2-1) for one panel
                //--------------------------------------------------------------

                #if GB_B_IS_BITMAP
                {
                    if (np == 1)
                    {
                        // no need to load a single vector of B:
                        // alias Gb directly to B's bitmap for column j1
                        Gb = (int8_t *) (Bb + (j1 * bvlen)) ;
                    }
                    else
                    {
                        // load and transpose the bitmap of B(:,j1:j2-1),
                        // so G is held by-row: G(i,jj) at Gb [i*np+jj]
                        for (int64_t jj = 0 ; jj < np ; jj++)
                        {
                            int64_t j = j1 + jj ;
                            for (int64_t i = 0 ; i < bvlen ; i++)
                            {
                                Gb [i*np + jj] = Bb [i + j * bvlen] ;
                            }
                        }
                    }
                }
                #endif

                if (!B_is_pattern)
                {
                    if (np == 1)
                    {
                        // no need to load a single vector of B:
                        // alias Gx directly to B's values for column j1
                        GB_void *restrict Bx = (GB_void *) (B->x) ;
                        Gx = (GB_BTYPE *) (Bx + (j1 * bvlen) * GB_BSIZE) ;
                    }
                    else
                    {
                        // load and transpose the values of B(:,j1:j2-1)
                        // (Bx here is defined by the enclosing template)
                        for (int64_t jj = 0 ; jj < np ; jj++)
                        {
                            int64_t j = j1 + jj ;
                            for (int64_t i = 0 ; i < bvlen ; i++)
                            {
                                // G(i,jj) = B(i,j), and change storage order
                                int64_t pG = i*np + jj ;
                                int64_t pB = i + j * bvlen ;
                                GB_LOADB (Gx, pG, Bx, pB) ;
                            }
                        }
                    }
                }

                //--------------------------------------------------------------
                // H = A*G for one panel
                //--------------------------------------------------------------

                for (int64_t kA = 0 ; kA < anvec ; kA++)
                {

                    //----------------------------------------------------------
                    // get A(:,k)
                    //----------------------------------------------------------

                    int64_t k = GBH (Ah, kA) ;
                    int64_t pA = Ap [kA] ;
                    int64_t pA_end = Ap [kA+1] ;
                    int64_t pG = k * np ;

                    #undef  GB_MULT_A_ik_G_kjj
                    #if GB_IS_PAIR_MULTIPLIER
                        // t = A(i,k) * G (k,jj) is always equal to 1
                        #define GB_MULT_A_ik_G_kjj(jj)
                    #else
                        // t = A(i,k) * G (k,jj)
                        GB_CIJ_DECLARE (t) ;
                        #define GB_MULT_A_ik_G_kjj(jj)                      \
                            GB_GETB (gkj, Gx, pG+jj) ;                      \
                            GB_MULT (t, aik, gkj, i, k, j1 + jj) ;
                    #endif

                    #undef  GB_HX_COMPUTE
                    #define GB_HX_COMPUTE(jj)                               \
                    {                                                       \
                        /* H (i,jj) += A(i,k)*G(k,jj) */                    \
                        if (!GB_B_IS_BITMAP || Gb [pG+jj])                  \
                        {                                                   \
                            GB_MULT_A_ik_G_kjj (jj) ;                       \
                            if (Hf [pH+jj] == 0)                            \
                            {                                               \
                                /* H(i,jj) is a new entry */                \
                                GB_HX_WRITE (pH+jj, t) ; /* Hx(i,jj)=t */   \
                                Hf [pH+jj] = 1 ;                            \
                            }                                               \
                            else                                            \
                            {                                               \
                                /* H(i,jj) is already present */            \
                                GB_HX_UPDATE (pH+jj, t) ; /* Hx(i,jj)+=t */ \
                            }                                               \
                        }                                                   \
                    }

                    #undef  GB_LOAD_A_ij
                    #define GB_LOAD_A_ij                                    \
                        int64_t i = Ai [pA] ;                               \
                        GB_GETA (aik, Ax, pA) ;                             \
                        int64_t pH = i * np ;

                    //----------------------------------------------------------
                    // H += A(:,k)*G(k,:)
                    //----------------------------------------------------------

                    #if GB_B_IS_BITMAP
                    // quick check: gb is the OR of all np bitmap entries of
                    // G(k,:); intentional fall-through in each case below.
                    // If all are zero, A(:,k) can be skipped entirely.
                    bool gb = false ;
                    switch (np)
                    {
                        case 4 : gb  = Gb [pG+3] ;  // fall through
                        case 3 : gb |= Gb [pG+2] ;  // fall through
                        case 2 : gb |= Gb [pG+1] ;  // fall through
                        case 1 : gb |= Gb [pG  ] ;
                        default: ;
                    }
                    if (gb)
                    #endif
                    {
                        // unrolled by panel width np (1 to GB_PANEL_SIZE)
                        switch (np)
                        {

                            case 4 :
                                for ( ; pA < pA_end ; pA++)
                                {
                                    GB_LOAD_A_ij ;
                                    GB_HX_COMPUTE (0) ;
                                    GB_HX_COMPUTE (1) ;
                                    GB_HX_COMPUTE (2) ;
                                    GB_HX_COMPUTE (3) ;
                                }
                                break ;

                            case 3 :
                                for ( ; pA < pA_end ; pA++)
                                {
                                    GB_LOAD_A_ij ;
                                    GB_HX_COMPUTE (0) ;
                                    GB_HX_COMPUTE (1) ;
                                    GB_HX_COMPUTE (2) ;
                                }
                                break ;

                            case 2 :
                                for ( ; pA < pA_end ; pA++)
                                {
                                    GB_LOAD_A_ij ;
                                    GB_HX_COMPUTE (0) ;
                                    GB_HX_COMPUTE (1) ;
                                }
                                break ;

                            case 1 :
                                for ( ; pA < pA_end ; pA++)
                                {
                                    GB_LOAD_A_ij ;
                                    GB_HX_COMPUTE (0) ;
                                }
                                break ;
                            default:;
                        }
                    }

                    #undef GB_MULT_A_ik_G_kjj
                    #undef GB_HX_COMPUTE
                    #undef GB_LOAD_A_ij
                }

                //--------------------------------------------------------------
                // C<#M>(:,j1:j2-1) += H
                //--------------------------------------------------------------

                for (int64_t jj = 0 ; jj < np ; jj++)
                {

                    //----------------------------------------------------------
                    // C<#M>(:,j) += H (:,jj)
                    //----------------------------------------------------------

                    int64_t j = j1 + jj ;
                    int64_t pC_start = j * cvlen ;      // get pointer to C(:,j)

                    for (int64_t i = 0 ; i < cvlen ; i++)
                    {
                        int64_t pC = pC_start + i ;     // pointer to C(i,j)
                        int64_t pH = i * np + jj ;      // pointer to H(i,jj)
                        if (!Hf [pH]) continue ;
                        Hf [pH] = 0 ;                   // clear the panel
                        int8_t cb = Cb [pC] ;

                        //------------------------------------------------------
                        // check M(i,j)
                        //------------------------------------------------------

                        #if GB_MASK_IS_SPARSE_OR_HYPER

                            // M is sparse or hypersparse, and has been
                            // scattered into Cb: bit 1 holds C's presence,
                            // bit 2 holds M(i,j)
                            bool mij = ((cb & 2) != 0) ^ Mask_comp ;
                            if (!mij) continue ;
                            cb = (cb & 1) ;

                        #elif GB_MASK_IS_BITMAP_OR_FULL

                            // M is bitmap or full
                            GB_GET_M_ij (pC) ;
                            mij = mij ^ Mask_comp ;
                            if (!mij) continue ;

                        #endif

                        //------------------------------------------------------
                        // C(i,j) += H(i,jj)
                        //------------------------------------------------------

                        if (cb == 0)
                        {
                            // C(i,j) = H(i,jj)
                            #if GB_IS_ANY_PAIR_SEMIRING
                            Cx [pC] = GB_CTYPE_CAST (1, 0) ;    // C(i,j) = 1
                            #else
                            GB_CIJ_GATHER (pC, pH) ;
                            #endif
                            Cb [pC] = keep ;
                            task_cnvals++ ;
                        }
                        else
                        {
                            // Currently, the matrix C is a newly allocated
                            // matrix, not the C_in input matrix to GrB_mxm.
                            // As a result, this condition is not used.  It
                            // will be in the future when this method is
                            // modified to modify C in-place.
                            ASSERT (GB_DEAD_CODE) ;
                            // C(i,j) += H(i,jj)
                            GB_CIJ_GATHER_UPDATE (pC, pH) ;
                        }
                    }
                }
            }
            cnvals += task_cnvals ;
        }

        #undef GB_PANEL_SIZE

    }
    else if (use_atomics)
    {

        //----------------------------------------------------------------------
        // C<#M> += A*B using fine tasks and atomics
        //----------------------------------------------------------------------

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) \
            reduction(+:cnvals)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the vector of B and C for this fine task
            //------------------------------------------------------------------

            // The fine task operates on C(:,j) and B(:,j).  Its fine task
            // id ranges from 0 to nfine_tasks_per_vector-1, and determines
            // which slice of A to operate on.

            int64_t j = tid / nfine_tasks_per_vector ;
            int fine_tid = tid % nfine_tasks_per_vector ;
            int64_t kfirst = A_slice [fine_tid] ;
            int64_t klast = A_slice [fine_tid + 1] ;
            int64_t pB_start = j * bvlen ;      // pointer to B(:,j)
            int64_t pC_start = j * cvlen ;      // pointer to C(:,j)
            GB_GET_T_FOR_SECONDJ ;              // t = j or j+1 for SECONDJ*
            int64_t task_cnvals = 0 ;

            // for Hx Gustavson workspace: use C(:,j) in-place:
            GB_CTYPE *restrict Hx = (GB_CTYPE *)
                (((GB_void *) Cx) + (pC_start * GB_CSIZE)) ;
            #if GB_IS_PLUS_FC32_MONOID || GB_IS_ANY_FC32_MONOID
            float  *restrict Hx_real = (float *) Hx ;
            float  *restrict Hx_imag = Hx_real + 1 ;
            #elif GB_IS_PLUS_FC64_MONOID || GB_IS_ANY_FC64_MONOID
            double *restrict Hx_real = (double *) Hx ;
            double *restrict Hx_imag = Hx_real + 1 ;
            #endif

            //------------------------------------------------------------------
            // C<#M>(:,j) += A(:,k1:k2) * B(k1:k2,j)
            //------------------------------------------------------------------

            for (int64_t kk = kfirst ; kk < klast ; kk++)
            {

                //--------------------------------------------------------------
                // C<#M>(:,j) += A(:,k) * B(k,j)
                //--------------------------------------------------------------

                int64_t k = GBH (Ah, kk) ;      // k in range k1:k2
                int64_t pB = pB_start + k ;     // get pointer to B(k,j)
                if (!GBB (Bb, pB)) continue ;
                int64_t pA = Ap [kk] ;
                int64_t pA_end = Ap [kk+1] ;
                GB_GET_B_kj ;                   // bkj = B(k,j)

                for ( ; pA < pA_end ; pA++)
                {

                    //----------------------------------------------------------
                    // get A(i,k) and C(i,j)
                    //----------------------------------------------------------

                    int64_t i = Ai [pA] ;       // get A(i,k) index
                    int64_t pC = pC_start + i ; // get C(i,j) pointer
                    int8_t cb ;

                    //----------------------------------------------------------
                    // C<#M>(i,j) += A(i,k) * B(k,j)
                    //----------------------------------------------------------

                    #if GB_MASK_IS_SPARSE_OR_HYPER
                    {

                        //------------------------------------------------------
                        // M is sparse, and scattered into the C bitmap
                        //------------------------------------------------------

                        // finite-state machine in Cb [pC]:
                        // 0:   cij not present, mij zero
                        // 1:   cij present, mij zero (keep==1 for !M)
                        // 2:   cij not present, mij one
                        // 3:   cij present, mij one (keep==3 for M)
                        // 7:   cij is locked

                        #if GB_HAS_ATOMIC
                        {
                            // if C(i,j) is already present and can be modified
                            // (cb==keep), and the monoid can be done
                            // atomically, then do the atomic update.  No need
                            // to modify Cb [pC].
                            GB_ATOMIC_READ
                            cb = Cb [pC] ;          // grab the entry
                            if (cb == keep)
                            {
                                #if !GB_IS_ANY_MONOID
                                GB_MULT_A_ik_B_kj ;     // t = A(i,k) * B(k,j)
                                GB_ATOMIC_UPDATE_HX (i, t) ;    // C(i,j) += t
                                #endif
                                continue ;          // C(i,j) has been updated
                            }
                        }
                        #endif

                        // value 7 marks the entry as locked; spin until this
                        // task acquires the lock and observes a state 0..3
                        do  // lock the entry
                        {
                            // do this atomically:
                            // { cb = Cb [pC] ; Cb [pC] = 7 ; }
                            GB_ATOMIC_CAPTURE_INT8 (cb, Cb [pC], 7) ;
                        } while (cb == 7) ; // lock owner gets 0, 1, 2, or 3
                        if (cb == keep-1)
                        {
                            // C(i,j) is a new entry (mask allows it, and it
                            // was not yet present)
                            GB_MULT_A_ik_B_kj ;     // t = A(i,k)*B(k,j)
                            #if GB_IS_ANY_PAIR_SEMIRING
                            GB_ATOMIC_SET_HX_ONE (i) ;      // C(i,j) = 1
                            #else
                            GB_ATOMIC_WRITE_HX (i, t) ;     // C(i,j) = t
                            #endif
                            task_cnvals++ ;
                            cb = keep ;             // keep the entry
                        }
                        else if (cb == keep)
                        {
                            // C(i,j) is already present
                            #if !GB_IS_ANY_MONOID
                            GB_MULT_A_ik_B_kj ;     // t = A(i,k)*B(k,j)
                            GB_ATOMIC_UPDATE_HX (i, t) ;    // C(i,j) += t
                            #endif
                        }
                        GB_ATOMIC_WRITE
                        Cb [pC] = cb ;              // unlock the entry

                    }
                    #else
                    {

                        //------------------------------------------------------
                        // M is not present, or bitmap/full
                        //------------------------------------------------------

                        // finite-state machine in Cb [pC]:
                        // 0:   cij not present; can be written
                        // 1:   cij present; can be updated
                        // 7:   cij is locked

                        #if GB_MASK_IS_BITMAP_OR_FULL
                        {
                            // M is bitmap or full, and not in C bitmap.
                            // Do not modify C(i,j) if not permitted by the mask
                            GB_GET_M_ij (pC) ;
                            mij = mij ^ Mask_comp ;
                            if (!mij) continue ;
                        }
                        #endif

                        //------------------------------------------------------
                        // C(i,j) += A(i,j) * B(k,j)
                        //------------------------------------------------------

                        #if GB_HAS_ATOMIC
                        {
                            // if C(i,j) is already present (cb==1), and the
                            // monoid can be done atomically, then do the
                            // atomic update.  No need to modify Cb [pC].
                            GB_ATOMIC_READ
                            cb = Cb [pC] ;          // grab the entry
                            if (cb == 1)
                            {
                                #if !GB_IS_ANY_MONOID
                                GB_MULT_A_ik_B_kj ;     // t = A(i,k) * B(k,j)
                                GB_ATOMIC_UPDATE_HX (i, t) ;    // C(i,j) += t
                                #endif
                                continue ;          // C(i,j) has been updated
                            }
                        }
                        #endif

                        do  // lock the entry
                        {
                            // do this atomically:
                            // { cb = Cb [pC] ; Cb [pC] = 7 ; }
                            GB_ATOMIC_CAPTURE_INT8 (cb, Cb [pC], 7) ;
                        } while (cb == 7) ; // lock owner gets 0 or 1
                        if (cb == 0)
                        {
                            // C(i,j) is a new entry
                            GB_MULT_A_ik_B_kj ;     // t = A(i,k)*B(k,j)
                            #if GB_IS_ANY_PAIR_SEMIRING
                            GB_ATOMIC_SET_HX_ONE (i) ;      // C(i,j) = 1
                            #else
                            GB_ATOMIC_WRITE_HX (i, t) ;     // C(i,j) = t
                            #endif
                            task_cnvals++ ;
                        }
                        else // cb == 1
                        {
                            // C(i,j) is already present
                            #if !GB_IS_ANY_MONOID
                            GB_MULT_A_ik_B_kj ;     // t = A(i,k)*B(k,j)
                            GB_ATOMIC_UPDATE_HX (i, t) ;    // C(i,j) += t
                            #endif
                        }
                        GB_ATOMIC_WRITE
                        Cb [pC] = 1 ;               // unlock the entry

                    }
                    #endif

                }
            }
            cnvals += task_cnvals ;
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // C<#M> += A*B using fine tasks and workspace, with no atomics
        //----------------------------------------------------------------------

        // Each fine task is given size-cvlen workspace to compute its result
        // in the first phase, W(:,tid) = A(:,k1:k2) * B(k1:k2,j), where k1:k2
        // is defined by the fine_tid of the task.  The workspaces are then
        // summed into C in the second phase.

        //----------------------------------------------------------------------
        // allocate workspace
        //----------------------------------------------------------------------

        size_t workspace = cvlen * ntasks ;
        size_t cxsize = (GB_IS_ANY_PAIR_SEMIRING) ? 0 : GB_CSIZE ;
        Wf  = GB_MALLOC_WERK (workspace, int8_t, &Wf_size) ;
        Wcx = GB_MALLOC_WERK (workspace * cxsize, GB_void, &Wcx_size) ;
        if (Wf == NULL || Wcx == NULL)
        {
            // out of memory
            GB_FREE_ALL ;
            return (GrB_OUT_OF_MEMORY) ;
        }

        //----------------------------------------------------------------------
        // first phase: W (:,tid) = A (:,k1:k2) * B (k1:k2,j) for each fine task
        //----------------------------------------------------------------------

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the vector of B and C for this fine task
            //------------------------------------------------------------------

            // The fine task operates on C(:,j) and B(:,j).  Its fine task
            // id ranges from 0 to nfine_tasks_per_vector-1, and determines
            // which slice of A to operate on.

            int64_t j = tid / nfine_tasks_per_vector ;
            int fine_tid = tid % nfine_tasks_per_vector ;
            int64_t kfirst = A_slice [fine_tid] ;
            int64_t klast = A_slice [fine_tid + 1] ;
            int64_t pB_start = j * bvlen ;      // pointer to B(:,j)
            int64_t pC_start = j * cvlen ;      // pointer to C(:,j), for bitmap
            int64_t pW_start = tid * cvlen ;    // pointer to W(:,tid)
            GB_GET_T_FOR_SECONDJ ;              // t = j or j+1 for SECONDJ*
            int64_t task_cnvals = 0 ;

            // for Hf and Hx Gustavson workspace: use W(:,tid):
            int8_t *restrict Hf = Wf + pW_start ;
            GB_CTYPE *restrict Hx = (GB_CTYPE *)
                (Wcx + (pW_start * cxsize)) ;
            #if GB_IS_PLUS_FC32_MONOID
            float  *restrict Hx_real = (float *) Hx ;
            float  *restrict Hx_imag = Hx_real + 1 ;
            #elif GB_IS_PLUS_FC64_MONOID
            double *restrict Hx_real = (double *) Hx ;
            double *restrict Hx_imag = Hx_real + 1 ;
            #endif

            //------------------------------------------------------------------
            // clear Hf
            //------------------------------------------------------------------

            memset (Hf, 0, cvlen) ;

            //------------------------------------------------------------------
            // W<#M> = A(:,k1:k2) * B(k1:k2,j)
            //------------------------------------------------------------------

            for (int64_t kk = kfirst ; kk < klast ; kk++)
            {

                //--------------------------------------------------------------
                // W<#M>(:,tid) += A(:,k) * B(k,j)
                //--------------------------------------------------------------

                int64_t k = GBH (Ah, kk) ;      // k in range k1:k2
                int64_t pB = pB_start + k ;     // get pointer to B(k,j)
                if (!GBB (Bb, pB)) continue ;
                int64_t pA = Ap [kk] ;
                int64_t pA_end = Ap [kk+1] ;
                GB_GET_B_kj ;                   // bkj = B(k,j)

                for ( ; pA < pA_end ; pA++)
                {

                    //----------------------------------------------------------
                    // get A(i,k)
                    //----------------------------------------------------------

                    int64_t i = Ai [pA] ;       // get A(i,k) index

                    //----------------------------------------------------------
                    // check M(i,j)
                    //----------------------------------------------------------

                    #if GB_MASK_IS_SPARSE_OR_HYPER
                    {
                        // M is sparse or hypersparse, scattered into Cb
                        // (bit 2 of Cb holds M(i,j))
                        int64_t pC = pC_start + i ;
                        int8_t cb = Cb [pC] ;
                        bool mij = ((cb & 2) != 0) ^ Mask_comp ;
                        if (!mij) continue ;
                    }
                    #elif GB_MASK_IS_BITMAP_OR_FULL
                    {
                        // M is bitmap or full
                        int64_t pC = pC_start + i ;
                        GB_GET_M_ij (pC) ;
                        mij = mij ^ Mask_comp ;
                        if (!mij) continue ;
                    }
                    #endif

                    //----------------------------------------------------------
                    // W<#M>(i) += A(i,k) * B(k,j)
                    //----------------------------------------------------------

                    #if GB_IS_ANY_PAIR_SEMIRING
                    {
                        // Hx is not used; Cx [...] = 1 is done below
                        Hf [i] = 1 ;
                    }
                    #else
                    {
                        GB_MULT_A_ik_B_kj ;         // t = A(i,k)*B(k,j)
                        if (Hf [i] == 0)
                        {
                            // W(i,j) is a new entry
                            GB_HX_WRITE (i, t) ;    // Hx(i) = t
                            Hf [i] = 1 ;
                        }
                        else
                        {
                            // W(i) is already present
                            GB_HX_UPDATE (i, t) ;   // Hx(i) += t
                        }
                    }
                    #endif
                }
            }
        }

        //----------------------------------------------------------------------
        // second phase: C<#M> += reduce (W)
        //----------------------------------------------------------------------

        #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) \
            reduction(+:cnvals)
        for (tid = 0 ; tid < ntasks ; tid++)
        {

            //------------------------------------------------------------------
            // determine the W and C for this fine task
            //------------------------------------------------------------------

            // The fine task operates on C(i1:i2,j) and W(i1:i2,w1:w2), where
            // i1:i2 is defined by the fine task id.  Its fine task id ranges
            // from 0 to nfine_tasks_per_vector-1.

            // w1:w2 are the updates to C(:,j), where w1:w2 =
            // [j*nfine_tasks_per_vector : (j+1)*nfine_tasks_per_vector-1].

            int64_t j = tid / nfine_tasks_per_vector ;
            int fine_tid = tid % nfine_tasks_per_vector ;
            int64_t istart, iend ;
            GB_PARTITION (istart, iend, cvlen, fine_tid,
                nfine_tasks_per_vector) ;
            int64_t pC_start = j * cvlen ;          // pointer to C(:,j)
            int64_t wstart = j * nfine_tasks_per_vector ;
            int64_t wend = (j + 1) * nfine_tasks_per_vector ;
            int64_t task_cnvals = 0 ;

            // Hx = (typecasted) Wcx workspace, use Wf as-is
            GB_CTYPE *restrict Hx = ((GB_CTYPE *) Wcx) ;
            #if GB_IS_PLUS_FC32_MONOID
            float  *restrict Hx_real = (float *) Hx ;
            float  *restrict Hx_imag = Hx_real + 1 ;
            #elif GB_IS_PLUS_FC64_MONOID
            double *restrict Hx_real = (double *) Hx ;
            double *restrict Hx_imag = Hx_real + 1 ;
            #endif

            //------------------------------------------------------------------
            // C<#M>(i1:i2,j) += reduce (W (i1:i2, wstart:wend))
            //------------------------------------------------------------------

            for (int64_t w = wstart ; w < wend ; w++)
            {

                //--------------------------------------------------------------
                // C<#M>(i1:i2,j) += W (i1:i2,w)
                //--------------------------------------------------------------

                int64_t pW_start = w * cvlen ;      // pointer to W (:,w)

                for (int64_t i = istart ; i < iend ; i++)
                {

                    //----------------------------------------------------------
                    // get pointer and bitmap C(i,j) and W(i,w)
                    //----------------------------------------------------------

                    int64_t pW = pW_start + i ;     // pointer to W(i,w)
                    if (Wf [pW] == 0) continue ;    // skip if not present
                    int64_t pC = pC_start + i ;     // pointer to C(i,j)
                    int8_t cb = Cb [pC] ;           // bitmap status of C(i,j)

                    //----------------------------------------------------------
                    // M(i,j) already checked, but adjust Cb if M is sparse
                    //----------------------------------------------------------

                    #if GB_MASK_IS_SPARSE_OR_HYPER
                    {
                        // M is sparse or hypersparse: drop the mask bit
                        // (bit 2), keeping only the presence bit of C(i,j)
                        cb = (cb & 1) ;
                    }
                    #endif

                    //----------------------------------------------------------
                    // C(i,j) += W (i,w)
                    //----------------------------------------------------------

                    if (cb == 0)
                    {
                        // C(i,j) = W(i,w)
                        #if GB_IS_ANY_PAIR_SEMIRING
                        Cx [pC] = GB_CTYPE_CAST (1, 0) ;    // C(i,j) = 1
                        #else
                        GB_CIJ_GATHER (pC, pW) ;
                        #endif
                        Cb [pC] = keep ;
                        task_cnvals++ ;
                    }
                    else
                    {
                        // C(i,j) += W(i,w): reachable here (unlike the coarse
                        // branch) since several W columns update one C(:,j)
                        GB_CIJ_GATHER_UPDATE (pC, pW) ;
                    }
                }
            }
            cnvals += task_cnvals ;
        }
    }
}