//------------------------------------------------------------------------------
// GB_sparse_add_template: C=A+B, C<M>=A+B when C is sparse/hypersparse
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// C is sparse or hypersparse:

//      ------------------------------------------
//      C       =           A       +       B
//      ------------------------------------------
//      sparse  .           sparse          sparse

//      ------------------------------------------
//      C       <M>=        A       +       B
//      ------------------------------------------
//      sparse  sparse      sparse          sparse
//      sparse  sparse      sparse          bitmap
//      sparse  sparse      sparse          full
//      sparse  sparse      bitmap          sparse
//      sparse  sparse      bitmap          bitmap
//      sparse  sparse      bitmap          full
//      sparse  sparse      full            sparse
//      sparse  sparse      full            bitmap
//      sparse  sparse      full            full        (same as emult)

//      sparse  bitmap      sparse          sparse
//      sparse  full        sparse          sparse

//      ------------------------------------------
//      C       <!M>=       A       +       B
//      ------------------------------------------
//      sparse  bitmap      sparse          sparse
//      sparse  full        sparse          sparse

// If all four matrices are sparse/hypersparse and C<!M>=A+B is being
// computed, then M is passed in as NULL to GB_add_phase*, and GB_add_sparsity
// returns apply_mask as false.  The methods below do not handle the case when
// C is sparse, M is sparse, and !M is used.  All other uses of !M when M is
// sparse result in a bitmap structure for C, and that case is handled by
// GB_bitmap_add_template.

// In that case, the mask is applied later, so C=A+B is computed here:

//      ------------------------------------------
//      C       <!M>=       A       +       B
//      ------------------------------------------
//      sparse  sparse      sparse          sparse      (mask applied later)
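// Throughout this template, GBH, GBP, GBI, and GBB are the generic accessors
// that let a single body of code handle any sparsity format.  As a rough
// sketch of their semantics (the exact macro definitions live elsewhere in
// the library and may differ in detail):
//
//      GBH (Ah, k)         // (Ah == NULL) ? (k)        : Ah [k]
//      GBP (Ap, k, vlen)   // (Ap == NULL) ? (k * vlen) : Ap [k]
//      GBI (Ai, p, vlen)   // (Ai == NULL) ? (p % vlen) : Ai [p]
//      GBB (Ab, p)         // (Ab == NULL) ? true       : Ab [p]
//
// so a full matrix (with Ap and Ai both NULL) or a bitmap matrix (with Ab
// non-NULL) can be indexed by the same code as a sparse or hypersparse one.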
{

    #ifdef GB_DEBUG
    if (M == NULL || M_is_bitmap || M_is_full)
    {
        ASSERT (A_is_sparse || A_is_hyper) ;
        ASSERT (B_is_sparse || B_is_hyper) ;
    }
    #endif

    //--------------------------------------------------------------------------
    // phase1: count entries in each C(:,j)
    // phase2: compute C
    //--------------------------------------------------------------------------

    #pragma omp parallel for num_threads(C_nthreads) schedule(dynamic,1)
    for (taskid = 0 ; taskid < C_ntasks ; taskid++)
    {

        //----------------------------------------------------------------------
        // get the task descriptor
        //----------------------------------------------------------------------

        int64_t kfirst = TaskList [taskid].kfirst ;
        int64_t klast  = TaskList [taskid].klast ;
        bool fine_task = (klast == -1) ;
        int64_t len ;
        if (fine_task)
        {
            // a fine task operates on a slice of a single vector
            klast = kfirst ;
            len = TaskList [taskid].len ;
        }
        else
        {
            // a coarse task operates on one or more whole vectors
            len = vlen ;
        }

        //----------------------------------------------------------------------
        // compute all vectors in this task
        //----------------------------------------------------------------------

        for (int64_t k = kfirst ; k <= klast ; k++)
        {

            //------------------------------------------------------------------
            // get j, the kth vector of C
            //------------------------------------------------------------------

            int64_t j = GBH (Ch, k) ;

            #if defined ( GB_PHASE_1_OF_2 )
            int64_t cjnz = 0 ;
            #else
            int64_t pC, pC_end ;
            if (fine_task)
            {
                // A fine task computes a slice of C(:,j)
                pC     = TaskList [taskid  ].pC ;
                pC_end = TaskList [taskid+1].pC ;
                ASSERT (Cp [k] <= pC && pC <= pC_end && pC_end <= Cp [k+1]) ;
            }
            else
            {
                // The vectors of C are never sliced for a coarse task.
                pC     = Cp [k  ] ;
                pC_end = Cp [k+1] ;
            }
            int64_t cjnz = pC_end - pC ;
            if (cjnz == 0) continue ;
            #endif

            //------------------------------------------------------------------
            // get A(:,j)
            //------------------------------------------------------------------

            int64_t pA = -1, pA_end = -1 ;
            if (fine_task)
            {
                // A fine task operates on Ai,Ax [pA...pA_end-1], which is
                // a subset of the vector A(:,j)
                pA     = TaskList [taskid].pA ;
                pA_end = TaskList [taskid].pA_end ;
            }
            else
            {
                // A coarse task operates on the entire vector A(:,j)
                int64_t kA = (C_to_A == NULL) ? j : C_to_A [k] ;
                if (kA >= 0)
                {
                    pA     = GBP (Ap, kA, vlen) ;
                    pA_end = GBP (Ap, kA+1, vlen) ;
                }
            }

            int64_t ajnz = pA_end - pA ;    // nnz in A(:,j) for this slice
            int64_t pA_start = pA ;
            bool adense = (ajnz == len) ;

            // get the first and last indices in A(:,j) for this vector
            int64_t iA_first = -1, iA_last = -1 ;
            if (ajnz > 0)
            {
                iA_first = GBI (Ai, pA, vlen) ;
                iA_last  = GBI (Ai, pA_end-1, vlen) ;
            }

            //------------------------------------------------------------------
            // get B(:,j)
            //------------------------------------------------------------------

            int64_t pB = -1, pB_end = -1 ;
            if (fine_task)
            {
                // A fine task operates on Bi,Bx [pB...pB_end-1], which is
                // a subset of the vector B(:,j)
                pB     = TaskList [taskid].pB ;
                pB_end = TaskList [taskid].pB_end ;
            }
            else
            {
                // A coarse task operates on the entire vector B(:,j)
                int64_t kB = (C_to_B == NULL) ? j : C_to_B [k] ;
                if (kB >= 0)
                {
                    pB     = GBP (Bp, kB, vlen) ;
                    pB_end = GBP (Bp, kB+1, vlen) ;
                }
            }

            int64_t bjnz = pB_end - pB ;    // nnz in B(:,j) for this slice
            int64_t pB_start = pB ;
            bool bdense = (bjnz == len) ;

            // get the first and last indices in B(:,j) for this vector
            int64_t iB_first = -1, iB_last = -1 ;
            if (bjnz > 0)
            {
                iB_first = GBI (Bi, pB, vlen) ;
                iB_last  = GBI (Bi, pB_end-1, vlen) ;
            }

            //------------------------------------------------------------------
            // get M(:,j) if M is sparse or hypersparse
            //------------------------------------------------------------------

            bool sparse_mask_is_easy = false ;
            int64_t pM = -1 ;
            int64_t pM_end = -1 ;
            if (M_is_sparse_or_hyper)
            {
                if (fine_task)
                {
                    // A fine task operates on Mi,Mx [pM...pM_end-1],
                    // which is a subset of the vector M(:,j)
                    pM     = TaskList [taskid].pM ;
                    pM_end = TaskList [taskid].pM_end ;
                }
                else
                {
                    int64_t kM = -1 ;
                    if (Ch_is_Mh)
                    {
                        // Ch is the same as Mh (a deep copy)
                        ASSERT (Ch != NULL) ;
                        ASSERT (M_is_hyper) ;
                        ASSERT (Ch [k] == M->h [k]) ;
                        kM = k ;
                    }
                    else
                    {
                        kM = (C_to_M == NULL) ? j : C_to_M [k] ;
                    }
                    if (kM >= 0)
                    {
                        pM     = GBP (Mp, kM  , vlen) ;
                        pM_end = GBP (Mp, kM+1, vlen) ;
                    }
                }

                // The "easy mask" condition requires M to be sparse or
                // hypersparse and structural, and neither A nor B may be
                // bitmap.  In addition, one of the following three conditions
                // must hold:
                // (1) all entries are present in A(:,j) and B == M
                // (2) all entries are present in B(:,j) and A == M
                // (3) both A and B are aliased to M
                sparse_mask_is_easy =
                    Mask_struct &&          // M must be structural
                    !A_is_bitmap &&         // A must not be bitmap
                    !B_is_bitmap &&         // B must not be bitmap
                    ((adense && B == M) ||  // one of the 3 conditions holds
                     (bdense && A == M) ||
                     (A == M && B == M)) ;

                // TODO: add the condition above to GB_add_sparsity, where
                // adense/bdense are true for the whole matrix (adense is true
                // if A is full, or sparse/hypersparse with all entries
                // present).  The test here is done vector by vector, for each
                // A(:,j) and B(:,j), so it is a finer-grain test than one for
                // all of A and B.

            }
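            // A sketch of why the "easy mask" case is easy: if M is
            // structural, B == M, and A(:,j) has all entries present, then
            // every entry M(i,j) selects a pair A(i,j) and B(i,j) that both
            // exist, so the pattern of C(:,j) is exactly the pattern of
            // M(:,j), and no search or merge is needed (Methods 11 to 13
            // below).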
            //------------------------------------------------------------------
            // C(:,j)<optional mask> = A(:,j) + B(:,j), or a subvector of it
            //------------------------------------------------------------------

            if (M == NULL)
            {

                //--------------------------------------------------------------
                // M is not present, or !M is sparse but not applied here
                //--------------------------------------------------------------

                //      ------------------------------------------
                //      C       =           A       +       B
                //      ------------------------------------------
                //      sparse  .           sparse          sparse

                //      ------------------------------------------
                //      C       <!M>=       A       +       B
                //      ------------------------------------------
                //      sparse  sparse      sparse          sparse  (mask applied later)

                // If all four matrices are sparse or hypersparse, and
                // Mask_comp is true, the mask M is passed in to this method
                // as NULL.  C=A+B is computed with no mask, and !M is applied
                // later.

                // A and B are both sparse or hypersparse, not bitmap or full,
                // but individual vectors of A and B might have all entries
                // present (adense and/or bdense).
                ASSERT (A_is_sparse || A_is_hyper) ;
                ASSERT (B_is_sparse || B_is_hyper) ;

                #if defined ( GB_PHASE_1_OF_2 )

                if (A_and_B_are_disjoint)
                {

                    // This case is only used by GB_Matrix_wait, which
                    // computes A+T where T is the matrix of pending tuples
                    // for A.  The pattern of the pending tuples is always
                    // disjoint from the pattern of A.

                    cjnz = ajnz + bjnz ;

                }
                else

                #endif
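                // Methods 01 to 10 below all compute the same thing: the set
                // union of the patterns of A(:,j) and B(:,j), with the binary
                // operator applied only to entries in the intersection.  As
                // an illustration, with the PLUS operator,
                //      A(:,j) = { (0, 1.0), (2, 2.0) }
                //      B(:,j) = { (2, 3.0), (4, 4.0) }
                // gives
                //      C(:,j) = { (0, 1.0), (2, 5.0), (4, 4.0) } .
                // The methods differ only in how they exploit the structure
                // of the two vectors to avoid a full merge.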
                if (adense && bdense)
                {

                    //----------------------------------------------------------
                    // Method01: A(:,j) and B(:,j) dense: thus C(:,j) dense
                    //----------------------------------------------------------

                    ASSERT (ajnz == bjnz) ;
                    ASSERT (iA_first == iB_first) ;
                    ASSERT (iA_last  == iB_last ) ;
                    #if defined ( GB_PHASE_1_OF_2 )
                    cjnz = ajnz ;
                    #else
                    ASSERT (cjnz == ajnz) ;
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        // C (i,j) = A (i,j) + B (i,j)
                        int64_t i = p + iA_first ;
                        Ci [pC + p] = i ;
                        ASSERT (Ai [pA + p] == i) ;
                        ASSERT (Bi [pB + p] == i) ;
                        GB_GETA (aij, Ax, pA + p) ;
                        GB_GETB (bij, Bx, pB + p) ;
                        GB_BINOP (GB_CX (pC + p), aij, bij, i, j) ;
                    }
                    #endif

                }
                else if (adense)
                {

                    //----------------------------------------------------------
                    // Method02: A(:,j) dense, B(:,j) sparse: C(:,j) dense
                    //----------------------------------------------------------

                    #if defined ( GB_PHASE_1_OF_2 )
                    cjnz = ajnz ;
                    #else
                    ASSERT (cjnz == ajnz) ;
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        // C (i,j) = A (i,j)
                        int64_t i = p + iA_first ;
                        Ci [pC + p] = i ;
                        ASSERT (Ai [pA + p] == i) ;
                        GB_COPY_A_TO_C (GB_CX (pC + p), Ax, pA + p) ;
                    }
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < bjnz ; p++)
                    {
                        // C (i,j) = A (i,j) + B (i,j)
                        int64_t i = Bi [pB + p] ;
                        int64_t ii = i - iA_first ;
                        ASSERT (Ai [pA + ii] == i) ;
                        GB_GETA (aij, Ax, pA + ii) ;
                        GB_GETB (bij, Bx, pB + p) ;
                        GB_BINOP (GB_CX (pC + ii), aij, bij, i, j) ;
                    }
                    #endif

                }
                else if (bdense)
                {

                    //----------------------------------------------------------
                    // Method03: A(:,j) sparse, B(:,j) dense: C(:,j) dense
                    //----------------------------------------------------------

                    #if defined ( GB_PHASE_1_OF_2 )
                    cjnz = bjnz ;
                    #else
                    ASSERT (cjnz == bjnz) ;
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < bjnz ; p++)
                    {
                        // C (i,j) = B (i,j)
                        int64_t i = p + iB_first ;
                        Ci [pC + p] = i ;
                        ASSERT (Bi [pB + p] == i) ;
                        GB_COPY_B_TO_C (GB_CX (pC + p), Bx, pB + p) ;
                    }
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        // C (i,j) = A (i,j) + B (i,j)
                        int64_t i = Ai [pA + p] ;
                        int64_t ii = i - iB_first ;
                        ASSERT (Bi [pB + ii] == i) ;
                        GB_GETA (aij, Ax, pA + p) ;
                        GB_GETB (bij, Bx, pB + ii) ;
                        GB_BINOP (GB_CX (pC + ii), aij, bij, i, j) ;
                    }
                    #endif

                }
                else if (ajnz == 0)
                {

                    //----------------------------------------------------------
                    // Method04: A(:,j) is empty
                    //----------------------------------------------------------

                    #if defined ( GB_PHASE_1_OF_2 )
                    cjnz = bjnz ;
                    #else
                    ASSERT (cjnz == bjnz) ;
                    memcpy (Ci + pC, Bi + pB, bjnz * sizeof (int64_t)) ;
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < bjnz ; p++)
                    {
                        // C (i,j) = B (i,j)
                        GB_COPY_B_TO_C (GB_CX (pC + p), Bx, pB + p) ;
                    }
                    #endif

                }
                else if (bjnz == 0)
                {

                    //----------------------------------------------------------
                    // Method05: B(:,j) is empty
                    //----------------------------------------------------------

                    #if defined ( GB_PHASE_1_OF_2 )
                    cjnz = ajnz ;
                    #else
                    ASSERT (cjnz == ajnz) ;
                    memcpy (Ci + pC, Ai + pA, ajnz * sizeof (int64_t)) ;
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        // C (i,j) = A (i,j)
                        GB_COPY_A_TO_C (GB_CX (pC + p), Ax, pA + p) ;
                    }
                    #endif

                }
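                // When the index ranges of A(:,j) and B(:,j) do not overlap,
                // the union is a simple concatenation.  For example, with
                //      Ai = [ 0 1 2 ] and Bi = [ 7 9 ] ,
                // iA_last = 2 < iB_first = 7, so Method06 below copies all of
                // A(:,j) and then all of B(:,j) into C(:,j), with no merge.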
                else if (iA_last < iB_first)
                {

                    //----------------------------------------------------------
                    // Method06: last A(:,j) comes before 1st B(:,j)
                    //----------------------------------------------------------

                    #if defined ( GB_PHASE_1_OF_2 )
                    cjnz = ajnz + bjnz ;
                    #else
                    ASSERT (cjnz == ajnz + bjnz) ;
                    memcpy (Ci + pC, Ai + pA, ajnz * sizeof (int64_t)) ;
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        // C (i,j) = A (i,j)
                        GB_COPY_A_TO_C (GB_CX (pC + p), Ax, pA + p) ;
                    }
                    pC += ajnz ;
                    memcpy (Ci + pC, Bi + pB, bjnz * sizeof (int64_t)) ;
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < bjnz ; p++)
                    {
                        // C (i,j) = B (i,j)
                        GB_COPY_B_TO_C (GB_CX (pC + p), Bx, pB + p) ;
                    }
                    #endif

                }
                else if (iB_last < iA_first)
                {

                    //----------------------------------------------------------
                    // Method07: last B(:,j) comes before 1st A(:,j)
                    //----------------------------------------------------------

                    #if defined ( GB_PHASE_1_OF_2 )
                    cjnz = ajnz + bjnz ;
                    #else
                    ASSERT (cjnz == ajnz + bjnz) ;
                    memcpy (Ci + pC, Bi + pB, bjnz * sizeof (int64_t)) ;
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < bjnz ; p++)
                    {
                        // C (i,j) = B (i,j)
                        GB_COPY_B_TO_C (GB_CX (pC + p), Bx, pB + p) ;
                    }
                    pC += bjnz ;
                    memcpy (Ci + pC, Ai + pA, ajnz * sizeof (int64_t)) ;
                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        // C (i,j) = A (i,j)
                        GB_COPY_A_TO_C (GB_CX (pC + p), Ax, pA + p) ;
                    }
                    #endif

                }

                #if defined ( GB_PHASE_1_OF_2 )
                else if (ajnz > 32 * bjnz)
                {

                    //----------------------------------------------------------
                    // Method08: A(:,j) is much denser than B(:,j)
                    //----------------------------------------------------------

                    // cjnz = ajnz + bjnz - nnz in the intersection

                    cjnz = ajnz + bjnz ;
                    for ( ; pB < pB_end ; pB++)
                    {
                        int64_t i = Bi [pB] ;
                        // find i in A(:,j)
                        int64_t pright = pA_end - 1 ;
                        bool found ;
                        GB_BINARY_SEARCH (i, Ai, pA, pright, found) ;
                        if (found) cjnz-- ;
                    }

                }
                else if (bjnz > 32 * ajnz)
                {

                    //----------------------------------------------------------
                    // Method09: B(:,j) is much denser than A(:,j)
                    //----------------------------------------------------------

                    // cjnz = ajnz + bjnz - nnz in the intersection

                    cjnz = ajnz + bjnz ;
                    for ( ; pA < pA_end ; pA++)
                    {
                        int64_t i = Ai [pA] ;
                        // find i in B(:,j)
                        int64_t pright = pB_end - 1 ;
                        bool found ;
                        GB_BINARY_SEARCH (i, Bi, pB, pright, found) ;
                        if (found) cjnz-- ;
                    }

                }
                #endif
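                // A sketch of the heuristic behind Methods 08 and 09 above:
                // a full merge costs O(ajnz + bjnz), while searching for each
                // entry of the sparser vector in the denser one costs about
                // O(bjnz * log2 (ajnz)).  With ajnz = 2^20 and bjnz = 8, that
                // is roughly 160 comparisons instead of a million merge
                // steps, so binary search wins when one vector is much (here,
                // 32x) denser than the other.  These methods exist only in
                // phase 1; in phase 2 the pattern of C is already known, and
                // Method10 below handles these cases.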
                else
                {

                    //----------------------------------------------------------
                    // Method10: A(:,j) and B(:,j) about the same sparsity
                    //----------------------------------------------------------

                    while (pA < pA_end && pB < pB_end)
                    {
                        int64_t iA = Ai [pA] ;
                        int64_t iB = Bi [pB] ;
                        if (iA < iB)
                        {
                            // C (iA,j) = A (iA,j)
                            #if defined ( GB_PHASE_2_OF_2 )
                            Ci [pC] = iA ;
                            GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                            #endif
                            pA++ ;
                        }
                        else if (iA > iB)
                        {
                            // C (iB,j) = B (iB,j)
                            #if defined ( GB_PHASE_2_OF_2 )
                            Ci [pC] = iB ;
                            GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                            #endif
                            pB++ ;
                        }
                        else
                        {
                            // C (i,j) = A (i,j) + B (i,j)
                            #if defined ( GB_PHASE_2_OF_2 )
                            Ci [pC] = iB ;
                            GB_GETA (aij, Ax, pA) ;
                            GB_GETB (bij, Bx, pB) ;
                            GB_BINOP (GB_CX (pC), aij, bij, iB, j) ;
                            #endif
                            pA++ ;
                            pB++ ;
                        }
                        #if defined ( GB_PHASE_2_OF_2 )
                        pC++ ;
                        #else
                        cjnz++ ;
                        #endif
                    }

                    //----------------------------------------------------------
                    // A(:,j) or B(:,j) may have entries left, but not both
                    //----------------------------------------------------------

                    ajnz = (pA_end - pA) ;
                    bjnz = (pB_end - pB) ;
                    ASSERT (ajnz == 0 || bjnz == 0) ;
                    #if defined ( GB_PHASE_1_OF_2 )
                    cjnz += ajnz + bjnz ;
                    #else
                    memcpy (Ci + pC, Ai + pA, ajnz * sizeof (int64_t)) ;
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        // C (i,j) = A (i,j)
                        GB_COPY_A_TO_C (GB_CX (pC + p), Ax, pA + p) ;
                    }
                    memcpy (Ci + pC, Bi + pB, bjnz * sizeof (int64_t)) ;
                    for (int64_t p = 0 ; p < bjnz ; p++)
                    {
                        // C (i,j) = B (i,j)
                        GB_COPY_B_TO_C (GB_CX (pC + p), Bx, pB + p) ;
                    }
                    ASSERT (pC + ajnz + bjnz == pC_end) ;
                    #endif
                }

            }
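            // A small trace of the Method10 merge above, for illustration:
            //      Ai = [ 0 3 5 ],  Bi = [ 3 4 ]
            //      iA = 0 < iB = 3:  C(0,j) = A(0,j)           (advance pA)
            //      iA = 3 = iB = 3:  C(3,j) = A(3,j) + B(3,j)  (advance both)
            //      iA = 5 > iB = 4:  C(4,j) = B(4,j)           (advance pB)
            //      B(:,j) exhausted: C(5,j) = A(5,j)           (cleanup copy)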
            else if (sparse_mask_is_easy)
            {

                //--------------------------------------------------------------
                // special case: M is present and very easy to use
                //--------------------------------------------------------------

                //      ------------------------------------------
                //      C       <M>=        A       +       B
                //      ------------------------------------------
                //      sparse  sparse      sparse          sparse
                //      sparse  sparse      sparse          full
                //      sparse  sparse      full            sparse
                //      sparse  sparse      full            full

                // A and B are sparse, hypersparse, or full, not bitmap.
                ASSERT (!A_is_bitmap) ;
                ASSERT (!B_is_bitmap) ;
                ASSERT (Mask_struct) ;

                int64_t mjnz = pM_end - pM ;    // nnz (M (:,j))

                #if defined ( GB_PHASE_1_OF_2 )

                // M is structural, and sparse or hypersparse, so every entry
                // in the mask is guaranteed to appear in A+B.  The symbolic
                // count is thus trivial.

                cjnz = mjnz ;

                #else

                // copy the pattern into C(:,j)
                int64_t pC_start = pC ;
                int64_t pM_start = pM ;
                memcpy (Ci + pC, Mi + pM, mjnz * sizeof (int64_t)) ;
                int64_t pA_offset = pA_start - iA_first ;
                int64_t pB_offset = pB_start - iB_first ;

                if (adense && B == M)
                {

                    //----------------------------------------------------------
                    // Method11: A dense, B == M
                    //----------------------------------------------------------

                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < mjnz ; p++)
                    {
                        int64_t pM = p + pM_start ;
                        int64_t pC = p + pC_start ;
                        int64_t i = Mi [pM] ;
                        ASSERT (GB_mcast (Mx, pM, msize)) ;
                        ASSERT (GBI (Ai, pA_offset + i, vlen) == i) ;
                        ASSERT (GBI (Bi, pM, vlen) == i) ;
                        GB_GETA (aij, Ax, pA_offset + i) ;
                        GB_GETB (bij, Bx, pM) ;
                        GB_BINOP (GB_CX (pC), aij, bij, i, j) ;
                    }

                }
                else if (bdense && A == M)
                {

                    //----------------------------------------------------------
                    // Method12: B dense, A == M
                    //----------------------------------------------------------

                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < mjnz ; p++)
                    {
                        int64_t pM = p + pM_start ;
                        int64_t pC = p + pC_start ;
                        int64_t i = Mi [pM] ;
                        ASSERT (GB_mcast (Mx, pM, msize)) ;
                        ASSERT (GBI (Ai, pM, vlen) == i) ;
                        ASSERT (GBI (Bi, pB_offset + i, vlen) == i) ;
                        GB_GETA (aij, Ax, pM) ;
                        GB_GETB (bij, Bx, pB_offset + i) ;
                        GB_BINOP (GB_CX (pC), aij, bij, i, j) ;
                    }

                }
                else // (A == M) && (B == M)
                {

                    //----------------------------------------------------------
                    // Method13: A == M == B: all three matrices the same
                    //----------------------------------------------------------

                    GB_PRAGMA_SIMD_VECTORIZE
                    for (int64_t p = 0 ; p < mjnz ; p++)
                    {
                        int64_t pM = p + pM_start ;
                        int64_t pC = p + pC_start ;
                        #if GB_OP_IS_SECOND
                        GB_GETB (t, Bx, pM) ;
                        #else
                        GB_GETA (t, Ax, pM) ;
                        #endif
                        GB_BINOP (GB_CX (pC), t, t, Mi [pM], j) ;
                    }
                }
                #endif

            }
            else if (M_is_sparse_or_hyper)
            {

                //--------------------------------------------------------------
                // Method14: C and M are sparse or hypersparse
                //--------------------------------------------------------------

                //      ------------------------------------------
                //      C       <M>=        A       +       B
                //      ------------------------------------------
                //      sparse  sparse      sparse          sparse      (*)
                //      sparse  sparse      sparse          bitmap      (*)
                //      sparse  sparse      sparse          full        (*)
                //      sparse  sparse      bitmap          sparse      (*)
                //      sparse  sparse      bitmap          bitmap
                //      sparse  sparse      bitmap          full
                //      sparse  sparse      full            sparse      (*)
                //      sparse  sparse      full            bitmap
                //      sparse  sparse      full            full

                // (*) This method is inefficient when M is sparse with many
                // entries and either A or B is sparse: it traverses all
                // entries in the sparse M and, for a sparse A or B, does a
                // binary search for each one.  It is designed for the case
                // when M is much sparser than A and/or B.  If M has many
                // entries, the mask M should instead be ignored and C=A+B
                // computed with no mask.  The test for when to use M here
                // should ignore A or B if they are bitmap or full.

                // A and B can have any sparsity pattern (hypersparse, sparse,
                // bitmap, or full).
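                // A rough cost model for the loop below: each of the mjnz
                // entries of M(:,j) costs O(1) when A (or B) is dense,
                // bitmap, full, or aliased to M, and O(log (ajnz)) (or
                // O(log (bjnz))) when a binary search is needed, for a total
                // of about O(mjnz * (log (ajnz) + log (bjnz))) in the worst
                // case.  This is a win only when M(:,j) is much sparser than
                // A(:,j)+B(:,j).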
                for ( ; pM < pM_end ; pM++)
                {

                    //----------------------------------------------------------
                    // get M(i,j) for A(i,j) + B(i,j)
                    //----------------------------------------------------------

                    int64_t i = Mi [pM] ;
                    bool mij = GB_mcast (Mx, pM, msize) ;
                    if (!mij) continue ;

                    //----------------------------------------------------------
                    // get A(i,j)
                    //----------------------------------------------------------

                    bool afound ;
                    if (adense)
                    {
                        // A is dense, bitmap, or full; use quick lookup
                        pA = pA_start + (i - iA_first) ;
                        afound = GBB (Ab, pA) ;
                    }
                    else if (A == M)
                    {
                        // A is aliased to M
                        pA = pM ;
                        afound = true ;
                    }
                    else
                    {
                        // A is sparse; use binary search.  This is slow
                        // unless M is very sparse compared with A.
                        int64_t apright = pA_end - 1 ;
                        GB_BINARY_SEARCH (i, Ai, pA, apright, afound) ;
                    }

                    ASSERT (GB_IMPLIES (afound, GBI (Ai, pA, vlen) == i)) ;

                    //----------------------------------------------------------
                    // get B(i,j)
                    //----------------------------------------------------------

                    bool bfound ;
                    if (bdense)
                    {
                        // B is dense, bitmap, or full; use quick lookup
                        pB = pB_start + (i - iB_first) ;
                        bfound = GBB (Bb, pB) ;
                    }
                    else if (B == M)
                    {
                        // B is aliased to M
                        pB = pM ;
                        bfound = true ;
                    }
                    else
                    {
                        // B is sparse; use binary search.  This is slow
                        // unless M is very sparse compared with B.
                        int64_t bpright = pB_end - 1 ;
                        GB_BINARY_SEARCH (i, Bi, pB, bpright, bfound) ;
                    }

                    ASSERT (GB_IMPLIES (bfound, GBI (Bi, pB, vlen) == i)) ;

                    //----------------------------------------------------------
                    // C(i,j) = A(i,j) + B(i,j)
                    //----------------------------------------------------------

                    if (afound && bfound)
                    {
                        // C (i,j) = A (i,j) + B (i,j)
                        #if defined ( GB_PHASE_1_OF_2 )
                        cjnz++ ;
                        #else
                        Ci [pC] = i ;
                        GB_GETA (aij, Ax, pA) ;
                        GB_GETB (bij, Bx, pB) ;
                        GB_BINOP (GB_CX (pC), aij, bij, i, j) ;
                        pC++ ;
                        #endif
                    }
                    else if (afound)
                    {
                        // C (i,j) = A (i,j)
                        #if defined ( GB_PHASE_1_OF_2 )
                        cjnz++ ;
                        #else
                        Ci [pC] = i ;
                        GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                        pC++ ;
                        #endif
                    }
                    else if (bfound)
                    {
                        // C (i,j) = B (i,j)
                        #if defined ( GB_PHASE_1_OF_2 )
                        cjnz++ ;
                        #else
                        Ci [pC] = i ;
                        GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                        pC++ ;
                        #endif
                    }
                }

                #if defined ( GB_PHASE_2_OF_2 )
                ASSERT (pC == pC_end) ;
                #endif

            }
            else
            {

                //--------------------------------------------------------------
                // M is bitmap or full, for either C<M>=A+B or C<!M>=A+B
                //--------------------------------------------------------------

                //      ------------------------------------------
                //      C       <M>=        A       +       B
                //      ------------------------------------------
                //      sparse  bitmap      sparse          sparse
                //      sparse  full        sparse          sparse

                //      ------------------------------------------
                //      C       <!M>=       A       +       B
                //      ------------------------------------------
                //      sparse  bitmap      sparse          sparse
                //      sparse  full        sparse          sparse

                // This method is very efficient for any mask, and should
                // always be used if M is bitmap or full, even if the mask
                // must also be applied later in GB_mask or GB_accum_mask.
                // Exploiting the mask here adds no extra search time, and it
                // reduces the size of C on output.

                // GB_GET_MIJ: get M(i,j) where M is bitmap or full
                #undef  GB_GET_MIJ
                #define GB_GET_MIJ(i)                                         \
                    int64_t pM = pM_start + i ;                               \
                    bool mij = GBB (Mb, pM) && GB_mcast (Mx, pM, msize) ;     \
                    if (Mask_comp) mij = !mij ;

                // A and B are sparse or hypersparse, not bitmap or full, but
                // individual vectors of A and B might have all entries
                // present (adense and/or bdense).
                ASSERT (A_is_sparse || A_is_hyper) ;
                ASSERT (B_is_sparse || B_is_hyper) ;

                int64_t pM_start = j * vlen ;
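                // For example, with M bitmap and Mask_comp true,
                // GB_GET_MIJ (i) tests Mb [j*vlen + i] (is M(i,j) present?)
                // and GB_mcast of Mx [j*vlen + i] (is it nonzero?), then
                // complements the result, so C keeps the entries of A+B
                // where M(i,j) is not present or is zero.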
                if (adense && bdense)
                {

                    //----------------------------------------------------------
                    // Method15: A(:,j) and B(:,j) dense, M bitmap/full
                    //----------------------------------------------------------

                    ASSERT (ajnz == bjnz) ;
                    ASSERT (iA_first == iB_first) ;
                    ASSERT (iA_last  == iB_last ) ;
                    for (int64_t p = 0 ; p < ajnz ; p++)
                    {
                        int64_t i = p + iA_first ;
                        ASSERT (Ai [pA + p] == i) ;
                        ASSERT (Bi [pB + p] == i) ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = A (i,j) + B (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_GETA (aij, Ax, pA + p) ;
                            GB_GETB (bij, Bx, pB + p) ;
                            GB_BINOP (GB_CX (pC), aij, bij, i, j) ;
                            pC++ ;
                            #endif
                        }
                    }

                }
                else if (ajnz == 0)
                {

                    //----------------------------------------------------------
                    // Method16: A(:,j) is empty, M bitmap/full
                    //----------------------------------------------------------

                    for ( ; pB < pB_end ; pB++)
                    {
                        int64_t i = Bi [pB] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = B (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                            pC++ ;
                            #endif
                        }
                    }

                }
                else if (bjnz == 0)
                {

                    //----------------------------------------------------------
                    // Method17: B(:,j) is empty, M bitmap/full
                    //----------------------------------------------------------

                    for ( ; pA < pA_end ; pA++)
                    {
                        int64_t i = Ai [pA] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = A (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                            pC++ ;
                            #endif
                        }
                    }

                }
                else if (iA_last < iB_first)
                {

                    //----------------------------------------------------------
                    // Method18: last A(:,j) before 1st B(:,j), M bitmap/full
                    //----------------------------------------------------------

                    for ( ; pA < pA_end ; pA++)
                    {
                        int64_t i = Ai [pA] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = A (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                            pC++ ;
                            #endif
                        }
                    }

                    for ( ; pB < pB_end ; pB++)
                    {
                        int64_t i = Bi [pB] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = B (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                            pC++ ;
                            #endif
                        }
                    }

                }
                else if (iB_last < iA_first)
                {

                    //----------------------------------------------------------
                    // Method19: last B(:,j) before 1st A(:,j), M bitmap/full
                    //----------------------------------------------------------

                    for ( ; pB < pB_end ; pB++)
                    {
                        int64_t i = Bi [pB] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = B (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                            pC++ ;
                            #endif
                        }
                    }

                    for ( ; pA < pA_end ; pA++)
                    {
                        int64_t i = Ai [pA] ;
                        GB_GET_MIJ (i) ;
                        if (mij)
                        {
                            // C (i,j) = A (i,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = i ;
                            GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                            pC++ ;
                            #endif
                        }
                    }

                }
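                // Methods 16 to 19 above are the masked counterparts of
                // Methods 04 to 07, and Method20 below is the masked
                // counterpart of the Method10 merge: the control flow is the
                // same, but GB_GET_MIJ filters each entry before it is
                // counted (phase 1) or written to C (phase 2).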
                else
                {

                    //----------------------------------------------------------
                    // Method20: merge A(:,j) and B(:,j), M bitmap/full
                    //----------------------------------------------------------

                    while (pA < pA_end && pB < pB_end)
                    {
                        int64_t iA = Ai [pA] ;
                        int64_t iB = Bi [pB] ;
                        if (iA < iB)
                        {
                            GB_GET_MIJ (iA) ;
                            if (mij)
                            {
                                // C (iA,j) = A (iA,j)
                                #if defined ( GB_PHASE_1_OF_2 )
                                cjnz++ ;
                                #else
                                Ci [pC] = iA ;
                                GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                                pC++ ;
                                #endif
                            }
                            pA++ ;
                        }
                        else if (iA > iB)
                        {
                            GB_GET_MIJ (iB) ;
                            if (mij)
                            {
                                // C (iB,j) = B (iB,j)
                                #if defined ( GB_PHASE_1_OF_2 )
                                cjnz++ ;
                                #else
                                Ci [pC] = iB ;
                                GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                                pC++ ;
                                #endif
                            }
                            pB++ ;
                        }
                        else
                        {
                            GB_GET_MIJ (iB) ;
                            if (mij)
                            {
                                // C (i,j) = A (i,j) + B (i,j)
                                #if defined ( GB_PHASE_1_OF_2 )
                                cjnz++ ;
                                #else
                                Ci [pC] = iB ;
                                GB_GETA (aij, Ax, pA) ;
                                GB_GETB (bij, Bx, pB) ;
                                GB_BINOP (GB_CX (pC), aij, bij, iB, j) ;
                                pC++ ;
                                #endif
                            }
                            pA++ ;
                            pB++ ;
                        }
                    }

                    //----------------------------------------------------------
                    // A(:,j) or B(:,j) may have entries left, but not both
                    //----------------------------------------------------------

                    for ( ; pA < pA_end ; pA++)
                    {
                        int64_t iA = Ai [pA] ;
                        GB_GET_MIJ (iA) ;
                        if (mij)
                        {
                            // C (iA,j) = A (iA,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = iA ;
                            GB_COPY_A_TO_C (GB_CX (pC), Ax, pA) ;
                            pC++ ;
                            #endif
                        }
                    }

                    for ( ; pB < pB_end ; pB++)
                    {
                        int64_t iB = Bi [pB] ;
                        GB_GET_MIJ (iB) ;
                        if (mij)
                        {
                            // C (iB,j) = B (iB,j)
                            #if defined ( GB_PHASE_1_OF_2 )
                            cjnz++ ;
                            #else
                            Ci [pC] = iB ;
                            GB_COPY_B_TO_C (GB_CX (pC), Bx, pB) ;
                            pC++ ;
                            #endif
                        }
                    }
                }
            }

            //------------------------------------------------------------------
            // final count of nnz (C (:,j))
            //------------------------------------------------------------------

            #if defined ( GB_PHASE_1_OF_2 )
            if (fine_task)
            {
                TaskList [taskid].pC = cjnz ;
            }
            else
            {
                Cp [k] = cjnz ;
            }
            #endif
        }
    }
}
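// A note on how the two phases fit together (a sketch of the surrounding
// pipeline; the cumulative sum is done by the caller, in GB_add_phase1):
// in phase 1, coarse tasks store the count of each C(:,j) in Cp [k], and
// fine tasks store their partial counts in TaskList [taskid].pC.  A
// cumulative sum of these counts then produces the final Cp and the pC
// offsets that phase 2 above uses to write each slice of C.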