//------------------------------------------------------------------------------
// GB_AxB_saxpy3_template: C=A*B, C<M>=A*B, or C<!M>=A*B via saxpy3 method
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// GB_AxB_saxpy3_template.c computes C=A*B for any semiring and matrix types,
// where C is sparse or hypersparse.

#include "GB_unused.h"

//------------------------------------------------------------------------------
// template code for C=A*B via the saxpy3 method
//------------------------------------------------------------------------------

{

    // double ttt = omp_get_wtime ( ) ;

    //--------------------------------------------------------------------------
    // get the chunk size
    //--------------------------------------------------------------------------

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;

    //--------------------------------------------------------------------------
    // get M, A, B, and C
    //--------------------------------------------------------------------------

    int64_t *restrict Cp = C->p ;
    // const int64_t *restrict Ch = C->h ;
    const int64_t cvlen = C->vlen ;
    const int64_t cnvec = C->nvec ;

    const int64_t *restrict Bp = B->p ;
    const int64_t *restrict Bh = B->h ;
    const int8_t  *restrict Bb = B->b ;
    const int64_t *restrict Bi = B->i ;
    const GB_BTYPE *restrict Bx = (GB_BTYPE *) (B_is_pattern ? NULL : B->x) ;
    const int64_t bvlen = B->vlen ;
    const bool B_jumbled = B->jumbled ;
    const bool B_is_sparse = GB_IS_SPARSE (B) ;
    const bool B_is_hyper = GB_IS_HYPERSPARSE (B) ;
    const bool B_is_bitmap = GB_IS_BITMAP (B) ;
    const bool B_is_sparse_or_hyper = B_is_sparse || B_is_hyper ;

    const int64_t *restrict Ap = A->p ;
    const int64_t *restrict Ah = A->h ;
    const int8_t  *restrict Ab = A->b ;
    const int64_t *restrict Ai = A->i ;
    const int64_t anvec = A->nvec ;
    const int64_t avlen = A->vlen ;
    const bool A_is_sparse = GB_IS_SPARSE (A) ;
    const bool A_is_hyper = GB_IS_HYPERSPARSE (A) ;
    const bool A_is_bitmap = GB_IS_BITMAP (A) ;
    const GB_ATYPE *restrict Ax = (GB_ATYPE *) (A_is_pattern ? NULL : A->x) ;
    const bool A_jumbled = A->jumbled ;
    const bool A_ok_for_binary_search =
        ((A_is_sparse || A_is_hyper) && !A_jumbled) ;

    #if ( !GB_NO_MASK )
    const int64_t *restrict Mp = M->p ;
    const int64_t *restrict Mh = M->h ;
    const int8_t  *restrict Mb = M->b ;
    const int64_t *restrict Mi = M->i ;
    const GB_void *restrict Mx = (GB_void *) (Mask_struct ? NULL : (M->x)) ;
    const bool M_is_hyper = GB_IS_HYPERSPARSE (M) ;
    const bool M_is_bitmap = GB_IS_BITMAP (M) ;
    const bool M_jumbled = GB_JUMBLED (M) ;
    size_t msize = M->type->size ;
    int64_t mnvec = M->nvec ;
    int64_t mvlen = M->vlen ;
    #endif

    //==========================================================================
    // phase2: numeric work for fine tasks
    //==========================================================================

    // Coarse tasks: nothing to do in phase2.
    // Fine tasks: compute nnz (C(:,j)), and values in Hx via atomics.
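    // A sketch of the fine-task numeric work (illustrative only; the
    // actual per-case code is in the #include'd templates below): each
    // member of a fine task's team scans its share of B(:,j), given by
    // [pB, pB_end).  For each entry b_kj it scans A(:,k) and accumulates
    // t = a_ik*b_kj into Hx [i] (Gustavson) or Hx [hash] (hash method),
    // using Hf as a per-entry lock/state so that teammates sharing the
    // same workspace can update the same position atomically.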
    int taskid ;
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
    for (taskid = 0 ; taskid < nfine ; taskid++)
    {

        //----------------------------------------------------------------------
        // get the task descriptor
        //----------------------------------------------------------------------

        int64_t kk = SaxpyTasks [taskid].vector ;
        int team_size = SaxpyTasks [taskid].team_size ;
        int64_t hash_size = SaxpyTasks [taskid].hsize ;
        bool use_Gustavson = (hash_size == cvlen) ;
        int64_t pB = SaxpyTasks [taskid].start ;
        int64_t pB_end = SaxpyTasks [taskid].end + 1 ;
        int64_t pleft = 0, pright = anvec-1 ;
        int64_t j = GBH (Bh, kk) ;

        GB_GET_T_FOR_SECONDJ ;

        #if !GB_IS_ANY_PAIR_SEMIRING
        GB_CTYPE *restrict Hx = (GB_CTYPE *) SaxpyTasks [taskid].Hx ;
        #endif

        #if GB_IS_PLUS_FC32_MONOID
        float  *restrict Hx_real = (float *) Hx ;
        float  *restrict Hx_imag = Hx_real + 1 ;
        #elif GB_IS_PLUS_FC64_MONOID
        double *restrict Hx_real = (double *) Hx ;
        double *restrict Hx_imag = Hx_real + 1 ;
        #endif

        if (use_Gustavson)
        {

            //------------------------------------------------------------------
            // phase2: fine Gustavson task
            //------------------------------------------------------------------

            // Hf [i] == 0: unlocked, i has not been seen in C(:,j).
            //      Hx [i] is not initialized.
            //      M(i,j) is 0, or M is not present.
            //      if M: Hf [i] stays equal to 0 (or 3 if locked).
            //      if !M, or no M: C(i,j) is a new entry seen for the
            //      first time.

            // Hf [i] == 1: unlocked, i has not been seen in C(:,j).
            //      Hx [i] is not initialized.  M is present.
            //      M(i,j) is 1.  (either M or !M case)
            //      if M: C(i,j) is a new entry seen for the first time.
            //      if !M: Hf [i] stays equal to 1 (or 3 if locked).

            // Hf [i] == 2: unlocked, i has been seen in C(:,j).
            //      Hx [i] is initialized.  This case is independent of M.

            // Hf [i] == 3: locked.  Hx [i] cannot be accessed.

            int8_t *restrict
                Hf = (int8_t *restrict) SaxpyTasks [taskid].Hf ;

            #if ( GB_NO_MASK )
            {
                // phase2: fine Gustavson task, C(:,j)=A*B(:,j)
                #include "GB_AxB_saxpy3_fineGus_phase2.c"
            }
            #elif ( !GB_MASK_COMP )
            {
                // phase2: fine Gustavson task, C(:,j)<M(:,j)>=A*B(:,j)
                #include "GB_AxB_saxpy3_fineGus_M_phase2.c"
            }
            #else
            {
                // phase2: fine Gustavson task, C(:,j)<!M(:,j)>=A*B(:,j)
                #include "GB_AxB_saxpy3_fineGus_notM_phase2.c"
            }
            #endif

        }
        else
        {

            //------------------------------------------------------------------
            // phase2: fine hash task
            //------------------------------------------------------------------

            // Each hash entry Hf [hash] splits into two parts, (h,f).  f
            // is in the 2 least significant bits.  h is 62 bits, and is
            // the 1-based index i of the C(i,j) entry stored at that
            // location in the hash table.

            // If M is present (M or !M), and M(i,j)=1, then (i+1,1)
            // has been inserted into the hash table, in phase0.

            // Given Hf [hash] split into (h,f):

            // h == 0,   f == 0: unlocked and unoccupied.
            //      Note that if f == 0, h must be zero too.

            // h == i+1, f == 1: unlocked, occupied by M(i,j)=1.
            //      C(i,j) has not been seen, or is ignored.
            //      Hx is not initialized.  M is present.
            //      if !M: this entry will be ignored in C.

            // h == i+1, f == 2: unlocked, occupied by C(i,j).
            //      Hx is initialized.  M is no longer relevant.

            // h == (anything), f == 3: locked.
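            // A worked example of the packing (illustrative only): when
            // i = 5 is first seen as an entry of C(:,j), the slot holds
            // Hf [hash] = ((5+1) << 2) | 2 = 26.  Then (26 & 3) == 2
            // marks an occupied, unlocked C entry, and (26 >> 2) - 1 = 5
            // recovers the row index.  Since hash_size is a power of two,
            // hash_bits = hash_size-1 below acts as a mask, so
            // (k & hash_bits) replaces the slower (k % hash_size).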
            int64_t *restrict Hf = (int64_t *restrict) SaxpyTasks [taskid].Hf ;
            int64_t hash_bits = (hash_size-1) ;

            #if ( GB_NO_MASK )
            {

                //--------------------------------------------------------------
                // phase2: fine hash task, C(:,j)=A*B(:,j)
                //--------------------------------------------------------------

                // no mask present, or mask ignored
                #undef  GB_CHECK_MASK_ij
                #include "GB_AxB_saxpy3_fineHash_phase2.c"

            }
            #elif ( !GB_MASK_COMP )
            {

                //--------------------------------------------------------------
                // phase2: fine hash task, C(:,j)<M(:,j)>=A*B(:,j)
                //--------------------------------------------------------------

                GB_GET_M_j ;            // get M(:,j)
                if (M_packed_in_place)
                {
                    // M(:,j) is packed, and thus not scattered into Hf
                    if (M_is_bitmap && Mask_struct)
                    {
                        // M is bitmap and structural
                        const int8_t *restrict Mjb = Mb + pM_start ;
                        #undef  GB_CHECK_MASK_ij
                        #define GB_CHECK_MASK_ij                        \
                            if (!Mjb [i]) continue ;
                        #include "GB_AxB_saxpy3_fineHash_phase2.c"
                    }
                    else
                    {
                        // M is bitmap/dense
                        #undef  GB_CHECK_MASK_ij
                        #define GB_CHECK_MASK_ij                        \
                            const int64_t pM = pM_start + i ;           \
                            GB_GET_M_ij (pM) ;                          \
                            if (!mij) continue ;
                        #include "GB_AxB_saxpy3_fineHash_phase2.c"
                    }
                }
                else
                {
                    // M(:,j) is sparse and scattered into Hf
                    #include "GB_AxB_saxpy3_fineHash_M_phase2.c"
                }

            }
            #else
            {

                //--------------------------------------------------------------
                // phase2: fine hash task, C(:,j)<!M(:,j)>=A*B(:,j)
                //--------------------------------------------------------------

                GB_GET_M_j ;            // get M(:,j)
                if (M_packed_in_place)
                {
                    // M(:,j) is packed, and thus not scattered into Hf
                    if (M_is_bitmap && Mask_struct)
                    {
                        // M is bitmap and structural
                        const int8_t *restrict Mjb = Mb + pM_start ;
                        #undef  GB_CHECK_MASK_ij
                        #define GB_CHECK_MASK_ij                        \
                            if (Mjb [i]) continue ;
                        #include "GB_AxB_saxpy3_fineHash_phase2.c"
                    }
                    else
                    {
                        // M is bitmap/dense
                        #undef  GB_CHECK_MASK_ij
                        #define GB_CHECK_MASK_ij                        \
                            const int64_t pM = pM_start + i ;           \
                            GB_GET_M_ij (pM) ;                          \
                            if (mij) continue ;
                        #include "GB_AxB_saxpy3_fineHash_phase2.c"
                    }
                }
                else
                {
                    // M(:,j) is sparse/hyper and scattered into Hf
                    #include "GB_AxB_saxpy3_fineHash_notM_phase2.c"
                }
            }
            #endif
        }
    }

    // ttt = omp_get_wtime ( ) - ttt ;
    // GB_Global_timing_add (9, ttt) ;
    // ttt = omp_get_wtime ( ) ;

    //==========================================================================
    // phase3/phase4: count nnz(C(:,j)) for fine tasks, cumsum of Cp
    //==========================================================================

    GB_AxB_saxpy3_cumsum (C, SaxpyTasks, nfine, chunk, nthreads, Context) ;

    // ttt = omp_get_wtime ( ) - ttt ;
    // GB_Global_timing_add (10, ttt) ;
    // ttt = omp_get_wtime ( ) ;
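    // For example (illustrative values): if the counts found above are
    // nnz (C(:,j)) = 2, 0, and 3 for the three vectors of C, the
    // cumulative sum gives Cp = [0,2,2,5], so cnz = Cp [cnvec] = 5
    // entries are allocated for Ci and Cx below.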
    //==========================================================================
    // phase5: numeric phase for coarse tasks, gather for fine tasks
    //==========================================================================

    // allocate Ci and Cx
    int64_t cnz = Cp [cnvec] ;
    GrB_Info info = GB_bix_alloc (C, cnz, false, false, true, true, Context) ;
    if (info != GrB_SUCCESS)
    {
        // out of memory
        return (GrB_OUT_OF_MEMORY) ;
    }

    int64_t *restrict Ci = C->i ;
    GB_CTYPE *restrict Cx = (GB_CTYPE *) C->x ;

    // printf ("check Ci size %p %ld\n", C->i, C->i_size) ;
    ASSERT (C->i_size == GB_Global_memtable_size (C->i)) ;

    #if GB_IS_ANY_PAIR_SEMIRING

        // TODO: create C as a constant-value matrix.

        // ANY_PAIR semiring: the result is purely symbolic
        int64_t pC ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (pC = 0 ; pC < cnz ; pC++)
        {
            Cx [pC] = GB_CTYPE_CAST (1, 0) ;
        }

        // Just a precaution; these variables are not used below.  Any
        // attempt to access them will lead to a compile error.
        #define Cx is not used
        #define Hx is not used

        // these semirings have been renamed to ANY_PAIR:
        // EQ_PAIR
        // LAND_PAIR
        // LOR_PAIR
        // MAX_PAIR
        // MIN_PAIR
        // TIMES_PAIR

    #endif

    // ttt = omp_get_wtime ( ) - ttt ;
    // GB_Global_timing_add (11, ttt) ;
    // ttt = omp_get_wtime ( ) ;

    bool C_jumbled = false ;
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) \
        reduction(||:C_jumbled)
    for (taskid = 0 ; taskid < ntasks ; taskid++)
    {

        //----------------------------------------------------------------------
        // get the task descriptor
        //----------------------------------------------------------------------

        #if !GB_IS_ANY_PAIR_SEMIRING
        GB_CTYPE *restrict Hx = (GB_CTYPE *) SaxpyTasks [taskid].Hx ;
        #endif
        int64_t hash_size = SaxpyTasks [taskid].hsize ;
        bool use_Gustavson = (hash_size == cvlen) ;
        bool task_C_jumbled = false ;

        if (taskid < nfine)
        {

            //------------------------------------------------------------------
            // fine task: gather pattern and values
            //------------------------------------------------------------------

            int64_t kk = SaxpyTasks [taskid].vector ;
            int team_size = SaxpyTasks [taskid].team_size ;
            int leader = SaxpyTasks [taskid].leader ;
            int my_teamid = taskid - leader ;
            int64_t pC = Cp [kk] ;

            if (use_Gustavson)
            {

                //--------------------------------------------------------------
                // phase5: fine Gustavson task, C=A*B, C<M>=A*B, or C<!M>=A*B
                //--------------------------------------------------------------

                // Hf [i] == 2 if C(i,j) is an entry in C(:,j)
                int8_t *restrict
                    Hf = (int8_t *restrict) SaxpyTasks [taskid].Hf ;
                int64_t cjnz = Cp [kk+1] - pC ;
                int64_t istart, iend ;
                GB_PARTITION (istart, iend, cvlen, my_teamid, team_size) ;
                if (cjnz == cvlen)
                {
                    // C(:,j) is dense
                    for (int64_t i = istart ; i < iend ; i++)
                    {
                        Ci [pC + i] = i ;
                    }
                    #if !GB_IS_ANY_PAIR_SEMIRING
                    // copy Hx [istart:iend-1] into Cx [pC+istart:pC+iend-1]
                    GB_CIJ_MEMCPY (pC + istart, istart, iend - istart) ;
                    #endif
                }
                else
                {
                    // C(:,j) is sparse
                    pC += SaxpyTasks [taskid].my_cjnz ;
                    for (int64_t i = istart ; i < iend ; i++)
                    {
                        if (Hf [i] == 2)
                        {
                            GB_CIJ_GATHER (pC, i) ;     // Cx [pC] = Hx [i]
                            Ci [pC++] = i ;
                        }
                    }
                }

            }
            else
            {

                //--------------------------------------------------------------
                // phase5: fine hash task, C=A*B, C<M>=A*B, or C<!M>=A*B
                //--------------------------------------------------------------

                // (Hf [hash] & 3) == 2 if C(i,j) is an entry in C(:,j),
                // and the index i of the entry is (Hf [hash] >> 2) - 1.
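                // For example (illustrative, assuming the offsets computed
                // by GB_AxB_saxpy3_cumsum): if a fine task's team of 3
                // members found 4, 1, and 2 entries of C(:,j), the cumsum
                // turns their my_cjnz values into the offsets 0, 4, and 5,
                // so member t starts writing at Cp [kk] + (its offset),
                // with no overlap between teammates.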
                int64_t *restrict
                    Hf = (int64_t *restrict) SaxpyTasks [taskid].Hf ;
                int64_t mystart, myend ;
                GB_PARTITION (mystart, myend, hash_size, my_teamid, team_size) ;
                pC += SaxpyTasks [taskid].my_cjnz ;
                for (int64_t hash = mystart ; hash < myend ; hash++)
                {
                    int64_t hf = Hf [hash] ;
                    if ((hf & 3) == 2)
                    {
                        int64_t i = (hf >> 2) - 1 ;     // found C(i,j) in hash
                        Ci [pC] = i ;
                        GB_CIJ_GATHER (pC, hash) ;      // Cx [pC] = Hx [hash]
                        pC++ ;
                    }
                }
                task_C_jumbled = true ;
            }

        }
        else
        {

            //------------------------------------------------------------------
            // numeric coarse task: compute C(:,kfirst:klast)
            //------------------------------------------------------------------

            int64_t *restrict
                Hf = (int64_t *restrict) SaxpyTasks [taskid].Hf ;
            int64_t kfirst = SaxpyTasks [taskid].start ;
            int64_t klast = SaxpyTasks [taskid].end ;
            int64_t nk = klast - kfirst + 1 ;
            int64_t mark = 2*nk + 1 ;

            if (use_Gustavson)
            {

                //--------------------------------------------------------------
                // phase5: coarse Gustavson task
                //--------------------------------------------------------------

                #if ( GB_NO_MASK )
                {
                    // phase5: coarse Gustavson task, C=A*B
                    #include "GB_AxB_saxpy3_coarseGus_noM_phase5.c"
                }
                #elif ( !GB_MASK_COMP )
                {
                    // phase5: coarse Gustavson task, C<M>=A*B
                    #include "GB_AxB_saxpy3_coarseGus_M_phase5.c"
                }
                #else
                {
                    // phase5: coarse Gustavson task, C<!M>=A*B
                    #include "GB_AxB_saxpy3_coarseGus_notM_phase5.c"
                }
                #endif

            }
            else
            {

                //--------------------------------------------------------------
                // phase5: coarse hash task
                //--------------------------------------------------------------

                int64_t *restrict Hi = SaxpyTasks [taskid].Hi ;
                int64_t hash_bits = (hash_size-1) ;

                #if ( GB_NO_MASK )
                {

                    //----------------------------------------------------------
                    // phase5: coarse hash task, C=A*B
                    //----------------------------------------------------------

                    // no mask present, or mask ignored (see below)
                    #undef  GB_CHECK_MASK_ij
                    #include "GB_AxB_saxpy3_coarseHash_phase5.c"

                }
                #elif ( !GB_MASK_COMP )
                {

                    //----------------------------------------------------------
                    // phase5: coarse hash task, C<M>=A*B
                    //----------------------------------------------------------
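                    // GB_CHECK_MASK_ij is spliced into the innermost loop
                    // of the #include'd template, which looks roughly like
                    // this (a sketch, not the actual template code):
                    //
                    //      for ( ; pA < pA_end ; pA++)
                    //      {
                    //          const int64_t i = Ai [pA] ;
                    //          GB_CHECK_MASK_ij ;  // skip i if masked out
                    //          ... insert/update C(i,j) in the hash table
                    //      }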
                    if (M_packed_in_place)
                    {
                        // M is packed, and thus not scattered into Hf
                        if (M_is_bitmap && Mask_struct)
                        {
                            // M is bitmap and structural
                            #define GB_MASK_IS_BITMAP_AND_STRUCTURAL
                            #undef  GB_CHECK_MASK_ij
                            #define GB_CHECK_MASK_ij                    \
                                if (!Mjb [i]) continue ;
                            #include "GB_AxB_saxpy3_coarseHash_phase5.c"
                        }
                        else
                        {
                            // M is bitmap/dense
                            #undef  GB_CHECK_MASK_ij
                            #define GB_CHECK_MASK_ij                    \
                                const int64_t pM = pM_start + i ;       \
                                GB_GET_M_ij (pM) ;                      \
                                if (!mij) continue ;
                            #include "GB_AxB_saxpy3_coarseHash_phase5.c"
                        }
                    }
                    else
                    {
                        // M is sparse and scattered into Hf
                        #include "GB_AxB_saxpy3_coarseHash_M_phase5.c"
                    }

                }
                #else
                {

                    //----------------------------------------------------------
                    // phase5: coarse hash task, C<!M>=A*B
                    //----------------------------------------------------------

                    if (M_packed_in_place)
                    {
                        // M is packed, and thus not scattered into Hf
                        if (M_is_bitmap && Mask_struct)
                        {
                            // M is bitmap and structural
                            #define GB_MASK_IS_BITMAP_AND_STRUCTURAL
                            #undef  GB_CHECK_MASK_ij
                            #define GB_CHECK_MASK_ij                    \
                                if (Mjb [i]) continue ;
                            #include "GB_AxB_saxpy3_coarseHash_phase5.c"
                        }
                        else
                        {
                            // M is bitmap/dense
                            #undef  GB_CHECK_MASK_ij
                            #define GB_CHECK_MASK_ij                    \
                                const int64_t pM = pM_start + i ;       \
                                GB_GET_M_ij (pM) ;                      \
                                if (mij) continue ;
                            #include "GB_AxB_saxpy3_coarseHash_phase5.c"
                        }
                    }
                    else
                    {
                        // M is sparse and scattered into Hf
                        #include "GB_AxB_saxpy3_coarseHash_notM_phase5.c"
                    }
                }
                #endif
            }
        }
        C_jumbled = C_jumbled || task_C_jumbled ;
    }

    //--------------------------------------------------------------------------
    // log the state of C->jumbled
    //--------------------------------------------------------------------------

    C->jumbled = C_jumbled ;    // C is jumbled if any task left it jumbled

    // ttt = omp_get_wtime ( ) - ttt ;
    // GB_Global_timing_add (12, ttt) ;

}

#undef Cx
#undef Hx

#undef GB_NO_MASK
#undef GB_MASK_COMP