1 //------------------------------------------------------------------------------
2 // GB_AxB_saxpy3_template: C=A*B, C<M>=A*B, or C<!M>=A*B via saxpy3 method
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // GB_AxB_saxpy3_template.c computes C=A*B for any semiring and matrix types,
11 // where C is sparse or hypersparse.
12 
13 #include "GB_unused.h"
14 
15 //------------------------------------------------------------------------------
16 // template code for C=A*B via the saxpy3 method
17 //------------------------------------------------------------------------------
18 
19 {
20 
21 // double ttt = omp_get_wtime ( ) ;
22 
23     //--------------------------------------------------------------------------
24     // get the chunk size
25     //--------------------------------------------------------------------------
26 
27     GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
28 
29     //--------------------------------------------------------------------------
30     // get M, A, B, and C
31     //--------------------------------------------------------------------------
32 
33     int64_t *restrict Cp = C->p ;
34     // const int64_t *restrict Ch = C->h ;
35     const int64_t cvlen = C->vlen ;
36     const int64_t cnvec = C->nvec ;
37 
38     const int64_t *restrict Bp = B->p ;
39     const int64_t *restrict Bh = B->h ;
40     const int8_t  *restrict Bb = B->b ;
41     const int64_t *restrict Bi = B->i ;
42     const GB_BTYPE *restrict Bx = (GB_BTYPE *) (B_is_pattern ? NULL : B->x) ;
43     const int64_t bvlen = B->vlen ;
44     const bool B_jumbled = B->jumbled ;
45     const bool B_is_sparse = GB_IS_SPARSE (B) ;
46     const bool B_is_hyper = GB_IS_HYPERSPARSE (B) ;
47     const bool B_is_bitmap = GB_IS_BITMAP (B) ;
48     const bool B_is_sparse_or_hyper = B_is_sparse || B_is_hyper ;
49 
50     const int64_t *restrict Ap = A->p ;
51     const int64_t *restrict Ah = A->h ;
52     const int8_t  *restrict Ab = A->b ;
53     const int64_t *restrict Ai = A->i ;
54     const int64_t anvec = A->nvec ;
55     const int64_t avlen = A->vlen ;
56     const bool A_is_sparse = GB_IS_SPARSE (A) ;
57     const bool A_is_hyper = GB_IS_HYPERSPARSE (A) ;
58     const bool A_is_bitmap = GB_IS_BITMAP (A) ;
59     const GB_ATYPE *restrict Ax = (GB_ATYPE *) (A_is_pattern ? NULL : A->x) ;
60     const bool A_jumbled = A->jumbled ;
61     const bool A_ok_for_binary_search =
62         ((A_is_sparse || A_is_hyper) && !A_jumbled) ;
63 
64     #if ( !GB_NO_MASK )
65     const int64_t *restrict Mp = M->p ;
66     const int64_t *restrict Mh = M->h ;
67     const int8_t  *restrict Mb = M->b ;
68     const int64_t *restrict Mi = M->i ;
69     const GB_void *restrict Mx = (GB_void *) (Mask_struct ? NULL : (M->x)) ;
70     const bool M_is_hyper = GB_IS_HYPERSPARSE (M) ;
71     const bool M_is_bitmap = GB_IS_BITMAP (M) ;
72     const bool M_jumbled = GB_JUMBLED (M) ;
73     size_t msize = M->type->size ;
74     int64_t mnvec = M->nvec ;
75     int64_t mvlen = M->vlen ;
76     #endif
77 
78     //==========================================================================
79     // phase2: numeric work for fine tasks
80     //==========================================================================
81 
82     // Coarse tasks: nothing to do in phase2.
83     // Fine tasks: compute nnz (C(:,j)), and values in Hx via atomics.
84 
85     int taskid ;
86     #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
87     for (taskid = 0 ; taskid < nfine ; taskid++)
88     {
89 
90         //----------------------------------------------------------------------
91         // get the task descriptor
92         //----------------------------------------------------------------------
93 
94         int64_t kk = SaxpyTasks [taskid].vector ;
95         int team_size = SaxpyTasks [taskid].team_size ;
96         int64_t hash_size = SaxpyTasks [taskid].hsize ;
97         bool use_Gustavson = (hash_size == cvlen) ;
98         int64_t pB     = SaxpyTasks [taskid].start ;
99         int64_t pB_end = SaxpyTasks [taskid].end + 1 ;
100         int64_t pleft = 0, pright = anvec-1 ;
101         int64_t j = GBH (Bh, kk) ;
102 
103         GB_GET_T_FOR_SECONDJ ;
104 
105         #if !GB_IS_ANY_PAIR_SEMIRING
106         GB_CTYPE *restrict Hx = (GB_CTYPE *) SaxpyTasks [taskid].Hx ;
107         #endif
108 
109         #if GB_IS_PLUS_FC32_MONOID
110         float  *restrict Hx_real = (float *) Hx ;
111         float  *restrict Hx_imag = Hx_real + 1 ;
112         #elif GB_IS_PLUS_FC64_MONOID
113         double *restrict Hx_real = (double *) Hx ;
114         double *restrict Hx_imag = Hx_real + 1 ;
115         #endif
116 
117         if (use_Gustavson)
118         {
119 
120             //------------------------------------------------------------------
121             // phase2: fine Gustavson task
122             //------------------------------------------------------------------
123 
124             // Hf [i] == 0: unlocked, i has not been seen in C(:,j).
125             //      Hx [i] is not initialized.
126             //      M(i,j) is 0, or M is not present.
127             //      if M: Hf [i] stays equal to 0 (or 3 if locked)
128             //      if !M, or no M: C(i,j) is a new entry seen for 1st time
129 
130             // Hf [i] == 1: unlocked, i has not been seen in C(:,j).
131             //      Hx [i] is not initialized.  M is present.
132             //      M(i,j) is 1. (either M or !M case)
133             //      if M: C(i,j) is a new entry seen for the first time.
134             //      if !M: Hf [i] stays equal to 1 (or 3 if locked)
135 
136             // Hf [i] == 2: unlocked, i has been seen in C(:,j).
137             //      Hx [i] is initialized.  This case is independent of M.
138 
139             // Hf [i] == 3: locked.  Hx [i] cannot be accessed.
140 
141             int8_t *restrict
142                 Hf = (int8_t *restrict) SaxpyTasks [taskid].Hf ;
143 
144             #if ( GB_NO_MASK )
145             {
146                 // phase2: fine Gustavson task, C(:,j)=A*B(:,j)
147                 #include "GB_AxB_saxpy3_fineGus_phase2.c"
148             }
149             #elif ( !GB_MASK_COMP )
150             {
151                 // phase2: fine Gustavson task, C(:,j)<M(:,j)>=A*B(:,j)
152                 #include "GB_AxB_saxpy3_fineGus_M_phase2.c"
153             }
154             #else
155             {
156                 // phase2: fine Gustavson task, C(:,j)<!M(:,j)>=A*B(:,j)
157                 #include "GB_AxB_saxpy3_fineGus_notM_phase2.c"
158             }
159             #endif
160 
161         }
162         else
163         {
164 
165             //------------------------------------------------------------------
166             // phase2: fine hash task
167             //------------------------------------------------------------------
168 
169             // Each hash entry Hf [hash] splits into two parts, (h,f).  f
170             // is in the 2 least significant bits.  h is 62 bits, and is
171             // the 1-based index i of the C(i,j) entry stored at that
172             // location in the hash table.
173 
174             // If M is present (M or !M), and M(i,j)=1, then (i+1,1)
175             // has been inserted into the hash table, in phase0.
176 
177             // Given Hf [hash] split into (h,f)
178 
179             // h == 0, f == 0: unlocked and unoccupied.
180             //                  note that if f=0, h must be zero too.
181 
182             // h == i+1, f == 1: unlocked, occupied by M(i,j)=1.
183             //                  C(i,j) has not been seen, or is ignored.
184             //                  Hx is not initialized.  M is present.
185             //                  if !M: this entry will be ignored in C.
186 
187             // h == i+1, f == 2: unlocked, occupied by C(i,j).
188             //                  Hx is initialized.  M is no longer
189             //                  relevant.
190 
191             // h == (anything), f == 3: locked.
192 
193             int64_t *restrict Hf = (int64_t *restrict) SaxpyTasks [taskid].Hf ;
194             int64_t hash_bits = (hash_size-1) ;
195 
196             #if ( GB_NO_MASK )
197             {
198 
199                 //--------------------------------------------------------------
200                 // phase2: fine hash task, C(:,j)=A*B(:,j)
201                 //--------------------------------------------------------------
202 
203                 // no mask present, or mask ignored
204                 #undef GB_CHECK_MASK_ij
205                 #include "GB_AxB_saxpy3_fineHash_phase2.c"
206 
207             }
208             #elif ( !GB_MASK_COMP )
209             {
210 
211                 //--------------------------------------------------------------
212                 // phase2: fine hash task, C(:,j)<M(:,j)>=A*B(:,j)
213                 //--------------------------------------------------------------
214 
215                 GB_GET_M_j ;                // get M(:,j)
216                 if (M_packed_in_place)
217                 {
218                     // M(:,j) is packed, and thus not scattered into Hf
219                     if (M_is_bitmap && Mask_struct)
220                     {
221                         // M is bitmap and structural
222                         const int8_t *restrict Mjb = Mb + pM_start ;
223                         #undef  GB_CHECK_MASK_ij
224                         #define GB_CHECK_MASK_ij                        \
225                             if (!Mjb [i]) continue ;
226                         #include "GB_AxB_saxpy3_fineHash_phase2.c"
227                     }
228                     else
229                     {
230                         // M is bitmap/dense
231                         #undef  GB_CHECK_MASK_ij
232                         #define GB_CHECK_MASK_ij                        \
233                             const int64_t pM = pM_start + i ;           \
234                             GB_GET_M_ij (pM) ;                          \
235                             if (!mij) continue ;
236                         #include "GB_AxB_saxpy3_fineHash_phase2.c"
237                     }
238                 }
239                 else
240                 {
241                     // M(:,j) is sparse and scattered into Hf
242                     #include "GB_AxB_saxpy3_fineHash_M_phase2.c"
243                 }
244 
245             }
246             #else
247             {
248 
249                 //--------------------------------------------------------------
250                 // phase2: fine hash task, C(:,j)<!M(:,j)>=A*B(:,j)
251                 //--------------------------------------------------------------
252 
253                 GB_GET_M_j ;                // get M(:,j)
254                 if (M_packed_in_place)
255                 {
256                     // M(:,j) is packed, and thus not scattered into Hf
257                     if (M_is_bitmap && Mask_struct)
258                     {
259                         // M is bitmap and structural
260                         const int8_t *restrict Mjb = Mb + pM_start ;
261                         #undef  GB_CHECK_MASK_ij
262                         #define GB_CHECK_MASK_ij                        \
263                             if (Mjb [i]) continue ;
264                         #include "GB_AxB_saxpy3_fineHash_phase2.c"
265                     }
266                     else
267                     {
268                         // M is bitmap/dense
269                         #undef  GB_CHECK_MASK_ij
270                         #define GB_CHECK_MASK_ij                        \
271                             const int64_t pM = pM_start + i ;           \
272                             GB_GET_M_ij (pM) ;                          \
273                             if (mij) continue ;
274                         #include "GB_AxB_saxpy3_fineHash_phase2.c"
275                     }
276                 }
277                 else
278                 {
279                     // M(:,j) is sparse/hyper and scattered into Hf
280                     #include "GB_AxB_saxpy3_fineHash_notM_phase2.c"
281                 }
282             }
283             #endif
284         }
285     }
286 
287 // ttt = omp_get_wtime ( ) - ttt ;
288 // GB_Global_timing_add (9, ttt) ;
289 // ttt = omp_get_wtime ( ) ;
290 
291     //==========================================================================
292     // phase3/phase4: count nnz(C(:,j)) for fine tasks, cumsum of Cp
293     //==========================================================================
294 
295     GB_AxB_saxpy3_cumsum (C, SaxpyTasks, nfine, chunk, nthreads, Context) ;
296 
297 // ttt = omp_get_wtime ( ) - ttt ;
298 // GB_Global_timing_add (10, ttt) ;
299 // ttt = omp_get_wtime ( ) ;
300 
301     //==========================================================================
302     // phase5: numeric phase for coarse tasks, gather for fine tasks
303     //==========================================================================
304 
305     // allocate Ci and Cx
306     int64_t cnz = Cp [cnvec] ;
307     GrB_Info info = GB_bix_alloc (C, cnz, false, false, true, true, Context) ;
308     if (info != GrB_SUCCESS)
309     {
310         // out of memory
311         return (GrB_OUT_OF_MEMORY) ;
312     }
313 
314     int64_t  *restrict Ci = C->i ;
315     GB_CTYPE *restrict Cx = (GB_CTYPE *) C->x ;
316 
317 //  printf ("check Ci size %p %ld\n", C->i, C->i_size) ;
318     ASSERT (C->i_size == GB_Global_memtable_size (C->i)) ;
319 
320     #if GB_IS_ANY_PAIR_SEMIRING
321 
322         // TODO: create C as a constant-value matrix.
323 
324         // ANY_PAIR semiring: result is purely symbolic
325         int64_t pC ;
326         #pragma omp parallel for num_threads(nthreads) schedule(static)
327         for (pC = 0 ; pC < cnz ; pC++)
328         {
329             Cx [pC] = GB_CTYPE_CAST (1, 0) ;
330         }
331 
332         // Just a precaution; these variables are not used below.  Any attempt
333         // to access them will lead to a compile error.
334         #define Cx is not used
335         #define Hx is not used
336 
337         // these have been renamed to ANY_PAIR:
338         // EQ_PAIR
339         // LAND_PAIR
340         // LOR_PAIR
341         // MAX_PAIR
342         // MIN_PAIR
343         // TIMES_PAIR
344 
345     #endif
346 
347 // ttt = omp_get_wtime ( ) - ttt ;
348 // GB_Global_timing_add (11, ttt) ;
349 // ttt = omp_get_wtime ( ) ;
350 
351     bool C_jumbled = false ;
352     #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1) \
353         reduction(||:C_jumbled)
354     for (taskid = 0 ; taskid < ntasks ; taskid++)
355     {
356 
357         //----------------------------------------------------------------------
358         // get the task descriptor
359         //----------------------------------------------------------------------
360 
361         #if !GB_IS_ANY_PAIR_SEMIRING
362         GB_CTYPE *restrict Hx = (GB_CTYPE *) SaxpyTasks [taskid].Hx ;
363         #endif
364         int64_t hash_size = SaxpyTasks [taskid].hsize ;
365         bool use_Gustavson = (hash_size == cvlen) ;
366         bool task_C_jumbled = false ;
367 
368         if (taskid < nfine)
369         {
370 
371             //------------------------------------------------------------------
372             // fine task: gather pattern and values
373             //------------------------------------------------------------------
374 
375             int64_t kk = SaxpyTasks [taskid].vector ;
376             int team_size = SaxpyTasks [taskid].team_size ;
377             int leader    = SaxpyTasks [taskid].leader ;
378             int my_teamid = taskid - leader ;
379             int64_t pC = Cp [kk] ;
380 
381             if (use_Gustavson)
382             {
383 
384                 //--------------------------------------------------------------
385                 // phase5: fine Gustavson task, C=A*B, C<M>=A*B, or C<!M>=A*B
386                 //--------------------------------------------------------------
387 
388                 // Hf [i] == 2 if C(i,j) is an entry in C(:,j)
389                 int8_t *restrict
390                     Hf = (int8_t *restrict) SaxpyTasks [taskid].Hf ;
391                 int64_t cjnz = Cp [kk+1] - pC ;
392                 int64_t istart, iend ;
393                 GB_PARTITION (istart, iend, cvlen, my_teamid, team_size) ;
394                 if (cjnz == cvlen)
395                 {
396                     // C(:,j) is dense
397                     for (int64_t i = istart ; i < iend ; i++)
398                     {
399                         Ci [pC + i] = i ;
400                     }
401                     #if !GB_IS_ANY_PAIR_SEMIRING
402                     // copy Hx [istart:iend-1] into Cx [pC+istart:pC+iend-1]
403                     GB_CIJ_MEMCPY (pC + istart, istart, iend - istart) ;
404                     #endif
405                 }
406                 else
407                 {
408                     // C(:,j) is sparse
409                     pC += SaxpyTasks [taskid].my_cjnz ;
410                     for (int64_t i = istart ; i < iend ; i++)
411                     {
412                         if (Hf [i] == 2)
413                         {
414                             GB_CIJ_GATHER (pC, i) ; // Cx [pC] = Hx [i]
415                             Ci [pC++] = i ;
416                         }
417                     }
418                 }
419 
420             }
421             else
422             {
423 
424                 //--------------------------------------------------------------
425                 // phase5: fine hash task, C=A*B, C<M>=A*B, C<!M>=A*B
426                 //--------------------------------------------------------------
427 
428                 // (Hf [hash] & 3) == 2 if C(i,j) is an entry in C(:,j),
429                 // and the index i of the entry is (Hf [hash] >> 2) - 1.
430 
431                 int64_t *restrict
432                     Hf = (int64_t *restrict) SaxpyTasks [taskid].Hf ;
433                 int64_t mystart, myend ;
434                 GB_PARTITION (mystart, myend, hash_size, my_teamid, team_size) ;
435                 pC += SaxpyTasks [taskid].my_cjnz ;
436                 for (int64_t hash = mystart ; hash < myend ; hash++)
437                 {
438                     int64_t hf = Hf [hash] ;
439                     if ((hf & 3) == 2)
440                     {
441                         int64_t i = (hf >> 2) - 1 ; // found C(i,j) in hash
442                         Ci [pC] = i ;
443                         GB_CIJ_GATHER (pC, hash) ;  // Cx [pC] = Hx [hash]
444                         pC++ ;
445                     }
446                 }
447                 task_C_jumbled = true ;
448             }
449 
450         }
451         else
452         {
453 
454             //------------------------------------------------------------------
455             // numeric coarse task: compute C(:,kfirst:klast)
456             //------------------------------------------------------------------
457 
458             int64_t *restrict
459                 Hf = (int64_t *restrict) SaxpyTasks [taskid].Hf ;
460             int64_t kfirst = SaxpyTasks [taskid].start ;
461             int64_t klast = SaxpyTasks [taskid].end ;
462             int64_t nk = klast - kfirst + 1 ;
463             int64_t mark = 2*nk + 1 ;
464 
465             if (use_Gustavson)
466             {
467 
468                 //--------------------------------------------------------------
469                 // phase5: coarse Gustavson task
470                 //--------------------------------------------------------------
471 
472                 #if ( GB_NO_MASK )
473                 {
474                     // phase5: coarse Gustavson task, C=A*B
475                     #include "GB_AxB_saxpy3_coarseGus_noM_phase5.c"
476                 }
477                 #elif ( !GB_MASK_COMP )
478                 {
479                     // phase5: coarse Gustavson task, C<M>=A*B
480                     #include "GB_AxB_saxpy3_coarseGus_M_phase5.c"
481                 }
482                 #else
483                 {
484                     // phase5: coarse Gustavson task, C<!M>=A*B
485                     #include "GB_AxB_saxpy3_coarseGus_notM_phase5.c"
486                 }
487                 #endif
488 
489             }
490             else
491             {
492 
493                 //--------------------------------------------------------------
494                 // phase5: coarse hash task
495                 //--------------------------------------------------------------
496 
497                 int64_t *restrict Hi = SaxpyTasks [taskid].Hi ;
498                 int64_t hash_bits = (hash_size-1) ;
499 
500                 #if ( GB_NO_MASK )
501                 {
502 
503                     //----------------------------------------------------------
504                     // phase5: coarse hash task, C=A*B
505                     //----------------------------------------------------------
506 
507                     // no mask present, or mask ignored (see below)
508                     #undef GB_CHECK_MASK_ij
509                     #include "GB_AxB_saxpy3_coarseHash_phase5.c"
510 
511                 }
512                 #elif ( !GB_MASK_COMP )
513                 {
514 
515                     //----------------------------------------------------------
516                     // phase5: coarse hash task, C<M>=A*B
517                     //----------------------------------------------------------
518 
519                     if (M_packed_in_place)
520                     {
521                         // M is packed, and thus not scattered into Hf
522                         if (M_is_bitmap && Mask_struct)
523                         {
524                             // M is bitmap and structural
525                             #define GB_MASK_IS_BITMAP_AND_STRUCTURAL
526                             #undef  GB_CHECK_MASK_ij
527                             #define GB_CHECK_MASK_ij                        \
528                                 if (!Mjb [i]) continue ;
529                             #include "GB_AxB_saxpy3_coarseHash_phase5.c"
530                         }
531                         else
532                         {
533                             // M is bitmap/dense
534                             #undef  GB_CHECK_MASK_ij
535                             #define GB_CHECK_MASK_ij                        \
536                                 const int64_t pM = pM_start + i ;           \
537                                 GB_GET_M_ij (pM) ;                          \
538                                 if (!mij) continue ;
539                             #include "GB_AxB_saxpy3_coarseHash_phase5.c"
540                         }
541                     }
542                     else
543                     {
544                         // M is sparse and scattered into Hf
545                         #include "GB_AxB_saxpy3_coarseHash_M_phase5.c"
546                     }
547 
548                 }
549                 #else
550                 {
551 
552                     //----------------------------------------------------------
553                     // phase5: coarse hash task, C<!M>=A*B
554                     //----------------------------------------------------------
555                     if (M_packed_in_place)
556                     {
557                         // M is packed, and thus not scattered into Hf
558                         if (M_is_bitmap && Mask_struct)
559                         {
560                             // M is bitmap and structural
561                             #define GB_MASK_IS_BITMAP_AND_STRUCTURAL
562                             #undef  GB_CHECK_MASK_ij
563                             #define GB_CHECK_MASK_ij                        \
564                                 if (Mjb [i]) continue ;
565                             #include "GB_AxB_saxpy3_coarseHash_phase5.c"
566                         }
567                         else
568                         {
569                             // M is bitmap/dense
570                             #undef  GB_CHECK_MASK_ij
571                             #define GB_CHECK_MASK_ij                        \
572                                 const int64_t pM = pM_start + i ;           \
573                                 GB_GET_M_ij (pM) ;                          \
574                                 if (mij) continue ;
575                             #include "GB_AxB_saxpy3_coarseHash_phase5.c"
576                         }
577                     }
578                     else
579                     {
580                         // M is sparse and scattered into Hf
581                         #include "GB_AxB_saxpy3_coarseHash_notM_phase5.c"
582                     }
583                 }
584                 #endif
585             }
586         }
587         C_jumbled = C_jumbled || task_C_jumbled ;
588     }
589 
590     //--------------------------------------------------------------------------
591     // log the state of C->jumbled
592     //--------------------------------------------------------------------------
593 
594     C->jumbled = C_jumbled ;    // C is jumbled if any task left it jumbled
595 
596 // ttt = omp_get_wtime ( ) - ttt ;
597 // GB_Global_timing_add (12, ttt) ;
598 
599 }
600 
601 #undef Cx
602 #undef Hx
603 
604 #undef GB_NO_MASK
605 #undef GB_MASK_COMP
606 
607