1 //------------------------------------------------------------------------------
2 // GB_AxB_saxpy3_cumsum: finalize nnz(C(:,j)) and find cumulative sum of Cp
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // phase3: fine tasks finalize their computation nnz(C(:,j))
11 // phase4: cumulative sum of C->p
12 
13 #include "GB_AxB_saxpy3.h"
14 
GB_AxB_saxpy3_cumsum(GrB_Matrix C,GB_saxpy3task_struct * SaxpyTasks,int nfine,double chunk,int nthreads,GB_Context Context)15 void GB_AxB_saxpy3_cumsum
16 (
17     GrB_Matrix C,               // finalize C->p
18     GB_saxpy3task_struct *SaxpyTasks, // list of tasks, and workspace
19     int nfine,                  // number of fine tasks
20     double chunk,               // chunk size
21     int nthreads,               // number of threads
22     GB_Context Context
23 )
24 {
25 
26     //--------------------------------------------------------------------------
27     // get C
28     //--------------------------------------------------------------------------
29 
30     ASSERT (!GB_IS_BITMAP (C)) ;
31     ASSERT (!GB_IS_FULL (C)) ;
32     int64_t *restrict Cp = C->p ;
33     const int64_t cvlen = C->vlen ;
34     const int64_t cnvec = C->nvec ;
35 
36     //==========================================================================
37     // phase3: count nnz(C(:,j)) for fine tasks
38     //==========================================================================
39 
40     int taskid ;
41     #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
42     for (taskid = 0 ; taskid < nfine ; taskid++)
43     {
44 
45         //----------------------------------------------------------------------
46         // get the task descriptor
47         //----------------------------------------------------------------------
48 
49         // int64_t kk = SaxpyTasks [taskid].vector ;
50         int64_t hash_size = SaxpyTasks [taskid].hsize ;
51         bool use_Gustavson = (hash_size == cvlen) ;
52         int team_size = SaxpyTasks [taskid].team_size ;
53         int leader    = SaxpyTasks [taskid].leader ;
54         int my_teamid = taskid - leader ;
55         int64_t my_cjnz = 0 ;
56 
57         if (use_Gustavson)
58         {
59 
60             //------------------------------------------------------------------
61             // phase3: fine Gustavson task, C=A*B, C<M>=A*B, or C<!M>=A*B
62             //------------------------------------------------------------------
63 
64             // Hf [i] == 2 if C(i,j) is an entry in C(:,j)
65 
66             int8_t *restrict Hf ;
67             Hf = (int8_t *restrict) SaxpyTasks [taskid].Hf ;
68             int64_t istart, iend ;
69             GB_PARTITION (istart, iend, cvlen, my_teamid, team_size) ;
70             for (int64_t i = istart ; i < iend ; i++)
71             {
72                 if (Hf [i] == 2)
73                 {
74                     my_cjnz++ ;
75                 }
76             }
77 
78         }
79         else
80         {
81 
82             //------------------------------------------------------------------
83             // phase3: fine hash task, C=A*B, C<M>=A*B, or C<!M>=A*B
84             //------------------------------------------------------------------
85 
86             // (Hf [hash] & 3) == 2 if C(i,j) is an entry in C(:,j),
87             // and the index i of the entry is (Hf [hash] >> 2) - 1.
88 
89             int64_t *restrict Hf = (int64_t *restrict) SaxpyTasks [taskid].Hf ;
90             int64_t mystart, myend ;
91             GB_PARTITION (mystart, myend, hash_size, my_teamid, team_size) ;
92             for (int64_t hash = mystart ; hash < myend ; hash++)
93             {
94                 if ((Hf [hash] & 3) == 2)
95                 {
96                     my_cjnz++ ;
97                 }
98             }
99         }
100 
101         SaxpyTasks [taskid].my_cjnz = my_cjnz ;   // count my nnz(C(:,j))
102     }
103 
104     //==========================================================================
105     // phase4: compute Cp with cumulative sum
106     //==========================================================================
107 
108     //--------------------------------------------------------------------------
109     // sum nnz (C (:,j)) for fine tasks
110     //--------------------------------------------------------------------------
111 
112     // SaxpyTasks [taskid].my_cjnz is the # of unique entries found in C(:,j) by
113     // that task.  Sum these terms to compute total # of entries in C(:,j).
114 
115     for (taskid = 0 ; taskid < nfine ; taskid++)
116     {
117         int64_t kk = SaxpyTasks [taskid].vector ;
118         Cp [kk] = 0 ;
119     }
120 
121     for (taskid = 0 ; taskid < nfine ; taskid++)
122     {
123         int64_t kk = SaxpyTasks [taskid].vector ;
124         int64_t my_cjnz = SaxpyTasks [taskid].my_cjnz ;
125         Cp [kk] += my_cjnz ;
126         ASSERT (my_cjnz <= cvlen) ;
127     }
128 
129     //--------------------------------------------------------------------------
130     // cumulative sum for Cp (fine and coarse tasks)
131     //--------------------------------------------------------------------------
132 
133     // Cp [kk] is now nnz (C (:,j)), for all vectors j, whether computed by
134     // fine tasks or coarse tasks, and where j == GBH (Bh, kk)
135 
136     int nth = GB_nthreads (cnvec, chunk, nthreads) ;
137     GB_cumsum (Cp, cnvec, &(C->nvec_nonempty), nth, Context) ;
138 
139     //--------------------------------------------------------------------------
140     // cumulative sum of nnz (C (:,j)) for each team of fine tasks
141     //--------------------------------------------------------------------------
142 
143     int64_t cjnz_sum = 0 ;
144     for (taskid = 0 ; taskid < nfine ; taskid++)
145     {
146         if (taskid == SaxpyTasks [taskid].leader)
147         {
148             cjnz_sum = 0 ;
149             // also find the max (C (:,j)) for any fine hash tasks
150             int64_t hash_size = SaxpyTasks [taskid].hsize ;
151             bool use_Gustavson = (hash_size == cvlen) ;
152             if (!use_Gustavson)
153             {
154                 int64_t kk = SaxpyTasks [taskid].vector ;
155                 int64_t cjnz = Cp [kk+1] - Cp [kk] ;
156             }
157         }
158         int64_t my_cjnz = SaxpyTasks [taskid].my_cjnz ;
159         SaxpyTasks [taskid].my_cjnz = cjnz_sum ;
160         cjnz_sum += my_cjnz ;
161     }
162 }
163 
164