//------------------------------------------------------------------------------
// GB_AxB_dot3_slice: slice the entries and vectors for C<M>=A'*B
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------
// Constructs a set of tasks that slice the matrix C for C<M>=A'*B.  The matrix
// C has been allocated, and its pattern will be a copy of M, but with some
// entries possibly turned into zombies.  However, on input, the pattern C->i
// does not yet contain the indices, but instead the work required to compute
// each entry in the dot product C(i,j)<M(i,j)> = A(:,i)'*B(:,j).
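
// For example (hypothetical values): if C will have cnz = 4 entries whose dot
// products require 3, 1, 2, and 5 flops respectively, then on input
//
//      C->i = [ 3 1 2 5 ]      // per-entry work estimates, not row indices
//
// and the total work to be sliced among the tasks is 3+1+2+5 = 11.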

// The strategy for slicing C and M is like GB_ek_slice, for coarse tasks.
// These coarse tasks differ from the tasks generated by GB_ewise_slice,
// since they may start in the middle of a vector.  If a single entry C(i,j)
// is costly to compute, it may be placed by itself in a single coarse task.
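
// For example (hypothetical values), a coarse task might own the entry range
// pC:pC_end-1 = 10:25 of Ci,Cx, spanning vectors kfirst:klast = 2:7.  The
// task may start in the middle of C(:,2) and end in the middle of C(:,7),
// unlike the coarse tasks of GB_ewise_slice, which start at vector
// boundaries.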

// FUTURE:: Ultra-fine tasks could also be constructed, so that the computation
// of a single entry C(i,j) can be broken into multiple tasks.  The slice of
// A(:,i) and B(:,j) would use GB_slice_vector, where no mask would be used.

#define GB_FREE_WORK                            \
{                                               \
    GB_WERK_POP (Coarse, int64_t) ;             \
}

#define GB_FREE_ALL                             \
{                                               \
    GB_FREE_WORK ;                              \
    GB_FREE_WERK (&TaskList, TaskList_size) ;   \
}

#include "GB_mxm.h"
#include "GB_search_for_vector_template.c"

//------------------------------------------------------------------------------
// GB_AxB_dot3_slice
//------------------------------------------------------------------------------

GrB_Info GB_AxB_dot3_slice
(
    // output:
    GB_task_struct **p_TaskList,    // array of structs
    size_t *p_TaskList_size,        // size of TaskList
    int *p_ntasks,                  // # of tasks constructed
    int *p_nthreads,                // # of threads to use
    // input:
    const GrB_Matrix C,             // matrix to slice
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (p_TaskList != NULL) ;
    ASSERT (p_TaskList_size != NULL) ;
    ASSERT (p_ntasks != NULL) ;
    ASSERT (p_nthreads != NULL) ;
    // ASSERT_MATRIX_OK (C, ...) cannot be done, since C->i holds the work
    // needed to compute each entry, not the row index itself.

    // C is always constructed as sparse or hypersparse, not full, since it
    // must accommodate zombies
    ASSERT (!GB_IS_FULL (C)) ;
    ASSERT (!GB_IS_BITMAP (C)) ;

    (*p_TaskList  ) = NULL ;
    (*p_TaskList_size) = 0 ;
    (*p_ntasks    ) = 0 ;
    (*p_nthreads  ) = 1 ;

    //--------------------------------------------------------------------------
    // determine # of threads to use
    //--------------------------------------------------------------------------

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;

    //--------------------------------------------------------------------------
    // get C
    //--------------------------------------------------------------------------

    const int64_t *restrict Cp = C->p ;
    int64_t *restrict Cwork = C->i ;
    const int64_t cnvec = C->nvec ;
    const int64_t cvlen = C->vlen ;
    const int64_t cnz = GB_NNZ_HELD (C) ;

    //--------------------------------------------------------------------------
    // compute the cumulative sum of the work
    //--------------------------------------------------------------------------

    // FUTURE:: handle possible int64_t overflow

    int nthreads = GB_nthreads (cnz, chunk, nthreads_max) ;
    GB_cumsum (Cwork, cnz, NULL, nthreads, Context) ;
    double total_work = (double) Cwork [cnz] ;
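
    // Continuing the hypothetical example above (Cwork = [ 3 1 2 5 ] on
    // input), the cumulative sum overwrites Cwork in place:
    //
    //      Cwork = [ 0 3 4 6 11 ]      // Cwork [cnz] = total_work = 11
    //
    // so Cwork [p+1] - Cwork [p] is the work for entry p, and Cwork [p] is
    // the total work for all entries before p.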

    //--------------------------------------------------------------------------
    // allocate the initial TaskList
    //--------------------------------------------------------------------------

    GB_WERK_DECLARE (Coarse, int64_t) ;
    int ntasks1 = 0 ;
    nthreads = GB_nthreads (total_work, chunk, nthreads_max) ;
    GB_task_struct *restrict TaskList = NULL ; size_t TaskList_size = 0 ;
    int max_ntasks = 0 ;
    int ntasks = 0 ;
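    // Request roughly 32 tasks per thread (when using more than one thread),
    // so that tasks with uneven amounts of work can be balanced across the
    // threads.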
    int ntasks0 = (nthreads == 1) ? 1 : (32 * nthreads) ;
    GB_REALLOC_TASK_WERK (TaskList, ntasks0, max_ntasks) ;

    //--------------------------------------------------------------------------
    // check for quick return for a single task
    //--------------------------------------------------------------------------

    if (cnvec == 0 || ntasks0 == 1)
    {
        // construct a single coarse task that does all the work
        TaskList [0].kfirst = 0 ;
        TaskList [0].klast  = cnvec-1 ;
        TaskList [0].pC = 0 ;
        TaskList [0].pC_end  = cnz ;
        (*p_TaskList  ) = TaskList ;
        (*p_TaskList_size) = TaskList_size ;
        (*p_ntasks    ) = (cnvec == 0) ? 0 : 1 ;
        (*p_nthreads  ) = 1 ;
        return (GrB_SUCCESS) ;
    }

    //--------------------------------------------------------------------------
    // determine # of threads and tasks
    //--------------------------------------------------------------------------

    double target_task_size = total_work / (double) (ntasks0) ;
    target_task_size = GB_IMAX (target_task_size, chunk) ;
    ntasks1 = (int) (total_work / target_task_size) ;
    ntasks1 = GB_IMIN (ntasks1, cnz) ;
    ntasks1 = GB_IMAX (ntasks1, 1) ;
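
    // For example (hypothetical values): with total_work = 1e6, ntasks0 = 256
    // (32 tasks per thread for 8 threads), and chunk = 4096, the target is
    // max (1e6/256, 4096) = 4096, so ntasks1 = min ((int) (1e6/4096), cnz)
    // = min (244, cnz) coarse tasks are created.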

    //--------------------------------------------------------------------------
    // slice the work into coarse tasks
    //--------------------------------------------------------------------------

    GB_WERK_PUSH (Coarse, ntasks1 + 1, int64_t) ;
    if (Coarse == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }
    GB_pslice (Coarse, Cwork, cnz, ntasks1, false) ;
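
    // Continuing the hypothetical example (Cwork = [ 0 3 4 6 11 ] after the
    // cumulative sum, ntasks1 = 2), GB_pslice splits the entries where the
    // cumulative work reaches about total_work/2 = 5.5, which might give
    //
    //      Coarse = [ 0 3 4 ] ;    // task 0 owns entries 0:2, task 1 owns 3
    //
    // so the single costly entry 3 (5 units of work) lands in a task by
    // itself, as described at the top of this file.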

    //--------------------------------------------------------------------------
    // construct all tasks, both coarse and fine
    //--------------------------------------------------------------------------

    for (int t = 0 ; t < ntasks1 ; t++)
    {

        //----------------------------------------------------------------------
        // coarse task operates on C (:, kfirst:klast)
        //----------------------------------------------------------------------

        int64_t pfirst = Coarse [t] ;
        int64_t plast  = Coarse [t+1] - 1 ;

        if (pfirst <= plast)
        {
            // find the first vector of the slice for task t: the vector that
            // owns the entry Ci [pfirst] and Cx [pfirst].
            int64_t kfirst = GB_search_for_vector (pfirst, Cp, 0, cnvec,
                cvlen) ;

            // find the last vector of the slice for task t: the vector that
            // owns the entry Ci [plast] and Cx [plast].
            int64_t klast = GB_search_for_vector (plast, Cp, kfirst, cnvec,
                cvlen) ;

            // construct a coarse task that computes Ci,Cx [pfirst:plast].
            // These entries appear in C(:,kfirst:klast), but this task does
            // not compute all of C(:,kfirst); it computes just the subset
            // starting at Ci,Cx [pfirst].  The task computes all of the
            // vectors C(:,kfirst+1:klast-1), and only the leading part of
            // the last vector C(:,klast), ending at Ci,Cx [plast], which is
            // Ci,Cx [pC_end-1].  This slice strategy is the same as
            // GB_ek_slice.
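            //
            // For example (hypothetical): if Cp = [ 0 4 4 9 ] with cnvec = 3
            // and this task owns entries pfirst:plast = 2:6, then
            // GB_search_for_vector finds kfirst = 0 (Cp [0] <= 2 < Cp [1])
            // and klast = 2 (Cp [2] <= 6 < Cp [3]): the task computes the
            // tail of C(:,0), skips the empty vector C(:,1), and computes
            // the start of C(:,2).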

            GB_REALLOC_TASK_WERK (TaskList, ntasks + 1, max_ntasks) ;
            TaskList [ntasks].kfirst = kfirst ;
            TaskList [ntasks].klast  = klast ;
            ASSERT (kfirst <= klast) ;
            TaskList [ntasks].pC     = pfirst ;
            TaskList [ntasks].pC_end = plast + 1 ;
            ntasks++ ;

        }
        else
        {
            // This task is empty: a single entry C(i,j) is so costly that it
            // spans one or more whole task boundaries, so the coarse task
            // that owns it is doing too much work.
            // FUTURE:: Use ultra-fine tasks here instead, and split the work
            // for computing the single dot product C(i,j) amongst several
            // ultra-fine tasks.
            ;
        }
    }

    ASSERT (ntasks <= max_ntasks) ;

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORK ;
    (*p_TaskList  ) = TaskList ;
    (*p_TaskList_size) = TaskList_size ;
    (*p_ntasks    ) = ntasks ;
    (*p_nthreads  ) = nthreads ;
    return (GrB_SUCCESS) ;
}
