1 //------------------------------------------------------------------------------
2 // GB_AxB_dot3_one_slice: slice the entries and vectors of a single matrix
3 //------------------------------------------------------------------------------
4
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7
8 //------------------------------------------------------------------------------
9
10 // Constructs a set of tasks that slice a single input matrix M. This function
11 // is currently only used by GB_AxB_dot3, to slice the mask matrix M, which has
12 // the same pattern as the output matrix C. However, this function is a very
13 // simple general-purpose method for slicing a single matrix. It could be
14 // called GB_one_slice, and used for other methods as well.
15
16 #define GB_FREE_WORK \
17 { \
18 GB_WERK_POP (Coarse, int64_t) ; \
19 }
20
21 #define GB_FREE_ALL \
22 { \
23 GB_FREE_WORK ; \
24 GB_FREE_WERK (&TaskList, TaskList_size) ; \
25 }
26
27 #include "GB_mxm.h"
28
29 #define GB_NTASKS_PER_THREAD 256
30
31 //------------------------------------------------------------------------------
32 // GB_AxB_dot3_one_slice
33 //------------------------------------------------------------------------------
34
GB_AxB_dot3_one_slice(GB_task_struct ** p_TaskList,size_t * p_TaskList_size,int * p_ntasks,int * p_nthreads,const GrB_Matrix M,GB_Context Context)35 GrB_Info GB_AxB_dot3_one_slice
36 (
37 // output:
38 GB_task_struct **p_TaskList, // array of structs
39 size_t *p_TaskList_size, // size of TaskList
40 int *p_ntasks, // # of tasks constructed
41 int *p_nthreads, // # of threads to use
42 // input:
43 const GrB_Matrix M, // matrix to slice
44 GB_Context Context
45 )
46 {
47
48 //--------------------------------------------------------------------------
49 // check inputs
50 //--------------------------------------------------------------------------
51
52 ASSERT (p_TaskList != NULL) ;
53 ASSERT (p_TaskList_size != NULL) ;
54 ASSERT (p_ntasks != NULL) ;
55 ASSERT (p_nthreads != NULL) ;
56 ASSERT_MATRIX_OK (M, "M for dot3_one_slice", GB0) ;
57
58 // the pattern of M is not accessed
59 ASSERT (GB_ZOMBIES_OK (M)) ;
60 ASSERT (GB_JUMBLED_OK (M)) ;
61 ASSERT (GB_PENDING_OK (M)) ;
62 ASSERT (!GB_IS_BITMAP (M)) ;
63 ASSERT (!GB_IS_FULL (M)) ;
64
65 (*p_TaskList ) = NULL ;
66 (*p_TaskList_size) = 0 ;
67 (*p_ntasks ) = 0 ;
68 (*p_nthreads ) = 1 ;
69
70 //--------------------------------------------------------------------------
71 // determine # of threads to use
72 //--------------------------------------------------------------------------
73
74 GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
75
76 //--------------------------------------------------------------------------
77 // get M
78 //--------------------------------------------------------------------------
79
80 const int64_t *restrict Mp = M->p ;
81 const int64_t mnz = GB_NNZ_HELD (M) ;
82 const int64_t mnvec = M->nvec ;
83 const int64_t mvlen = M->vlen ;
84
85 //--------------------------------------------------------------------------
86 // allocate the initial TaskList
87 //--------------------------------------------------------------------------
88
89 GB_WERK_DECLARE (Coarse, int64_t) ;
90 int ntasks1 = 0 ;
91 int nthreads = GB_nthreads (mnz, chunk, nthreads_max) ;
92 GB_task_struct *restrict TaskList = NULL ; size_t TaskList_size = 0 ;
93 int max_ntasks = 0 ;
94 int ntasks = 0 ;
95 int ntasks0 = (nthreads == 1) ? 1 : (GB_NTASKS_PER_THREAD * nthreads) ;
96 GB_REALLOC_TASK_WERK (TaskList, ntasks0, max_ntasks) ;
97
98 //--------------------------------------------------------------------------
99 // check for quick return for a single task
100 //--------------------------------------------------------------------------
101
102 if (mnvec == 0 || ntasks0 == 1)
103 {
104 // construct a single coarse task that does all the work
105 TaskList [0].kfirst = 0 ;
106 TaskList [0].klast = mnvec-1 ;
107 (*p_TaskList ) = TaskList ;
108 (*p_TaskList_size) = TaskList_size ;
109 (*p_ntasks ) = (mnvec == 0) ? 0 : 1 ;
110 (*p_nthreads ) = 1 ;
111 return (GrB_SUCCESS) ;
112 }
113
114 //--------------------------------------------------------------------------
115 // determine # of threads and tasks
116 //--------------------------------------------------------------------------
117
118 double target_task_size = ((double) mnz) / (double) (ntasks0) ;
119 target_task_size = GB_IMAX (target_task_size, chunk) ;
120 ntasks1 = ((double) mnz) / target_task_size ;
121 ntasks1 = GB_IMAX (ntasks1, 1) ;
122
123 //--------------------------------------------------------------------------
124 // slice the work into coarse tasks
125 //--------------------------------------------------------------------------
126
127 GB_WERK_PUSH (Coarse, ntasks1 + 1, int64_t) ;
128 if (Coarse == NULL)
129 {
130 // out of memory
131 GB_FREE_ALL ;
132 return (GrB_OUT_OF_MEMORY) ;
133 }
134 GB_pslice (Coarse, Mp, mnvec, ntasks1, false) ;
135
136 //--------------------------------------------------------------------------
137 // construct all tasks, both coarse and fine
138 //--------------------------------------------------------------------------
139
140 for (int t = 0 ; t < ntasks1 ; t++)
141 {
142
143 //----------------------------------------------------------------------
144 // coarse task operates on M (:, k:klast)
145 //----------------------------------------------------------------------
146
147 int64_t k = Coarse [t] ;
148 int64_t klast = Coarse [t+1] - 1 ;
149
150 if (k >= mnvec)
151 {
152
153 //------------------------------------------------------------------
154 // all tasks have been constructed
155 //------------------------------------------------------------------
156
157 break ;
158
159 }
160 else if (k < klast)
161 {
162
163 //------------------------------------------------------------------
164 // coarse task has 2 or more vectors
165 //------------------------------------------------------------------
166
167 // This is a non-empty coarse-grain task that does two or more
168 // entire vectors of M and C, vectors k:klast, inclusive.
169 GB_REALLOC_TASK_WERK (TaskList, ntasks + 1, max_ntasks) ;
170 TaskList [ntasks].kfirst = k ;
171 TaskList [ntasks].klast = klast ;
172 ntasks++ ;
173
174 }
175 else
176 {
177
178 //------------------------------------------------------------------
179 // coarse task has 0 or 1 vectors
180 //------------------------------------------------------------------
181
182 // As a coarse-grain task, this task is empty or does a single
183 // vector, k. Vector k must be removed from the work done by this
184 // and any other coarse-grain task, and split into one or more
185 // fine-grain tasks.
186
187 for (int tt = t ; tt < ntasks1 ; tt++)
188 {
189 // remove k from the initial slice tt
190 if (Coarse [tt] == k)
191 {
192 // remove k from task tt
193 Coarse [tt] = k+1 ;
194 }
195 else
196 {
197 // break, k not in task tt
198 break ;
199 }
200 }
201
202 //------------------------------------------------------------------
203 // determine the # of fine-grain tasks to create for vector k
204 //------------------------------------------------------------------
205
206 int64_t mknz = (Mp == NULL) ? mvlen : (Mp [k+1] - Mp [k]) ;
207 int nfine = ((double) mknz) / target_task_size ;
208 nfine = GB_IMAX (nfine, 1) ;
209
210 // make the TaskList bigger, if needed
211 GB_REALLOC_TASK_WERK (TaskList, ntasks + nfine, max_ntasks) ;
212
213 //------------------------------------------------------------------
214 // create the fine-grain tasks
215 //------------------------------------------------------------------
216
217 if (nfine == 1)
218 {
219
220 //--------------------------------------------------------------
221 // this is a single coarse task for all of vector k
222 //--------------------------------------------------------------
223
224 TaskList [ntasks].kfirst = k ;
225 TaskList [ntasks].klast = k ;
226 ntasks++ ;
227
228 }
229 else
230 {
231
232 //--------------------------------------------------------------
233 // slice vector M(:,k) into nfine fine tasks
234 //--------------------------------------------------------------
235
236 ASSERT (ntasks < max_ntasks) ;
237
238 for (int tfine = 0 ; tfine < nfine ; tfine++)
239 {
240
241 // this fine task operates on vector M(:,k)
242 TaskList [ntasks].kfirst = k ;
243 TaskList [ntasks].klast = -1 ;
244
245 // slice M(:,k) for this task
246 int64_t p1, p2 ;
247 GB_PARTITION (p1, p2, mknz, tfine, nfine) ;
248 int64_t pM_start = GBP (Mp, k, mvlen) ;
249 int64_t pM = pM_start + p1 ;
250 int64_t pM_end = pM_start + p2 ;
251 TaskList [ntasks].pM = pM ;
252 TaskList [ntasks].pM_end = pM_end ;
253
254 ASSERT (TaskList [ntasks].pM <= TaskList [ntasks].pM_end) ;
255 ntasks++ ;
256 }
257 }
258 }
259 }
260
261 ASSERT (ntasks <= max_ntasks) ;
262
263 //--------------------------------------------------------------------------
264 // free workspace and return result
265 //--------------------------------------------------------------------------
266
267 GB_FREE_WORK ;
268 (*p_TaskList ) = TaskList ;
269 (*p_TaskList_size) = TaskList_size ;
270 (*p_ntasks ) = ntasks ;
271 (*p_nthreads ) = nthreads ;
272 return (GrB_SUCCESS) ;
273 }
274
275