1 //------------------------------------------------------------------------------
2 // GB_AxB_dot3_one_slice: slice the entries and vectors of a single matrix
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // Constructs a set of tasks that slice a single input matrix M.  This function
11 // is currently only used by GB_AxB_dot3, to slice the mask matrix M, which has
12 // the same pattern as the output matrix C.  However, this function is a very
13 // simple general-purpose method for slicing a single matrix.  It could be
14 // called GB_one_slice, and used for other methods as well.
15 
16 #define GB_FREE_WORK                            \
17 {                                               \
18     GB_WERK_POP (Coarse, int64_t) ;             \
19 }
20 
21 #define GB_FREE_ALL                             \
22 {                                               \
23     GB_FREE_WORK ;                              \
24     GB_FREE_WERK (&TaskList, TaskList_size) ;   \
25 }
26 
27 #include "GB_mxm.h"
28 
29 #define GB_NTASKS_PER_THREAD 256
30 
31 //------------------------------------------------------------------------------
32 // GB_AxB_dot3_one_slice
33 //------------------------------------------------------------------------------
34 
GB_AxB_dot3_one_slice(GB_task_struct ** p_TaskList,size_t * p_TaskList_size,int * p_ntasks,int * p_nthreads,const GrB_Matrix M,GB_Context Context)35 GrB_Info GB_AxB_dot3_one_slice
36 (
37     // output:
38     GB_task_struct **p_TaskList,    // array of structs
39     size_t *p_TaskList_size,        // size of TaskList
40     int *p_ntasks,                  // # of tasks constructed
41     int *p_nthreads,                // # of threads to use
42     // input:
43     const GrB_Matrix M,             // matrix to slice
44     GB_Context Context
45 )
46 {
47 
48     //--------------------------------------------------------------------------
49     // check inputs
50     //--------------------------------------------------------------------------
51 
52     ASSERT (p_TaskList != NULL) ;
53     ASSERT (p_TaskList_size != NULL) ;
54     ASSERT (p_ntasks != NULL) ;
55     ASSERT (p_nthreads != NULL) ;
56     ASSERT_MATRIX_OK (M, "M for dot3_one_slice", GB0) ;
57 
58     // the pattern of M is not accessed
59     ASSERT (GB_ZOMBIES_OK (M)) ;
60     ASSERT (GB_JUMBLED_OK (M)) ;
61     ASSERT (GB_PENDING_OK (M)) ;
62     ASSERT (!GB_IS_BITMAP (M)) ;
63     ASSERT (!GB_IS_FULL (M)) ;
64 
65     (*p_TaskList  ) = NULL ;
66     (*p_TaskList_size) = 0 ;
67     (*p_ntasks    ) = 0 ;
68     (*p_nthreads  ) = 1 ;
69 
70     //--------------------------------------------------------------------------
71     // determine # of threads to use
72     //--------------------------------------------------------------------------
73 
74     GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
75 
76     //--------------------------------------------------------------------------
77     // get M
78     //--------------------------------------------------------------------------
79 
80     const int64_t *restrict Mp = M->p ;
81     const int64_t mnz = GB_NNZ_HELD (M) ;
82     const int64_t mnvec = M->nvec ;
83     const int64_t mvlen = M->vlen ;
84 
85     //--------------------------------------------------------------------------
86     // allocate the initial TaskList
87     //--------------------------------------------------------------------------
88 
89     GB_WERK_DECLARE (Coarse, int64_t) ;
90     int ntasks1 = 0 ;
91     int nthreads = GB_nthreads (mnz, chunk, nthreads_max) ;
92     GB_task_struct *restrict TaskList = NULL ; size_t TaskList_size = 0 ;
93     int max_ntasks = 0 ;
94     int ntasks = 0 ;
95     int ntasks0 = (nthreads == 1) ? 1 : (GB_NTASKS_PER_THREAD * nthreads) ;
96     GB_REALLOC_TASK_WERK (TaskList, ntasks0, max_ntasks) ;
97 
98     //--------------------------------------------------------------------------
99     // check for quick return for a single task
100     //--------------------------------------------------------------------------
101 
102     if (mnvec == 0 || ntasks0 == 1)
103     {
104         // construct a single coarse task that does all the work
105         TaskList [0].kfirst = 0 ;
106         TaskList [0].klast  = mnvec-1 ;
107         (*p_TaskList  ) = TaskList ;
108         (*p_TaskList_size) = TaskList_size ;
109         (*p_ntasks    ) = (mnvec == 0) ? 0 : 1 ;
110         (*p_nthreads  ) = 1 ;
111         return (GrB_SUCCESS) ;
112     }
113 
114     //--------------------------------------------------------------------------
115     // determine # of threads and tasks
116     //--------------------------------------------------------------------------
117 
118     double target_task_size = ((double) mnz) / (double) (ntasks0) ;
119     target_task_size = GB_IMAX (target_task_size, chunk) ;
120     ntasks1 = ((double) mnz) / target_task_size ;
121     ntasks1 = GB_IMAX (ntasks1, 1) ;
122 
123     //--------------------------------------------------------------------------
124     // slice the work into coarse tasks
125     //--------------------------------------------------------------------------
126 
127     GB_WERK_PUSH (Coarse, ntasks1 + 1, int64_t) ;
128     if (Coarse == NULL)
129     {
130         // out of memory
131         GB_FREE_ALL ;
132         return (GrB_OUT_OF_MEMORY) ;
133     }
134     GB_pslice (Coarse, Mp, mnvec, ntasks1, false) ;
135 
136     //--------------------------------------------------------------------------
137     // construct all tasks, both coarse and fine
138     //--------------------------------------------------------------------------
139 
140     for (int t = 0 ; t < ntasks1 ; t++)
141     {
142 
143         //----------------------------------------------------------------------
144         // coarse task operates on M (:, k:klast)
145         //----------------------------------------------------------------------
146 
147         int64_t k = Coarse [t] ;
148         int64_t klast = Coarse [t+1] - 1 ;
149 
150         if (k >= mnvec)
151         {
152 
153             //------------------------------------------------------------------
154             // all tasks have been constructed
155             //------------------------------------------------------------------
156 
157             break ;
158 
159         }
160         else if (k < klast)
161         {
162 
163             //------------------------------------------------------------------
164             // coarse task has 2 or more vectors
165             //------------------------------------------------------------------
166 
167             // This is a non-empty coarse-grain task that does two or more
168             // entire vectors of M and C, vectors k:klast, inclusive.
169             GB_REALLOC_TASK_WERK (TaskList, ntasks + 1, max_ntasks) ;
170             TaskList [ntasks].kfirst = k ;
171             TaskList [ntasks].klast  = klast ;
172             ntasks++ ;
173 
174         }
175         else
176         {
177 
178             //------------------------------------------------------------------
179             // coarse task has 0 or 1 vectors
180             //------------------------------------------------------------------
181 
182             // As a coarse-grain task, this task is empty or does a single
183             // vector, k.  Vector k must be removed from the work done by this
184             // and any other coarse-grain task, and split into one or more
185             // fine-grain tasks.
186 
187             for (int tt = t ; tt < ntasks1 ; tt++)
188             {
189                 // remove k from the initial slice tt
190                 if (Coarse [tt] == k)
191                 {
192                     // remove k from task tt
193                     Coarse [tt] = k+1 ;
194                 }
195                 else
196                 {
197                     // break, k not in task tt
198                     break ;
199                 }
200             }
201 
202             //------------------------------------------------------------------
203             // determine the # of fine-grain tasks to create for vector k
204             //------------------------------------------------------------------
205 
206             int64_t mknz = (Mp == NULL) ? mvlen : (Mp [k+1] - Mp [k]) ;
207             int nfine = ((double) mknz) / target_task_size ;
208             nfine = GB_IMAX (nfine, 1) ;
209 
210             // make the TaskList bigger, if needed
211             GB_REALLOC_TASK_WERK (TaskList, ntasks + nfine, max_ntasks) ;
212 
213             //------------------------------------------------------------------
214             // create the fine-grain tasks
215             //------------------------------------------------------------------
216 
217             if (nfine == 1)
218             {
219 
220                 //--------------------------------------------------------------
221                 // this is a single coarse task for all of vector k
222                 //--------------------------------------------------------------
223 
224                 TaskList [ntasks].kfirst = k ;
225                 TaskList [ntasks].klast  = k ;
226                 ntasks++ ;
227 
228             }
229             else
230             {
231 
232                 //--------------------------------------------------------------
233                 // slice vector M(:,k) into nfine fine tasks
234                 //--------------------------------------------------------------
235 
236                 ASSERT (ntasks < max_ntasks) ;
237 
238                 for (int tfine = 0 ; tfine < nfine ; tfine++)
239                 {
240 
241                     // this fine task operates on vector M(:,k)
242                     TaskList [ntasks].kfirst = k ;
243                     TaskList [ntasks].klast  = -1 ;
244 
245                     // slice M(:,k) for this task
246                     int64_t p1, p2 ;
247                     GB_PARTITION (p1, p2, mknz, tfine, nfine) ;
248                     int64_t pM_start = GBP (Mp, k, mvlen) ;
249                     int64_t pM     = pM_start + p1 ;
250                     int64_t pM_end = pM_start + p2 ;
251                     TaskList [ntasks].pM     = pM ;
252                     TaskList [ntasks].pM_end = pM_end ;
253 
254                     ASSERT (TaskList [ntasks].pM <= TaskList [ntasks].pM_end) ;
255                     ntasks++ ;
256                 }
257             }
258         }
259     }
260 
261     ASSERT (ntasks <= max_ntasks) ;
262 
263     //--------------------------------------------------------------------------
264     // free workspace and return result
265     //--------------------------------------------------------------------------
266 
267     GB_FREE_WORK ;
268     (*p_TaskList  ) = TaskList ;
269     (*p_TaskList_size) = TaskList_size ;
270     (*p_ntasks    ) = ntasks ;
271     (*p_nthreads  ) = nthreads ;
272     return (GrB_SUCCESS) ;
273 }
274 
275