1 //------------------------------------------------------------------------------
2 // GB_subassign_one_slice: slice the entries and vectors for subassign
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // Constructs a set of tasks to compute C for a subassign method, based on
11 // slicing a single input matrix (M or A).  Fine tasks must also find their
12 // location in their vector C(:,jC).  Currently this method is only used to
13 // slice M, not A.
14 
15 // This method is used by GB_subassign_05, 06n, and 07.  Each of those methods
16 // apply this function to M, but they use TaskList[...].pA and pA_end to
17 // partition the matrix.
18 
19         //  =====================       ==============
20         //  M   cmp rpl acc A   S       method: action
21         //  =====================       ==============
22         //  M   -   -   -   -   -       05:  C(I,J)<M> = x       for M
23         //  M   -   -   +   -   -       07:  C(I,J)<M> += x      for M
24         //  M   -   -   -   A   -       06n: C(I,J)<M> = A       for M
25 
26 // C: not bitmap
27 
28 #include "GB_subassign_methods.h"
29 
30 #undef  GB_FREE_WORK
31 #define GB_FREE_WORK                \
32 {                                   \
33     GB_WERK_POP (Coarse, int64_t) ; \
34 }
35 
36 #undef  GB_FREE_ALL
37 #define GB_FREE_ALL                             \
38 {                                               \
39     GB_FREE_WORK ;                              \
40     GB_FREE_WERK (&TaskList, TaskList_size) ;   \
41 }
42 
43 //------------------------------------------------------------------------------
44 // GB_subassign_one_slice
45 //------------------------------------------------------------------------------
46 
GB_subassign_one_slice(GB_task_struct ** p_TaskList,size_t * p_TaskList_size,int * p_ntasks,int * p_nthreads,const GrB_Matrix C,const GrB_Index * I,const int64_t nI,const int Ikind,const int64_t Icolon[3],const GrB_Index * J,const int64_t nJ,const int Jkind,const int64_t Jcolon[3],const GrB_Matrix M,GB_Context Context)47 GrB_Info GB_subassign_one_slice
48 (
49     // output:
50     GB_task_struct **p_TaskList,    // array of structs
51     size_t *p_TaskList_size,        // size of TaskList
52     int *p_ntasks,                  // # of tasks constructed
53     int *p_nthreads,                // # of threads to use
54     // input:
55     const GrB_Matrix C,             // output matrix C
56     const GrB_Index *I,
57     const int64_t nI,
58     const int Ikind,
59     const int64_t Icolon [3],
60     const GrB_Index *J,
61     const int64_t nJ,
62     const int Jkind,
63     const int64_t Jcolon [3],
64     const GrB_Matrix M,             // matrix to slice
65     GB_Context Context
66 )
67 {
68 
69     //--------------------------------------------------------------------------
70     // check inputs
71     //--------------------------------------------------------------------------
72 
73     ASSERT (p_TaskList != NULL) ;
74     ASSERT (p_ntasks != NULL) ;
75     ASSERT (p_nthreads != NULL) ;
76     ASSERT_MATRIX_OK (C, "C for 1_slice", GB0) ;
77     ASSERT_MATRIX_OK (M, "M for 1_slice", GB0) ;
78 
79     ASSERT (!GB_IS_BITMAP (C)) ;
80 
81     ASSERT (!GB_JUMBLED (C)) ;
82     ASSERT (!GB_JUMBLED (M)) ;
83 
84     (*p_TaskList  ) = NULL ;
85     (*p_ntasks    ) = 0 ;
86     (*p_nthreads  ) = 1 ;
87 
88     //--------------------------------------------------------------------------
89     // determine # of threads to use
90     //--------------------------------------------------------------------------
91 
92     GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
93 
94     //--------------------------------------------------------------------------
95     // get M and C
96     //--------------------------------------------------------------------------
97 
98     const int64_t *restrict Mp = M->p ;
99     const int64_t *restrict Mh = M->h ;
100 //  const int8_t  *restrict Mb = M->b ;
101     const int64_t *restrict Mi = M->i ;
102     const int64_t mnz = GB_NNZ_HELD (M) ;
103     const int64_t mnvec = M->nvec ;
104     const int64_t mvlen = M->vlen ;
105 
106     const int64_t *restrict Cp = C->p ;
107     const int64_t *restrict Ch = C->h ;
108     const int64_t *restrict Ci = C->i ;
109     const bool C_is_hyper = (Ch != NULL) ;
110     const int64_t nzombies = C->nzombies ;
111     const int64_t Cnvec = C->nvec ;
112     const int64_t Cvlen = C->vlen ;
113 
114     //--------------------------------------------------------------------------
115     // allocate the initial TaskList
116     //--------------------------------------------------------------------------
117 
118     GB_WERK_DECLARE (Coarse, int64_t) ;     // size ntasks1+1
119     int ntasks1 = 0 ;
120     int nthreads = GB_nthreads (mnz, chunk, nthreads_max) ;
121     GB_task_struct *restrict TaskList = NULL ; size_t TaskList_size = 0 ;
122     int max_ntasks = 0 ;
123     int ntasks = 0 ;
124     int ntasks0 = (nthreads == 1) ? 1 : (32 * nthreads) ;
125     GB_REALLOC_TASK_WERK (TaskList, ntasks0, max_ntasks) ;
126 
127     //--------------------------------------------------------------------------
128     // check for quick return for a single task
129     //--------------------------------------------------------------------------
130 
131     if (mnvec == 0 || ntasks0 == 1)
132     {
133         // construct a single coarse task that does all the work
134         TaskList [0].kfirst = 0 ;
135         TaskList [0].klast  = mnvec-1 ;
136         (*p_TaskList  ) = TaskList ;
137         (*p_TaskList_size) = TaskList_size ;
138         (*p_ntasks    ) = (mnvec == 0) ? 0 : 1 ;
139         (*p_nthreads  ) = 1 ;
140         return (GrB_SUCCESS) ;
141     }
142 
143     //--------------------------------------------------------------------------
144     // determine # of threads and tasks for the subassign operation
145     //--------------------------------------------------------------------------
146 
147     double target_task_size = ((double) mnz) / (double) (ntasks0) ;
148     target_task_size = GB_IMAX (target_task_size, chunk) ;
149     ntasks1 = ((double) mnz) / target_task_size ;
150     ntasks1 = GB_IMAX (ntasks1, 1) ;
151 
152     //--------------------------------------------------------------------------
153     // slice the work into coarse tasks
154     //--------------------------------------------------------------------------
155 
156     // M may be hypersparse, sparse, bitmap, or full
157     GB_WERK_PUSH (Coarse, ntasks1 + 1, int64_t) ;
158     if (Coarse == NULL)
159     {
160         // out of memory
161         GB_FREE_ALL ;
162         return (GrB_OUT_OF_MEMORY) ;
163     }
164     GB_pslice (Coarse, Mp, mnvec, ntasks1, false) ;
165 
166     //--------------------------------------------------------------------------
167     // construct all tasks, both coarse and fine
168     //--------------------------------------------------------------------------
169 
170     for (int t = 0 ; t < ntasks1 ; t++)
171     {
172 
173         //----------------------------------------------------------------------
174         // coarse task computes C (I, J(k:klast)) = M (I, k:klast)
175         //----------------------------------------------------------------------
176 
177         int64_t k = Coarse [t] ;
178         int64_t klast = Coarse [t+1] - 1 ;
179 
180         if (k >= mnvec)
181         {
182 
183             //------------------------------------------------------------------
184             // all tasks have been constructed
185             //------------------------------------------------------------------
186 
187             break ;
188 
189         }
190         else if (k < klast)
191         {
192 
193             //------------------------------------------------------------------
194             // coarse task has 2 or more vectors
195             //------------------------------------------------------------------
196 
197             // This is a non-empty coarse-grain task that does two or more
198             // entire vectors of M, vectors k:klast, inclusive.
199             GB_REALLOC_TASK_WERK (TaskList, ntasks + 1, max_ntasks) ;
200             TaskList [ntasks].kfirst = k ;
201             TaskList [ntasks].klast  = klast ;
202             ntasks++ ;
203 
204         }
205         else
206         {
207 
208             //------------------------------------------------------------------
209             // coarse task has 0 or 1 vectors
210             //------------------------------------------------------------------
211 
212             // As a coarse-grain task, this task is empty or does a single
213             // vector, k.  Vector k must be removed from the work done by this
214             // and any other coarse-grain task, and split into one or more
215             // fine-grain tasks.
216 
217             for (int tt = t ; tt < ntasks1 ; tt++)
218             {
219                 // remove k from the initial slice tt
220                 if (Coarse [tt] == k)
221                 {
222                     // remove k from task tt
223                     Coarse [tt] = k+1 ;
224                 }
225                 else
226                 {
227                     // break, k not in task tt
228                     break ;
229                 }
230             }
231 
232             //------------------------------------------------------------------
233             // get the vector of C
234             //------------------------------------------------------------------
235 
236             ASSERT (k >= 0 && k < mnvec) ;
237             int64_t j = GBH (Mh, k) ;
238             ASSERT (j >= 0 && j < nJ) ;
239             int64_t GB_LOOKUP_jC ;
240 
241             bool jC_dense = (pC_end - pC_start == Cvlen) ;
242 
243             //------------------------------------------------------------------
244             // determine the # of fine-grain tasks to create for vector k
245             //------------------------------------------------------------------
246 
247             int64_t mknz = (Mp == NULL) ? mvlen : (Mp [k+1] - Mp [k]) ;
248             int nfine = ((double) mknz) / target_task_size ;
249             nfine = GB_IMAX (nfine, 1) ;
250 
251             // make the TaskList bigger, if needed
252             GB_REALLOC_TASK_WERK (TaskList, ntasks + nfine, max_ntasks) ;
253 
254             //------------------------------------------------------------------
255             // create the fine-grain tasks
256             //------------------------------------------------------------------
257 
258             if (nfine == 1)
259             {
260 
261                 //--------------------------------------------------------------
262                 // this is a single coarse task for all of vector k
263                 //--------------------------------------------------------------
264 
265                 TaskList [ntasks].kfirst = k ;
266                 TaskList [ntasks].klast  = k ;
267                 ntasks++ ;
268 
269             }
270             else
271             {
272 
273                 //--------------------------------------------------------------
274                 // slice vector M(:,k) into nfine fine tasks
275                 //--------------------------------------------------------------
276 
277                 ASSERT (ntasks < max_ntasks) ;
278 
279                 for (int tfine = 0 ; tfine < nfine ; tfine++)
280                 {
281 
282                     // this fine task operates on vector M(:,k)
283                     TaskList [ntasks].kfirst = k ;
284                     TaskList [ntasks].klast  = -1 ;
285 
286                     // slice M(:,k) for this task
287                     int64_t p1, p2 ;
288                     GB_PARTITION (p1, p2, mknz, tfine, nfine) ;
289                     int64_t pM_start = GBP (Mp, k, mvlen) ;
290                     int64_t pM     = pM_start + p1 ;
291                     int64_t pM_end = pM_start + p2 ;
292                     TaskList [ntasks].pA     = pM ;
293                     TaskList [ntasks].pA_end = pM_end ;
294 
295                     if (jC_dense)
296                     {
297                         // do not slice C(:,jC) if it is dense
298                         TaskList [ntasks].pC     = pC_start ;
299                         TaskList [ntasks].pC_end = pC_end ;
300                     }
301                     else
302                     {
303                         // find where this task starts and ends in C(:,jC)
304                         int64_t iM_start = GBI (Mi, pM, mvlen) ;
305                         int64_t iC1 = GB_ijlist (I, iM_start, Ikind, Icolon) ;
306                         int64_t iM_end = GBI (Mi, pM_end-1, mvlen) ;
307                         int64_t iC2 = GB_ijlist (I, iM_end, Ikind, Icolon) ;
308 
309                         // If I is an explicit list, it must be already sorted
310                         // in ascending order, and thus iC1 <= iC2.  If I is
311                         // GB_ALL or GB_STRIDE with inc >= 0, then iC1 < iC2.
312                         // But if inc < 0, then iC1 > iC2.  iC_start and iC_end
313                         // are used for a binary search bracket, so iC_start <=
314                         // iC_end must hold.
315                         int64_t iC_start = GB_IMIN (iC1, iC2) ;
316                         int64_t iC_end   = GB_IMAX (iC1, iC2) ;
317 
318                         // this task works on Ci,Cx [pC:pC_end-1]
319                         int64_t pleft = pC_start ;
320                         int64_t pright = pC_end - 1 ;
321                         bool found, is_zombie ;
322                         GB_SPLIT_BINARY_SEARCH_ZOMBIE (iC_start, Ci,
323                             pleft, pright, found, nzombies, is_zombie) ;
324                         TaskList [ntasks].pC = pleft ;
325 
326                         pleft = pC_start ;
327                         pright = pC_end - 1 ;
328                         GB_SPLIT_BINARY_SEARCH_ZOMBIE (iC_end, Ci,
329                             pleft, pright, found, nzombies, is_zombie) ;
330                         TaskList [ntasks].pC_end = (found) ? (pleft+1) : pleft ;
331                     }
332 
333                     ASSERT (TaskList [ntasks].pA <= TaskList [ntasks].pA_end) ;
334                     ASSERT (TaskList [ntasks].pC <= TaskList [ntasks].pC_end) ;
335                     ntasks++ ;
336                 }
337             }
338         }
339     }
340 
341     ASSERT (ntasks <= max_ntasks) ;
342 
343     //--------------------------------------------------------------------------
344     // free workspace and return result
345     //--------------------------------------------------------------------------
346 
347     GB_FREE_WORK ;
348     (*p_TaskList  ) = TaskList ;
349     (*p_TaskList_size) = TaskList_size ;
350     (*p_ntasks    ) = ntasks ;
351     (*p_nthreads  ) = nthreads ;
352     return (GrB_SUCCESS) ;
353 }
354 
355