1 //------------------------------------------------------------------------------
2 // GB_ewise_slice: slice the entries and vectors for an ewise operation
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10 // Constructs a set of tasks to compute C, for an element-wise operation
11 // (GB_add, GB_emult, and GB_mask) that operates on two input matrices,
12 // C=op(A,B).  The mask is ignored for computing where to slice the work, but
13 // it is sliced once the location has been found.
14 
15 // M, A, B: any sparsity structure (hypersparse, sparse, bitmap, or full).
16 // C: constructed as sparse or hypersparse in the caller.
17 
// GB_FREE_WORK frees only the internal workspace (the Coarse task-boundary
// werkspace and the Cwork per-vector work estimates).  The TaskList output is
// left intact, so this macro is safe to use on the successful return path.
#define GB_FREE_WORK                            \
{                                               \
    GB_WERK_POP (Coarse, int64_t) ;             \
    GB_FREE_WERK (&Cwork, Cwork_size) ;         \
}

// GB_FREE_ALL frees all allocations, including the TaskList output; it is
// used only on the error (out-of-memory) paths.
#define GB_FREE_ALL                             \
{                                               \
    GB_FREE_WORK ;                              \
    GB_FREE_WERK (&TaskList, TaskList_size) ;   \
}
29 
30 #include "GB.h"
31 
32 //------------------------------------------------------------------------------
33 // GB_ewise_slice
34 //------------------------------------------------------------------------------
35 
// GB_ewise_slice: construct the list of coarse and fine tasks that partition
// the work of an element-wise operation C=op(A,B), optionally masked by M.
// A coarse task handles one or more whole vectors of C; a fine task handles a
// contiguous portion of a single (heavyweight) vector.  On success the caller
// owns TaskList and must free it with GB_FREE_WERK.
GrB_Info GB_ewise_slice
(
    // output:
    GB_task_struct **p_TaskList,    // array of structs
    size_t *p_TaskList_size,        // size of TaskList
    int *p_ntasks,                  // # of tasks constructed
    int *p_nthreads,                // # of threads for eWise operation
    // input:
    const int64_t Cnvec,            // # of vectors of C
    const int64_t *restrict Ch,     // vectors of C, if hypersparse
    const int64_t *restrict C_to_M, // mapping of C to M
    const int64_t *restrict C_to_A, // mapping of C to A
    const int64_t *restrict C_to_B, // mapping of C to B
    bool Ch_is_Mh,                  // if true, then Ch == Mh; GB_add only
    const GrB_Matrix M,             // mask matrix to slice (optional)
    const GrB_Matrix A,             // matrix to slice
    const GrB_Matrix B,             // matrix to slice
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (p_TaskList != NULL) ;
    ASSERT (p_TaskList_size != NULL) ;
    ASSERT (p_ntasks != NULL) ;
    ASSERT (p_nthreads != NULL) ;

    ASSERT_MATRIX_OK (A, "A for ewise_slice", GB0) ;
    ASSERT (!GB_ZOMBIES (A)) ;
    ASSERT (!GB_JUMBLED (A)) ;
    ASSERT (!GB_PENDING (A)) ;

    ASSERT_MATRIX_OK (B, "B for ewise_slice", GB0) ;
    ASSERT (!GB_ZOMBIES (B)) ;
    ASSERT (!GB_JUMBLED (B)) ;
    ASSERT (!GB_PENDING (B)) ;

    ASSERT_MATRIX_OK_OR_NULL (M, "M for ewise_slice", GB0) ;
    ASSERT (!GB_ZOMBIES (M)) ;
    ASSERT (!GB_JUMBLED (M)) ;
    ASSERT (!GB_PENDING (M)) ;

    // initialize all outputs so the caller sees a defined state on any path
    (*p_TaskList  ) = NULL ;
    (*p_TaskList_size) = 0 ;
    (*p_ntasks    ) = 0 ;
    (*p_nthreads  ) = 1 ;

    // Cwork [k] will hold an estimate of the work for vector C(:,k), and is
    // later overwritten with its cumulative sum (hence Cnvec+1 entries).
    int64_t *restrict Cwork = NULL ; size_t Cwork_size = 0 ;
    GB_WERK_DECLARE (Coarse, int64_t) ;     // size ntasks1+1
    int ntasks1 = 0 ;

    //--------------------------------------------------------------------------
    // determine # of threads to use
    //--------------------------------------------------------------------------

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;

    //--------------------------------------------------------------------------
    // allocate the initial TaskList
    //--------------------------------------------------------------------------

    // Allocate the TaskList to hold at least 2*ntask0 tasks.  It will grow
    // later, if needed.  Usually, 64*nthreads_max is enough, but in a few cases
    // fine tasks can cause this number to be exceeded.  If that occurs,
    // TaskList is reallocated.

    // When the mask is present, it is often fastest to break the work up
    // into tasks, even when nthreads_max is 1.

    GB_task_struct *restrict TaskList = NULL ; size_t TaskList_size = 0 ;
    int max_ntasks = 0 ;
    int ntasks0 = (M == NULL && nthreads_max == 1) ? 1 : (32 * nthreads_max) ;
    GB_REALLOC_TASK_WERK (TaskList, ntasks0, max_ntasks) ;

    //--------------------------------------------------------------------------
    // check for quick return for a single task
    //--------------------------------------------------------------------------

    if (Cnvec == 0 || ntasks0 == 1)
    {
        // construct a single coarse task that computes all of C
        // (when Cnvec is zero, klast is -1 and zero tasks are reported)
        TaskList [0].kfirst = 0 ;
        TaskList [0].klast  = Cnvec-1 ;
        (*p_TaskList  ) = TaskList ;
        (*p_TaskList_size) = TaskList_size ;
        (*p_ntasks    ) = (Cnvec == 0) ? 0 : 1 ;
        (*p_nthreads  ) = 1 ;
        return (GrB_SUCCESS) ;
    }

    //--------------------------------------------------------------------------
    // get A, B, and M
    //--------------------------------------------------------------------------

    const int64_t vlen = A->vlen ;
    const int64_t *restrict Ap = A->p ;
    const int64_t *restrict Ai = A->i ;
    const int64_t *restrict Bp = B->p ;
    const int64_t *restrict Bi = B->i ;
    // pointer equality detects when Ch is a shallow copy of A->h or B->h,
    // in which case vector k of C is also vector k of A (or B)
    bool Ch_is_Ah = (Ch != NULL && A->h != NULL && Ch == A->h) ;
    bool Ch_is_Bh = (Ch != NULL && B->h != NULL && Ch == B->h) ;

    const int64_t *restrict Mp = NULL ;
    const int64_t *restrict Mi = NULL ;
    bool M_is_hyper = GB_IS_HYPERSPARSE (M) ;
    if (M != NULL)
    {
        Mp = M->p ;
        Mi = M->i ;
        // Ch_is_Mh is true if either true on input (for GB_add, which denotes
        // that Ch is a deep copy of M->h), or if Ch is a shallow copy of M->h.
        Ch_is_Mh = Ch_is_Mh || (Ch != NULL && M_is_hyper && Ch == M->h) ;
    }

    //--------------------------------------------------------------------------
    // allocate workspace
    //--------------------------------------------------------------------------

    Cwork = GB_MALLOC_WERK (Cnvec+1, int64_t, &Cwork_size) ;
    if (Cwork == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //--------------------------------------------------------------------------
    // compute an estimate of the work for each vector of C
    //--------------------------------------------------------------------------

    int nthreads_for_Cwork = GB_nthreads (Cnvec, chunk, nthreads_max) ;

    int64_t k ;
    #pragma omp parallel for num_threads(nthreads_for_Cwork) schedule(static)
    for (k = 0 ; k < Cnvec ; k++)
    {

        //----------------------------------------------------------------------
        // get the C(:,j) vector
        //----------------------------------------------------------------------

        // j is the vector index: Ch [k] if C is hypersparse, else k itself
        int64_t j = GBH (Ch, k) ;

        //----------------------------------------------------------------------
        // get the corresponding vector of A
        //----------------------------------------------------------------------

        // kA is the position of vector j in A, or -1 if A(:,j) is empty
        int64_t kA ;
        if (C_to_A != NULL)
        {
            // A is hypersparse and the C_to_A mapping has been created
            ASSERT (GB_IS_HYPERSPARSE (A)) ;
            kA = C_to_A [k] ;
            ASSERT (kA >= -1 && kA < A->nvec) ;
            if (kA >= 0)
            {
                ASSERT (j == GBH (A->h, kA)) ;
            }
        }
        else if (Ch_is_Ah)
        {
            // A is hypersparse, but Ch is a shallow copy of A->h
            ASSERT (GB_IS_HYPERSPARSE (A)) ;
            kA = k ;
            ASSERT (j == A->h [kA]) ;
        }
        else
        {
            // A is sparse, bitmap, or full
            ASSERT (!GB_IS_HYPERSPARSE (A)) ;
            kA = j ;
        }

        //----------------------------------------------------------------------
        // get the corresponding vector of B
        //----------------------------------------------------------------------

        // kB is the position of vector j in B, or -1 if B(:,j) is empty
        int64_t kB ;
        if (C_to_B != NULL)
        {
            // B is hypersparse and the C_to_B mapping has been created
            ASSERT (GB_IS_HYPERSPARSE (B)) ;
            kB = C_to_B [k] ;
            ASSERT (kB >= -1 && kB < B->nvec) ;
            if (kB >= 0)
            {
                ASSERT (j == GBH (B->h, kB)) ;
            }
        }
        else if (Ch_is_Bh)
        {
            // B is hypersparse, but Ch is a shallow copy of B->h
            ASSERT (GB_IS_HYPERSPARSE (B)) ;
            kB = k ;
            ASSERT (j == B->h [kB]) ;
        }
        else
        {
            // B is sparse, bitmap, or full
            ASSERT (!GB_IS_HYPERSPARSE (B)) ;
            kB = j ;
        }

        //----------------------------------------------------------------------
        // estimate the work for C(:,j)
        //----------------------------------------------------------------------

        ASSERT (kA >= -1 && kA < A->nvec) ;
        ASSERT (kB >= -1 && kB < B->nvec) ;
        // a bitmap/full matrix has no Ap, so each vector holds vlen entries
        int64_t aknz = (kA < 0) ? 0 :
            ((Ap == NULL) ? vlen : (Ap [kA+1] - Ap [kA])) ;
        int64_t bknz = (kB < 0) ? 0 :
            ((Bp == NULL) ? vlen : (Bp [kB+1] - Bp [kB])) ;

        // upper-bound work estimate: |A(:,j)| + |B(:,j)| + 1
        Cwork [k] = aknz + bknz + 1 ;
    }

    //--------------------------------------------------------------------------
    // replace Cwork with its cumulative sum
    //--------------------------------------------------------------------------

    // after the cumsum, Cwork [k] is the work preceding vector k, and
    // Cwork [Cnvec] is the total work for all of C
    GB_cumsum (Cwork, Cnvec, NULL, nthreads_for_Cwork, Context) ;
    double cwork = (double) Cwork [Cnvec] ;

    //--------------------------------------------------------------------------
    // determine # of threads and tasks for the eWise operation
    //--------------------------------------------------------------------------

    int nthreads = GB_nthreads (cwork, chunk, nthreads_max) ;

    // same policy as ntasks0 above, but now based on the actual work estimate
    ntasks0 = (M == NULL && nthreads == 1) ? 1 : (32 * nthreads) ;
    double target_task_size = cwork / (double) (ntasks0) ;
    target_task_size = GB_IMAX (target_task_size, chunk) ;
    ntasks1 = cwork / target_task_size ;
    ntasks1 = GB_IMAX (ntasks1, 1) ;

    //--------------------------------------------------------------------------
    // slice the work into coarse tasks
    //--------------------------------------------------------------------------

    GB_WERK_PUSH (Coarse, ntasks1 + 1, int64_t) ;
    if (Coarse == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }
    // partition the cumulative work so that coarse task t covers vectors
    // Coarse [t] to Coarse [t+1]-1 with roughly equal work per task
    GB_pslice (Coarse, Cwork, Cnvec, ntasks1, false) ;

    //--------------------------------------------------------------------------
    // construct all tasks, both coarse and fine
    //--------------------------------------------------------------------------

    int ntasks = 0 ;

    for (int t = 0 ; t < ntasks1 ; t++)
    {

        //----------------------------------------------------------------------
        // coarse task computes C (:,k:klast)
        //----------------------------------------------------------------------

        int64_t k = Coarse [t] ;
        int64_t klast  = Coarse [t+1] - 1 ;

        if (k >= Cnvec)
        {

            //------------------------------------------------------------------
            // all tasks have been constructed
            //------------------------------------------------------------------

            break ;

        }
        else if (k < klast)
        {

            //------------------------------------------------------------------
            // coarse task has 2 or more vectors
            //------------------------------------------------------------------

            // This is a non-empty coarse-grain task that does two or more
            // entire vectors of C, vectors k:klast, inclusive.
            GB_REALLOC_TASK_WERK (TaskList, ntasks + 1, max_ntasks) ;
            TaskList [ntasks].kfirst = k ;
            TaskList [ntasks].klast  = klast ;
            ntasks++ ;

        }
        else
        {

            //------------------------------------------------------------------
            // coarse task has 0 or 1 vectors
            //------------------------------------------------------------------

            // As a coarse-grain task, this task is empty or does a single
            // vector, k.  Vector k must be removed from the work done by this
            // and any other coarse-grain task, and split into one or more
            // fine-grain tasks.

            for (int tt = t ; tt < ntasks1 ; tt++)
            {
                // remove k from the initial slice tt
                if (Coarse [tt] == k)
                {
                    // remove k from task tt
                    Coarse [tt] = k+1 ;
                }
                else
                {
                    // break, k not in task tt
                    break ;
                }
            }

            //------------------------------------------------------------------
            // get the vector of C
            //------------------------------------------------------------------

            int64_t j = GBH (Ch, k) ;

            //------------------------------------------------------------------
            // get the corresponding vector of A
            //------------------------------------------------------------------

            // same lookup as in the Cwork estimate loop above
            int64_t kA ;
            if (C_to_A != NULL)
            {
                // A is hypersparse and the C_to_A mapping has been created
                ASSERT (GB_IS_HYPERSPARSE (A)) ;
                kA = C_to_A [k] ;
            }
            else if (Ch_is_Ah)
            {
                // A is hypersparse, but Ch is a shallow copy of A->h
                ASSERT (GB_IS_HYPERSPARSE (A)) ;
                kA = k ;
            }
            else
            {
                // A is sparse, bitmap, or full
                ASSERT (!GB_IS_HYPERSPARSE (A)) ;
                kA = j ;
            }
            int64_t pA_start = (kA < 0) ? (-1) : GBP (Ap, kA, vlen) ;
            int64_t pA_end   = (kA < 0) ? (-1) : GBP (Ap, kA+1, vlen) ;
            bool a_empty = (pA_end == pA_start) ;

            //------------------------------------------------------------------
            // get the corresponding vector of B
            //------------------------------------------------------------------

            int64_t kB ;
            if (C_to_B != NULL)
            {
                // B is hypersparse and the C_to_B mapping has been created
                ASSERT (GB_IS_HYPERSPARSE (B)) ;
                kB = C_to_B [k] ;
            }
            else if (Ch_is_Bh)
            {
                // B is hypersparse, but Ch is a shallow copy of B->h
                ASSERT (GB_IS_HYPERSPARSE (B)) ;
                kB = k ;
            }
            else
            {
                // B is sparse, bitmap, or full
                ASSERT (!GB_IS_HYPERSPARSE (B)) ;
                kB = j ;
            }
            int64_t pB_start = (kB < 0) ? (-1) : GBP (Bp, kB, vlen) ;
            int64_t pB_end   = (kB < 0) ? (-1) : GBP (Bp, kB+1, vlen) ;
            bool b_empty = (pB_end == pB_start) ;

            //------------------------------------------------------------------
            // get the corresponding vector of M, if present
            //------------------------------------------------------------------

            // M can have any sparsity structure (hyper, sparse, bitmap, full)

            int64_t pM_start = -1 ;
            int64_t pM_end   = -1 ;
            if (M != NULL)
            {
                int64_t kM ;
                if (C_to_M != NULL)
                {
                    // M is hypersparse and the C_to_M mapping has been created
                    ASSERT (GB_IS_HYPERSPARSE (M)) ;
                    kM = C_to_M [k] ;
                }
                else if (Ch_is_Mh)
                {
                    // M is hypersparse, but Ch is a copy of Mh
                    ASSERT (GB_IS_HYPERSPARSE (M)) ;
                    // Ch is a deep or shallow copy of Mh
                    kM = k ;
                }
                else
                {
                    // M is sparse, bitmap, or full
                    ASSERT (!GB_IS_HYPERSPARSE (M)) ;
                    kM = j ;
                }
                pM_start = (kM < 0) ? -1 : GBP (Mp, kM, vlen) ;
                pM_end   = (kM < 0) ? -1 : GBP (Mp, kM+1, vlen) ;
            }
            // m_empty is also true when M is not present (both are -1)
            bool m_empty = (pM_end == pM_start) ;

            //------------------------------------------------------------------
            // determine the # of fine-grain tasks to create for vector k
            //------------------------------------------------------------------

            // ckwork is the work estimate for vector k alone
            double ckwork = Cwork [k+1] - Cwork [k] ;
            int nfine = ckwork / target_task_size ;
            nfine = GB_IMAX (nfine, 1) ;

            // make the TaskList bigger, if needed
            GB_REALLOC_TASK_WERK (TaskList, ntasks + nfine, max_ntasks) ;

            //------------------------------------------------------------------
            // create the fine-grain tasks
            //------------------------------------------------------------------

            if (nfine == 1)
            {

                //--------------------------------------------------------------
                // this is a single coarse task for all of vector k
                //--------------------------------------------------------------

                TaskList [ntasks].kfirst = k ;
                TaskList [ntasks].klast  = k ;
                ntasks++ ;

            }
            else
            {

                //--------------------------------------------------------------
                // slice vector k into nfine fine tasks
                //--------------------------------------------------------------

                // first fine task starts at the top of vector k
                ASSERT (ntasks < max_ntasks) ;
                TaskList [ntasks].kfirst = k ;
                TaskList [ntasks].klast  = -1 ; // this is a fine task
                TaskList [ntasks].pM = (m_empty) ? -1 : pM_start ;
                TaskList [ntasks].pA = (a_empty) ? -1 : pA_start ;
                TaskList [ntasks].pB = (b_empty) ? -1 : pB_start ;
                TaskList [ntasks].len = 0 ;     // to be determined below
                ntasks++ ;
                // i: first row index of the current fine task;
                // ilast: first row index of the prior fine task
                int64_t ilast = 0, i = 0 ;

                for (int tfine = 1 ; tfine < nfine ; tfine++)
                {
                    // target_work: work remaining for tasks tfine:nfine-1
                    double target_work = ((nfine-tfine) * ckwork) / nfine ;
                    int64_t pM, pA, pB ;
                    // find the row index i and positions pM, pA, pB that
                    // split the remaining work of M(:,j), A(:,j), B(:,j)
                    GB_slice_vector (&i, &pM, &pA, &pB,
                        pM_start, pM_end, Mi,
                        pA_start, pA_end, Ai,
                        pB_start, pB_end, Bi,
                        vlen, target_work) ;

                    // prior task ends at pM-1, pA-1, and pB-1
                    TaskList [ntasks-1].pM_end = pM ;
                    TaskList [ntasks-1].pA_end = pA ;
                    TaskList [ntasks-1].pB_end = pB ;

                    // prior task handles indices ilast:i-1
                    TaskList [ntasks-1].len = i - ilast ;

                    // this task starts at pM, pA, and pB
                    ASSERT (ntasks < max_ntasks) ;
                    TaskList [ntasks].kfirst = k ;
                    TaskList [ntasks].klast  = -1 ; // this is a fine task
                    TaskList [ntasks].pM = pM ;
                    TaskList [ntasks].pA = pA ;
                    TaskList [ntasks].pB = pB ;

                    // advance to the next task
                    ntasks++ ;
                    ilast = i ;
                }

                // Terminate the last fine task: it handles rows i:vlen-1 and
                // runs to the end of each of the three vectors.
                ASSERT (ntasks <= max_ntasks) ;
                TaskList [ntasks-1].pM_end = (m_empty) ? -1 : pM_end ;
                TaskList [ntasks-1].pA_end = (a_empty) ? -1 : pA_end ;
                TaskList [ntasks-1].pB_end = (b_empty) ? -1 : pB_end ;
                TaskList [ntasks-1].len = vlen - i ;
            }
        }
    }

    ASSERT (ntasks <= max_ntasks) ;

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORK ;
    (*p_TaskList     ) = TaskList ;
    (*p_TaskList_size) = TaskList_size ;
    (*p_ntasks       ) = ntasks ;
    (*p_nthreads     ) = nthreads ;
    return (GrB_SUCCESS) ;
}
550 
551