1 //------------------------------------------------------------------------------
2 // GB_ewise_slice: slice the entries and vectors for an ewise operation
3 //------------------------------------------------------------------------------
4
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7
8 //------------------------------------------------------------------------------
9
10 // Constructs a set of tasks to compute C, for an element-wise operation
11 // (GB_add, GB_emult, and GB_mask) that operates on two input matrices,
12 // C=op(A,B). The mask is ignored for computing where to slice the work, but
13 // it is sliced once the location has been found.
14
15 // M, A, B: any sparsity structure (hypersparse, sparse, bitmap, or full).
16 // C: constructed as sparse or hypersparse in the caller.
17
// GB_FREE_WORK: free the internal workspace (the Coarse task-boundary array
// and the Cwork per-vector work estimates), but not the TaskList result.
#define GB_FREE_WORK                        \
{                                           \
    GB_WERK_POP (Coarse, int64_t) ;         \
    GB_FREE_WERK (&Cwork, Cwork_size) ;     \
}
23
// GB_FREE_ALL: free everything, including the TaskList output; used only on
// the out-of-memory error paths, where no result is returned to the caller.
#define GB_FREE_ALL                         \
{                                           \
    GB_FREE_WORK ;                          \
    GB_FREE_WERK (&TaskList, TaskList_size) ;   \
}
29
30 #include "GB.h"
31
32 //------------------------------------------------------------------------------
33 // GB_ewise_slice
34 //------------------------------------------------------------------------------
35
GrB_Info GB_ewise_slice
(
    // output:
    GB_task_struct **p_TaskList,    // array of structs
    size_t *p_TaskList_size,        // size of TaskList
    int *p_ntasks,                  // # of tasks constructed
    int *p_nthreads,                // # of threads for eWise operation
    // input:
    const int64_t Cnvec,            // # of vectors of C
    const int64_t *restrict Ch,     // vectors of C, if hypersparse
    const int64_t *restrict C_to_M, // mapping of C to M
    const int64_t *restrict C_to_A, // mapping of C to A
    const int64_t *restrict C_to_B, // mapping of C to B
    bool Ch_is_Mh,                  // if true, then Ch == Mh; GB_add only
    const GrB_Matrix M,             // mask matrix to slice (optional)
    const GrB_Matrix A,             // matrix to slice
    const GrB_Matrix B,             // matrix to slice
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (p_TaskList != NULL) ;
    ASSERT (p_TaskList_size != NULL) ;
    ASSERT (p_ntasks != NULL) ;
    ASSERT (p_nthreads != NULL) ;

    // A and B must be fully finished: no zombies, no jumbled vectors, and no
    // pending tuples, since their Ap/Ai contents are read directly below.
    ASSERT_MATRIX_OK (A, "A for ewise_slice", GB0) ;
    ASSERT (!GB_ZOMBIES (A)) ;
    ASSERT (!GB_JUMBLED (A)) ;
    ASSERT (!GB_PENDING (A)) ;

    ASSERT_MATRIX_OK (B, "B for ewise_slice", GB0) ;
    ASSERT (!GB_ZOMBIES (B)) ;
    ASSERT (!GB_JUMBLED (B)) ;
    ASSERT (!GB_PENDING (B)) ;

    ASSERT_MATRIX_OK_OR_NULL (M, "M for ewise_slice", GB0) ;
    ASSERT (!GB_ZOMBIES (M)) ;
    ASSERT (!GB_JUMBLED (M)) ;
    ASSERT (!GB_PENDING (M)) ;

    // initialize the outputs so every early-return path is well-defined
    (*p_TaskList ) = NULL ;
    (*p_TaskList_size) = 0 ;
    (*p_ntasks ) = 0 ;
    (*p_nthreads ) = 1 ;

    int64_t *restrict Cwork = NULL ; size_t Cwork_size = 0 ;
    GB_WERK_DECLARE (Coarse, int64_t) ;     // size ntasks1+1
    int ntasks1 = 0 ;

    //--------------------------------------------------------------------------
    // determine # of threads to use
    //--------------------------------------------------------------------------

    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;

    //--------------------------------------------------------------------------
    // allocate the initial TaskList
    //--------------------------------------------------------------------------

    // Allocate the TaskList to hold at least ntasks0 tasks (the
    // GB_REALLOC_TASK_WERK macro may allocate more).  It will grow later, if
    // needed.  Usually, 32*nthreads_max is enough, but in a few cases fine
    // tasks can cause this number to be exceeded.  If that occurs, TaskList
    // is reallocated.

    // When the mask is present, it is often fastest to break the work up
    // into tasks, even when nthreads_max is 1.

    GB_task_struct *restrict TaskList = NULL ; size_t TaskList_size = 0 ;
    int max_ntasks = 0 ;
    int ntasks0 = (M == NULL && nthreads_max == 1) ? 1 : (32 * nthreads_max) ;
    GB_REALLOC_TASK_WERK (TaskList, ntasks0, max_ntasks) ;

    //--------------------------------------------------------------------------
    // check for quick return for a single task
    //--------------------------------------------------------------------------

    if (Cnvec == 0 || ntasks0 == 1)
    {
        // construct a single coarse task that computes all of C
        TaskList [0].kfirst = 0 ;
        TaskList [0].klast  = Cnvec-1 ;
        (*p_TaskList ) = TaskList ;
        (*p_TaskList_size) = TaskList_size ;
        (*p_ntasks ) = (Cnvec == 0) ? 0 : 1 ;
        (*p_nthreads ) = 1 ;
        return (GrB_SUCCESS) ;
    }

    //--------------------------------------------------------------------------
    // get A, B, and M
    //--------------------------------------------------------------------------

    // For bitmap/full matrices, A->p (or B->p) is NULL; those cases are
    // handled below by treating every vector as having vlen entries.
    const int64_t vlen = A->vlen ;
    const int64_t *restrict Ap = A->p ;
    const int64_t *restrict Ai = A->i ;
    const int64_t *restrict Bp = B->p ;
    const int64_t *restrict Bi = B->i ;
    // pointer equality detects when Ch is a shallow copy of A->h or B->h,
    // in which case C and A (or B) share the same hyperlist and k == kA (kB)
    bool Ch_is_Ah = (Ch != NULL && A->h != NULL && Ch == A->h) ;
    bool Ch_is_Bh = (Ch != NULL && B->h != NULL && Ch == B->h) ;

    const int64_t *restrict Mp = NULL ;
    const int64_t *restrict Mi = NULL ;
    bool M_is_hyper = GB_IS_HYPERSPARSE (M) ;
    if (M != NULL)
    {
        Mp = M->p ;
        Mi = M->i ;
        // Ch_is_Mh is true if either true on input (for GB_add, which denotes
        // that Ch is a deep copy of M->h), or if Ch is a shallow copy of M->h.
        Ch_is_Mh = Ch_is_Mh || (Ch != NULL && M_is_hyper && Ch == M->h) ;
    }

    //--------------------------------------------------------------------------
    // allocate workspace
    //--------------------------------------------------------------------------

    // Cwork [k] holds the work estimate for vector k; one extra slot is
    // needed for the cumulative sum below, so Cwork [Cnvec] = total work.
    Cwork = GB_MALLOC_WERK (Cnvec+1, int64_t, &Cwork_size) ;
    if (Cwork == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //--------------------------------------------------------------------------
    // compute an estimate of the work for each vector of C
    //--------------------------------------------------------------------------

    int nthreads_for_Cwork = GB_nthreads (Cnvec, chunk, nthreads_max) ;

    int64_t k ;
    #pragma omp parallel for num_threads(nthreads_for_Cwork) schedule(static)
    for (k = 0 ; k < Cnvec ; k++)
    {

        //----------------------------------------------------------------------
        // get the C(:,j) vector
        //----------------------------------------------------------------------

        // j is the vector index of the kth vector of C (j == k when C is
        // sparse; j == Ch [k] when C is hypersparse)
        int64_t j = GBH (Ch, k) ;

        //----------------------------------------------------------------------
        // get the corresponding vector of A
        //----------------------------------------------------------------------

        // kA is the position of vector j in A, or -1 if A(:,j) is empty and
        // does not appear in A's hyperlist
        int64_t kA ;
        if (C_to_A != NULL)
        {
            // A is hypersparse and the C_to_A mapping has been created
            ASSERT (GB_IS_HYPERSPARSE (A)) ;
            kA = C_to_A [k] ;
            ASSERT (kA >= -1 && kA < A->nvec) ;
            if (kA >= 0)
            {
                ASSERT (j == GBH (A->h, kA)) ;
            }
        }
        else if (Ch_is_Ah)
        {
            // A is hypersparse, but Ch is a shallow copy of A->h
            ASSERT (GB_IS_HYPERSPARSE (A)) ;
            kA = k ;
            ASSERT (j == A->h [kA]) ;
        }
        else
        {
            // A is sparse, bitmap, or full
            ASSERT (!GB_IS_HYPERSPARSE (A)) ;
            kA = j ;
        }

        //----------------------------------------------------------------------
        // get the corresponding vector of B
        //----------------------------------------------------------------------

        // kB: same construction as kA, for matrix B
        int64_t kB ;
        if (C_to_B != NULL)
        {
            // B is hypersparse and the C_to_B mapping has been created
            ASSERT (GB_IS_HYPERSPARSE (B)) ;
            kB = C_to_B [k] ;
            ASSERT (kB >= -1 && kB < B->nvec) ;
            if (kB >= 0)
            {
                ASSERT (j == GBH (B->h, kB)) ;
            }
        }
        else if (Ch_is_Bh)
        {
            // B is hypersparse, but Ch is a shallow copy of B->h
            ASSERT (GB_IS_HYPERSPARSE (B)) ;
            kB = k ;
            ASSERT (j == B->h [kB]) ;
        }
        else
        {
            // B is sparse, bitmap, or full
            ASSERT (!GB_IS_HYPERSPARSE (B)) ;
            kB = j ;
        }

        //----------------------------------------------------------------------
        // estimate the work for C(:,j)
        //----------------------------------------------------------------------

        ASSERT (kA >= -1 && kA < A->nvec) ;
        ASSERT (kB >= -1 && kB < B->nvec) ;
        // Ap == NULL means A is bitmap or full, so A(:,j) has vlen entries;
        // otherwise the entry count is taken from A's vector pointers
        int64_t aknz = (kA < 0) ? 0 :
            ((Ap == NULL) ? vlen : (Ap [kA+1] - Ap [kA])) ;
        int64_t bknz = (kB < 0) ? 0 :
            ((Bp == NULL) ? vlen : (Bp [kB+1] - Bp [kB])) ;

        // +1 ensures every vector has nonzero work, so empty vectors still
        // get assigned to some task
        Cwork [k] = aknz + bknz + 1 ;
    }

    //--------------------------------------------------------------------------
    // replace Cwork with its cumulative sum
    //--------------------------------------------------------------------------

    // After this, Cwork [k] is the total work of vectors 0..k-1, and
    // Cwork [Cnvec] is the total work for all of C.
    GB_cumsum (Cwork, Cnvec, NULL, nthreads_for_Cwork, Context) ;
    double cwork = (double) Cwork [Cnvec] ;

    //--------------------------------------------------------------------------
    // determine # of threads and tasks for the eWise operation
    //--------------------------------------------------------------------------

    int nthreads = GB_nthreads (cwork, chunk, nthreads_max) ;

    // ntasks1 coarse slices are created, each with about target_task_size
    // work; target_task_size is at least one chunk so tiny problems do not
    // get fragmented into many tasks
    ntasks0 = (M == NULL && nthreads == 1) ? 1 : (32 * nthreads) ;
    double target_task_size = cwork / (double) (ntasks0) ;
    target_task_size = GB_IMAX (target_task_size, chunk) ;
    ntasks1 = cwork / target_task_size ;
    ntasks1 = GB_IMAX (ntasks1, 1) ;

    //--------------------------------------------------------------------------
    // slice the work into coarse tasks
    //--------------------------------------------------------------------------

    GB_WERK_PUSH (Coarse, ntasks1 + 1, int64_t) ;
    if (Coarse == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }
    // Coarse [t] .. Coarse [t+1]-1 is the range of vectors for slice t,
    // balanced by the cumulative work in Cwork
    GB_pslice (Coarse, Cwork, Cnvec, ntasks1, false) ;

    //--------------------------------------------------------------------------
    // construct all tasks, both coarse and fine
    //--------------------------------------------------------------------------

    int ntasks = 0 ;

    for (int t = 0 ; t < ntasks1 ; t++)
    {

        //----------------------------------------------------------------------
        // coarse task computes C (:,k:klast)
        //----------------------------------------------------------------------

        int64_t k = Coarse [t] ;
        int64_t klast  = Coarse [t+1] - 1 ;

        if (k >= Cnvec)
        {

            //------------------------------------------------------------------
            // all tasks have been constructed
            //------------------------------------------------------------------

            break ;

        }
        else if (k < klast)
        {

            //------------------------------------------------------------------
            // coarse task has 2 or more vectors
            //------------------------------------------------------------------

            // This is a non-empty coarse-grain task that does two or more
            // entire vectors of C, vectors k:klast, inclusive.
            GB_REALLOC_TASK_WERK (TaskList, ntasks + 1, max_ntasks) ;
            TaskList [ntasks].kfirst = k ;
            TaskList [ntasks].klast  = klast ;
            ntasks++ ;

        }
        else
        {

            //------------------------------------------------------------------
            // coarse task has 0 or 1 vectors
            //------------------------------------------------------------------

            // As a coarse-grain task, this task is empty or does a single
            // vector, k.  Vector k must be removed from the work done by this
            // and any other coarse-grain task, and split into one or more
            // fine-grain tasks.

            for (int tt = t ; tt < ntasks1 ; tt++)
            {
                // remove k from the initial slice tt
                if (Coarse [tt] == k)
                {
                    // remove k from task tt; later iterations of the outer
                    // loop will then see their slice start at k+1
                    Coarse [tt] = k+1 ;
                }
                else
                {
                    // break, k not in task tt
                    break ;
                }
            }

            //------------------------------------------------------------------
            // get the vector of C
            //------------------------------------------------------------------

            int64_t j = GBH (Ch, k) ;

            //------------------------------------------------------------------
            // get the corresponding vector of A
            //------------------------------------------------------------------

            // same kA construction as in the Cwork loop above
            int64_t kA ;
            if (C_to_A != NULL)
            {
                // A is hypersparse and the C_to_A mapping has been created
                ASSERT (GB_IS_HYPERSPARSE (A)) ;
                kA = C_to_A [k] ;
            }
            else if (Ch_is_Ah)
            {
                // A is hypersparse, but Ch is a shallow copy of A->h
                ASSERT (GB_IS_HYPERSPARSE (A)) ;
                kA = k ;
            }
            else
            {
                // A is sparse, bitmap, or full
                ASSERT (!GB_IS_HYPERSPARSE (A)) ;
                kA = j ;
            }
            // pA_start:pA_end-1 is the range of entries of A(:,j), or -1:-1
            // if A(:,j) is not present in a hypersparse A
            int64_t pA_start = (kA < 0) ? (-1) : GBP (Ap, kA, vlen) ;
            int64_t pA_end   = (kA < 0) ? (-1) : GBP (Ap, kA+1, vlen) ;
            bool a_empty = (pA_end == pA_start) ;

            //------------------------------------------------------------------
            // get the corresponding vector of B
            //------------------------------------------------------------------

            int64_t kB ;
            if (C_to_B != NULL)
            {
                // B is hypersparse and the C_to_B mapping has been created
                ASSERT (GB_IS_HYPERSPARSE (B)) ;
                kB = C_to_B [k] ;
            }
            else if (Ch_is_Bh)
            {
                // B is hypersparse, but Ch is a shallow copy of B->h
                ASSERT (GB_IS_HYPERSPARSE (B)) ;
                kB = k ;
            }
            else
            {
                // B is sparse, bitmap, or full
                ASSERT (!GB_IS_HYPERSPARSE (B)) ;
                kB = j ;
            }
            int64_t pB_start = (kB < 0) ? (-1) : GBP (Bp, kB, vlen) ;
            int64_t pB_end   = (kB < 0) ? (-1) : GBP (Bp, kB+1, vlen) ;
            bool b_empty = (pB_end == pB_start) ;

            //------------------------------------------------------------------
            // get the corresponding vector of M, if present
            //------------------------------------------------------------------

            // M can have any sparsity structure (hyper, sparse, bitmap, full)

            int64_t pM_start = -1 ;
            int64_t pM_end   = -1 ;
            if (M != NULL)
            {
                int64_t kM ;
                if (C_to_M != NULL)
                {
                    // M is hypersparse and the C_to_M mapping has been created
                    ASSERT (GB_IS_HYPERSPARSE (M)) ;
                    kM = C_to_M [k] ;
                }
                else if (Ch_is_Mh)
                {
                    // M is hypersparse, but Ch is a copy of Mh
                    ASSERT (GB_IS_HYPERSPARSE (M)) ;
                    // Ch is a deep or shallow copy of Mh
                    kM = k ;
                }
                else
                {
                    // M is sparse, bitmap, or full
                    ASSERT (!GB_IS_HYPERSPARSE (M)) ;
                    kM = j ;
                }
                pM_start = (kM < 0) ? -1 : GBP (Mp, kM, vlen) ;
                pM_end   = (kM < 0) ? -1 : GBP (Mp, kM+1, vlen) ;
            }
            bool m_empty = (pM_end == pM_start) ;

            //------------------------------------------------------------------
            // determine the # of fine-grain tasks to create for vector k
            //------------------------------------------------------------------

            // ckwork = work for this single vector, from the cumulative sum
            double ckwork = Cwork [k+1] - Cwork [k] ;
            int nfine = ckwork / target_task_size ;
            nfine = GB_IMAX (nfine, 1) ;

            // make the TaskList bigger, if needed
            GB_REALLOC_TASK_WERK (TaskList, ntasks + nfine, max_ntasks) ;

            //------------------------------------------------------------------
            // create the fine-grain tasks
            //------------------------------------------------------------------

            if (nfine == 1)
            {

                //--------------------------------------------------------------
                // this is a single coarse task for all of vector k
                //--------------------------------------------------------------

                TaskList [ntasks].kfirst = k ;
                TaskList [ntasks].klast  = k ;
                ntasks++ ;

            }
            else
            {

                //--------------------------------------------------------------
                // slice vector k into nfine fine tasks
                //--------------------------------------------------------------

                // first fine task starts at the top of vector k
                ASSERT (ntasks < max_ntasks) ;
                TaskList [ntasks].kfirst = k ;
                TaskList [ntasks].klast  = -1 ; // this is a fine task
                TaskList [ntasks].pM = (m_empty) ? -1 : pM_start ;
                TaskList [ntasks].pA = (a_empty) ? -1 : pA_start ;
                TaskList [ntasks].pB = (b_empty) ? -1 : pB_start ;
                TaskList [ntasks].len = 0 ;     // to be determined below
                ntasks++ ;
                int64_t ilast = 0, i = 0 ;

                for (int tfine = 1 ; tfine < nfine ; tfine++)
                {
                    // target_work = the amount of work still remaining for
                    // fine tasks tfine..nfine-1; GB_slice_vector finds the
                    // index i and positions pM, pA, pB that leave that much
                    // work after the split point
                    double target_work = ((nfine-tfine) * ckwork) / nfine ;
                    int64_t pM, pA, pB ;
                    GB_slice_vector (&i, &pM, &pA, &pB,
                        pM_start, pM_end, Mi,
                        pA_start, pA_end, Ai,
                        pB_start, pB_end, Bi,
                        vlen, target_work) ;

                    // prior task ends at pM-1, pA-1, and pB-1
                    TaskList [ntasks-1].pM_end = pM ;
                    TaskList [ntasks-1].pA_end = pA ;
                    TaskList [ntasks-1].pB_end = pB ;

                    // prior task handles indices ilast:i-1
                    TaskList [ntasks-1].len = i - ilast ;

                    // this task starts at pM, pA, and pB
                    ASSERT (ntasks < max_ntasks) ;
                    TaskList [ntasks].kfirst = k ;
                    TaskList [ntasks].klast  = -1 ;     // this is a fine task
                    TaskList [ntasks].pM = pM ;
                    TaskList [ntasks].pA = pA ;
                    TaskList [ntasks].pB = pB ;

                    // advance to the next task
                    ntasks++ ;
                    ilast = i ;
                }

                // Terminate the last fine task: it runs to the end of the
                // M, A, and B vectors and covers indices ilast:vlen-1.
                ASSERT (ntasks <= max_ntasks) ;
                TaskList [ntasks-1].pM_end = (m_empty) ? -1 : pM_end ;
                TaskList [ntasks-1].pA_end = (a_empty) ? -1 : pA_end ;
                TaskList [ntasks-1].pB_end = (b_empty) ? -1 : pB_end ;
                TaskList [ntasks-1].len = vlen - i ;
            }
        }
    }

    ASSERT (ntasks <= max_ntasks) ;

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    // Cwork and Coarse are freed; ownership of TaskList passes to the caller.
    GB_FREE_WORK ;
    (*p_TaskList ) = TaskList ;
    (*p_TaskList_size) = TaskList_size ;
    (*p_ntasks ) = ntasks ;
    (*p_nthreads ) = nthreads ;
    return (GrB_SUCCESS) ;
}
550
551