1 //------------------------------------------------------------------------------
2 // GB_select_phase1: count entries in each vector for C=select(A,thunk)
3 //------------------------------------------------------------------------------
4 
5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
6 // SPDX-License-Identifier: Apache-2.0
7 
8 //------------------------------------------------------------------------------
9 
10     const int64_t *restrict kfirst_Aslice = A_ek_slicing ;
11     const int64_t *restrict klast_Aslice  = A_ek_slicing + A_ntasks ;
12     const int64_t *restrict pstart_Aslice = A_ek_slicing + A_ntasks * 2 ;
13 
14 #if defined ( GB_ENTRY_SELECTOR )
15 
16     //--------------------------------------------------------------------------
17     // entry selector
18     //--------------------------------------------------------------------------
19 
20     ASSERT (GB_JUMBLED_OK (A)) ;
21 
22     // The count of live entries kth vector A(:,k) is reduced to the kth scalar
23     // Cp(k).  Each thread computes the reductions on roughly the same number
24     // of entries, which means that a vector A(:,k) may be reduced by more than
25     // one thread.  The first vector A(:,kfirst) reduced by thread tid may be
26     // partial, where the prior thread tid-1 (and other prior threads) may also
27     // do some of the reductions for this same vector A(:,kfirst).  The thread
28     // tid reduces all vectors A(:,k) for k in the range kfirst+1 to klast-1.
29     // The last vector A(:,klast) reduced by thread tid may also be partial.
30     // Thread tid+1, and following threads, may also do some of the reduces for
31     // A(:,klast).
32 
33     //--------------------------------------------------------------------------
34     // get A
35     //--------------------------------------------------------------------------
36 
37     const int64_t  *restrict Ap = A->p ;
38     const int64_t  *restrict Ah = A->h ;
39     const int64_t  *restrict Ai = A->i ;
40     const GB_ATYPE *restrict Ax = (GB_ATYPE *) A->x ;
41     size_t  asize = A->type->size ;
42     int64_t avlen = A->vlen ;
43     int64_t avdim = A->vdim ;
44     ASSERT (GB_JUMBLED_OK (A)) ;
45 
46     //--------------------------------------------------------------------------
47     // reduce each slice
48     //--------------------------------------------------------------------------
49 
50     // each thread reduces its own part in parallel
51     int tid ;
52     #pragma omp parallel for num_threads(A_nthreads) schedule(dynamic,1)
53     for (tid = 0 ; tid < A_ntasks ; tid++)
54     {
55 
56         // if kfirst > klast then thread tid does no work at all
57         int64_t kfirst = kfirst_Aslice [tid] ;
58         int64_t klast  = klast_Aslice  [tid] ;
59         Wfirst [tid] = 0 ;
60         Wlast  [tid] = 0 ;
61 
62         //----------------------------------------------------------------------
63         // reduce vectors kfirst to klast
64         //----------------------------------------------------------------------
65 
66         for (int64_t k = kfirst ; k <= klast ; k++)
67         {
68 
69             //------------------------------------------------------------------
70             // find the part of A(:,k) to be reduced by this thread
71             //------------------------------------------------------------------
72 
73             GB_GET_J ; // int64_t j = GBH (Ah, k) ; but for user selectop only
74             int64_t pA, pA_end ;
75             GB_get_pA (&pA, &pA_end, tid, k,
76                 kfirst, klast, pstart_Aslice, Ap, avlen) ;
77 
78             //------------------------------------------------------------------
79             // count entries in Ax [pA ... pA_end-1]
80             //------------------------------------------------------------------
81 
82             int64_t cjnz = 0 ;
83             for ( ; pA < pA_end ; pA++)
84             {
85                 if (GB_TEST_VALUE_OF_ENTRY (pA)) cjnz++ ;
86             }
87             if (k == kfirst)
88             {
89                 Wfirst [tid] = cjnz ;
90             }
91             else if (k == klast)
92             {
93                 Wlast [tid] = cjnz ;
94             }
95             else
96             {
97                 Cp [k] = cjnz ;
98             }
99         }
100     }
101 
102     //--------------------------------------------------------------------------
103     // reduce the first and last vector of each slice using a single thread
104     //--------------------------------------------------------------------------
105 
106     GB_ek_slice_merge1 (Cp, Wfirst, Wlast, A_ek_slicing, A_ntasks) ;
107 
108 #else
109 
110     //--------------------------------------------------------------------------
111     // positional selector (tril, triu, diag, offdiag, resize)
112     //--------------------------------------------------------------------------
113 
114     const int64_t *restrict Ap = A->p ;
115     const int64_t *restrict Ah = A->h ;
116     const int64_t *restrict Ai = A->i ;
117     int64_t anvec = A->nvec ;
118     int64_t avlen = A->vlen ;
119     ASSERT (!GB_JUMBLED (A)) ;
120 
121     //--------------------------------------------------------------------------
122     // tril, triu, diag, offdiag, resize: binary search in each vector
123     //--------------------------------------------------------------------------
124 
125     int64_t k ;
126     #pragma omp parallel for num_threads(A_nthreads) schedule(guided)
127     for (k = 0 ; k < anvec ; k++)
128     {
129 
130         //----------------------------------------------------------------------
131         // get A(:,k)
132         //----------------------------------------------------------------------
133 
134         int64_t pA_start = GBP (Ap, k, avlen) ;
135         int64_t pA_end   = GBP (Ap, k+1, avlen) ;
136         int64_t p = pA_start ;
137         int64_t cjnz = 0 ;
138         int64_t ajnz = pA_end - pA_start ;
139         bool found = false ;
140 
141         if (ajnz > 0)
142         {
143 
144             //------------------------------------------------------------------
145             // search for the entry A(i,k)
146             //------------------------------------------------------------------
147 
148             int64_t ifirst = GBI (Ai, pA_start, avlen) ;
149             int64_t ilast  = GBI (Ai, pA_end-1, avlen) ;
150 
151             #if defined ( GB_RESIZE_SELECTOR )
152             int64_t i = ithunk ;
153             #else
154             int64_t j = GBH (Ah, k) ;
155             int64_t i = j-ithunk ;
156             #endif
157 
158             if (i < ifirst)
159             {
160                 // all entries in A(:,k) come after i
161                 ;
162             }
163             else if (i > ilast)
164             {
165                 // all entries in A(:,k) come before i
166                 p = pA_end ;
167             }
168             else if (ajnz == avlen)
169             {
170                 // A(:,k) is dense
171                 found = true ;
172                 p += i ;
173                 ASSERT (GBI (Ai, p, avlen) == i) ;
174             }
175             else
176             {
177                 // binary search for A (i,k)
178                 int64_t pright = pA_end - 1 ;
179                 GB_SPLIT_BINARY_SEARCH (i, Ai, p, pright, found) ;
180             }
181 
182             #if defined ( GB_TRIL_SELECTOR )
183 
184                 // keep p to pA_end-1
185                 cjnz = pA_end - p ;
186 
187             #elif defined ( GB_TRIU_SELECTOR ) \
188                || defined ( GB_RESIZE_SELECTOR )
189 
190                 // if found, keep pA_start to p
191                 // else keep pA_start to p-1
192                 if (found)
193                 {
194                     p++ ;
195                     // now in both cases, keep pA_start to p-1
196                 }
197                 // keep pA_start to p-1
198                 cjnz = p - pA_start ;
199 
200             #elif defined ( GB_DIAG_SELECTOR )
201 
202                 // if found, keep p
203                 // else keep nothing
204                 cjnz = found ;
205                 if (!found) p = -1 ;
206                 // if (cjnz >= 0) keep p, else keep nothing
207 
208             #elif defined ( GB_OFFDIAG_SELECTOR )
209 
210                 // if found, keep pA_start to p-1 and p+1 to pA_end-1
211                 // else keep pA_start to pA_end
212                 cjnz = ajnz - found ;
213                 if (!found)
214                 {
215                     p = pA_end ;
216                     // now just keep pA_start to p-1; p+1 to pA_end is
217                     // now empty
218                 }
219                 // in both cases, keep pA_start to p-1 and
220                 // p+1 to pA_end-1.  If the entry is not found, then
221                 // p == pA_end, and all entries are kept.
222 
223             #endif
224         }
225 
226         //----------------------------------------------------------------------
227         // log the result for the kth vector
228         //----------------------------------------------------------------------
229 
230         Zp [k] = p ;
231         Cp [k] = cjnz ;
232     }
233 
234     //--------------------------------------------------------------------------
235     // compute Wfirst and Wlast for each task
236     //--------------------------------------------------------------------------
237 
238     // Wfirst [0..A_ntasks-1] and Wlast [0..A_ntasks-1] are required for
239     // constructing C_start_slice [0..A_ntasks-1] in GB_selector.
240 
241     for (int tid = 0 ; tid < A_ntasks ; tid++)
242     {
243 
244         // if kfirst > klast then task tid does no work at all
245         int64_t kfirst = kfirst_Aslice [tid] ;
246         int64_t klast  = klast_Aslice  [tid] ;
247         Wfirst [tid] = 0 ;
248         Wlast  [tid] = 0 ;
249 
250         if (kfirst <= klast)
251         {
252             int64_t pA_start = pstart_Aslice [tid] ;
253             int64_t pA_end   = GBP (Ap, kfirst+1, avlen) ;
254             pA_end = GB_IMIN (pA_end, pstart_Aslice [tid+1]) ;
255             if (pA_start < pA_end)
256             {
257                 #if defined ( GB_TRIL_SELECTOR )
258 
259                     // keep Zp [kfirst] to pA_end-1
260                     int64_t p = GB_IMAX (Zp [kfirst], pA_start) ;
261                     Wfirst [tid] = GB_IMAX (0, pA_end - p) ;
262 
263                 #elif defined ( GB_TRIU_SELECTOR ) \
264                    || defined ( GB_RESIZE_SELECTOR )
265 
266                     // keep pA_start to Zp [kfirst]-1
267                     int64_t p = GB_IMIN (Zp [kfirst], pA_end) ;
268                     Wfirst [tid] = GB_IMAX (0, p - pA_start) ;
269 
270                 #elif defined ( GB_DIAG_SELECTOR )
271 
272                     // task that owns the diagonal entry does this work
273                     int64_t p = Zp [kfirst] ;
274                     Wfirst [tid] = (pA_start <= p && p < pA_end) ? 1 : 0 ;
275 
276                 #elif defined ( GB_OFFDIAG_SELECTOR )
277 
278                     // keep pA_start to Zp [kfirst]-1
279                     int64_t p = GB_IMIN (Zp [kfirst], pA_end) ;
280                     Wfirst [tid] = GB_IMAX (0, p - pA_start) ;
281 
282                     // keep Zp [kfirst]+1 to pA_end-1
283                     p = GB_IMAX (Zp [kfirst]+1, pA_start) ;
284                     Wfirst [tid] += GB_IMAX (0, pA_end - p) ;
285 
286                 #endif
287             }
288         }
289 
290         if (kfirst < klast)
291         {
292             int64_t pA_start = GBP (Ap, klast, avlen) ;
293             int64_t pA_end   = pstart_Aslice [tid+1] ;
294             if (pA_start < pA_end)
295             {
296                 #if defined ( GB_TRIL_SELECTOR )
297 
298                     // keep Zp [klast] to pA_end-1
299                     int64_t p = GB_IMAX (Zp [klast], pA_start) ;
300                     Wlast [tid] = GB_IMAX (0, pA_end - p) ;
301 
302                 #elif defined ( GB_TRIU_SELECTOR ) \
303                    || defined ( GB_RESIZE_SELECTOR )
304 
305                     // keep pA_start to Zp [klast]-1
306                     int64_t p = GB_IMIN (Zp [klast], pA_end) ;
307                     Wlast [tid] = GB_IMAX (0, p - pA_start) ;
308 
309                 #elif defined ( GB_DIAG_SELECTOR )
310 
311                     // task that owns the diagonal entry does this work
312                     int64_t p = Zp [klast] ;
313                     Wlast [tid] = (pA_start <= p && p < pA_end) ? 1 : 0 ;
314 
315                 #elif defined ( GB_OFFDIAG_SELECTOR )
316 
317                     // keep pA_start to Zp [klast]-1
318                     int64_t p = GB_IMIN (Zp [klast], pA_end) ;
319                     Wlast [tid] = GB_IMAX (0, p - pA_start) ;
320 
321                     // keep Zp [klast]+1 to pA_end-1
322                     p = GB_IMAX (Zp [klast]+1, pA_start) ;
323                     Wlast [tid] += GB_IMAX (0, pA_end - p) ;
324 
325                 #endif
326             }
327         }
328     }
329 
330 #endif
331 
332