1 //------------------------------------------------------------------------------ 2 // GB_select_phase1: count entries in each vector for C=select(A,thunk) 3 //------------------------------------------------------------------------------ 4 5 // SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved. 6 // SPDX-License-Identifier: Apache-2.0 7 8 //------------------------------------------------------------------------------ 9 10 const int64_t *restrict kfirst_Aslice = A_ek_slicing ; 11 const int64_t *restrict klast_Aslice = A_ek_slicing + A_ntasks ; 12 const int64_t *restrict pstart_Aslice = A_ek_slicing + A_ntasks * 2 ; 13 14 #if defined ( GB_ENTRY_SELECTOR ) 15 16 //-------------------------------------------------------------------------- 17 // entry selector 18 //-------------------------------------------------------------------------- 19 20 ASSERT (GB_JUMBLED_OK (A)) ; 21 22 // The count of live entries kth vector A(:,k) is reduced to the kth scalar 23 // Cp(k). Each thread computes the reductions on roughly the same number 24 // of entries, which means that a vector A(:,k) may be reduced by more than 25 // one thread. The first vector A(:,kfirst) reduced by thread tid may be 26 // partial, where the prior thread tid-1 (and other prior threads) may also 27 // do some of the reductions for this same vector A(:,kfirst). The thread 28 // tid reduces all vectors A(:,k) for k in the range kfirst+1 to klast-1. 29 // The last vector A(:,klast) reduced by thread tid may also be partial. 30 // Thread tid+1, and following threads, may also do some of the reduces for 31 // A(:,klast). 32 33 //-------------------------------------------------------------------------- 34 // get A 35 //-------------------------------------------------------------------------- 36 37 const int64_t *restrict Ap = A->p ; 38 const int64_t *restrict Ah = A->h ; 39 const int64_t *restrict Ai = A->i ; 40 const GB_ATYPE *restrict Ax = (GB_ATYPE *) A->x ; 41 size_t asize = A->type->size ; 42 int64_t avlen = A->vlen ; 43 int64_t avdim = A->vdim ; 44 ASSERT (GB_JUMBLED_OK (A)) ; 45 46 //-------------------------------------------------------------------------- 47 // reduce each slice 48 //-------------------------------------------------------------------------- 49 50 // each thread reduces its own part in parallel 51 int tid ; 52 #pragma omp parallel for num_threads(A_nthreads) schedule(dynamic,1) 53 for (tid = 0 ; tid < A_ntasks ; tid++) 54 { 55 56 // if kfirst > klast then thread tid does no work at all 57 int64_t kfirst = kfirst_Aslice [tid] ; 58 int64_t klast = klast_Aslice [tid] ; 59 Wfirst [tid] = 0 ; 60 Wlast [tid] = 0 ; 61 62 //---------------------------------------------------------------------- 63 // reduce vectors kfirst to klast 64 //---------------------------------------------------------------------- 65 66 for (int64_t k = kfirst ; k <= klast ; k++) 67 { 68 69 //------------------------------------------------------------------ 70 // find the part of A(:,k) to be reduced by this thread 71 //------------------------------------------------------------------ 72 73 GB_GET_J ; // int64_t j = GBH (Ah, k) ; but for user selectop only 74 int64_t pA, pA_end ; 75 GB_get_pA (&pA, &pA_end, tid, k, 76 kfirst, klast, pstart_Aslice, Ap, avlen) ; 77 78 //------------------------------------------------------------------ 79 // count entries in Ax [pA ... pA_end-1] 80 //------------------------------------------------------------------ 81 82 int64_t cjnz = 0 ; 83 for ( ; pA < pA_end ; pA++) 84 { 85 if (GB_TEST_VALUE_OF_ENTRY (pA)) cjnz++ ; 86 } 87 if (k == kfirst) 88 { 89 Wfirst [tid] = cjnz ; 90 } 91 else if (k == klast) 92 { 93 Wlast [tid] = cjnz ; 94 } 95 else 96 { 97 Cp [k] = cjnz ; 98 } 99 } 100 } 101 102 //-------------------------------------------------------------------------- 103 // reduce the first and last vector of each slice using a single thread 104 //-------------------------------------------------------------------------- 105 106 GB_ek_slice_merge1 (Cp, Wfirst, Wlast, A_ek_slicing, A_ntasks) ; 107 108 #else 109 110 //-------------------------------------------------------------------------- 111 // positional selector (tril, triu, diag, offdiag, resize) 112 //-------------------------------------------------------------------------- 113 114 const int64_t *restrict Ap = A->p ; 115 const int64_t *restrict Ah = A->h ; 116 const int64_t *restrict Ai = A->i ; 117 int64_t anvec = A->nvec ; 118 int64_t avlen = A->vlen ; 119 ASSERT (!GB_JUMBLED (A)) ; 120 121 //-------------------------------------------------------------------------- 122 // tril, triu, diag, offdiag, resize: binary search in each vector 123 //-------------------------------------------------------------------------- 124 125 int64_t k ; 126 #pragma omp parallel for num_threads(A_nthreads) schedule(guided) 127 for (k = 0 ; k < anvec ; k++) 128 { 129 130 //---------------------------------------------------------------------- 131 // get A(:,k) 132 //---------------------------------------------------------------------- 133 134 int64_t pA_start = GBP (Ap, k, avlen) ; 135 int64_t pA_end = GBP (Ap, k+1, avlen) ; 136 int64_t p = pA_start ; 137 int64_t cjnz = 0 ; 138 int64_t ajnz = pA_end - pA_start ; 139 bool found = false ; 140 141 if (ajnz > 0) 142 { 143 144 //------------------------------------------------------------------ 145 // search for the entry A(i,k) 146 //------------------------------------------------------------------ 147 148 int64_t ifirst = GBI (Ai, pA_start, avlen) ; 149 int64_t ilast = GBI (Ai, pA_end-1, avlen) ; 150 151 #if defined ( GB_RESIZE_SELECTOR ) 152 int64_t i = ithunk ; 153 #else 154 int64_t j = GBH (Ah, k) ; 155 int64_t i = j-ithunk ; 156 #endif 157 158 if (i < ifirst) 159 { 160 // all entries in A(:,k) come after i 161 ; 162 } 163 else if (i > ilast) 164 { 165 // all entries in A(:,k) come before i 166 p = pA_end ; 167 } 168 else if (ajnz == avlen) 169 { 170 // A(:,k) is dense 171 found = true ; 172 p += i ; 173 ASSERT (GBI (Ai, p, avlen) == i) ; 174 } 175 else 176 { 177 // binary search for A (i,k) 178 int64_t pright = pA_end - 1 ; 179 GB_SPLIT_BINARY_SEARCH (i, Ai, p, pright, found) ; 180 } 181 182 #if defined ( GB_TRIL_SELECTOR ) 183 184 // keep p to pA_end-1 185 cjnz = pA_end - p ; 186 187 #elif defined ( GB_TRIU_SELECTOR ) \ 188 || defined ( GB_RESIZE_SELECTOR ) 189 190 // if found, keep pA_start to p 191 // else keep pA_start to p-1 192 if (found) 193 { 194 p++ ; 195 // now in both cases, keep pA_start to p-1 196 } 197 // keep pA_start to p-1 198 cjnz = p - pA_start ; 199 200 #elif defined ( GB_DIAG_SELECTOR ) 201 202 // if found, keep p 203 // else keep nothing 204 cjnz = found ; 205 if (!found) p = -1 ; 206 // if (cjnz >= 0) keep p, else keep nothing 207 208 #elif defined ( GB_OFFDIAG_SELECTOR ) 209 210 // if found, keep pA_start to p-1 and p+1 to pA_end-1 211 // else keep pA_start to pA_end 212 cjnz = ajnz - found ; 213 if (!found) 214 { 215 p = pA_end ; 216 // now just keep pA_start to p-1; p+1 to pA_end is 217 // now empty 218 } 219 // in both cases, keep pA_start to p-1 and 220 // p+1 to pA_end-1. If the entry is not found, then 221 // p == pA_end, and all entries are kept. 222 223 #endif 224 } 225 226 //---------------------------------------------------------------------- 227 // log the result for the kth vector 228 //---------------------------------------------------------------------- 229 230 Zp [k] = p ; 231 Cp [k] = cjnz ; 232 } 233 234 //-------------------------------------------------------------------------- 235 // compute Wfirst and Wlast for each task 236 //-------------------------------------------------------------------------- 237 238 // Wfirst [0..A_ntasks-1] and Wlast [0..A_ntasks-1] are required for 239 // constructing C_start_slice [0..A_ntasks-1] in GB_selector. 240 241 for (int tid = 0 ; tid < A_ntasks ; tid++) 242 { 243 244 // if kfirst > klast then task tid does no work at all 245 int64_t kfirst = kfirst_Aslice [tid] ; 246 int64_t klast = klast_Aslice [tid] ; 247 Wfirst [tid] = 0 ; 248 Wlast [tid] = 0 ; 249 250 if (kfirst <= klast) 251 { 252 int64_t pA_start = pstart_Aslice [tid] ; 253 int64_t pA_end = GBP (Ap, kfirst+1, avlen) ; 254 pA_end = GB_IMIN (pA_end, pstart_Aslice [tid+1]) ; 255 if (pA_start < pA_end) 256 { 257 #if defined ( GB_TRIL_SELECTOR ) 258 259 // keep Zp [kfirst] to pA_end-1 260 int64_t p = GB_IMAX (Zp [kfirst], pA_start) ; 261 Wfirst [tid] = GB_IMAX (0, pA_end - p) ; 262 263 #elif defined ( GB_TRIU_SELECTOR ) \ 264 || defined ( GB_RESIZE_SELECTOR ) 265 266 // keep pA_start to Zp [kfirst]-1 267 int64_t p = GB_IMIN (Zp [kfirst], pA_end) ; 268 Wfirst [tid] = GB_IMAX (0, p - pA_start) ; 269 270 #elif defined ( GB_DIAG_SELECTOR ) 271 272 // task that owns the diagonal entry does this work 273 int64_t p = Zp [kfirst] ; 274 Wfirst [tid] = (pA_start <= p && p < pA_end) ? 1 : 0 ; 275 276 #elif defined ( GB_OFFDIAG_SELECTOR ) 277 278 // keep pA_start to Zp [kfirst]-1 279 int64_t p = GB_IMIN (Zp [kfirst], pA_end) ; 280 Wfirst [tid] = GB_IMAX (0, p - pA_start) ; 281 282 // keep Zp [kfirst]+1 to pA_end-1 283 p = GB_IMAX (Zp [kfirst]+1, pA_start) ; 284 Wfirst [tid] += GB_IMAX (0, pA_end - p) ; 285 286 #endif 287 } 288 } 289 290 if (kfirst < klast) 291 { 292 int64_t pA_start = GBP (Ap, klast, avlen) ; 293 int64_t pA_end = pstart_Aslice [tid+1] ; 294 if (pA_start < pA_end) 295 { 296 #if defined ( GB_TRIL_SELECTOR ) 297 298 // keep Zp [klast] to pA_end-1 299 int64_t p = GB_IMAX (Zp [klast], pA_start) ; 300 Wlast [tid] = GB_IMAX (0, pA_end - p) ; 301 302 #elif defined ( GB_TRIU_SELECTOR ) \ 303 || defined ( GB_RESIZE_SELECTOR ) 304 305 // keep pA_start to Zp [klast]-1 306 int64_t p = GB_IMIN (Zp [klast], pA_end) ; 307 Wlast [tid] = GB_IMAX (0, p - pA_start) ; 308 309 #elif defined ( GB_DIAG_SELECTOR ) 310 311 // task that owns the diagonal entry does this work 312 int64_t p = Zp [klast] ; 313 Wlast [tid] = (pA_start <= p && p < pA_end) ? 1 : 0 ; 314 315 #elif defined ( GB_OFFDIAG_SELECTOR ) 316 317 // keep pA_start to Zp [klast]-1 318 int64_t p = GB_IMIN (Zp [klast], pA_end) ; 319 Wlast [tid] = GB_IMAX (0, p - pA_start) ; 320 321 // keep Zp [klast]+1 to pA_end-1 322 p = GB_IMAX (Zp [klast]+1, pA_start) ; 323 Wlast [tid] += GB_IMAX (0, pA_end - p) ; 324 325 #endif 326 } 327 } 328 } 329 330 #endif 331 332