//------------------------------------------------------------------------------
// GB_transpose_bucket: transpose and optionally typecast and/or apply operator
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// C = A' or op(A'). Optionally typecasts from A->type to the new type ctype,
// and/or optionally applies a unary operator.

// If an operator z=op(x) is provided, the type of z must be the same as the
// type of C. The type of A must be compatible with the type of x (A is
// typecasted into the type of x). These conditions must be checked in the
// caller.

// This function is agnostic for the CSR/CSC format of C and A. C_is_csc is
// defined by the caller and assigned to C->is_csc, but otherwise unused.
// A->is_csc is ignored.

// The input can be hypersparse or non-hypersparse. The output C is always
// non-hypersparse, and never shallow. On input, C is a static header.

// If A is m-by-n in CSC format, with e nonzeros, the time and memory taken is
// O(m+n+e) if A is non-hypersparse, or O(m+e) if hypersparse. This is fine if
// most rows and columns of A are non-empty, but can be very costly if A or A'
// is hypersparse. In particular, if A is a non-hypersparse column vector with
// m >> e, the time and memory is O(m), which can be huge. Thus, for
// hypersparse matrices, or for very sparse matrices, the qsort method should
// be used instead (see GB_transpose).

// This method is parallel, but not highly scalable. At most O(e/m) threads
// are used.
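
// Outline of the method, as a summary of the code below (descriptive only):
// phase1 counts the entries in each row of A (the buckets), using either a
// single workspace of size vlen or one workspace per thread, and computes
// the vector pointers C->p as the cumulative sum of those counts.  phase2
// (GB_transpose_ix or GB_transpose_op) then places the entries of A into the
// buckets of C, applying the optional typecast and/or operator.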

#include "GB_transpose.h"

#define GB_FREE_WORK                                                    \
{                                                                       \
    if (Workspaces != NULL && Workspaces_size != NULL)                  \
    {                                                                   \
        for (int tid = 0 ; tid < nworkspaces ; tid++)                   \
        {                                                               \
            GB_FREE_WERK (&(Workspaces [tid]), Workspaces_size [tid]) ; \
        }                                                               \
    }                                                                   \
    GB_WERK_POP (A_slice, int64_t) ;                                    \
    GB_WERK_POP (Workspaces_size, size_t) ;                             \
    GB_WERK_POP (Workspaces, int64_t *) ;                               \
}

#define GB_FREE_ALL                                                     \
{                                                                       \
    GB_phbix_free (C) ;                                                 \
    GB_FREE_WORK ;                                                      \
}

GrB_Info GB_transpose_bucket    // bucket transpose; typecast and apply op
(
    GrB_Matrix C,               // output matrix (static header)
    const GrB_Type ctype,       // type of output matrix C
    const bool C_is_csc,        // format of output matrix C
    const GrB_Matrix A,         // input matrix
    // no operator is applied if both op1 and op2 are NULL
    const GrB_UnaryOp op1,      // unary operator to apply
    const GrB_BinaryOp op2,     // binary operator to apply
    const GxB_Scalar scalar,    // scalar to bind to binary operator
    bool binop_bind1st,         // if true, binop(x,A) else binop(A,y)
    const int nworkspaces,      // # of workspaces to use (1, or nthreads)
    const int nthreads,         // # of threads to use
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (C != NULL) ;
    ASSERT (C->static_header) ;
    ASSERT_TYPE_OK (ctype, "ctype for transpose", GB0) ;
    ASSERT_MATRIX_OK (A, "A input for transpose_bucket", GB0) ;
    ASSERT (!GB_PENDING (A)) ;
    ASSERT (!GB_ZOMBIES (A)) ;
    ASSERT (GB_JUMBLED_OK (A)) ;

    // if op1 and op2 are NULL, then no operator is applied

    // This method is only used when A is sparse or hypersparse.
    // The full and bitmap cases are handled in GB_transpose.
    ASSERT (!GB_IS_FULL (A)) ;
    ASSERT (!GB_IS_BITMAP (A)) ;
    ASSERT (GB_IS_SPARSE (A) || GB_IS_HYPERSPARSE (A)) ;

    GB_WERK_DECLARE (A_slice, int64_t) ;            // size nthreads+1
    GB_WERK_DECLARE (Workspaces, int64_t *) ;       // size nworkspaces
    GB_WERK_DECLARE (Workspaces_size, size_t) ;     // size nworkspaces

    //--------------------------------------------------------------------------
    // get A
    //--------------------------------------------------------------------------

    int64_t anz = GB_NNZ (A) ;
    int64_t vlen = A->vlen ;

    //--------------------------------------------------------------------------
    // determine the number of threads to use
    //--------------------------------------------------------------------------

    // # of threads to use in the O(vlen) loops below
    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
    int nth = GB_nthreads (vlen, chunk, nthreads_max) ;

    //--------------------------------------------------------------------------
    // allocate C: always sparse
    //--------------------------------------------------------------------------

    // The bucket transpose only works when C is sparse.
    // A can be sparse or hypersparse.

    // C->p is allocated but not initialized.
    GrB_Info info ;
    GB_OK (GB_new_bix (&C, true,    // sparse, static header
        ctype, A->vdim, vlen, GB_Ap_malloc, C_is_csc,
        GxB_SPARSE, true, A->hyper_switch, vlen, anz, true, Context)) ;

    int64_t *restrict Cp = C->p ;

    //--------------------------------------------------------------------------
    // allocate workspace
    //--------------------------------------------------------------------------

    GB_WERK_PUSH (Workspaces, nworkspaces, int64_t *) ;
    GB_WERK_PUSH (Workspaces_size, nworkspaces, size_t) ;
    if (Workspaces == NULL || Workspaces_size == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    bool ok = true ;
    for (int tid = 0 ; tid < nworkspaces ; tid++)
    {
        Workspaces [tid] = GB_MALLOC_WERK (vlen + 1, int64_t,
            &Workspaces_size [tid]) ;
        ok = ok && (Workspaces [tid] != NULL) ;
    }

    if (!ok)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //==========================================================================
    // phase1: symbolic analysis
    //==========================================================================

    // slice the A matrix, perfectly balanced for one task per thread
    GB_WERK_PUSH (A_slice, nthreads + 1, int64_t) ;
    if (A_slice == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }
    GB_pslice (A_slice, A->p, A->nvec, nthreads, true) ;
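
    // A_slice [tid] ... A_slice [tid+1]-1 is the range of vectors of A
    // handled by task tid, chosen so that each task gets roughly
    // anz/nthreads entries.  For example (illustrative only): with
    // A->nvec = 6 and nthreads = 2, A_slice might be {0, 4, 6} if the first
    // 4 vectors of A hold about half of its entries.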

    // sum up the row counts and find C->p
    if (nthreads == 1)
    {

        //----------------------------------------------------------------------
        // sequential method: A is not sliced
        //----------------------------------------------------------------------

        // Only requires a single int64 workspace of size vlen for a single
        // thread. The resulting C matrix is not jumbled.

        // compute the row counts of A. No need to scan the A->p pointers
        ASSERT (nworkspaces == 1) ;
        int64_t *restrict workspace = Workspaces [0] ;
        memset (workspace, 0, (vlen + 1) * sizeof (int64_t)) ;
        const int64_t *restrict Ai = A->i ;
        for (int64_t p = 0 ; p < anz ; p++)
        {
            int64_t i = Ai [p] ;
            workspace [i]++ ;
        }

        // cumulative sum of the workspace, and copy back into C->p
        GB_cumsum (workspace, vlen, &(C->nvec_nonempty), 1, NULL) ;
        memcpy (Cp, workspace, (vlen + 1) * sizeof (int64_t)) ;
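
        // For example (illustrative only): if vlen = 4 and the row indices
        // of the entries of A are Ai = {2, 0, 2, 3}, the row counts gathered
        // in the workspace are {1, 0, 2, 1}, and the cumulative sum gives
        // Cp = {0, 1, 1, 3, 4}: row 0 of C starts at position 0, row 1 is
        // empty, row 2 starts at position 1, and row 3 starts at position 3.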

    }
    else if (nworkspaces == 1)
    {

        //----------------------------------------------------------------------
        // atomic method: A is sliced but workspace is shared
        //----------------------------------------------------------------------

        // Only requires a single int64 workspace of size vlen, shared by all
        // threads. Scales well, but requires atomics. If the # of rows is
        // very small and the average row degree is high, this can be very slow
        // because of contention on the atomic workspace. Otherwise, it is
        // typically faster than the non-atomic method. The resulting C matrix
        // is jumbled.

        // compute the row counts of A. No need to scan the A->p pointers
        int64_t *restrict workspace = Workspaces [0] ;
        GB_memset (workspace, 0, (vlen + 1) * sizeof (int64_t), nth) ;
        const int64_t *restrict Ai = A->i ;
        int64_t p ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (p = 0 ; p < anz ; p++)
        {
            int64_t i = Ai [p] ;
            // update workspace [i]++ atomically:
            GB_ATOMIC_UPDATE
            workspace [i]++ ;
        }

        C->jumbled = true ;     // atomic transpose leaves C jumbled
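        // Note: in phase2 all threads share this single workspace as the
        // bucket offsets and place entries from different vectors of A into
        // the same rows of C concurrently, so the entries within each row of
        // C are not left in ascending order of column index.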

        // cumulative sum of the workspace, and copy back into C->p
        GB_cumsum (workspace, vlen, &(C->nvec_nonempty), nth, Context) ;
        GB_memcpy (Cp, workspace, (vlen + 1) * sizeof (int64_t), nth) ;

    }
    else
    {

        //----------------------------------------------------------------------
        // non-atomic method
        //----------------------------------------------------------------------

        // compute the row counts of A for each slice, one per thread. This
        // method is parallel, but not highly scalable. Each thread requires
        // an int64 workspace of size vlen, but no atomics are required. The
        // resulting C matrix is not jumbled, so this can save work if C needs
        // to be unjumbled later.

        ASSERT (nworkspaces == nthreads) ;
        const int64_t *restrict Ap = A->p ;
        const int64_t *restrict Ah = A->h ;
        const int64_t *restrict Ai = A->i ;

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (tid = 0 ; tid < nthreads ; tid++)
        {
            // get the row counts for this slice, of size A->vlen
            int64_t *restrict workspace = Workspaces [tid] ;
            memset (workspace, 0, (vlen + 1) * sizeof (int64_t)) ;
            for (int64_t k = A_slice [tid] ; k < A_slice [tid+1] ; k++)
            {
                // iterate over the entries in A(:,j)
                int64_t j = GBH (Ah, k) ;
                int64_t pA_start = Ap [k] ;
                int64_t pA_end = Ap [k+1] ;
                for (int64_t pA = pA_start ; pA < pA_end ; pA++)
                {
                    // count one more entry in C(i,:) for this slice
                    int64_t i = Ai [pA] ;
                    workspace [i]++ ;
                }
            }
        }

        // cumulative sum of the workspaces across the slices
        int64_t i ;
        #pragma omp parallel for num_threads(nth) schedule(static)
        for (i = 0 ; i < vlen ; i++)
        {
            int64_t s = 0 ;
            for (int tid = 0 ; tid < nthreads ; tid++)
            {
                int64_t *restrict workspace = Workspaces [tid] ;
                int64_t c = workspace [i] ;
                workspace [i] = s ;
                s += c ;
            }
            Cp [i] = s ;
        }
        Cp [vlen] = 0 ;

        // compute the vector pointers for C
        GB_cumsum (Cp, vlen, &(C->nvec_nonempty), nth, Context) ;

        // add Cp back to all Workspaces
        #pragma omp parallel for num_threads(nth) schedule(static)
        for (i = 0 ; i < vlen ; i++)
        {
            int64_t s = Cp [i] ;
            int64_t *restrict workspace = Workspaces [0] ;
            workspace [i] = s ;
            for (int tid = 1 ; tid < nthreads ; tid++)
            {
                int64_t *restrict workspace = Workspaces [tid] ;
                workspace [i] += s ;
            }
        }
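
        // For example (illustrative only), with vlen = 2 and nthreads = 2:
        // if the per-thread row counts are Workspaces [0] = {2,1} and
        // Workspaces [1] = {1,3}, the first pass above leaves
        // Workspaces [0] = {0,0}, Workspaces [1] = {2,1}, and Cp = {3,4,0}.
        // GB_cumsum then gives Cp = {0,3,7}, and the final pass adds Cp back,
        // giving Workspaces [0] = {0,3} and Workspaces [1] = {2,4}.  In
        // phase2, thread tid then places its entries for row i of C starting
        // at position Workspaces [tid][i].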
    }

    C->magic = GB_MAGIC ;

    //==========================================================================
    // phase2: transpose A into C
    //==========================================================================

    // transpose both the pattern and the values
    if (op1 == NULL && op2 == NULL)
    {
        // do not apply an operator; optional typecast to C->type
        GB_transpose_ix (C, A, Workspaces, A_slice, nworkspaces, nthreads) ;
    }
    else
    {
        // apply an operator, C has type op->ztype
        GB_transpose_op (C, op1, op2, scalar, binop_bind1st, A,
            Workspaces, A_slice, nworkspaces, nthreads) ;
    }

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORK ;
    ASSERT_MATRIX_OK (C, "C transpose of A", GB0) ;
    ASSERT (C->h == NULL) ;
    return (GrB_SUCCESS) ;
}
