//------------------------------------------------------------------------------
// GB_transpose_bucket: transpose and optionally typecast and/or apply operator
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// C = A' or op(A').  Optionally typecasts from A->type to the new type ctype,
// and/or optionally applies a unary operator.

// If an operator z=op(x) is provided, the type of z must be the same as the
// type of C.  The type of A must be compatible with the type of x (A is
// typecast into the type of x).  These conditions must be checked in the
// caller.
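// For example (illustrative): with op1 = GrB_ABS_FP64 and op2 = NULL, ctype
// must be GrB_FP64 (the ztype of the op), and the entries of A are typecast
// to double (the xtype of the op) before the op is applied, so that
// C = abs ((double) A').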

// This function is agnostic about the CSR/CSC format of C and A.  C_is_csc is
// defined by the caller and assigned to C->is_csc, but is otherwise unused.
// A->is_csc is ignored.

// The input can be hypersparse or non-hypersparse.  The output C is always
// non-hypersparse, and never shallow.  On input, C is a static header.

// If A is m-by-n in CSC format, with e nonzeros, the time and memory taken is
// O(m+n+e) if A is non-hypersparse, or O(m+e) if hypersparse.  This is fine if
// most rows and columns of A are non-empty, but can be very costly if A or A'
// is hypersparse.  In particular, if A is a non-hypersparse column vector with
// m >> e, the time and memory is O(m), which can be huge.  Thus, for
// hypersparse matrices, or for very sparse matrices, the qsort method should
// be used instead (see GB_transpose).

// This method is parallel, but not highly scalable.  At most O(e/m) threads
// are used.
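
// In outline, the bucket method is a two-pass counting transpose: count how
// many entries fall in each row of A, convert those counts into the vector
// pointers Cp of C by a cumulative sum, and then scatter each entry into its
// bucket.  A minimal sequential sketch (illustrative only; not compiled here;
// "count" is a name local to this comment, while Ai, anz, and vlen are as in
// the code below):
//
//      int64_t *count = calloc (vlen + 1, sizeof (int64_t)) ;
//      for (int64_t p = 0 ; p < anz ; p++) count [Ai [p]]++ ;   // phase 1
//      for (int64_t i = 0, s = 0 ; i <= vlen ; i++)             // cumsum:
//      {   int64_t c = count [i] ; count [i] = s ; s += c ; }   // count == Cp
//      // phase 2: C(j,i) = A(i,j), placed at slot count [i]++ of C->i, C->x
//
// The parallel variants below differ only in how the counts and the
// cumulative sum are computed (a shared atomic workspace, or one workspace
// per thread).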

#include "GB_transpose.h"

#define GB_FREE_WORK                                                    \
{                                                                       \
    if (Workspaces != NULL && Workspaces_size != NULL)                  \
    {                                                                   \
        for (int tid = 0 ; tid < nworkspaces ; tid++)                   \
        {                                                               \
            GB_FREE_WERK (&(Workspaces [tid]), Workspaces_size [tid]) ; \
        }                                                               \
    }                                                                   \
    GB_WERK_POP (A_slice, int64_t) ;                                    \
    GB_WERK_POP (Workspaces_size, size_t) ;                             \
    GB_WERK_POP (Workspaces, int64_t *) ;                               \
}

#define GB_FREE_ALL                                                     \
{                                                                       \
    GB_phbix_free (C) ;                                                 \
    GB_FREE_WORK ;                                                      \
}

GrB_Info GB_transpose_bucket    // bucket transpose; typecast and apply op
(
    GrB_Matrix C,               // output matrix (static header)
    const GrB_Type ctype,       // type of output matrix C
    const bool C_is_csc,        // format of output matrix C
    const GrB_Matrix A,         // input matrix
        // no operator is applied if both op1 and op2 are NULL
        const GrB_UnaryOp op1,          // unary operator to apply
        const GrB_BinaryOp op2,         // binary operator to apply
        const GxB_Scalar scalar,        // scalar to bind to binary operator
        bool binop_bind1st,             // if true, binop(x,A) else binop(A,y)
    const int nworkspaces,      // # of workspaces to use (1, or nthreads)
    const int nthreads,         // # of threads to use
    GB_Context Context
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    ASSERT (C != NULL) ;
    ASSERT (C->static_header) ;
    ASSERT_TYPE_OK (ctype, "ctype for transpose", GB0) ;
    ASSERT_MATRIX_OK (A, "A input for transpose_bucket", GB0) ;
    ASSERT (!GB_PENDING (A)) ;
    ASSERT (!GB_ZOMBIES (A)) ;
    ASSERT (GB_JUMBLED_OK (A)) ;

    // if op1 and op2 are NULL, then no operator is applied

    // This method is only used when A is sparse or hypersparse.
    // The full and bitmap cases are handled in GB_transpose.
    ASSERT (!GB_IS_FULL (A)) ;
    ASSERT (!GB_IS_BITMAP (A)) ;
    ASSERT (GB_IS_SPARSE (A) || GB_IS_HYPERSPARSE (A)) ;

    GB_WERK_DECLARE (A_slice, int64_t) ;            // size nthreads+1
    GB_WERK_DECLARE (Workspaces, int64_t *) ;       // size nworkspaces
    GB_WERK_DECLARE (Workspaces_size, size_t) ;     // size nworkspaces

    //--------------------------------------------------------------------------
    // get A
    //--------------------------------------------------------------------------

    int64_t anz = GB_NNZ (A) ;
    int64_t vlen = A->vlen ;

    //--------------------------------------------------------------------------
    // determine the number of threads to use
    //--------------------------------------------------------------------------

    // # of threads to use in the O(vlen) loops below
    GB_GET_NTHREADS_MAX (nthreads_max, chunk, Context) ;
    int nth = GB_nthreads (vlen, chunk, nthreads_max) ;

    //--------------------------------------------------------------------------
    // allocate C: always sparse
    //--------------------------------------------------------------------------

    // The bucket transpose only works when C is sparse.
    // A can be sparse or hypersparse.

    // C->p is allocated but not initialized.
    GrB_Info info ;
    GB_OK (GB_new_bix (&C, true, // sparse, static header
        ctype, A->vdim, vlen, GB_Ap_malloc, C_is_csc,
        GxB_SPARSE, true, A->hyper_switch, vlen, anz, true, Context)) ;

    int64_t *restrict Cp = C->p ;

    //--------------------------------------------------------------------------
    // allocate workspace
    //--------------------------------------------------------------------------

    GB_WERK_PUSH (Workspaces, nworkspaces, int64_t *) ;
    GB_WERK_PUSH (Workspaces_size, nworkspaces, size_t) ;
    if (Workspaces == NULL || Workspaces_size == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    bool ok = true ;
    for (int tid = 0 ; tid < nworkspaces ; tid++)
    {
        Workspaces [tid] = GB_MALLOC_WERK (vlen + 1, int64_t,
            &Workspaces_size [tid]) ;
        ok = ok && (Workspaces [tid] != NULL) ;
    }

    if (!ok)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }

    //==========================================================================
    // phase1: symbolic analysis
    //==========================================================================

    // slice the A matrix, perfectly balanced for one task per thread
    GB_WERK_PUSH (A_slice, nthreads + 1, int64_t) ;
    if (A_slice == NULL)
    {
        // out of memory
        GB_FREE_ALL ;
        return (GrB_OUT_OF_MEMORY) ;
    }
    GB_pslice (A_slice, A->p, A->nvec, nthreads, true) ;
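    // A_slice [tid] .. A_slice [tid+1]-1 is the set of vectors of A handled
    // by task tid; the slices are balanced by entry count, not vector count.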

    // sum up the row counts and find C->p
    if (nthreads == 1)
    {

        //----------------------------------------------------------------------
        // sequential method: A is not sliced
        //----------------------------------------------------------------------

        // Only requires a single int64 workspace of size vlen for a single
        // thread.  The resulting C matrix is not jumbled.

        // compute the row counts of A.  No need to scan the A->p pointers
        ASSERT (nworkspaces == 1) ;
        int64_t *restrict workspace = Workspaces [0] ;
        memset (workspace, 0, (vlen + 1) * sizeof (int64_t)) ;
        const int64_t *restrict Ai = A->i ;
        for (int64_t p = 0 ; p < anz ; p++)
        {
            int64_t i = Ai [p] ;
            workspace [i]++ ;
        }
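
        // For example (illustrative): if vlen = 4 and Ai = {1,3,1,0}, then
        // workspace = {1,2,0,1,0} after the loop above; the cumulative sum
        // below turns it into {0,1,3,3,4}, which is exactly Cp.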

        // cumulative sum of the workspace, and copy back into C->p
        GB_cumsum (workspace, vlen, &(C->nvec_nonempty), 1, NULL) ;
        memcpy (Cp, workspace, (vlen + 1) * sizeof (int64_t)) ;

    }
    else if (nworkspaces == 1)
    {

        //----------------------------------------------------------------------
        // atomic method: A is sliced but workspace is shared
        //----------------------------------------------------------------------

        // Only requires a single int64 workspace of size vlen, shared by all
        // threads.  Scales well, but requires atomics.  If the # of rows is
        // very small and the average row degree is high, this can be very slow
        // because of contention on the atomic workspace.  Otherwise, it is
        // typically faster than the non-atomic method.  The resulting C matrix
        // is jumbled.

        // compute the row counts of A.  No need to scan the A->p pointers
        int64_t *restrict workspace = Workspaces [0] ;
        GB_memset (workspace, 0, (vlen + 1) * sizeof (int64_t), nth) ;
        const int64_t *restrict Ai = A->i ;
        int64_t p ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (p = 0 ; p < anz ; p++)
        {
            int64_t i = Ai [p] ;
            // update workspace [i] atomically:
            GB_ATOMIC_UPDATE
            workspace [i]++ ;
        }
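
        // With a single workspace shared by all threads, phase 2 below also
        // shares it, and each thread claims its slot in C with an atomic
        // increment of the shared counters.  The order in which threads claim
        // slots is nondeterministic, so the entries within each vector of C
        // are left unsorted; hence C is flagged as jumbled here.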

        C->jumbled = true ; // atomic transpose leaves C jumbled

        // cumulative sum of the workspace, and copy back into C->p
        GB_cumsum (workspace, vlen, &(C->nvec_nonempty), nth, Context) ;
        GB_memcpy (Cp, workspace, (vlen + 1) * sizeof (int64_t), nth) ;

    }
    else
    {

        //----------------------------------------------------------------------
        // non-atomic method
        //----------------------------------------------------------------------

        // Compute the row counts of A for each slice, one per thread.  This
        // method is parallel, but not highly scalable.  Each thread requires
        // an int64 workspace of size vlen, but no atomics are required.  The
        // resulting C matrix is not jumbled, so this can save work if C needs
        // to be unjumbled later.
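
        // In outline: each thread counts the entries in each row of A within
        // its own slice; an exclusive scan across the threads (per row) plus
        // a cumulative sum over the rows then converts each workspace into
        // the exact positions at which its thread will place entries in C
        // during phase 2.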

        ASSERT (nworkspaces == nthreads) ;
        const int64_t *restrict Ap = A->p ;
        const int64_t *restrict Ah = A->h ;
        const int64_t *restrict Ai = A->i ;

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (tid = 0 ; tid < nthreads ; tid++)
        {
            // get the row counts for this slice, of size A->vlen
            int64_t *restrict workspace = Workspaces [tid] ;
            memset (workspace, 0, (vlen + 1) * sizeof (int64_t)) ;
            for (int64_t k = A_slice [tid] ; k < A_slice [tid+1] ; k++)
            {
                // iterate over the entries in A(:,j)
                int64_t j = GBH (Ah, k) ;
                int64_t pA_start = Ap [k] ;
                int64_t pA_end = Ap [k+1] ;
                for (int64_t pA = pA_start ; pA < pA_end ; pA++)
                {
                    // count one more entry in C(i,:) for this slice
                    int64_t i = Ai [pA] ;
                    workspace [i]++ ;
                }
            }
        }

        // cumulative sum of the workspaces across the slices
        int64_t i ;
        #pragma omp parallel for num_threads(nth) schedule(static)
        for (i = 0 ; i < vlen ; i++)
        {
            int64_t s = 0 ;
            for (int tid = 0 ; tid < nthreads ; tid++)
            {
                int64_t *restrict workspace = Workspaces [tid] ;
                int64_t c = workspace [i] ;
                workspace [i] = s ;
                s += c ;
            }
            Cp [i] = s ;
        }
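
        // Now Cp [i] is the total number of entries in row i of A, summed
        // over all slices, and Workspaces [tid][i] is the number of entries
        // in row i that appear in slices 0 to tid-1 (an exclusive scan
        // across the threads).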
        Cp [vlen] = 0 ;

        // compute the vector pointers for C
        GB_cumsum (Cp, vlen, &(C->nvec_nonempty), nth, Context) ;

        // add Cp back to all Workspaces
        #pragma omp parallel for num_threads(nth) schedule(static)
        for (i = 0 ; i < vlen ; i++)
        {
            int64_t s = Cp [i] ;
            int64_t *restrict workspace = Workspaces [0] ;
            workspace [i] = s ;
            for (int tid = 1 ; tid < nthreads ; tid++)
            {
                int64_t *restrict workspace = Workspaces [tid] ;
                workspace [i] += s ;
            }
        }
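
        // Each Workspaces [tid][i] is now Cp [i] plus the number of entries
        // in row i owned by the slices before slice tid: the position in
        // C->i and C->x where thread tid will write its first entry with row
        // index i during phase 2.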
    }

    C->magic = GB_MAGIC ;       // C->p is now computed

    //==========================================================================
    // phase2: transpose A into C
    //==========================================================================

    // transpose both the pattern and the values
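    // In both cases the pattern and values are scattered in a single pass
    // over A: each entry A(i,j) in a slice is placed at the position recorded
    // in that slice's workspace for row i, which is then incremented
    // (atomically if the single workspace is shared by all threads).  See
    // GB_transpose_ix and GB_transpose_op for the details.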
    if (op1 == NULL && op2 == NULL)
    {
        // do not apply an operator; optional typecast to C->type
        GB_transpose_ix (C, A, Workspaces, A_slice, nworkspaces, nthreads) ;
    }
    else
    {
        // apply an operator, C has type op->ztype
        GB_transpose_op (C, op1, op2, scalar, binop_bind1st, A,
            Workspaces, A_slice, nworkspaces, nthreads) ;
    }

    //--------------------------------------------------------------------------
    // free workspace and return result
    //--------------------------------------------------------------------------

    GB_FREE_WORK ;
    ASSERT_MATRIX_OK (C, "C transpose of A", GB0) ;
    ASSERT (C->h == NULL) ;
    return (GrB_SUCCESS) ;
}