//------------------------------------------------------------------------------
// GB_unop_transpose: C=op(cast(A')), transpose, typecast, and apply op
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// TODO: if OP is ONE and uniform-valued matrices are exploited, then the
// values can be computed in O(1) time.

{

    // Ax is unused for some uses of this template
    #include "GB_unused.h"

    //--------------------------------------------------------------------------
    // get A and C
    //--------------------------------------------------------------------------

    const GB_ATYPE *restrict Ax = (GB_ATYPE *) A->x ;
    GB_CTYPE *restrict Cx = (GB_CTYPE *) C->x ;

    //--------------------------------------------------------------------------
    // C = op (cast (A'))
    //--------------------------------------------------------------------------

    if (Workspaces == NULL)
    {

        //----------------------------------------------------------------------
        // A and C are both full or both bitmap
        //----------------------------------------------------------------------

        // A is avlen-by-avdim; C is avdim-by-avlen
        int64_t avlen = A->vlen ;
        int64_t avdim = A->vdim ;
        int64_t anz = avlen * avdim ;
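        // anz is the number of positions in A and in C (every position holds
        // an entry if A is full; if A is bitmap, Ab marks which positions do)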

        const int8_t *restrict Ab = A->b ;
        int8_t *restrict Cb = C->b ;
        ASSERT ((Cb == NULL) == (Ab == NULL)) ;

        // TODO: it would be faster to do this by tiles, not by rows/columns,
        // for large matrices, but in most cases A and C will be tall-and-thin
        // or short-and-fat.

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (tid = 0 ; tid < nthreads ; tid++)
        {
            int64_t pC_start, pC_end ;
            GB_PARTITION (pC_start, pC_end, anz, tid, nthreads) ;
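            // Each thread handles a contiguous range [pC_start, pC_end) of
            // the positions of C.  The index mapping below follows the
            // comments inside the loop: i = pC % avdim, j = pC / avdim, and
            // pA = j + i*avlen.  Illustrative example only: if A is 4-by-3
            // (avlen = 4, avdim = 3), then C is 3-by-4, and position pC = 7 of
            // C is C(1,2), taken from position pA = 2 + 1*4 = 6 of A, which is
            // A(2,1).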
            if (Ab == NULL)
            {
                // A and C are both full
                for (int64_t pC = pC_start ; pC < pC_end ; pC++)
                {
                    // get i and j of the entry C(i,j)
                    // i = (pC % avdim) ;
                    // j = (pC / avdim) ;
                    // find the position of the entry A(j,i)
                    // pA = j + i * avlen
                    // Cx [pC] = op (Ax [pA])
                    GB_CAST_OP (pC, ((pC / avdim) + (pC % avdim) * avlen)) ;
                }
            }
            else
            {
                // A and C are both bitmap
                for (int64_t pC = pC_start ; pC < pC_end ; pC++)
                {
                    // get i and j of the entry C(i,j)
                    // i = (pC % avdim) ;
                    // j = (pC / avdim) ;
                    // find the position of the entry A(j,i)
                    // pA = j + i * avlen
                    int64_t pA = ((pC / avdim) + (pC % avdim) * avlen) ;
                    int8_t cij_exists = Ab [pA] ;
                    Cb [pC] = cij_exists ;
                    if (cij_exists)
                    {
                        // Cx [pC] = op (Ax [pA])
                        GB_CAST_OP (pC, pA) ;
                    }
                }
            }
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // A is sparse or hypersparse; C is sparse
        //----------------------------------------------------------------------

        const int64_t *restrict Ap = A->p ;
        const int64_t *restrict Ah = A->h ;
        const int64_t *restrict Ai = A->i ;
        const int64_t anvec = A->nvec ;
        int64_t *restrict Ci = C->i ;
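
        // Each workspace is assumed to have been initialized by the caller so
        // that workspace [i] is the position in C of the next entry to be
        // placed in C(:,i), which is constructed from row i of A.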

        if (nthreads == 1)
        {

            //------------------------------------------------------------------
            // sequential method
            //------------------------------------------------------------------

            int64_t *restrict workspace = Workspaces [0] ;
            for (int64_t k = 0 ; k < anvec ; k++)
            {
                // iterate over the entries in A(:,j)
                int64_t j = GBH (Ah, k) ;
                int64_t pA_start = Ap [k] ;
                int64_t pA_end = Ap [k+1] ;
                for (int64_t pA = pA_start ; pA < pA_end ; pA++)
                {
                    // C(j,i) = A(i,j)
                    int64_t i = Ai [pA] ;
                    int64_t pC = workspace [i]++ ;
                    Ci [pC] = j ;
                    // Cx [pC] = op (Ax [pA])
                    GB_CAST_OP (pC, pA) ;
                }
            }

        }
        else if (nworkspaces == 1)
        {

            //------------------------------------------------------------------
            // atomic method
            //------------------------------------------------------------------

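            // All threads share a single workspace, so two threads may place
            // entries into the same vector C(:,i) at the same time; the
            // increment of workspace [i] must therefore be done atomically.
            // As a result, the entries within a vector of C may appear out of
            // order.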
            int64_t *restrict workspace = Workspaces [0] ;
            int tid ;
            #pragma omp parallel for num_threads(nthreads) schedule(static)
            for (tid = 0 ; tid < nthreads ; tid++)
            {
                for (int64_t k = A_slice [tid] ; k < A_slice [tid+1] ; k++)
                {
                    // iterate over the entries in A(:,j)
                    int64_t j = GBH (Ah, k) ;
                    int64_t pA_start = Ap [k] ;
                    int64_t pA_end = Ap [k+1] ;
                    for (int64_t pA = pA_start ; pA < pA_end ; pA++)
                    {
                        // C(j,i) = A(i,j)
                        int64_t i = Ai [pA] ;
                        // do this atomically:  pC = workspace [i]++
                        int64_t pC ;
                        GB_ATOMIC_CAPTURE_INC64 (pC, workspace [i]) ;
                        Ci [pC] = j ;
                        // Cx [pC] = op (Ax [pA])
                        GB_CAST_OP (pC, pA) ;
                    }
                }
            }

        }
        else
        {

            //------------------------------------------------------------------
            // non-atomic method
            //------------------------------------------------------------------

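            // Each thread has its own workspace, assumed to have been
            // pre-computed by the caller so that the threads place their
            // entries into disjoint positions of C; no atomics are needed.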
            int tid ;
            #pragma omp parallel for num_threads(nthreads) schedule(static)
            for (tid = 0 ; tid < nthreads ; tid++)
            {
                int64_t *restrict workspace = Workspaces [tid] ;
                for (int64_t k = A_slice [tid] ; k < A_slice [tid+1] ; k++)
                {
                    // iterate over the entries in A(:,j)
                    int64_t j = GBH (Ah, k) ;
                    int64_t pA_start = Ap [k] ;
                    int64_t pA_end = Ap [k+1] ;
                    for (int64_t pA = pA_start ; pA < pA_end ; pA++)
                    {
                        // C(j,i) = A(i,j)
                        int64_t i = Ai [pA] ;
                        int64_t pC = workspace [i]++ ;
                        Ci [pC] = j ;
                        // Cx [pC] = op (Ax [pA])
                        GB_CAST_OP (pC, pA) ;
                    }
                }
            }
        }
    }
}