//------------------------------------------------------------------------------
// GB_unop_transpose: C=op(cast(A')), transpose, typecast, and apply op
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2021, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// TODO:: if OP is ONE and uniform-valued matrices are exploited, then the
// values can be computed in O(1) time.

{

    // Ax unused for some uses of this template
    #include "GB_unused.h"

    //--------------------------------------------------------------------------
    // get A and C
    //--------------------------------------------------------------------------

    const GB_ATYPE *restrict Ax = (GB_ATYPE *) A->x ;
    GB_CTYPE *restrict Cx = (GB_CTYPE *) C->x ;

    //--------------------------------------------------------------------------
    // C = op (cast (A'))
    //--------------------------------------------------------------------------

    if (Workspaces == NULL)
    {

        //----------------------------------------------------------------------
        // A and C are both full or both bitmap
        //----------------------------------------------------------------------

        // A is avlen-by-avdim; C is avdim-by-avlen
        int64_t avlen = A->vlen ;
        int64_t avdim = A->vdim ;
        int64_t anz = avlen * avdim ;

        const int8_t *restrict Ab = A->b ;
        int8_t *restrict Cb = C->b ;
        ASSERT ((Cb == NULL) == (Ab == NULL)) ;

        //----------------------------------------------------------------------
        // A and C are both full or bitmap
        //----------------------------------------------------------------------

        // TODO: it would be faster to do this by tiles, not rows/columns, for
        // large matrices, but in most cases A and C will be tall-and-thin or
        // short-and-fat.
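
        // The entry of C at position pC lies in vector j = pC / avdim of C,
        // at index i = pC % avdim ; the same entry of A is A(j,i), at
        // position pA = j + i * avlen.  Worked example (illustration only):
        // with avlen = 2 and avdim = 3, pC = 4 is the entry C(1,1), which
        // comes from A(1,1) at pA = (4/3) + (4%3)*2 = 3.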

        int tid ;
        #pragma omp parallel for num_threads(nthreads) schedule(static)
        for (tid = 0 ; tid < nthreads ; tid++)
        {
            int64_t pC_start, pC_end ;
            GB_PARTITION (pC_start, pC_end, anz, tid, nthreads) ;
            if (Ab == NULL)
            {
                // A and C are both full
                for (int64_t pC = pC_start ; pC < pC_end ; pC++)
                {
                    // get i and j of the entry C(i,j)
                    // i = (pC % avdim) ;
                    // j = (pC / avdim) ;
                    // find the position of the entry A(j,i)
                    // pA = j + i * avlen
                    // Cx [pC] = op (Ax [pA])
                    GB_CAST_OP (pC, ((pC / avdim) + (pC % avdim) * avlen)) ;
                }
            }
            else
            {
                // A and C are both bitmap
                for (int64_t pC = pC_start ; pC < pC_end ; pC++)
                {
                    // get i and j of the entry C(i,j)
                    // i = (pC % avdim) ;
                    // j = (pC / avdim) ;
                    // find the position of the entry A(j,i)
                    // pA = j + i * avlen
                    int64_t pA = ((pC / avdim) + (pC % avdim) * avlen) ;
                    int8_t cij_exists = Ab [pA] ;
                    Cb [pC] = cij_exists ;
                    if (cij_exists)
                    {
                        // Cx [pC] = op (Ax [pA])
                        GB_CAST_OP (pC, pA) ;
                    }
                }
            }
        }

    }
    else
    {

        //----------------------------------------------------------------------
        // A is sparse or hypersparse; C is sparse
        //----------------------------------------------------------------------

        const int64_t *restrict Ap = A->p ;
        const int64_t *restrict Ah = A->h ;
        const int64_t *restrict Ai = A->i ;
        const int64_t anvec = A->nvec ;
        int64_t *restrict Ci = C->i ;

        if (nthreads == 1)
        {

            //------------------------------------------------------------------
            // sequential method
            //------------------------------------------------------------------

            int64_t *restrict workspace = Workspaces [0] ;
            for (int64_t k = 0 ; k < anvec ; k++)
            {
                // iterate over the entries in A(:,j)
                int64_t j = GBH (Ah, k) ;
                int64_t pA_start = Ap [k] ;
                int64_t pA_end = Ap [k+1] ;
                for (int64_t pA = pA_start ; pA < pA_end ; pA++)
                {
                    // C(j,i) = A(i,j)
                    int64_t i = Ai [pA] ;
                    int64_t pC = workspace [i]++ ;
                    Ci [pC] = j ;
                    // Cx [pC] = op (Ax [pA])
                    GB_CAST_OP (pC, pA) ;
                }
            }

        }
        else if (nworkspaces == 1)
        {

            //------------------------------------------------------------------
            // atomic method
            //------------------------------------------------------------------

            int64_t *restrict workspace = Workspaces [0] ;
            int tid ;
            #pragma omp parallel for num_threads(nthreads) schedule(static)
            for (tid = 0 ; tid < nthreads ; tid++)
            {
                for (int64_t k = A_slice [tid] ; k < A_slice [tid+1] ; k++)
                {
                    // iterate over the entries in A(:,j)
                    int64_t j = GBH (Ah, k) ;
                    int64_t pA_start = Ap [k] ;
                    int64_t pA_end = Ap [k+1] ;
                    for (int64_t pA = pA_start ; pA < pA_end ; pA++)
                    {
                        // C(j,i) = A(i,j)
                        int64_t i = Ai [pA] ;
                        // do this atomically: pC = workspace [i]++
                        int64_t pC ;
                        GB_ATOMIC_CAPTURE_INC64 (pC, workspace [i]) ;
                        Ci [pC] = j ;
                        // Cx [pC] = op (Ax [pA])
                        GB_CAST_OP (pC, pA) ;
                    }
                }
            }

        }
        else
        {

            //------------------------------------------------------------------
            // non-atomic method
            //------------------------------------------------------------------
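
            // Each thread uses its own private workspace, Workspaces [tid],
            // so the increment of workspace [i] needs no atomics.  The caller
            // is assumed to have initialized each workspace so that different
            // threads place their entries in disjoint positions of C.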

            int tid ;
            #pragma omp parallel for num_threads(nthreads) schedule(static)
            for (tid = 0 ; tid < nthreads ; tid++)
            {
                int64_t *restrict workspace = Workspaces [tid] ;
                for (int64_t k = A_slice [tid] ; k < A_slice [tid+1] ; k++)
                {
                    // iterate over the entries in A(:,j)
                    int64_t j = GBH (Ah, k) ;
                    int64_t pA_start = Ap [k] ;
                    int64_t pA_end = Ap [k+1] ;
                    for (int64_t pA = pA_start ; pA < pA_end ; pA++)
                    {
                        // C(j,i) = A(i,j)
                        int64_t i = Ai [pA] ;
                        int64_t pC = workspace [i]++ ;
                        Ci [pC] = j ;
                        // Cx [pC] = op (Ax [pA])
                        GB_CAST_OP (pC, pA) ;
                    }
                }
            }
        }
    }
}
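
//------------------------------------------------------------------------------
// note: sketch of the workspace-based transpose above (illustration only)
//------------------------------------------------------------------------------

// In the sparse/hypersparse case, workspace [i] holds the position in C where
// the next entry of vector i of C (row index i of A) is placed.  The caller is
// assumed to initialize it with a cumulative sum of the per-row entry counts
// of A.  A minimal single-threaded sketch of the same idea, assuming plain CSC
// arrays Ap, Ai, Ax for an m-by-n matrix A, with no typecast and no operator
// (not part of this template):
//
//      // count the entries in each row of A
//      for (int64_t i = 0 ; i < m ; i++) workspace [i] = 0 ;
//      for (int64_t p = 0 ; p < Ap [n] ; p++) workspace [Ai [p]]++ ;
//      // cumulative sum: workspace [i] = start of vector i of C = A'
//      int64_t s = 0 ;
//      for (int64_t i = 0 ; i < m ; i++)
//      {
//          int64_t c = workspace [i] ;
//          workspace [i] = s ;
//          s += c ;
//      }
//      // (Cp would be a copy of this cumulative sum, with Cp [m] = Ap [n])
//      // scatter: C(j,i) = A(i,j), advancing workspace [i] as each row fills
//      for (int64_t j = 0 ; j < n ; j++)
//      {
//          for (int64_t p = Ap [j] ; p < Ap [j+1] ; p++)
//          {
//              int64_t pC = workspace [Ai [p]]++ ;
//              Ci [pC] = j ;
//              Cx [pC] = Ax [p] ;
//          }
//      }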