1 #ifndef STAN_MATH_GPU_KERNELS_TRIDIAGONALIZATION_HPP 2 #define STAN_MATH_GPU_KERNELS_TRIDIAGONALIZATION_HPP 3 4 #ifdef STAN_OPENCL 5 6 #include <stan/math/opencl/kernel_cl.hpp> 7 8 namespace stan { 9 namespace math { 10 namespace opencl_kernels { 11 12 // \cond 13 static const char* tridiagonalization_householder_kernel_code = STRINGIFY( 14 // \endcond 15 /** 16 * Calculates householder vector and first element of the vector v. 17 * Must be run with 1 workgroup of LOCAL_SIZE_ threads. 18 * @param[in,out] P packed matrix being constructed 19 * @param[in,out] V matrix V 20 * @param[out] q_glob q 21 * @param P_rows Number of rows of the packed matrix 22 * @param V_rows Number of rows of the matrix V 23 * @param j Start column of the block to work on 24 * @param k Index of the householder vector in the block to create 25 */ 26 __kernel void tridiagonalization_householder( 27 __global double* P, __global double* V, __global double* q_glob, 28 const int P_rows, const int V_rows, const int j, const int k) { 29 const int lid = get_local_id(0); 30 const int gid = get_global_id(0); 31 const int gsize = get_global_size(0); 32 const int lsize = get_local_size(0); 33 const int ngroups = get_num_groups(0); 34 const int wgid = get_group_id(0); 35 36 double q = 0; 37 38 const int P_start = P_rows * (k + j) + k + j; 39 const int P_span = P_rows * (k + j + 1) - P_start; 40 for (int i = lid; i < P_span; i += lsize) { 41 double acc = 0; 42 // apply previous householder reflections from current block to the 43 // column we are making the Householder vector from 44 for (int l = 0; l < j; l++) { 45 acc += P[P_rows * (k + l) + k + j + i] * V[V_rows * l + j - 1] 46 + V[V_rows * l + j - 1 + i] * P[P_rows * (k + l) + k + j]; 47 } 48 double tmp = P[P_start + i] - acc; 49 P[P_start + i] = tmp; 50 if (i != 0) { 51 q += tmp * tmp; 52 } 53 } 54 // calculate column norm between threads 55 __local double q_local[LOCAL_SIZE_]; 56 q_local[lid] = q; 57 barrier(CLK_LOCAL_MEM_FENCE); 58 for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; 59 step /= REDUCTION_STEP_SIZE) { 60 if (lid < step) { 61 for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { 62 q_local[lid] += q_local[lid + step * i]; 63 } 64 } 65 barrier(CLK_LOCAL_MEM_FENCE); 66 } 67 68 double alpha; 69 if (lid == 0) { 70 q = q_local[0]; 71 double p1 = P[P_start + 1]; 72 // make Householder vector 73 alpha = -copysign(sqrt(q), P[P_start]); 74 q -= p1 * p1; 75 p1 -= alpha; 76 P[P_start + 1] = p1; 77 q += p1 * p1; 78 q = sqrt(q); 79 q_local[0] = q; 80 q_local[1] = alpha; 81 *q_glob = q; 82 } 83 barrier(CLK_LOCAL_MEM_FENCE); 84 q = q_local[0]; 85 alpha = q_local[1]; 86 if (q != 0) { 87 double multi = sqrt(2.) / q; 88 // normalize the Householder vector 89 for (int i = lid + 1; i < P_span; i += lsize) { 90 P[P_start + i] *= multi; 91 } 92 } 93 if (gid == 0) { 94 P[P_rows * (k + j + 1) + k + j] 95 = P[P_rows * (k + j) + k + j + 1] * q / sqrt(2.) + alpha; 96 } 97 } 98 // \cond 99 ); 100 // \endcond 101 102 // \cond 103 static const char* tridiagonalization_v_step_1_kernel_code = STRINGIFY( 104 // \endcond 105 /** 106 * Calculates first part of constructing the vector v: Uu = Pb * u and Vu = 107 * Vl * u. Pb is a block of packed matrix, Vl is left part of matrix V and u 108 * is householder vector. Must be run with number of work groups equal to 109 * size of resulting vectors and 64 threads per work group. 110 * @param P Packed matrix being constructed. 111 * @param V Matrix V. 112 * @param[out] Uu First resulting vector. 113 * @param[out] Vu Second resulting vector. 114 * @param P_rows Number of rows of the packed matrix 115 * @param V_rows Number of rows of the matrix V 116 * @param k Index of the householder vector in the block we use as input 117 */ 118 __kernel void tridiagonalization_v_step_1( 119 const __global double* P, const __global double* V, __global double* Uu, 120 __global double* Vu, const int P_rows, const int V_rows, const int k) { 121 const int lid = get_local_id(0); 122 const int gid = get_global_id(0); 123 const int gsize = get_global_size(0); 124 const int lsize = get_local_size(0); 125 const int ngroups = get_num_groups(0); 126 const int wgid = get_group_id(0); 127 128 __local double res_loc1[LOCAL_SIZE_]; 129 __local double res_loc2[LOCAL_SIZE_]; 130 double acc1 = 0; 131 double acc2 = 0; 132 133 const __global double* vec = P + P_rows * (k + ngroups) + k + ngroups + 1; 134 const __global double* M1 = P + P_rows * (k + wgid) + k + ngroups + 1; 135 const __global double* M2 = V + V_rows * wgid + ngroups; 136 for (int i = lid; i < P_rows - k - ngroups - 1; 137 i += LOCAL_SIZE_) { // go over column of the matrix in steps of 64 138 double v = vec[i]; 139 acc1 += M1[i] * v; 140 acc2 += M2[i] * v; 141 } 142 res_loc1[lid] = acc1; 143 res_loc2[lid] = acc2; 144 barrier(CLK_LOCAL_MEM_FENCE); 145 146 for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; 147 step /= REDUCTION_STEP_SIZE) { 148 if (lid < step) { 149 for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { 150 res_loc1[lid] += res_loc1[lid + step * i]; 151 res_loc2[lid] += res_loc2[lid + step * i]; 152 } 153 } 154 barrier(CLK_LOCAL_MEM_FENCE); 155 } 156 if (lid == 0) { 157 Uu[wgid] = res_loc1[0]; 158 Vu[wgid] = res_loc2[0]; 159 } 160 } 161 // \cond 162 ); 163 // \endcond 164 165 // \cond 166 static const char* tridiagonalization_v_step_2_kernel_code = STRINGIFY( 167 // \endcond 168 /** 169 * Second part in constructing vector v: v = Pb * u + V * Uu + U * Vu. Pb is 170 * a block of packed matrix and U is householder vector. Pb is symmetric 171 * with only lower triangle having values. That is why two columns of V are 172 * written that must be added to obtain the vector v. Must be run with 64 173 * threads per work group and total number of threads equal or greater than 174 * size of result vector. 175 * @param P Packed matrix being constructed. 176 * @param V Matrix V. 177 * @param Uu Uu from previous kernel. 178 * @param Vu Vu from previous kernel. 179 * @param P_rows Number of rows of the packed matrix 180 * @param V_rows Number of rows of the matrix V 181 * @param k Index of the householder vector in the block we use as input 182 * @param j Start column of the block to work on 183 */ 184 __kernel void tridiagonalization_v_step_2( 185 const __global double* P, __global double* V, const __global double* Uu, 186 const __global double* Vu, const int P_rows, const int V_rows, 187 const int k, const int j) { 188 const int lid = get_local_id(0); 189 const int gid = get_global_id(0); 190 const int gsize = get_global_size(0); 191 const int lsize = get_local_size(0); 192 const int ngroups = get_num_groups(0); 193 const int wgid = get_group_id(0); 194 195 int work = P_rows - k - j - 1; 196 double acc = 0; 197 198 const __global double* vec = P + P_rows * (k + j) + k + j + 1; 199 const __global double* M1 = P + P_rows * (k + j + 1) + k + j + 1; 200 const __global double* M2 = P + P_rows * k + k + j + 1; 201 const __global double* M3 = V + j; 202 int i; 203 if (gid < work) { 204 for (i = 0; i <= gid; i++) { 205 acc += M1[P_rows * i + gid] * vec[i]; 206 } 207 for (int i = 0; i < j; i++) { 208 acc -= M2[P_rows * i + gid] * Vu[i]; 209 acc -= M3[V_rows * i + gid] * Uu[i]; 210 } 211 V[V_rows * j + gid + j] = acc; 212 } 213 float work_per_group 214 = (float)work / ngroups; // NOLINT(readability/casting) 215 int start = work_per_group * wgid; 216 int end = work_per_group * (wgid + 1); 217 __local double res_loc[LOCAL_SIZE_]; 218 for (int i = start; i < end; i += 1) { 219 acc = 0; 220 for (int l = i + 1 + lid; l < work; l += LOCAL_SIZE_) { 221 acc += M1[P_rows * i + l] * vec[l]; 222 } 223 res_loc[lid] = acc; 224 barrier(CLK_LOCAL_MEM_FENCE); 225 for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; 226 step /= REDUCTION_STEP_SIZE) { 227 if (lid < step) { 228 for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { 229 res_loc[lid] += res_loc[lid + step * i]; 230 } 231 } 232 barrier(CLK_LOCAL_MEM_FENCE); 233 } 234 if (lid == 0) { 235 V[V_rows * (j + 1) + i + j] = res_loc[lid]; 236 } 237 barrier(CLK_LOCAL_MEM_FENCE); 238 } 239 } 240 // \cond 241 ); 242 // \endcond 243 244 // \cond 245 static const char* tridiagonalization_v_step_3_kernel_code = STRINGIFY( 246 // \endcond 247 /** 248 * Third part in constructing vector v: v-=0.5*(v^T*u)*u, where u is the 249 * householder vector. 250 * @param[in,out] P packed matrix being constructed 251 * @param[in,out] V matrix V 252 * @param[out] q q 253 * @param P_rows Number of rows of the packed matrix 254 * @param V_rows Number of rows of the matrix V 255 * @param k Index of the householder vector in the block to create 256 * @param j Start column of the block to work on 257 */ 258 __kernel void tridiagonalization_v_step_3( 259 __global double* P, __global double* V, __global double* q, 260 const int P_rows, const int V_rows, const int k, const int j) { 261 const int lid = get_local_id(0); 262 const int gid = get_global_id(0); 263 const int gsize = get_global_size(0); 264 const int lsize = get_local_size(0); 265 const int ngroups = get_num_groups(0); 266 const int wgid = get_group_id(0); 267 268 __global double* u = P + P_rows * (k + j) + k + j + 1; 269 __global double* v = V + V_rows * j + j; 270 double acc = 0; 271 272 for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) { 273 double vi = v[i] + v[i + V_rows]; 274 v[i] = vi; 275 acc += u[i] * vi; 276 } 277 __local double res_loc[LOCAL_SIZE_]; 278 res_loc[lid] = acc; 279 barrier(CLK_LOCAL_MEM_FENCE); 280 for (int step = lsize / REDUCTION_STEP_SIZE; step > 0; 281 step /= REDUCTION_STEP_SIZE) { 282 if (lid < step) { 283 for (int i = 1; i < REDUCTION_STEP_SIZE; i++) { 284 res_loc[lid] += res_loc[lid + step * i]; 285 } 286 } 287 barrier(CLK_LOCAL_MEM_FENCE); 288 } 289 acc = res_loc[0] * 0.5; 290 for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) { 291 v[i] -= acc * u[i]; 292 } 293 if (gid == 0) { 294 P[P_rows * (k + j + 1) + k + j] -= *q / sqrt(2.) * u[0]; 295 } 296 } 297 // \cond 298 ); 299 // \endcond 300 301 const kernel_cl<in_out_buffer, in_out_buffer, out_buffer, int, int, int, int> 302 tridiagonalization_householder("tridiagonalization_householder", 303 {tridiagonalization_householder_kernel_code}, 304 {{"REDUCTION_STEP_SIZE", 4}, 305 {"LOCAL_SIZE_", 1024}}); 306 307 const kernel_cl<in_buffer, in_buffer, out_buffer, out_buffer, int, int, int> 308 tridiagonalization_v_step_1("tridiagonalization_v_step_1", 309 {tridiagonalization_v_step_1_kernel_code}, 310 {{"REDUCTION_STEP_SIZE", 4}, 311 {"LOCAL_SIZE_", 64}}); 312 313 const kernel_cl<in_buffer, out_buffer, in_buffer, in_buffer, int, int, int, int> 314 tridiagonalization_v_step_2("tridiagonalization_v_step_2", 315 {tridiagonalization_v_step_2_kernel_code}, 316 {{"REDUCTION_STEP_SIZE", 4}, 317 {"LOCAL_SIZE_", 64}}); 318 319 const kernel_cl<in_out_buffer, in_out_buffer, out_buffer, int, int, int, int> 320 tridiagonalization_v_step_3("tridiagonalization_v_step_3", 321 {tridiagonalization_v_step_3_kernel_code}, 322 {{"REDUCTION_STEP_SIZE", 4}, 323 {"LOCAL_SIZE_", 1024}}); 324 325 } // namespace opencl_kernels 326 } // namespace math 327 } // namespace stan 328 #endif 329 #endif 330