1 #ifndef STAN_MATH_GPU_KERNELS_TRIDIAGONALIZATION_HPP
2 #define STAN_MATH_GPU_KERNELS_TRIDIAGONALIZATION_HPP
3 
4 #ifdef STAN_OPENCL
5 
6 #include <stan/math/opencl/kernel_cl.hpp>
7 
8 namespace stan {
9 namespace math {
10 namespace opencl_kernels {
11 
12 // \cond
13 static const char* tridiagonalization_householder_kernel_code = STRINGIFY(
14     // \endcond
15     /**
16      * Calculates householder vector and first element of the vector v.
17      * Must be run with 1 workgroup of LOCAL_SIZE_ threads.
18      * @param[in,out] P packed matrix being constructed
19      * @param[in,out] V matrix V
20      * @param[out] q_glob q
21      * @param P_rows Number of rows of the packed matrix
22      * @param V_rows Number of rows of the matrix V
23      * @param j Start column of the block to work on
24      * @param k Index of the householder vector in the block to create
25      */
26     __kernel void tridiagonalization_householder(
27         __global double* P, __global double* V, __global double* q_glob,
28         const int P_rows, const int V_rows, const int j, const int k) {
29       const int lid = get_local_id(0);
30       const int gid = get_global_id(0);
31       const int gsize = get_global_size(0);
32       const int lsize = get_local_size(0);
33       const int ngroups = get_num_groups(0);
34       const int wgid = get_group_id(0);
35 
36       double q = 0;
37 
38       const int P_start = P_rows * (k + j) + k + j;
39       const int P_span = P_rows * (k + j + 1) - P_start;
40       for (int i = lid; i < P_span; i += lsize) {
41         double acc = 0;
42         // apply previous householder reflections from current block to the
43         // column we are making the Householder vector from
44         for (int l = 0; l < j; l++) {
45           acc += P[P_rows * (k + l) + k + j + i] * V[V_rows * l + j - 1]
46                  + V[V_rows * l + j - 1 + i] * P[P_rows * (k + l) + k + j];
47         }
48         double tmp = P[P_start + i] - acc;
49         P[P_start + i] = tmp;
50         if (i != 0) {
51           q += tmp * tmp;
52         }
53       }
54       // calculate column norm between threads
55       __local double q_local[LOCAL_SIZE_];
56       q_local[lid] = q;
57       barrier(CLK_LOCAL_MEM_FENCE);
58       for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
59            step /= REDUCTION_STEP_SIZE) {
60         if (lid < step) {
61           for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
62             q_local[lid] += q_local[lid + step * i];
63           }
64         }
65         barrier(CLK_LOCAL_MEM_FENCE);
66       }
67 
68       double alpha;
69       if (lid == 0) {
70         q = q_local[0];
71         double p1 = P[P_start + 1];
72         // make Householder vector
73         alpha = -copysign(sqrt(q), P[P_start]);
74         q -= p1 * p1;
75         p1 -= alpha;
76         P[P_start + 1] = p1;
77         q += p1 * p1;
78         q = sqrt(q);
79         q_local[0] = q;
80         q_local[1] = alpha;
81         *q_glob = q;
82       }
83       barrier(CLK_LOCAL_MEM_FENCE);
84       q = q_local[0];
85       alpha = q_local[1];
86       if (q != 0) {
87         double multi = sqrt(2.) / q;
88         // normalize the Householder vector
89         for (int i = lid + 1; i < P_span; i += lsize) {
90           P[P_start + i] *= multi;
91         }
92       }
93       if (gid == 0) {
94         P[P_rows * (k + j + 1) + k + j]
95             = P[P_rows * (k + j) + k + j + 1] * q / sqrt(2.) + alpha;
96       }
97     }
98     // \cond
99 );
100 // \endcond
101 
102 // \cond
103 static const char* tridiagonalization_v_step_1_kernel_code = STRINGIFY(
104     // \endcond
105     /**
106      * Calculates first part of constructing the vector v: Uu = Pb * u and Vu =
107      * Vl * u. Pb is a block of packed matrix, Vl is left part of matrix V and u
108      * is householder vector. Must be run with number of work groups equal to
109      * size of resulting vectors and 64 threads per work group.
110      * @param P Packed matrix being constructed.
111      * @param V Matrix V.
112      * @param[out] Uu First resulting vector.
113      * @param[out] Vu Second resulting vector.
114      * @param P_rows Number of rows of the packed matrix
115      * @param V_rows Number of rows of the matrix V
116      * @param k Index of the householder vector in the block we use as input
117      */
118     __kernel void tridiagonalization_v_step_1(
119         const __global double* P, const __global double* V, __global double* Uu,
120         __global double* Vu, const int P_rows, const int V_rows, const int k) {
121       const int lid = get_local_id(0);
122       const int gid = get_global_id(0);
123       const int gsize = get_global_size(0);
124       const int lsize = get_local_size(0);
125       const int ngroups = get_num_groups(0);
126       const int wgid = get_group_id(0);
127 
128       __local double res_loc1[LOCAL_SIZE_];
129       __local double res_loc2[LOCAL_SIZE_];
130       double acc1 = 0;
131       double acc2 = 0;
132 
133       const __global double* vec = P + P_rows * (k + ngroups) + k + ngroups + 1;
134       const __global double* M1 = P + P_rows * (k + wgid) + k + ngroups + 1;
135       const __global double* M2 = V + V_rows * wgid + ngroups;
136       for (int i = lid; i < P_rows - k - ngroups - 1;
137            i += LOCAL_SIZE_) {  // go over column of the matrix in steps of 64
138         double v = vec[i];
139         acc1 += M1[i] * v;
140         acc2 += M2[i] * v;
141       }
142       res_loc1[lid] = acc1;
143       res_loc2[lid] = acc2;
144       barrier(CLK_LOCAL_MEM_FENCE);
145 
146       for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
147            step /= REDUCTION_STEP_SIZE) {
148         if (lid < step) {
149           for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
150             res_loc1[lid] += res_loc1[lid + step * i];
151             res_loc2[lid] += res_loc2[lid + step * i];
152           }
153         }
154         barrier(CLK_LOCAL_MEM_FENCE);
155       }
156       if (lid == 0) {
157         Uu[wgid] = res_loc1[0];
158         Vu[wgid] = res_loc2[0];
159       }
160     }
161     // \cond
162 );
163 // \endcond
164 
165 // \cond
166 static const char* tridiagonalization_v_step_2_kernel_code = STRINGIFY(
167     // \endcond
168     /**
169      * Second part in constructing vector v: v = Pb * u + V * Uu + U * Vu. Pb is
170      * a block of packed matrix and U is householder vector. Pb is symmetric
171      * with only lower triangle having values. That is why two columns of V are
172      * written that must be added to obtain the vector v. Must be run with 64
173      * threads per work group and total number of threads equal or greater than
174      * size of result vector.
175      * @param P Packed matrix being constructed.
176      * @param V Matrix V.
177      * @param Uu Uu from previous kernel.
178      * @param Vu Vu from previous kernel.
179      * @param P_rows Number of rows of the packed matrix
180      * @param V_rows Number of rows of the matrix V
181      * @param k Index of the householder vector in the block we use as input
182      * @param j Start column of the block to work on
183      */
184     __kernel void tridiagonalization_v_step_2(
185         const __global double* P, __global double* V, const __global double* Uu,
186         const __global double* Vu, const int P_rows, const int V_rows,
187         const int k, const int j) {
188       const int lid = get_local_id(0);
189       const int gid = get_global_id(0);
190       const int gsize = get_global_size(0);
191       const int lsize = get_local_size(0);
192       const int ngroups = get_num_groups(0);
193       const int wgid = get_group_id(0);
194 
195       int work = P_rows - k - j - 1;
196       double acc = 0;
197 
198       const __global double* vec = P + P_rows * (k + j) + k + j + 1;
199       const __global double* M1 = P + P_rows * (k + j + 1) + k + j + 1;
200       const __global double* M2 = P + P_rows * k + k + j + 1;
201       const __global double* M3 = V + j;
202       int i;
203       if (gid < work) {
204         for (i = 0; i <= gid; i++) {
205           acc += M1[P_rows * i + gid] * vec[i];
206         }
207         for (int i = 0; i < j; i++) {
208           acc -= M2[P_rows * i + gid] * Vu[i];
209           acc -= M3[V_rows * i + gid] * Uu[i];
210         }
211         V[V_rows * j + gid + j] = acc;
212       }
213       float work_per_group
214           = (float)work / ngroups;  // NOLINT(readability/casting)
215       int start = work_per_group * wgid;
216       int end = work_per_group * (wgid + 1);
217       __local double res_loc[LOCAL_SIZE_];
218       for (int i = start; i < end; i += 1) {
219         acc = 0;
220         for (int l = i + 1 + lid; l < work; l += LOCAL_SIZE_) {
221           acc += M1[P_rows * i + l] * vec[l];
222         }
223         res_loc[lid] = acc;
224         barrier(CLK_LOCAL_MEM_FENCE);
225         for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
226              step /= REDUCTION_STEP_SIZE) {
227           if (lid < step) {
228             for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
229               res_loc[lid] += res_loc[lid + step * i];
230             }
231           }
232           barrier(CLK_LOCAL_MEM_FENCE);
233         }
234         if (lid == 0) {
235           V[V_rows * (j + 1) + i + j] = res_loc[lid];
236         }
237         barrier(CLK_LOCAL_MEM_FENCE);
238       }
239     }
240     // \cond
241 );
242 // \endcond
243 
244 // \cond
245 static const char* tridiagonalization_v_step_3_kernel_code = STRINGIFY(
246     // \endcond
247     /**
248      * Third part in constructing vector v: v-=0.5*(v^T*u)*u, where u is the
249      * householder vector.
250      * @param[in,out] P packed matrix being constructed
251      * @param[in,out] V matrix V
252      * @param[out] q q
253      * @param P_rows Number of rows of the packed matrix
254      * @param V_rows Number of rows of the matrix V
255      * @param k Index of the householder vector in the block to create
256      * @param j Start column of the block to work on
257      */
258     __kernel void tridiagonalization_v_step_3(
259         __global double* P, __global double* V, __global double* q,
260         const int P_rows, const int V_rows, const int k, const int j) {
261       const int lid = get_local_id(0);
262       const int gid = get_global_id(0);
263       const int gsize = get_global_size(0);
264       const int lsize = get_local_size(0);
265       const int ngroups = get_num_groups(0);
266       const int wgid = get_group_id(0);
267 
268       __global double* u = P + P_rows * (k + j) + k + j + 1;
269       __global double* v = V + V_rows * j + j;
270       double acc = 0;
271 
272       for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) {
273         double vi = v[i] + v[i + V_rows];
274         v[i] = vi;
275         acc += u[i] * vi;
276       }
277       __local double res_loc[LOCAL_SIZE_];
278       res_loc[lid] = acc;
279       barrier(CLK_LOCAL_MEM_FENCE);
280       for (int step = lsize / REDUCTION_STEP_SIZE; step > 0;
281            step /= REDUCTION_STEP_SIZE) {
282         if (lid < step) {
283           for (int i = 1; i < REDUCTION_STEP_SIZE; i++) {
284             res_loc[lid] += res_loc[lid + step * i];
285           }
286         }
287         barrier(CLK_LOCAL_MEM_FENCE);
288       }
289       acc = res_loc[0] * 0.5;
290       for (int i = lid; i < P_rows - k - j - 1; i += LOCAL_SIZE_) {
291         v[i] -= acc * u[i];
292       }
293       if (gid == 0) {
294         P[P_rows * (k + j + 1) + k + j] -= *q / sqrt(2.) * u[0];
295       }
296     }
297     // \cond
298 );
299 // \endcond
300 
301 const kernel_cl<in_out_buffer, in_out_buffer, out_buffer, int, int, int, int>
302     tridiagonalization_householder("tridiagonalization_householder",
303                                    {tridiagonalization_householder_kernel_code},
304                                    {{"REDUCTION_STEP_SIZE", 4},
305                                     {"LOCAL_SIZE_", 1024}});
306 
307 const kernel_cl<in_buffer, in_buffer, out_buffer, out_buffer, int, int, int>
308     tridiagonalization_v_step_1("tridiagonalization_v_step_1",
309                                 {tridiagonalization_v_step_1_kernel_code},
310                                 {{"REDUCTION_STEP_SIZE", 4},
311                                  {"LOCAL_SIZE_", 64}});
312 
313 const kernel_cl<in_buffer, out_buffer, in_buffer, in_buffer, int, int, int, int>
314     tridiagonalization_v_step_2("tridiagonalization_v_step_2",
315                                 {tridiagonalization_v_step_2_kernel_code},
316                                 {{"REDUCTION_STEP_SIZE", 4},
317                                  {"LOCAL_SIZE_", 64}});
318 
319 const kernel_cl<in_out_buffer, in_out_buffer, out_buffer, int, int, int, int>
320     tridiagonalization_v_step_3("tridiagonalization_v_step_3",
321                                 {tridiagonalization_v_step_3_kernel_code},
322                                 {{"REDUCTION_STEP_SIZE", 4},
323                                  {"LOCAL_SIZE_", 1024}});
324 
325 }  // namespace opencl_kernels
326 }  // namespace math
327 }  // namespace stan
328 #endif
329 #endif
330