1 /*
2  * Copyright (c) 2011-2019, The University of Oxford
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  * 1. Redistributions of source code must retain the above copyright notice,
8  *    this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright notice,
10  *    this list of conditions and the following disclaimer in the documentation
11  *    and/or other materials provided with the distribution.
12  * 3. Neither the name of the University of Oxford nor the names of its
13  *    contributors may be used to endorse or promote products derived from this
14  *    software without specific prior written permission.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
20  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26  * POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 #include "correlate/define_correlate_utils.h"
30 #include "correlate/define_cross_correlate.h"
31 #include "correlate/oskar_cross_correlate_scalar_cuda.h"
32 #include "math/define_multiply.h"
33 #include "utility/oskar_kernel_macros.h"
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <cuda_runtime.h>
39 
40 enum {
41     VER_UNKNOWN    = -1,  // Not checked.
42     VER_NONE       =  0,  // Not specified.
43     // Actual versions:
44     VER_OLD        =  1,
45     VER_NON_SM     =  2,
46     VER_SM         =  3
47 };
48 static int ver_specified_ = VER_UNKNOWN;
49 static int ver_cc_ = VER_UNKNOWN;
50 static int correlate_version(bool prec_double,
51         double frac_bandwidth, double time_int_sec);
52 
53 // Original kernel.
54 template
55 <
56 // Compile-time parameters.
57 bool BANDWIDTH_SMEARING, bool TIME_SMEARING, bool GAUSSIAN,
58 typename FP, typename FP2
59 >
OSKAR_XCORR_SCALAR_GPU(oskar_xcorr_scalar_cudak,BANDWIDTH_SMEARING,TIME_SMEARING,GAUSSIAN,FP,FP2)60 OSKAR_XCORR_SCALAR_GPU(oskar_xcorr_scalar_cudak, BANDWIDTH_SMEARING, TIME_SMEARING, GAUSSIAN, FP, FP2)
61 
62 // Indices into the visibility/baseline matrix.
63 #define SP blockIdx.x /* Column index. */
64 #define SQ blockIdx.y /* Row index. */
65 
66 #define OKN_NSOURCES 32
67 #define OKN_BPK 4 /* baselines per kernel */
68 #define WARP 32
69 
70 template
71 <
72 // Compile-time parameters.
73 bool BANDWIDTH_SMEARING, bool TIME_SMEARING, bool GAUSSIAN,
74 typename FP, typename FP2
75 >
76 __global__
77 void oskar_xcorr_scalar_NON_SM_cudak(
78         OSKAR_XCORR_ARGS(FP)
79         const FP2* const __restrict__ jones,
80         FP2*             __restrict__ vis)
81 {
82     (void) src_Q;
83     (void) src_U;
84     (void) src_V;
85     __shared__ FP uv_len[OKN_BPK], uu[OKN_BPK], vv[OKN_BPK], ww[OKN_BPK];
86     __shared__ FP uu2[OKN_BPK], vv2[OKN_BPK], uuvv[OKN_BPK];
87     __shared__ FP du[OKN_BPK], dv[OKN_BPK], dw[OKN_BPK];
88     __shared__ const FP2 *st_q[OKN_BPK];
89     FP2 t1, t2, sum;
90 
91     const int w = (threadIdx.x >> 5); // Warp ID.
92     const int i = (threadIdx.x & 31); // ID within warp (local ID).
93 
94     // Return immediately if in the wrong half of the visibility matrix.
95     if (OKN_BPK * SQ >= SP) return;
96 
97     // Get baseline values per warp.
98     if (i == 0)
99     {
100         const int i_sq = OKN_BPK * SQ + w;
101 
102         // Set pointer to source vector for station q to safe position
103         // so non-existence SQ >= SP does not cause problems.
104         st_q[w] = &jones[0];
105 
106         if (i_sq < num_stations)
107         {
108             OSKAR_BASELINE_TERMS(FP, st_u[SP], st_u[i_sq], st_v[SP], st_v[i_sq],
109                     st_w[SP], st_w[i_sq], uu[w], vv[w], ww[w],
110                     uu2[w], vv2[w], uuvv[w], uv_len[w]);
111 
112             if (TIME_SMEARING)
113                 OSKAR_BASELINE_DELTAS(FP, st_x[SP], st_x[i_sq],
114                         st_y[SP], st_y[i_sq], du[w], dv[w], dw[w]);
115 
116             // Get valid pointer to source vector for station q.
117             st_q[w] = &jones[num_src * i_sq];
118         }
119     }
120     __syncthreads();
121 
122     // Get pointer to source vector for station p.
123     const FP2* const __restrict__ st_p = &jones[num_src * SP];
124 
125     // Each thread from given warp loops over a subset of the sources,
126     // and each warp works with a different station q.
127     sum.x = sum.y = (FP)0;
128     const int itemp = (num_src >> 5) * WARP;
129     for (int s = i; s < itemp; s += WARP)
130     {
131         FP smearing;
132         if (GAUSSIAN)
133         {
134             const FP t = src_a[s] * uu2[w] +
135                     src_b[s] * uuvv[w] + src_c[s] * vv2[w];
136             smearing = exp((FP) -t);
137         }
138         else smearing = (FP) 1;
139         if (BANDWIDTH_SMEARING || TIME_SMEARING)
140         {
141             const FP l = src_l[s], m = src_m[s], n = src_n[s] - (FP) 1;
142             if (BANDWIDTH_SMEARING)
143             {
144                 const FP t = uu[w] * l + vv[w] * m + ww[w] * n;
145                 smearing *= OSKAR_SINC(FP, t);
146             }
147             if (TIME_SMEARING)
148             {
149                 const FP t = du[w] * l + dv[w] * m + dw[w] * n;
150                 smearing *= OSKAR_SINC(FP, t);
151             }
152         }
153 
154         // Multiply Jones scalars.
155         smearing *= src_I[s];
156         t1 = st_p[s];
157         t2 = (st_q[w])[s];
158         OSKAR_MUL_COMPLEX_CONJUGATE_IN_PLACE(FP2, t1, t2)
159 
160         // Multiply result by smearing term and accumulate.
161         sum.x += t1.x * smearing; sum.y += t1.y * smearing;
162         __syncthreads();
163     }
164     if ((num_src & 31) > 0)
165     {
166         int s = (num_src >> 5) * WARP + i;
167         if (s < num_src)
168         {
169             FP smearing;
170             if (GAUSSIAN)
171             {
172                 const FP t = src_a[s] * uu2[w] +
173                         src_b[s] * uuvv[w] + src_c[s] * vv2[w];
174                 smearing = exp((FP) -t);
175             }
176             else smearing = (FP) 1;
177             if (BANDWIDTH_SMEARING || TIME_SMEARING)
178             {
179                 const FP l = src_l[s], m = src_m[s], n = src_n[s] - (FP) 1;
180                 if (BANDWIDTH_SMEARING)
181                 {
182                     const FP t = uu[w] * l + vv[w] * m + ww[w] * n;
183                     smearing *= OSKAR_SINC(FP, t);
184                 }
185                 if (TIME_SMEARING)
186                 {
187                     const FP t = du[w] * l + dv[w] * m + dw[w] * n;
188                     smearing *= OSKAR_SINC(FP, t);
189                 }
190             }
191 
192             // Multiply Jones scalars.
193             smearing *= src_I[s];
194             t1 = st_p[s];
195             t2 = (st_q[w])[s];
196             OSKAR_MUL_COMPLEX_CONJUGATE_IN_PLACE(FP2, t1, t2)
197 
198             // Multiply result by smearing term and accumulate.
199             sum.x += t1.x * smearing; sum.y += t1.y * smearing;
200         }
201     }
202 
203     // Reduce results within warp.
204     WARP_REDUCE(sum.x);
205     WARP_REDUCE(sum.y);
206 
207     // Add result of this warp to the baseline visibility.
208     if (i == 0 && (OKN_BPK * SQ + w) < SP)
209     {
210         if (uv_len[w] < uv_min_lambda || uv_len[w] > uv_max_lambda) return;
211         const int q = OKN_BPK * SQ + w;
212         const int j = OSKAR_BASELINE_INDEX(num_stations, SP, q) + offset_out;
213         vis[j].x += sum.x; vis[j].y += sum.y;
214     }
215 }
216 
217 template
218 <
219 // Compile-time parameters.
220 bool BANDWIDTH_SMEARING, bool TIME_SMEARING, bool GAUSSIAN,
221 typename FP, typename FP2
222 >
223 __global__
oskar_xcorr_scalar_SM_cudak(OSKAR_XCORR_ARGS (FP)const FP2 * const __restrict__ jones,FP2 * __restrict__ vis)224 void oskar_xcorr_scalar_SM_cudak(
225         OSKAR_XCORR_ARGS(FP)
226         const FP2* const __restrict__ jones,
227         FP2*             __restrict__ vis)
228 {
229     (void) src_Q;
230     (void) src_U;
231     (void) src_V;
232     __shared__ FP uv_len[OKN_BPK], uu[OKN_BPK], vv[OKN_BPK], ww[OKN_BPK];
233     __shared__ FP uu2[OKN_BPK], vv2[OKN_BPK], uuvv[OKN_BPK];
234     __shared__ FP du[OKN_BPK], dv[OKN_BPK], dw[OKN_BPK];
235     __shared__ const FP2 *st_q[OKN_BPK];
236     __shared__ FP   s_I[OKN_NSOURCES];
237     __shared__ FP   s_l[OKN_NSOURCES];
238     __shared__ FP   s_m[OKN_NSOURCES];
239     __shared__ FP   s_n[OKN_NSOURCES];
240     __shared__ FP   s_a[OKN_NSOURCES];
241     __shared__ FP   s_b[OKN_NSOURCES];
242     __shared__ FP   s_c[OKN_NSOURCES];
243     __shared__ FP2 s_sp[OKN_NSOURCES];
244     FP2 t1, t2, sum;
245 
246     const int w = (threadIdx.x >> 5); // Warp ID.
247     const int i = (threadIdx.x & 31); // ID within warp (local ID).
248 
249     // Return immediately if in the wrong half of the visibility matrix.
250     if (OKN_BPK * SQ >= SP) return;
251 
252     // Get baseline values per warp.
253     if (i == 0)
254     {
255         const int i_sq = OKN_BPK * SQ + w;
256 
257         // Set pointer to source vector for station q to safe position
258         // so non-existence SQ >= SP does not cause problems.
259         st_q[w] = &jones[0];
260 
261         if (i_sq < num_stations)
262         {
263             OSKAR_BASELINE_TERMS(FP, st_u[SP], st_u[i_sq], st_v[SP], st_v[i_sq],
264                     st_w[SP], st_w[i_sq], uu[w], vv[w], ww[w],
265                     uu2[w], vv2[w], uuvv[w], uv_len[w]);
266 
267             if (TIME_SMEARING)
268                 OSKAR_BASELINE_DELTAS(FP, st_x[SP], st_x[i_sq],
269                         st_y[SP], st_y[i_sq], du[w], dv[w], dw[w]);
270 
271             // Get valid pointer to source vector for station q.
272             st_q[w] = &jones[num_src * i_sq];
273         }
274     }
275     __syncthreads();
276 
277     // Get pointer to source vector for station p.
278     const FP2* const __restrict__ st_p = &jones[num_src * SP];
279 
280     // Each thread from given warp loops over a subset of the sources,
281     // and each warp works with a different station q.
282     sum.x = sum.y = (FP)0;
283     const int itemp = (num_src >> 5) * WARP;
284     for (int s = i; s < itemp; s += WARP)
285     {
286         if (w == 0)
287         {
288             s_I[i] = src_I[s];
289             if (BANDWIDTH_SMEARING || TIME_SMEARING)
290                 s_l[i] = src_l[s];
291             if (GAUSSIAN)
292             {
293                 s_a[i] = src_a[s];
294                 s_b[i] = src_b[s];
295             }
296         }
297         if (w == 1)
298         {
299             if (BANDWIDTH_SMEARING || TIME_SMEARING)
300                 s_m[i] = src_m[s];
301             if (GAUSSIAN)
302                 s_c[i] = src_c[s];
303         }
304         if (w == 2)
305         {
306             if (BANDWIDTH_SMEARING || TIME_SMEARING)
307                 s_n[i] = src_n[s];
308         }
309         if (w == 3)
310         {
311             s_sp[i] = st_p[s];
312         }
313         __syncthreads();
314 
315         FP smearing;
316         if (GAUSSIAN)
317         {
318             const FP t = s_a[i] * uu2[w] + s_b[i] * uuvv[w] + s_c[i] * vv2[w];
319             smearing = exp((FP) -t);
320         }
321         else smearing = (FP) 1;
322         if (BANDWIDTH_SMEARING || TIME_SMEARING)
323         {
324             const FP l = s_l[i], m = s_m[i], n = s_n[i] - (FP) 1;
325             if (BANDWIDTH_SMEARING)
326             {
327                 const FP t = uu[w] * l + vv[w] * m + ww[w] * n;
328                 smearing *= OSKAR_SINC(FP, t);
329             }
330             if (TIME_SMEARING)
331             {
332                 const FP t = du[w] * l + dv[w] * m + dw[w] * n;
333                 smearing *= OSKAR_SINC(FP, t);
334             }
335         }
336 
337         // Multiply Jones scalars.
338         smearing *= s_I[i];
339         t1 = s_sp[i];
340         t2 = (st_q[w])[s];
341         OSKAR_MUL_COMPLEX_CONJUGATE_IN_PLACE(FP2, t1, t2)
342 
343         // Multiply result by smearing term and accumulate.
344         sum.x += t1.x * smearing; sum.y += t1.y * smearing;
345         __syncthreads();
346     }
347     if ((num_src & 31) > 0)
348     {
349         int s = (num_src >> 5) * WARP + i;
350         if (s < num_src)
351         {
352             if (w == 0)
353             {
354                 s_I[i] = src_I[s];
355                 if (BANDWIDTH_SMEARING || TIME_SMEARING)
356                     s_l[i] = src_l[s];
357                 if (GAUSSIAN)
358                 {
359                     s_a[i] = src_a[s];
360                     s_b[i] = src_b[s];
361                 }
362             }
363             if (w == 1)
364             {
365                 if (BANDWIDTH_SMEARING || TIME_SMEARING)
366                     s_m[i] = src_m[s];
367                 if (GAUSSIAN)
368                     s_c[i] = src_c[s];
369             }
370             if (w == 2)
371             {
372                 if (BANDWIDTH_SMEARING || TIME_SMEARING)
373                     s_n[i] = src_n[s];
374             }
375             if (w == 3)
376             {
377                 s_sp[i] = st_p[s];
378             }
379         }
380         __syncthreads();
381         if (s < num_src)
382         {
383             FP smearing;
384             if (GAUSSIAN)
385             {
386                 const FP t = s_a[i] * uu2[w] +
387                         s_b[i] * uuvv[w] + s_c[i] * vv2[w];
388                 smearing = exp((FP) -t);
389             }
390             else smearing = (FP) 1;
391             if (BANDWIDTH_SMEARING || TIME_SMEARING)
392             {
393                 const FP l = s_l[i], m = s_m[i], n = s_n[i] - (FP) 1;
394                 if (BANDWIDTH_SMEARING)
395                 {
396                     const FP t = uu[w] * l + vv[w] * m + ww[w] * n;
397                     smearing *= OSKAR_SINC(FP, t);
398                 }
399                 if (TIME_SMEARING)
400                 {
401                     const FP t = du[w] * l + dv[w] * m + dw[w] * n;
402                     smearing *= OSKAR_SINC(FP, t);
403                 }
404             }
405 
406             // Multiply Jones scalars.
407             smearing *= s_I[i];
408             t1 = s_sp[i];
409             t2 = (st_q[w])[s];
410             OSKAR_MUL_COMPLEX_CONJUGATE_IN_PLACE(FP2, t1, t2)
411 
412             // Multiply result by smearing term and accumulate.
413             sum.x += t1.x * smearing; sum.y += t1.y * smearing;
414         }
415     }
416 
417     // Reduce results within warp.
418     WARP_REDUCE(sum.x);
419     WARP_REDUCE(sum.y);
420 
421     // Add result of this warp to the baseline visibility.
422     if (i == 0 && (OKN_BPK * SQ + w) < SP)
423     {
424         if (uv_len[w] < uv_min_lambda || uv_len[w] > uv_max_lambda) return;
425         const int q = OKN_BPK * SQ + w;
426         const int j = OSKAR_BASELINE_INDEX(num_stations, SP, q) + offset_out;
427         vis[j].x += sum.x; vis[j].y += sum.y;
428     }
429 }
430 
431 #define XCORR_KERNEL(NAME, BS, TS, GAUSSIAN, FP, FP2)\
432         NAME<BS, TS, GAUSSIAN, FP, FP2>\
433         OSKAR_CUDAK_CONF(num_blocks, num_threads, shared_mem)\
434         (num_sources, num_stations, offset_out, d_I, 0, 0, 0, d_l, d_m, d_n,\
435                 d_a, d_b, d_c, d_station_u, d_station_v, d_station_w,\
436                 d_station_x, d_station_y, uv_min_lambda, uv_max_lambda,\
437                 inv_wavelength, frac_bandwidth, time_int_sec,\
438                 gha0_rad, dec0_rad, d_jones, d_vis);
439 
440 #define XCORR_SELECT(NAME, GAUSSIAN, FP, FP2)\
441         if (frac_bandwidth == (FP)0 && time_int_sec == (FP)0)\
442             XCORR_KERNEL(NAME, false, false, GAUSSIAN, FP, FP2)\
443         else if (frac_bandwidth != (FP)0 && time_int_sec == (FP)0)\
444             XCORR_KERNEL(NAME, true, false, GAUSSIAN, FP, FP2)\
445         else if (frac_bandwidth == (FP)0 && time_int_sec != (FP)0)\
446             XCORR_KERNEL(NAME, false, true, GAUSSIAN, FP, FP2)\
447         else if (frac_bandwidth != (FP)0 && time_int_sec != (FP)0)\
448             XCORR_KERNEL(NAME, true, true, GAUSSIAN, FP, FP2)
449 
oskar_cross_correlate_scalar_point_cuda_f(int num_sources,int num_stations,int offset_out,const float2 * d_jones,const float * d_I,const float * d_l,const float * d_m,const float * d_n,const float * d_station_u,const float * d_station_v,const float * d_station_w,const float * d_station_x,const float * d_station_y,float uv_min_lambda,float uv_max_lambda,float inv_wavelength,float frac_bandwidth,const float time_int_sec,const float gha0_rad,const float dec0_rad,float2 * d_vis)450 void oskar_cross_correlate_scalar_point_cuda_f(
451         int num_sources, int num_stations, int offset_out,
452         const float2* d_jones, const float* d_I, const float* d_l,
453         const float* d_m, const float* d_n,
454         const float* d_station_u, const float* d_station_v,
455         const float* d_station_w, const float* d_station_x,
456         const float* d_station_y, float uv_min_lambda, float uv_max_lambda,
457         float inv_wavelength, float frac_bandwidth, const float time_int_sec,
458         const float gha0_rad, const float dec0_rad, float2* d_vis)
459 {
460     const dim3 num_threads(128, 1);
461     const float *d_a = 0, *d_b = 0, *d_c = 0;
462     const int ver = correlate_version(false, frac_bandwidth, time_int_sec);
463     if (ver == VER_NON_SM)
464     {
465         dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
466         const size_t shared_mem = 0;
467         XCORR_SELECT(oskar_xcorr_scalar_NON_SM_cudak, false, float, float2)
468     }
469     else if (ver == VER_SM)
470     {
471         dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
472         const size_t shared_mem = 0;
473         XCORR_SELECT(oskar_xcorr_scalar_SM_cudak, false, float, float2)
474     }
475     else
476     {
477         dim3 num_blocks(num_stations, num_stations);
478         const size_t shared_mem = num_threads.x * sizeof(float2);
479         XCORR_SELECT(oskar_xcorr_scalar_cudak, false, float, float2)
480     }
481 }
482 
oskar_cross_correlate_scalar_point_cuda_d(int num_sources,int num_stations,int offset_out,const double2 * d_jones,const double * d_I,const double * d_l,const double * d_m,const double * d_n,const double * d_station_u,const double * d_station_v,const double * d_station_w,const double * d_station_x,const double * d_station_y,double uv_min_lambda,double uv_max_lambda,double inv_wavelength,double frac_bandwidth,const double time_int_sec,const double gha0_rad,const double dec0_rad,double2 * d_vis)483 void oskar_cross_correlate_scalar_point_cuda_d(
484         int num_sources, int num_stations, int offset_out,
485         const double2* d_jones, const double* d_I, const double* d_l,
486         const double* d_m, const double* d_n,
487         const double* d_station_u, const double* d_station_v,
488         const double* d_station_w, const double* d_station_x,
489         const double* d_station_y, double uv_min_lambda, double uv_max_lambda,
490         double inv_wavelength, double frac_bandwidth, const double time_int_sec,
491         const double gha0_rad, const double dec0_rad, double2* d_vis)
492 {
493     const dim3 num_threads(128, 1);
494     const double *d_a = 0, *d_b = 0, *d_c = 0;
495     const int ver = correlate_version(true, frac_bandwidth, time_int_sec);
496     if (ver == VER_NON_SM)
497     {
498         dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
499         const size_t shared_mem = 0;
500         XCORR_SELECT(oskar_xcorr_scalar_NON_SM_cudak, false, double, double2)
501     }
502     else if (ver == VER_SM)
503     {
504         dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
505         const size_t shared_mem = 0;
506         XCORR_SELECT(oskar_xcorr_scalar_SM_cudak, false, double, double2)
507     }
508     else
509     {
510         dim3 num_blocks(num_stations, num_stations);
511         const size_t shared_mem = num_threads.x * sizeof(double2);
512         XCORR_SELECT(oskar_xcorr_scalar_cudak, false, double, double2)
513     }
514 }
515 
oskar_cross_correlate_scalar_gaussian_cuda_f(int num_sources,int num_stations,int offset_out,const float2 * d_jones,const float * d_I,const float * d_l,const float * d_m,const float * d_n,const float * d_a,const float * d_b,const float * d_c,const float * d_station_u,const float * d_station_v,const float * d_station_w,const float * d_station_x,const float * d_station_y,float uv_min_lambda,float uv_max_lambda,float inv_wavelength,float frac_bandwidth,float time_int_sec,float gha0_rad,float dec0_rad,float2 * d_vis)516 void oskar_cross_correlate_scalar_gaussian_cuda_f(
517         int num_sources, int num_stations, int offset_out,
518         const float2* d_jones, const float* d_I, const float* d_l,
519         const float* d_m, const float* d_n,
520         const float* d_a, const float* d_b,
521         const float* d_c, const float* d_station_u,
522         const float* d_station_v, const float* d_station_w,
523         const float* d_station_x, const float* d_station_y,
524         float uv_min_lambda, float uv_max_lambda, float inv_wavelength,
525         float frac_bandwidth, float time_int_sec, float gha0_rad,
526         float dec0_rad, float2* d_vis)
527 {
528     const dim3 num_threads(128, 1);
529     const int ver = correlate_version(false, frac_bandwidth, time_int_sec);
530     if (ver == VER_NON_SM)
531     {
532         dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
533         const size_t shared_mem = 0;
534         XCORR_SELECT(oskar_xcorr_scalar_NON_SM_cudak, true, float, float2)
535     }
536     else if (ver == VER_SM)
537     {
538         dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
539         const size_t shared_mem = 0;
540         XCORR_SELECT(oskar_xcorr_scalar_SM_cudak, true, float, float2)
541     }
542     else
543     {
544         dim3 num_blocks(num_stations, num_stations);
545         const size_t shared_mem = num_threads.x * sizeof(float2);
546         XCORR_SELECT(oskar_xcorr_scalar_cudak, true, float, float2)
547     }
548 }
549 
oskar_cross_correlate_scalar_gaussian_cuda_d(int num_sources,int num_stations,int offset_out,const double2 * d_jones,const double * d_I,const double * d_l,const double * d_m,const double * d_n,const double * d_a,const double * d_b,const double * d_c,const double * d_station_u,const double * d_station_v,const double * d_station_w,const double * d_station_x,const double * d_station_y,double uv_min_lambda,double uv_max_lambda,double inv_wavelength,double frac_bandwidth,double time_int_sec,double gha0_rad,double dec0_rad,double2 * d_vis)550 void oskar_cross_correlate_scalar_gaussian_cuda_d(
551         int num_sources, int num_stations, int offset_out,
552         const double2* d_jones, const double* d_I, const double* d_l,
553         const double* d_m, const double* d_n,
554         const double* d_a, const double* d_b,
555         const double* d_c, const double* d_station_u,
556         const double* d_station_v, const double* d_station_w,
557         const double* d_station_x, const double* d_station_y,
558         double uv_min_lambda, double uv_max_lambda, double inv_wavelength,
559         double frac_bandwidth, double time_int_sec, double gha0_rad,
560         double dec0_rad, double2* d_vis)
561 {
562     const dim3 num_threads(128, 1);
563     const int ver = correlate_version(true, frac_bandwidth, time_int_sec);
564     if (ver == VER_NON_SM)
565     {
566         dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
567         const size_t shared_mem = 0;
568         XCORR_SELECT(oskar_xcorr_scalar_NON_SM_cudak, true, double, double2)
569     }
570     else if (ver == VER_SM)
571     {
572         dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
573         const size_t shared_mem = 0;
574         XCORR_SELECT(oskar_xcorr_scalar_SM_cudak, true, double, double2)
575     }
576     else
577     {
578         dim3 num_blocks(num_stations, num_stations);
579         const size_t shared_mem = num_threads.x * sizeof(double2);
580         XCORR_SELECT(oskar_xcorr_scalar_cudak, true, double, double2)
581     }
582 }
583 
correlate_version(bool prec_double,double frac_bandwidth,double time_int_sec)584 int correlate_version(bool prec_double,
585         double frac_bandwidth, double time_int_sec)
586 {
587     // Check the environment variable if necessary
588     // and use the specified version if it has been set.
589     if (ver_specified_ == VER_UNKNOWN)
590     {
591         const char* v = getenv("OSKAR_CORRELATE");
592         if (v)
593         {
594             if (!strcmp(v, "OLD") || !strcmp(v, "old"))
595                 ver_specified_ = VER_OLD;
596             else if (!strcmp(v, "SM") || !strcmp(v, "sm"))
597                 ver_specified_ = VER_SM;
598             else if (strstr(v, "NO") || strstr(v, "no"))
599                 ver_specified_ = VER_NON_SM;
600         }
601         if (ver_specified_ == VER_UNKNOWN)
602             ver_specified_ = VER_NONE;
603     }
604     if (ver_specified_ > VER_NONE) return ver_specified_;
605 
606     // Check the device compute capability if required.
607     if (ver_cc_ == VER_UNKNOWN)
608     {
609         int ma = 0, mi = 0, id = 0;
610         cudaGetDevice(&id);
611         cudaDeviceGetAttribute(&ma, cudaDevAttrComputeCapabilityMajor, id);
612         cudaDeviceGetAttribute(&mi, cudaDevAttrComputeCapabilityMinor, id);
613         ver_cc_ = 10 * ma + mi;
614     }
615 
616     // Use non-shared-memory version on Volta.
617     if (ver_cc_ >= 70) return VER_NON_SM;
618 
619     // Decide which is the best version to use on pre-Volta architectures.
620     if (ver_cc_ >= 30)
621     {
622         const bool smearing = (frac_bandwidth != 0.0 || time_int_sec != 0.0);
623         if (prec_double && smearing) return VER_NON_SM;
624         if (prec_double && !smearing) return VER_SM;
625         if (!prec_double && !smearing) return VER_NON_SM;
626     }
627 
628     // Otherwise, use the old version.
629     return VER_OLD;
630 }
631