1 /*
2 * Copyright (c) 2011-2019, The University of Oxford
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions are met:
7 * 1. Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright notice,
10 * this list of conditions and the following disclaimer in the documentation
11 * and/or other materials provided with the distribution.
12 * 3. Neither the name of the University of Oxford nor the names of its
13 * contributors may be used to endorse or promote products derived from this
14 * software without specific prior written permission.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
20 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 #include "correlate/define_correlate_utils.h"
30 #include "correlate/define_cross_correlate.h"
31 #include "correlate/oskar_cross_correlate_scalar_cuda.h"
32 #include "math/define_multiply.h"
33 #include "utility/oskar_kernel_macros.h"
34
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <cuda_runtime.h>
39
40 enum {
41 VER_UNKNOWN = -1, // Not checked.
42 VER_NONE = 0, // Not specified.
43 // Actual versions:
44 VER_OLD = 1,
45 VER_NON_SM = 2,
46 VER_SM = 3
47 };
48 static int ver_specified_ = VER_UNKNOWN;
49 static int ver_cc_ = VER_UNKNOWN;
50 static int correlate_version(bool prec_double,
51 double frac_bandwidth, double time_int_sec);
52
53 // Original kernel.
54 template
55 <
56 // Compile-time parameters.
57 bool BANDWIDTH_SMEARING, bool TIME_SMEARING, bool GAUSSIAN,
58 typename FP, typename FP2
59 >
OSKAR_XCORR_SCALAR_GPU(oskar_xcorr_scalar_cudak,BANDWIDTH_SMEARING,TIME_SMEARING,GAUSSIAN,FP,FP2)60 OSKAR_XCORR_SCALAR_GPU(oskar_xcorr_scalar_cudak, BANDWIDTH_SMEARING, TIME_SMEARING, GAUSSIAN, FP, FP2)
61
62 // Indices into the visibility/baseline matrix.
63 #define SP blockIdx.x /* Column index. */
64 #define SQ blockIdx.y /* Row index. */
65
66 #define OKN_NSOURCES 32
67 #define OKN_BPK 4 /* baselines per kernel */
68 #define WARP 32
69
70 template
71 <
72 // Compile-time parameters.
73 bool BANDWIDTH_SMEARING, bool TIME_SMEARING, bool GAUSSIAN,
74 typename FP, typename FP2
75 >
76 __global__
77 void oskar_xcorr_scalar_NON_SM_cudak(
78 OSKAR_XCORR_ARGS(FP)
79 const FP2* const __restrict__ jones,
80 FP2* __restrict__ vis)
81 {
82 (void) src_Q;
83 (void) src_U;
84 (void) src_V;
85 __shared__ FP uv_len[OKN_BPK], uu[OKN_BPK], vv[OKN_BPK], ww[OKN_BPK];
86 __shared__ FP uu2[OKN_BPK], vv2[OKN_BPK], uuvv[OKN_BPK];
87 __shared__ FP du[OKN_BPK], dv[OKN_BPK], dw[OKN_BPK];
88 __shared__ const FP2 *st_q[OKN_BPK];
89 FP2 t1, t2, sum;
90
91 const int w = (threadIdx.x >> 5); // Warp ID.
92 const int i = (threadIdx.x & 31); // ID within warp (local ID).
93
94 // Return immediately if in the wrong half of the visibility matrix.
95 if (OKN_BPK * SQ >= SP) return;
96
97 // Get baseline values per warp.
98 if (i == 0)
99 {
100 const int i_sq = OKN_BPK * SQ + w;
101
102 // Set pointer to source vector for station q to safe position
103 // so non-existence SQ >= SP does not cause problems.
104 st_q[w] = &jones[0];
105
106 if (i_sq < num_stations)
107 {
108 OSKAR_BASELINE_TERMS(FP, st_u[SP], st_u[i_sq], st_v[SP], st_v[i_sq],
109 st_w[SP], st_w[i_sq], uu[w], vv[w], ww[w],
110 uu2[w], vv2[w], uuvv[w], uv_len[w]);
111
112 if (TIME_SMEARING)
113 OSKAR_BASELINE_DELTAS(FP, st_x[SP], st_x[i_sq],
114 st_y[SP], st_y[i_sq], du[w], dv[w], dw[w]);
115
116 // Get valid pointer to source vector for station q.
117 st_q[w] = &jones[num_src * i_sq];
118 }
119 }
120 __syncthreads();
121
122 // Get pointer to source vector for station p.
123 const FP2* const __restrict__ st_p = &jones[num_src * SP];
124
125 // Each thread from given warp loops over a subset of the sources,
126 // and each warp works with a different station q.
127 sum.x = sum.y = (FP)0;
128 const int itemp = (num_src >> 5) * WARP;
129 for (int s = i; s < itemp; s += WARP)
130 {
131 FP smearing;
132 if (GAUSSIAN)
133 {
134 const FP t = src_a[s] * uu2[w] +
135 src_b[s] * uuvv[w] + src_c[s] * vv2[w];
136 smearing = exp((FP) -t);
137 }
138 else smearing = (FP) 1;
139 if (BANDWIDTH_SMEARING || TIME_SMEARING)
140 {
141 const FP l = src_l[s], m = src_m[s], n = src_n[s] - (FP) 1;
142 if (BANDWIDTH_SMEARING)
143 {
144 const FP t = uu[w] * l + vv[w] * m + ww[w] * n;
145 smearing *= OSKAR_SINC(FP, t);
146 }
147 if (TIME_SMEARING)
148 {
149 const FP t = du[w] * l + dv[w] * m + dw[w] * n;
150 smearing *= OSKAR_SINC(FP, t);
151 }
152 }
153
154 // Multiply Jones scalars.
155 smearing *= src_I[s];
156 t1 = st_p[s];
157 t2 = (st_q[w])[s];
158 OSKAR_MUL_COMPLEX_CONJUGATE_IN_PLACE(FP2, t1, t2)
159
160 // Multiply result by smearing term and accumulate.
161 sum.x += t1.x * smearing; sum.y += t1.y * smearing;
162 __syncthreads();
163 }
164 if ((num_src & 31) > 0)
165 {
166 int s = (num_src >> 5) * WARP + i;
167 if (s < num_src)
168 {
169 FP smearing;
170 if (GAUSSIAN)
171 {
172 const FP t = src_a[s] * uu2[w] +
173 src_b[s] * uuvv[w] + src_c[s] * vv2[w];
174 smearing = exp((FP) -t);
175 }
176 else smearing = (FP) 1;
177 if (BANDWIDTH_SMEARING || TIME_SMEARING)
178 {
179 const FP l = src_l[s], m = src_m[s], n = src_n[s] - (FP) 1;
180 if (BANDWIDTH_SMEARING)
181 {
182 const FP t = uu[w] * l + vv[w] * m + ww[w] * n;
183 smearing *= OSKAR_SINC(FP, t);
184 }
185 if (TIME_SMEARING)
186 {
187 const FP t = du[w] * l + dv[w] * m + dw[w] * n;
188 smearing *= OSKAR_SINC(FP, t);
189 }
190 }
191
192 // Multiply Jones scalars.
193 smearing *= src_I[s];
194 t1 = st_p[s];
195 t2 = (st_q[w])[s];
196 OSKAR_MUL_COMPLEX_CONJUGATE_IN_PLACE(FP2, t1, t2)
197
198 // Multiply result by smearing term and accumulate.
199 sum.x += t1.x * smearing; sum.y += t1.y * smearing;
200 }
201 }
202
203 // Reduce results within warp.
204 WARP_REDUCE(sum.x);
205 WARP_REDUCE(sum.y);
206
207 // Add result of this warp to the baseline visibility.
208 if (i == 0 && (OKN_BPK * SQ + w) < SP)
209 {
210 if (uv_len[w] < uv_min_lambda || uv_len[w] > uv_max_lambda) return;
211 const int q = OKN_BPK * SQ + w;
212 const int j = OSKAR_BASELINE_INDEX(num_stations, SP, q) + offset_out;
213 vis[j].x += sum.x; vis[j].y += sum.y;
214 }
215 }
216
217 template
218 <
219 // Compile-time parameters.
220 bool BANDWIDTH_SMEARING, bool TIME_SMEARING, bool GAUSSIAN,
221 typename FP, typename FP2
222 >
223 __global__
oskar_xcorr_scalar_SM_cudak(OSKAR_XCORR_ARGS (FP)const FP2 * const __restrict__ jones,FP2 * __restrict__ vis)224 void oskar_xcorr_scalar_SM_cudak(
225 OSKAR_XCORR_ARGS(FP)
226 const FP2* const __restrict__ jones,
227 FP2* __restrict__ vis)
228 {
229 (void) src_Q;
230 (void) src_U;
231 (void) src_V;
232 __shared__ FP uv_len[OKN_BPK], uu[OKN_BPK], vv[OKN_BPK], ww[OKN_BPK];
233 __shared__ FP uu2[OKN_BPK], vv2[OKN_BPK], uuvv[OKN_BPK];
234 __shared__ FP du[OKN_BPK], dv[OKN_BPK], dw[OKN_BPK];
235 __shared__ const FP2 *st_q[OKN_BPK];
236 __shared__ FP s_I[OKN_NSOURCES];
237 __shared__ FP s_l[OKN_NSOURCES];
238 __shared__ FP s_m[OKN_NSOURCES];
239 __shared__ FP s_n[OKN_NSOURCES];
240 __shared__ FP s_a[OKN_NSOURCES];
241 __shared__ FP s_b[OKN_NSOURCES];
242 __shared__ FP s_c[OKN_NSOURCES];
243 __shared__ FP2 s_sp[OKN_NSOURCES];
244 FP2 t1, t2, sum;
245
246 const int w = (threadIdx.x >> 5); // Warp ID.
247 const int i = (threadIdx.x & 31); // ID within warp (local ID).
248
249 // Return immediately if in the wrong half of the visibility matrix.
250 if (OKN_BPK * SQ >= SP) return;
251
252 // Get baseline values per warp.
253 if (i == 0)
254 {
255 const int i_sq = OKN_BPK * SQ + w;
256
257 // Set pointer to source vector for station q to safe position
258 // so non-existence SQ >= SP does not cause problems.
259 st_q[w] = &jones[0];
260
261 if (i_sq < num_stations)
262 {
263 OSKAR_BASELINE_TERMS(FP, st_u[SP], st_u[i_sq], st_v[SP], st_v[i_sq],
264 st_w[SP], st_w[i_sq], uu[w], vv[w], ww[w],
265 uu2[w], vv2[w], uuvv[w], uv_len[w]);
266
267 if (TIME_SMEARING)
268 OSKAR_BASELINE_DELTAS(FP, st_x[SP], st_x[i_sq],
269 st_y[SP], st_y[i_sq], du[w], dv[w], dw[w]);
270
271 // Get valid pointer to source vector for station q.
272 st_q[w] = &jones[num_src * i_sq];
273 }
274 }
275 __syncthreads();
276
277 // Get pointer to source vector for station p.
278 const FP2* const __restrict__ st_p = &jones[num_src * SP];
279
280 // Each thread from given warp loops over a subset of the sources,
281 // and each warp works with a different station q.
282 sum.x = sum.y = (FP)0;
283 const int itemp = (num_src >> 5) * WARP;
284 for (int s = i; s < itemp; s += WARP)
285 {
286 if (w == 0)
287 {
288 s_I[i] = src_I[s];
289 if (BANDWIDTH_SMEARING || TIME_SMEARING)
290 s_l[i] = src_l[s];
291 if (GAUSSIAN)
292 {
293 s_a[i] = src_a[s];
294 s_b[i] = src_b[s];
295 }
296 }
297 if (w == 1)
298 {
299 if (BANDWIDTH_SMEARING || TIME_SMEARING)
300 s_m[i] = src_m[s];
301 if (GAUSSIAN)
302 s_c[i] = src_c[s];
303 }
304 if (w == 2)
305 {
306 if (BANDWIDTH_SMEARING || TIME_SMEARING)
307 s_n[i] = src_n[s];
308 }
309 if (w == 3)
310 {
311 s_sp[i] = st_p[s];
312 }
313 __syncthreads();
314
315 FP smearing;
316 if (GAUSSIAN)
317 {
318 const FP t = s_a[i] * uu2[w] + s_b[i] * uuvv[w] + s_c[i] * vv2[w];
319 smearing = exp((FP) -t);
320 }
321 else smearing = (FP) 1;
322 if (BANDWIDTH_SMEARING || TIME_SMEARING)
323 {
324 const FP l = s_l[i], m = s_m[i], n = s_n[i] - (FP) 1;
325 if (BANDWIDTH_SMEARING)
326 {
327 const FP t = uu[w] * l + vv[w] * m + ww[w] * n;
328 smearing *= OSKAR_SINC(FP, t);
329 }
330 if (TIME_SMEARING)
331 {
332 const FP t = du[w] * l + dv[w] * m + dw[w] * n;
333 smearing *= OSKAR_SINC(FP, t);
334 }
335 }
336
337 // Multiply Jones scalars.
338 smearing *= s_I[i];
339 t1 = s_sp[i];
340 t2 = (st_q[w])[s];
341 OSKAR_MUL_COMPLEX_CONJUGATE_IN_PLACE(FP2, t1, t2)
342
343 // Multiply result by smearing term and accumulate.
344 sum.x += t1.x * smearing; sum.y += t1.y * smearing;
345 __syncthreads();
346 }
347 if ((num_src & 31) > 0)
348 {
349 int s = (num_src >> 5) * WARP + i;
350 if (s < num_src)
351 {
352 if (w == 0)
353 {
354 s_I[i] = src_I[s];
355 if (BANDWIDTH_SMEARING || TIME_SMEARING)
356 s_l[i] = src_l[s];
357 if (GAUSSIAN)
358 {
359 s_a[i] = src_a[s];
360 s_b[i] = src_b[s];
361 }
362 }
363 if (w == 1)
364 {
365 if (BANDWIDTH_SMEARING || TIME_SMEARING)
366 s_m[i] = src_m[s];
367 if (GAUSSIAN)
368 s_c[i] = src_c[s];
369 }
370 if (w == 2)
371 {
372 if (BANDWIDTH_SMEARING || TIME_SMEARING)
373 s_n[i] = src_n[s];
374 }
375 if (w == 3)
376 {
377 s_sp[i] = st_p[s];
378 }
379 }
380 __syncthreads();
381 if (s < num_src)
382 {
383 FP smearing;
384 if (GAUSSIAN)
385 {
386 const FP t = s_a[i] * uu2[w] +
387 s_b[i] * uuvv[w] + s_c[i] * vv2[w];
388 smearing = exp((FP) -t);
389 }
390 else smearing = (FP) 1;
391 if (BANDWIDTH_SMEARING || TIME_SMEARING)
392 {
393 const FP l = s_l[i], m = s_m[i], n = s_n[i] - (FP) 1;
394 if (BANDWIDTH_SMEARING)
395 {
396 const FP t = uu[w] * l + vv[w] * m + ww[w] * n;
397 smearing *= OSKAR_SINC(FP, t);
398 }
399 if (TIME_SMEARING)
400 {
401 const FP t = du[w] * l + dv[w] * m + dw[w] * n;
402 smearing *= OSKAR_SINC(FP, t);
403 }
404 }
405
406 // Multiply Jones scalars.
407 smearing *= s_I[i];
408 t1 = s_sp[i];
409 t2 = (st_q[w])[s];
410 OSKAR_MUL_COMPLEX_CONJUGATE_IN_PLACE(FP2, t1, t2)
411
412 // Multiply result by smearing term and accumulate.
413 sum.x += t1.x * smearing; sum.y += t1.y * smearing;
414 }
415 }
416
417 // Reduce results within warp.
418 WARP_REDUCE(sum.x);
419 WARP_REDUCE(sum.y);
420
421 // Add result of this warp to the baseline visibility.
422 if (i == 0 && (OKN_BPK * SQ + w) < SP)
423 {
424 if (uv_len[w] < uv_min_lambda || uv_len[w] > uv_max_lambda) return;
425 const int q = OKN_BPK * SQ + w;
426 const int j = OSKAR_BASELINE_INDEX(num_stations, SP, q) + offset_out;
427 vis[j].x += sum.x; vis[j].y += sum.y;
428 }
429 }
430
431 #define XCORR_KERNEL(NAME, BS, TS, GAUSSIAN, FP, FP2)\
432 NAME<BS, TS, GAUSSIAN, FP, FP2>\
433 OSKAR_CUDAK_CONF(num_blocks, num_threads, shared_mem)\
434 (num_sources, num_stations, offset_out, d_I, 0, 0, 0, d_l, d_m, d_n,\
435 d_a, d_b, d_c, d_station_u, d_station_v, d_station_w,\
436 d_station_x, d_station_y, uv_min_lambda, uv_max_lambda,\
437 inv_wavelength, frac_bandwidth, time_int_sec,\
438 gha0_rad, dec0_rad, d_jones, d_vis);
439
440 #define XCORR_SELECT(NAME, GAUSSIAN, FP, FP2)\
441 if (frac_bandwidth == (FP)0 && time_int_sec == (FP)0)\
442 XCORR_KERNEL(NAME, false, false, GAUSSIAN, FP, FP2)\
443 else if (frac_bandwidth != (FP)0 && time_int_sec == (FP)0)\
444 XCORR_KERNEL(NAME, true, false, GAUSSIAN, FP, FP2)\
445 else if (frac_bandwidth == (FP)0 && time_int_sec != (FP)0)\
446 XCORR_KERNEL(NAME, false, true, GAUSSIAN, FP, FP2)\
447 else if (frac_bandwidth != (FP)0 && time_int_sec != (FP)0)\
448 XCORR_KERNEL(NAME, true, true, GAUSSIAN, FP, FP2)
449
oskar_cross_correlate_scalar_point_cuda_f(int num_sources,int num_stations,int offset_out,const float2 * d_jones,const float * d_I,const float * d_l,const float * d_m,const float * d_n,const float * d_station_u,const float * d_station_v,const float * d_station_w,const float * d_station_x,const float * d_station_y,float uv_min_lambda,float uv_max_lambda,float inv_wavelength,float frac_bandwidth,const float time_int_sec,const float gha0_rad,const float dec0_rad,float2 * d_vis)450 void oskar_cross_correlate_scalar_point_cuda_f(
451 int num_sources, int num_stations, int offset_out,
452 const float2* d_jones, const float* d_I, const float* d_l,
453 const float* d_m, const float* d_n,
454 const float* d_station_u, const float* d_station_v,
455 const float* d_station_w, const float* d_station_x,
456 const float* d_station_y, float uv_min_lambda, float uv_max_lambda,
457 float inv_wavelength, float frac_bandwidth, const float time_int_sec,
458 const float gha0_rad, const float dec0_rad, float2* d_vis)
459 {
460 const dim3 num_threads(128, 1);
461 const float *d_a = 0, *d_b = 0, *d_c = 0;
462 const int ver = correlate_version(false, frac_bandwidth, time_int_sec);
463 if (ver == VER_NON_SM)
464 {
465 dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
466 const size_t shared_mem = 0;
467 XCORR_SELECT(oskar_xcorr_scalar_NON_SM_cudak, false, float, float2)
468 }
469 else if (ver == VER_SM)
470 {
471 dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
472 const size_t shared_mem = 0;
473 XCORR_SELECT(oskar_xcorr_scalar_SM_cudak, false, float, float2)
474 }
475 else
476 {
477 dim3 num_blocks(num_stations, num_stations);
478 const size_t shared_mem = num_threads.x * sizeof(float2);
479 XCORR_SELECT(oskar_xcorr_scalar_cudak, false, float, float2)
480 }
481 }
482
oskar_cross_correlate_scalar_point_cuda_d(int num_sources,int num_stations,int offset_out,const double2 * d_jones,const double * d_I,const double * d_l,const double * d_m,const double * d_n,const double * d_station_u,const double * d_station_v,const double * d_station_w,const double * d_station_x,const double * d_station_y,double uv_min_lambda,double uv_max_lambda,double inv_wavelength,double frac_bandwidth,const double time_int_sec,const double gha0_rad,const double dec0_rad,double2 * d_vis)483 void oskar_cross_correlate_scalar_point_cuda_d(
484 int num_sources, int num_stations, int offset_out,
485 const double2* d_jones, const double* d_I, const double* d_l,
486 const double* d_m, const double* d_n,
487 const double* d_station_u, const double* d_station_v,
488 const double* d_station_w, const double* d_station_x,
489 const double* d_station_y, double uv_min_lambda, double uv_max_lambda,
490 double inv_wavelength, double frac_bandwidth, const double time_int_sec,
491 const double gha0_rad, const double dec0_rad, double2* d_vis)
492 {
493 const dim3 num_threads(128, 1);
494 const double *d_a = 0, *d_b = 0, *d_c = 0;
495 const int ver = correlate_version(true, frac_bandwidth, time_int_sec);
496 if (ver == VER_NON_SM)
497 {
498 dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
499 const size_t shared_mem = 0;
500 XCORR_SELECT(oskar_xcorr_scalar_NON_SM_cudak, false, double, double2)
501 }
502 else if (ver == VER_SM)
503 {
504 dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
505 const size_t shared_mem = 0;
506 XCORR_SELECT(oskar_xcorr_scalar_SM_cudak, false, double, double2)
507 }
508 else
509 {
510 dim3 num_blocks(num_stations, num_stations);
511 const size_t shared_mem = num_threads.x * sizeof(double2);
512 XCORR_SELECT(oskar_xcorr_scalar_cudak, false, double, double2)
513 }
514 }
515
oskar_cross_correlate_scalar_gaussian_cuda_f(int num_sources,int num_stations,int offset_out,const float2 * d_jones,const float * d_I,const float * d_l,const float * d_m,const float * d_n,const float * d_a,const float * d_b,const float * d_c,const float * d_station_u,const float * d_station_v,const float * d_station_w,const float * d_station_x,const float * d_station_y,float uv_min_lambda,float uv_max_lambda,float inv_wavelength,float frac_bandwidth,float time_int_sec,float gha0_rad,float dec0_rad,float2 * d_vis)516 void oskar_cross_correlate_scalar_gaussian_cuda_f(
517 int num_sources, int num_stations, int offset_out,
518 const float2* d_jones, const float* d_I, const float* d_l,
519 const float* d_m, const float* d_n,
520 const float* d_a, const float* d_b,
521 const float* d_c, const float* d_station_u,
522 const float* d_station_v, const float* d_station_w,
523 const float* d_station_x, const float* d_station_y,
524 float uv_min_lambda, float uv_max_lambda, float inv_wavelength,
525 float frac_bandwidth, float time_int_sec, float gha0_rad,
526 float dec0_rad, float2* d_vis)
527 {
528 const dim3 num_threads(128, 1);
529 const int ver = correlate_version(false, frac_bandwidth, time_int_sec);
530 if (ver == VER_NON_SM)
531 {
532 dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
533 const size_t shared_mem = 0;
534 XCORR_SELECT(oskar_xcorr_scalar_NON_SM_cudak, true, float, float2)
535 }
536 else if (ver == VER_SM)
537 {
538 dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
539 const size_t shared_mem = 0;
540 XCORR_SELECT(oskar_xcorr_scalar_SM_cudak, true, float, float2)
541 }
542 else
543 {
544 dim3 num_blocks(num_stations, num_stations);
545 const size_t shared_mem = num_threads.x * sizeof(float2);
546 XCORR_SELECT(oskar_xcorr_scalar_cudak, true, float, float2)
547 }
548 }
549
oskar_cross_correlate_scalar_gaussian_cuda_d(int num_sources,int num_stations,int offset_out,const double2 * d_jones,const double * d_I,const double * d_l,const double * d_m,const double * d_n,const double * d_a,const double * d_b,const double * d_c,const double * d_station_u,const double * d_station_v,const double * d_station_w,const double * d_station_x,const double * d_station_y,double uv_min_lambda,double uv_max_lambda,double inv_wavelength,double frac_bandwidth,double time_int_sec,double gha0_rad,double dec0_rad,double2 * d_vis)550 void oskar_cross_correlate_scalar_gaussian_cuda_d(
551 int num_sources, int num_stations, int offset_out,
552 const double2* d_jones, const double* d_I, const double* d_l,
553 const double* d_m, const double* d_n,
554 const double* d_a, const double* d_b,
555 const double* d_c, const double* d_station_u,
556 const double* d_station_v, const double* d_station_w,
557 const double* d_station_x, const double* d_station_y,
558 double uv_min_lambda, double uv_max_lambda, double inv_wavelength,
559 double frac_bandwidth, double time_int_sec, double gha0_rad,
560 double dec0_rad, double2* d_vis)
561 {
562 const dim3 num_threads(128, 1);
563 const int ver = correlate_version(true, frac_bandwidth, time_int_sec);
564 if (ver == VER_NON_SM)
565 {
566 dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
567 const size_t shared_mem = 0;
568 XCORR_SELECT(oskar_xcorr_scalar_NON_SM_cudak, true, double, double2)
569 }
570 else if (ver == VER_SM)
571 {
572 dim3 num_blocks(num_stations, (num_stations + OKN_BPK - 1) / OKN_BPK);
573 const size_t shared_mem = 0;
574 XCORR_SELECT(oskar_xcorr_scalar_SM_cudak, true, double, double2)
575 }
576 else
577 {
578 dim3 num_blocks(num_stations, num_stations);
579 const size_t shared_mem = num_threads.x * sizeof(double2);
580 XCORR_SELECT(oskar_xcorr_scalar_cudak, true, double, double2)
581 }
582 }
583
correlate_version(bool prec_double,double frac_bandwidth,double time_int_sec)584 int correlate_version(bool prec_double,
585 double frac_bandwidth, double time_int_sec)
586 {
587 // Check the environment variable if necessary
588 // and use the specified version if it has been set.
589 if (ver_specified_ == VER_UNKNOWN)
590 {
591 const char* v = getenv("OSKAR_CORRELATE");
592 if (v)
593 {
594 if (!strcmp(v, "OLD") || !strcmp(v, "old"))
595 ver_specified_ = VER_OLD;
596 else if (!strcmp(v, "SM") || !strcmp(v, "sm"))
597 ver_specified_ = VER_SM;
598 else if (strstr(v, "NO") || strstr(v, "no"))
599 ver_specified_ = VER_NON_SM;
600 }
601 if (ver_specified_ == VER_UNKNOWN)
602 ver_specified_ = VER_NONE;
603 }
604 if (ver_specified_ > VER_NONE) return ver_specified_;
605
606 // Check the device compute capability if required.
607 if (ver_cc_ == VER_UNKNOWN)
608 {
609 int ma = 0, mi = 0, id = 0;
610 cudaGetDevice(&id);
611 cudaDeviceGetAttribute(&ma, cudaDevAttrComputeCapabilityMajor, id);
612 cudaDeviceGetAttribute(&mi, cudaDevAttrComputeCapabilityMinor, id);
613 ver_cc_ = 10 * ma + mi;
614 }
615
616 // Use non-shared-memory version on Volta.
617 if (ver_cc_ >= 70) return VER_NON_SM;
618
619 // Decide which is the best version to use on pre-Volta architectures.
620 if (ver_cc_ >= 30)
621 {
622 const bool smearing = (frac_bandwidth != 0.0 || time_int_sec != 0.0);
623 if (prec_double && smearing) return VER_NON_SM;
624 if (prec_double && !smearing) return VER_SM;
625 if (!prec_double && !smearing) return VER_NON_SM;
626 }
627
628 // Otherwise, use the old version.
629 return VER_OLD;
630 }
631