1 /*
2  * Copyright (c) 2017-2021, The OSKAR Developers.
3  * See the LICENSE file at the top-level directory of this distribution.
4  */
5 
6 #include "math/define_dft_c2r.h"
7 #include "math/oskar_dft_c2r.h"
8 #include "utility/oskar_device.h"
9 #include "utility/oskar_kernel_macros.h"
10 
11 OSKAR_DFT_C2R_CPU(dft_c2r_2d_float, 0, float, float2)
12 OSKAR_DFT_C2R_CPU(dft_c2r_3d_float, 1, float, float2)
13 OSKAR_DFT_C2R_CPU(dft_c2r_2d_double, 0, double, double2)
14 OSKAR_DFT_C2R_CPU(dft_c2r_3d_double, 1, double, double2)
15 
oskar_int_range_clamp(int value,int minimum,int maximum)16 static int oskar_int_range_clamp(int value, int minimum, int maximum)
17 {
18    if (value < minimum) return minimum;
19    if (value > maximum) return maximum;
20    return value;
21 }
22 
oskar_dft_c2r(int num_in,double wavenumber,const oskar_Mem * x_in,const oskar_Mem * y_in,const oskar_Mem * z_in,const oskar_Mem * data_in,const oskar_Mem * weights_in,int num_out,const oskar_Mem * x_out,const oskar_Mem * y_out,const oskar_Mem * z_out,oskar_Mem * output,int * status)23 void oskar_dft_c2r(
24         int num_in,
25         double wavenumber,
26         const oskar_Mem* x_in,
27         const oskar_Mem* y_in,
28         const oskar_Mem* z_in,
29         const oskar_Mem* data_in,
30         const oskar_Mem* weights_in,
31         int num_out,
32         const oskar_Mem* x_out,
33         const oskar_Mem* y_out,
34         const oskar_Mem* z_out,
35         oskar_Mem* output,
36         int* status)
37 {
38     if (*status) return;
39     const int location = oskar_mem_location(output);
40     const int type = oskar_mem_precision(output);
41     const int is_dbl = oskar_mem_is_double(output);
42     const int is_3d =
43             (z_in != NULL && z_out != NULL && oskar_mem_length(z_out) > 0);
44     if (!oskar_mem_is_complex(data_in) ||
45             oskar_mem_is_complex(output) ||
46             oskar_mem_is_complex(weights_in) ||
47             oskar_mem_is_matrix(weights_in))
48     {
49         *status = OSKAR_ERR_BAD_DATA_TYPE;
50         return;
51     }
52     if (oskar_mem_location(weights_in) != location ||
53             oskar_mem_location(x_in) != location ||
54             oskar_mem_location(y_in) != location ||
55             oskar_mem_location(x_out) != location ||
56             oskar_mem_location(y_out) != location)
57     {
58         *status = OSKAR_ERR_LOCATION_MISMATCH;
59         return;
60     }
61     if (oskar_mem_precision(data_in) != type ||
62             oskar_mem_precision(weights_in) != type ||
63             oskar_mem_type(x_in) != type ||
64             oskar_mem_type(y_in) != type ||
65             oskar_mem_type(x_out) != type ||
66             oskar_mem_type(y_out) != type)
67     {
68         *status = OSKAR_ERR_TYPE_MISMATCH;
69         return;
70     }
71     if (is_3d)
72     {
73         if (oskar_mem_location(z_in) != location ||
74                 oskar_mem_location(z_out) != location)
75         {
76             *status = OSKAR_ERR_LOCATION_MISMATCH;
77             return;
78         }
79         if (oskar_mem_type(z_in) != type || oskar_mem_type(z_out) != type)
80         {
81             *status = OSKAR_ERR_TYPE_MISMATCH;
82             return;
83         }
84     }
85     oskar_mem_ensure(output, (size_t) num_out, status);
86     if (*status) return;
87     if (location == OSKAR_CPU)
88     {
89         if (is_3d)
90         {
91             if (is_dbl)
92             {
93                 dft_c2r_3d_double(num_in, wavenumber,
94                         oskar_mem_double_const(x_in, status),
95                         oskar_mem_double_const(y_in, status),
96                         oskar_mem_double_const(z_in, status),
97                         oskar_mem_double2_const(data_in, status),
98                         oskar_mem_double_const(weights_in, status), 0, num_out,
99                         oskar_mem_double_const(x_out, status),
100                         oskar_mem_double_const(y_out, status),
101                         oskar_mem_double_const(z_out, status), 0,
102                         oskar_mem_double(output, status), 0);
103             }
104             else
105             {
106                 dft_c2r_3d_float(num_in, (float)wavenumber,
107                         oskar_mem_float_const(x_in, status),
108                         oskar_mem_float_const(y_in, status),
109                         oskar_mem_float_const(z_in, status),
110                         oskar_mem_float2_const(data_in, status),
111                         oskar_mem_float_const(weights_in, status), 0, num_out,
112                         oskar_mem_float_const(x_out, status),
113                         oskar_mem_float_const(y_out, status),
114                         oskar_mem_float_const(z_out, status), 0,
115                         oskar_mem_float(output, status), 0);
116             }
117         }
118         else
119         {
120             if (is_dbl)
121             {
122                 dft_c2r_2d_double(num_in, wavenumber,
123                         oskar_mem_double_const(x_in, status),
124                         oskar_mem_double_const(y_in, status), 0,
125                         oskar_mem_double2_const(data_in, status),
126                         oskar_mem_double_const(weights_in, status), 0, num_out,
127                         oskar_mem_double_const(x_out, status),
128                         oskar_mem_double_const(y_out, status), 0, 0,
129                         oskar_mem_double(output, status), 0);
130             }
131             else
132             {
133                 dft_c2r_2d_float(num_in, (float)wavenumber,
134                         oskar_mem_float_const(x_in, status),
135                         oskar_mem_float_const(y_in, status), 0,
136                         oskar_mem_float2_const(data_in, status),
137                         oskar_mem_float_const(weights_in, status), 0, num_out,
138                         oskar_mem_float_const(x_out, status),
139                         oskar_mem_float_const(y_out, status), 0, 0,
140                         oskar_mem_float(output, status), 0);
141             }
142         }
143     }
144     else
145     {
146         size_t local_size[] = {1, 1, 1}, global_size[] = {1, 1, 1};
147         float wavenumber_f = (float) wavenumber;
148         const void* np = 0;
149         const char* k = 0;
150         int out_size = 0, max_out_size = 0, start = 0;
151         local_size[0] = oskar_device_is_nv(location) ? 384 : 256;
152         oskar_device_check_local_size(location, 0, local_size);
153 
154         /* Select the kernel. */
155         if (is_3d)
156         {
157             k = is_dbl ? "dft_c2r_3d_double" : "dft_c2r_3d_float";
158         }
159         else
160         {
161             k = is_dbl ? "dft_c2r_2d_double" : "dft_c2r_2d_float";
162         }
163 
164         /* Compute the maximum manageable output chunk size. */
165         /* Product of max output and input sizes. */
166         max_out_size = 8192 * (is_dbl ? 32768 : 65536);
167         max_out_size /= num_in;
168         max_out_size = (int) oskar_device_global_size(
169                 (size_t) max_out_size, local_size[0]); /* Last was 1024. */
170         max_out_size = oskar_int_range_clamp(max_out_size,
171                 (int) local_size[0] * 2,
172                 (int) local_size[0] * (is_dbl ? 80 : 160));
173         /* max_in_chunk must be multiple of 16. */
174         const int max_in_chunk = is_3d ?
175                 (is_dbl ? 384 : 800) : (is_dbl ? 448 : 896);
176         const size_t element_size = is_dbl ? sizeof(double) : sizeof(float);
177         const size_t local_mem_size = max_in_chunk * element_size;
178         const size_t arg_size_local[] = {
179                 2 * local_mem_size, 2 * local_mem_size,
180                 (is_3d ? local_mem_size : 0)
181         };
182 
183         /* Loop over output chunks. */
184         for (start = 0; start < num_out; start += max_out_size)
185         {
186             if (*status) break;
187 
188             /* Get the chunk size. */
189             out_size = num_out - start;
190             if (out_size > max_out_size) out_size = max_out_size;
191             global_size[0] = oskar_device_global_size(out_size, local_size[0]);
192 
193             /* Set kernel arguments. */
194             const oskar_Arg args[] = {
195                     {INT_SZ, &num_in},
196                     {is_dbl ? DBL_SZ : FLT_SZ,
197                             is_dbl ? (void*)&wavenumber : (void*)&wavenumber_f},
198                     {PTR_SZ, oskar_mem_buffer_const(x_in)},
199                     {PTR_SZ, oskar_mem_buffer_const(y_in)},
200                     {PTR_SZ, is_3d ? oskar_mem_buffer_const(z_in) : &np},
201                     {PTR_SZ, oskar_mem_buffer_const(data_in)},
202                     {PTR_SZ, oskar_mem_buffer_const(weights_in)},
203                     {INT_SZ, &start},
204                     {INT_SZ, &out_size},
205                     {PTR_SZ, oskar_mem_buffer_const(x_out)},
206                     {PTR_SZ, oskar_mem_buffer_const(y_out)},
207                     {PTR_SZ, is_3d ? oskar_mem_buffer_const(z_out) : &np},
208                     {INT_SZ, &start},
209                     {PTR_SZ, oskar_mem_buffer(output)},
210                     {INT_SZ, &max_in_chunk}
211             };
212             oskar_device_launch_kernel(k, location, 1, local_size, global_size,
213                     sizeof(args) / sizeof(oskar_Arg), args,
214                     sizeof(arg_size_local) / sizeof(size_t), arg_size_local,
215                     status);
216         }
217     }
218 }
219