1 /*
2 * Copyright (c) 2017-2021, The OSKAR Developers.
3 * See the LICENSE file at the top-level directory of this distribution.
4 */
5
6 #include "math/define_dft_c2r.h"
7 #include "math/oskar_dft_c2r.h"
8 #include "utility/oskar_device.h"
9 #include "utility/oskar_kernel_macros.h"
10
11 OSKAR_DFT_C2R_CPU(dft_c2r_2d_float, 0, float, float2)
12 OSKAR_DFT_C2R_CPU(dft_c2r_3d_float, 1, float, float2)
13 OSKAR_DFT_C2R_CPU(dft_c2r_2d_double, 0, double, double2)
14 OSKAR_DFT_C2R_CPU(dft_c2r_3d_double, 1, double, double2)
15
oskar_int_range_clamp(int value,int minimum,int maximum)16 static int oskar_int_range_clamp(int value, int minimum, int maximum)
17 {
18 if (value < minimum) return minimum;
19 if (value > maximum) return maximum;
20 return value;
21 }
22
oskar_dft_c2r(int num_in,double wavenumber,const oskar_Mem * x_in,const oskar_Mem * y_in,const oskar_Mem * z_in,const oskar_Mem * data_in,const oskar_Mem * weights_in,int num_out,const oskar_Mem * x_out,const oskar_Mem * y_out,const oskar_Mem * z_out,oskar_Mem * output,int * status)23 void oskar_dft_c2r(
24 int num_in,
25 double wavenumber,
26 const oskar_Mem* x_in,
27 const oskar_Mem* y_in,
28 const oskar_Mem* z_in,
29 const oskar_Mem* data_in,
30 const oskar_Mem* weights_in,
31 int num_out,
32 const oskar_Mem* x_out,
33 const oskar_Mem* y_out,
34 const oskar_Mem* z_out,
35 oskar_Mem* output,
36 int* status)
37 {
38 if (*status) return;
39 const int location = oskar_mem_location(output);
40 const int type = oskar_mem_precision(output);
41 const int is_dbl = oskar_mem_is_double(output);
42 const int is_3d =
43 (z_in != NULL && z_out != NULL && oskar_mem_length(z_out) > 0);
44 if (!oskar_mem_is_complex(data_in) ||
45 oskar_mem_is_complex(output) ||
46 oskar_mem_is_complex(weights_in) ||
47 oskar_mem_is_matrix(weights_in))
48 {
49 *status = OSKAR_ERR_BAD_DATA_TYPE;
50 return;
51 }
52 if (oskar_mem_location(weights_in) != location ||
53 oskar_mem_location(x_in) != location ||
54 oskar_mem_location(y_in) != location ||
55 oskar_mem_location(x_out) != location ||
56 oskar_mem_location(y_out) != location)
57 {
58 *status = OSKAR_ERR_LOCATION_MISMATCH;
59 return;
60 }
61 if (oskar_mem_precision(data_in) != type ||
62 oskar_mem_precision(weights_in) != type ||
63 oskar_mem_type(x_in) != type ||
64 oskar_mem_type(y_in) != type ||
65 oskar_mem_type(x_out) != type ||
66 oskar_mem_type(y_out) != type)
67 {
68 *status = OSKAR_ERR_TYPE_MISMATCH;
69 return;
70 }
71 if (is_3d)
72 {
73 if (oskar_mem_location(z_in) != location ||
74 oskar_mem_location(z_out) != location)
75 {
76 *status = OSKAR_ERR_LOCATION_MISMATCH;
77 return;
78 }
79 if (oskar_mem_type(z_in) != type || oskar_mem_type(z_out) != type)
80 {
81 *status = OSKAR_ERR_TYPE_MISMATCH;
82 return;
83 }
84 }
85 oskar_mem_ensure(output, (size_t) num_out, status);
86 if (*status) return;
87 if (location == OSKAR_CPU)
88 {
89 if (is_3d)
90 {
91 if (is_dbl)
92 {
93 dft_c2r_3d_double(num_in, wavenumber,
94 oskar_mem_double_const(x_in, status),
95 oskar_mem_double_const(y_in, status),
96 oskar_mem_double_const(z_in, status),
97 oskar_mem_double2_const(data_in, status),
98 oskar_mem_double_const(weights_in, status), 0, num_out,
99 oskar_mem_double_const(x_out, status),
100 oskar_mem_double_const(y_out, status),
101 oskar_mem_double_const(z_out, status), 0,
102 oskar_mem_double(output, status), 0);
103 }
104 else
105 {
106 dft_c2r_3d_float(num_in, (float)wavenumber,
107 oskar_mem_float_const(x_in, status),
108 oskar_mem_float_const(y_in, status),
109 oskar_mem_float_const(z_in, status),
110 oskar_mem_float2_const(data_in, status),
111 oskar_mem_float_const(weights_in, status), 0, num_out,
112 oskar_mem_float_const(x_out, status),
113 oskar_mem_float_const(y_out, status),
114 oskar_mem_float_const(z_out, status), 0,
115 oskar_mem_float(output, status), 0);
116 }
117 }
118 else
119 {
120 if (is_dbl)
121 {
122 dft_c2r_2d_double(num_in, wavenumber,
123 oskar_mem_double_const(x_in, status),
124 oskar_mem_double_const(y_in, status), 0,
125 oskar_mem_double2_const(data_in, status),
126 oskar_mem_double_const(weights_in, status), 0, num_out,
127 oskar_mem_double_const(x_out, status),
128 oskar_mem_double_const(y_out, status), 0, 0,
129 oskar_mem_double(output, status), 0);
130 }
131 else
132 {
133 dft_c2r_2d_float(num_in, (float)wavenumber,
134 oskar_mem_float_const(x_in, status),
135 oskar_mem_float_const(y_in, status), 0,
136 oskar_mem_float2_const(data_in, status),
137 oskar_mem_float_const(weights_in, status), 0, num_out,
138 oskar_mem_float_const(x_out, status),
139 oskar_mem_float_const(y_out, status), 0, 0,
140 oskar_mem_float(output, status), 0);
141 }
142 }
143 }
144 else
145 {
146 size_t local_size[] = {1, 1, 1}, global_size[] = {1, 1, 1};
147 float wavenumber_f = (float) wavenumber;
148 const void* np = 0;
149 const char* k = 0;
150 int out_size = 0, max_out_size = 0, start = 0;
151 local_size[0] = oskar_device_is_nv(location) ? 384 : 256;
152 oskar_device_check_local_size(location, 0, local_size);
153
154 /* Select the kernel. */
155 if (is_3d)
156 {
157 k = is_dbl ? "dft_c2r_3d_double" : "dft_c2r_3d_float";
158 }
159 else
160 {
161 k = is_dbl ? "dft_c2r_2d_double" : "dft_c2r_2d_float";
162 }
163
164 /* Compute the maximum manageable output chunk size. */
165 /* Product of max output and input sizes. */
166 max_out_size = 8192 * (is_dbl ? 32768 : 65536);
167 max_out_size /= num_in;
168 max_out_size = (int) oskar_device_global_size(
169 (size_t) max_out_size, local_size[0]); /* Last was 1024. */
170 max_out_size = oskar_int_range_clamp(max_out_size,
171 (int) local_size[0] * 2,
172 (int) local_size[0] * (is_dbl ? 80 : 160));
173 /* max_in_chunk must be multiple of 16. */
174 const int max_in_chunk = is_3d ?
175 (is_dbl ? 384 : 800) : (is_dbl ? 448 : 896);
176 const size_t element_size = is_dbl ? sizeof(double) : sizeof(float);
177 const size_t local_mem_size = max_in_chunk * element_size;
178 const size_t arg_size_local[] = {
179 2 * local_mem_size, 2 * local_mem_size,
180 (is_3d ? local_mem_size : 0)
181 };
182
183 /* Loop over output chunks. */
184 for (start = 0; start < num_out; start += max_out_size)
185 {
186 if (*status) break;
187
188 /* Get the chunk size. */
189 out_size = num_out - start;
190 if (out_size > max_out_size) out_size = max_out_size;
191 global_size[0] = oskar_device_global_size(out_size, local_size[0]);
192
193 /* Set kernel arguments. */
194 const oskar_Arg args[] = {
195 {INT_SZ, &num_in},
196 {is_dbl ? DBL_SZ : FLT_SZ,
197 is_dbl ? (void*)&wavenumber : (void*)&wavenumber_f},
198 {PTR_SZ, oskar_mem_buffer_const(x_in)},
199 {PTR_SZ, oskar_mem_buffer_const(y_in)},
200 {PTR_SZ, is_3d ? oskar_mem_buffer_const(z_in) : &np},
201 {PTR_SZ, oskar_mem_buffer_const(data_in)},
202 {PTR_SZ, oskar_mem_buffer_const(weights_in)},
203 {INT_SZ, &start},
204 {INT_SZ, &out_size},
205 {PTR_SZ, oskar_mem_buffer_const(x_out)},
206 {PTR_SZ, oskar_mem_buffer_const(y_out)},
207 {PTR_SZ, is_3d ? oskar_mem_buffer_const(z_out) : &np},
208 {INT_SZ, &start},
209 {PTR_SZ, oskar_mem_buffer(output)},
210 {INT_SZ, &max_in_chunk}
211 };
212 oskar_device_launch_kernel(k, location, 1, local_size, global_size,
213 sizeof(args) / sizeof(oskar_Arg), args,
214 sizeof(arg_size_local) / sizeof(size_t), arg_size_local,
215 status);
216 }
217 }
218 }
219