1 /*
2 * Copyright (c) 2016-2021, The OSKAR Developers.
3 * See the LICENSE file at the top-level directory of this distribution.
4 */
5
6 #include "imager/private_imager.h"
7 #include "imager/oskar_imager.h"
8
9 #include "imager/oskar_grid_functions_spheroidal.h"
10 #include "imager/private_imager_composite_nearest_even.h"
11 #include "imager/private_imager_generate_w_phase_screen.h"
12 #include "imager/private_imager_init_wproj.h"
13 #include "math/oskar_cmath.h"
14 #include "math/oskar_fft.h"
15 #include "utility/oskar_device.h"
16 #include "utility/oskar_get_memory_usage.h"
17
18 #include <stdlib.h>
19 #include <string.h>
20 #include <stdio.h>
21
22 #ifdef __cplusplus
23 extern "C" {
24 #endif
25
26 #define MIN(a,b) ((a) < (b) ? (a) : (b))
27 #define MAX(a,b) ((a) > (b) ? (a) : (b))
28
29 #include <fitsio.h>
30
31 static void oskar_imager_evaluate_w_kernel_params(const oskar_Imager* h,
32 int* num_w_planes, double* w_scale);
33
34 static oskar_Mem* oskar_imager_evaluate_w_kernel_cube(oskar_Imager* h,
35 int num_w_planes, double w_scale,
36 size_t* conv_size_half, double* norm_factor, int* status);
37
38 static oskar_Mem* oskar_imager_evaluate_w_kernel_support_sizes(
39 int num_w_planes, int oversample, size_t conv_size_half,
40 const oskar_Mem* kernel_cube, double norm_factor, int* status);
41
42 static void oskar_imager_normalise_kernel_cube(const oskar_Mem* support,
43 int oversample, size_t conv_size_half, oskar_Mem* kernel_cube,
44 int* status);
45
46 static void oskar_imager_trim_and_save_kernel_cube(oskar_Imager* h,
47 int num_planes, oskar_Mem* support, size_t* conv_size_half,
48 oskar_Mem* kernel_cube, int* status);
49
50 static void oskar_imager_rearrange_kernels(int num_w_planes,
51 const oskar_Mem* support, int oversample, size_t conv_size_half,
52 const oskar_Mem* kernels_in, oskar_Mem* kernels_out,
53 int* rearranged_kernel_start, int* status);
54
55 /*
56 * W-kernel generation is based on CASA implementation
57 * in code/synthesis/TransformMachines/WPConvFunc.cc
58 */
oskar_imager_init_wproj(oskar_Imager * h,int * status)59 void oskar_imager_init_wproj(oskar_Imager* h, int* status)
60 {
61 size_t conv_size_half = 0;
62 double norm_factor = 1.;
63 oskar_Mem *kernel_cube = 0;
64 const int save_kernels = 0;
65 if (*status) return;
66
67 /* Evaluate number of w-projection planes, and w-scale. */
68 oskar_imager_evaluate_w_kernel_params(h, &h->num_w_planes, &h->w_scale);
69
70 /* Evaluate unnormalised kernels. */
71 kernel_cube = oskar_imager_evaluate_w_kernel_cube(h, h->num_w_planes,
72 h->w_scale, &conv_size_half, &norm_factor, status);
73
74 /* Evaluate the support size of each kernel. */
75 oskar_mem_free(h->w_support, status);
76 h->w_support = oskar_imager_evaluate_w_kernel_support_sizes(
77 h->num_w_planes, h->oversample, conv_size_half,
78 kernel_cube, norm_factor, status);
79
80 #if 0
81 /* Print kernel support sizes. */
82 {
83 int i = 0;
84 for (i = 0; i < h->num_w_planes; ++i)
85 {
86 const int* supp = oskar_mem_int_const(h->w_support, status);
87 printf("Plane %d, support: %d\n", i, supp[i]);
88 }
89 }
90 #endif
91
92 /* Normalise the kernel cube. */
93 oskar_imager_normalise_kernel_cube(h->w_support, h->oversample,
94 conv_size_half, kernel_cube, status);
95 if (save_kernels)
96 {
97 oskar_imager_trim_and_save_kernel_cube(h, h->num_w_planes,
98 h->w_support, &conv_size_half, kernel_cube, status);
99 }
100
101 /* Rearrange and compact the kernels. */
102 oskar_mem_free(h->w_kernels_compact, status);
103 oskar_mem_free(h->w_kernel_start, status);
104 h->w_kernel_start = oskar_mem_create(OSKAR_INT, OSKAR_CPU,
105 h->num_w_planes, status);
106 h->w_kernels_compact = oskar_mem_create(h->imager_prec| OSKAR_COMPLEX,
107 OSKAR_CPU, 0, status);
108 oskar_imager_rearrange_kernels(h->num_w_planes, h->w_support,
109 h->oversample, conv_size_half, kernel_cube, h->w_kernels_compact,
110 oskar_mem_int(h->w_kernel_start, status), status);
111 oskar_mem_free(kernel_cube, status);
112
113 /* Record data about the kernels. */
114 oskar_log_message(h->log, 'M', 0, "Baseline W values (wavelengths)");
115 oskar_log_message(h->log, 'M', 1, "Min: %.12e", h->ww_min);
116 oskar_log_message(h->log, 'M', 1, "Max: %.12e", h->ww_max);
117 oskar_log_message(h->log, 'M', 1, "RMS: %.12e", h->ww_rms);
118 oskar_log_message(h->log, 'M', 0,
119 "Using %d W-projection planes.", h->num_w_planes);
120 oskar_log_message(h->log, 'M', 0,
121 "Convolution kernel support range %d to %d.",
122 oskar_mem_int(h->w_support, status)[0],
123 oskar_mem_int(h->w_support, status)[h->num_w_planes - 1]);
124 const size_t compacted_len = oskar_mem_length(h->w_kernels_compact) *
125 oskar_mem_element_size(h->imager_prec | OSKAR_COMPLEX);
126 oskar_log_message(h->log, 'M', 0, "Convolution kernels use %.1f MB.",
127 compacted_len * 1e-6);
128
129 /* Copy to device memory if required. */
130 if (h->grid_on_gpu && h->num_gpus > 0)
131 {
132 int i = 0;
133 if (h->num_devices < h->num_gpus)
134 {
135 oskar_imager_set_num_devices(h, h->num_gpus);
136 }
137 for (i = 0; i < h->num_gpus; ++i)
138 {
139 DeviceData* d = &h->d[i];
140 oskar_device_set(h->dev_loc, h->gpu_ids[i], status);
141 if (*status) break;
142 oskar_mem_free(d->w_kernels_compact, status);
143 oskar_mem_free(d->w_kernel_start, status);
144 oskar_mem_free(d->w_support, status);
145 d->w_kernels_compact = oskar_mem_create_copy(
146 h->w_kernels_compact, h->dev_loc, status);
147 d->w_kernel_start = oskar_mem_create_copy(
148 h->w_kernel_start, h->dev_loc, status);
149 d->w_support = oskar_mem_create_copy(
150 h->w_support, h->dev_loc, status);
151 }
152
153 /* No longer need kernels in host memory. */
154 oskar_mem_free(h->w_kernels_compact, status);
155 h->w_kernels_compact = 0;
156 }
157 }
158
159
oskar_imager_evaluate_w_kernel_params(const oskar_Imager * h,int * num_w_planes,double * w_scale)160 static void oskar_imager_evaluate_w_kernel_params(const oskar_Imager* h,
161 int* num_w_planes, double* w_scale)
162 {
163 double max_uvw = 0.0;
164 if (h->ww_max > 0.0)
165 {
166 const double ww_mid = 0.5 * (h->ww_min + h->ww_max);
167 max_uvw = 1.05 * h->ww_max;
168 if (h->ww_rms > ww_mid)
169 {
170 max_uvw *= h->ww_rms / ww_mid;
171 }
172 }
173 else
174 {
175 max_uvw = 0.25 / fabs(h->cellsize_rad);
176 }
177 if (*num_w_planes < 1)
178 {
179 *num_w_planes = (int)(max_uvw *
180 fabs(sin(h->cellsize_rad * h->image_size / 2.0)));
181 }
182 if (*num_w_planes < 16) *num_w_planes = 16;
183 *w_scale = pow(*num_w_planes - 1, 2.0) / max_uvw;
184 }
185
186
oskar_imager_evaluate_w_kernel_cube(oskar_Imager * h,int num_w_planes,double w_scale,size_t * conv_size_half,double * norm_factor,int * status)187 static oskar_Mem* oskar_imager_evaluate_w_kernel_cube(oskar_Imager* h,
188 int num_w_planes, double w_scale,
189 size_t* conv_size_half, double* norm_factor, int* status)
190 {
191 size_t max_mem_bytes = 0;
192 oskar_FFT* fft = 0;
193 oskar_Mem *screen = 0, *screen_gpu = 0, *screen_ptr = 0;
194 oskar_Mem *taper = 0, *taper_gpu = 0, *taper_ptr = 0;
195 oskar_Mem *kernel_cube = 0;
196 char *ptr_out = 0, *ptr_in = 0;
197 double *maxes = 0, max_val = -INT_MAX, sampling = 0.0;
198 int i = 0;
199 if (*status) return 0;
200
201 /* Calculate convolution kernel size. */
202 const size_t max_bytes_per_plane = 64 * 1024 * 1024; /* 64 MB/plane */
203 max_mem_bytes = oskar_get_total_physical_memory();
204 max_mem_bytes = MIN(max_mem_bytes, max_bytes_per_plane * num_w_planes);
205 const double max_conv_size = sqrt(max_mem_bytes / (16. * num_w_planes));
206 const int nearest = oskar_imager_composite_nearest_even(
207 2 * (int)(max_conv_size / 2.0), 0, 0);
208 const int conv_size = MIN((int)(h->image_size * h->image_padding),nearest);
209 *conv_size_half = conv_size / 2 - 1;
210 const size_t kernel_plane_size = (*conv_size_half) * (*conv_size_half);
211
212 /* Get size of inner region of kernel. */
213 const int inner = conv_size / h->oversample;
214 const double l_max = sin(0.5 * h->fov_deg * M_PI/180.0);
215 sampling = (2.0 * l_max * h->oversample) / h->image_size;
216 sampling *= ((double) oskar_imager_plane_size(h)) / ((double) conv_size);
217
218 /* Generate 1D spheroidal tapering function to cover the inner region. */
219 const int prec = h->imager_prec;
220 taper = oskar_mem_create(prec, OSKAR_CPU, (size_t) inner, status);
221 taper_ptr = taper;
222 if (prec == OSKAR_DOUBLE)
223 {
224 double* t = (double*) oskar_mem_void(taper);
225 for (i = 0; i < inner; ++i)
226 {
227 const double nu = (i - (inner / 2)) / ((double)(inner / 2));
228 t[i] = oskar_grid_function_spheroidal(fabs(nu));
229 }
230 }
231 else
232 {
233 float* t = (float*) oskar_mem_void(taper);
234 for (i = 0; i < inner; ++i)
235 {
236 const double nu = (i - (inner / 2)) / ((double)(inner / 2));
237 t[i] = oskar_grid_function_spheroidal(fabs(nu));
238 }
239 }
240
241 /* Allocate space for the kernels. */
242 kernel_cube = oskar_mem_create(prec | OSKAR_COMPLEX, OSKAR_CPU,
243 ((size_t) num_w_planes) * kernel_plane_size, status);
244
245 /* Create scratch arrays and FFT plan for the phase screens. */
246 screen = oskar_mem_create(prec | OSKAR_COMPLEX,
247 OSKAR_CPU, conv_size * conv_size, status);
248 screen_ptr = screen;
249 const int fft_loc = (h->generate_w_kernels_on_gpu && h->num_gpus > 0) ?
250 h->dev_loc : OSKAR_CPU;
251 if (fft_loc != OSKAR_CPU)
252 {
253 oskar_device_set(h->dev_loc, h->gpu_ids[0], status);
254 screen_gpu = oskar_mem_create(prec | OSKAR_COMPLEX,
255 h->dev_loc, conv_size * conv_size, status);
256 taper_gpu = oskar_mem_create_copy(taper, h->dev_loc, status);
257 screen_ptr = screen_gpu;
258 taper_ptr = taper_gpu;
259 }
260 fft = oskar_fft_create(h->imager_prec, fft_loc,
261 2, conv_size, 0, status);
262 oskar_fft_set_ensure_consistent_norm(fft, 0);
263
264 /* Evaluate kernels. */
265 ptr_in = oskar_mem_char(screen);
266 const size_t element_size = 2 * oskar_mem_element_size(prec);
267 const size_t copy_len = (*conv_size_half) * element_size;
268 maxes = (double*) calloc(num_w_planes, sizeof(double));
269 for (i = 0; i < num_w_planes; ++i)
270 {
271 size_t iy = 0, in = 0, out = 0, offset = 0;
272
273 /* Generate the tapered phase screen. */
274 oskar_imager_generate_w_phase_screen(i, conv_size, inner,
275 sampling, w_scale, taper_ptr, screen_ptr, status);
276
277 /* Perform the FFT to get the kernel. No shifts are required. */
278 oskar_fft_exec(fft, screen_ptr, status);
279 if (screen_ptr != screen)
280 {
281 oskar_mem_copy(screen, screen_ptr, status);
282 }
283 if (*status) break;
284
285 /* Get the maximum (from the first element). */
286 if (prec == OSKAR_DOUBLE)
287 {
288 const double* t = (const double*) oskar_mem_void_const(screen);
289 maxes[i] = sqrt(t[0]*t[0] + t[1]*t[1]);
290 }
291 else
292 {
293 const float* t = (const float*) oskar_mem_void_const(screen);
294 maxes[i] = sqrt(t[0]*t[0] + t[1]*t[1]);
295 }
296
297 /* Save only the first quarter of the kernel; the rest is redundant. */
298 offset = kernel_plane_size * element_size * (size_t) i;
299 ptr_out = oskar_mem_char(kernel_cube) + offset;
300 for (iy = 0; iy < *conv_size_half; ++iy)
301 {
302 memcpy(ptr_out + out, ptr_in + in, copy_len);
303 in += element_size * (size_t) conv_size;
304 out += copy_len;
305 }
306 }
307 oskar_fft_free(fft);
308 oskar_mem_free(screen, status);
309 oskar_mem_free(screen_gpu, status);
310 oskar_mem_free(taper, status);
311 oskar_mem_free(taper_gpu, status);
312
313 /* Get scaling factor needed for normalisation. */
314 for (i = 0; i < num_w_planes; ++i) max_val = MAX(max_val, maxes[i]);
315 *norm_factor = 1.0 / max_val;
316 free(maxes);
317 return kernel_cube;
318 }
319
320
oskar_imager_evaluate_w_kernel_support_sizes(int num_w_planes,int oversample,size_t conv_size_half,const oskar_Mem * kernel_cube,double norm_factor,int * status)321 static oskar_Mem* oskar_imager_evaluate_w_kernel_support_sizes(
322 int num_w_planes, int oversample, size_t conv_size_half,
323 const oskar_Mem* kernel_cube, double norm_factor, int* status)
324 {
325 int *supp = 0, i = 0;
326 oskar_Mem* w_support = 0;
327 if (*status || !kernel_cube) return 0;
328 w_support = oskar_mem_create(OSKAR_INT, OSKAR_CPU, num_w_planes, status);
329 supp = oskar_mem_int(w_support, status);
330 const double threshold = 1e-3 / norm_factor;
331 const int prec = oskar_mem_precision(kernel_cube);
332 const int conv_size = ((int) conv_size_half + 1) * 2;
333 for (i = 0; i < num_w_planes; ++i)
334 {
335 int found = 0, j = 0;
336 if (*status) break;
337 const size_t start = conv_size_half * conv_size_half * (size_t) i;
338 if (prec == OSKAR_DOUBLE)
339 {
340 const double *RESTRICT p =
341 (const double*) oskar_mem_void_const(kernel_cube);
342 for (j = (int) conv_size_half - 1; j > 0; j--)
343 {
344 const size_t i1 = ((size_t) j * conv_size_half + start) << 1;
345 const size_t i2 = ((size_t) j + start) << 1;
346 const double v1 = sqrt(p[i1]*p[i1] + p[i1+1]*p[i1+1]);
347 const double v2 = sqrt(p[i2]*p[i2] + p[i2+1]*p[i2+1]);
348 if ((v1 > threshold) || (v2 > threshold))
349 {
350 found = 1;
351 break;
352 }
353 }
354 }
355 else
356 {
357 const float *RESTRICT p =
358 (const float*) oskar_mem_void_const(kernel_cube);
359 for (j = (int) conv_size_half - 1; j > 0; j--)
360 {
361 const size_t i1 = ((size_t) j * conv_size_half + start) << 1;
362 const size_t i2 = ((size_t) j + start) << 1;
363 const double v1 = sqrt(p[i1]*p[i1] + p[i1+1]*p[i1+1]);
364 const double v2 = sqrt(p[i2]*p[i2] + p[i2+1]*p[i2+1]);
365 if ((v1 > threshold) || (v2 > threshold))
366 {
367 found = 1;
368 break;
369 }
370 }
371 }
372 if (found)
373 {
374 supp[i] = 1 + (int)(0.5 + (double)j / (double)oversample);
375 if (supp[i] * oversample * 2 >= conv_size)
376 {
377 supp[i] = conv_size / 2 / oversample - 1;
378 }
379 }
380 }
381 return w_support;
382 }
383
384
oskar_imager_trim_and_save_kernel_cube(oskar_Imager * h,int num_w_planes,oskar_Mem * support,size_t * conv_size_half,oskar_Mem * kernel_cube,int * status)385 static void oskar_imager_trim_and_save_kernel_cube(oskar_Imager* h,
386 int num_w_planes, oskar_Mem* support, size_t* conv_size_half,
387 oskar_Mem* kernel_cube, int* status)
388 {
389 char* fname = 0;
390 int i = 0, max_val = -INT_MAX;
391 const int* supp = 0;
392 fitsfile* f = 0;
393 char *ttype[] = {"SUPPORT"};
394 char *tform[] = {"1J"}; /* 32-bit integer. */
395 char *tunit[] = {"\0"};
396 char extname[] = "W_KERNELS";
397 double cellsize_rad = 0.0, fov_rad = 0.0, w_scale = 0.0;
398 int grid_size = 0, image_size = 0, oversample = 0;
399 if (*status || !kernel_cube) return;
400 supp = oskar_mem_int_const(support, status);
401 for (i = 0; i < num_w_planes; ++i) max_val = MAX(max_val, supp[i]);
402 const int new_conv_size = 2 * (max_val + 2) * h->oversample;
403 const size_t new_conv_size_half = (size_t) new_conv_size / 2 - 2;
404 if (new_conv_size_half < *conv_size_half)
405 {
406 size_t iy = 0, in = 0, out = 0;
407 char *ptr = oskar_mem_char(kernel_cube);
408 const int prec = oskar_mem_precision(kernel_cube);
409 const size_t element_size = 2 * oskar_mem_element_size(prec);
410 const size_t copy_len = element_size * new_conv_size_half;
411 const size_t kernel_plane_size = (*conv_size_half) * (*conv_size_half);
412 for (i = 0; i < num_w_planes; ++i)
413 {
414 in = kernel_plane_size * element_size * (size_t) i;
415 for (iy = 0; iy < new_conv_size_half; ++iy)
416 {
417 /* Use memmove() rather than memcpy() to allow for overlap. */
418 memmove(ptr + out, ptr + in, copy_len);
419 in += (*conv_size_half) * element_size;
420 out += copy_len;
421 }
422 }
423 *conv_size_half = new_conv_size_half;
424 oskar_mem_realloc(kernel_cube,
425 ((size_t) num_w_planes) * new_conv_size_half *
426 new_conv_size_half, status);
427 }
428
429 /* Write kernel cube as primary and secondary HDU. */
430 fname = (char*) calloc(80, sizeof(char));
431 sprintf(fname, "oskar_w_kernel_cache_%04d.fits", num_w_planes);
432 oskar_mem_write_fits_cube(kernel_cube, fname,
433 (int) *conv_size_half, (int) *conv_size_half, num_w_planes, -1,
434 status);
435
436 /* Write relevant imaging parameters as primary header keywords. */
437 fits_open_file(&f, fname, READWRITE, status);
438 cellsize_rad = h->cellsize_rad;
439 fov_rad = h->fov_deg * M_PI / 180.0;
440 w_scale = h->w_scale;
441 grid_size = oskar_imager_plane_size(h);
442 image_size = h->image_size;
443 oversample = h->oversample;
444 fits_write_key(f, TINT, "OVERSAMP", &oversample,
445 "kernel oversample parameter", status);
446 fits_write_key(f, TINT, "GRIDSIZE", &grid_size,
447 "grid side length", status);
448 fits_write_key(f, TINT, "IMSIZE", &image_size,
449 "final image side length, in pixels", status);
450 fits_write_key(f, TDOUBLE, "FOV", &fov_rad,
451 "final image field of view, in radians", status);
452 fits_write_key(f, TDOUBLE, "CELLSIZE", &cellsize_rad,
453 "final image cell size, in radians", status);
454 fits_write_key(f, TDOUBLE, "W_SCALE", &w_scale,
455 "w_scale parameter", status);
456
457 /* Write kernel support sizes as a binary table extension. */
458 fits_create_tbl(f, BINARY_TBL, num_w_planes,
459 1, ttype, tform, tunit, extname, status);
460 fits_write_col(f, TINT, 1, 1, 1, num_w_planes,
461 oskar_mem_int(support, status), status);
462 fits_close_file(f, status);
463 free(fname);
464 }
465
466
oskar_imager_normalise_kernel_cube(const oskar_Mem * support,int oversample,size_t conv_size_half,oskar_Mem * kernel_cube,int * status)467 static void oskar_imager_normalise_kernel_cube(const oskar_Mem* support,
468 int oversample, size_t conv_size_half, oskar_Mem* kernel_cube,
469 int* status)
470 {
471 double sum = 0.0;
472 int ix = 0, iy = 0;
473 if (*status) return;
474 const int* supp = oskar_mem_int_const(support, status);
475
476 /* Normalise so that kernel 0 sums to 1,
477 * when jumping in steps of oversample. */
478 if (oskar_mem_precision(kernel_cube) == OSKAR_DOUBLE)
479 {
480 const double *RESTRICT p = oskar_mem_double_const(kernel_cube, status);
481 for (iy = -supp[0]; iy <= supp[0]; ++iy)
482 {
483 for (ix = -supp[0]; ix <= supp[0]; ++ix)
484 {
485 sum += p[2 * (abs(ix) * oversample +
486 conv_size_half * (abs(iy) * oversample))];
487 }
488 }
489 }
490 else
491 {
492 const float *RESTRICT p = oskar_mem_float_const(kernel_cube, status);
493 for (iy = -supp[0]; iy <= supp[0]; ++iy)
494 {
495 for (ix = -supp[0]; ix <= supp[0]; ++ix)
496 {
497 sum += p[2 * (abs(ix) * oversample +
498 conv_size_half * (abs(iy) * oversample))];
499 }
500 }
501 }
502 oskar_mem_scale_real(kernel_cube, 1.0 / sum,
503 0, oskar_mem_length(kernel_cube), status);
504 }
505
506
oskar_imager_rearrange_kernels(int num_w_planes,const oskar_Mem * support,int oversample,size_t conv_size_half,const oskar_Mem * kernels_in,oskar_Mem * kernels_out,int * rearranged_kernel_start,int * status)507 static void oskar_imager_rearrange_kernels(int num_w_planes,
508 const oskar_Mem* support, int oversample, size_t conv_size_half,
509 const oskar_Mem* kernels_in, oskar_Mem* kernels_out,
510 int* rearranged_kernel_start, int* status)
511 {
512 int w = 0, j = 0, k = 0, off_u = 0, off_v = 0;
513 size_t rearranged_size = 0;
514 float2* out_f = 0;
515 double2* out_d = 0;
516 if (*status) return;
517 const float2* in_f = (const float2*) oskar_mem_void_const(kernels_in);
518 const double2* in_d = (const double2*) oskar_mem_void_const(kernels_in);
519 const int* supp = oskar_mem_int_const(support, status);
520 const int oversample_h = oversample / 2;
521 const int prec = oskar_mem_precision(kernels_in);
522 const size_t height = oversample_h + 1;
523
524 /* Allocate enough memory for the rearranged kernels. */
525 for (w = 0; w < num_w_planes; w++)
526 {
527 const size_t conv_len = 2 * (size_t)(supp[w]) + 1;
528 const size_t width = (oversample_h * conv_len + 1) * conv_len;
529 rearranged_kernel_start[w] = (int) rearranged_size;
530 rearranged_size += (width * height);
531 }
532 oskar_mem_realloc(kernels_out, rearranged_size, status);
533 /* oskar_mem_set_value_real(kernels_out, 1e9, 0, 0, status); */
534 out_f = (float2*) oskar_mem_void(kernels_out);
535 out_d = (double2*) oskar_mem_void(kernels_out);
536
537 for (w = 0; w < num_w_planes; w++)
538 {
539 const int w_support = supp[w];
540 const size_t conv_len = 2 * (size_t)(supp[w]) + 1;
541 const size_t width = ((size_t)oversample_h * conv_len + 1) * conv_len;
542 const size_t c_in = conv_size_half * conv_size_half * (size_t)w;
543 const size_t c_out = (size_t)(rearranged_kernel_start[w]);
544
545 /* Within each kernel, off_u is slowest varying, so if the
546 * rearranged kernel is viewed as a very squashed image, the
547 * row index to use for off_u is given by width * abs(off_u).
548 *
549 * "Reverse" rows and rearrange elements in U dimension
550 * for reasonable cache utilisation.
551 * For idx_v = abs(off_v + j * oversample), the offset from the
552 * start of the row is given by width-1 - (idx_v * conv_len).
553 * Stride is +1 for positive or zero values of off_u;
554 * stride is -1 for negative values of off_u.
555 */
556 for (off_u = -oversample_h; off_u <= oversample_h; off_u++)
557 {
558 const size_t mid = c_out + (abs(off_u) + 1) * width - 1 - w_support;
559 const int stride = (off_u >= 0) ? 1 : -1;
560 for (off_v = -oversample_h; off_v <= oversample_h; off_v++)
561 {
562 for (j = 0; j <= w_support; j++)
563 {
564 const unsigned int idx_v = abs(off_v + j * oversample);
565 const size_t p = mid - (idx_v * conv_len);
566 for (k = -w_support; k <= w_support; k++)
567 {
568 const unsigned int idx_u = abs(off_u + k * oversample);
569 const size_t a = c_in + idx_v * conv_size_half + idx_u;
570 const size_t b = p + stride * k;
571 if (prec == OSKAR_SINGLE)
572 {
573 out_f[b] = in_f[a];
574 }
575 else
576 {
577 out_d[b] = in_d[a];
578 }
579 }
580 }
581 }
582 }
583 #if 0
584 if (w == 0)
585 {
586 oskar_Mem* a1 = oskar_mem_create_alias(kernels_out, c_out,
587 width * height, status);
588 oskar_Mem* a2 = oskar_mem_create_alias(kernels_in, c_in,
589 conv_size_half * conv_size_half, status);
590 oskar_mem_write_fits_cube(a1, "kernel_rearranged",
591 (int) width, (int) height, 1, 0, status);
592 oskar_mem_write_fits_cube(a2, "kernel_orig",
593 conv_size_half, conv_size_half, 1, 0, status);
594 oskar_mem_free(a1, status);
595 oskar_mem_free(a2, status);
596 printf("At w = %d, w_support = %d\n", w, w_support);
597 printf("Original size: %lu. Rearranged size: %lu\n",
598 (unsigned long) oskar_mem_length(kernels_in),
599 (unsigned long) oskar_mem_length(kernels_out));
600 }
601 #endif
602 }
603 #if 0
604 FILE* fhan = fopen("rearranged_kernels.txt", "w");
605 oskar_mem_save_ascii(fhan, 1, rearranged_size, status, kernels_out);
606 fclose(fhan);
607 #endif
608 }
609
610 #ifdef __cplusplus
611 }
612 #endif
613