1 /*
2  * Copyright (c) 2016-2021, The OSKAR Developers.
3  * See the LICENSE file at the top-level directory of this distribution.
4  */
5 
6 #include "imager/private_imager.h"
7 #include "imager/oskar_imager.h"
8 
9 #include "imager/oskar_grid_functions_spheroidal.h"
10 #include "imager/private_imager_composite_nearest_even.h"
11 #include "imager/private_imager_generate_w_phase_screen.h"
12 #include "imager/private_imager_init_wproj.h"
13 #include "math/oskar_cmath.h"
14 #include "math/oskar_fft.h"
15 #include "utility/oskar_device.h"
16 #include "utility/oskar_get_memory_usage.h"
17 
18 #include <stdlib.h>
19 #include <string.h>
20 #include <stdio.h>
21 
22 #ifdef __cplusplus
23 extern "C" {
24 #endif
25 
26 #define MIN(a,b) ((a) < (b) ? (a) : (b))
27 #define MAX(a,b) ((a) > (b) ? (a) : (b))
28 
29 #include <fitsio.h>
30 
31 static void oskar_imager_evaluate_w_kernel_params(const oskar_Imager* h,
32         int* num_w_planes, double* w_scale);
33 
34 static oskar_Mem* oskar_imager_evaluate_w_kernel_cube(oskar_Imager* h,
35         int num_w_planes, double w_scale,
36         size_t* conv_size_half, double* norm_factor, int* status);
37 
38 static oskar_Mem* oskar_imager_evaluate_w_kernel_support_sizes(
39         int num_w_planes, int oversample, size_t conv_size_half,
40         const oskar_Mem* kernel_cube, double norm_factor, int* status);
41 
42 static void oskar_imager_normalise_kernel_cube(const oskar_Mem* support,
43         int oversample, size_t conv_size_half, oskar_Mem* kernel_cube,
44         int* status);
45 
46 static void oskar_imager_trim_and_save_kernel_cube(oskar_Imager* h,
47         int num_planes, oskar_Mem* support, size_t* conv_size_half,
48         oskar_Mem* kernel_cube, int* status);
49 
50 static void oskar_imager_rearrange_kernels(int num_w_planes,
51         const oskar_Mem* support, int oversample, size_t conv_size_half,
52         const oskar_Mem* kernels_in, oskar_Mem* kernels_out,
53         int* rearranged_kernel_start, int* status);
54 
55 /*
56  * W-kernel generation is based on CASA implementation
57  * in code/synthesis/TransformMachines/WPConvFunc.cc
58  */
oskar_imager_init_wproj(oskar_Imager * h,int * status)59 void oskar_imager_init_wproj(oskar_Imager* h, int* status)
60 {
61     size_t conv_size_half = 0;
62     double norm_factor = 1.;
63     oskar_Mem *kernel_cube = 0;
64     const int save_kernels = 0;
65     if (*status) return;
66 
67     /* Evaluate number of w-projection planes, and w-scale. */
68     oskar_imager_evaluate_w_kernel_params(h, &h->num_w_planes, &h->w_scale);
69 
70     /* Evaluate unnormalised kernels. */
71     kernel_cube = oskar_imager_evaluate_w_kernel_cube(h, h->num_w_planes,
72             h->w_scale, &conv_size_half, &norm_factor, status);
73 
74     /* Evaluate the support size of each kernel. */
75     oskar_mem_free(h->w_support, status);
76     h->w_support = oskar_imager_evaluate_w_kernel_support_sizes(
77             h->num_w_planes, h->oversample, conv_size_half,
78             kernel_cube, norm_factor, status);
79 
80 #if 0
81     /* Print kernel support sizes. */
82     {
83         int i = 0;
84         for (i = 0; i < h->num_w_planes; ++i)
85         {
86             const int* supp = oskar_mem_int_const(h->w_support, status);
87             printf("Plane %d, support: %d\n", i, supp[i]);
88         }
89     }
90 #endif
91 
92     /* Normalise the kernel cube. */
93     oskar_imager_normalise_kernel_cube(h->w_support, h->oversample,
94             conv_size_half, kernel_cube, status);
95     if (save_kernels)
96     {
97         oskar_imager_trim_and_save_kernel_cube(h, h->num_w_planes,
98                 h->w_support, &conv_size_half, kernel_cube, status);
99     }
100 
101     /* Rearrange and compact the kernels. */
102     oskar_mem_free(h->w_kernels_compact, status);
103     oskar_mem_free(h->w_kernel_start, status);
104     h->w_kernel_start = oskar_mem_create(OSKAR_INT, OSKAR_CPU,
105             h->num_w_planes, status);
106     h->w_kernels_compact = oskar_mem_create(h->imager_prec| OSKAR_COMPLEX,
107             OSKAR_CPU, 0, status);
108     oskar_imager_rearrange_kernels(h->num_w_planes, h->w_support,
109             h->oversample, conv_size_half, kernel_cube, h->w_kernels_compact,
110             oskar_mem_int(h->w_kernel_start, status), status);
111     oskar_mem_free(kernel_cube, status);
112 
113     /* Record data about the kernels. */
114     oskar_log_message(h->log, 'M', 0, "Baseline W values (wavelengths)");
115     oskar_log_message(h->log, 'M', 1, "Min: %.12e", h->ww_min);
116     oskar_log_message(h->log, 'M', 1, "Max: %.12e", h->ww_max);
117     oskar_log_message(h->log, 'M', 1, "RMS: %.12e", h->ww_rms);
118     oskar_log_message(h->log, 'M', 0,
119             "Using %d W-projection planes.", h->num_w_planes);
120     oskar_log_message(h->log, 'M', 0,
121             "Convolution kernel support range %d to %d.",
122             oskar_mem_int(h->w_support, status)[0],
123             oskar_mem_int(h->w_support, status)[h->num_w_planes - 1]);
124     const size_t compacted_len = oskar_mem_length(h->w_kernels_compact) *
125             oskar_mem_element_size(h->imager_prec | OSKAR_COMPLEX);
126     oskar_log_message(h->log, 'M', 0, "Convolution kernels use %.1f MB.",
127             compacted_len * 1e-6);
128 
129     /* Copy to device memory if required. */
130     if (h->grid_on_gpu && h->num_gpus > 0)
131     {
132         int i = 0;
133         if (h->num_devices < h->num_gpus)
134         {
135             oskar_imager_set_num_devices(h, h->num_gpus);
136         }
137         for (i = 0; i < h->num_gpus; ++i)
138         {
139             DeviceData* d = &h->d[i];
140             oskar_device_set(h->dev_loc, h->gpu_ids[i], status);
141             if (*status) break;
142             oskar_mem_free(d->w_kernels_compact, status);
143             oskar_mem_free(d->w_kernel_start, status);
144             oskar_mem_free(d->w_support, status);
145             d->w_kernels_compact = oskar_mem_create_copy(
146                     h->w_kernels_compact, h->dev_loc, status);
147             d->w_kernel_start = oskar_mem_create_copy(
148                     h->w_kernel_start, h->dev_loc, status);
149             d->w_support = oskar_mem_create_copy(
150                     h->w_support, h->dev_loc, status);
151         }
152 
153         /* No longer need kernels in host memory. */
154         oskar_mem_free(h->w_kernels_compact, status);
155         h->w_kernels_compact = 0;
156     }
157 }
158 
159 
oskar_imager_evaluate_w_kernel_params(const oskar_Imager * h,int * num_w_planes,double * w_scale)160 static void oskar_imager_evaluate_w_kernel_params(const oskar_Imager* h,
161         int* num_w_planes, double* w_scale)
162 {
163     double max_uvw = 0.0;
164     if (h->ww_max > 0.0)
165     {
166         const double ww_mid = 0.5 * (h->ww_min + h->ww_max);
167         max_uvw = 1.05 * h->ww_max;
168         if (h->ww_rms > ww_mid)
169         {
170             max_uvw *= h->ww_rms / ww_mid;
171         }
172     }
173     else
174     {
175         max_uvw = 0.25 / fabs(h->cellsize_rad);
176     }
177     if (*num_w_planes < 1)
178     {
179         *num_w_planes = (int)(max_uvw *
180                 fabs(sin(h->cellsize_rad * h->image_size / 2.0)));
181     }
182     if (*num_w_planes < 16) *num_w_planes = 16;
183     *w_scale = pow(*num_w_planes - 1, 2.0) / max_uvw;
184 }
185 
186 
oskar_imager_evaluate_w_kernel_cube(oskar_Imager * h,int num_w_planes,double w_scale,size_t * conv_size_half,double * norm_factor,int * status)187 static oskar_Mem* oskar_imager_evaluate_w_kernel_cube(oskar_Imager* h,
188         int num_w_planes, double w_scale,
189         size_t* conv_size_half, double* norm_factor, int* status)
190 {
191     size_t max_mem_bytes = 0;
192     oskar_FFT* fft = 0;
193     oskar_Mem *screen = 0, *screen_gpu = 0, *screen_ptr = 0;
194     oskar_Mem *taper = 0, *taper_gpu = 0, *taper_ptr = 0;
195     oskar_Mem *kernel_cube = 0;
196     char *ptr_out = 0, *ptr_in = 0;
197     double *maxes = 0, max_val = -INT_MAX, sampling = 0.0;
198     int i = 0;
199     if (*status) return 0;
200 
201     /* Calculate convolution kernel size. */
202     const size_t max_bytes_per_plane = 64 * 1024 * 1024; /* 64 MB/plane */
203     max_mem_bytes = oskar_get_total_physical_memory();
204     max_mem_bytes = MIN(max_mem_bytes, max_bytes_per_plane * num_w_planes);
205     const double max_conv_size = sqrt(max_mem_bytes / (16. * num_w_planes));
206     const int nearest = oskar_imager_composite_nearest_even(
207             2 * (int)(max_conv_size / 2.0), 0, 0);
208     const int conv_size = MIN((int)(h->image_size * h->image_padding),nearest);
209     *conv_size_half = conv_size / 2 - 1;
210     const size_t kernel_plane_size = (*conv_size_half) * (*conv_size_half);
211 
212     /* Get size of inner region of kernel. */
213     const int inner = conv_size / h->oversample;
214     const double l_max = sin(0.5 * h->fov_deg * M_PI/180.0);
215     sampling = (2.0 * l_max * h->oversample) / h->image_size;
216     sampling *= ((double) oskar_imager_plane_size(h)) / ((double) conv_size);
217 
218     /* Generate 1D spheroidal tapering function to cover the inner region. */
219     const int prec = h->imager_prec;
220     taper = oskar_mem_create(prec, OSKAR_CPU, (size_t) inner, status);
221     taper_ptr = taper;
222     if (prec == OSKAR_DOUBLE)
223     {
224         double* t = (double*) oskar_mem_void(taper);
225         for (i = 0; i < inner; ++i)
226         {
227             const double nu = (i - (inner / 2)) / ((double)(inner / 2));
228             t[i] = oskar_grid_function_spheroidal(fabs(nu));
229         }
230     }
231     else
232     {
233         float* t = (float*) oskar_mem_void(taper);
234         for (i = 0; i < inner; ++i)
235         {
236             const double nu = (i - (inner / 2)) / ((double)(inner / 2));
237             t[i] = oskar_grid_function_spheroidal(fabs(nu));
238         }
239     }
240 
241     /* Allocate space for the kernels. */
242     kernel_cube = oskar_mem_create(prec | OSKAR_COMPLEX, OSKAR_CPU,
243             ((size_t) num_w_planes) * kernel_plane_size, status);
244 
245     /* Create scratch arrays and FFT plan for the phase screens. */
246     screen = oskar_mem_create(prec | OSKAR_COMPLEX,
247             OSKAR_CPU, conv_size * conv_size, status);
248     screen_ptr = screen;
249     const int fft_loc = (h->generate_w_kernels_on_gpu && h->num_gpus > 0) ?
250             h->dev_loc : OSKAR_CPU;
251     if (fft_loc != OSKAR_CPU)
252     {
253         oskar_device_set(h->dev_loc, h->gpu_ids[0], status);
254         screen_gpu = oskar_mem_create(prec | OSKAR_COMPLEX,
255                 h->dev_loc, conv_size * conv_size, status);
256         taper_gpu = oskar_mem_create_copy(taper, h->dev_loc, status);
257         screen_ptr = screen_gpu;
258         taper_ptr = taper_gpu;
259     }
260     fft = oskar_fft_create(h->imager_prec, fft_loc,
261             2, conv_size, 0, status);
262     oskar_fft_set_ensure_consistent_norm(fft, 0);
263 
264     /* Evaluate kernels. */
265     ptr_in = oskar_mem_char(screen);
266     const size_t element_size = 2 * oskar_mem_element_size(prec);
267     const size_t copy_len = (*conv_size_half) * element_size;
268     maxes = (double*) calloc(num_w_planes, sizeof(double));
269     for (i = 0; i < num_w_planes; ++i)
270     {
271         size_t iy = 0, in = 0, out = 0, offset = 0;
272 
273         /* Generate the tapered phase screen. */
274         oskar_imager_generate_w_phase_screen(i, conv_size, inner,
275                 sampling, w_scale, taper_ptr, screen_ptr, status);
276 
277         /* Perform the FFT to get the kernel. No shifts are required. */
278         oskar_fft_exec(fft, screen_ptr, status);
279         if (screen_ptr != screen)
280         {
281             oskar_mem_copy(screen, screen_ptr, status);
282         }
283         if (*status) break;
284 
285         /* Get the maximum (from the first element). */
286         if (prec == OSKAR_DOUBLE)
287         {
288             const double* t = (const double*) oskar_mem_void_const(screen);
289             maxes[i] = sqrt(t[0]*t[0] + t[1]*t[1]);
290         }
291         else
292         {
293             const float* t = (const float*) oskar_mem_void_const(screen);
294             maxes[i] = sqrt(t[0]*t[0] + t[1]*t[1]);
295         }
296 
297         /* Save only the first quarter of the kernel; the rest is redundant. */
298         offset = kernel_plane_size * element_size * (size_t) i;
299         ptr_out = oskar_mem_char(kernel_cube) + offset;
300         for (iy = 0; iy < *conv_size_half; ++iy)
301         {
302             memcpy(ptr_out + out, ptr_in + in, copy_len);
303             in += element_size * (size_t) conv_size;
304             out += copy_len;
305         }
306     }
307     oskar_fft_free(fft);
308     oskar_mem_free(screen, status);
309     oskar_mem_free(screen_gpu, status);
310     oskar_mem_free(taper, status);
311     oskar_mem_free(taper_gpu, status);
312 
313     /* Get scaling factor needed for normalisation. */
314     for (i = 0; i < num_w_planes; ++i) max_val = MAX(max_val, maxes[i]);
315     *norm_factor = 1.0 / max_val;
316     free(maxes);
317     return kernel_cube;
318 }
319 
320 
oskar_imager_evaluate_w_kernel_support_sizes(int num_w_planes,int oversample,size_t conv_size_half,const oskar_Mem * kernel_cube,double norm_factor,int * status)321 static oskar_Mem* oskar_imager_evaluate_w_kernel_support_sizes(
322         int num_w_planes, int oversample, size_t conv_size_half,
323         const oskar_Mem* kernel_cube, double norm_factor, int* status)
324 {
325     int *supp = 0, i = 0;
326     oskar_Mem* w_support = 0;
327     if (*status || !kernel_cube) return 0;
328     w_support = oskar_mem_create(OSKAR_INT, OSKAR_CPU, num_w_planes, status);
329     supp = oskar_mem_int(w_support, status);
330     const double threshold = 1e-3 / norm_factor;
331     const int prec = oskar_mem_precision(kernel_cube);
332     const int conv_size = ((int) conv_size_half + 1) * 2;
333     for (i = 0; i < num_w_planes; ++i)
334     {
335         int found = 0, j = 0;
336         if (*status) break;
337         const size_t start = conv_size_half * conv_size_half * (size_t) i;
338         if (prec == OSKAR_DOUBLE)
339         {
340             const double *RESTRICT p =
341                     (const double*) oskar_mem_void_const(kernel_cube);
342             for (j = (int) conv_size_half - 1; j > 0; j--)
343             {
344                 const size_t i1 = ((size_t) j * conv_size_half + start) << 1;
345                 const size_t i2 = ((size_t) j + start) << 1;
346                 const double v1 = sqrt(p[i1]*p[i1] + p[i1+1]*p[i1+1]);
347                 const double v2 = sqrt(p[i2]*p[i2] + p[i2+1]*p[i2+1]);
348                 if ((v1 > threshold) || (v2 > threshold))
349                 {
350                     found = 1;
351                     break;
352                 }
353             }
354         }
355         else
356         {
357             const float *RESTRICT p =
358                     (const float*) oskar_mem_void_const(kernel_cube);
359             for (j = (int) conv_size_half - 1; j > 0; j--)
360             {
361                 const size_t i1 = ((size_t) j * conv_size_half + start) << 1;
362                 const size_t i2 = ((size_t) j + start) << 1;
363                 const double v1 = sqrt(p[i1]*p[i1] + p[i1+1]*p[i1+1]);
364                 const double v2 = sqrt(p[i2]*p[i2] + p[i2+1]*p[i2+1]);
365                 if ((v1 > threshold) || (v2 > threshold))
366                 {
367                     found = 1;
368                     break;
369                 }
370             }
371         }
372         if (found)
373         {
374             supp[i] = 1 + (int)(0.5 + (double)j / (double)oversample);
375             if (supp[i] * oversample * 2 >= conv_size)
376             {
377                 supp[i] = conv_size / 2 / oversample - 1;
378             }
379         }
380     }
381     return w_support;
382 }
383 
384 
oskar_imager_trim_and_save_kernel_cube(oskar_Imager * h,int num_w_planes,oskar_Mem * support,size_t * conv_size_half,oskar_Mem * kernel_cube,int * status)385 static void oskar_imager_trim_and_save_kernel_cube(oskar_Imager* h,
386         int num_w_planes, oskar_Mem* support, size_t* conv_size_half,
387         oskar_Mem* kernel_cube, int* status)
388 {
389     char* fname = 0;
390     int i = 0, max_val = -INT_MAX;
391     const int* supp = 0;
392     fitsfile* f = 0;
393     char *ttype[] = {"SUPPORT"};
394     char *tform[] = {"1J"}; /* 32-bit integer. */
395     char *tunit[] = {"\0"};
396     char extname[] = "W_KERNELS";
397     double cellsize_rad = 0.0, fov_rad = 0.0, w_scale = 0.0;
398     int grid_size = 0, image_size = 0, oversample = 0;
399     if (*status || !kernel_cube) return;
400     supp = oskar_mem_int_const(support, status);
401     for (i = 0; i < num_w_planes; ++i) max_val = MAX(max_val, supp[i]);
402     const int new_conv_size = 2 * (max_val + 2) * h->oversample;
403     const size_t new_conv_size_half = (size_t) new_conv_size / 2 - 2;
404     if (new_conv_size_half < *conv_size_half)
405     {
406         size_t iy = 0, in = 0, out = 0;
407         char *ptr = oskar_mem_char(kernel_cube);
408         const int prec = oskar_mem_precision(kernel_cube);
409         const size_t element_size = 2 * oskar_mem_element_size(prec);
410         const size_t copy_len = element_size * new_conv_size_half;
411         const size_t kernel_plane_size = (*conv_size_half) * (*conv_size_half);
412         for (i = 0; i < num_w_planes; ++i)
413         {
414             in = kernel_plane_size * element_size * (size_t) i;
415             for (iy = 0; iy < new_conv_size_half; ++iy)
416             {
417                 /* Use memmove() rather than memcpy() to allow for overlap. */
418                 memmove(ptr + out, ptr + in, copy_len);
419                 in += (*conv_size_half) * element_size;
420                 out += copy_len;
421             }
422         }
423         *conv_size_half = new_conv_size_half;
424         oskar_mem_realloc(kernel_cube,
425                 ((size_t) num_w_planes) * new_conv_size_half *
426                 new_conv_size_half, status);
427     }
428 
429     /* Write kernel cube as primary and secondary HDU. */
430     fname = (char*) calloc(80, sizeof(char));
431     sprintf(fname, "oskar_w_kernel_cache_%04d.fits", num_w_planes);
432     oskar_mem_write_fits_cube(kernel_cube, fname,
433             (int) *conv_size_half, (int) *conv_size_half, num_w_planes, -1,
434             status);
435 
436     /* Write relevant imaging parameters as primary header keywords. */
437     fits_open_file(&f, fname, READWRITE, status);
438     cellsize_rad = h->cellsize_rad;
439     fov_rad = h->fov_deg * M_PI / 180.0;
440     w_scale = h->w_scale;
441     grid_size = oskar_imager_plane_size(h);
442     image_size = h->image_size;
443     oversample = h->oversample;
444     fits_write_key(f, TINT, "OVERSAMP", &oversample,
445             "kernel oversample parameter", status);
446     fits_write_key(f, TINT, "GRIDSIZE", &grid_size,
447             "grid side length", status);
448     fits_write_key(f, TINT, "IMSIZE", &image_size,
449             "final image side length, in pixels", status);
450     fits_write_key(f, TDOUBLE, "FOV", &fov_rad,
451             "final image field of view, in radians", status);
452     fits_write_key(f, TDOUBLE, "CELLSIZE", &cellsize_rad,
453             "final image cell size, in radians", status);
454     fits_write_key(f, TDOUBLE, "W_SCALE", &w_scale,
455             "w_scale parameter", status);
456 
457     /* Write kernel support sizes as a binary table extension. */
458     fits_create_tbl(f, BINARY_TBL, num_w_planes,
459             1, ttype, tform, tunit, extname, status);
460     fits_write_col(f, TINT, 1, 1, 1, num_w_planes,
461             oskar_mem_int(support, status), status);
462     fits_close_file(f, status);
463     free(fname);
464 }
465 
466 
oskar_imager_normalise_kernel_cube(const oskar_Mem * support,int oversample,size_t conv_size_half,oskar_Mem * kernel_cube,int * status)467 static void oskar_imager_normalise_kernel_cube(const oskar_Mem* support,
468         int oversample, size_t conv_size_half, oskar_Mem* kernel_cube,
469         int* status)
470 {
471     double sum = 0.0;
472     int ix = 0, iy = 0;
473     if (*status) return;
474     const int* supp = oskar_mem_int_const(support, status);
475 
476     /* Normalise so that kernel 0 sums to 1,
477      * when jumping in steps of oversample. */
478     if (oskar_mem_precision(kernel_cube) == OSKAR_DOUBLE)
479     {
480         const double *RESTRICT p = oskar_mem_double_const(kernel_cube, status);
481         for (iy = -supp[0]; iy <= supp[0]; ++iy)
482         {
483             for (ix = -supp[0]; ix <= supp[0]; ++ix)
484             {
485                 sum += p[2 * (abs(ix) * oversample +
486                         conv_size_half * (abs(iy) * oversample))];
487             }
488         }
489     }
490     else
491     {
492         const float *RESTRICT p = oskar_mem_float_const(kernel_cube, status);
493         for (iy = -supp[0]; iy <= supp[0]; ++iy)
494         {
495             for (ix = -supp[0]; ix <= supp[0]; ++ix)
496             {
497                 sum += p[2 * (abs(ix) * oversample +
498                         conv_size_half * (abs(iy) * oversample))];
499             }
500         }
501     }
502     oskar_mem_scale_real(kernel_cube, 1.0 / sum,
503             0, oskar_mem_length(kernel_cube), status);
504 }
505 
506 
oskar_imager_rearrange_kernels(int num_w_planes,const oskar_Mem * support,int oversample,size_t conv_size_half,const oskar_Mem * kernels_in,oskar_Mem * kernels_out,int * rearranged_kernel_start,int * status)507 static void oskar_imager_rearrange_kernels(int num_w_planes,
508         const oskar_Mem* support, int oversample, size_t conv_size_half,
509         const oskar_Mem* kernels_in, oskar_Mem* kernels_out,
510         int* rearranged_kernel_start, int* status)
511 {
512     int w = 0, j = 0, k = 0, off_u = 0, off_v = 0;
513     size_t rearranged_size = 0;
514     float2*  out_f = 0;
515     double2* out_d = 0;
516     if (*status) return;
517     const float2*  in_f = (const float2*)  oskar_mem_void_const(kernels_in);
518     const double2* in_d = (const double2*) oskar_mem_void_const(kernels_in);
519     const int* supp = oskar_mem_int_const(support, status);
520     const int oversample_h = oversample / 2;
521     const int prec = oskar_mem_precision(kernels_in);
522     const size_t height = oversample_h + 1;
523 
524     /* Allocate enough memory for the rearranged kernels. */
525     for (w = 0; w < num_w_planes; w++)
526     {
527         const size_t conv_len = 2 * (size_t)(supp[w]) + 1;
528         const size_t width = (oversample_h * conv_len + 1) * conv_len;
529         rearranged_kernel_start[w] = (int) rearranged_size;
530         rearranged_size += (width * height);
531     }
532     oskar_mem_realloc(kernels_out, rearranged_size, status);
533     /* oskar_mem_set_value_real(kernels_out, 1e9, 0, 0, status); */
534     out_f = (float2*)  oskar_mem_void(kernels_out);
535     out_d = (double2*) oskar_mem_void(kernels_out);
536 
537     for (w = 0; w < num_w_planes; w++)
538     {
539         const int w_support = supp[w];
540         const size_t conv_len = 2 * (size_t)(supp[w]) + 1;
541         const size_t width = ((size_t)oversample_h * conv_len + 1) * conv_len;
542         const size_t c_in = conv_size_half * conv_size_half * (size_t)w;
543         const size_t c_out = (size_t)(rearranged_kernel_start[w]);
544 
545         /* Within each kernel, off_u is slowest varying, so if the
546          * rearranged kernel is viewed as a very squashed image, the
547          * row index to use for off_u is given by width * abs(off_u).
548          *
549          * "Reverse" rows and rearrange elements in U dimension
550          * for reasonable cache utilisation.
551          * For idx_v = abs(off_v + j * oversample), the offset from the
552          * start of the row is given by width-1 - (idx_v * conv_len).
553          * Stride is +1 for positive or zero values of off_u;
554          * stride is -1 for negative values of off_u.
555          */
556         for (off_u = -oversample_h; off_u <= oversample_h; off_u++)
557         {
558             const size_t mid = c_out + (abs(off_u) + 1) * width - 1 - w_support;
559             const int stride = (off_u >= 0) ? 1 : -1;
560             for (off_v = -oversample_h; off_v <= oversample_h; off_v++)
561             {
562                 for (j = 0; j <= w_support; j++)
563                 {
564                     const unsigned int idx_v = abs(off_v + j * oversample);
565                     const size_t p = mid - (idx_v * conv_len);
566                     for (k = -w_support; k <= w_support; k++)
567                     {
568                         const unsigned int idx_u = abs(off_u + k * oversample);
569                         const size_t a = c_in + idx_v * conv_size_half + idx_u;
570                         const size_t b = p + stride * k;
571                         if (prec == OSKAR_SINGLE)
572                         {
573                             out_f[b] = in_f[a];
574                         }
575                         else
576                         {
577                             out_d[b] = in_d[a];
578                         }
579                     }
580                 }
581             }
582         }
583 #if 0
584         if (w == 0)
585         {
586             oskar_Mem* a1 = oskar_mem_create_alias(kernels_out, c_out,
587                     width * height, status);
588             oskar_Mem* a2 = oskar_mem_create_alias(kernels_in, c_in,
589                     conv_size_half * conv_size_half, status);
590             oskar_mem_write_fits_cube(a1, "kernel_rearranged",
591                     (int) width, (int) height, 1, 0, status);
592             oskar_mem_write_fits_cube(a2, "kernel_orig",
593                     conv_size_half, conv_size_half, 1, 0, status);
594             oskar_mem_free(a1, status);
595             oskar_mem_free(a2, status);
596             printf("At w = %d, w_support = %d\n", w, w_support);
597             printf("Original size: %lu. Rearranged size: %lu\n",
598                     (unsigned long) oskar_mem_length(kernels_in),
599                     (unsigned long) oskar_mem_length(kernels_out));
600         }
601 #endif
602     }
603 #if 0
604     FILE* fhan = fopen("rearranged_kernels.txt", "w");
605     oskar_mem_save_ascii(fhan, 1, rearranged_size, status, kernels_out);
606     fclose(fhan);
607 #endif
608 }
609 
610 #ifdef __cplusplus
611 }
612 #endif
613