/*
 * Copyright (c) 2012-2021, The OSKAR Developers.
 * See the LICENSE file at the top-level directory of this distribution.
 */

#include "beam_pattern/oskar_beam_pattern.h"
#include "beam_pattern/private_beam_pattern.h"
#include "convert/oskar_convert_mjd_to_gast_fast.h"
#include "correlate/oskar_evaluate_auto_power.h"
#include "correlate/oskar_evaluate_cross_power.h"
#include "math/oskar_cmath.h"
#include "math/private_cond2_2x2.h"
#include "utility/oskar_device.h"
#include "utility/oskar_file_exists.h"
#include "utility/oskar_get_error_string.h"
#include "utility/oskar_get_memory_usage.h"
#include "oskar_version.h"

#include <stdlib.h>
#include <string.h>

#include <fitsio.h>

#ifdef _OPENMP
#include <omp.h>
#endif

#ifdef __cplusplus
extern "C" {
#endif

static void* run_blocks(void* arg);
static void sim_chunks(oskar_BeamPattern* h, int i_chunk_start, int i_time,
        int i_channel, int i_active, int device_id, int* status);
static void write_chunks(oskar_BeamPattern* h, int i_chunk_start, int i_time,
        int i_channel, int i_active, int* status);
static void write_pixels(oskar_BeamPattern* h, int i_chunk, int i_time,
        int i_channel, int num_pix, int channel_average, int time_average,
        const oskar_Mem* in, int chunk_desc, int stokes_in, int* status);
static void complex_to_amp(const oskar_Mem* complex_in, const int offset,
        const int stride, const int num_points, oskar_Mem* output, int* status);
static void complex_to_phase(const oskar_Mem* complex_in, const int offset,
        const int stride, const int num_points, oskar_Mem* output, int* status);
static void complex_to_real(const oskar_Mem* complex_in, const int offset,
        const int stride, const int num_points, oskar_Mem* output, int* status);
static void complex_to_imag(const oskar_Mem* complex_in, const int offset,
        const int stride, const int num_points, oskar_Mem* output, int* status);
static void jones_to_ixr(const oskar_Mem* complex_in, const int offset,
        const int num_points, oskar_Mem* output, int* status);
static void oskar_convert_linear_to_stokes(const int num_points,
        const int offset_in, const oskar_Mem* linear, const int stokes_index,
        oskar_Mem* output, int* status);
static void record_timing(oskar_BeamPattern* h);
static unsigned int disp_width(unsigned int value);


struct ThreadArgs
{
    oskar_BeamPattern* h;
    int num_threads, thread_id;
};
typedef struct ThreadArgs ThreadArgs;
void oskar_beam_pattern_run(oskar_BeamPattern* h, int* status)
{
    int i = 0;
    oskar_Timer* tmr = 0;
    oskar_Thread** threads = 0;
    ThreadArgs* args = 0;
    if (*status || !h) return;

    /* Check root name exists. */
    if (!h->root_path)
    {
        oskar_log_error(h->log, "No output file name specified.");
        *status = OSKAR_ERR_FILE_IO;
        return;
    }

    /* Initialise if required. */
    oskar_beam_pattern_check_init(h, status);
    tmr = oskar_timer_create(OSKAR_TIMER_NATIVE);
    oskar_timer_resume(tmr);

    /* Set up worker threads. */
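    /* One thread per compute device, plus thread 0, which only writes files. */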
    const int num_threads = h->num_devices + 1;
    oskar_barrier_set_num_threads(h->barrier, num_threads);
    threads = (oskar_Thread**) calloc(num_threads, sizeof(oskar_Thread*));
    args = (ThreadArgs*) calloc(num_threads, sizeof(ThreadArgs));
    for (i = 0; i < num_threads; ++i)
    {
        args[i].h = h;
        args[i].num_threads = num_threads;
        args[i].thread_id = i;
    }

    /* Record memory usage. */
    if (!*status)
    {
#if defined(OSKAR_HAVE_CUDA) || defined(OSKAR_HAVE_OPENCL)
        oskar_log_section(h->log, 'M', "Initial memory usage");
        for (i = 0; i < h->num_gpus; ++i)
        {
            oskar_device_log_mem(h->dev_loc, 0, h->gpu_ids[i], h->log);
        }
#endif
        oskar_log_section(h->log, 'M', "Starting simulation...");
    }

    /* Set status code. */
    h->status = *status;

    /* Start simulation timer. */
    oskar_timer_start(h->tmr_sim);

    /* Start the worker threads. */
    for (i = 0; i < num_threads; ++i)
    {
        threads[i] = oskar_thread_create(run_blocks, (void*)&args[i], 0);
    }

    /* Wait for worker threads to finish. */
    for (i = 0; i < num_threads; ++i)
    {
        oskar_thread_join(threads[i]);
        oskar_thread_free(threads[i]);
    }
    free(threads);
    free(args);

    /* Get status code. */
    *status = h->status;

    /* Record memory usage. */
    if (!*status)
    {
#if defined(OSKAR_HAVE_CUDA) || defined(OSKAR_HAVE_OPENCL)
        oskar_log_section(h->log, 'M', "Final memory usage");
        for (i = 0; i < h->num_gpus; ++i)
        {
            oskar_device_log_mem(h->dev_loc, 0, h->gpu_ids[i], h->log);
        }
#endif
        /* Record time taken. */
        oskar_log_set_value_width(h->log, 25);
        record_timing(h);
    }

    /* Finalise. */
    oskar_beam_pattern_reset_cache(h, status);

    /* Check for errors. */
    if (!*status)
    {
        oskar_log_message(h->log, 'M', 0, "Run completed in %.3f sec.",
                oskar_timer_elapsed(tmr));
    }
    else
    {
        oskar_log_error(h->log, "Run failed with code %i: %s.", *status,
                oskar_get_error_string(*status));
    }
    oskar_timer_free(tmr);

    /* Close the log. */
    oskar_log_close(h->log);
}


/* Private methods. */

static void* run_blocks(void* arg)
{
    oskar_BeamPattern* h = 0;
    int i_inner = 0, i_outer = 0, num_inner = 0, num_outer = 0;
    int c = 0, t = 0, f = 0, *status = 0;

    /* Loop indices for previous iteration (accessed only by thread 0). */
    int cp = 0, tp = 0, fp = 0;

    /* Get thread function arguments. */
    h = ((ThreadArgs*)arg)->h;
    status = &(h->status);
    const int num_threads = ((ThreadArgs*)arg)->num_threads;
    const int thread_id = ((ThreadArgs*)arg)->thread_id;
    const int device_id = thread_id - 1;

#ifdef _OPENMP
    /* Disable any nested parallelism. */
    omp_set_nested(0);
    omp_set_num_threads(1);
#endif

    if (device_id >= 0 && device_id < h->num_gpus)
    {
        oskar_device_set(h->dev_loc, h->gpu_ids[device_id], status);
    }

    /* Set ranges of inner and outer loops based on averaging mode. */
    if (h->average_single_axis != 'T')
    {
        num_outer = h->num_time_steps;
        num_inner = h->num_channels; /* Channel on inner loop. */
    }
    else
    {
        num_outer = h->num_channels;
        num_inner = h->num_time_steps; /* Time on inner loop. */
    }

    /* Loop over image pixel chunks, running simulation and file writing one
     * chunk at a time. Simulation and file output are overlapped by using
     * double buffering, and a dedicated thread is used for file output.
     *
     * Thread 0 is used for file writes.
     * Threads 1 to n (mapped to compute devices) do the simulation.
     */
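    /* The parity of h->i_global (h->i_global & 1) selects which of the two
     * buffers the simulation fills on this iteration; thread 0 writes out
     * the other buffer, which was filled on the previous iteration. */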
    for (c = 0; c < h->num_chunks; c += h->num_devices)
    {
        for (i_outer = 0; i_outer < num_outer; ++i_outer)
        {
            for (i_inner = 0; i_inner < num_inner; ++i_inner)
            {
                /* Set time and channel indices based on averaging mode. */
                if (h->average_single_axis != 'T')
                {
                    t = i_outer;
                    f = i_inner;
                }
                else
                {
                    f = i_outer;
                    t = i_inner;
                }
                if (thread_id > 0 || num_threads == 1)
                {
                    sim_chunks(h, c, t, f, h->i_global & 1, device_id, status);
                }
                if (thread_id == 0 && h->i_global > 0)
                {
                    write_chunks(h, cp, tp, fp, h->i_global & 1, status);
                }

                /* Barrier 1: Set indices of the previous chunk(s). */
                oskar_barrier_wait(h->barrier);
                if (thread_id == 0)
                {
                    cp = c;
                    tp = t;
                    fp = f;
                    h->i_global++;
                }

                /* Barrier 2: Check sim and write are done. */
                oskar_barrier_wait(h->barrier);
            }
        }
    }

    /* Write the very last chunk(s). */
    if (thread_id == 0)
    {
        write_chunks(h, cp, tp, fp, h->i_global & 1, status);
    }

    return 0;
}


static void sim_chunks(oskar_BeamPattern* h, int i_chunk_start, int i_time,
        int i_channel, int i_active, int device_id, int* status)
{
    DeviceData* d = 0;
    int chunk_size = 0, i = 0;
    int num_models_evaluated = 0;
    int *models_evaluated = 0, *model_offsets = 0;
    if (*status) return;

    /* Get chunk index from GPU ID and chunk start,
     * and return immediately if it's out of range. */
    d = &h->d[device_id];
    const int* type_map = oskar_mem_int_const(
            oskar_telescope_station_type_map_const(d->tel), status);
    const int i_chunk = i_chunk_start + device_id;
    if (i_chunk >= h->num_chunks) return;

    /* Get time and frequency values. */
    oskar_timer_resume(d->tmr_compute);
    const double dt_dump = h->time_inc_sec / 86400.0;
    const double mjd = h->time_start_mjd_utc + dt_dump * (i_time + 0.5);
    const double gast_rad = oskar_convert_mjd_to_gast_fast(mjd);
    const double freq_hz = h->freq_start_hz + i_channel * h->freq_inc_hz;

    /* Work out the size of the chunk. */
    chunk_size = h->max_chunk_size;
    if ((i_chunk + 1) * h->max_chunk_size > h->num_pixels)
    {
        chunk_size = h->num_pixels - i_chunk * h->max_chunk_size;
    }
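    /* (The final chunk is smaller whenever num_pixels is not an exact
     * multiple of max_chunk_size.) */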

    /* Copy pixel chunk coordinate data to GPU only if chunk is different. */
    if (i_chunk != d->previous_chunk_index)
    {
        const int offset = i_chunk * h->max_chunk_size;
        d->previous_chunk_index = i_chunk;
        oskar_mem_copy_contents(d->lon_rad, h->lon_rad,
                0, offset, chunk_size, status);
        oskar_mem_copy_contents(d->lat_rad, h->lat_rad,
                0, offset, chunk_size, status);
        oskar_mem_copy_contents(d->x, h->x, 0, offset, chunk_size, status);
        oskar_mem_copy_contents(d->y, h->y, 0, offset, chunk_size, status);
        oskar_mem_copy_contents(d->z, h->z, 0, offset, chunk_size, status);
    }

    /* Generate beam for this pixel chunk, for all active stations. */
    for (i = 0; i < h->num_active_stations; ++i)
    {
        const int offset = i * chunk_size;
        const oskar_Mem* const source_coords[] = {d->x, d->y, d->z};
        if (!oskar_telescope_allow_station_beam_duplication(d->tel))
        {
            const oskar_Station* station =
                    oskar_telescope_station_const(d->tel, h->station_ids[i]);
            if (!station)
            {
                station = oskar_telescope_station_const(d->tel, 0);
            }
            oskar_station_beam(station,
                    d->work, h->source_coord_type, chunk_size,
                    source_coords, h->lon0, h->lat0,
                    oskar_telescope_phase_centre_coord_type(d->tel),
                    oskar_telescope_phase_centre_longitude_rad(d->tel),
                    oskar_telescope_phase_centre_latitude_rad(d->tel),
                    i_time, gast_rad, freq_hz,
                    offset, d->jones_data, status);
        }
        else
        {
            int j = 0, station_to_copy = -1;
            const int station_model_type = type_map[h->station_ids[i]];
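            /* If a station with the same model type has already been
             * evaluated in this chunk, copy its beam rather than
             * re-evaluating it. */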
            for (j = 0; j < num_models_evaluated; ++j)
            {
                if (models_evaluated[j] == station_model_type)
                {
                    station_to_copy = model_offsets[j];
                    break;
                }
            }
            if (station_to_copy >= 0)
            {
                oskar_mem_copy_contents(
                        d->jones_data, d->jones_data,
                        (size_t)(i * chunk_size),               /* Dest. */
                        (size_t)(station_to_copy * chunk_size), /* Source. */
                        (size_t)chunk_size, status);
            }
            else
            {
                oskar_station_beam(
                        oskar_telescope_station_const(d->tel, station_model_type),
                        d->work, h->source_coord_type, chunk_size,
                        source_coords, h->lon0, h->lat0,
                        oskar_telescope_phase_centre_coord_type(d->tel),
                        oskar_telescope_phase_centre_longitude_rad(d->tel),
                        oskar_telescope_phase_centre_latitude_rad(d->tel),
                        i_time, gast_rad, freq_hz,
                        offset, d->jones_data, status);
                num_models_evaluated++;
                models_evaluated = (int*) realloc(models_evaluated,
                        num_models_evaluated * sizeof(int));
                model_offsets = (int*) realloc(model_offsets,
                        num_models_evaluated * sizeof(int));
                models_evaluated[num_models_evaluated - 1] = station_model_type;
                model_offsets[num_models_evaluated - 1] = i;
            }
        }
        if (d->auto_power[0])
        {
            oskar_evaluate_auto_power(chunk_size,
                    offset, d->jones_data, 1.0, 0.0, 0.0, 0.0,
                    offset, d->auto_power[0], status);
        }
        if (d->auto_power[1])
        {
            oskar_evaluate_auto_power(chunk_size,
                    offset, d->jones_data,
                    h->test_source_stokes[0],
                    h->test_source_stokes[1],
                    h->test_source_stokes[2],
                    h->test_source_stokes[3],
                    offset, d->auto_power[1], status);
        }
    }
    free(models_evaluated);
    free(model_offsets);
    if (d->cross_power[0])
    {
        oskar_evaluate_cross_power(chunk_size, h->num_active_stations,
                d->jones_data, 1.0, 0.0, 0.0, 0.0,
                0, d->cross_power[0], status);
    }
    if (d->cross_power[1])
    {
        oskar_evaluate_cross_power(chunk_size, h->num_active_stations,
                d->jones_data,
                h->test_source_stokes[0],
                h->test_source_stokes[1],
                h->test_source_stokes[2],
                h->test_source_stokes[3],
                0, d->cross_power[1], status);
    }

    /* Copy the output data into host memory. */
    if (d->jones_data_cpu[i_active])
    {
        oskar_mem_copy_contents(d->jones_data_cpu[i_active], d->jones_data,
                0, 0, chunk_size * h->num_active_stations, status);
    }
    for (i = 0; i < 2; ++i)
    {
        if (d->auto_power[i])
        {
            oskar_mem_copy_contents(d->auto_power_cpu[i][i_active],
                    d->auto_power[i], 0, 0,
                    chunk_size * h->num_active_stations, status);
        }
        if (d->cross_power[i])
        {
            oskar_mem_copy_contents(d->cross_power_cpu[i][i_active],
                    d->cross_power[i], 0, 0, chunk_size, status);
        }
    }
    oskar_mutex_lock(h->mutex);
    oskar_log_message(h->log, 'S', 1, "Chunk %*i/%i, "
            "Time %*i/%i, Channel %*i/%i [Device %i]",
            disp_width(h->num_chunks), i_chunk+1, h->num_chunks,
            disp_width(h->num_time_steps), i_time+1, h->num_time_steps,
            disp_width(h->num_channels), i_channel+1, h->num_channels,
            device_id);
    oskar_mutex_unlock(h->mutex);
    oskar_timer_pause(d->tmr_compute);
}


static void write_chunks(oskar_BeamPattern* h, int i_chunk_start,
        int i_time, int i_channel, int i_active, int* status)
{
    int i = 0, chunk_sources = 0, stokes = 0;
    if (*status) return;

    /* Write inactive chunk(s) from all GPUs. */
    oskar_timer_resume(h->tmr_write);
    for (i = 0; i < h->num_devices; ++i)
    {
        DeviceData* d = &h->d[i];

        /* Get chunk index from GPU ID & chunk start. Stop if out of range. */
        const int i_chunk = i_chunk_start + i;
        if (i_chunk >= h->num_chunks || *status) break;

        /* Get the size of the chunk. */
        chunk_sources = h->max_chunk_size;
        if ((i_chunk + 1) * h->max_chunk_size > h->num_pixels)
        {
            chunk_sources = h->num_pixels - i_chunk * h->max_chunk_size;
        }
        const int chunk_size = chunk_sources * h->num_active_stations;

        /* Write non-averaged raw data, if required. */
        write_pixels(h, i_chunk, i_time, i_channel, chunk_sources, 0, 0,
                d->jones_data_cpu[!i_active], JONES_DATA, -1, status);

        /* Loop over Stokes parameter types. */
        for (stokes = 0; stokes < 2; ++stokes)
        {
            /* Write non-averaged data, if required. */
            write_pixels(h, i_chunk, i_time, i_channel, chunk_sources, 0, 0,
                    d->auto_power_cpu[stokes][!i_active],
                    AUTO_POWER_DATA, stokes, status);
            write_pixels(h, i_chunk, i_time, i_channel, chunk_sources, 0, 0,
                    d->cross_power_cpu[stokes][!i_active],
                    CROSS_POWER_DATA, stokes, status);

            /* Time-average the data if required. */
            if (d->auto_power_time_avg[stokes])
            {
                oskar_mem_add(d->auto_power_time_avg[stokes],
                        d->auto_power_time_avg[stokes],
                        d->auto_power_cpu[stokes][!i_active],
                        0, 0, 0, chunk_size, status);
            }
            if (d->cross_power_time_avg[stokes])
            {
                oskar_mem_add(d->cross_power_time_avg[stokes],
                        d->cross_power_time_avg[stokes],
                        d->cross_power_cpu[stokes][!i_active],
                        0, 0, 0, chunk_sources, status);
            }

            /* Channel-average the data if required. */
            if (d->auto_power_channel_avg[stokes])
            {
                oskar_mem_add(d->auto_power_channel_avg[stokes],
                        d->auto_power_channel_avg[stokes],
                        d->auto_power_cpu[stokes][!i_active],
                        0, 0, 0, chunk_size, status);
            }
            if (d->cross_power_channel_avg[stokes])
            {
                oskar_mem_add(d->cross_power_channel_avg[stokes],
                        d->cross_power_channel_avg[stokes],
                        d->cross_power_cpu[stokes][!i_active],
                        0, 0, 0, chunk_sources, status);
            }

            /* Channel- and time-average the data if required. */
            if (d->auto_power_channel_and_time_avg[stokes])
            {
                oskar_mem_add(d->auto_power_channel_and_time_avg[stokes],
                        d->auto_power_channel_and_time_avg[stokes],
                        d->auto_power_cpu[stokes][!i_active],
                        0, 0, 0, chunk_size, status);
            }
            if (d->cross_power_channel_and_time_avg[stokes])
            {
                oskar_mem_add(d->cross_power_channel_and_time_avg[stokes],
                        d->cross_power_channel_and_time_avg[stokes],
                        d->cross_power_cpu[stokes][!i_active],
                        0, 0, 0, chunk_sources, status);
            }

            /* Write time-averaged data. */
            if (i_time == h->num_time_steps - 1)
            {
                if (d->auto_power_time_avg[stokes])
                {
                    oskar_mem_scale_real(d->auto_power_time_avg[stokes],
                            1.0 / h->num_time_steps, 0, chunk_size, status);
                    write_pixels(h, i_chunk, 0, i_channel, chunk_sources, 0, 1,
                            d->auto_power_time_avg[stokes],
                            AUTO_POWER_DATA, stokes, status);
                    oskar_mem_clear_contents(d->auto_power_time_avg[stokes],
                            status);
                }
                if (d->cross_power_time_avg[stokes])
                {
                    oskar_mem_scale_real(d->cross_power_time_avg[stokes],
                            1.0 / h->num_time_steps, 0, chunk_sources, status);
                    write_pixels(h, i_chunk, 0, i_channel, chunk_sources, 0, 1,
                            d->cross_power_time_avg[stokes],
                            CROSS_POWER_DATA, stokes, status);
                    oskar_mem_clear_contents(d->cross_power_time_avg[stokes],
                            status);
                }
            }

            /* Write channel-averaged data. */
            if (i_channel == h->num_channels - 1)
            {
                if (d->auto_power_channel_avg[stokes])
                {
                    oskar_mem_scale_real(d->auto_power_channel_avg[stokes],
                            1.0 / h->num_channels, 0, chunk_size, status);
                    write_pixels(h, i_chunk, i_time, 0, chunk_sources, 1, 0,
                            d->auto_power_channel_avg[stokes],
                            AUTO_POWER_DATA, stokes, status);
                    oskar_mem_clear_contents(d->auto_power_channel_avg[stokes],
                            status);
                }
                if (d->cross_power_channel_avg[stokes])
                {
                    oskar_mem_scale_real(d->cross_power_channel_avg[stokes],
                            1.0 / h->num_channels, 0, chunk_sources, status);
                    write_pixels(h, i_chunk, i_time, 0, chunk_sources, 1, 0,
                            d->cross_power_channel_avg[stokes],
                            CROSS_POWER_DATA, stokes, status);
                    oskar_mem_clear_contents(
                            d->cross_power_channel_avg[stokes], status);
                }
            }

            /* Write channel- and time-averaged data. */
            if ((i_time == h->num_time_steps - 1) &&
                    (i_channel == h->num_channels - 1))
            {
                if (d->auto_power_channel_and_time_avg[stokes])
                {
                    oskar_mem_scale_real(
                            d->auto_power_channel_and_time_avg[stokes],
                            1.0 / (h->num_channels * h->num_time_steps),
                            0, chunk_size, status);
                    write_pixels(h, i_chunk, 0, 0, chunk_sources, 1, 1,
                            d->auto_power_channel_and_time_avg[stokes],
                            AUTO_POWER_DATA, stokes, status);
                    oskar_mem_clear_contents(
                            d->auto_power_channel_and_time_avg[stokes],
                            status);
                }
                if (d->cross_power_channel_and_time_avg[stokes])
                {
                    oskar_mem_scale_real(
                            d->cross_power_channel_and_time_avg[stokes],
                            1.0 / (h->num_channels * h->num_time_steps),
                            0, chunk_sources, status);
                    write_pixels(h, i_chunk, 0, 0, chunk_sources, 1, 1,
                            d->cross_power_channel_and_time_avg[stokes],
                            CROSS_POWER_DATA, stokes, status);
                    oskar_mem_clear_contents(
                            d->cross_power_channel_and_time_avg[stokes],
                            status);
                }
            }
        }
    }
    oskar_timer_pause(h->tmr_write);
}


static void write_pixels(oskar_BeamPattern* h, int i_chunk, int i_time,
        int i_channel, int num_pix, int channel_average, int time_average,
        const oskar_Mem* in, int chunk_desc, int stokes_in, int* status)
{
    int i = 0;
    if (!in) return;

    /* Loop over data products. */
    const int num_pol = h->pol_mode == OSKAR_POL_MODE_FULL ? 4 : 1;
    for (i = 0; i < h->num_data_products; ++i)
    {
        fitsfile* f = 0;
        FILE* t = 0;
        int dp = 0, stokes_out = 0, i_station = 0, off = 0;

        /* Get data product info. */
        f          = h->data_products[i].fits_file;
        t          = h->data_products[i].text_file;
        dp         = h->data_products[i].type;
        stokes_out = h->data_products[i].stokes_out;
        i_station  = h->data_products[i].i_station;

        /* Check averaging mode and polarisation input type. */
        if (h->data_products[i].time_average != time_average ||
                h->data_products[i].channel_average != channel_average ||
                h->data_products[i].stokes_in != stokes_in)
        {
            continue;
        }

        /* Treat raw data output as special case, as it doesn't go via pix. */
        if (dp == RAW_COMPLEX && chunk_desc == JONES_DATA && t)
        {
            oskar_Mem* station_data = 0;
            station_data = oskar_mem_create_alias(in, i_station * num_pix,
                    num_pix, status);
            oskar_mem_save_ascii(t, 1, 0, num_pix, status, station_data);
            oskar_mem_free(station_data, status);
            continue;
        }
        if (dp == CROSS_POWER_RAW_COMPLEX &&
                chunk_desc == CROSS_POWER_DATA && t)
        {
            oskar_mem_save_ascii(t, 1, 0, num_pix, status, in);
            continue;
        }

        /* Convert complex values to pixel data. */
        oskar_mem_clear_contents(h->pix, status);
        if (chunk_desc == JONES_DATA && dp == AMP)
        {
            off = i_station * num_pix * num_pol;
            if (stokes_out == XX || stokes_out == -1)
            {
                complex_to_amp(in, off, num_pol, num_pix, h->pix, status);
            }
            else if (stokes_out == XY)
            {
                complex_to_amp(in, off + 1, num_pol, num_pix, h->pix, status);
            }
            else if (stokes_out == YX)
            {
                complex_to_amp(in, off + 2, num_pol, num_pix, h->pix, status);
            }
            else if (stokes_out == YY)
            {
                complex_to_amp(in, off + 3, num_pol, num_pix, h->pix, status);
            }
            else
            {
                continue;
            }
        }
        else if (chunk_desc == JONES_DATA && dp == PHASE)
        {
            off = i_station * num_pix * num_pol;
            if (stokes_out == XX || stokes_out == -1)
            {
                complex_to_phase(in, off, num_pol, num_pix, h->pix, status);
            }
            else if (stokes_out == XY)
            {
                complex_to_phase(in, off + 1, num_pol, num_pix, h->pix, status);
            }
            else if (stokes_out == YX)
            {
                complex_to_phase(in, off + 2, num_pol, num_pix, h->pix, status);
            }
            else if (stokes_out == YY)
            {
                complex_to_phase(in, off + 3, num_pol, num_pix, h->pix, status);
            }
            else
            {
                continue;
            }
        }
        else if (chunk_desc == JONES_DATA && dp == IXR)
        {
            jones_to_ixr(in, i_station * num_pix, num_pix, h->pix, status);
        }
        else if (chunk_desc == AUTO_POWER_DATA ||
                chunk_desc == CROSS_POWER_DATA)
        {
            off = i_station * num_pix; /* Station offset. */
            if (off < 0 || chunk_desc == CROSS_POWER_DATA) off = 0;
            if (chunk_desc == CROSS_POWER_DATA && (dp & AUTO_POWER))
            {
                continue;
            }
            if (chunk_desc == AUTO_POWER_DATA && (dp & CROSS_POWER))
            {
                continue;
            }
            if (stokes_out >= I && stokes_out <= V)
            {
                oskar_convert_linear_to_stokes(num_pix, off, in,
                        stokes_out, h->ctemp, status);
            }
            else
            {
                continue;
            }
            if (dp & AMP)
            {
                complex_to_amp(h->ctemp, 0, 1, num_pix, h->pix, status);
            }
            else if (dp & PHASE)
            {
                complex_to_phase(h->ctemp, 0, 1, num_pix, h->pix, status);
            }
            else if (dp & REAL)
            {
                complex_to_real(h->ctemp, 0, 1, num_pix, h->pix, status);
            }
            else if (dp & IMAG)
            {
                complex_to_imag(h->ctemp, 0, 1, num_pix, h->pix, status);
            }
            else
            {
                continue;
            }
        }
        else continue;

        /* Check for FITS file. */
        if (f && h->width && h->height)
        {
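            /* FITS pixel indices are 1-based: the chunk start maps to an
             * (x, y) position in the image plane, with channel and time
             * on the third and fourth axes. */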
            long firstpix[4];
            firstpix[0] = 1 + (i_chunk * h->max_chunk_size) % h->width;
            firstpix[1] = 1 + (i_chunk * h->max_chunk_size) / h->width;
            firstpix[2] = 1 + i_channel;
            firstpix[3] = 1 + i_time;
            fits_write_pix(f, (h->prec == OSKAR_DOUBLE ? TDOUBLE : TFLOAT),
                    firstpix, num_pix, oskar_mem_void(h->pix), status);
        }

        /* Check for text file. */
        if (t) oskar_mem_save_ascii(t, 1, 0, num_pix, status, h->pix);
    }
}


static void complex_to_amp(const oskar_Mem* complex_in, const int offset,
        const int stride, const int num_points, oskar_Mem* output, int* status)
{
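    /* Amplitude is the modulus sqrt(re^2 + im^2) of each complex sample;
     * the stride steps over interleaved polarisations. */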
    int i = 0, j = 0;
    if (oskar_mem_precision(output) == OSKAR_SINGLE)
    {
        float *out = 0, x = 0.0f, y = 0.0f;
        const float2* in = 0;
        in = oskar_mem_float2_const(complex_in, status) + offset;
        out = oskar_mem_float(output, status);
        for (i = 0; i < num_points; ++i)
        {
            j = i * stride;
            x = in[j].x;
            y = in[j].y;
            out[i] = sqrt(x*x + y*y);
        }
    }
    else
    {
        double *out = 0, x = 0.0, y = 0.0;
        const double2* in = 0;
        in = oskar_mem_double2_const(complex_in, status) + offset;
        out = oskar_mem_double(output, status);
        for (i = 0; i < num_points; ++i)
        {
            j = i * stride;
            x = in[j].x;
            y = in[j].y;
            out[i] = sqrt(x*x + y*y);
        }
    }
}


static void complex_to_phase(const oskar_Mem* complex_in, const int offset,
        const int stride, const int num_points, oskar_Mem* output, int* status)
{
    int i = 0, j = 0;
    if (oskar_mem_precision(output) == OSKAR_SINGLE)
    {
        float *out = 0;
        const float2* in = 0;
        in = oskar_mem_float2_const(complex_in, status) + offset;
        out = oskar_mem_float(output, status);
        for (i = 0; i < num_points; ++i)
        {
            j = i * stride;
            out[i] = atan2(in[j].y, in[j].x);
        }
    }
    else
    {
        double *out = 0;
        const double2* in = 0;
        in = oskar_mem_double2_const(complex_in, status) + offset;
        out = oskar_mem_double(output, status);
        for (i = 0; i < num_points; ++i)
        {
            j = i * stride;
            out[i] = atan2(in[j].y, in[j].x);
        }
    }
}


static void complex_to_real(const oskar_Mem* complex_in, const int offset,
        const int stride, const int num_points, oskar_Mem* output, int* status)
{
    int i = 0;
    if (oskar_mem_precision(output) == OSKAR_SINGLE)
    {
        float *out = 0;
        const float2* in = 0;
        in = oskar_mem_float2_const(complex_in, status) + offset;
        out = oskar_mem_float(output, status);
        for (i = 0; i < num_points; ++i) out[i] = in[i * stride].x;
    }
    else
    {
        double *out = 0;
        const double2* in = 0;
        in = oskar_mem_double2_const(complex_in, status) + offset;
        out = oskar_mem_double(output, status);
        for (i = 0; i < num_points; ++i) out[i] = in[i * stride].x;
    }
}


static void complex_to_imag(const oskar_Mem* complex_in, const int offset,
        const int stride, const int num_points, oskar_Mem* output, int* status)
{
    int i = 0;
    if (oskar_mem_precision(output) == OSKAR_SINGLE)
    {
        float *out = 0;
        const float2* in = 0;
        in = oskar_mem_float2_const(complex_in, status) + offset;
        out = oskar_mem_float(output, status);
        for (i = 0; i < num_points; ++i) out[i] = in[i * stride].y;
    }
    else
    {
        double *out = 0;
        const double2* in = 0;
        in = oskar_mem_double2_const(complex_in, status) + offset;
        out = oskar_mem_double(output, status);
        for (i = 0; i < num_points; ++i) out[i] = in[i * stride].y;
    }
}


static void jones_to_ixr(const oskar_Mem* jones, const int offset,
        const int num_points, oskar_Mem* output, int* status)
{
    int i = 0;

    /* Check for fully polarised data. */
    if (!oskar_mem_is_matrix(jones) || !oskar_mem_is_complex(jones)) return;

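    /* IXR (intrinsic cross-polarisation ratio) is formed from the 2x2
     * condition number of each Jones matrix as ((cond + 1) / (cond - 1))^2,
     * clamped to a maximum to avoid overflow when cond is close to 1. */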
    if (oskar_mem_precision(output) == OSKAR_SINGLE)
    {
        float *out = 0, cond = 0.0f, ixr = 0.0f;
        const float4c* in = 0;
        in = oskar_mem_float4c_const(jones, status) + offset;
        out = oskar_mem_float(output, status);
        for (i = 0; i < num_points; ++i)
        {
            cond = oskar_cond2_2x2_inline_f(in + i);
            ixr = (cond + 1.0f) / (cond - 1.0f);
            ixr *= ixr;
            if (ixr > 1e6) ixr = 1e6;
            out[i] = ixr;
        }
    }
    else
    {
        double *out = 0, cond = 0.0, ixr = 0.0;
        const double4c* in = 0;
        in = oskar_mem_double4c_const(jones, status) + offset;
        out = oskar_mem_double(output, status);
        for (i = 0; i < num_points; ++i)
        {
            cond = oskar_cond2_2x2_inline_d(in + i);
            ixr = (cond + 1.0) / (cond - 1.0);
            ixr *= ixr;
            if (ixr > 1e8) ixr = 1e8;
            out[i] = ixr;
        }
    }
}


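/* In the macro below, the matrix elements a, b, c and d hold the linear
 * polarisation products XX, XY, YX and YY, respectively. */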
#define LINEAR_TO_STOKES(N, IN, OUT) {\
    int i;\
    switch (stokes_index) {\
    case 0: /* I = 0.5 * (XX + YY) */\
        for (i = 0; i < N; ++i) {\
            OUT[i].x = 0.5 * (IN[i].a.x + IN[i].d.x);\
            OUT[i].y = 0.5 * (IN[i].a.y + IN[i].d.y);\
        }\
        break;\
    case 1: /* Q = 0.5 * (XX - YY) */\
        for (i = 0; i < N; ++i) {\
            OUT[i].x = 0.5 * (IN[i].a.x - IN[i].d.x);\
            OUT[i].y = 0.5 * (IN[i].a.y - IN[i].d.y);\
        }\
        break;\
    case 2: /* U = 0.5 * (XY + YX) */\
        for (i = 0; i < N; ++i) {\
            OUT[i].x = 0.5 * (IN[i].b.x + IN[i].c.x);\
            OUT[i].y = 0.5 * (IN[i].b.y + IN[i].c.y);\
        }\
        break;\
    case 3: /* V = -0.5i * (XY - YX) */\
        for (i = 0; i < N; ++i) {\
            OUT[i].x =  0.5 * (IN[i].b.y - IN[i].c.y);\
            OUT[i].y = -0.5 * (IN[i].b.x - IN[i].c.x);\
        }\
        break;\
    default:\
        break;\
    }\
    }

void oskar_convert_linear_to_stokes(const int num_points,
        const int offset_in, const oskar_Mem* linear, const int stokes_index,
        oskar_Mem* output, int* status)
{
    if (*status) return;
    if (!oskar_mem_is_complex(linear) || !oskar_mem_is_complex(output))
    {
        *status = OSKAR_ERR_BAD_DATA_TYPE;
        return;
    }
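    /* Scalar (unpolarised) data has no linear polarisation products, so only
     * Stokes I can be formed; in that case the values are copied directly. */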
    if (!oskar_mem_is_matrix(linear))
    {
        if (stokes_index == 0)
        {
            oskar_mem_copy_contents(output, linear, 0, offset_in, num_points,
                    status);
        }
        else
        {
            *status = OSKAR_ERR_INVALID_ARGUMENT;
        }
        return;
    }
    if (oskar_mem_is_double(linear))
    {
        double2* out = 0;
        const double4c* in = 0;
        out = oskar_mem_double2(output, status);
        in = oskar_mem_double4c_const(linear, status) + offset_in;
        LINEAR_TO_STOKES(num_points, in, out)
    }
    else
    {
        float2* out = 0;
        const float4c* in = 0;
        out = oskar_mem_float2(output, status);
        in = oskar_mem_float4c_const(linear, status) + offset_in;
        LINEAR_TO_STOKES(num_points, in, out)
    }
}


static void record_timing(oskar_BeamPattern* h)
{
    int i = 0;
    oskar_log_section(h->log, 'M', "Simulation timing");
    oskar_log_value(h->log, 'M', 0, "Total wall time", "%.3f s",
            oskar_timer_elapsed(h->tmr_sim));
    for (i = 0; i < h->num_devices; ++i)
    {
        oskar_log_value(h->log, 'M', 0, "Compute", "%.3f s [Device %i]",
                oskar_timer_elapsed(h->d[i].tmr_compute), i);
    }
    oskar_log_value(h->log, 'M', 0, "Write", "%.3f s",
            oskar_timer_elapsed(h->tmr_write));
}


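/* Return the number of characters needed to display the given counter value,
 * used to align the chunk/time/channel progress messages in the log. */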
static unsigned int disp_width(unsigned int v)
{
    return (v >= 100000u) ? 6 : (v >= 10000u) ? 5 : (v >= 1000u) ? 4 :
            (v >= 100u) ? 3 : (v >= 10u) ? 2u : 1u;
    /* return v == 1u ? 1u : (unsigned)log10(v)+1 */
}


#ifdef __cplusplus
}
#endif