1 /*
2     Copyright (c) 2014-2016 Intel Corporation.  All Rights Reserved.
3 
4     Redistribution and use in source and binary forms, with or without
5     modification, are permitted provided that the following conditions
6     are met:
7 
8       * Redistributions of source code must retain the above copyright
9         notice, this list of conditions and the following disclaimer.
10       * Redistributions in binary form must reproduce the above copyright
11         notice, this list of conditions and the following disclaimer in the
12         documentation and/or other materials provided with the distribution.
13       * Neither the name of Intel Corporation nor the names of its
14         contributors may be used to endorse or promote products derived
15         from this software without specific prior written permission.
16 
17     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20     A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21     HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23     LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24     DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25     THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29 
30 
31 // Forward declarations: the following two functions are declared as friends
32 // in offload_engine.h, and Clang does not accept "static" coming after the
33 // friend declaration, so declare them here first.
34 static void __offload_init_library_once(void);
35 static void __offload_fini_library(void);
36 
37 #include "offload_host.h"
38 #ifdef MYO_SUPPORT
39 #include "offload_myo_host.h"
40 #endif
41 
42 #include <malloc.h>
43 #ifndef TARGET_WINNT
44 #include <alloca.h>
45 #include <elf.h>
46 #endif // TARGET_WINNT
47 #include <errno.h>
48 #include <fcntl.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <sys/stat.h>
52 #include <sys/types.h>
53 
54 #include <algorithm>
55 #include <bitset>
56 #include <iostream>
57 
58 #if defined(HOST_WINNT)
59 #define PATH_SEPARATOR ";"
60 #else
61 #define PATH_SEPARATOR ":"
62 #endif
63 
64 #define GET_OFFLOAD_NUMBER(timer_data) \
65     ((timer_data) ? (timer_data)->offload_number : 0)
66 
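// Host-side callback invoked when an asynchronous offload completes;
// presumably registered through a runtime API defined later in this file.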
67 static void (*task_completion_callback)(void *);
68 
69 extern "C" {
70 #ifdef TARGET_WINNT
71 // Windows does not support importing a symbol from a library without
72 // actually adding that library as a dependence.  We do not want to add the
73 // dependence, since it is needed only for Fortran when traceback is enabled,
74 // so the routine is resolved at run time with GetProcAddress.
75 #define FORTRAN_TRACE_BACK  win_for__continue_traceback
76 int win_for__continue_traceback( _Offload_result coi_offload_result )
77 {
78     HINSTANCE hDLL;
79     int (* TraceBackRoutine)(_Offload_result value);
80 
81     hDLL = LoadLibrary("libifcoremd.dll");
82     if (hDLL != 0) {
83         TraceBackRoutine = (int (*)(_Offload_result)) GetProcAddress(hDLL,
84                                                  "for__continue_traceback");
85         if (TraceBackRoutine != 0) {
86             return TraceBackRoutine(coi_offload_result);
87         }
88         else {
89             OFFLOAD_TRACE(3,
90             "Cannot find for__continue_traceback routine in libifcoremd.dll\n");
91             exit(1);
92         }
93     }
94     else {
95         OFFLOAD_TRACE(3, "Cannot load libifcoremd.dll\n");
96         exit(1);
97     }
98     return 0;
99 }
100 
101 #else // TARGET_WINNT
102 
103 #define FORTRAN_TRACE_BACK for__continue_traceback
104 
105 // for__continue_traceback is provided as a dummy to resolve link-time symbols
106 // for C/C++ programs.  For Fortran the actual library function from
107 // libifcore.so is used.
108 #pragma weak for__continue_traceback
109 int for__continue_traceback( _Offload_result coi_offload_result )
110 {
111      OFFLOAD_TRACE(3,
112           "liboffload function for__continue_traceback should not be called.\n");
113      exit(1);
114 }
115 #endif //TARGET_WINNT
116 }  // extern "C"
117 
118 #ifdef TARGET_WINNT
119 // Small subset of ELF declarations for Windows which is needed to compile
120 // this file. ELF header is used to understand what binary type is contained
121 // in the target image - shared library or executable.
122 
123 typedef uint16_t Elf64_Half;
124 typedef uint32_t Elf64_Word;
125 typedef uint64_t Elf64_Addr;
126 typedef uint64_t Elf64_Off;
127 
128 #define EI_NIDENT   16
129 
130 #define ET_EXEC     2
131 #define ET_DYN      3
132 
133 typedef struct
134 {
135     unsigned char e_ident[EI_NIDENT];
136     Elf64_Half    e_type;
137     Elf64_Half    e_machine;
138     Elf64_Word    e_version;
139     Elf64_Addr    e_entry;
140     Elf64_Off     e_phoff;
141     Elf64_Off     e_shoff;
142     Elf64_Word    e_flags;
143     Elf64_Half    e_ehsize;
144     Elf64_Half    e_phentsize;
145     Elf64_Half    e_phnum;
146     Elf64_Half    e_shentsize;
147     Elf64_Half    e_shnum;
148     Elf64_Half    e_shstrndx;
149 } Elf64_Ehdr;
150 #endif // TARGET_WINNT
151 
152 // Host console and file logging
153 const char *prefix;
154 int console_enabled = 0;
155 int offload_number = 0;
156 
157 static const char *htrace_envname = "H_TRACE";
158 static const char *offload_report_envname = "OFFLOAD_REPORT";
159 static const char *timer_envname = "H_TIME";
160 
161 // DMA channel count used by COI and set via
162 // OFFLOAD_DMA_CHANNEL_COUNT environment variable
163 uint32_t mic_dma_channel_count;
164 
165 // Trace information
166 static const char* vardesc_direction_as_string[] = {
167     "NOCOPY",
168     "IN",
169     "OUT",
170     "INOUT"
171 };
172 static const char* vardesc_type_as_string[] = {
173     "unknown",
174     "data",
175     "data_ptr",
176     "func_ptr",
177     "void_ptr",
178     "string_ptr",
179     "dv",
180     "dv_data",
181     "dv_data_slice",
182     "dv_ptr",
183     "dv_ptr_data",
184     "dv_ptr_data_slice",
185     "cean_var",
186     "cean_var_ptr",
187     "c_data_ptr_array",
188     "c_extended_type",
189     "c_func_ptr_array",
190     "c_void_ptr_array",
191     "c_string_ptr_array",
192     "c_data_ptr_ptr",
193     "c_func_ptr_ptr",
194     "c_void_ptr_ptr",
195     "c_string_ptr_ptr",
196     "c_cean_var_ptr_ptr",
197 };
198 
199 Engine*         mic_engines = 0;
200 uint32_t        mic_engines_total = 0;
201 pthread_key_t   mic_thread_key;
202 MicEnvVar       mic_env_vars;
203 uint64_t        cpu_frequency = 0;
204 
205 // MIC_STACKSIZE
206 uint32_t mic_stack_size = 12 * 1024 * 1024;
207 
208 // MIC_BUFFERSIZE
209 uint64_t mic_buffer_size = 0;
210 
211 // Preallocated 4K page memory size for buffers on MIC
212 uint64_t mic_4k_buffer_size = 0;
213 
214 // Preallocated 2M page memory size for buffers on MIC
215 uint64_t mic_2m_buffer_size = 0;
216 
217 
218 // LD_LIBRARY_PATH for KNC
219 char* knc_library_path = 0;
220 
221 // LD_LIBRARY_PATH for KNL
222 char* knl_library_path = 0;
223 
224 
225 // MIC_PROXY_IO
226 bool mic_proxy_io = true;
227 
228 // MIC_PROXY_FS_ROOT
229 char* mic_proxy_fs_root = 0;
230 
231 // Threshold for creating buffers with large pages. A buffer is created
232 // with the large-page hint if its size exceeds the threshold value.
233 // By default large pages are disabled right now (the threshold defaults
234 // to UINT64_MAX) due to HSD 4114629.
235 uint64_t __offload_use_2mb_buffers = 0xffffffffffffffffULL;
236 static const char *mic_use_2mb_buffers_envname  =
237     "MIC_USE_2MB_BUFFERS";
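// Example (assumed usage): setting MIC_USE_2MB_BUFFERS to a byte count lowers
// the threshold, so any buffer at least that large is created with the
// COI_OPTIMIZE_HUGE_PAGE_SIZE hint (see alloc_ptr_data below).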
238 
239 static uint64_t __offload_use_async_buffer_write = 2 * 1024 * 1024;
240 static const char *mic_use_async_buffer_write_envname  =
241     "MIC_USE_ASYNC_BUFFER_WRITE";
242 
243 static uint64_t __offload_use_async_buffer_read = 2 * 1024 * 1024;
244 static const char *mic_use_async_buffer_read_envname  =
245     "MIC_USE_ASYNC_BUFFER_READ";
246 
247 // device initialization type
248 OffloadInitType __offload_init_type = c_init_on_offload_all;
249 static const char *offload_init_envname = "OFFLOAD_INIT";
250 
251 // active wait
252 static bool __offload_active_wait = true;
253 static const char *offload_active_wait_envname = "OFFLOAD_ACTIVE_WAIT";
254 
255 // wait even for asynchronous offloads
256 // true for now while the performance issue with COI is not fixed
257 static bool __offload_always_wait = true;
258 static const char *offload_always_wait_envname = "OFFLOAD_ALWAYS_WAIT";
259 
260 // OMP_DEFAULT_DEVICE
261 int __omp_device_num = 0;
262 static const char *omp_device_num_envname = "OMP_DEFAULT_DEVICE";
263 
264 //OFFLOAD_PARALLEL_COPY
265 static bool __offload_parallel_copy = false;
266 static const char *parallel_copy_envname = "OFFLOAD_PARALLEL_COPY";
267 
268 //Use COI interface for noncontiguous transfer if it exists.
269 static bool __offload_use_coi_noncontiguous_transfer = false;
270 static const char *use_coi_noncontiguous_transfer_envname =
271                        "MIC_USE_COI_MULTI_D";
272 
273 // The list of pending target libraries
274 static bool            __target_libs;
275 static TargetImageList __target_libs_list;
276 static mutex_t         __target_libs_lock;
277 static mutex_t         stack_alloc_lock;
278 static mutex_t         lock_complete;
279 
280 // Set of OffloadDescriptors of asynchronous offloads that are not destroyed
281 std::map<void *, bool> offload_descr_map;
282 
283 // Target executable
284 TargetImage*           __target_exe;
285 // true if the last loaded target image is a dll
286 bool __current_image_is_dll = false;
287 // true if the MYO library is loaded when the dll is loaded
288 bool __myo_init_in_so = false;
289 
290 // Print readable offload flags
291 static void trace_offload_flags(
292     OffloadHostTimerData* timer_data,
293     OffloadFlags offload_flags
294 )
295 {
296     // Sized big enough for all flag names
297     char fbuffer[256];
298     bool first = true;
299     if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
300         sprintf(fbuffer, "   OffloadFlags=(");
301         if (offload_flags.bits.fortran_traceback) {
302             sprintf(fbuffer+strlen(fbuffer), "fortran_traceback");
303             first = false;
304         }
305         if (offload_flags.bits.omp_async) {
306             sprintf(fbuffer+strlen(fbuffer), first ? "omp_async" : ",omp_async");
307             first = false;
308         }
309         OFFLOAD_DEBUG_TRACE_1(1,
310             GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
311             "%s)\n", fbuffer);
312     }
313 }
314 
315 // Print readable varDesc flags
316 static void trace_varDesc_flags(
317     OffloadHostTimerData* timer_data,
318     varDescFlags offload_flags
319 )
320 {
321     // Sized big enough for all flag names
322     char fbuffer[256];
323     bool first = true;
324     if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
325         sprintf(fbuffer, "              varDescFlags=(");
326         if (offload_flags.is_static) {
327             sprintf(fbuffer+strlen(fbuffer), "is_static");
328             first = false;
329         }
330         if (offload_flags.is_static_dstn) {
331             sprintf(fbuffer+strlen(fbuffer),
332                 first ? "is_static_dstn" : ",is_static_dstn");
333             first = false;
334         }
335         if (offload_flags.has_length) {
336             sprintf(fbuffer+strlen(fbuffer),
337                 first ? "has_length" : ",has_length");
338             first = false;
339         }
340         if (offload_flags.is_stack_buf) {
341             sprintf(fbuffer+strlen(fbuffer),
342                 first ? "is_stack_buf" : ",is_stack_buf");
343             first = false;
344         }
345         if (offload_flags.targetptr) {
346             sprintf(fbuffer+strlen(fbuffer),
347                 first ? "targetptr" : ",targetptr");
348             first = false;
349         }
350         if (offload_flags.preallocated) {
351             sprintf(fbuffer+strlen(fbuffer),
352                 first ? "preallocated" : ",preallocated");
353             first = false;
354         }
355         if (offload_flags.is_pointer) {
356             sprintf(fbuffer+strlen(fbuffer),
357                 first ? "is_pointer" : ",is_pointer");
358             first = false;
359         }
360         if (offload_flags.sink_addr) {
361             sprintf(fbuffer+strlen(fbuffer),
362                 first ? "sink_addr" : ",sink_addr");
363             first = false;
364         }
365         if (offload_flags.alloc_disp) {
366             sprintf(fbuffer+strlen(fbuffer),
367                first ? "alloc_disp" : ",alloc_disp");
368             first = false;
369         }
370         if (offload_flags.is_noncont_src) {
371             sprintf(fbuffer+strlen(fbuffer),
372                 first ? "is_noncont_src" : ",is_noncont_src");
373             first = false;
374         }
375         if (offload_flags.is_noncont_dst) {
376             sprintf(fbuffer+strlen(fbuffer),
377                 first ? "is_noncont_dst" : ",is_noncont_dst");
378             first = false;
379         }
380         if (offload_flags.always_copy) {
381             sprintf(fbuffer+strlen(fbuffer),
382                 first ? "always_copy" : ",always_copy");
383             first = false;
384         }
385         if (offload_flags.always_delete) {
386             sprintf(fbuffer+strlen(fbuffer),
387                 first ? "always_delete" : ",always_delete");
388             first = false;
389         }
390         if (offload_flags.is_non_cont_struct) {
391             sprintf(fbuffer+strlen(fbuffer),
392                 first ? "is_non_cont_struct" : ",is_non_cont_struct");
393             first = false;
394         }
395         if (offload_flags.pin) {
396             sprintf(fbuffer+strlen(fbuffer),
397                 first ? "pin" : ",pin");
398             first = false;
399         }
400         if (offload_flags.is_device_ptr) {
401             sprintf(fbuffer+strlen(fbuffer),
402                 first ? "is_device_ptr" : ",is_device_ptr");
403             first = false;
404         }
405         if (offload_flags.use_device_ptr) {
406             sprintf(fbuffer+strlen(fbuffer),
407                 first ? "use_device_ptr" : ",use_device_ptr");
408         }
409         OFFLOAD_DEBUG_TRACE_1(1,
410             GET_OFFLOAD_NUMBER(timer_data), c_offload_init_func,
411             "%s)\n", fbuffer);
412     }
413 }
414 
415 static char * offload_get_src_base(void * ptr, uint8_t type)
416 {
417     char *base;
418     if (VAR_TYPE_IS_PTR(type)) {
419         base = *static_cast<char**>(ptr);
420     }
421     else if (VAR_TYPE_IS_SCALAR(type)) {
422         base = static_cast<char*>(ptr);
423     }
424     else if (VAR_TYPE_IS_DV_DATA_SLICE(type) || VAR_TYPE_IS_DV_DATA(type)) {
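        // For dope-vector variants, locate the ArrDesc: a slice carries an
        // Arr_Desc whose base field points at (or to a pointer to) the dope
        // vector, otherwise ptr itself is (or points to) the dope vector.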
425         ArrDesc *dvp;
426         if (VAR_TYPE_IS_DV_DATA_SLICE(type)) {
427             const Arr_Desc *ap = static_cast<const Arr_Desc*>(ptr);
428             dvp = (type == c_dv_data_slice) ?
429                   reinterpret_cast<ArrDesc*>(ap->base) :
430                   *reinterpret_cast<ArrDesc**>(ap->base);
431         }
432         else {
433             dvp = (type == c_dv_data) ?
434                   static_cast<ArrDesc*>(ptr) :
435                   *static_cast<ArrDesc**>(ptr);
436         }
437         base = reinterpret_cast<char*>(dvp->Base);
438     }
439     else {
440         base = NULL;
441     }
442     return base;
443 }
444 
445 void OffloadDescriptor::report_coi_error(error_types msg, COIRESULT res)
446 {
447     // special case for the 'process died' error
448     if (res == COI_PROCESS_DIED) {
449         m_device.fini_process(true);
450     }
451     else {
452         switch (msg) {
453             case c_buf_create:
454                 if (res == COI_OUT_OF_MEMORY) {
455                     msg = c_buf_create_out_of_mem;
456                 }
457                 /* fallthru */
458 
459             case c_buf_create_from_mem:
460             case c_buf_get_address:
461             case c_pipeline_create:
462             case c_pipeline_run_func:
463                 LIBOFFLOAD_ERROR(msg, m_device.get_logical_index(), res);
464                 break;
465 
466             case c_buf_read:
467             case c_buf_write:
468             case c_buf_copy:
469             case c_buf_map:
470             case c_buf_unmap:
471             case c_buf_destroy:
472             case c_buf_set_state:
473                 LIBOFFLOAD_ERROR(msg, res);
474                 break;
475 
476             default:
477                 break;
478         }
479     }
480 
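    // Every path through this reporter is fatal: terminate the host process.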
481     exit(1);
482 }
483 
484 _Offload_result OffloadDescriptor::translate_coi_error(COIRESULT res) const
485 {
486     switch (res) {
487         case COI_SUCCESS:
488             return OFFLOAD_SUCCESS;
489 
490         case COI_PROCESS_DIED:
491             return OFFLOAD_PROCESS_DIED;
492 
493         case COI_OUT_OF_MEMORY:
494             return OFFLOAD_OUT_OF_MEMORY;
495 
496         default:
497             return OFFLOAD_ERROR;
498     }
499 }
500 
501 // is_targetptr == 0 && is_prealloc == 0 - allocation of pointer data;
502 // is_targetptr == 1 && is_prealloc == 0 - allocation of target memory:
503 //    allocate memory at target; use its value as base in target table.
504 // is_targetptr == 1 && is_prealloc == 1 - use preallocated target memory:
505 //    base - is address at target of preallocated memory; use its value as
506 //    base in target table.
507 
508 bool OffloadDescriptor::alloc_ptr_data(
509     PtrData* &ptr_data,
510     void *base,
511     int64_t disp,
512     int64_t size,
513     int64_t alloc_disp,
514     int align,
515     bool is_targptr,
516     bool is_prealloc,
517     bool pin
518 )
519 {
520     // total length of base
521     int64_t length = size;
522     bool is_new;
523     COIBUFFER targptr_buf;
524     COIRESULT res;
525     uint32_t buffer_flags = 0;
526     char * base_disp = reinterpret_cast<char *>(base) + disp;
527 
528     // create buffer with large pages if data length exceeds
529     // large page threshold
530     if (length >= __offload_use_2mb_buffers) {
531         buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
532     }
533     // Allocate memory at the target for targetptr without preallocation, as we
534     // need its address as the base argument in the call to m_device.insert_targetptr_data
535     if (is_targptr && !is_prealloc) {
536         length = alloc_disp ? length : size + disp;
537         res = COI::BufferCreate(
538             length,
539             COI_BUFFER_OPENCL,
540             buffer_flags,
541             0,
542             1,
543             &m_device.get_process(),
544             &targptr_buf);
545         if (res != COI_SUCCESS) {
546             if (m_status != 0) {
547                 m_status->result = translate_coi_error(res);
548             }
549             else if (m_is_mandatory) {
550                 report_coi_error(c_buf_create, res);
551             }
552             return false;
553         }
554 
555         res = COI::BufferGetSinkAddress(
556                        targptr_buf, reinterpret_cast<uint64_t *>(&base));
557         if (res != COI_SUCCESS) {
558             if (m_status != 0) {
559                 m_status->result = translate_coi_error(res);
560             }
561             else if (m_is_mandatory) {
562                 report_coi_error(c_buf_get_address, res);
563             }
564             return false;
565         }
566     }
567 
568     OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
569                   alloc_disp ? base : base_disp,
570                   alloc_disp ? length : size + disp);
571 
572     // add new entry
573 
574     ptr_data = is_targptr ?
575                m_device.find_targetptr_data(base_disp) :
576                m_device.find_ptr_data(base_disp);
577     // if ptr_data is found we just need to check it for overlap
578     if (ptr_data) {
579         is_new = false;
580         base = base_disp;
581     }
582     else {
583         // If association is not found we must create it.
584         length = alloc_disp ? length : size + disp;
585         ptr_data = is_targptr ?
586                m_device.insert_targetptr_data(base, length, is_new) :
587                m_device.insert_ptr_data(base, length, is_new);
588     }
589     if (is_new) {
590 
591         OFFLOAD_TRACE(3, "Added new association\n");
592 
593         if (length > 0) {
594             OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
595 
596             // align should be a power of 2
597             if (!pin && !is_targptr &&
598                 align > 0 && (align & (align - 1)) == 0) {
599                 // offset within mic_buffer. Can do offset optimization
600                 // only when source address alignment satisfies requested
601                 // alignment on the target (cq172736).
602                 if ((reinterpret_cast<intptr_t>(base) & (align - 1)) == 0) {
603                     ptr_data->mic_offset =
604                         reinterpret_cast<intptr_t>(base) & 4095;
605                 }
606             }
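            // mic_offset records the source address's offset within a 4 KB
            // page; buffer_size below is enlarged by the same amount so the
            // data keeps its page offset inside the sink buffer (intent
            // inferred from the alignment note above).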
607 
608             // buffer size and flags
609             uint64_t buffer_size = length + ptr_data->mic_offset;
610 
611             // For targetptr there is no CPU buffer
612             if (pin || !is_targptr) {
613                 // create CPU buffer
614                 OFFLOAD_DEBUG_TRACE_1(3,
615                           GET_OFFLOAD_NUMBER(get_timer_data()),
616                           c_offload_create_buf_host,
617                           "Creating buffer from source memory %p, "
618                           "length %lld\n", base, length);
619 
620                 // The result is not checked because we can continue without a
621                 // cpu buffer. In that case we will use COIBufferRead/Write
622                 // instead of COIBufferCopy.
623 
624                 COI::BufferCreateFromMemory(length,
625                                         COI_BUFFER_OPENCL,
626                                         0,
627                                         base,
628                                         1,
629                                         &m_device.get_process(),
630                                         &ptr_data->cpu_buf);
631             }
632 
633             // create MIC buffer
634             if (is_prealloc) {
635                 OFFLOAD_DEBUG_TRACE_1(3,
636                           GET_OFFLOAD_NUMBER(get_timer_data()),
637                           c_offload_create_buf_mic,
638                           "Creating buffer from sink memory: "
639                           "addr %p, size %lld, offset %d, flags 0x%x\n",
640                           base, buffer_size, ptr_data->mic_offset,
641                           buffer_flags);
642                 res = COI::BufferCreateFromMemory(ptr_data->cpu_addr.length(),
643                                                   COI_BUFFER_NORMAL,
644                                                   COI_SINK_MEMORY,
645                                                   base,
646                                                   1,
647                                                   &m_device.get_process(),
648                                                   &ptr_data->mic_buf);
649                 if (res != COI_SUCCESS) {
650                     if (m_status != 0) {
651                         m_status->result = translate_coi_error(res);
652                     }
653                     else if (m_is_mandatory) {
654                         report_coi_error(c_buf_create, res);
655                     }
656                     ptr_data->alloc_ptr_data_lock.unlock();
657                     return false;
658                 }
659             }
660             else if (is_targptr) {
661                 ptr_data->mic_buf = targptr_buf;
662             }
663             else if (!pin) {
664                 OFFLOAD_DEBUG_TRACE_1(3,
665                           GET_OFFLOAD_NUMBER(get_timer_data()),
666                           c_offload_create_buf_mic,
667                           "Creating buffer for sink: size %lld, offset %d, "
668                           "flags =0x%x\n", buffer_size,
669                           ptr_data->mic_offset, buffer_flags);
670                 res = COI::BufferCreate(buffer_size,
671                                         COI_BUFFER_NORMAL,
672                                         buffer_flags,
673                                         0,
674                                         1,
675                                         &m_device.get_process(),
676                                         &ptr_data->mic_buf);
677                 if (res != COI_SUCCESS) {
678                     if (m_status != 0) {
679                         m_status->result = translate_coi_error(res);
680                     }
681                     else if (m_is_mandatory) {
682                         report_coi_error(c_buf_create, res);
683                     }
684                     ptr_data->alloc_ptr_data_lock.unlock();
685                     return false;
686                 }
687             }
688 
689             if (!pin) {
690                 // make buffer valid on the device.
691                 res = COI::BufferSetState(ptr_data->mic_buf,
692                     m_device.get_process(),
693                     COI_BUFFER_VALID,
694                     COI_BUFFER_NO_MOVE,
695                     0, 0, 0);
696                 if (res != COI_SUCCESS) {
697                     if (m_status != 0) {
698                         m_status->result = translate_coi_error(res);
699                     }
700                     else if (m_is_mandatory) {
701                         report_coi_error(c_buf_set_state, res);
702                     }
703                     ptr_data->alloc_ptr_data_lock.unlock();
704                     return false;
705                 }
706 
707                 res = COI::BufferSetState(ptr_data->mic_buf,
708                     COI_PROCESS_SOURCE,
709                     COI_BUFFER_INVALID,
710                     COI_BUFFER_NO_MOVE,
711                 0, 0, 0);
712                 if (res != COI_SUCCESS) {
713                     if (m_status != 0) {
714                         m_status->result = translate_coi_error(res);
715                     }
716                     else if (m_is_mandatory) {
717                         report_coi_error(c_buf_set_state, res);
718                     }
719                     ptr_data->alloc_ptr_data_lock.unlock();
720                     return false;
721                 }
722             }
723         }
724         ptr_data->alloc_disp = alloc_disp;
725         ptr_data->alloc_ptr_data_lock.unlock();
726     }
727     else {
728         mutex_locker_t locker(ptr_data->alloc_ptr_data_lock);
729 
730         OFFLOAD_TRACE(3, "Found existing association: addr %p, length %lld, "
731                       "is_static %d\n",
732                       ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
733                       ptr_data->is_static);
734 
735         // This is not a new entry. Make sure that provided address range fits
736         // into existing one.
737         MemRange addr_range(base, length);
738         if (!ptr_data->cpu_addr.contains(addr_range)) {
739             LIBOFFLOAD_ERROR(c_bad_ptr_mem_alloc, base, length,
740                            const_cast<void *>(ptr_data->cpu_addr.start()),
741                            ptr_data->cpu_addr.length());
742             exit(1);
743         }
744 
745         // if the entry is associated with static data it may not have buffers
746         // created because they are created on demand.
747         if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
748             return false;
749         }
750     }
751 
752     return true;
753 }
754 
755 bool OffloadDescriptor::find_ptr_data(
756     PtrData* &ptr_data,
757     void *in_base,
758     int64_t disp,
759     int64_t size,
760     bool is_targetptr,
761     bool report_error
762 )
763 {
764     // total length of base
765     int64_t length = size;
766     char *base = reinterpret_cast<char *>(in_base) + disp;
767 
768     OFFLOAD_TRACE(3, "Looking for association for data: addr %p, "
769                   "length %lld\n", base, length);
770 
771     // find existing association in pointer table
772     ptr_data = is_targetptr ?
773                m_device.find_targetptr_data(base) :
774                m_device.find_ptr_data(base);
775     if (ptr_data == 0) {
776         if (report_error) {
777             LIBOFFLOAD_ERROR(c_no_ptr_data, base);
778             exit(1);
779         }
780         OFFLOAD_TRACE(3, "Association does not exist\n");
781         return true;
782     }
783 
784     OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
785                   ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
786                   ptr_data->is_static);
787 
788     // make sure that provided address range fits into existing one
789     MemRange addr_range(base, length);
790     if (!ptr_data->cpu_addr.contains(addr_range)) {
791         if (report_error) {
792             LIBOFFLOAD_ERROR(c_bad_ptr_mem_range, base, length,
793                            const_cast<void *>(ptr_data->cpu_addr.start()),
794                            ptr_data->cpu_addr.length());
795             exit(1);
796         }
797         OFFLOAD_TRACE(3, "Existing association partially overlaps with "
798                       "data address range\n");
799         ptr_data = 0;
800         return true;
801     }
802 
803     // if the entry is associated with static data it may not have buffers
804     // created because they are created on demand.
805     if (ptr_data->is_static && !init_static_ptr_data(ptr_data)) {
806         return false;
807     }
808 
809     return true;
810 }
811 
812 void OffloadDescriptor::find_device_ptr(
813     int64_t* &device_ptr,
814     void *host_ptr
815 )
816 {
817     PtrData* ptr_data;
818     char *base = reinterpret_cast<char *>(host_ptr);
819 
820     OFFLOAD_TRACE(3, "Looking for association for data: addr %p\n", base);
821 
822     // find existing association in pointer table
823     ptr_data = m_device.find_ptr_data(base);
824 
825 //    The MIC address should have been assigned already.
826 //    For now assume it does not exist and fetch the address:
827 //    if ((ptr_data == 0) || ptr_data->mic_addr) {
828 
829     if (ptr_data == 0) {
830        OFFLOAD_TRACE(3, "Association does not exist\n");
831        LIBOFFLOAD_ERROR(c_no_ptr_data, base);
832        exit(1);
833     }
834     if (!ptr_data->mic_addr) {
835        COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
836                                                  &ptr_data->mic_addr);
837        if (res != COI_SUCCESS) {
838            if (m_status != 0)
839                m_status->result = translate_coi_error(res);
840            report_coi_error(c_buf_get_address, res);
841        }
842     }
843 
844     device_ptr = (int64_t *) ptr_data->mic_addr;
845 
846     OFFLOAD_TRACE(3, "Found association: host_ptr %p, device_ptr = %p\n",
847                   ptr_data->cpu_addr.start(), device_ptr);
848 }
849 
850 bool OffloadDescriptor::init_static_ptr_data(PtrData *ptr_data)
851 {
852     OffloadTimer timer(get_timer_data(), c_offload_host_alloc_buffers);
853 
854     if (ptr_data->cpu_buf == 0) {
855         OFFLOAD_TRACE(3, "Creating buffer from source memory %llx\n",
856                       ptr_data->cpu_addr.start());
857 
858         COIRESULT res = COI::BufferCreateFromMemory(
859             ptr_data->cpu_addr.length(),
860             COI_BUFFER_OPENCL,
861             0,
862             const_cast<void*>(ptr_data->cpu_addr.start()),
863             1, &m_device.get_process(),
864             &ptr_data->cpu_buf);
865 
866         if (res != COI_SUCCESS) {
867             if (m_status != 0) {
868                 m_status->result = translate_coi_error(res);
869                 return false;
870             }
871             report_coi_error(c_buf_create_from_mem, res);
872         }
873     }
874 
875     if (ptr_data->mic_buf == 0) {
876         OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n",
877                       ptr_data->mic_addr);
878 
879         COIRESULT res = COI::BufferCreateFromMemory(
880             ptr_data->cpu_addr.length(),
881             COI_BUFFER_NORMAL,
882             COI_SINK_MEMORY,
883             reinterpret_cast<void*>(ptr_data->mic_addr),
884             1, &m_device.get_process(),
885             &ptr_data->mic_buf);
886 
887         if (res != COI_SUCCESS) {
888             if (m_status != 0) {
889                 m_status->result = translate_coi_error(res);
890                 return false;
891             }
892             report_coi_error(c_buf_create_from_mem, res);
893         }
894     }
895 
896     return true;
897 }
898 
899 bool OffloadDescriptor::init_mic_address(PtrData *ptr_data)
900 {
901     if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
902         COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
903                                                   &ptr_data->mic_addr);
904         if (res != COI_SUCCESS) {
905             if (m_status != 0) {
906                 m_status->result = translate_coi_error(res);
907             }
908             else if (m_is_mandatory) {
909                 report_coi_error(c_buf_get_address, res);
910             }
911             return false;
912         }
913     }
914     return true;
915 }
916 
917 bool OffloadDescriptor::nullify_target_stack(
918     COIBUFFER targ_buf,
919     uint64_t size
920 )
921 {
922     char * ptr = (char*)malloc(size);
923     if (ptr == NULL)
924       LIBOFFLOAD_ERROR(c_malloc);
925     COIRESULT res;
926 
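    // Zero-fill a host scratch buffer and write it over the whole target
    // stack buffer so the persistence logic starts from a known state.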
927     memset(ptr, 0, size);
928     res = COI::BufferWrite(
929         targ_buf,
930         0,
931         ptr,
932         size,
933         COI_COPY_UNSPECIFIED,
934         0, 0, 0);
935     free(ptr);
936     if (res != COI_SUCCESS) {
937         if (m_status != 0) {
938             m_status->result = translate_coi_error(res);
939             return false;
940         }
941         report_coi_error(c_buf_write, res);
942     }
943     return true;
944 }
945 
946 static void print_persistList_item(
947     const char *msg,
948     PersistData *cur_el
949 )
950 {
951     OFFLOAD_TRACE(4, "%s\n", msg);
952     OFFLOAD_TRACE(4, "    stack_cpu_addr = %p\n", cur_el->stack_cpu_addr);
953     OFFLOAD_TRACE(4, "    routine_id     = %d\n", cur_el->routine_id);
954     OFFLOAD_TRACE(4, "    thread_id      = %lld\n", cur_el->thread_id);
955     OFFLOAD_TRACE(4, "    stack_ptr_data = %p\n", cur_el->stack_ptr_data);
956     OFFLOAD_TRACE(4, "        MIC buffer = %p\n", cur_el->stack_ptr_data->mic_buf);
957     OFFLOAD_TRACE(4, "        MIC addr   = %p\n", cur_el->stack_ptr_data->mic_addr);
958     OFFLOAD_TRACE(4, "    cpu_stack_addr = %p\n", cur_el->cpu_stack_addr);
959 }
960 
961 static mutex_t stack_memory_manager_lock;
962 
963 bool OffloadDescriptor::offload_stack_memory_manager(
964     const void * stack_begin,
965     int  routine_id,
966     int  buf_size,
967     int  align,
968     bool thread_specific_function_locals,
969     bool *is_new)
970 {
971     //mutex_locker_t locker(stack_alloc_lock);
972     stack_memory_manager_lock.lock();
973 
974     PersistData * new_el;
975     PersistDataList::iterator it_begin = m_device.m_persist_list.begin();
976     PersistDataList::iterator it_end;
977     int erase = 0;
978     uint64_t cur_thread_id = m_device.get_thread_id();
979 
980     OFFLOAD_TRACE(3, "offload_stack_memory_manager("
981         "stack_begin=%p, routine_id=%d, buf_size=%d,"
982         "align=%d, thread_specific_function_locals=%d, bool=%p)\n",
983         stack_begin, routine_id, buf_size,
984         align, thread_specific_function_locals, is_new);
985     OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
986     *is_new = false;
987 
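    // Walk this device's persistent-stack list: entries below the current
    // top of stack for this thread are stale and queued for destruction, an
    // exact match is reused, and an entry above the top of stack ends the scan.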
988     for (PersistDataList::iterator it = m_device.m_persist_list.begin();
989         it != m_device.m_persist_list.end(); it++) {
990         PersistData cur_el = *it;
991 
992         print_persistList_item("Current element in persist list:", &cur_el);
993         if (stack_begin > it->stack_cpu_addr) {
994             if (cur_thread_id == cur_el.thread_id) {
995                 // this stack data must be destroyed
996                 m_destroy_stack.push_front(cur_el.stack_ptr_data);
997                 it_end = it;
998                 erase++;
999                 OFFLOAD_TRACE(3, "Current element below TOS: so delete\n");
1000             }
1001         }
1002         else if (stack_begin == it->stack_cpu_addr) {
1003             if (routine_id != it->routine_id) {
1004                 // this stack data must be destroyed
1005                 // because the current function is a dynamic sibling
1006                 m_destroy_stack.push_front(cur_el.stack_ptr_data);
1007                 it_end = it;
1008                 erase++;
1009                 OFFLOAD_TRACE(3, "Current element is sibling: so delete\n");
1010                 break;
1011             }
1012             else if (!thread_specific_function_locals ||
1013                 cur_thread_id == cur_el.thread_id) {
1014                 // stack data is reused
1015                 m_stack_ptr_data = it->stack_ptr_data;
1016                 if (erase > 0) {
1017                     // all obsolete stack sections must be erased from the list
1018                     m_device.m_persist_list.erase(it_begin, ++it_end);
1019                     m_in_datalen +=
1020                         erase * sizeof(new_el->stack_ptr_data->mic_addr);
1021                 }
1022                 OFFLOAD_TRACE(3, "Reuse of stack buffer with addr %p\n",
1023                                  m_stack_ptr_data->mic_addr);
1024                 stack_memory_manager_lock.unlock();
1025                 return true;
1026             }
1027         }
1028         else if (stack_begin < it->stack_cpu_addr &&
1029                  cur_thread_id == cur_el.thread_id) {
1030             OFFLOAD_TRACE(3, "Current element is above TOS\n");
1031             break;
1032         }
1033     }
1034 
1035     if (erase > 0) {
1036         // all obsolete stack sections must be erased from the list
1037         m_device.m_persist_list.erase(it_begin, ++it_end);
1038         m_in_datalen += erase * sizeof(new_el->stack_ptr_data->mic_addr);
1039     }
1040     // new stack table is created
1041     new_el = new PersistData(stack_begin, routine_id, buf_size, cur_thread_id);
1042     // create MIC buffer
1043     COIRESULT res;
1044     uint32_t buffer_flags = 0;
1045 
1046     // create buffer with large pages if data length exceeds
1047     // large page threshold
1048     if (buf_size >= __offload_use_2mb_buffers) {
1049         buffer_flags = COI_OPTIMIZE_HUGE_PAGE_SIZE;
1050     }
1051     res = COI::BufferCreate(buf_size,
1052         COI_BUFFER_NORMAL,
1053         buffer_flags,
1054         0,
1055         1,
1056         &m_device.get_process(),
1057         &new_el->stack_ptr_data->mic_buf);
1058     if (res != COI_SUCCESS) {
1059         if (m_status != 0) {
1060             m_status->result = translate_coi_error(res);
1061         }
1062         else if (m_is_mandatory) {
1063             report_coi_error(c_buf_create, res);
1064         }
1065         stack_memory_manager_lock.unlock();
1066         return false;
1067     }
1068     // make buffer valid on the device.
1069     res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
1070         m_device.get_process(),
1071         COI_BUFFER_VALID,
1072         COI_BUFFER_NO_MOVE,
1073         0, 0, 0);
1074     if (res != COI_SUCCESS) {
1075         if (m_status != 0) {
1076             m_status->result = translate_coi_error(res);
1077         }
1078         else if (m_is_mandatory) {
1079             report_coi_error(c_buf_set_state, res);
1080         }
1081         stack_memory_manager_lock.unlock();
1082         return false;
1083     }
1084     res = COI::BufferSetState(new_el->stack_ptr_data->mic_buf,
1085         COI_PROCESS_SOURCE,
1086         COI_BUFFER_INVALID,
1087         COI_BUFFER_NO_MOVE,
1088         0, 0, 0);
1089     if (res != COI_SUCCESS) {
1090         if (m_status != 0) {
1091             m_status->result = translate_coi_error(res);
1092         }
1093         else if (m_is_mandatory) {
1094             report_coi_error(c_buf_set_state, res);
1095         }
1096         stack_memory_manager_lock.unlock();
1097         return false;
1098     }
1099     // the persistence algorithm requires the target stack to be nullified initially
1100     if (!nullify_target_stack(new_el->stack_ptr_data->mic_buf, buf_size)) {
1101         stack_memory_manager_lock.unlock();
1102         return false;
1103     }
1104 
1105     m_stack_ptr_data = new_el->stack_ptr_data;
1106     init_mic_address(m_stack_ptr_data);
1107     OFFLOAD_TRACE(3, "Allocating stack buffer with addr %p\n",
1108                       m_stack_ptr_data->mic_addr);
1109     m_device.m_persist_list.push_front(*new_el);
1110     init_mic_address(new_el->stack_ptr_data);
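    // If the first init_mic_address() call above succeeded, this second call
    // is a no-op because the sink address is already cached in the same PtrData.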
1111     *is_new = true;
1112 
1113     stack_memory_manager_lock.unlock();
1114     return true;
1115 }
1116 
1117 // Search through persistent stack buffers
1118 // for the top-of-stack buffer for this thread
1119 char* OffloadDescriptor::get_this_threads_cpu_stack_addr(
1120     const void * stack_begin,
1121     int  routine_id,
1122     bool thread_specific_function_locals
1123 )
1124 {
1125     uint64_t cur_thread_id = m_device.get_thread_id();
1126     char* matched = 0;
1127 
1128     OFFLOAD_TRACE(3, "get_this_threads_cpu_stack_addr("
1129         "stack_begin=%p, routine_id=%d, thread_specific_function_locals=%d)\n",
1130         stack_begin, routine_id, thread_specific_function_locals);
1131     OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
1132 
1133     stack_memory_manager_lock.lock();
1134     for (PersistDataList::iterator it = m_device.m_persist_list.begin();
1135          it != m_device.m_persist_list.end(); it++)
1136     {
1137         PersistData cur_el = *it;
1138         print_persistList_item("Current element in persist list:", &cur_el);
1139         if (stack_begin == cur_el.stack_cpu_addr)
1140         {
1141             // For OpenMP shared function locals matching is done without
1142             // regard to thread id. But, we return the last match, which
1143             // corresponds to the outer stack.
1144             if (!thread_specific_function_locals)
1145             {
1146                 matched = cur_el.cpu_stack_addr;
1147                 continue;
1148             }
1149             // For non-OpenMP shared function-local variables
1150             // the thread-id must match
1151             if (cur_thread_id == cur_el.thread_id)
1152             {
1153                 matched = cur_el.cpu_stack_addr;
1154                 break;
1155             }
1156         }
1157     }
1158     stack_memory_manager_lock.unlock();
1159     if (matched != 0)
1160     {
1161         OFFLOAD_TRACE(3, "get_this_threads_cpu_stack_addr() => %p\n", matched);
1162         return matched;
1163     }
1164 
1165     OFFLOAD_TRACE(1,
1166         "Could not find persistent data; expect Read/Write failure\n");
1167     return 0;
1168 }
1169 
1170 // Search through persistent stack buffers
1171 // for the top-of-stack MIC buffer for this thread
1172 PtrData* OffloadDescriptor::get_this_threads_mic_stack_addr(
1173     const void * stack_begin,
1174     int  routine_id,
1175     bool thread_specific_function_locals
1176 )
1177 {
1178     uint64_t cur_thread_id = m_device.get_thread_id();
1179     PtrData* matched = 0;
1180 
1181     OFFLOAD_TRACE(3, "get_this_threads_mic_stack_addr("
1182         "stack_begin=%p, routine_id=%d, thread_specific_function_locals=%d)\n",
1183         stack_begin, routine_id, thread_specific_function_locals);
1184     OFFLOAD_TRACE(3, "cur_thread_id=%lld\n", cur_thread_id);
1185 
1186     stack_memory_manager_lock.lock();
1187     for (PersistDataList::iterator it = m_device.m_persist_list.begin();
1188          it != m_device.m_persist_list.end(); it++)
1189     {
1190         PersistData cur_el = *it;
1191         print_persistList_item("Current element in persist list:", &cur_el);
1192         if (stack_begin == cur_el.stack_cpu_addr)
1193         {
1194             // For OpenMP shared function locals matching is done without
1195             // regard to thread id. But, we return the last match, which
1196             // corresponds to the outer stack.
1197             if (!thread_specific_function_locals)
1198             {
1199                 matched = cur_el.stack_ptr_data;
1200                 continue;
1201             }
1202             // For non-OpenMP shared function-local variables
1203             // the thread-id must match
1204             if (cur_thread_id == cur_el.thread_id)
1205             {
1206                 matched = cur_el.stack_ptr_data;
1207                 break;
1208             }
1209         }
1210     }
1211     stack_memory_manager_lock.unlock();
1212     if (matched != 0)
1213     {
1214         OFFLOAD_TRACE(3, "get_this_threads_mic_stack_addr() => %p\n", matched);
1215         return matched;
1216     }
1217 
1218     OFFLOAD_TRACE(1,
1219         "Could not find persistent data; expect Read/Write failure\n");
1220     return 0;
1221 }
1222 
1223 void OffloadDescriptor::setup_use_device_ptr(int i)
1224 {
1225     PtrData *ptr_data;
1226     ArrDesc *dvp;
1227     void *base;
1228     if (m_vars_extra[i].type_src == c_dv_ptr) {
1229         dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
1230         base = reinterpret_cast<void*>(dvp->Base);
1231     }
1232     else {
1233         base = *static_cast<void**>(m_vars[i].ptr);
1234     }
1235     if (m_vars[i].direction.in) {
1236         int64_t *device_ptr;
1237         bool    is_new = true;
1238 
1239         find_device_ptr(device_ptr, base);
1240 
1241         // Create an entry in the targetptr table using device_ptr as the
1242         // lookup key so that the host pointer can be recovered later
1243         ptr_data = m_device.insert_targetptr_data(device_ptr,
1244             0, is_new);
1245 
1246         // Here base is actually the host pointer and cpu_addr holds the
1247         // device pointer.  This is a special case where the two address
1248         // roles are reversed so that the existing PtrData structure can be
1249         // reused instead of adding new fields.
1250         ptr_data->mic_addr  = (uint64_t) base;
1251 
1252         ptr_data->alloc_ptr_data_lock.unlock();
1253 
1254         // Replace host pointer with device pointer
1255         if (m_vars_extra[i].type_src == c_dv_ptr) {
1256             dvp->Base = reinterpret_cast<dv_size>(device_ptr);
1257         }
1258         else {
1259             *static_cast<void**>(m_vars[i].ptr) = device_ptr;
1260         }
1261     }
1262     else if (m_vars[i].direction.out) {
1263         // For use_device_ptr with the out direction, find the associated
1264         // host pointer and assign it back to the variable
1265         ptr_data = m_device.find_targetptr_data(base);
1266         if (!ptr_data) {
1267             LIBOFFLOAD_ERROR(c_no_ptr_data, base);
1268             exit(1);
1269         }
1270         if (m_vars_extra[i].type_src == c_dv_ptr) {
1271             dvp->Base = ptr_data->mic_addr;
1272         }
1273         else {
1274             *static_cast<void**>(m_vars[i].ptr) =
1275                 reinterpret_cast<void*>(ptr_data->mic_addr);
1276         }
1277         m_device.remove_targetptr_data(
1278             ptr_data->cpu_addr.start());
1279     }
1280 }
1281 
1282 bool OffloadDescriptor::setup_descriptors(
1283     VarDesc *vars,
1284     VarDesc2 *vars2,
1285     int vars_total,
1286     int entry_id,
1287     const void *stack_addr
1288 )
1289 {
1290     COIRESULT res;
1291     // To enable caching the CPU stack base address for stack variables
1292     char* this_threads_cpu_stack_addr = 0;
1293     // To properly deal with non-OpenMP threading and function-local variables
1294     // For OpenMP threading we support all function-locals in shared mode only
1295     bool thread_specific_function_locals = !omp_in_parallel();
1296 
1297     OffloadTimer timer(get_timer_data(), c_offload_host_setup_buffers);
1298     // make a copy of variable descriptors
1299     m_vars_total = vars_total;
1300     if (vars_total > 0) {
1301         m_vars = (VarDesc*) malloc(m_vars_total * sizeof(VarDesc));
1302         if (m_vars == NULL)
1303           LIBOFFLOAD_ERROR(c_malloc);
1304         memcpy(m_vars, vars, m_vars_total * sizeof(VarDesc));
1305         m_vars_extra = (VarExtra*) malloc(m_vars_total * sizeof(VarExtra));
1306         if (m_vars_extra == NULL)
1307           LIBOFFLOAD_ERROR(c_malloc);
1308     }
1309 
1310     // dependencies
1311     m_in_deps_allocated = m_vars_total + 1;
1312     m_in_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_in_deps_allocated);
1313     if (m_in_deps == NULL)
1314       LIBOFFLOAD_ERROR(c_malloc);
1315     if (m_vars_total > 0) {
1316         m_out_deps_allocated = m_vars_total;
1317         m_out_deps = (COIEVENT*) malloc(sizeof(COIEVENT) * m_out_deps_allocated);
1318         if (m_out_deps == NULL)
1319           LIBOFFLOAD_ERROR(c_malloc);
1320     }
1321     // copyin/copyout data length
1322     m_in_datalen = 0;
1323     m_out_datalen = 0;
1324 
1325     // First pass over variable descriptors
1326     // - Calculate size of the input and output non-pointer data
1327     // - Allocate buffers for input and output pointers
1328     for (int i = 0; i < m_vars_total; i++) {
1329         void*   alloc_base = NULL;
1330         int64_t alloc_disp = 0;
1331         int64_t alloc_size = 0;
1332         bool    src_is_for_mic = (m_vars[i].direction.out ||
1333                                   m_vars[i].into == NULL);
1334         bool    src_is_for_host = (m_vars[i].direction.in ||
1335                                   m_vars[i].into == NULL);
1336         const char *var_sname = "";
1337         if (vars2 != NULL && i < vars_total) {
1338             if (vars2[i].sname != NULL) {
1339                 var_sname = vars2[i].sname;
1340             }
1341         }
1342 
1343         // instead of m_vars[i].type.src we will use m_vars_extra[i].type_src
1344         if (m_vars[i].type.src == c_extended_type) {
1345             VarDescExtendedType *etype =
1346                 reinterpret_cast<VarDescExtendedType*>(m_vars[i].ptr);
1347             m_vars_extra[i].type_src = etype->extended_type;
1348             m_vars[i].ptr            = etype->ptr;
1349         }
1350         else {
1351             m_vars_extra[i].type_src = m_vars[i].type.src;
1352         }
1353         // instead of m_vars[i].type.dst we will use m_vars_extra[i].type_dst
1354         if (m_vars[i].type.dst == c_extended_type) {
1355             VarDescExtendedType *etype =
1356                 reinterpret_cast<VarDescExtendedType*>(m_vars[i].into);
1357             if (etype) {
1358                 m_vars_extra[i].type_dst = etype->extended_type;
1359                 m_vars[i].into           = etype->ptr;
1360             }
1361             else {
1362                 m_vars_extra[i].type_dst = m_vars_extra[i].type_src;
1363             }
1364         }
1365         else {
1366             m_vars_extra[i].type_dst = m_vars[i].type.dst;
1367         }
1368         OFFLOAD_TRACE(2, "   VarDesc %d, var=%s, %s, %s\n",
1369             i, var_sname,
1370             vardesc_direction_as_string[m_vars[i].direction.bits],
1371             vardesc_type_as_string[m_vars_extra[i].type_src]);
1372         if (vars2 != NULL && i < vars_total && vars2[i].dname != NULL) {
1373             OFFLOAD_TRACE(2, "              into=%s, %s\n", vars2[i].dname,
1374                 vardesc_type_as_string[m_vars_extra[i].type_dst]);
1375         }
1376         OFFLOAD_TRACE(2,
1377             "              type_src=%d, type_dstn=%d, direction=%d, "
1378             "alloc_if=%d, free_if=%d, align=%d, mic_offset=%d, flags=0x%x, "
1379             "offset=%lld, size=%lld, count/disp=%lld, ptr=%p, into=%p\n",
1380             m_vars_extra[i].type_src,
1381             m_vars_extra[i].type_dst,
1382             m_vars[i].direction.bits,
1383             m_vars[i].alloc_if,
1384             m_vars[i].free_if,
1385             m_vars[i].align,
1386             m_vars[i].mic_offset,
1387             m_vars[i].flags.bits,
1388             m_vars[i].offset,
1389             m_vars[i].size,
1390             m_vars[i].count,
1391             m_vars[i].ptr,
1392             m_vars[i].into);
1393         // If any varDesc flags bits set, show them
1394         if (console_enabled >= 1 && m_vars[i].flags.bits != 0) {
1395             trace_varDesc_flags(get_timer_data(), m_vars[i].flags);
1396         }
1397 
1398         // preallocated implies targetptr
1399         if (m_vars[i].flags.preallocated) {
1400             // targetptr preallocated alloc_if(1) may not be used with
1401             // an in clause
1402             if (m_vars[i].direction.in && m_vars[i].alloc_if) {
1403                 LIBOFFLOAD_ERROR(c_in_with_preallocated);
1404                 exit(1);
1405             }
1406             m_vars[i].flags.targetptr = 1;
1407         }
1408         if (m_vars[i].alloc != NULL) {
1409             // array descriptor
1410             const Arr_Desc *ap =
1411                 static_cast<const Arr_Desc*>(m_vars[i].alloc);
1412 
1413             // debug dump
1414             ARRAY_DESC_DUMP("    ", "ALLOC", ap, 0, 1);
1415 
1416             __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
1417 
1418             alloc_base = reinterpret_cast<void*>(ap->base);
1419         }
1420 
1421         m_vars_extra[i].alloc = m_vars[i].alloc;
1422         m_vars_extra[i].auto_data = 0;
1423         m_vars_extra[i].cpu_disp = 0;
1424         m_vars_extra[i].cpu_offset = 0;
1425         m_vars_extra[i].src_data = 0;
1426         m_vars_extra[i].read_rng_src = 0;
1427         m_vars_extra[i].read_rng_dst = 0;
1428         m_vars_extra[i].omp_last_event_type = c_last_not;
1429         // flag is_arr_ptr_el is 1 only for var_descs generated
1430         // for c_data_ptr_array type
1431         if (i < vars_total) {
1432             m_vars_extra[i].is_arr_ptr_el = 0;
1433         }
1434         if (TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_src) ||
1435             TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_dst) ||
1436             m_vars[i].flags.is_pointer) {
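            // For pointer-to-pointer variables the offset is kept separately
            // in m_vars_extra and room is reserved in the copy-in data so it
            // can be sent to the target (usage inferred from later passes).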
1437             m_vars_extra[i].pointer_offset = m_vars[i].offset;
1438             m_vars[i].offset = 0;
1439             m_in_datalen += sizeof(m_vars[i].offset);
1440         }
1441 
1442         switch (m_vars_extra[i].type_src) {
1443             case c_data_ptr_array:
1444                 {
1445                     const Arr_Desc *ap;
1446                     const VarDesc3 *vd3 =
1447                         static_cast<const VarDesc3*>(m_vars[i].ptr);
1448                     int flags = vd3->array_fields;
1449                     OFFLOAD_TRACE(2,
1450                         "              pointer array flags = %04x\n", flags);
1451                     OFFLOAD_TRACE(2,
1452                         "              pointer array type is %s\n",
1453                         vardesc_type_as_string[flags & 0x3f]);
1454                     ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
1455                     ARRAY_DESC_DUMP("              ", "ptr array", ap,
1456                                     m_vars[i].flags.is_pointer, 1);
1457                     if (m_vars[i].into) {
1458                         ap = static_cast<const Arr_Desc*>(m_vars[i].into);
1459                         ARRAY_DESC_DUMP(
1460                             "              ", "into array", ap, 0, 1);
1461                     }
1462                     if ((flags & (1<<flag_align_is_array)) != 0) {
1463                         ap = static_cast<const Arr_Desc*>(vd3->align_array);
1464                         ARRAY_DESC_DUMP(
1465                             "              ", "align array", ap, 0, 1);
1466                     }
1467                     if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
1468                         ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
1469                         ARRAY_DESC_DUMP(
1470                             "              ", "alloc_if array", ap, 0, 1);
1471                     }
1472                     if ((flags & (1<<flag_free_if_is_array)) != 0) {
1473                         ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
1474                         ARRAY_DESC_DUMP(
1475                             "              ", "free_if array", ap, 0, 1);
1476                     }
1477                     if ((flags & (1<<flag_extent_start_is_array)) != 0) {
1478                         ap = static_cast<const Arr_Desc*>(vd3->extent_start);
1479                         ARRAY_DESC_DUMP(
1480                             "              ", "extent_start array", ap, 0, 1);
1481                     } else if ((flags &
1482                         (1<<flag_extent_start_is_scalar)) != 0) {
1483                         OFFLOAD_TRACE(2,
1484                             "              extent_start scalar = %lld\n",
1485                             (int64_t)vd3->extent_start);
1486                     }
1487                     if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
1488                         ap = static_cast<const Arr_Desc*>
1489                             (vd3->extent_elements);
1490                         ARRAY_DESC_DUMP("              ",
1491                                         "extent_elements array", ap, 0, 1);
1492                     } else if ((flags &
1493                         (1<<flag_extent_elements_is_scalar)) != 0) {
1494                         OFFLOAD_TRACE(2,
1495                             "              extent_elements scalar = %lld\n",
1496                             (int64_t)vd3->extent_elements);
1497                     }
1498                     if ((flags & (1<<flag_into_start_is_array)) != 0) {
1499                         ap = static_cast<const Arr_Desc*>(vd3->into_start);
1500                         ARRAY_DESC_DUMP(
1501                             "              ", "into_start array", ap, 0, 1);
1502                     } else if ((flags &
1503                         (1<<flag_into_start_is_scalar)) != 0) {
1504                         OFFLOAD_TRACE(2,
1505                             "              into_start scalar = %lld\n",
1506                             (int64_t)vd3->into_start);
1507                     }
1508                     if ((flags & (1<<flag_into_elements_is_array)) != 0) {
1509                         ap = static_cast<const Arr_Desc*>(vd3->into_elements);
1510                         ARRAY_DESC_DUMP(
1511                             "              ", "into_elements array", ap, 0, 1);
1512                     } else if ((flags &
1513                         (1<<flag_into_elements_is_scalar)) != 0) {
1514                         OFFLOAD_TRACE(2,
1515                             "              into_elements scalar = %lld\n",
1516                             (int64_t)vd3->into_elements);
1517                     }
1518                     if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
1519                         ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
1520                         ARRAY_DESC_DUMP(
1521                             "              ", "alloc_start array", ap, 0, 1);
1522                     } else if ((flags &
1523                         (1<<flag_alloc_start_is_scalar)) != 0) {
1524                         OFFLOAD_TRACE(2,
1525                             "              alloc_start scalar = %lld\n",
1526                             (int64_t)vd3->alloc_start);
1527                     }
1528                     if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
1529                         ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
1530                         ARRAY_DESC_DUMP("              ",
1531                                         "alloc_elements array", ap, 0, 1);
1532                     } else if ((flags &
1533                         (1<<flag_alloc_elements_is_scalar)) != 0) {
1534                         OFFLOAD_TRACE(2,
1535                             "              alloc_elements scalar = %lld\n",
1536                             (int64_t)vd3->alloc_elements);
1537                     }
1538                 }
1539                 if (!gen_var_descs_for_pointer_array(i)) {
1540                     return false;
1541                 }
1542                 break;
1543 
1544             case c_data:
1545             case c_void_ptr:
1546             case c_void_ptr_ptr:
1547             case c_cean_var:
1548                 // In all later uses,
1549                 // VarDesc.size will hold the length of the data to be
1550                 // transferred and
1551                 // VarDesc.disp will hold the offset from the base
1552 
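                // Illustrative example (hypothetical clause): for in(a[2:8])
                // on an array of double, assuming the descriptor's base is
                // &a[0] and unit stride, the c_cean_var branch below derives
                // disp = 2 * sizeof(double) and size = 8 * sizeof(double)
                // from the array descriptor; plain variables take the final
                // else branch, where the transfer size is size * count.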
1553                 if (m_vars[i].flags.is_non_cont_struct && src_is_for_host) {
1554                     NonContigDesc *desc =
1555                         static_cast<NonContigDesc*>(m_vars[i].ptr);
1556                     noncont_struct_dump("    ", "DATA", desc);
1557                     m_vars_extra[i].noncont_desc = desc;
1558                     m_vars[i].ptr = reinterpret_cast<void*>(desc->base);
1559                     m_vars[i].size = get_noncont_struct_size(desc);
1560                     m_vars[i].disp = 0;
1561                 }
1562                 else if (m_vars_extra[i].type_src == c_cean_var) {
1563                     // array descriptor
1564                     const Arr_Desc *ap =
1565                         static_cast<const Arr_Desc*>(m_vars[i].ptr);
1566 
1567                     // debug dump
1568                     ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
1569 
1570                     // offset and length are derived from the array descriptor
1571                     __arr_data_offset_and_length(ap, m_vars[i].disp,
1572                                                  m_vars[i].size);
1573                     if (!is_arr_desc_contiguous(ap)) {
1574                         m_vars[i].flags.is_noncont_src = 1;
1575                         m_vars_extra[i].read_rng_src =
1576                             init_read_ranges_arr_desc(ap);
1577                     }
1578                     // all necessary information about length and offset is
1579                     // transferred in the var descriptor, so there is no need
1580                     // to send the array descriptor to the target side.
1581                     m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
1582                 }
1583                 else {
1584                     m_vars[i].size *= m_vars[i].count;
1585                     m_vars[i].disp = 0;
1586                 }
1587 
1588                 if (m_vars[i].direction.bits) {
1589                     // make sure that transfer size > 0
1590                     if (m_vars[i].size <= 0) {
1591                         LIBOFFLOAD_ERROR(c_zero_or_neg_transfer_size);
1592                         exit(1);
1593                     }
1594 
1595                     if (m_vars[i].flags.is_static) {
1596                         PtrData *ptr_data;
1597                         // find data associated with variable
1598                         if (!find_ptr_data(ptr_data,
1599                                            m_vars[i].ptr,
1600                                            m_vars[i].disp,
1601                                            m_vars[i].size,
1602                                            false, false)) {
1603                             return false;
1604                         }
1605 
1606                         if (ptr_data != 0) {
1607                             // offset to base from the beginning of the buffer
1608                             // memory
1609                             m_vars[i].offset =
1610                                 (char*) m_vars[i].ptr -
1611                                 (char*) ptr_data->cpu_addr.start();
1612                         }
1613                         else {
1614                             m_vars[i].flags.is_static = false;
1615                             if (m_vars[i].into == NULL) {
1616                                 m_vars[i].flags.is_static_dstn = false;
1617                             }
1618                         }
1619                         m_vars_extra[i].src_data = ptr_data;
1620                     }
1621 
1622                     if (m_vars[i].direction.in &&
1623                         !m_vars[i].flags.is_static &&
1624                         !m_vars[i].flags.is_stack_buf) {
1625                         m_in_datalen += m_vars[i].size;
1626 
1627                         // for a non-static target destination defined as a
1628                         // CEAN expression we pass its size and disp to the target
1629                         if (m_vars[i].into == NULL &&
1630                             m_vars_extra[i].type_src == c_cean_var) {
1631                             m_in_datalen += 2 * sizeof(uint64_t);
1632                         }
1633                         m_need_runfunction = true;
1634                     }
1635                     if (m_vars[i].direction.out &&
1636                         !m_vars[i].flags.is_static &&
1637                         !m_vars[i].flags.is_stack_buf) {
1638                         m_out_datalen += m_vars[i].size;
1639                         m_need_runfunction = true;
1640                     }
1641                 }
1642                 if (m_is_openmp && src_is_for_host &&
1643                     !m_vars[i].flags.is_device_ptr) {
1644                     if (m_vars[i].flags.is_static) {
1645                         PtrData *ptr_data = m_vars_extra[i].src_data;
1646                         // Static data is transferred either by the omp target
1647                         // update construct, which passes zeros for
1648                         // alloc_if and free_if, or by the always modifier.
1649                         // An implicit OpenMP reference is also transferred
1650                         // if its reference count is equal to 1.
1651                         if (ptr_data &&
1652                             IS_OPENMP_IMPLICIT_OR_LINK(ptr_data->var_alloc_type)) {
1653                             if (m_vars[i].alloc_if) {
1654                                 ptr_data->add_reference();
1655                             }
1656 
1657                             if (!m_vars[i].flags.always_copy &&
1658                                 (m_vars[i].alloc_if || m_vars[i].free_if) &&
1659                                 ptr_data->get_reference() != 1) {
1660                                 m_vars[i].direction.bits = c_parameter_nocopy;
1661                             }
1662                         }
1663                         else if (
1664                             !m_vars[i].flags.always_copy &&
1665                             (m_vars[i].alloc_if || m_vars[i].free_if)) {
1666                                 m_vars[i].direction.bits = c_parameter_nocopy;
1667                         }
1668                     }
1669                     else {
1670                         AutoData *auto_data;
1671                         if (m_vars[i].alloc_if) {
1672                             auto_data = m_device.insert_auto_data(
1673                                 m_vars[i].ptr, m_vars[i].size);
1674                             auto_data->add_reference();
1675                         }
1676                         else {
1677                             // TODO: what should be done if var is not in
1678                             // the table?
1679                             auto_data = m_device.find_auto_data(
1680                                 m_vars[i].ptr);
1681                         }
1682 
1683                         // For automatic variables data is transferred:
1684                         // - if always modifier is used OR
1685                         // - if alloc_if == 0 && free_if == 0 OR
1686                         // - if reference count is 1
1687                         if (!m_vars[i].flags.always_copy &&
1688                             (m_vars[i].alloc_if || m_vars[i].free_if) &&
1689                             auto_data != 0 &&
1690                             auto_data->get_reference() != 1) {
1691                                 m_vars[i].direction.bits = c_parameter_nocopy;
1692                         }
1693 
1694                         // save data for later use
1695                         m_vars_extra[i].auto_data = auto_data;
1696                     }
1697                 }
1698                 break;
1699 
1700             case c_dv:
1701                 if (m_vars[i].flags.use_device_ptr) {
1702                     setup_use_device_ptr(i);
1703                     break;
1704                 }
1705                 else if (m_vars[i].direction.bits ||
1706                     m_vars[i].alloc_if ||
1707                     m_vars[i].free_if) {
1708                     ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].ptr);
1709 
1710                     // debug dump
1711                     __dv_desc_dump("IN/OUT", dvp);
1712 
1713                     // send dope vector contents excluding base
1714                     m_in_datalen += m_vars[i].size - sizeof(uint64_t);
1715                     m_need_runfunction = true;
1716                 }
1717                 break;
1718 
1719             case c_string_ptr:
1720             case c_string_ptr_ptr:
1721                 if ((m_vars[i].direction.bits ||
1722                      m_vars[i].alloc_if ||
1723                      m_vars[i].free_if) &&
1724                     m_vars[i].size == 0) {
1725                     m_vars[i].size = 1;
1726                     m_vars[i].count =
1727                         strlen(*static_cast<char**>(m_vars[i].ptr)) + 1;
1728                 }
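                // From here size == 1 and count == strlen + 1, so the shared
                // pointer handling below (size *= count) transfers the whole
                // NUL-terminated string.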
1729                 /* fallthru */
1730 
1731             case c_data_ptr:
1732             case c_data_ptr_ptr:
1733                 if (m_vars[i].flags.is_stack_buf &&
1734                     !m_vars[i].direction.bits &&
1735                     m_vars[i].alloc_if) {
1736                     // this var_desc is for stack buffer
1737                     bool is_new;
1738 
1739                     if (!offload_stack_memory_manager(
1740                             stack_addr, entry_id,
1741                             m_vars[i].count, m_vars[i].align,
1742                             thread_specific_function_locals, &is_new)) {
1743                         return false;
1744                     }
1745                     if (is_new) {
1746                         m_compute_buffers.push_back(
1747                             m_stack_ptr_data->mic_buf);
1748                         m_device.m_persist_list.front().cpu_stack_addr =
1749                             static_cast<char*>(m_vars[i].ptr);
1750                         PersistData *new_el = &m_device.m_persist_list.front();
1751                         print_persistList_item(
1752                             "New element in persist list:",
1753                             new_el);
1754                     }
1755                     else {
1756                         m_vars[i].flags.sink_addr = 1;
1757                         m_in_datalen += sizeof(m_stack_ptr_data->mic_addr);
1758                         if (thread_specific_function_locals) {
1759                             m_stack_ptr_data = get_this_threads_mic_stack_addr(
1760                                 stack_addr, entry_id,
1761                                 thread_specific_function_locals);
1762                         }
1763                     }
1764                     m_vars[i].size = m_destroy_stack.size();
1765                     m_vars_extra[i].src_data = m_stack_ptr_data;
1766 
1767                     // need to add or remove references for stack buffer at target
1768                     if (is_new || m_destroy_stack.size()) {
1769                         m_need_runfunction = true;
1770                     }
1771 
1772                     break;
1773                 }
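                // In short: a newly created persistent stack buffer is added
                // to the compute-buffer list and recorded in the persist
                // list; for an existing one only its target address is sent
                // down (sink_addr), using the per-thread copy when
                // thread_specific_function_locals is set.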
1774                 /* fallthru */
1775 
1776             case c_cean_var_ptr:
1777             case c_cean_var_ptr_ptr:
1778             case c_dv_ptr:
1779                 if (m_vars[i].flags.is_non_cont_struct && src_is_for_host) {
1780                     NonContigDesc *desc =
1781                         static_cast<NonContigDesc*>(m_vars[i].ptr);
1782                     noncont_struct_dump("    ", "PTR", desc);
1783                     m_vars_extra[i].noncont_desc = desc;
1784                     m_vars[i].ptr = reinterpret_cast<void*>(desc->base);
1785                     m_vars[i].disp = 0;
1786                 }
1787                 else if (m_vars_extra[i].type_src == c_cean_var_ptr ||
1788                          m_vars_extra[i].type_src == c_cean_var_ptr_ptr) {
1789                     // array descriptor
1790                     const Arr_Desc *ap =
1791                         static_cast<const Arr_Desc*>(m_vars[i].ptr);
1792 
1793                     // debug dump
1794                     ARRAY_DESC_DUMP("", "IN/OUT", ap, 1, !src_is_for_mic);
1795 
1796                     // offset and length are derived from the array descriptor
1797                     __arr_data_offset_and_length(ap, m_vars[i].disp,
1798                                                  m_vars[i].size);
1799 
1800                     if (!is_arr_desc_contiguous(ap)) {
1801                         m_vars[i].flags.is_noncont_src = 1;
1802                         m_vars_extra[i].read_rng_src =
1803                             init_read_ranges_arr_desc(ap);
1804                     }
1805                     // all necessary information about length and offset is
1806                     // transferred in the var descriptor, so there is no need
1807                     // to send the array descriptor to the target side.
1808                     m_vars[i].ptr = reinterpret_cast<void*>(ap->base);
1809                 }
1810                 else if (m_vars_extra[i].type_src == c_dv_ptr) {
1811                     // need to send DV to the device unless it is 'nocopy'
1812                     if (m_vars[i].direction.bits ||
1813                         m_vars[i].alloc_if ||
1814                         m_vars[i].free_if) {
1815                         ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].ptr);
1816 
1817                         // debug dump
1818                         __dv_desc_dump("IN/OUT", dvp);
1819 
1820                         // for use_device_ptr don't need to change
1821                         // OUT direction to IN direction
1822                         if (!m_vars[i].flags.use_device_ptr) {
1823                             m_vars[i].direction.bits = c_parameter_in;
1824                         }
1825                     }
1826 
1827                     // no displacement
1828                     m_vars[i].disp = 0;
1829                 }
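                // The dope vector descriptor itself must be present on the
                // target, so its transfer direction is forced to IN above;
                // the data it describes is handled separately (see the
                // c_dv_data / c_dv_ptr_data cases below).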
1830                 else {
1831                     // For "use_device_ptr" if direction is "in" then need to
1832                     // find the associated device pointer and replace the host
1833                     // pointer with device pointer.  Also save the host pointer
1834                     // to restore when "out" is encountered.
1835                     // For "out" find the host pointer associated with the
1836                     // device pointer and restore the host pointer
1837                     if (m_vars[i].flags.use_device_ptr && src_is_for_host) {
1838                           setup_use_device_ptr(i);
1839                           break;
1840                     }
1841 
1842                     // c_data_ptr or c_string_ptr
1843                     m_vars[i].size *= m_vars[i].count;
1844                     m_vars[i].disp = 0;
1845                 }
1846 
1847                 if (m_vars[i].direction.bits ||
1848                     m_vars[i].alloc_if ||
1849                     m_vars[i].free_if) {
1850                     PtrData *ptr_data;
1851 
1852                     // check that buffer length > 0
1853                     if (m_vars[i].alloc_if &&
1854                         m_vars[i].disp + m_vars[i].size <
1855                         (m_is_openmp ? 0 : 1)) {
1856                         LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
1857                         exit(1);
1858                     }
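                    // Note: under OpenMP a zero-length pointer mapping is
                    // accepted here (only negative lengths are rejected),
                    // while the classic offload model requires at least one
                    // byte.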
1859 
1860                     // base address
1861                     void *base = *static_cast<void**>(m_vars[i].ptr);
1862 
1863                     // allocate a buffer if we have no INTO and don't need
1864                     // an allocation for the ptr at the target
1865                     if (src_is_for_mic) {
1866                         if (m_vars[i].flags.is_stack_buf) {
1867                             // for stack persistent objects ptr data is created
1868                             // by var_desc with number 0.
1869                             // Its ptr_data is stored at m_stack_ptr_data
1870                             ptr_data = m_stack_ptr_data;
1871                         }
1872                         else if (m_vars[i].alloc_if) {
1873                             if (m_vars[i].flags.preallocated) {
1874                                 m_out_datalen += sizeof(void*);
1875                                 m_need_runfunction = true;
1876                                 break;
1877                             }
1878                             // add new entry
1879                             if (!alloc_ptr_data(
1880                                     ptr_data,
1881                                     reinterpret_cast<char *>(base) + alloc_disp,
1882                                     (alloc_base != NULL) ?
1883                                         alloc_disp : m_vars[i].disp,
1884                                     (alloc_base != NULL) ?
1885                                         alloc_size : m_vars[i].size,
1886                                     alloc_disp,
1887                                     (alloc_base != NULL) ?
1888                                         0 : m_vars[i].align,
1889                                     m_vars[i].flags.targetptr,
1890                                     0,
1891                                     m_vars[i].flags.pin)) {
1892                                 return false;
1893                             }
1894                             if (m_vars[i].flags.targetptr) {
1895                                 if (!init_mic_address(ptr_data)) {
1896                                     return false;
1897                                 }
1898                                 *static_cast<void**>(m_vars[i].ptr) = base =
1899                                   reinterpret_cast<void*>(ptr_data->mic_addr);
1900                             }
1901                             if (ptr_data->add_reference() == 0 &&
1902                                 ptr_data->mic_buf != 0) {
1903                                 // add buffer to the list of buffers that
1904                                 // are passed to dispatch call
1905                                 m_compute_buffers.push_back(
1906                                     ptr_data->mic_buf);
1907                             }
1908                             else if (!m_vars[i].flags.pin &&
1909                                      !m_vars[i].flags.preallocated) {
1910                                 // will send buffer address to device
1911                                 m_vars[i].flags.sink_addr = 1;
1912                                 m_in_datalen += sizeof(ptr_data->mic_addr);
1913                             }
1914 
1915                             if (!m_vars[i].flags.pin &&
1916                                 !ptr_data->is_static) {
1917                                 // need to add reference for buffer
1918                                 m_need_runfunction = true;
1919                             }
1920                         }
1921                         else {
1922                             bool error_if_not_found = true;
1923                             if (m_is_openmp) {
1924                                 // For omp target update the variable is
1925                                 // ignored if it does not exist.
1926                                 if (m_vars[i].flags.always_copy ||
1927                                     (!m_vars[i].alloc_if &&
1928                                      !m_vars[i].free_if)) {
1929                                     error_if_not_found = false;
1930                                 }
1931                             }
1932 
1933                             // use existing association from pointer table
1934                             if (!find_ptr_data(ptr_data,
1935                                                base,
1936                                                m_vars[i].disp,
1937                                                m_vars[i].size,
1938                                                m_vars[i].flags.targetptr,
1939                                                error_if_not_found)) {
1940                                 return false;
1941                             }
1942 
1943                             if (m_is_openmp) {
1944                                 // make var nocopy if it does not exist
1945                                 if (ptr_data == 0) {
1946                                     m_vars[i].direction.bits =
1947                                         c_parameter_nocopy;
1948                                 }
1949                             }
1950 
1951                             if (ptr_data != 0) {
1952                                 m_vars[i].flags.sink_addr = 1;
1953                                 m_in_datalen += sizeof(ptr_data->mic_addr);
1954                             }
1955                         }
1956 
1957                         if (ptr_data != 0) {
1958 
1959                             if (ptr_data->alloc_disp != 0) {
1960                                 m_vars[i].flags.alloc_disp = 1;
1961                                 m_in_datalen += sizeof(alloc_disp);
1962                             }
1963 
1964                             if (m_vars[i].flags.sink_addr) {
1965                                 // get the buffer's address on the sink
1966                                 if (!init_mic_address(ptr_data)) {
1967                                     return false;
1968                                 }
1969 
1970                                 m_in_datalen += sizeof(ptr_data->mic_addr);
1971                             }
1972 
1973                             if (!m_vars[i].flags.pin &&
1974                                 !ptr_data->is_static && m_vars[i].free_if) {
1975                                 // need to decrement buffer reference on target
1976                                 m_need_runfunction = true;
1977                             }
1978 
1979                             // offset to base from the beginning of the buffer
1980                             // memory
1981                             m_vars[i].offset = (char*) base -
1982                                 (char*) ptr_data->cpu_addr.start();
1983 
1984                             // copy other pointer properties to var descriptor
1985                             m_vars[i].mic_offset = ptr_data->mic_offset;
1986                             m_vars[i].flags.is_static = ptr_data->is_static;
1987                         }
1988                     }
1989                     else {
1990                         if (!find_ptr_data(ptr_data,
1991                                            base,
1992                                            m_vars[i].disp,
1993                                            m_vars[i].size,
1994                                            false, false)) {
1995                             return false;
1996                         }
1997                         if (ptr_data) {
1998                             m_vars[i].offset =
1999                                 (char*) base -
2000                                 (char*) ptr_data->cpu_addr.start();
2001                         }
2002                     }
2003 
2004                     if (m_is_openmp) {
2005                         if (m_vars[i].flags.use_device_ptr) {
2006                             setup_use_device_ptr(i);
2007                         }
2008                         // for TO transfer of stack buffer's variable
2009                         if (src_is_for_host && m_vars[i].flags.is_stack_buf) {
2010                             AutoData *auto_data;
2011                             char *base = *static_cast<char**>(m_vars[i].ptr);
2012                             if (m_vars[i].alloc_if) {
2013                                 auto_data = m_device.insert_auto_data(
2014                                     base + m_vars[i].disp,
2015                                     m_vars[i].size);
2016                                 auto_data->add_reference();
2017                             }
2018                             else {
2019                                 auto_data = m_device.find_auto_data(
2020                                     base + m_vars[i].disp);
2021                             }
2022                             // save data for later use
2023                             m_vars_extra[i].auto_data = auto_data;
2024 
2025                             // For automatic variables
2026                             // data is transferred:
2027                             // - if always modifier is used OR
2028                             // - if alloc_if == 0 && free_if == 0 OR
2029                             // - if reference count is 1
2030                             if (!m_vars[i].flags.always_copy &&
2031                                 (m_vars[i].alloc_if ||
2032                                 m_vars[i].free_if) &&
2033                                 auto_data != 0 &&
2034                                 auto_data->get_reference() != 1) {
2035                                     m_vars[i].direction.bits =
2036                                         c_parameter_nocopy;
2037                             }
2038                         }
2039                         // for FROM transfer of a global pointer variable;
2040                         // FROM transfer of a stack buffer's variable
2041                         // is handled at the INTO branch
2042                         else if (src_is_for_mic &&
2043                             !m_vars[i].flags.is_stack_buf) {
2044                                 // data is transferred only if the always
2045                                 // modifier is used, alloc_if == 0 && free_if == 0,
2046                                 // or the reference count is 1
2047                                 if (!m_vars[i].flags.always_copy &&
2048                                     (m_vars[i].alloc_if ||
2049                                     m_vars[i].free_if) &&
2050                                     ptr_data &&
2051                                     ptr_data->get_reference() != 1)
2052                                 {
2053                                     m_vars[i].direction.bits =
2054                                         c_parameter_nocopy;
2055                                 }
2056                         }
2057                     }
2058                     // save pointer data
2059                     m_vars_extra[i].src_data = ptr_data;
2060                 }
2061                 break;
2062 
2063             case c_func_ptr:
2064             case c_func_ptr_ptr:
2065                 if (m_vars[i].direction.in) {
2066                     m_in_datalen += __offload_funcs.max_name_length();
2067                 }
2068                 if (m_vars[i].direction.out) {
2069                     m_out_datalen += __offload_funcs.max_name_length();
2070                 }
2071                 m_need_runfunction = true;
2072                 break;
2073 
2074             case c_dv_data:
2075             case c_dv_ptr_data:
2076             case c_dv_data_slice:
2077             case c_dv_ptr_data_slice:
2078                 ArrDesc *dvp;
2079                 if (m_vars[i].flags.is_non_cont_struct) {
2080                     NonContigDesc *desc =
2081                         static_cast<NonContigDesc*>(m_vars[i].ptr);
2082                     noncont_struct_dump("    ", "DV-DATA", desc);
2083                     dvp = reinterpret_cast<ArrDesc*>(desc->base);
2084                 }
2085                 else if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
2086                     const Arr_Desc *ap;
2087                     ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
2088 
2089                     dvp = (m_vars_extra[i].type_src == c_dv_data_slice) ?
2090                           reinterpret_cast<ArrDesc*>(ap->base) :
2091                           *reinterpret_cast<ArrDesc**>(ap->base);
2092                 }
2093                 else {
2094                     dvp = (m_vars_extra[i].type_src == c_dv_data) ?
2095                           static_cast<ArrDesc*>(m_vars[i].ptr) :
2096                           *static_cast<ArrDesc**>(m_vars[i].ptr);
2097                 }
2098 
2099                 // if the allocatable dope vector isn't allocated, don't
2100                 // transfer its data
2101                 if (!__dv_is_allocated(dvp)) {
2102                     m_vars[i].direction.bits = c_parameter_nocopy;
2103                     m_vars[i].alloc_if = 0;
2104                     m_vars[i].free_if = 0;
2105                 }
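                // e.g. a Fortran allocatable array that has not been
                // allocated yet reaches this point with an unallocated dope
                // vector: nothing is transferred and any alloc_if/free_if
                // request is dropped.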
2106                 if (m_vars[i].direction.bits ||
2107                     m_vars[i].alloc_if ||
2108                     m_vars[i].free_if) {
2109                     const Arr_Desc *ap;
2110 
2111                     if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
2112                         ap = static_cast<const Arr_Desc*>(m_vars[i].ptr);
2113 
2114                         // debug dump
2115                         ARRAY_DESC_DUMP("", "IN/OUT", ap, 0, !src_is_for_mic);
2116                     }
2117                     if (!__dv_is_contiguous(dvp)) {
2118                         m_vars[i].flags.is_noncont_src = 1;
2119                         m_vars_extra[i].read_rng_src =
2120                             init_read_ranges_dv(dvp);
2121                     }
2122 
2123                     // size and displacement
2124                     if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src)) {
2125                         // offset and length are derived from the
2126                         // array descriptor
2127                         __arr_data_offset_and_length(ap,
2128                                                      m_vars[i].disp,
2129                                                      m_vars[i].size);
2130                         if (m_vars[i].direction.bits) {
2131                             if (!is_arr_desc_contiguous(ap)) {
2132                                 if (m_vars[i].flags.is_noncont_src) {
2133                                     LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
2134                                     return false;
2135                                 }
2136                                 m_vars[i].flags.is_noncont_src = 1;
2137                                 m_vars_extra[i].read_rng_src =
2138                                     init_read_ranges_arr_desc(ap);
2139                             }
2140                         }
2141                     }
2142                     else {
2143                         if (m_vars[i].flags.has_length) {
2144                             m_vars[i].size =
2145                                 __dv_data_length(dvp, m_vars[i].count);
2146                         }
2147                         else {
2148                             m_vars[i].size = __dv_data_length(dvp);
2149                         }
2150                         m_vars[i].disp = 0;
2151                     }
2152 
2153                     // check that length >= 0
2154                     if (m_vars[i].alloc_if &&
2155                         (m_vars[i].disp + m_vars[i].size < 0)) {
2156                         LIBOFFLOAD_ERROR(c_zero_or_neg_ptr_len);
2157                         exit(1);
2158                     }
2159 
2160                     // base address
2161                     void *base = reinterpret_cast<void*>(dvp->Base);
2162                     PtrData *ptr_data;
2163 
2164                     // allocate a buffer if we have no INTO and don't need
2165                     // an allocation for the ptr at the target
2166                     if (src_is_for_mic) {
2167                         if (m_vars[i].alloc_if) {
2168                             // add new entry
2169                             if (!alloc_ptr_data(
2170                                     ptr_data,
2171                                     reinterpret_cast<char *>(base) + alloc_disp,
2172                                     (alloc_base != NULL) ?
2173                                         alloc_disp : m_vars[i].disp,
2174                                     (alloc_base != NULL) ?
2175                                         alloc_size : m_vars[i].size,
2176                                     alloc_disp,
2177                                     (alloc_base != NULL) ?
2178                                         0 : m_vars[i].align,
2179                                     m_vars[i].flags.targetptr,
2180                                     m_vars[i].flags.preallocated,
2181                                     m_vars[i].flags.pin)) {
2182                                 return false;
2183                             }
2184 
2185                             if (ptr_data->add_reference() == 0 &&
2186                                 ptr_data->mic_buf != 0) {
2187                                 // add buffer to the list of buffers
2188                                 // that are passed to dispatch call
2189                                 m_compute_buffers.push_back(
2190                                     ptr_data->mic_buf);
2191                             }
2192                             else {
2193                                 // will send buffer address to device
2194                                 m_vars[i].flags.sink_addr = 1;
2195                             }
2196 
2197                             if (!ptr_data->is_static) {
2198                                 // need to add reference for buffer
2199                                 m_need_runfunction = true;
2200                             }
2201                         }
2202                         else {
2203                             bool error_if_not_found = true;
2204                             if (m_is_openmp) {
2205                                 // For omp target update the variable is
2206                                 // ignored if it does not exist.
2207                                 if (m_vars[i].flags.always_copy ||
2208                                     (!m_vars[i].alloc_if &&
2209                                      !m_vars[i].free_if)) {
2210                                     error_if_not_found = false;
2211                                 }
2212                             }
2213 
2214                             // use existing association from pointer table
2215                             if (!find_ptr_data(ptr_data,
2216                                                base,
2217                                                m_vars[i].disp,
2218                                                m_vars[i].size,
2219                                                m_vars[i].flags.targetptr,
2220                                                error_if_not_found)) {
2221                                 return false;
2222                             }
2223 
2224                             if (m_is_openmp) {
2225                                 // make var nocopy if it does not exist
2226                                 if (ptr_data == 0) {
2227                                     m_vars[i].direction.bits =
2228                                         c_parameter_nocopy;
2229                                 }
2230                             }
2231 
2232                             if (ptr_data != 0) {
2233                                 // need to update base in dope vector on device
2234                                 m_vars[i].flags.sink_addr = 1;
2235                             }
2236                         }
2237 
2238                         if (ptr_data != 0) {
2239                             if (m_is_openmp) {
2240                                 // data is transferred:
2241                                 // - if always modifier is used OR
2242                                 // - if alloc_if == 0 && free_if == 0 OR
2243                                 // - if reference count is 1
2244                                 if (!m_vars[i].flags.always_copy &&
2245                                     (m_vars[i].alloc_if ||
2246                                      m_vars[i].free_if) &&
2247                                     ptr_data->get_reference() != 1) {
2248                                     m_vars[i].direction.bits =
2249                                         c_parameter_nocopy;
2250                                 }
2251                             }
2252 
2253                             if (ptr_data->alloc_disp != 0) {
2254                                 m_vars[i].flags.alloc_disp = 1;
2255                                 m_in_datalen += sizeof(alloc_disp);
2256                             }
2257 
2258                             if (m_vars[i].flags.sink_addr) {
2259                                 // get the buffer's address on the sink
2260                                 if (!init_mic_address(ptr_data)) {
2261                                     return false;
2262                                 }
2263 
2264                                 m_in_datalen += sizeof(ptr_data->mic_addr);
2265                             }
2266 
2267                             if (!ptr_data->is_static && m_vars[i].free_if) {
2268                                 // need to decrement buffer reference on target
2269                                 m_need_runfunction = true;
2270                             }
2271 
2272                             // offset to base from the beginning of the buffer
2273                             // memory
2274                             m_vars[i].offset =
2275                                 (char*) base -
2276                                 (char*) ptr_data->cpu_addr.start();
2277 
2278                             // copy other pointer properties to var descriptor
2279                             m_vars[i].mic_offset = ptr_data->mic_offset;
2280                             m_vars[i].flags.is_static = ptr_data->is_static;
2281                         }
2282                     }
2283                     else { // !src_is_for_mic
2284                         if (!find_ptr_data(ptr_data,
2285                                            base,
2286                                            m_vars[i].disp,
2287                                            m_vars[i].size,
2288                                            false, false)) {
2289                             return false;
2290                         }
2291                         m_vars[i].offset = !ptr_data ? 0 :
2292                                 (char*) base -
2293                                 (char*) ptr_data->cpu_addr.start();
2294                     }
2295 
2296                     // save pointer data
2297                     m_vars_extra[i].src_data = ptr_data;
2298                 }
2299                 break;
2300 
2301             default:
2302                 LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars_extra[i].type_src);
2303                 LIBOFFLOAD_ABORT;
2304         }
2305         if (m_vars_extra[i].type_src == c_data_ptr_array) {
2306             continue;
2307         }
2308 
2309         if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
2310             if (this_threads_cpu_stack_addr == 0) {
2311                 this_threads_cpu_stack_addr =
2312                     get_this_threads_cpu_stack_addr(
2313                         stack_addr, entry_id, thread_specific_function_locals);
2314             }
2315             m_vars[i].offset = static_cast<char*>
2316                                    (m_vars[i].ptr) -
2317                                     this_threads_cpu_stack_addr;
2318         }
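        // The offset recorded above is relative to this thread's CPU stack
        // base, so the target side can locate the matching slot in its own
        // persistent stack buffer.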
2319         // if source is used at CPU save its offset and disp
2320         if (m_vars[i].into == NULL || m_vars[i].direction.in) {
2321             m_vars_extra[i].cpu_offset = m_vars[i].offset;
2322             m_vars_extra[i].cpu_disp   = m_vars[i].disp;
2323         }
2324 
2325         // If "into" is defined we need to do similar work for it
2326         if (!m_vars[i].into) {
2327             continue;
2328         }
2329 
2330         int64_t into_disp = 0, into_offset = 0;
2331 
2332         switch (m_vars_extra[i].type_dst) {
2333             case c_data_ptr_array:
2334                 break;
2335             case c_data:
2336             case c_void_ptr:
2337             case c_void_ptr_ptr:
2338             case c_cean_var: {
2339                 int64_t size = m_vars[i].size;
2340 
2341                 if (m_vars[i].flags.is_non_cont_struct && src_is_for_mic) {
2342                     NonContigDesc *desc =
2343                         static_cast<NonContigDesc*>(m_vars[i].into);
2344                     noncont_struct_dump("", "INTO DATA", desc);
2345                     m_vars_extra[i].noncont_desc = desc;
2346                     m_vars[i].into = reinterpret_cast<void*>(desc->base);
2347                     size = get_noncont_struct_size(desc);
2348                     into_disp = 0;
2349                 }
2350                 else if (m_vars_extra[i].type_dst == c_cean_var) {
2351                     // array descriptor
2352                     const Arr_Desc *ap =
2353                         static_cast<const Arr_Desc*>(m_vars[i].into);
2354 
2355                     // debug dump
2356                     ARRAY_DESC_DUMP("    ", "INTO", ap, 0, src_is_for_mic);
2357 
2358                     // offset and length are derived from the array descriptor
2359                     __arr_data_offset_and_length(ap, into_disp, size);
2360 
2361                     if (!is_arr_desc_contiguous(ap)) {
2362                         m_vars[i].flags.is_noncont_dst = 1;
2363                         m_vars_extra[i].read_rng_dst =
2364                             init_read_ranges_arr_desc(ap);
2365                         if (!cean_ranges_match(
2366                             m_vars_extra[i].read_rng_src,
2367                             m_vars_extra[i].read_rng_dst)) {
2368                             LIBOFFLOAD_ERROR(c_ranges_dont_match);
2369                             exit(1);
2370                         }
2371                     }
2372                     m_vars[i].into = reinterpret_cast<void*>(ap->base);
2373                 }
2374 
2375                 int64_t size_src = m_vars_extra[i].read_rng_src &&
2376                                    !m_vars[i].flags.is_non_cont_struct ?
2377                     cean_get_transf_size(m_vars_extra[i].read_rng_src) :
2378                     m_vars[i].size;
2379                 int64_t size_dst = m_vars_extra[i].read_rng_dst ?
2380                     cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
2381                     size;
2382                 // The "into" size must not be less than
2383                 // the src size
2384                 if (size_src > size_dst) {
2385                     LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
2386                                      size_src, size_dst);
2387                     exit(1);
2388                 }
2389 
2390                 if (m_vars[i].direction.bits) {
2391                     if (m_vars[i].flags.is_static_dstn) {
2392                         PtrData *ptr_data;
2393 
2394                         // find data associated with variable
2395                         if (!find_ptr_data(ptr_data, m_vars[i].into,
2396                                            into_disp, size, false, false)) {
2397                             return false;
2398                         }
2399                         if (ptr_data != 0) {
2400                             // offset to base from the beginning of the buffer
2401                             // memory
2402                             into_offset =
2403                                 (char*) m_vars[i].into -
2404                                 (char*) ptr_data->cpu_addr.start();
2405                         }
2406                         else {
2407                             m_vars[i].flags.is_static_dstn = false;
2408                         }
2409                         m_vars_extra[i].dst_data = ptr_data;
2410                     }
2411                 }
2412 
2413                 if (m_vars[i].direction.in &&
2414                     !m_vars[i].flags.is_static_dstn) {
2415                     m_in_datalen += m_vars[i].size;
2416 
2417                     // for a non-static target destination defined as a
2418                     // CEAN expression we pass its size and disp to the target
2419                     if (m_vars_extra[i].type_dst == c_cean_var) {
2420                         m_in_datalen += 2 * sizeof(uint64_t);
2421                     }
2422                     m_need_runfunction = true;
2423                 }
2424 
2425                 if (m_is_openmp && src_is_for_mic) {
2426                     if (m_vars[i].flags.is_static_dstn) {
2427                         // Static data is transferred either by the omp target
2428                         // update construct, which passes zeros for
2429                         // alloc_if and free_if, or by the always modifier.
2430                         if (!m_vars[i].flags.always_copy &&
2431                             (m_vars[i].alloc_if || m_vars[i].free_if)) {
2432                                 m_vars[i].direction.bits = c_parameter_nocopy;
2433                         }
2434                     }
2435                     else {
2436                         AutoData *auto_data;
2437                         if (m_vars[i].alloc_if) {
2438                             auto_data = m_device.insert_auto_data(
2439                                 m_vars[i].into, size_dst);
2440                             auto_data->add_reference();
2441                         }
2442                         else {
2443                             // TODO: what should be done if var is not in
2444                             // the table?
2445                             auto_data = m_device.find_auto_data(
2446                                 m_vars[i].into);
2447                         }
2448 
2449                         // For automatic variables data is transferred:
2450                         // - if always modifier is used OR
2451                         // - if alloc_if == 0 && free_if == 0 OR
2452                         // - if reference count is 1
2453                         if (!m_vars[i].flags.always_copy &&
2454                             (m_vars[i].alloc_if || m_vars[i].free_if) &&
2455                             (auto_data == 0 ||
2456                             auto_data->get_reference() != 1)) {
2457                                 m_vars[i].direction.bits = c_parameter_nocopy;
2458                         }
2459                         // save data for later use
2460                         m_vars_extra[i].auto_data = auto_data;
2461                     }
2462                 }
2463                 break;
2464             }
2465 
2466             case c_dv:
2467                 if (m_vars[i].direction.bits ||
2468                     m_vars[i].alloc_if ||
2469                     m_vars[i].free_if) {
2470                     ArrDesc *dvp = static_cast<ArrDesc*>(m_vars[i].into);
2471 
2472                     // debug dump
2473                     __dv_desc_dump("INTO", dvp);
2474 
2475                     // send dope vector contents excluding base
2476                     m_in_datalen += m_vars[i].size - sizeof(uint64_t);
2477                     m_need_runfunction = true;
2478                 }
2479                 break;
2480 
2481             case c_string_ptr:
2482             case c_data_ptr:
2483             case c_string_ptr_ptr:
2484             case c_data_ptr_ptr:
2485             case c_cean_var_ptr:
2486             case c_cean_var_ptr_ptr:
2487             case c_dv_ptr: {
2488                 int64_t size = m_vars[i].size;
2489 
2490                 if (m_vars_extra[i].type_dst == c_cean_var_ptr ||
2491                     m_vars_extra[i].type_dst == c_cean_var_ptr_ptr) {
2492                     // array descriptor
2493                     const Arr_Desc *ap =
2494                         static_cast<const Arr_Desc*>(m_vars[i].into);
2495 
2496                     // debug dump
2497                     ARRAY_DESC_DUMP("    ", "INTO", ap, 1, src_is_for_mic);
2498 
2499                     // offset and length are derived from the array descriptor
2500                     __arr_data_offset_and_length(ap, into_disp, size);
2501 
2502                     if (!is_arr_desc_contiguous(ap)) {
2503                         m_vars[i].flags.is_noncont_dst = 1;
2504                         m_vars_extra[i].read_rng_dst =
2505                             init_read_ranges_arr_desc(ap);
2506                         if (!cean_ranges_match(
2507                             m_vars_extra[i].read_rng_src,
2508                             m_vars_extra[i].read_rng_dst)) {
2509                             LIBOFFLOAD_ERROR(c_ranges_dont_match);
2510                         }
2511                     }
2512                     m_vars[i].into = reinterpret_cast<char**>(ap->base);
2513                 }
2514                 else if (m_vars_extra[i].type_dst == c_dv_ptr) {
2515                     // need to send DV to the device unless it is 'nocopy'
2516                     if (m_vars[i].direction.bits ||
2517                         m_vars[i].alloc_if ||
2518                         m_vars[i].free_if) {
2519                         ArrDesc *dvp = *static_cast<ArrDesc**>(m_vars[i].into);
2520 
2521                         // debug dump
2522                         __dv_desc_dump("INTO", dvp);
2523 
2524                         m_vars[i].direction.bits = c_parameter_in;
2525                     }
2526                 }
2527 
2528                 int64_t size_src = m_vars_extra[i].read_rng_src &&
2529                                    !m_vars[i].flags.is_non_cont_struct ?
2530                     cean_get_transf_size(m_vars_extra[i].read_rng_src) :
2531                     m_vars[i].size;
2532                 int64_t size_dst = m_vars_extra[i].read_rng_dst ?
2533                     cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
2534                     size;
2535                 // The "into" size must not be less than
2536                 // the src size
2537                 if (size_src > size_dst) {
2538                     LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
2539                                      size_src, size_dst);
2540                     exit(1);
2541                 }
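                // Illustrative sketch (hypothetical names, not from this
                // library): with
                //     int src[100], dst[100];
                //     #pragma offload_transfer target(mic) \
                //             in(src[0:50] : into(dst[0:40]))
                // size_src would be 50*sizeof(int) and size_dst
                // 40*sizeof(int), so the check above reports the
                // c_different_src_and_dstn_sizes error.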
2542 
2543                 if (m_vars[i].direction.bits) {
2544                     PtrData *ptr_data;
2545 
2546                     // base address
2547                     void *base = *static_cast<void**>(m_vars[i].into);
2548 
2549                     if (m_vars[i].direction.in) {
2550                         // allocate buffer
2551                         if (m_vars[i].flags.is_stack_buf) {
2552                             // for stack-persistent objects ptr_data is
2553                             // created by var_desc number 0;
2554                             // its ptr_data is stored in m_stack_ptr_data
2555                             ptr_data = m_stack_ptr_data;
2556                         }
2557                         else if (m_vars[i].alloc_if) {
2558                             if (m_vars[i].flags.preallocated) {
2559                                 m_out_datalen += sizeof(void*);
2560                                 m_need_runfunction = true;
2561                                 break;
2562                             }
2563                             // add new entry
2564                             if (!alloc_ptr_data(
2565                                     ptr_data,
2566                                     reinterpret_cast<char *>(base) + alloc_disp,
2567                                     (alloc_base != NULL) ?
2568                                         alloc_disp : into_disp,
2569                                     (alloc_base != NULL) ?
2570                                         alloc_size : size,
2571                                     alloc_disp,
2572                                     (alloc_base != NULL) ?
2573                                         0 : m_vars[i].align,
2574                                     m_vars[i].flags.targetptr,
2575                                     m_vars[i].flags.preallocated,
2576                                     m_vars[i].flags.pin)) {
2577                                 return false;
2578                             }
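                            // (Argument selection above: when an alloc()
                            // base is supplied (alloc_base != NULL) the
                            // association covers the whole allocated range
                            // (alloc_disp/alloc_size) and the align
                            // argument is passed as 0; otherwise it covers
                            // just this variable (into_disp/size) with
                            // m_vars[i].align.)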
2579                             if (m_vars[i].flags.targetptr) {
2580                                 if (!init_mic_address(ptr_data)) {
2581                                     return false;
2582                                 }
2583                                 *static_cast<void**>(m_vars[i].into) = base =
2584                                     reinterpret_cast<void*>(ptr_data->mic_addr);
2585                             }
2586                             if (ptr_data->add_reference() == 0 &&
2587                                 ptr_data->mic_buf != 0) {
2588                                 // add buffer to the list of buffers that
2589                                 // are passed to dispatch call
2590                                 m_compute_buffers.push_back(
2591                                     ptr_data->mic_buf);
2592                             }
2593                             else {
2594                                 // will send buffer address to device
2595                                 m_vars[i].flags.sink_addr = 1;
2596                             }
2597 
2598                             if (!ptr_data->is_static) {
2599                                 // need to add reference for buffer
2600                                 m_need_runfunction = true;
2601                             }
2602                         }
2603                         else {
2604                             // use existing association from pointer table
2605                             if (!find_ptr_data(ptr_data, base, into_disp,
2606                                     size, m_vars[i].flags.targetptr, true)) {
2607                                 return false;
2608                             }
2609                             m_vars[i].flags.sink_addr = 1;
2610                         }
2611 
2612                         if (ptr_data->alloc_disp != 0) {
2613                             m_vars[i].flags.alloc_disp = 1;
2614                             m_in_datalen += sizeof(alloc_disp);
2615                         }
2616 
2617                         if (m_vars[i].flags.sink_addr) {
2618                             // get the buffer's address on the sink
2619                             if (!init_mic_address(ptr_data)) {
2620                                 return false;
2621                             }
2622 
2623                             m_in_datalen += sizeof(ptr_data->mic_addr);
2624                         }
2625 
2626                         if (!ptr_data->is_static && m_vars[i].free_if) {
2627                             // need to decrement buffer reference on target
2628                             m_need_runfunction = true;
2629                         }
2630 
2631                         // copy other pointer properties to var descriptor
2632                         m_vars[i].mic_offset = ptr_data->mic_offset;
2633                         m_vars[i].flags.is_static_dstn = ptr_data->is_static;
2634                     }
2635                     else {
2636                         if (!find_ptr_data(ptr_data,
2637                                            base,
2638                                            into_disp,
2639                                            m_vars[i].size,
2640                                            false, false)) {
2641                             return false;
2642                         }
2643                     }
2644                     if (ptr_data) {
2645                         // ptr_data is non-NULL here, so the offset is
2646                         // always relative to its CPU buffer start
2647                         into_offset = (char*) base -
2648                                       (char*) ptr_data->cpu_addr.start();
2649                     }
2650 
2651                     if (m_is_openmp) {
2652                         // for FROM transfer of stack buffer's variable
2653                         if (src_is_for_mic && m_vars[i].flags.is_stack_buf) {
2654                             AutoData *auto_data;
2655                             char *base = *static_cast<char**>(m_vars[i].into);
2656                             if (m_vars[i].alloc_if) {
2657                                 auto_data = m_device.insert_auto_data(
2658                                     base + into_disp,
2659                                     size);
2660                                 auto_data->add_reference();
2661                             }
2662                             else {
2663                                 auto_data = m_device.find_auto_data(
2664                                     base + into_disp);
2665                             }
2666                             // save data for later use
2667                             m_vars_extra[i].auto_data = auto_data;
2668                             // For automatic variables
2669                             // data is transferred:
2670                             // - if always modifier is used OR
2671                             // - if alloc_if == 0 && free_if == 0 OR
2672                             // - if reference count is 1
2673                             if (!m_vars[i].flags.always_copy &&
2674                                 (m_vars[i].alloc_if ||
2675                                 m_vars[i].free_if) &&
2676                                 auto_data != 0 &&
2677                                 auto_data->get_reference() != 1) {
2678                                     m_vars[i].direction.bits =
2679                                         c_parameter_nocopy;
2680                             }
2681                         }
2682                     }
2683                     // save pointer data
2684                     m_vars_extra[i].dst_data = ptr_data;
2685                 }
2686                 break;
2687             }
2688 
2689             case c_func_ptr:
2690             case c_func_ptr_ptr:
2691                 break;
2692 
2693             case c_dv_data:
2694             case c_dv_ptr_data:
2695             case c_dv_data_slice:
2696             case c_dv_ptr_data_slice:
2697                 if (m_vars[i].direction.bits ||
2698                     m_vars[i].alloc_if ||
2699                     m_vars[i].free_if) {
2700                     const Arr_Desc *ap;
2701                     ArrDesc *dvp;
2702                     PtrData *ptr_data;
2703                     int64_t disp;
2704                     int64_t size;
2705 
2706                     if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
2707                         ap = static_cast<const Arr_Desc*>(m_vars[i].into);
2708 
2709                         // debug dump
2710                         ARRAY_DESC_DUMP("    ", "INTO", ap, 0, src_is_for_mic);
2711 
2712                         dvp = (m_vars_extra[i].type_dst == c_dv_data_slice) ?
2713                               reinterpret_cast<ArrDesc*>(ap->base) :
2714                               *reinterpret_cast<ArrDesc**>(ap->base);
2715                     }
2716                     else {
2717                         dvp = (m_vars_extra[i].type_dst == c_dv_data) ?
2718                               static_cast<ArrDesc*>(m_vars[i].into) :
2719                               *static_cast<ArrDesc**>(m_vars[i].into);
2720                     }
2721                     if (!__dv_is_contiguous(dvp)) {
2722                         m_vars[i].flags.is_noncont_dst = 1;
2723                         m_vars_extra[i].read_rng_dst =
2724                             init_read_ranges_dv(dvp);
2725                     }
2726                     // size and displacement
2727                     if (VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
2728                         // offset and length are derived from the array
2729                         // descriptor
2730                         __arr_data_offset_and_length(ap, into_disp, size);
2731                         if (m_vars[i].direction.bits) {
2732                             if (!is_arr_desc_contiguous(ap)) {
2733                                 if (m_vars[i].flags.is_noncont_dst) {
2734                                     LIBOFFLOAD_ERROR(c_slice_of_noncont_array);
2735                                     return false;
2736                                 }
2737                                 m_vars[i].flags.is_noncont_dst = 1;
2738                                 m_vars_extra[i].read_rng_dst =
2739                                     init_read_ranges_arr_desc(ap);
2740                                 if (!cean_ranges_match(
2741                                     m_vars_extra[i].read_rng_src,
2742                                     m_vars_extra[i].read_rng_dst)) {
2743                                     LIBOFFLOAD_ERROR(c_ranges_dont_match);
2744                                 }
2745                             }
2746                         }
2747                     }
2748                     else {
2749                         if (m_vars[i].flags.has_length) {
2750                             size = __dv_data_length(dvp, m_vars[i].count);
2751                         }
2752                         else {
2753                             size = __dv_data_length(dvp);
2754                         }
2755                         disp = 0;
2756                     }
2757 
2758                     int64_t size_src =
2759                         m_vars_extra[i].read_rng_src &&
2760                         (!m_vars[i].flags.is_non_cont_struct ||
2761                          src_is_for_mic)  ?
2762                         cean_get_transf_size(m_vars_extra[i].read_rng_src) :
2763                         m_vars[i].size;
2764                     int64_t size_dst =
2765                         m_vars_extra[i].read_rng_dst ?
2766                         cean_get_transf_size(m_vars_extra[i].read_rng_dst) :
2767                         size;
2768                     // The "into" size is expected to be at least
2769                     // the src size
2770                     if (size_src > size_dst) {
2771                         LIBOFFLOAD_ERROR(c_different_src_and_dstn_sizes,
2772                             size_src, size_dst);
2773                         exit(1);
2774                     }
2775 
2776                     // base address
2777                     void *base = reinterpret_cast<void*>(dvp->Base);
2778 
2779                     // allocate buffer
2780                     if (m_vars[i].direction.in) {
2781                         if (m_vars[i].alloc_if) {
2782                             // add new entry
2783                             if (!alloc_ptr_data(
2784                                     ptr_data,
2785                                     reinterpret_cast<char *>(base) + alloc_disp,
2786                                     (alloc_base != NULL) ?
2787                                         alloc_disp : into_disp,
2788                                     (alloc_base != NULL) ?
2789                                         alloc_size : size,
2790                                     alloc_disp,
2791                                     (alloc_base != NULL) ?
2792                                         0 : m_vars[i].align,
2793                                     m_vars[i].flags.targetptr,
2794                                     m_vars[i].flags.preallocated,
2795                                     m_vars[i].flags.pin)) {
2796                                 return false;
2797                             }
2798                             if (ptr_data->add_reference() == 0 &&
2799                                 ptr_data->mic_buf != 0) {
2800                                 // add buffer to the list of buffers
2801                                 // that are passed to dispatch call
2802                                 m_compute_buffers.push_back(
2803                                     ptr_data->mic_buf);
2804                             }
2805                             else {
2806                                 // will send buffer address to device
2807                                 m_vars[i].flags.sink_addr = 1;
2808                             }
2809 
2810                             if (!ptr_data->is_static) {
2811                                 // need to add reference for buffer
2812                                 m_need_runfunction = true;
2813                             }
2814                         }
2815                         else {
2816                             // use existing association from pointer table
2817                             if (!find_ptr_data(ptr_data, base, into_disp,
2818                                 size, m_vars[i].flags.targetptr, true)) {
2819                                 return false;
2820                             }
2821 
2822                             // need to update base in dope vector on device
2823                             m_vars[i].flags.sink_addr = 1;
2824                         }
2825 
2826                         if (ptr_data->alloc_disp != 0) {
2827                             m_vars[i].flags.alloc_disp = 1;
2828                             m_in_datalen += sizeof(alloc_disp);
2829                         }
2830 
2831                         if (m_vars[i].flags.sink_addr) {
2832                             // get the buffer's address on the sink
2833                             if (!init_mic_address(ptr_data)) {
2834                                 return false;
2835                             }
2836                             m_in_datalen += sizeof(ptr_data->mic_addr);
2837                         }
2838 
2839                         if (!ptr_data->is_static && m_vars[i].free_if) {
2840                             // need to decrement buffer reference on target
2841                             m_need_runfunction = true;
2842                         }
2843 
2844                         // offset to base from the beginning of the buffer
2845                         // memory
2846                         into_offset =
2847                             (char*) base - (char*) ptr_data->cpu_addr.start();
2848 
2849                         // copy other pointer properties to var descriptor
2850                         m_vars[i].mic_offset = ptr_data->mic_offset;
2851                         m_vars[i].flags.is_static_dstn = ptr_data->is_static;
2852                     }
2853                     else { // src_is_for_mic
2854                         if (!find_ptr_data(ptr_data,
2855                                            base,
2856                                            into_disp,
2857                                            size,
2858                                            false, false)) {
2859                             return false;
2860                         }
2861                         into_offset = !ptr_data ?
2862                             0 :
2863                             (char*) base - (char*) ptr_data->cpu_addr.start();
2864                     }
2865 
2866                     // save pointer data
2867                     m_vars_extra[i].dst_data = ptr_data;
2868                 }
2869                 break;
2870 
2871             default:
2872                 LIBOFFLOAD_ERROR(c_unknown_var_type, m_vars_extra[i].type_src);
2873                 LIBOFFLOAD_ABORT;
2874         }
2875         // if into is used at CPU save its offset and disp
2876         if (m_vars[i].direction.out) {
2877             m_vars_extra[i].cpu_offset = into_offset;
2878             m_vars_extra[i].cpu_disp   = into_disp;
2879         }
2880         else {
2881             if (m_vars[i].flags.is_stack_buf) {
2882                 if (this_threads_cpu_stack_addr == 0) {
2883                     this_threads_cpu_stack_addr =
2884                         get_this_threads_cpu_stack_addr(
2885                             stack_addr, entry_id,
2886                             thread_specific_function_locals);
2887                 }
2888                 into_offset = static_cast<char*>
2889                                   (m_vars[i].into) -
2890                                    this_threads_cpu_stack_addr;
2891             }
2892             m_vars[i].offset = into_offset;
2893             m_vars[i].disp   = into_disp;
2894         }
2895     }
2896 
2897     return true;
2898 }
2899 
2900 bool OffloadDescriptor::setup_misc_data(const char *name)
2901 {
2902     OffloadTimer timer(get_timer_data(), c_offload_host_setup_misc_data);
2903 
2904     // We can skip the run function call (and its wait) if the offloaded
2905     // region is empty and there is no user-defined non-pointer IN/OUT data
2906     if (m_need_runfunction) {
2907         // variable descriptors are sent as input data
2908         m_in_datalen += m_vars_total * sizeof(VarDesc);
2909 
2910         // timer data is sent as a part of the output data
2911         m_out_datalen += OFFLOAD_TIMER_DATALEN();
2912 
2913         // max from input data and output data length
2914         uint64_t data_len = m_in_datalen > m_out_datalen ? m_in_datalen :
2915                                                            m_out_datalen;
2916 
2917         // Misc data has the following layout
2918         //     <Function Descriptor>
2919         //     <Function Name>
2920         //     <In/Out Data>            (optional)
2921         //
2922         // Copyin/copyout data can be carried in the misc/return data passed
2923         // to the run function call if its size does not exceed
2924         // COI_PIPELINE_MAX_IN_MISC_DATA_LEN. Otherwise a buffer has to be
2925         // allocated for it.
2926 
2927         m_func_desc_size = sizeof(FunctionDescriptor) + strlen(name) + 1;
2928         m_func_desc_size = (m_func_desc_size + 7) & ~7;
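        // The "+ 7) & ~7" rounds the descriptor size up to a multiple of 8
        // so that any in/out data placed after the function descriptor and
        // entry name starts on an 8-byte boundary; e.g. a 10-character name
        // gives sizeof(FunctionDescriptor) + 11 rounded up to the next
        // multiple of 8.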
2929 
2930         int misc_data_offset = 0;
2931         int misc_data_size = 0;
2932         if (data_len > 0) {
2933             if (m_func_desc_size +
2934                 m_in_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN &&
2935                 m_out_datalen <= COI_PIPELINE_MAX_IN_MISC_DATA_LEN) {
2936                 // use misc/return data for copyin/copyout
2937                 misc_data_offset = m_func_desc_size;
2938                 misc_data_size = data_len;
2939             }
2940             else {
2941                 OffloadTimer timer_buf(get_timer_data(),
2942                                        c_offload_host_alloc_data_buffer);
2943 
2944                 // send/receive data using buffer
2945                 COIRESULT res = COI::BufferCreate(data_len,
2946                                                   COI_BUFFER_OPENCL,
2947                                                   0, 0,
2948                                                   1, &m_device.get_process(),
2949                                                   &m_inout_buf);
2950                 if (res != COI_SUCCESS) {
2951                     if (m_status != 0) {
2952                         m_status->result = translate_coi_error(res);
2953                         return false;
2954                     }
2955                     report_coi_error(c_buf_create, res);
2956                 }
2957 
2958                 m_compute_buffers.push_back(m_inout_buf);
2959                 m_destroy_buffers.push_back(m_inout_buf);
2960             }
2961         }
2962 
2963         // initialize function descriptor
2964         m_func_desc = (FunctionDescriptor*) malloc(m_func_desc_size +
2965                                                    misc_data_size);
2966         if (m_func_desc == NULL)
2967           LIBOFFLOAD_ERROR(c_malloc);
2968         m_func_desc->console_enabled = console_enabled;
2969         m_func_desc->timer_enabled = offload_report_enabled &&
2970             (timer_enabled || offload_report_level);
2971         m_func_desc->offload_report_level = offload_report_enabled ?
2972                                               offload_report_level : 0;
2973         m_func_desc->offload_number = GET_OFFLOAD_NUMBER(get_timer_data());
2974         m_func_desc->in_datalen = m_in_datalen;
2975         m_func_desc->out_datalen = m_out_datalen;
2976         m_func_desc->vars_num = m_vars_total;
2977         m_func_desc->data_offset = misc_data_offset;
2978 
2979         // append entry name
2980         strcpy(m_func_desc->data, name);
2981     }
2982 
2983     return true;
2984 }
2985 
2986 void OffloadDescriptor::setup_omp_async_info()
2987 {
2988     OFFLOAD_TRACE(2, "setup_omp_async_info\n");
2989     OmpAsyncLastEventType event_type = m_need_runfunction ?
2990                                    c_last_runfunc : c_last_write;
2991     int last_in = m_need_runfunction ? 0 : -1;
2992     int i;
2993 
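    // Scan the variables from last to first to find which transfer produces
    // the last event of this asynchronous offload: a read back of OUT data
    // (c_last_read), the write of the last qualifying IN variable
    // (c_last_write), or the run function itself (c_last_runfunc).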
2994     for (i = m_vars_total - 1; i >= 0; i--) {
2995         bool src_is_target = (m_vars[i].direction.out || !m_vars[i].into);
2996         int var_type = src_is_target ? m_vars_extra[i].type_src :
2997                                        m_vars_extra[i].type_dst;
2998         bool target_is_static = src_is_target ? m_vars[i].flags.is_static :
2999                                                 m_vars[i].flags.is_static_dstn;
3000         switch (var_type) {
3001             case c_data:
3002             case c_void_ptr:
3003             case c_cean_var:
3004                 if (m_vars[i].direction.out && target_is_static) {
3005                     event_type = c_last_read;
3006                 }
3007                 else if (last_in < 0 && m_vars[i].direction.in &&
3008                     target_is_static) {
3009                     last_in = i;
3010                 }
3011                 break;
3012             case c_string_ptr:
3013             case c_data_ptr:
3014             case c_string_ptr_ptr:
3015             case c_data_ptr_ptr:
3016             case c_cean_var_ptr:
3017             case c_cean_var_ptr_ptr:
3018             case c_dv_ptr:
3019             case c_dv_data:
3020             case c_dv_ptr_data:
3021             case c_dv_data_slice:
3022             case c_dv_ptr_data_slice:
3023 
3024                 if (m_vars[i].direction.out) {
3025                     event_type = c_last_read;
3026                 }
3027                 else if (last_in < 0 && m_vars[i].direction.in) {
3028                     last_in = i;
3029                 }
3030                 break;
3031             default:
3032                 break;
3033         }
3034         if (event_type == c_last_read) {
3035             break;
3036         }
3037     }
3038 
3039     if (event_type == c_last_read) {
3040         m_vars_extra[i].omp_last_event_type = c_last_read;
3041     }
3042     else if (event_type == c_last_write) {
3043         m_vars_extra[last_in].omp_last_event_type = c_last_write;
3044     }
3045     m_omp_async_last_event_type = event_type;
3046     OFFLOAD_TRACE(2, "setup_omp_async_info: event_type=%d\n",
3047                   m_omp_async_last_event_type);
3048 }
3049 
3050 extern "C" {
3051     void offload_proxy_task_completed_ooo(
3052         COIEVENT e,
3053         const COIRESULT r,
3054         const void *info
3055     )
3056     {
3057         task_completion_callback((void *) info);
3058     }
3059 
3060     // Callback function for asynchronous offloads
3061     void offload_complete_task(
3062         COIEVENT e,
3063         const COIRESULT r,
3064         const void *info
3065     )
3066     {
3067         Stream            *stream;
3068         OffloadDescriptor *task = const_cast<OffloadDescriptor*>(
3069             reinterpret_cast<const OffloadDescriptor*>(info));
3070         uint32_t         events_remained;
3071 
3072         lock_complete.lock();
3073         if (!offload_descr_map[task]) {
3074             lock_complete.unlock();
3075             return;
3076         }
3077 
3078 #ifndef TARGET_WINNT
3079         events_remained = __sync_sub_and_fetch(&task->m_event_count, 1);
3080 #else // TARGET_WINNT
3081         events_remained = _InterlockedDecrement(&task->m_event_count);
3082 #endif // TARGET_WINNT
3083         // Waiting for the last event
3084         if (events_remained != 0) {
3085             lock_complete.unlock();
3086             return;
3087         }
3088 
3089         // Callback could be called when execution at host is completed.
3090         // Do nothing as engine data is destructed
3091         if (!task->get_device().get_ready()) {
3092             lock_complete.unlock();
3093             return;
3094         }
3095 
3096         void *           signal = task->get_signal();
3097         _Offload_stream  stream_handle = task->get_stream();
3098 
3099         OFFLOAD_TRACE(2, "Call function offload_complete_task(%p)\n", info);
3100 
3101         // Completed offload has a signal
3102         if (task->m_has_signal) {
3103             if (!offload_descr_map[task]) {
3104                 lock_complete.unlock();
3105                 return;
3106             }
3107             task->get_device().complete_signaled_ofld(signal);
3108             // An asynchronous offload can have both a signal and a stream.
3109             // The stream, if any, needs to be cleaned up here.
3110             stream_handle = task->get_stream();
3111             if (stream_handle != -1) {
3112                 stream = Stream::find_stream(stream_handle, false);
3113                 if (stream && stream->get_last_offload() == task) {
3114                     stream->set_last_offload(NULL);
3115                 }
3116             }
3117             offload_descr_map[task] = false;
3118             lock_complete.unlock();
3119 
3120             if (task->offload_finish(0)) { //arg is 0 for is_traceback
3121                 task->cleanup();
3122             }
3123             delete task;
3124         }
3125         // Asynchronous by stream
3126         else {
3127             if (stream_handle != 0) {
3128                 stream = Stream::find_stream(stream_handle, false);
3129 
3130                 // the stream was not created or was destroyed
3131                 if (!stream) {
3132                     LIBOFFLOAD_ERROR(c_offload_no_stream,
3133                         task->get_device().get_logical_index());
3134                     LIBOFFLOAD_ABORT;
3135                 }
3136                 if (!offload_descr_map[task]) {
3137                     lock_complete.unlock();
3138                     return;
3139                 }
3140                 if (task == stream->get_last_offload()) {
3141                     stream->set_last_offload(NULL);
3142                 }
3143                 // if the offload has both signal and stream we will complete
3144                 // it as it has the signal. So we don't need to mark signal
3145                 // as completed.
3146                 offload_descr_map[task] = false;
3147                 lock_complete.unlock();
3148                 if (task->offload_finish(0)) { //arg is 0 for is_traceback
3149                     task->cleanup();
3150                 }
3151                 delete task;
3152             }
3153         }
3154     }
3155 }
3156 
3157 void OffloadDescriptor::register_omp_event_call_back(
3158     const COIEVENT *event,
3159     const void *info)
3160 {
3161     register_event_call_back(&offload_proxy_task_completed_ooo, event, info);
3162 }
3163 
3164 void OffloadDescriptor::register_event_call_back(
3165     void (*func)(COIEVENT, const COIRESULT, const void*),
3166     const COIEVENT *event,
3167     const void *info)
3168 {
3169     OFFLOAD_TRACE(2, "register_event_call_back(event=%p, info=%p)\n",
3170                   event, info);
3171     if (COI::EventRegisterCallback) {
3172         COI::EventRegisterCallback(
3173                  *event,
3174                  func,
3175                  info, 0);
3176         OFFLOAD_TRACE(2,
3177             "COI::EventRegisterCallback found; callback registered\n");
3178     }
3179 }
3180 
3181 bool OffloadDescriptor::wait_dependencies(
3182     const void    **waits,
3183     int             num_waits,
3184     _Offload_stream handle
3185 )
3186 {
3187     OffloadTimer timer(get_timer_data(), c_offload_host_wait_deps);
3188     bool ret = true;
3189     OffloadDescriptor *task;
3190     void *    signal;
3191 
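    // num_waits encodes the kind of wait handled below:
    //     0  - no explicit waits; just pick up the stream's in-dependencies
    //    -1  - wait for one stream (handle != 0) or for all streams
    //          (handle == 0), completing their last offloads
    //    >0  - wait for the signals listed in 'waits'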
3192     if (num_waits == 0) {
3193         // Prepare in dependencies for stream
3194         get_stream_in_dependencies(m_num_in_dependencies, m_p_in_dependencies);
3195         return true;
3196     }
3197 
3198     // wait for streams
3199     if (num_waits == -1) {
3200         Stream * stream;
3201         // some specific stream of the device
3202         if (handle != 0) {
3203             lock_complete.lock();
3204             stream = Stream::find_stream(handle, false);
3205 
3206             // the stream was not created or was destroyed
3207             if (!stream) {
3208                 LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index());
3209                 LIBOFFLOAD_ABORT;
3210             }
3211             task = stream->get_last_offload();
3212 
3213             // offload was completed by previous offload_wait pragma
3214             // or wait clause
3215             if (!offload_descr_map[task]) {
3216                 lock_complete.unlock();
3217                 return true;
3218             }
3219             stream->set_last_offload(NULL);
3220             if (task->m_has_signal) {
3221                 signal = task->get_signal();
3222                 if (m_device.find_signal(signal, false) == task) {
3223                     m_device.complete_signaled_ofld(signal);
3224                 }
3225             }
3226             offload_descr_map[task] = false;
3227             lock_complete.unlock();
3228 
3229             if (!task->offload_finish(0)) { //arg is 0 for is_traceback
3230                 ret = false;
3231             }
3232             task->cleanup();
3233             delete task;
3234         }
3235         // all streams of the device or over all devices
3236         else {
3237             StreamMap stream_map = Stream::all_streams;
3238             for (StreamMap::iterator it = stream_map.begin();
3239                 it != stream_map.end(); it++) {
3240                 Stream * stream = it->second;
3241                 if (!m_wait_all_devices &&
3242                     stream->get_device() != m_device.get_logical_index()) {
3243                     continue;
3244                 }
3245                 lock_complete.lock();
3246 
3247                 // get associated async task
3248                 OffloadDescriptor *task = stream->get_last_offload();
3249                 // offload was completed by offload_wait pragma or wait clause
3250                 if (!offload_descr_map[task]) {
3251                     lock_complete.unlock();
3252                     continue;
3253                 }
3254                 if (task->m_has_signal) {
3255                     signal = task->get_signal();
3256                     if (task->get_device().find_signal(signal, false) ==
3257                         task) {
3258                         task->get_device().complete_signaled_ofld(signal);
3259                     }
3260                 }
3261                 stream->set_last_offload(NULL);
3262                 offload_descr_map[task] = false;
3263                 lock_complete.unlock();
3264                 if (!task->offload_finish(0)) { //arg is 0 for is_traceback
3265                     ret = false;
3266                 }
3267                 task->cleanup();
3268                 delete task;
3269             }
3270             // no uncompleted streams
3271             return true;
3272         }
3273     }
3274     else {
3275 
3276         // If the offload is asynchronous we do not really wait for signals.
3277         // Instead we collect all waited-for events into the
3278         // m_p_in_dependencies vector for use in future COI::Copy... calls.
3279 
3280         if (!__offload_always_wait && (m_has_signal || (get_stream() > 0))) {
3281             uint64_t        num_in_dep = 0,
3282                             num_in_dep_prev = 0;
3283             COIEVENT        *p_in_dep = NULL;
3284             _Offload_stream stream_handle = get_stream();
3285             Stream          *stream;
3286             bool            stream_need_connection = stream_handle > 0;
3287 
3288             if (stream_need_connection) {
3289                 stream = Stream::find_stream(stream_handle, false);
3290                 // check whether the previous offload on stream_handle
3291                 // has not completed yet
3292                 if (!stream) {
3293                     stream_need_connection = false;
3294                 }
3295             }
3296             for (int i = 0; i < num_waits; i++) {
3297                 task = m_device.find_signal(waits[i], false);
3298                 if (task == 0) {
3299                     LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
3300                         waits[i]);
3301                     LIBOFFLOAD_ABORT;
3302                 }
3303                 else if (task == SIGNAL_HAS_COMPLETED) {
3304                     continue;
3305                 }
3306                 if (stream_need_connection &&
3307                     stream->get_last_offload() == task) {
3308                     stream_need_connection = false;
3309                 }
3310                 if (!task->m_num_in_dependencies) {
3311                     continue;
3312                 }
3313                 num_in_dep += task->m_num_in_dependencies;
3314                 p_in_dep = (COIEVENT*)realloc(p_in_dep,
3315                                               sizeof(COIEVENT) * num_in_dep);
3316                 if (p_in_dep == NULL)
3317                     LIBOFFLOAD_ERROR(c_malloc);
3318                 memcpy(p_in_dep + num_in_dep_prev, task->m_p_in_dependencies,
3319                        task->m_num_in_dependencies * sizeof(COIEVENT));
3320                 num_in_dep_prev = num_in_dep;
3321             }
3322             if (stream_need_connection) {
3323                 task = stream->get_last_offload();
3324                 if (task) {
3325                     num_in_dep += task->m_num_in_dependencies;
3326                     p_in_dep = (COIEVENT*)realloc(p_in_dep,
3327                                               sizeof(COIEVENT) * num_in_dep);
3328                     if (p_in_dep == NULL)
3329                         LIBOFFLOAD_ERROR(c_malloc);
3330                     memcpy(p_in_dep + num_in_dep_prev,
3331                            task->m_p_in_dependencies,
3332                            task->m_num_in_dependencies * sizeof(COIEVENT));
3333                     num_in_dep_prev = num_in_dep;
3334                 }
3335             }
3336             m_num_in_dependencies = num_in_dep ? num_in_dep :
3337                                                  m_num_in_dependencies;
3338             m_p_in_dependencies = num_in_dep ? p_in_dep : m_p_in_dependencies;
3339         }
3340         // wait and do offload_finish for serial offload
3341         else {
3342             for (int i = 0; i < num_waits; i++) {
3343                 _Offload_stream stream_handle;
3344                 Stream *stream;
3345 
3346                 lock_complete.lock();
3347                 task = m_device.find_signal(waits[i], false);
3348                 if (task == 0) {
3349                     LIBOFFLOAD_ERROR(c_offload1, m_device.get_logical_index(),
3350                         waits[i]);
3351                     LIBOFFLOAD_ABORT;
3352                 }
3353                 else if (!offload_descr_map[task]) {
3354                     lock_complete.unlock();
3355                     continue;
3356                 }
3357                 // Need to mark the signal as completed to prevent a race
3358                 // condition with the call to "offload_complete_task" for
3359                 // the same signal.
3360                 m_device.complete_signaled_ofld(waits[i]);
3361 
3362                 // Asynchronous offload can have both a signal and a stream.
3363                 // The stream, if any, needs to be cleaned up here.
3364 
3365                 stream_handle = task->m_stream;
3366                 if (stream_handle != -1) {
3367                     stream = Stream::find_stream(stream_handle, false);
3368                     if (stream && stream->get_last_offload() == task) {
3369                         stream->set_last_offload(NULL);
3370                     }
3371                 }
3372                 offload_descr_map[task] = false;
3373                 lock_complete.unlock();
3374 
3375                 if (!task->offload_finish(0)) { //arg is 0 for is_traceback
3376                     ret = false;
3377                 }
3378                 task->cleanup();
3379 
3380                 delete task;
3381             }
3382         }
3383     }
3384     return ret;
3385 }
3386 
3387 bool OffloadDescriptor::offload_wrap(
3388     const char *name,
3389     bool is_empty,
3390     VarDesc *vars,
3391     VarDesc2 *vars2,
3392     int vars_total,
3393     const void **waits,
3394     int num_waits,
3395     const void **signal,
3396     int entry_id,
3397     const void *stack_addr,
3398     OffloadFlags offload_flags
3399 )
3400 {
3401     OffloadWaitKind wait_kind = c_offload_wait_signal;
3402     bool is_traceback = offload_flags.bits.fortran_traceback;
3403 
3404     // determine the kind of wait, if any;
3405     // it can be one of the following kinds:
3406     // 1. c_offload_wait_signal for "offload_wait wait(signal)"
3407     // 2. c_offload_wait_stream for "offload_wait stream(stream)"
3408     // 3. c_offload_wait_all_streams for "offload_wait stream(0)"
3409     if (num_waits == -1) {
3410         wait_kind = (m_stream == 0) ?
3411                     c_offload_wait_all_streams :
3412                     c_offload_wait_stream;
3413     }
3414     char buf[35];
3415     const char *stream_str;
3416 
3417     if (m_stream == no_stream || num_waits == -1) {
3418         stream_str = "none";
3419     }
3420     else if (m_stream == 0) {
3421         stream_str = "all";
3422     }
3423     else {
3424         sprintf(buf, "%#llx", m_stream);
3425         stream_str = buf;
3426     }
3427 
3428     if (m_has_signal) {
3429         OFFLOAD_DEBUG_TRACE_1(1,
3430                       GET_OFFLOAD_NUMBER(get_timer_data()),
3431                       c_offload_init_func,
3432                       "Offload function %s, is_empty=%d, #varDescs=%d, "
3433                       "signal=none, stream=%s, #waits=%d%c",
3434                       name, is_empty, vars_total, stream_str, num_waits,
3435                       num_waits == 0 ? '\n' : ' ');
3436         // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
3437         // since the number of waits is not fixed.
3438         if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
3439             if (num_waits) {
3440                 printf("(");
3441                 if (m_stream == no_stream) {
3442                     printf("%p", waits[0]);
3443                     for (int i = 1; i < num_waits; i++) {
3444                         printf(", %p", waits[i]);
3445                     }
3446                 }
3447                 else if (m_stream != 0) {
3448                     printf("%#x", m_stream);
3449                 }
3450                 else {
3451                     printf(" all streams");
3452                 }
3453                 printf(")");
3454             }
3455             printf("\n");
3456             fflush(NULL);
3457         }
3458         // stream in wait is reported further in OFFLOAD_REPORT for waits
3459         if (m_stream != no_stream && num_waits == 0) {
3460             OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
3461                            c_offload_stream,
3462                            "%d\n", m_stream);
3463         }
3464         OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
3465                       c_offload_signal,
3466                       "none %d\n", 0);
3467     }
3468     else {
3469         OFFLOAD_DEBUG_TRACE_1(1,
3470                       GET_OFFLOAD_NUMBER(get_timer_data()),
3471                       c_offload_init_func,
3472                       "Offload function %s, is_empty=%d, #varDescs=%d, "
3473                       "signal=%p, stream=%s, #waits=%d%c",
3474                       name, is_empty, vars_total, signal, stream_str,
3475                       num_waits, num_waits == 0 ? '\n' : ' ');
3476         // Breaks the norm of using OFFLOAD_DEBUG_TRACE to print the waits
3477         // since the number of waits is not fixed.
3478         if (!OFFLOAD_DO_TRACE && (console_enabled >= 1)) {
3479             if (num_waits) {
3480                 printf("(");
3481                 if (m_stream == no_stream) {
3482                     printf("%p", waits[0]);
3483                     for (int i = 1; i < num_waits; i++) {
3484                         printf(", %p", waits[i]);
3485                     }
3487                 }
3488                 else if (m_stream != 0) {
3489                     printf("%#x", m_stream);
3490                 }
3491                 else {
3492                     printf(" all streams");
3493                 }
3494                 printf(")");
3495             }
3496             printf("\n");
3497             fflush(NULL);
3498         }
3499         // stream in wait is reported further in OFFLOAD_REPORT for waits
3500         if (m_stream != no_stream && num_waits == 0) {
3501             OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
3502                            c_offload_stream,
3503                            "%d\n", m_stream);
3504         }
3505         OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
3506                       c_offload_signal,
3507                       "%d\n", signal);
3508     }
3509     if (console_enabled >= 1 && offload_flags.flags != 0) {
3510         trace_offload_flags(get_timer_data(), offload_flags);
3511     }
3512 
3513     OFFLOAD_REPORT(3, GET_OFFLOAD_NUMBER(get_timer_data()),
3514                    c_offload_wait, "%d\n",
3515                    wait_kind, num_waits,
3516                    (wait_kind == c_offload_wait_signal) ?
3517                    waits :
3518                    reinterpret_cast<const void **>(m_stream));
3519 
3520     if (m_status != 0) {
3521         m_status->result = OFFLOAD_SUCCESS;
3522         m_status->device_number = m_device.get_logical_index();
3523     }
3524 
3525     m_initial_need_runfunction = m_need_runfunction = !is_empty;
3526 
3527     // wait for dependencies to finish or set
3528     // m_num_in_dependencies and m_p_in_dependencies for asynchronous offload
3529     if (!wait_dependencies(waits, num_waits, m_stream)) {
3530         cleanup();
3531         return false;
3532     }
3533 
3534     // setup buffers
3535     if (!setup_descriptors(vars, vars2, vars_total, entry_id, stack_addr)) {
3536         cleanup();
3537         return false;
3538     }
3539 
3540     if (offload_flags.bits.omp_async) {
3541         setup_omp_async_info();
3542     }
3543 
3544     // Initiate send for pointers. We want to do this as early as possible.
3545     if (!send_pointer_data(signal != 0 || offload_flags.bits.omp_async,
3546                            signal)) {
3547         cleanup();
3548         return false;
3549     }
3550 
3551     // setup misc data for run function
3552     if (!setup_misc_data(name)) {
3553         cleanup();
3554         return false;
3555     }
3556 
3557     // gather copyin data into buffer
3558     if (!gather_copyin_data()) {
3559         cleanup();
3560         return false;
3561     }
3562 
3563     // Start the computation
3564     if (!compute(signal)) {
3565         cleanup();
3566         return false;
3567     }
3568 
3569     // initiate receive for pointers
3570     if (!receive_pointer_data(signal != 0 || offload_flags.bits.omp_async,
3571                               true, signal)) {
3572         cleanup();
3573         return false;
3574     }
3575 
3576     if (offload_flags.bits.omp_async) {
3577         return true;
3578     }
3579 
3580     // If there is a signal or a stream, save the descriptor for later use.
3581     // num_waits == -1 is for offload_wait and there is nothing to save
3582     if (num_waits != -1 && (signal != 0 || m_stream != no_stream)) {
3583 
3584         if (signal != 0) {
3585             m_device.add_signal(*signal, this);
3586         }
3587 
3588         if (m_stream != no_stream && m_stream != 0) {
3589             Stream* stream = Stream::find_stream(m_stream, false);
3590             if (stream) {
3591                 stream->set_last_offload(this);
3592             }
3593             else {
3594                 LIBOFFLOAD_ERROR(c_offload_no_stream, m_device.get_logical_index());
3595                 LIBOFFLOAD_ABORT;
3596             }
3597         }
3598         // Register callback function "offload_complete_task" for all out
3599         // events or for all in events if there are no out transfers
3600         if (!m_preallocated_alloc) {
3601             m_event_count = m_out_deps_total ?
3602                             m_out_deps_total : m_in_deps_total;
3603             COIEVENT *event_list = m_out_deps_total ? m_out_deps : m_in_deps;
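            // (offload_complete_task decrements m_event_count for each
            // registered event and finalizes the offload only when the
            // count reaches zero.)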
3604 
3605             for (int i = 0; i < m_event_count; i++) {
3606                 register_event_call_back(&offload_complete_task,
3607                                          &event_list[i], this);
3608             }
3609             offload_descr_map[this] = true;
3610             return true;
3611         }
3612     }
3613 
3614     // wait for the offload to finish.
3615     if (!offload_finish(is_traceback)) {
3616         cleanup();
3617         return false;
3618     }
3619 
3620     cleanup();
3621     return true;
3622 }
3623 
3624 bool OffloadDescriptor::offload(
3625     const char *name,
3626     bool is_empty,
3627     VarDesc *vars,
3628     VarDesc2 *vars2,
3629     int vars_total,
3630     const void **waits,
3631     int num_waits,
3632     const void **signal,
3633     int entry_id,
3634     const void *stack_addr,
3635     OffloadFlags offload_flags
3636 )
3637 {
3638     bool res;
3639     res = offload_wrap(name, is_empty, vars, vars2, vars_total,
3640                        waits, num_waits, signal, entry_id,
3641                        stack_addr, offload_flags);
3642     if (res == false && !m_traceback_called) {
3643         if (offload_flags.bits.fortran_traceback) {
3644             OFFLOAD_TRACE(3,
3645                 "Calling Fortran library to continue traceback from MIC\n");
3646             FORTRAN_TRACE_BACK(m_status->result);
3647             m_traceback_called = true;
3648         }
3649     }
3650     return res;
3651 }
3652 
3653 bool OffloadDescriptor::offload_finish(
3654     bool is_traceback
3655 )
3656 {
3657     COIRESULT res;
3658 
3659     // wait for compute dependencies to become signaled
3660     if (m_in_deps_total > 0 &&
3661         (m_out_deps_total <= 0 || m_preallocated_alloc)) {
3662         OffloadTimer timer(get_timer_data(), c_offload_host_wait_compute);
3663 
3664         if (__offload_active_wait) {
3665             // keep CPU busy
3666             do {
3667                 res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
3668             }
3669             while (res == COI_TIME_OUT_REACHED);
3670         }
3671         else {
3672             res = COI::EventWait(m_in_deps_total, m_in_deps, -1, 1, 0, 0);
3673         }
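        // (With __offload_active_wait set, the host spins on zero-timeout
        // EventWait calls instead of blocking with an infinite (-1)
        // timeout.)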
3674 
3675         if (res != COI_SUCCESS) {
3676             if (m_status != 0 && !m_traceback_called) {
3677                 m_status->result = translate_coi_error(res);
3678                 if (is_traceback) {
3679                     OFFLOAD_TRACE(3,
3680                     "Calling Fortran library to continue traceback from MIC\n");
3681                     FORTRAN_TRACE_BACK(m_status->result);
3682                     m_traceback_called = true;
3683                 }
3684                 return false;
3685             }
3686             if (is_traceback && !m_traceback_called) {
3687                 OFFLOAD_TRACE(3,
3688                   "Calling Fortran library to continue traceback from MIC\n");
3689                 FORTRAN_TRACE_BACK(OFFLOAD_ERROR);
3690                 exit(1);
3691             }
3692             report_coi_error(c_event_wait, res);
3693         }
3694     }
3695 
3696     // Copyout data received from the target must be scattered after the
3697     // in-dependencies complete so that the preallocated buffers are
3698     // available. If there are no preallocated buffers, scatter_copyout_data
3699     // is called after the out-dependencies complete; in that case there is
3700     // no need to wait on the in-dependencies as they are already in the DAG.
3701     if (m_out_with_preallocated) {
3702         if (!scatter_copyout_data()) {
3703             return false;
3704         }
3705         if (!receive_pointer_data(m_out_deps_total > 0, false, NULL)) {
3706                 cleanup();
3707                 return false;
3708         }
3709     }
3710 
3711     // wait for receive dependencies to become signaled
3712     if (m_out_deps_total > 0) {
3713         OffloadTimer timer(get_timer_data(), c_offload_host_wait_buffers_reads);
3714 
3715         if (__offload_active_wait) {
3716             // keep CPU busy
3717             do {
3718                 res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
3719             }
3720             while (res == COI_TIME_OUT_REACHED);
3721         }
3722         else {
3723             res = COI::EventWait(m_out_deps_total, m_out_deps, -1, 1, 0, 0);
3724         }
3725 
3726         if (res != COI_SUCCESS) {
3727             if (m_status != 0 && !m_traceback_called) {
3728                 m_status->result = translate_coi_error(res);
3729                 if (is_traceback) {
3730                     OFFLOAD_TRACE(3,
3731                     "Calling Fortran library to continue traceback from MIC\n");
3732                     FORTRAN_TRACE_BACK(m_status->result);
3733                     m_traceback_called = true;
3734                 }
3735                 return false;
3736             }
3737             if (is_traceback && !m_traceback_called) {
3738                 OFFLOAD_TRACE(3,
3739                   "Calling Fortran library to continue traceback from MIC\n");
3740                 FORTRAN_TRACE_BACK(OFFLOAD_ERROR);
3741                 exit(1);
3742             }
3743             report_coi_error(c_event_wait, res);
3744         }
3745     }
3746 
3747     if (!m_out_with_preallocated && !scatter_copyout_data()) {
3748         return false;
3749     }
3750     // destroy buffers
3751     {
3752         OffloadTimer timer(get_timer_data(), c_offload_host_destroy_buffers);
3753 
3754         for (BufferList::const_iterator it = m_destroy_buffers.begin();
3755              it != m_destroy_buffers.end(); it++) {
3756             res = COI::BufferDestroy(*it);
3757             if (res != COI_SUCCESS) {
3758                 if (m_status != 0) {
3759                     m_status->result = translate_coi_error(res);
3760                     return false;
3761                 }
3762                 report_coi_error(c_buf_destroy, res);
3763             }
3764         }
3765     }
3766 
3767     return true;
3768 }
3769 
3770 void OffloadDescriptor::cleanup()
3771 {
3772     // release device in orsl
3773     ORSL::release(m_device.get_logical_index());
3774 
3775     OFFLOAD_TIMER_STOP(get_timer_data(), c_offload_host_total_offload);
3776 
3777     // report stuff
3778     Offload_Report_Epilog(get_timer_data());
3779 }
3780 
3781 bool OffloadDescriptor::is_signaled()
3782 {
3783     bool signaled = true;
3784     COIRESULT res;
3785 
3786     // check compute and receive dependencies
3787     if (m_out_deps_total > 0) {
3788         res = COI::EventWait(m_out_deps_total, m_out_deps, 0, 1, 0, 0);
3789         signaled = signaled && (res == COI_SUCCESS);
3790     }
3791     else if (m_in_deps_total > 0) {
3792         res = COI::EventWait(m_in_deps_total, m_in_deps, 0, 1, 0, 0);
3793         signaled = signaled && (res == COI_SUCCESS);
3794     }
3795 
3796     return signaled;
3797 }
3798 
3799 static Arr_Desc * make_arr_desc(
3800     void*   ptr_val,
3801     int64_t extent_start_val,
3802     int64_t extent_elements_val,
3803     int64_t size
3804 )
3805 {
3806     Arr_Desc *res;
3807     res = (Arr_Desc *)malloc(sizeof(Arr_Desc));
3808     if (res == NULL)
3809       LIBOFFLOAD_ERROR(c_malloc);
3810     res->base = reinterpret_cast<int64_t>(ptr_val);
3811     res->rank = 1;
3812     res->dim[0].size = size;
3813     res->dim[0].lindex = 0;
3814     res->dim[0].lower = extent_start_val;
3815     res->dim[0].upper = extent_elements_val + extent_start_val - 1;
3816     res->dim[0].stride = 1;
3817     return res;
3818 }
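// For example (illustrative only), make_arr_desc(p, 2, 10, sizeof(double))
// describes a rank-1 slice of 10 elements starting at index 2: lower bound 2,
// upper bound 11, unit stride, element size sizeof(double).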
3819 
3820 // Send pointer data if the source, the destination, or both of them are
3821 // noncontiguous. It is guaranteed that the destination length is sufficient
3822 // for the transferred data.
3823 bool OffloadDescriptor::send_noncontiguous_pointer_data(
3824     int i,
3825     PtrData* src_data,
3826     PtrData* dst_data,
3827     COIEVENT *event,
3828     uint64_t &data_sent,
3829     uint32_t in_deps_amount,
3830     COIEVENT *in_deps
3831     )
3832 {
3833     NonContigDesc *desc;
3834     int noncont_num;
3835     int64_t offset_src, offset_dst;
3836     int64_t length_src, length_dst;
3837     int64_t length_src_cur, length_dst_cur;
3838     int64_t send_size;
3839     COIRESULT res;
3840     bool dst_is_empty = true;
3841     bool src_is_empty = true;
3842 
3843     // If BufferWriteMultiD is available, set up the required arguments and
3844     // transfer the noncontiguous data with a single call to that COI routine.
3845     if (!m_vars[i].flags.is_non_cont_struct &&
3846         __offload_use_coi_noncontiguous_transfer && COI::BufferWriteMultiD) {
3847         struct Arr_Desc* arr_desc_dst;
3848         struct Arr_Desc* arr_desc_src;
3849         int64_t size_src, size_dst;
3850         char *base = offload_get_src_base(static_cast<char*>(m_vars[i].ptr),
3851             m_vars_extra[i].type_src);
3852         COIBUFFER dst_buf = m_vars[i].into ?
3853             m_vars_extra[i].dst_data->mic_buf :
3854             m_vars_extra[i].src_data->mic_buf;
3855 
3856         offset_src = (m_vars_extra[i].read_rng_src)?
3857             m_vars_extra[i].read_rng_src->init_offset : m_vars_extra[i].cpu_disp;
3858         size_src = m_vars_extra[i].read_rng_src ?
3859             cean_get_transf_size(m_vars_extra[i].read_rng_src) :
3860             m_vars[i].size;
3861 
3862         offset_dst = (m_vars_extra[i].read_rng_dst)?
3863             m_vars_extra[i].read_rng_dst->init_offset : m_vars[i].disp;
3864         size_dst = m_vars_extra[i].read_rng_dst ?
3865             cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size;
3866 
3867         int64_t el_size = (!m_vars[i].into ||
3868             (m_vars_extra[i].read_rng_src && m_vars_extra[i].read_rng_dst)) ?
3869             1 :
3870             m_vars_extra[i].read_rng_src ?
3871             m_vars_extra[i].read_rng_src->arr_desc->dim[
3872                 m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
3873             m_vars_extra[i].read_rng_dst->arr_desc->dim[
3874                 m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;
3875 
3876         arr_desc_src = (m_vars_extra[i].read_rng_src) ?
3877                 m_vars_extra[i].read_rng_src->arr_desc :
3878                 make_arr_desc(NULL, // base is set below
3879                     offset_src/el_size, size_src/el_size, el_size);
3880 
3881         arr_desc_dst = !m_vars[i].into ?
3882                 arr_desc_src :
3883                 (m_vars_extra[i].read_rng_dst) ?
3884                     m_vars_extra[i].read_rng_dst->arr_desc :
3885                     make_arr_desc(NULL,
3886                         offset_dst/el_size, size_src/el_size, el_size);
3887 
3888         int64_t alloc_disp = m_vars[i].into ?
3889                     m_vars_extra[i].dst_data->alloc_disp :
3890                     m_vars_extra[i].src_data->alloc_disp;
3891 
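        // With BufferWriteMultiD the destination lives in the COI buffer
        // (dst_buf) and is addressed by the Offset argument below, while the
        // source is host memory addressed through its descriptor's base.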
3892         arr_desc_dst->base = 0;
3893         arr_desc_src->base = reinterpret_cast<int64_t>(base);
3894 
3895         res = COI::BufferWriteMultiD(
3896             dst_buf,                // in_DestBuffer,
3897             NULL,                   // DestProcess,
3898             m_vars[i].offset + m_vars[i].mic_offset -
3899             alloc_disp,             // Offset
3900             (void*)arr_desc_dst,    // descriptor of DestArray
3901             (void*)arr_desc_src,    // descriptor of SrcArray
3902             COI_COPY_UNSPECIFIED,   // Type
3903             m_num_in_dependencies,  // Number of in Dependencies
3904             m_p_in_dependencies,    // array of in Dependencies
3905             event);                 // out Dependency
3906         if (res != COI_SUCCESS) {
3907             if (m_status != 0) {
3908                 m_status->result = translate_coi_error(res);
3909                 return false;
3910             }
3911             report_coi_error(c_buf_copy, res);
3912         }
3913         return(true);
3914     }
3915 
3916     data_sent = 0;
3917     if (m_vars[i].flags.is_non_cont_struct) {
3918         desc = m_vars_extra[i].noncont_desc;
3919         noncont_num = 0;
3920     }
3921     else {
3922         // Set length_src and length_dst
3923         length_src = (m_vars_extra[i].read_rng_src) ?
3924             m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
3925         length_dst = !m_vars[i].into ? length_src :
3926             (m_vars_extra[i].read_rng_dst) ?
3927             m_vars_extra[i].read_rng_dst->range_size :
3928         m_vars[i].size;
3929         send_size = (length_src < length_dst) ? length_src : length_dst;
3930     }
3931 
3932     // If an event is defined we must multiply it into one event per
3933     // contiguous range that will be copied/written.
3934     // Take into account that we already have 1 event reserved by the caller.
3935     if (event) {
3936         uint32_t range_num = m_vars[i].flags.is_non_cont_struct ?
3937                                 desc->interval_cnt :
3938                                 (length_src / send_size) *
3939                                 ((m_vars_extra[i].read_rng_src) ?
3940                                 m_vars_extra[i].read_rng_src->range_max_number : 1) ;
3941         m_in_deps_allocated += range_num ;
3942         m_in_deps    =
3943             (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * m_in_deps_allocated);
3944         m_in_deps_total--;
3945     }
3946 
3947     // Successively get contiguous ranges, compute the corresponding
3948     // destination offset and send the data.
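    // Illustrative example: if the source consists of contiguous 400-byte
    // ranges and the destination of 100-byte ranges, send_size is 100, so
    // each source range is emitted as four separate 100-byte transfers, each
    // one taking its own slot in m_in_deps when 'event' is non-NULL.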
3949     do {
3950         if (m_vars[i].flags.is_non_cont_struct) {
3951             // ranges are over
3952             if (noncont_num >= desc->interval_cnt) {
3953                 break;
3954             }
3955             offset_src = offset_dst = desc->interval[noncont_num].lower;
3956             send_size = desc->interval[noncont_num].size;
3957             noncont_num++;
3958         }
3959         else {
3960             if (src_is_empty) {
3961                 if (m_vars_extra[i].read_rng_src) {
3962                     if (!get_next_range(m_vars_extra[i].read_rng_src,
3963                         &offset_src)) {
3964                         // source ranges are over - nothing to send
3965                         break;
3966                     }
3967                 }
3968                 else if (data_sent == 0) {
3969                     offset_src = m_vars_extra[i].cpu_disp;
3970                 }
3971                 else {
3972                     break;
3973                 }
3974                 length_src_cur = length_src;
3975             }
3976             else {
3977                 // if source is contiguous or its contiguous range is greater
3978                 // than destination one
3979                 offset_src += send_size;
3980             }
3981             length_src_cur -= send_size;
3982             src_is_empty = length_src_cur == 0;
3983 
3984             if (dst_is_empty) {
3985                 if (m_vars[i].into) {
3986                     if (m_vars_extra[i].read_rng_dst) {
3987                         if (!get_next_range(m_vars_extra[i].read_rng_dst,
3988                             &offset_dst)) {
3989                             // destination ranges are over
3990                             LIBOFFLOAD_ERROR(c_destination_is_over);
3991                             return false;
3992                         }
3993                     }
3994                     // into is contiguous.
3995                     else {
3996                         offset_dst = m_vars[i].disp;
3997                     }
3998                     length_dst_cur = length_dst;
3999                 }
4000                 // same as source
4001                 else {
4002                     offset_dst = offset_src;
4003                     length_dst_cur = length_src;
4004                 }
4005             }
4006             else {
4007                 // if destination is contiguous or its contiguous range is greater
4008                 // than source one
4009                 offset_dst += send_size;
4010             }
4011             length_dst_cur -= send_size;
4012             dst_is_empty = length_dst_cur == 0;
4013         }
4014         if (event) {
4015             event =  &m_in_deps[m_in_deps_total++];
4016         }
4017         if (src_data != 0 && src_data->cpu_buf != 0) {
4018             res = COI::BufferCopy(
4019                 dst_data->mic_buf,
4020                 src_data->cpu_buf,
4021                 m_vars[i].mic_offset +
4022                 m_vars[i].offset + offset_dst,
4023                 m_vars_extra[i].cpu_offset + offset_src,
4024                 send_size,
4025                 COI_COPY_UNSPECIFIED,
4026                 m_num_in_dependencies,
4027                 m_p_in_dependencies,
4028                 event);
4029             if (res != COI_SUCCESS) {
4030                 if (m_status != 0) {
4031                     m_status->result = translate_coi_error(res);
4032                     return false;
4033                 }
4034                 report_coi_error(c_buf_copy, res);
4035             }
4036         }
4037         else {
4038             char *base = offload_get_src_base(m_vars[i].ptr,
4039                 m_vars_extra[i].type_src);
4040 
4041             res = COI::BufferWrite(
4042                 dst_data->mic_buf,
4043                 m_vars[i].mic_offset +
4044                 m_vars[i].offset + offset_dst,
4045                 base + offset_src,
4046                 send_size,
4047                 COI_COPY_UNSPECIFIED,
4048                 m_num_in_dependencies,
4049                 m_p_in_dependencies,
4050                 event);
4051             if (res != COI_SUCCESS) {
4052                 if (m_status != 0) {
4053                     m_status->result = translate_coi_error(res);
4054                     return false;
4055                 }
4056                 report_coi_error(c_buf_write, res);
4057             }
4058         }
4059         data_sent += send_size;
4060     }
4061     while (true);
4062     return true;
4063 }
4064 
4065 bool OffloadDescriptor::send_pointer_data(bool is_async, void* info)
4066 {
4067     OffloadTimer timer(get_timer_data(), c_offload_host_send_pointers);
4068 
4069     bool should_use_async_buffer_write = m_initial_need_runfunction;
4070     uint64_t ptr_sent = 0;
4071     COIRESULT res;
4072     uint32_t in_deps_amount = 0;
4073     COIEVENT *in_deps = NULL;
4074 
4075     // For offload_transfer and offload with an empty body and no signal:
4076     // - if there is only one buffer copy - send data synchronously
4077     // - if there are multiple buffer copies and
4078     //   __offload_parallel_copy is false - send data synchronously
4079     // - if there are multiple buffer copies and
4080     //   __offload_parallel_copy is true - send data asynchronously
4081     // This applies only to data larger than __offload_use_async_buffer_write;
4082     // data smaller than __offload_use_async_buffer_write is sent synchronously.
4083     // Synchronous transfer results in better performance in COI.
4084     // __offload_parallel_copy is false by default but can be changed
4085     // via the environment variable OFFLOAD_PARALLEL_COPY.
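    // For example, launching the host program with OFFLOAD_PARALLEL_COPY=1
    // set in the environment (illustrative; any value the variable's parser
    // treats as true should work) is expected to enable the asynchronous path
    // described above.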
4086     if (!m_initial_need_runfunction && __offload_parallel_copy) {
4087         int big_size_count = 0;
4088         for (int i = 0; i < m_vars_total; i++) {
4089             if (m_vars[i].direction.in &&
4090                 m_vars[i].size >= __offload_use_async_buffer_write) {
4091                 switch (m_vars_extra[i].type_dst) {
4092                     case c_data:
4093                     case c_void_ptr:
4094                     case c_void_ptr_ptr:
4095                     case c_cean_var:
4096                         if (m_vars[i].flags.is_static_dstn) {
4097                             big_size_count++;
4098                         }
4099                         break;
4100                     case c_string_ptr:
4101                     case c_string_ptr_ptr:
4102                     case c_data_ptr:
4103                     case c_data_ptr_ptr:
4104                     case c_cean_var_ptr:
4105                     case c_cean_var_ptr_ptr:
4106                     case c_dv_ptr:
4107                     case c_dv_data:
4108                     case c_dv_ptr_data:
4109                     case c_dv_data_slice:
4110                     case c_dv_ptr_data_slice:
4111                         big_size_count++;
4112                         break;
4113                     default:
4114                         break;
4115                 }
4116             }
4117         }
4118         if (big_size_count > 1) {
4119             should_use_async_buffer_write = true;
4120         }
4121     }
4122 
4123     // Initiate send for pointer data
4124     for (int i = 0; i < m_vars_total; i++) {
4125         uint64_t sent_data = m_vars[i].size;
4126 
4127         if (m_vars_extra[i].omp_last_event_type == c_last_write &&
4128             m_in_deps_total > 0) {
4129             m_num_in_dependencies = m_in_deps_total;
4130             m_p_in_dependencies = m_in_deps;
4131         }
4132         switch (m_vars_extra[i].type_dst) {
4133             case c_data_ptr_array:
4134                 break;
4135             case c_data:
4136             case c_void_ptr:
4137             case c_void_ptr_ptr:
4138             case c_cean_var:
4139                 if (m_vars[i].direction.in &&
4140                     m_vars[i].flags.is_static_dstn) {
4141                     COIEVENT *event =
4142                         (m_stream != no_stream ||
4143                          is_async ||
4144                          (should_use_async_buffer_write &&
4145                           m_vars[i].size >= __offload_use_async_buffer_write)) ?
4146                         &m_in_deps[m_in_deps_total++] : 0;
4147                     PtrData* dst_data = m_vars[i].into ?
4148                                             m_vars_extra[i].dst_data :
4149                                             m_vars_extra[i].src_data;
4150                     PtrData* src_data =
4151                         VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
4152                         VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
4153                         m_vars[i].flags.is_static ?
4154                            m_vars_extra[i].src_data : 0;
4155 
4156                     if (m_vars[i].flags.is_non_cont_struct ||
4157                         m_vars[i].flags.is_noncont_src ||
4158                         m_vars[i].flags.is_noncont_dst) {
4159                         if (!send_noncontiguous_pointer_data(
4160                                 i, src_data, dst_data, event, sent_data,
4161                                 m_num_in_dependencies, m_p_in_dependencies)) {
4162                             return false;
4163                         }
4164                     }
4165                     else if (src_data != 0 && src_data->cpu_buf != 0) {
4166                         res = COI::BufferCopy(
4167                             dst_data->mic_buf,
4168                             src_data->cpu_buf,
4169                             m_vars[i].mic_offset +
4170                             m_vars[i].offset + m_vars[i].disp,
4171                             m_vars_extra[i].cpu_offset +
4172                             m_vars_extra[i].cpu_disp,
4173                             m_vars[i].size,
4174                             COI_COPY_UNSPECIFIED,
4175                             m_num_in_dependencies,
4176                             m_p_in_dependencies,
4177                             event);
4178                         if (res != COI_SUCCESS) {
4179                             if (m_status != 0) {
4180                                 m_status->result = translate_coi_error(res);
4181                                 return false;
4182                             }
4183                             report_coi_error(c_buf_copy, res);
4184                         }
4185                     }
4186                     else {
4187                         char *base = offload_get_src_base(m_vars[i].ptr,
4188                                          m_vars_extra[i].type_src);
4189                         res = COI::BufferWrite(
4190                             dst_data->mic_buf,
4191                             m_vars[i].mic_offset +
4192                             m_vars[i].offset + m_vars[i].disp,
4193                             base + m_vars_extra[i].cpu_disp,
4194                             m_vars[i].size,
4195                             COI_COPY_UNSPECIFIED,
4196                             m_num_in_dependencies,
4197                             m_p_in_dependencies,
4198                             event);
4199                         if (res != COI_SUCCESS) {
4200                             if (m_status != 0) {
4201                                 m_status->result = translate_coi_error(res);
4202                                 return false;
4203                             }
4204                             report_coi_error(c_buf_write, res);
4205                         }
4206                     }
4207                     ptr_sent += sent_data;
4208                 }
4209                 break;
4210 
4211             case c_data_ptr:
4212                 // If use_device_ptr is set, no data needs to be sent.
4213                 if (m_vars[i].flags.use_device_ptr) {
4214                     break;
4215                 }
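                // Otherwise fall through to the generic pointer handling below.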
4216             case c_string_ptr:
4217             case c_string_ptr_ptr:
4218             case c_data_ptr_ptr:
4219             case c_cean_var_ptr:
4220             case c_cean_var_ptr_ptr:
4221             case c_dv_ptr:
4222                 if (m_vars[i].direction.in && m_vars[i].size > 0) {
4223                     COIEVENT *event =
4224                         (m_stream != no_stream ||
4225                          is_async ||
4226                          (should_use_async_buffer_write &&
4227                           m_vars[i].size >= __offload_use_async_buffer_write)) ?
4228                         &m_in_deps[m_in_deps_total++] : 0;
4229                     PtrData* dst_data = m_vars[i].into ?
4230                                             m_vars_extra[i].dst_data :
4231                                             m_vars_extra[i].src_data;
4232                     PtrData* src_data =
4233                         VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
4234                         VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
4235                         m_vars[i].flags.is_static ?
4236                             m_vars_extra[i].src_data : 0;
4237 
4238                     if (m_vars[i].flags.is_non_cont_struct ||
4239                         m_vars[i].flags.is_noncont_src ||
4240                         m_vars[i].flags.is_noncont_dst) {
4241                         send_noncontiguous_pointer_data(
4242                             i, src_data, dst_data, event, sent_data,
4243                             in_deps_amount, in_deps);
4244                     }
4245                     else if (src_data != 0 && src_data->cpu_buf != 0) {
4246                         res = COI::BufferCopy(
4247                             dst_data->mic_buf,
4248                             src_data->cpu_buf,
4249                             m_vars[i].mic_offset +
4250                             m_vars[i].offset + m_vars[i].disp,
4251                             m_vars_extra[i].cpu_offset +
4252                             m_vars_extra[i].cpu_disp,
4253                             m_vars[i].size,
4254                             COI_COPY_UNSPECIFIED,
4255                             m_num_in_dependencies,
4256                             m_p_in_dependencies,
4257                             event);
4258                         if (res != COI_SUCCESS) {
4259                             if (m_status != 0) {
4260                                 m_status->result = translate_coi_error(res);
4261                                 return false;
4262                             }
4263                             report_coi_error(c_buf_copy, res);
4264                         }
4265                     }
4266                     else {
4267                         char *base = offload_get_src_base(m_vars[i].ptr,
4268                                          m_vars_extra[i].type_src);
4269                         res = COI::BufferWrite(
4270                             dst_data->mic_buf,
4271                             m_vars[i].mic_offset +
4272                             m_vars[i].offset + m_vars[i].disp,
4273                             base + m_vars_extra[i].cpu_disp,
4274                             m_vars[i].size,
4275                             COI_COPY_UNSPECIFIED,
4276                             m_num_in_dependencies,
4277                             m_p_in_dependencies,
4278                             event);
4279                         if (res != COI_SUCCESS) {
4280                             if (m_status != 0) {
4281                                 m_status->result = translate_coi_error(res);
4282                                 return false;
4283                             }
4284                             report_coi_error(c_buf_write, res);
4285                         }
4286                     }
4287 
4288                     ptr_sent += sent_data;
4289                 }
4290                 break;
4291 
4292             case c_dv_data:
4293             case c_dv_ptr_data:
4294                 if (m_vars[i].direction.in &&
4295                     m_vars[i].size > 0) {
4296                     PtrData *ptr_data = m_vars[i].into ?
4297                                         m_vars_extra[i].dst_data :
4298                                         m_vars_extra[i].src_data;
4299                     PtrData* src_data = m_vars_extra[i].src_data;
4300 
4301                     COIEVENT *event =
4302                         (m_stream != no_stream ||
4303                          is_async ||
4304                          (should_use_async_buffer_write &&
4305                           m_vars[i].size >= __offload_use_async_buffer_write)) ?
4306                         &m_in_deps[m_in_deps_total++] : 0;
4307 
4308                     if (m_vars[i].flags.is_non_cont_struct ||
4309                         m_vars[i].flags.is_noncont_src ||
4310                         m_vars[i].flags.is_noncont_dst) {
4311                         send_noncontiguous_pointer_data(
4312                             i, src_data, ptr_data, event, sent_data,
4313                             in_deps_amount, in_deps);
4314                     }
4315                     else if (src_data && src_data->cpu_buf != 0) {
4316                         res = COI::BufferCopy(
4317                             ptr_data->mic_buf,
4318                             src_data->cpu_buf,
4319                             m_vars[i].offset + ptr_data->mic_offset +
4320                             m_vars[i].disp,
4321                             m_vars_extra[i].cpu_offset +
4322                             m_vars_extra[i].cpu_disp,
4323                             m_vars[i].size,
4324                             COI_COPY_UNSPECIFIED,
4325                             m_num_in_dependencies,
4326                             m_p_in_dependencies,
4327                             event);
4328                         if (res != COI_SUCCESS) {
4329                             if (m_status != 0) {
4330                                 m_status->result = translate_coi_error(res);
4331                                 return false;
4332                             }
4333                             report_coi_error(c_buf_copy, res);
4334                         }
4335                     }
4336                     else {
4337                         char *base = offload_get_src_base(m_vars[i].ptr,
4338                                          m_vars_extra[i].type_src);
4339                         res = COI::BufferWrite(
4340                             ptr_data->mic_buf,
4341                             ptr_data->mic_offset +
4342                             m_vars[i].offset + m_vars[i].disp,
4343                             base + m_vars_extra[i].cpu_disp,
4344                             m_vars[i].size,
4345                             COI_COPY_UNSPECIFIED,
4346                             m_num_in_dependencies,
4347                             m_p_in_dependencies,
4348                             event);
4349                         if (res != COI_SUCCESS) {
4350                             if (m_status != 0) {
4351                                 m_status->result = translate_coi_error(res);
4352                                 return false;
4353                             }
4354                             report_coi_error(c_buf_write, res);
4355                         }
4356                     }
4357                     ptr_sent += sent_data;
4358                 }
4359                 break;
4360 
4361             case c_dv_data_slice:
4362             case c_dv_ptr_data_slice:
4363                 if (m_vars[i].direction.in &&
4364                     m_vars[i].size > 0) {
4365                     PtrData *dst_data = m_vars[i].into ?
4366                                         m_vars_extra[i].dst_data :
4367                                         m_vars_extra[i].src_data;
4368                     PtrData* src_data =
4369                         (VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) ||
4370                         VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_src) ||
4371                         VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src) ||
4372                         VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_src) &&
4373                         m_vars[i].flags.is_static) ?
4374                             m_vars_extra[i].src_data : 0;
4375                     COIEVENT *event =
4376                         (m_stream != no_stream ||
4377                          is_async ||
4378                          (should_use_async_buffer_write &&
4379                           m_vars[i].size >= __offload_use_async_buffer_write)) ?
4380                         &m_in_deps[m_in_deps_total++] : 0;
4381                     if (m_vars[i].flags.is_non_cont_struct ||
4382                         m_vars[i].flags.is_noncont_src ||
4383                         m_vars[i].flags.is_noncont_dst) {
4384                         send_noncontiguous_pointer_data(
4385                             i, src_data, dst_data, event, sent_data,
4386                             in_deps_amount, in_deps);
4387                     }
4388                     else if (src_data && src_data->cpu_buf != 0) {
4389                         res = COI::BufferCopy(
4390                             dst_data->mic_buf,
4391                             src_data->cpu_buf,
4392                             m_vars[i].offset +
4393                             dst_data->mic_offset +
4394                             m_vars[i].disp,
4395                             m_vars_extra[i].cpu_offset +
4396                             m_vars_extra[i].cpu_disp,
4397                             m_vars[i].size,
4398                             COI_COPY_UNSPECIFIED,
4399                             m_num_in_dependencies,
4400                             m_p_in_dependencies,
4401                             event);
4402                         if (res != COI_SUCCESS) {
4403                             if (m_status != 0) {
4404                                 m_status->result = translate_coi_error(res);
4405                                 return false;
4406                             }
4407                             report_coi_error(c_buf_copy, res);
4408                         }
4409                     }
4410                     else {
4411                         char *base = offload_get_src_base(m_vars[i].ptr,
4412                                          m_vars_extra[i].type_src);
4413                         res = COI::BufferWrite(
4414                             dst_data->mic_buf,
4415                             dst_data->mic_offset +
4416                             m_vars[i].offset + m_vars[i].disp,
4417                             base + m_vars_extra[i].cpu_disp,
4418                             m_vars[i].size,
4419                             COI_COPY_UNSPECIFIED,
4420                             m_num_in_dependencies,
4421                             m_p_in_dependencies,
4422                             event);
4423                         if (res != COI_SUCCESS) {
4424                             if (m_status != 0) {
4425                                 m_status->result = translate_coi_error(res);
4426                                 return false;
4427                             }
4428                             report_coi_error(c_buf_write, res);
4429                         }
4430                     }
4431 
4432                     ptr_sent += sent_data;
4433                 }
4434                 break;
4435 
4436             default:
4437                 break;
4438         }
4439         if (m_vars_extra[i].omp_last_event_type == c_last_write) {
4440             register_omp_event_call_back(&m_in_deps[m_in_deps_total - 1], info);
4441         }
4442         // The alloc field isn't used on the target,
4443         // so we can reuse it for the offset of array pointers.
4444         if (m_vars_extra[i].is_arr_ptr_el) {
4445             m_vars[i].ptr_arr_offset = m_vars_extra[i].ptr_arr_offset;
4446         }
4447     }
4448     // The out-events created during send_pointer_data now become input
4449     // dependencies for the run function (or for Read transfers from the
4450     // target if the run function is absent).
4451     m_num_in_dependencies = m_in_deps_total ? m_in_deps_total :
4452                             m_num_in_dependencies;
4453     m_p_in_dependencies = m_in_deps_total ? m_in_deps : m_p_in_dependencies;
4454 
4455     if (m_status) {
4456         m_status->data_sent += ptr_sent;
4457     }
4458 
4459     OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), ptr_sent);
4460     OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
4461                   c_offload_sent_pointer_data,
4462                   "Total pointer data sent to target: [%lld] bytes\n",
4463                   ptr_sent);
4464 
4465     return true;
4466 }
4467 
4468 bool OffloadDescriptor::gather_copyin_data()
4469 {
4470     OffloadTimer timer(get_timer_data(), c_offload_host_gather_inputs);
4471 
4472     if (m_need_runfunction && m_in_datalen > 0) {
4473         COIMAPINSTANCE map_inst;
4474         char *data;
4475 
4476         // get the buffer the inputs will be marshalled into
4477         if (m_inout_buf != 0) {
4478             OffloadTimer timer_map(get_timer_data(),
4479                                    c_offload_host_map_in_data_buffer);
4480 
4481             COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_in_datalen,
4482                                            COI_MAP_WRITE_ENTIRE_BUFFER,
4483                                            0, 0, 0, &map_inst,
4484                                            reinterpret_cast<void**>(&data));
4485             if (res != COI_SUCCESS) {
4486                 if (m_status != 0) {
4487                     m_status->result = translate_coi_error(res);
4488                     return false;
4489                 }
4490                 report_coi_error(c_buf_map, res);
4491             }
4492         }
4493         else {
4494             data = (char*) m_func_desc + m_func_desc->data_offset;
4495         }
4496 
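        // Copyin buffer layout: the VarDesc array is copied first, followed
        // by the per-variable payload marshalled below (alloc_disp values,
        // pointer offsets, sink addresses and the variable data itself).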
4497         // send variable descriptors
4498         memcpy(data, m_vars, m_vars_total * sizeof(VarDesc));
4499         data += m_vars_total * sizeof(VarDesc);
4500 
4501         // init marshaller
4502         m_in.init_buffer(data, m_in_datalen);
4503 
4504         // Gather copy data into buffer
4505         for (int i = 0; i < m_vars_total; i++) {
4506             bool src_is_for_mic = (m_vars[i].direction.out ||
4507                                    m_vars[i].into == NULL);
4508             PtrData* ptr_data = src_is_for_mic ?
4509                                 m_vars_extra[i].src_data :
4510                                 m_vars_extra[i].dst_data;
4511             if (m_vars[i].flags.alloc_disp) {
4512                 m_in.send_data(&ptr_data->alloc_disp,
4513                                sizeof(ptr_data->alloc_disp));
4514             }
4515             if (TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_src) ||
4516                 TYPE_IS_PTR_TO_PTR(m_vars_extra[i].type_dst) ||
4517                 (m_vars_extra[i].type_src == c_data_ptr_array &&
4518                  m_vars[i].flags.is_pointer)) {
4519                 m_in.send_data(&m_vars_extra[i].pointer_offset,
4520                                sizeof(m_vars_extra[i].pointer_offset));
4521             }
4522             // send sink address to the target
4523             if (m_vars[i].flags.sink_addr) {
4524                 m_in.send_data(&ptr_data->mic_addr,
4525                                sizeof(ptr_data->mic_addr));
4526             }
4527 
4528             switch (m_vars_extra[i].type_dst) {
4529                 case c_data_ptr_array:
4530                     break;
4531                 case c_data:
4532                 case c_void_ptr:
4533                 case c_void_ptr_ptr:
4534                 case c_cean_var:
4535                     if (m_vars[i].direction.in &&
4536                         !m_vars[i].flags.is_static_dstn) {
4537 
4538                         char *ptr = offload_get_src_base(m_vars[i].ptr,
4539                                         m_vars_extra[i].type_src);
4540                         if (m_vars_extra[i].type_dst == c_cean_var) {
4541                             // offset and length are derived from the array
4542                             // descriptor
4543                             int64_t size = m_vars[i].size;
4544                             int64_t disp = m_vars[i].disp;
4545                             m_in.send_data(reinterpret_cast<char*>(&size),
4546                                            sizeof(int64_t));
4547                             m_in.send_data(reinterpret_cast<char*>(&disp),
4548                                            sizeof(int64_t));
4549                         }
4550 
4551                         m_in.send_data(ptr + m_vars_extra[i].cpu_disp,
4552                                        m_vars[i].size);
4553                     }
4554                     break;
4555 
4556                 case c_dv:
4557                     if (m_vars[i].direction.bits ||
4558                         m_vars[i].alloc_if ||
4559                         m_vars[i].free_if) {
4560                         // send dope vector excluding base
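                        // (the leading uint64_t of the dope vector holds its
                        // base address, which is not meaningful on the target
                        // and is therefore skipped)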
4561                         char *ptr = static_cast<char*>(m_vars[i].ptr);
4562                         m_in.send_data(ptr + sizeof(uint64_t),
4563                                        m_vars[i].size - sizeof(uint64_t));
4564                     }
4565                     break;
4566 
4567                 case c_data_ptr:
4568                     // send to target addresses of obsolete
4569                     // stacks to be released
4570                     if (m_vars[i].flags.is_stack_buf &&
4571                         !m_vars[i].direction.bits &&
4572                         m_vars[i].alloc_if &&
4573                         m_vars[i].size != 0) {
4574                         for (PtrDataList::iterator it =
4575                             m_destroy_stack.begin();
4576                             it != m_destroy_stack.end(); it++) {
4577                             PtrData * ptr_data = *it;
4578                             m_in.send_data(&(ptr_data->mic_addr),
4579                                 sizeof(ptr_data->mic_addr));
4580                         }
4581                     }
4582                     break;
4583                 case c_func_ptr:
4584                 case c_func_ptr_ptr:
4585                     if (m_vars[i].direction.in) {
4586                         m_in.send_func_ptr(*((const void**) m_vars[i].ptr));
4587                     }
4588                     break;
4589 
4590                 default:
4591                     break;
4592             }
4593         }
4594 
4595         if (m_status) {
4596             m_status->data_sent += m_in.get_tfr_size();
4597         }
4598 
4599         if (m_func_desc->data_offset == 0) {
4600             OffloadTimer timer_unmap(get_timer_data(),
4601                                      c_offload_host_unmap_in_data_buffer);
4602             COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
4603             if (res != COI_SUCCESS) {
4604                 if (m_status != 0) {
4605                     m_status->result = translate_coi_error(res);
4606                     return false;
4607                 }
4608                 report_coi_error(c_buf_unmap, res);
4609             }
4610         }
4611     }
4612 
4613     OFFLOAD_TIMER_HOST_SDATA(get_timer_data(), m_in.get_tfr_size());
4614     OFFLOAD_DEBUG_TRACE_1(1,
4615                   GET_OFFLOAD_NUMBER(get_timer_data()), c_offload_copyin_data,
4616                   "Total copyin data sent to target: [%lld] bytes\n",
4617                   m_in.get_tfr_size());
4618 
4619     return true;
4620 }
4621 
4622 bool OffloadDescriptor::compute(void *info)
4623 {
4624     OffloadTimer timer(get_timer_data(), c_offload_host_start_compute);
4625 
4626     if (m_need_runfunction) {
4627         OFFLOAD_DEBUG_TRACE_1(2, GET_OFFLOAD_NUMBER(get_timer_data()),
4628                               c_offload_compute, "Compute task on MIC\n");
4629 
4630         void* misc = m_func_desc;
4631         int   misc_len = m_func_desc_size;
4632         void* ret = 0;
4633         int   ret_len = 0;
4634 
4635         if (m_func_desc->data_offset != 0) {
4636             misc_len += m_in_datalen;
4637 
4638             if (m_out_datalen > 0) {
4639                 ret = (char*) m_func_desc + m_func_desc->data_offset;
4640                 ret_len = m_out_datalen;
4641             }
4642         }
4643 
4644         // dispatch task
4645         COIRESULT res;
4646         COIEVENT event;
4647 
4648         res = m_device.compute(m_stream,
4649                                m_compute_buffers,
4650                                misc, misc_len,
4651                                ret, ret_len,
4652                                m_num_in_dependencies,
4653                                m_p_in_dependencies,
4654                                &event);
4655 
4656         if (res != COI_SUCCESS) {
4657             if (m_status != 0) {
4658                 m_status->result = translate_coi_error(res);
4659                 return false;
4660             }
4661             report_coi_error(c_pipeline_run_func, res);
4662         }
4663 
4664         if (m_omp_async_last_event_type == c_last_runfunc) {
4665             register_omp_event_call_back(&event, info);
4666         }
4667 
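        // The completion event of the compute task becomes the single input
        // dependency for the subsequent Read transfers from the target.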
4668         m_in_deps_total = m_num_in_dependencies = 1;
4669         m_in_deps[0] = event;
4670         m_p_in_dependencies = m_in_deps;
4671     }
4672 
4673     return true;
4674 }
4675 
4676 // Receive pointer data when the source, the destination, or both are
4677 // noncontiguous. The destination is guaranteed to be large enough for the
4678 // transferred data.
4679 bool OffloadDescriptor::receive_noncontiguous_pointer_data(
4680     int i,
4681     COIBUFFER dst_buf,
4682     COIEVENT *event,
4683     uint64_t &received_data,
4684     uint32_t in_deps_amount,
4685     COIEVENT *in_deps
4686 )
4687 {
4688     NonContigDesc *desc;
4689     int noncont_num;
4690     int64_t offset_src, offset_dst;
4691     int64_t length_src, length_dst;
4692     int64_t length_src_cur, length_dst_cur;
4693     int64_t receive_size;
4694     COIRESULT res;
4695     bool dst_is_empty = true;
4696     bool src_is_empty = true;
4697 
4698     char *base = offload_get_src_base(
4699                      m_vars[i].into ?
4700                      static_cast<char*>(m_vars[i].into) :
4701                      static_cast<char*>(m_vars[i].ptr),
4702                      m_vars_extra[i].type_dst);
4703     received_data = 0;
4704 
4705     // If BufferReadMultiD is available, set up the required arguments and
4706     // transfer the noncontiguous data with a single call to that COI routine.
4707     if (__offload_use_coi_noncontiguous_transfer && COI::BufferReadMultiD) {
4708         struct Arr_Desc* arr_desc_dst;
4709         struct Arr_Desc* arr_desc_src;
4710         int64_t size_src, size_dst;
4711 
4712         offset_src = (m_vars_extra[i].read_rng_src)?
4713             m_vars_extra[i].read_rng_src->init_offset : m_vars[i].disp;
4714         size_src = m_vars_extra[i].read_rng_src ?
4715             cean_get_transf_size(m_vars_extra[i].read_rng_src) :
4716             m_vars[i].size;
4717 
4718         offset_dst = (m_vars_extra[i].read_rng_dst)?
4719             m_vars_extra[i].read_rng_dst->init_offset : m_vars_extra[i].cpu_disp;
4720         size_dst = m_vars_extra[i].read_rng_dst ?
4721             cean_get_transf_size(m_vars_extra[i].read_rng_dst) : m_vars[i].size;
4722 
4723         int64_t el_size = (!m_vars[i].into ||
4724                            (m_vars_extra[i].read_rng_src &&
4725                             m_vars_extra[i].read_rng_dst)) ?
4726             1 :
4727             m_vars_extra[i].read_rng_src ?
4728             m_vars_extra[i].read_rng_src->arr_desc->dim[
4729                 m_vars_extra[i].read_rng_src->arr_desc->rank - 1].size :
4730             m_vars_extra[i].read_rng_dst->arr_desc->dim[
4731                 m_vars_extra[i].read_rng_dst->arr_desc->rank - 1].size;
4732         arr_desc_src = (m_vars_extra[i].read_rng_src) ?
4733             m_vars_extra[i].read_rng_src->arr_desc :
4734             make_arr_desc(NULL, // base is not needed for the source
4735                 offset_src/el_size, size_src/el_size, el_size);
4736 
4737         arr_desc_dst = !m_vars[i].into ? arr_desc_src :
4738             (m_vars_extra[i].read_rng_dst) ?
4739             m_vars_extra[i].read_rng_dst->arr_desc :
4740             make_arr_desc(NULL,
4741                 offset_dst/el_size, size_src/el_size, el_size);
4742 
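        // For BufferReadMultiD the roles are reversed: the source lives in
        // the COI buffer and is addressed by the Offset argument below, while
        // the destination is host memory addressed through its descriptor's
        // base.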
4743             arr_desc_dst->base = reinterpret_cast<int64_t>(base);
4744 
4745             res = COI::BufferReadMultiD(
4746                 m_vars_extra[i].src_data->mic_buf,      // SourceBuffer
4747                 m_vars[i].offset + m_vars[i].mic_offset -
4748                 m_vars_extra[i].src_data->alloc_disp,         // Offset
4749                 (void*)arr_desc_dst,          // descriptor of DestArray
4750                 (void*)arr_desc_src,          // descriptor of SrcArray
4751                 COI_COPY_UNSPECIFIED,         // Type
4752                 m_num_in_dependencies,        // Number of in Dependencies
4753                 m_p_in_dependencies,          // array of in Dependencies
4754                 event);                       // out Dependency
4755             if (res != COI_SUCCESS) {
4756                 if (m_status != 0) {
4757                     m_status->result = translate_coi_error(res);
4758                     return false;
4759                 }
4760                 report_coi_error(c_buf_copy, res);
4761             }
4762             return(true);
4763     }
4764     if (m_vars[i].flags.is_non_cont_struct) {
4765         desc = m_vars_extra[i].noncont_desc;
4766         noncont_num = 0;
4767     }
4768     else {
4769         // Set length_src and length_dst
4770         length_src = (m_vars_extra[i].read_rng_src) ?
4771             m_vars_extra[i].read_rng_src->range_size : m_vars[i].size;
4772         length_dst = !m_vars[i].into ? length_src :
4773                      (m_vars_extra[i].read_rng_dst) ?
4774                      m_vars_extra[i].read_rng_dst->range_size : m_vars[i].size;
4775         receive_size = (length_src < length_dst) ? length_src : length_dst;
4776     }
4777 
4778     // If an event is defined we must multiply it into one event per
4779     // contiguous interval that will be copied/read.
4780     // Take into account that we already have 1 event reserved by the caller.
4781     if (event) {
4782         uint32_t range_num = m_vars[i].flags.is_non_cont_struct ?
4783                                 desc->interval_cnt :
4784                                 (length_src / receive_size) *
4785                                 ((m_vars_extra[i].read_rng_src) ?
4786                                 m_vars_extra[i].read_rng_src->range_max_number : 1) ;
4787         m_out_deps_allocated += range_num;
4788         m_out_deps    =
4789             (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_out_deps_allocated);
4790         m_out_deps_total--;
4791     }
4792 
4793     // Successively get contiguous ranges, compute the corresponding
4794     // destination offset and receive the data.
4795     do {
4796         if (m_vars[i].flags.is_non_cont_struct) {
4797             // ranges are over
4798             if (noncont_num >= desc->interval_cnt) {
4799                 break;
4800             }
4801             offset_src = offset_dst = desc->interval[noncont_num].lower;
4802             receive_size = desc->interval[noncont_num].size;
4803             noncont_num++;
4804         }
4805         else { // get source offset
4806             if (src_is_empty) {
4807                 if (m_vars_extra[i].read_rng_src) {
4808                     if (!get_next_range(m_vars_extra[i].read_rng_src,
4809                         &offset_src)) {
4810                             // source ranges are over - nothing to receive
4811                             break;
4812                     }
4813                 }
4814                 else if (received_data == 0) {
4815                     offset_src = m_vars[i].disp;
4816                 }
4817                 else {
4818                     break;
4819                 }
4820                 length_src_cur = length_src;
4821             }
4822             else {
4823                 // if source is contiguous or its contiguous range is greater
4824                 // than destination one
4825                 offset_src += receive_size;
4826             }
4827             length_src_cur -= receive_size;
4828             src_is_empty = length_src_cur == 0;
4829 
4830             // get destination offset
4831             if (dst_is_empty) {
4832                 if (m_vars[i].into) {
4833                     if (m_vars_extra[i].read_rng_dst) {
4834                         if (!get_next_range(m_vars_extra[i].read_rng_dst,
4835                             &offset_dst)) {
4836                                 // destination ranges are over
4837                                 LIBOFFLOAD_ERROR(c_destination_is_over);
4838                                 return false;
4839                         }
4840                     }
4841                     // destination is contiguous.
4842                     else {
4843                         offset_dst = m_vars_extra[i].cpu_disp;
4844                     }
4845                     length_dst_cur = length_dst;
4846                 }
4847                 // same as source
4848                 else {
4849                     offset_dst = offset_src;
4850                     length_dst_cur = length_src;
4851                 }
4852             }
4853             else {
4854                 // if destination is contiguous or its contiguous range is greater
4855                 // than source one
4856                 offset_dst += receive_size;
4857             }
4858             length_dst_cur -= receive_size;
4859             dst_is_empty = length_dst_cur == 0;
4860         }
4861         if (event) {
4862             event =  &m_out_deps[m_out_deps_total++];
4863         }
4864         if (dst_buf != 0) {
4865             res = COI::BufferCopy(
4866                 dst_buf,
4867                 m_vars_extra[i].src_data->mic_buf,
4868                 m_vars_extra[i].cpu_offset + offset_dst,
4869                 m_vars[i].offset + offset_src +
4870                 m_vars[i].mic_offset,
4871                 receive_size,
4872                 COI_COPY_UNSPECIFIED,
4873                 m_num_in_dependencies,
4874                 m_p_in_dependencies,
4875                 event);
4876             if (res != COI_SUCCESS) {
4877                 if (m_status != 0) {
4878                     m_status->result = translate_coi_error(res);
4879                     return false;
4880                 }
4881                 report_coi_error(c_buf_copy, res);
4882             }
4883         }
4884         else {
4885             res = COI::BufferRead(
4886                 m_vars_extra[i].src_data->mic_buf,
4887                 m_vars[i].offset + offset_src +
4888                 m_vars[i].mic_offset,
4889                 base + offset_dst,
4890                 receive_size,
4891                 COI_COPY_UNSPECIFIED,
4892                 m_num_in_dependencies,
4893                 m_p_in_dependencies,
4894                 event);
4895             if (res != COI_SUCCESS) {
4896                 if (m_status != 0) {
4897                     m_status->result = translate_coi_error(res);
4898                     return false;
4899                 }
4900                 report_coi_error(c_buf_read, res);
4901             }
4902         }
4903         received_data += receive_size;
4904     }
4905     while (true);
4906     return true;
4907 }
4908 
4909 bool OffloadDescriptor::receive_pointer_data(bool is_async,
4910                                              bool first_run, void *info)
4911 {
4912     OffloadTimer timer(get_timer_data(), c_offload_host_start_buffers_reads);
4913 
4914     bool should_use_async_buffer_read = m_initial_need_runfunction;
4915     uint64_t ptr_received = 0;
4916     COIRESULT res;
4917 
4918     // For offload_transfer and offload with an empty body and no signal:
4919     // - if there is only one buffer copy - get data synchronously
4920     // - if there are multiple buffer copies and
4921     //      __offload_parallel_copy is false - get data synchronously
4922     // - if there are multiple buffer copies and
4923     //      __offload_parallel_copy is true - get data asynchronously
4924     // This applies only to data larger than __offload_use_async_buffer_read;
4925     // data smaller than __offload_use_async_buffer_read is received synchronously.
4926     // Synchronous transfer results in better performance in COI.
4927     // __offload_parallel_copy is false by default but can be changed
4928     // via the environment variable OFFLOAD_PARALLEL_COPY.
4929     if (!m_initial_need_runfunction && __offload_parallel_copy) {
4930         int big_size_count = 0;
4931 
4932         for (int i = 0; i < m_vars_total; i++) {
4933             if (m_vars[i].direction.out &&
4934                 m_vars[i].size >= __offload_use_async_buffer_read) {
4935                 // preallocated OUT only at second run
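                // (preallocated variables are skipped on the first run and
                // are the only ones counted on the second run)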
4936                 if (first_run == m_vars[i].flags.preallocated) {
4937                     continue;
4938                 }
4939                 switch (m_vars_extra[i].type_src) {
4940                     case c_data:
4941                     case c_void_ptr:
4942                     case c_void_ptr_ptr:
4943                     case c_cean_var:
4944                         if (m_vars[i].flags.is_static) {
4945                             big_size_count++;
4946                         }
4947                         break;
4948                     case c_string_ptr:
4949                     case c_data_ptr:
4950                     case c_string_ptr_ptr:
4951                     case c_data_ptr_ptr:
4952                     case c_cean_var_ptr:
4953                     case c_cean_var_ptr_ptr:
4954                     case c_dv_data:
4955                     case c_dv_ptr_data:
4956                     case c_dv_data_slice:
4957                     case c_dv_ptr_data_slice:
4958                     case c_dv_ptr:
4959                         big_size_count++;
4960                         break;
4961                     default:
4962                         break;
4963                 }
4964             }
4965         }
4966         if (big_size_count > 1) {
4967             should_use_async_buffer_read = true;
4968         }
4969     }
4970     uint32_t in_deps_amount = m_in_deps_total;
4971     COIEVENT *in_deps = m_in_deps_total > 0 ? m_in_deps : 0;
4972 
4973     for (int i = 0; i < m_vars_total; i++) {
4974         uint64_t received_data = m_vars[i].size;
4975 
4976         // Nothing to receive if use_device_ptr
4977         if (m_vars[i].flags.use_device_ptr)
4978             continue;
4979         if (m_vars_extra[i].omp_last_event_type == c_last_read &&
4980             m_out_deps_total > 0) {
4981             m_num_in_dependencies = m_out_deps_total;
4982             m_p_in_dependencies   = m_out_deps;
4983         }
4984         // On the first run do not receive via a preallocated target pointer:
4985         // its value becomes ready only after the call to scatter_copyout_data.
4986         if (first_run && m_vars[i].alloc_if && m_vars[i].flags.preallocated) {
4987             m_preallocated_alloc = true;
4988             // need one more call to OffloadDescriptor::receive_pointer_data
4989             if (m_vars[i].direction.out) {
4990                 m_out_with_preallocated = true;
4991             }
4992             continue;
4993         }
4994         switch (m_vars_extra[i].type_src) {
4995             case c_data_ptr_array:
4996                 break;
4997             case c_data:
4998             case c_void_ptr:
4999             case c_void_ptr_ptr:
5000             case c_cean_var:
5001                 if (m_vars[i].direction.out &&
5002                     m_vars[i].flags.is_static) {
5003                     COIEVENT *event =
5004                         (m_stream != no_stream ||
5005                          is_async ||
5006                          m_in_deps_total > 0 ||
5007                          (should_use_async_buffer_read &&
5008                           m_vars[i].size >= __offload_use_async_buffer_read)) ?
5009                         &m_out_deps[m_out_deps_total++] : 0;
5010                     PtrData *ptr_data = NULL;
5011                     COIBUFFER dst_buf = NULL; // buffer at host
5012                     char *base;
5013 
5014                     if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst)) {
5015                         ptr_data = m_vars[i].into ?
5016                                    m_vars_extra[i].dst_data :
5017                                    m_vars_extra[i].src_data;
5018                     }
5019                     else if (VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_dst)) {
5020                         if (m_vars[i].flags.is_static_dstn) {
5021                             ptr_data = m_vars[i].into ?
5022                                        m_vars_extra[i].dst_data :
5023                                        m_vars_extra[i].src_data;
5024                         }
5025                     }
5026                     dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
5027                     if (dst_buf == NULL) {
5028                         base = offload_get_src_base(
5029                             m_vars[i].into ?
5030                             static_cast<char*>(m_vars[i].into) :
5031                             static_cast<char*>(m_vars[i].ptr),
5032                             m_vars_extra[i].type_dst);
5033                     }
5034 
5035                     if (m_vars[i].flags.is_non_cont_struct ||
5036                         m_vars[i].flags.is_noncont_src ||
5037                         m_vars[i].flags.is_noncont_dst) {
5038                         receive_noncontiguous_pointer_data(
5039                             i, dst_buf, event, received_data,
5040                             m_num_in_dependencies, m_p_in_dependencies);
5041                     }
5042                     else if (dst_buf != 0) {
5043                         res = COI::BufferCopy(
5044                             dst_buf,
5045                             m_vars_extra[i].src_data->mic_buf,
5046                             m_vars_extra[i].cpu_offset +
5047                             m_vars_extra[i].cpu_disp,
5048                             m_vars[i].offset + m_vars[i].disp,
5049                             m_vars[i].size,
5050                             COI_COPY_UNSPECIFIED,
5051                             m_num_in_dependencies,
5052                             m_p_in_dependencies,
5053                             event);
5054                         if (res != COI_SUCCESS) {
5055                             if (m_status != 0) {
5056                                 m_status->result = translate_coi_error(res);
5057                                 return false;
5058                             }
5059                             report_coi_error(c_buf_copy, res);
5060                         }
5061                     }
5062                     else {
5063                         res = COI::BufferRead(
5064                             m_vars_extra[i].src_data->mic_buf,
5065                             m_vars[i].offset + m_vars[i].disp,
5066                             base + m_vars_extra[i].cpu_offset +
5067                             m_vars_extra[i].cpu_disp,
5068                             m_vars[i].size,
5069                             COI_COPY_UNSPECIFIED,
5070                             m_num_in_dependencies,
5071                             m_p_in_dependencies,
5072                             event);
5073                         if (res != COI_SUCCESS) {
5074                             if (m_status != 0) {
5075                                 m_status->result = translate_coi_error(res);
5076                                 return false;
5077                             }
5078                             report_coi_error(c_buf_read, res);
5079                         }
5080                     }
5081                     ptr_received += received_data;
5082                 }
5083                 break;
5084 
5085             case c_string_ptr:
5086             case c_data_ptr:
5087             case c_string_ptr_ptr:
5088             case c_data_ptr_ptr:
5089             case c_cean_var_ptr:
5090             case c_cean_var_ptr_ptr:
5091             case c_dv_data:
5092             case c_dv_ptr_data:
5093             case c_dv_data_slice:
5094             case c_dv_ptr_data_slice:
5095             case c_dv_ptr: {
5096                 COIBUFFER dst_buf = NULL; // buffer on host
5097                 if (m_vars[i].direction.out && m_vars[i].size > 0) {
5098                     COIEVENT *event =
5099                         (m_stream != no_stream ||
5100                          is_async ||
5101                          m_in_deps_total > 0 ||
5102                          (should_use_async_buffer_read &&
5103                           m_vars[i].size >= __offload_use_async_buffer_read)) ?
5104                         &m_out_deps[m_out_deps_total++] : 0;
5105 
5106                     uint64_t dst_offset = 0;
5107                     char *base = static_cast<char*>(m_vars[i].ptr);
5108 
5109                     if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst)) {
5110                         PtrData *ptr_data = m_vars[i].into ?
5111                                             m_vars_extra[i].dst_data :
5112                                             m_vars_extra[i].src_data;
5113                         dst_buf = ptr_data ? ptr_data->cpu_buf : NULL;
5114                         if (dst_buf == NULL) {
5115                             base = m_vars[i].into ?
5116                                    *static_cast<char**>(m_vars[i].into) :
5117                                    *static_cast<char**>(m_vars[i].ptr);
5118                         }
5119                         dst_offset = m_vars_extra[i].cpu_offset +
5120                                      m_vars_extra[i].cpu_disp;
5121                     }
5122                     else if (VAR_TYPE_IS_SCALAR(m_vars_extra[i].type_dst)) {
5123                         if (m_vars[i].flags.is_static_dstn) {
5124                             dst_buf = m_vars[i].into ?
5125                                         m_vars_extra[i].dst_data->cpu_buf :
5126                                         m_vars_extra[i].src_data->cpu_buf;
5127                         }
5128                         if (dst_buf == NULL) {
5129                             base = offload_get_src_base(
5130                                 m_vars[i].into ?
5131                                 static_cast<char*>(m_vars[i].into) :
5132                                 static_cast<char*>(m_vars[i].ptr),
5133                                 m_vars_extra[i].type_dst);
5134                         }
5135                         dst_offset = m_vars_extra[i].cpu_offset +
5136                                      m_vars_extra[i].cpu_disp;
5137                     }
5138                     else if (VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_dst) ||
5139                              VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst)) {
5140                         PtrData *ptr_data = m_vars[i].into != 0 ?
5141                                             m_vars_extra[i].dst_data :
5142                                             m_vars_extra[i].src_data;
5143                         dst_buf = ptr_data != 0 ? ptr_data->cpu_buf : 0;
5144                         if (dst_buf == NULL) {
5145                             base = offload_get_src_base(
5146                                 m_vars[i].into ?
5147                                 static_cast<char*>(m_vars[i].into) :
5148                                 static_cast<char*>(m_vars[i].ptr),
5149                                 m_vars_extra[i].type_dst);
5150 
5151                         }
5152                         dst_offset = m_vars_extra[i].cpu_offset +
5153                                      m_vars_extra[i].cpu_disp;
5154                     }
5155 
5156                     if (m_vars[i].flags.is_non_cont_struct ||
5157                         m_vars[i].flags.is_noncont_src ||
5158                         m_vars[i].flags.is_noncont_dst) {
5159                         receive_noncontiguous_pointer_data(
5160                             i, dst_buf, event, received_data,
5161                             m_num_in_dependencies, m_p_in_dependencies);
5162                     }
5163                     else if (dst_buf != 0) {
5164                         res = COI::BufferCopy(
5165                             dst_buf,
5166                             m_vars_extra[i].src_data->mic_buf,
5167                             dst_offset,
5168                             m_vars[i].offset + m_vars[i].disp +
5169                                 m_vars[i].mic_offset,
5170                             m_vars[i].size,
5171                             COI_COPY_UNSPECIFIED,
5172                             m_num_in_dependencies,
5173                             m_p_in_dependencies,
5174                             event);
5175                         if (res != COI_SUCCESS) {
5176                             if (m_status != 0) {
5177                                 m_status->result = translate_coi_error(res);
5178                                 return false;
5179                             }
5180                             report_coi_error(c_buf_copy, res);
5181                         }
5182                     }
5183                     else {
5184                         res = COI::BufferRead(
5185                             m_vars_extra[i].src_data->mic_buf,
5186                             m_vars[i].offset + m_vars[i].disp +
5187                                 m_vars[i].mic_offset,
5188                             base + dst_offset,
5189                             m_vars[i].size,
5190                             COI_COPY_UNSPECIFIED,
5191                             m_num_in_dependencies,
5192                             m_p_in_dependencies,
5193                             event);
5194                         if (res != COI_SUCCESS) {
5195                             if (m_status != 0) {
5196                                 m_status->result = translate_coi_error(res);
5197                                 return false;
5198                             }
5199                             report_coi_error(c_buf_read, res);
5200                         }
5201                     }
5202                     ptr_received += received_data;
5203                 }
5204                 break;
5205             }
5206 
5207             default:
5208                 break;
5209         }
5210 
5211         if (m_vars_extra[i].omp_last_event_type == c_last_read) {
5212             register_omp_event_call_back(&m_out_deps[m_out_deps_total - 1], info);
5213         }
5214         // destroy buffers for obsolete stacks
5215         if (m_destroy_stack.size() != 0) {
5216             for (PtrDataList::iterator it = m_destroy_stack.begin();
5217                 it != m_destroy_stack.end(); it++) {
5218                 PtrData *ptr_data = *it;
5219                 m_destroy_buffers.push_back(ptr_data->mic_buf);
5220                 OFFLOAD_TRACE(3, "Removing stack buffer with addr %p\n",
5221                                   ptr_data->mic_addr);
5222             }
5223             m_destroy_stack.clear();
5224         }
5225         if (m_vars[i].free_if) {
5226             // remove association for automatic variables
5227             if (m_is_openmp) {
5228                 if (m_vars_extra[i].auto_data) {
5229                     AutoData *auto_data = m_vars_extra[i].auto_data;
5230                     if (m_vars[i].flags.always_delete) {
5231                         auto_data->nullify_reference();
5232                     }
5233                     else if (auto_data->remove_reference() == 0) {
5234                         m_device.remove_auto_data(auto_data->cpu_addr.start());
5235                     }
5236                     continue;
5237                 }
5238                 else {
5239                     PtrData *ptr_data = m_vars_extra[i].src_data;
5240                     if (ptr_data &&
5241                         IS_OPENMP_IMPLICIT_OR_LINK(ptr_data->var_alloc_type)) {
5242                         if (ptr_data->get_reference() > 0) {
5243                             ptr_data->remove_reference();
5244                         }
5245                         continue;
5246                     }
5247                 }
5248             }
5249 
5250             // destroy buffers
5251             if (m_vars[i].direction.out || m_vars[i].into == NULL) {
5252                 if (!VAR_TYPE_IS_PTR(m_vars_extra[i].type_src) &&
5253                     !VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_src) &&
5254                     !VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_src)) {
5255                     continue;
5256                 }
5257 
5258                 PtrData *ptr_data = m_vars_extra[i].src_data;
5259                 if (ptr_data->remove_reference() == 0) {
5260                     // destroy buffers
5261                     if (ptr_data->cpu_buf != 0) {
5262                         m_destroy_buffers.push_back(ptr_data->cpu_buf);
5263                     }
5264                     if (ptr_data->mic_buf != 0) {
5265                         m_destroy_buffers.push_back(ptr_data->mic_buf);
5266                     }
5267                     OFFLOAD_TRACE(3, "Removing association for addr %p\n",
5268                                   ptr_data->cpu_addr.start());
5269 
5270                     // remove association from map
5271                     if (m_vars[i].flags.targetptr) {
5272                         m_device.remove_targetptr_data(ptr_data->cpu_addr.start());
5273                     }
5274                     else {
5275                         m_device.remove_ptr_data(ptr_data->cpu_addr.start());
5276                     }
5277                 }
5278             }
5279             else if (VAR_TYPE_IS_PTR(m_vars_extra[i].type_dst) ||
5280                      VAR_TYPE_IS_DV_DATA_SLICE(m_vars_extra[i].type_dst) ||
5281                      VAR_TYPE_IS_DV_DATA(m_vars_extra[i].type_dst)) {
5282                 PtrData *ptr_data = m_vars_extra[i].dst_data;
5283 
5284                 if (ptr_data->remove_reference() == 0) {
5285                     // destroy buffers
5286                     if (ptr_data->cpu_buf != 0) {
5287                         m_destroy_buffers.push_back(ptr_data->cpu_buf);
5288                     }
5289                     if (ptr_data->mic_buf != 0) {
5290                         m_destroy_buffers.push_back(ptr_data->mic_buf);
5291                     }
5292                     OFFLOAD_TRACE(3, "Removing association for addr %p\n",
5293                                   ptr_data->cpu_addr.start());
5294 
5295                     // remove association from map
5296                     if (m_vars[i].flags.targetptr) {
5297                         m_device.remove_targetptr_data(ptr_data->cpu_addr.start());
5298                     }
5299                     else {
5300                         m_device.remove_ptr_data(ptr_data->cpu_addr.start());
5301                     }
5302                 }
5303             }
5304         }
5305     }
5306 
5307     if (m_status) {
5308         m_status->data_received += ptr_received;
5309     }
5310 
5311     m_num_in_dependencies = m_out_deps_total ? m_out_deps_total :
5312                                                m_num_in_dependencies;
5313     m_p_in_dependencies = m_out_deps_total ? m_out_deps : m_p_in_dependencies;
5314 
5315     OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), ptr_received);
5316     OFFLOAD_DEBUG_TRACE_1(1, GET_OFFLOAD_NUMBER(get_timer_data()),
5317                   c_offload_received_pointer_data,
5318                   "Total pointer data received from target: [%lld] bytes\n",
5319                   ptr_received);
5320 
5321     return true;
5322 }
5323 
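// Scatters the copyout data produced by the target side of the offload back
// into host variables: maps the in/out buffer when the data does not
// immediately follow the function descriptor, reads back timer data,
// non-static scalar values, function pointers and the values of preallocated
// target pointers.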
5324 bool OffloadDescriptor::scatter_copyout_data()
5325 {
5326     OffloadTimer timer(get_timer_data(), c_offload_host_scatter_outputs);
5327 
5328     if (m_need_runfunction && m_out_datalen > 0) {
5329 
5330         // total size that needs to be transferred from target to host
5331         COIMAPINSTANCE map_inst;
5332         COIRESULT res;
5333         char *data;
5334 
5335         // output data buffer
5336         if (m_func_desc->data_offset == 0) {
5337             OffloadTimer timer_map(get_timer_data(),
5338                                    c_offload_host_map_out_data_buffer);
5339 
5340             COIRESULT res = COI::BufferMap(m_inout_buf, 0, m_out_datalen,
5341                                            COI_MAP_READ_ONLY, 0, 0, 0,
5342                                            &map_inst,
5343                                             reinterpret_cast<void**>(&data));
5344             if (res != COI_SUCCESS) {
5345                 if (m_status != 0) {
5346                     m_status->result = translate_coi_error(res);
5347                     return false;
5348                 }
5349                 report_coi_error(c_buf_map, res);
5350             }
5351         }
5352         else {
5353             data = (char*) m_func_desc + m_func_desc->data_offset;
5354         }
5355 
5356         // get timing data
5357         OFFLOAD_TIMER_TARGET_DATA(get_timer_data(), data);
5358         data += OFFLOAD_TIMER_DATALEN();
5359 
5360         // initialize output marshaller
5361         m_out.init_buffer(data, m_out_datalen);
5362 
5363         for (int i = 0; i < m_vars_total; i++) {
5364             bool src_is_for_mic = (m_vars[i].direction.out ||
5365                                    m_vars[i].into == NULL);
5366 
5367             if (m_vars_extra[i].type_src != c_data_ptr_array &&
5368                 m_vars[i].flags.preallocated && m_vars[i].alloc_if) {
5369                 PtrData *ptr_data;
5370                 void *ptr_value;
5371                 void ** cpu_ptr = src_is_for_mic ?
5372                                   reinterpret_cast<void**>(m_vars[i].ptr) :
5373                                   reinterpret_cast<void**>(m_vars[i].into);
5374                 void*   alloc_base = NULL;
5375                 int64_t alloc_disp = 0;
5376                 int64_t alloc_size;
5377                 if (m_vars_extra[i].alloc != NULL) {
5378                     // array descriptor
5379                     const Arr_Desc *ap =
5380                         static_cast<const Arr_Desc*>(m_vars_extra[i].alloc);
5381 
5382                     __arr_data_offset_and_length(ap, alloc_disp, alloc_size);
5383 
5384                     alloc_base = reinterpret_cast<void*>(ap->base);
5385                 }
5386 
5387                 // get pointer to target memory
5388                 m_out.receive_data(&ptr_value, sizeof(void*));
5389 
5390                 // add new entry
5391                 if (!alloc_ptr_data(
5392                     ptr_data,
5393                     ptr_value,
5394                     (alloc_base != NULL) ?
5395                         alloc_disp : m_vars[i].disp,
5396                     (alloc_base != NULL) ?
5397                         alloc_size : m_vars[i].size,
5398                     alloc_disp,
5399                     0,
5400                     m_vars[i].flags.targetptr,
5401                     m_vars[i].flags.preallocated,
5402                     m_vars[i].flags.pin)) {
5403                     return false;
5404                 }
5405 
5406                 ptr_data->add_reference();
5407                 *cpu_ptr = ptr_value;
5408                 if (src_is_for_mic) {
5409                     m_vars_extra[i].src_data = ptr_data;
5410                 }
5411                 else {
5412                     m_vars_extra[i].dst_data = ptr_data;
5413                 }
5414                 m_vars[i].offset = (char*) ptr_value -
5415                                    (char*) ptr_data->cpu_addr.start();
5416             }
5417 
5418             switch (m_vars_extra[i].type_src) {
5419                 case c_data_ptr_array:
5420                     break;
5421                 case c_data:
5422                 case c_void_ptr:
5423                 case c_void_ptr_ptr:
5424                 case c_cean_var:
5425                     if (m_vars[i].direction.out &&
5426                         !m_vars[i].flags.is_static) {
5427 
5428                         if (m_vars[i].into) {
5429                             char *ptr = offload_get_src_base(
5430                                 static_cast<char*>(m_vars[i].into),
5431                                 m_vars_extra[i].type_dst);
5432                             m_out.receive_data(ptr + m_vars_extra[i].cpu_disp,
5433                                                m_vars[i].size);
5434                         }
5435                         else {
5436                             m_out.receive_data(
5437                                 static_cast<char*>(m_vars[i].ptr) +
5438                                     m_vars_extra[i].cpu_disp,
5439                                 m_vars[i].size);
5440                         }
5441                     }
5442                     break;
5443 
5444                 case c_func_ptr:
5445                 case c_func_ptr_ptr:
5446                     if (m_vars[i].direction.out) {
5447                         m_out.receive_func_ptr((const void**) m_vars[i].ptr);
5448                     }
5449                     break;
5450 
5451                 default:
5452                     break;
5453             }
5454         }
5455 
5456         if (m_status) {
5457             m_status->data_received += m_out.get_tfr_size();
5458         }
5459 
5460         if (m_func_desc->data_offset == 0) {
5461             OffloadTimer timer_unmap(get_timer_data(),
5462                                      c_offload_host_unmap_out_data_buffer);
5463 
5464             COIRESULT res = COI::BufferUnmap(map_inst, 0, 0, 0);
5465             if (res != COI_SUCCESS) {
5466                 if (m_status != 0) {
5467                     m_status->result = translate_coi_error(res);
5468                     return false;
5469                 }
5470                 report_coi_error(c_buf_unmap, res);
5471             }
5472         }
5473     }
5474 
5475     OFFLOAD_TIMER_HOST_RDATA(get_timer_data(), m_out.get_tfr_size());
5476     OFFLOAD_TRACE(1, "Total copyout data received from target: [%lld] bytes\n",
5477                   m_out.get_tfr_size());
5478 
5479     return true;
5480 }
5481 
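// Helper: computes the element count for the array descriptor "ap" given the
// element size "el_size".  For a contiguous descriptor it also returns the
// data offset and total size and sets ptr_ranges to NULL; for a
// non-contiguous one it creates the read-ranges structure used to walk the
// data and returns the size of a single range.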
5482 static void get_arr_desc_numbers(
5483     const Arr_Desc *ap,
5484     int64_t el_size,
5485     int64_t &offset,
5486     int64_t &size,
5487     int     &el_number,
5488     CeanReadRanges* &ptr_ranges
5489 )
5490 {
5491     if (is_arr_desc_contiguous(ap)) {
5492         ptr_ranges = NULL;
5493         __arr_data_offset_and_length(ap, offset, size);
5494         el_number = size / el_size;
5495     }
5496     else {
5497         ptr_ranges = init_read_ranges_arr_desc(ap);
5498         el_number = (ptr_ranges->range_size / el_size) *
5499                     ptr_ranges->range_max_number;
5500         size = ptr_ranges->range_size;
5501     }
5502 }
5503 
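// Expands the pointer-array var_desc m_vars[i] (described by a VarDesc3) into
// one var_desc per transferred pointer.  The new descriptors are appended at
// the end of the m_vars/m_vars_extra arrays, which are grown accordingly.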
5504 bool OffloadDescriptor::gen_var_descs_for_pointer_array(int i)
5505 {
5506     int             pointers_number;
5507     int             tmp_val;
5508     int             new_index = m_vars_total;
5509     const Arr_Desc *ap;
5510     const VarDesc3 *vd3 = static_cast<const VarDesc3*>(m_vars[i].ptr);
5511     int             flags = vd3->array_fields;
5512     bool            src_is_for_mic = (m_vars[i].direction.out ||
5513                                       m_vars[i].into == NULL);
5514 
5515     ReadArrElements<void *>  ptr;
5516     ReadArrElements<void *>  into;
5517     ReadArrElements<int64_t> ext_start;
5518     ReadArrElements<int64_t> ext_elements;
5519     ReadArrElements<int64_t> align;
5520     ReadArrElements<int64_t> alloc_if;
5521     ReadArrElements<int64_t> free_if;
5522     ReadArrElements<int64_t> into_start;
5523     ReadArrElements<int64_t> into_elem;
5524     ReadArrElements<int64_t> alloc_start;
5525     ReadArrElements<int64_t> alloc_elem;
5526 
5527 
5528     ap = static_cast<const Arr_Desc*>(vd3->ptr_array);
5529 
5530     // 1. Get "pointers_number" - the total number of transferred pointers.
5531     //    For each of them we create a new var_desc and append it at the end
5532     //    of the var_desc array.
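    // For example (illustrative only): a clause transferring an array of 3
    // pointers appends 3 new var_descs at indices m_vars_total .. m_vars_total+2
    // and records the first new index in m_vars[i].ptr_arr_offset at the end
    // of this routine.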
5533     get_arr_desc_numbers(ap, sizeof(void *), ptr.offset, ptr.size,
5534         pointers_number, ptr.ranges);
5535     ptr.base = reinterpret_cast<char*>(ap->base);
5536 
5537     // 2. Prepare memory for the new var_descs
5538     m_vars_total += pointers_number;
5539     m_vars       = (VarDesc*)realloc(m_vars, m_vars_total * sizeof(VarDesc));
5540     if (m_vars == NULL)
5541       LIBOFFLOAD_ERROR(c_malloc);
5542     m_vars_extra =
5543         (VarExtra*)realloc(m_vars_extra, m_vars_total * sizeof(VarExtra));
5544     if (m_vars_extra == NULL)
5545       LIBOFFLOAD_ERROR(c_malloc);
5546     m_in_deps    =
5547         (COIEVENT*)realloc(m_in_deps, sizeof(COIEVENT) * (m_vars_total + 1));
5548     if (m_in_deps == NULL)
5549       LIBOFFLOAD_ERROR(c_malloc);
5550     m_out_deps   =
5551         (COIEVENT*)realloc(m_out_deps, sizeof(COIEVENT) * m_vars_total);
5552     if (m_out_deps == NULL)
5553       LIBOFFLOAD_ERROR(c_malloc);
5554 
5555     // 3. Prepare for reading new var_desc's fields
5556     //    EXTENT START
5557     if ((flags & (1<<flag_extent_start_is_array)) != 0) {
5558         ap = static_cast<const Arr_Desc*>(vd3->extent_start);
5559         get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, ext_start.offset,
5560             ext_start.size, tmp_val, ext_start.ranges);
5561         ext_start.base = reinterpret_cast<char*>(ap->base);
5562         ext_start.el_size = ap->dim[ap->rank - 1].size;
5563 
5564         if (tmp_val < pointers_number) {
5565             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
5566             return false;
5567         }
5568     }
5569     else if ((flags & (1<<flag_extent_start_is_scalar)) != 0) {
5570         ext_start.val = (int64_t)vd3->extent_start;
5571     }
5572     else {
5573         ext_start.val = 0;
5574     }
5575 
5576     //    EXTENT ELEMENTS NUMBER
5577     if ((flags & (1<<flag_extent_elements_is_array)) != 0) {
5578         ap = static_cast<const Arr_Desc*>(vd3->extent_elements);
5579         get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
5580             ext_elements.offset, ext_elements.size,
5581             tmp_val, ext_elements.ranges);
5582         ext_elements.base = reinterpret_cast<char*>(ap->base);
5583         ext_elements.el_size = ap->dim[ap->rank - 1].size;
5584 
5585         if (tmp_val < pointers_number) {
5586             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
5587             return false;
5588         }
5589     }
5590     else if ((flags & (1<<flag_extent_elements_is_scalar)) != 0) {
5591         ext_elements.val = (int64_t)vd3->extent_elements;
5592     }
5593     else {
5594         ext_elements.val = m_vars[i].count;
5595     }
5596 
5597     //    ALLOC_IF
5598     if ((flags & (1<<flag_alloc_if_is_array)) != 0) {
5599         ap = static_cast<const Arr_Desc*>(vd3->alloc_if_array);
5600         get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_if.offset,
5601             alloc_if.size, tmp_val, alloc_if.ranges);
5602         alloc_if.base = reinterpret_cast<char*>(ap->base);
5603         alloc_if.el_size = ap->dim[ap->rank - 1].size;
5604 
5605         if (tmp_val < pointers_number) {
5606             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
5607             return false;
5608         }
5609     }
5610     else {
5611         alloc_if.val = m_vars[i].alloc_if;
5612     }
5613 
5614     //    FREE_IF
5615     if ((flags & (1<<flag_free_if_is_array)) != 0) {
5616         ap = static_cast<const Arr_Desc*>(vd3->free_if_array);
5617         get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, free_if.offset,
5618             free_if.size, tmp_val, free_if.ranges);
5619         free_if.base = reinterpret_cast<char*>(ap->base);
5620         free_if.el_size = ap->dim[ap->rank - 1].size;
5621 
5622         if (tmp_val < pointers_number) {
5623             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
5624             return false;
5625         }
5626     }
5627     else {
5628         free_if.val = m_vars[i].free_if;
5629     }
5630 
5631     //    ALIGN
5632 
5633     if ((flags & (1<<flag_align_is_array)) != 0) {
5634         ap = static_cast<const Arr_Desc*>(vd3->align_array);
5635         get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, align.offset,
5636             align.size, tmp_val, align.ranges);
5637         align.base = reinterpret_cast<char*>(ap->base);
5638         align.el_size = ap->dim[ap->rank - 1].size;
5639 
5640         if (tmp_val < pointers_number) {
5641             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
5642             return false;
5643         }
5644     }
5645     else {
5646         align.val = m_vars[i].align;
5647     }
5648 
5649     // 3.1 INTO
5650 
5651     if (m_vars[i].into) {
5652         ap = static_cast<const Arr_Desc*>(m_vars[i].into);
5653         get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into.offset,
5654             into.size, tmp_val, into.ranges);
5655         into.base = reinterpret_cast<char*>(ap->base);
5656 
5657         if (tmp_val < pointers_number) {
5658             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
5659             return false;
5660         }
5661     }
5662 
5663     // 3.2 INTO_START
5664 
5665     if ((flags & (1<<flag_into_start_is_array)) != 0) {
5666         ap = static_cast<const Arr_Desc*>(vd3->into_start);
5667         get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_start.offset,
5668             into_start.size, tmp_val, into_start.ranges);
5669         into_start.base = reinterpret_cast<char*>(ap->base);
5670         into_start.el_size = ap->dim[ap->rank - 1].size;
5671 
5672         if (tmp_val < pointers_number) {
5673             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
5674             return false;
5675         }
5676     }
5677     else if ((flags & (1<<flag_into_start_is_scalar)) != 0) {
5678         into_start.val = (int64_t)vd3->into_start;
5679     }
5680     else {
5681         into_start.val = 0;
5682     }
5683 
5684     // 3.3 INTO_ELEMENTS
5685 
5686     if ((flags & (1<<flag_into_elements_is_array)) != 0) {
5687         ap = static_cast<const Arr_Desc*>(vd3->into_elements);
5688         get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, into_elem.offset,
5689             into_elem.size, tmp_val, into_elem.ranges);
5690         into_elem.base = reinterpret_cast<char*>(ap->base);
5691         into_elem.el_size = ap->dim[ap->rank - 1].size;
5692 
5693         if (tmp_val < pointers_number) {
5694             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
5695             return false;
5696         }
5697     }
5698     else if ((flags & (1<<flag_into_elements_is_scalar)) != 0) {
5699         into_elem.val = (int64_t)vd3->into_elements;
5700     }
5701     else {
5702         into_elem.val = m_vars[i].count;
5703     }
5704 
5705     //    alloc_start
5706 
5707     if ((flags & (1<<flag_alloc_start_is_array)) != 0) {
5708         ap = static_cast<const Arr_Desc*>(vd3->alloc_start);
5709         get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size,
5710             alloc_start.offset, alloc_start.size, tmp_val,
5711             alloc_start.ranges);
5712         alloc_start.base = reinterpret_cast<char*>(ap->base);
5713         alloc_start.el_size = ap->dim[ap->rank - 1].size;
5714 
5715         if (tmp_val < pointers_number) {
5716             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
5717             return false;
5718         }
5719     }
5720     else if ((flags & (1<<flag_alloc_start_is_scalar)) != 0) {
5721         alloc_start.val = (int64_t)vd3->alloc_start;
5722     }
5723     else {
5724         alloc_start.val = 0;
5725     }
5726 
5727     //    alloc_elem
5728 
5729     if ((flags & (1<<flag_alloc_elements_is_array)) != 0) {
5730         ap = static_cast<const Arr_Desc*>(vd3->alloc_elements);
5731         get_arr_desc_numbers(ap, ap->dim[ap->rank - 1].size, alloc_elem.offset,
5732             alloc_elem.size, tmp_val, alloc_elem.ranges);
5733         alloc_elem.base = reinterpret_cast<char*>(ap->base);
5734         alloc_elem.el_size = ap->dim[ap->rank - 1].size;
5735         if (tmp_val < pointers_number) {
5736             LIBOFFLOAD_ERROR(c_pointer_array_mismatch,
5737                              "alloc_extent elements");
5738             return false;
5739         }
5740     }
5741     else if ((flags & (1<<flag_alloc_elements_is_scalar)) != 0) {
5742         alloc_elem.val = (int64_t)vd3->alloc_elements;
5743     }
5744     else {
5745         alloc_elem.val = 0;
5746     }
5747 
5748     for (int k = 0; k < pointers_number; k++) {
5749         int type = flags & 0x3f;
5750         int type_src, type_dst;
5751         //  Get new values
5752         // type_src, type_dst
5753         type_src = type_dst = (type == c_data_ptr_array) ?
5754                               c_data_ptr   : (type == c_func_ptr_array) ?
5755                               c_func_ptr   : (type == c_void_ptr_array) ?
5756                               c_void_ptr   : (type == c_string_ptr_array) ?
5757                               c_string_ptr : 0;
5758 
5759         // Get ptr val
5760         if (!ptr.read_next(true)) {
5761             break;
5762         }
5763         else {
5764             ptr.val = (void*)(ptr.base + ptr.offset);
5765         }
5766 
5767         // !!! If we get an error at the reading phase, it is an internal
5768         // !!! error, as any mismatch must have been detected earlier
5769 
5770         // Get into val
5771         if (m_vars[i].into) {
5772             if (!into.read_next(true)) {
5773                 LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into");
5774                 LIBOFFLOAD_ABORT;
5775             }
5776             else {
5777                 into.val = (void*)(into.base + into.offset);
5778             }
5779         }
5780 
5781         // Get other components of the clause
5782         if (!ext_start.read_next(flags & (1<<flag_extent_start_is_array))) {
5783             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent start");
5784             LIBOFFLOAD_ABORT;
5785         }
5786         if (!ext_elements.read_next(
5787                 flags & (1<<flag_extent_elements_is_array))) {
5788             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "extent elements");
5789             LIBOFFLOAD_ABORT;
5790         }
5791         if (!alloc_if.read_next(flags & (1<<flag_alloc_if_is_array))) {
5792             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_if");
5793             LIBOFFLOAD_ABORT;
5794         }
5795         if (!free_if.read_next(flags & (1<<flag_free_if_is_array))) {
5796             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "free_if");
5797             LIBOFFLOAD_ABORT;
5798         }
5799         if (!align.read_next(flags & (1<<flag_align_is_array))) {
5800             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "align");
5801             LIBOFFLOAD_ABORT;
5802         }
5803         if (!into_start.read_next(flags & (1<<flag_into_start_is_array))) {
5804             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent start");
5805             LIBOFFLOAD_ABORT;
5806         }
5807         if (!into_elem.read_next(flags & (1<<flag_into_elements_is_array))) {
5808             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "into_extent elements");
5809             LIBOFFLOAD_ABORT;
5810         }
5811         if (!alloc_start.read_next(flags & (1<<flag_alloc_start_is_array))) {
5812             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent start");
5813             LIBOFFLOAD_ABORT;
5814         }
5815         if (!alloc_elem.read_next(
5816                  flags & (1<<flag_alloc_elements_is_array))) {
5817             LIBOFFLOAD_ERROR(c_pointer_array_mismatch, "alloc_extent elements");
5818             LIBOFFLOAD_ABORT;
5819         }
5820 
5821         m_vars[new_index + k].direction.bits = m_vars[i].direction.bits;
5822         m_vars[new_index + k].alloc_if = alloc_if.val;
5823         m_vars[new_index + k].free_if = free_if.val;
5824         m_vars[new_index + k].align = align.val;
5825         m_vars[new_index + k].mic_offset = 0;
5826         m_vars[new_index + k].flags.bits = m_vars[i].flags.bits;
5827         m_vars[new_index + k].flags.is_pointer = 0;
5828         m_vars[new_index + k].offset = 0;
5829         m_vars[new_index + k].size = m_vars[i].size;
5830         m_vars[new_index + k].flags.targetptr = m_vars[i].flags.targetptr;
5831         m_vars[new_index + k].flags.preallocated =
5832                                              m_vars[i].flags.preallocated;
5833 
5834         if (ext_start.val == 0) {
5835             m_vars[new_index + k].count = ext_elements.val;
5836             m_vars[new_index + k].ptr = ptr.val;
5837             if (type_src == c_string_ptr) {
5838                 m_vars[new_index + k].size = 0;
5839             }
5840         }
5841         else {
5842             m_vars[new_index + k].count = 0;
5843             m_vars[new_index + k].ptr =
5844                 static_cast<void*>(make_arr_desc(
5845                 ptr.val,
5846                 ext_start.val,
5847                 ext_elements.val,
5848                 m_vars[i].size));
5849 
5850             type_src = type_src == c_data_ptr ? c_cean_var_ptr :
5851                        type_src == c_string_ptr ? c_cean_var_ptr :
5852                        type_src;
5853             if (!m_vars[i].into) {
5854                 type_dst = type_src;
5855             }
5856         }
5857 
5858         if (m_vars[i].into && into_elem.val != 0) {
5859             m_vars[new_index + k].into =
5860                 static_cast<void*>(make_arr_desc(
5861                 into.val,
5862                 into_start.val,
5863                 into_elem.val,
5864                 m_vars[i].size));
5865             type_dst = (type == c_data_ptr_array) ? c_cean_var_ptr :
5866                        (type == c_string_ptr_array) ? c_cean_var_ptr :
5867                         type_src;
5868         }
5869         else {
5870             m_vars[new_index + k].into = NULL;
5871         }
5872 
5873         if (alloc_elem.val != 0) {
5874             m_vars[new_index + k].alloc =
5875                 static_cast<void*>(make_arr_desc(
5876                 ptr.val,
5877                 alloc_start.val,
5878                 alloc_elem.val,
5879                 m_vars[i].size));
5880         }
5881         else {
5882             m_vars[new_index + k].alloc = NULL;
5883         }
5884 
5885         m_vars[new_index + k].type.src =
5886             m_vars_extra[new_index + k].type_src = type_src;
5887         m_vars[new_index + k].type.dst =
5888             m_vars_extra[new_index + k].type_dst = type_dst;
5889 
5890         m_vars_extra[new_index + k].alloc = m_vars[new_index + k].alloc;
5891         m_vars_extra[new_index + k].is_arr_ptr_el = 1;
5892         m_vars_extra[new_index + k].ptr_arr_offset =
5893             src_is_for_mic ? ptr.offset : into.offset;
5894     }
5895     // The count and alloc fields are not used on the target, so they can be
5896     // reused for pointer arrays.
5897     m_vars[i].count = pointers_number;
5898     m_vars[i].ptr_arr_offset = new_index;
5899     return true;
5900 }
5901 
5902 // Gets the in-dependencies of the previous offload on the stream "m_stream".
5903 // Out argument in_deps_amount - number of dependencies.
5904 // Out argument in_deps - array of dependencies.
5905 // Description of the dependency scheme for streams:
5906 // ----------------------------------------------------
5907 // Every offload forms a DAG consisting of 3 nodes:
5908 // in-transfers, the run-function and out-transfers.
5909 // Every node has in-dependencies and out-dependencies.
5910 // The out-dependencies of the previous node form the in-dependencies of the
5911 // current node. Without streams the in-dependencies of the 1st node (the
5912 // in-transfers) are NULL. With streams the in-dependencies of the 1st node
5913 // are the out-dependencies of the last node of the previous offload on the
5914 // same stream. Thus the DAGs of 2 consecutive offloads on the same stream
5915 // are chained together as described above.
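// Illustrative example (added for clarity): for two consecutive offloads A
// and B issued on the same stream,
//
//   A:  in-transfers -> run-function -> out-transfers
//   B:  in-transfers -> run-function -> out-transfers
//
// the in-dependencies of B's first node are the out-dependencies of A's last
// node that produced events: A's out-transfers if it had any, otherwise the
// events of its pointer sends / run-function (see the code below).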
5916 void OffloadDescriptor::get_stream_in_dependencies(
5917     uint32_t &in_deps_amount,
5918     COIEVENT* &in_deps
5919 )
5920 {
5921     if (m_stream != no_stream && m_stream != 0) {
5922         Stream * stream = Stream::find_stream(m_stream, false);
5923         if (!stream) {
5924             LIBOFFLOAD_ERROR(c_offload_no_stream,
5925                              m_device.get_logical_index());
5926             LIBOFFLOAD_ABORT;
5927         }
5928         OffloadDescriptor* offload = stream->get_last_offload();
5929 
5930         // if it's the first offload in the stream
5931         if (!offload) {
5932             return;
5933         }
5934         // if the last offload has out-transfers
5935         if (offload->m_out_deps_total) {
5936             in_deps_amount = offload->m_out_deps_total;
5937             in_deps = offload->m_out_deps;
5938         }
5939         // the last offload only sends pointer data, runs the function, or both,
5940         // and has no out-transfers
5941         else if (offload->m_in_deps_total) {
5942             in_deps_amount = offload->m_in_deps_total;
5943             in_deps = offload->m_in_deps;
5944         }
5945     }
5946 }
5947 
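// Library cleanup: releases the engine array, the proxy filesystem root and
// the saved KNC/KNL library search paths, destroys the per-thread key and,
// if COI was loaded, unloads it.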
5948 static void __offload_fini_library(void)
5949 {
5950     OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ...\n");
5951     if (mic_engines_total > 0) {
5952         delete[] mic_engines;
5953         mic_engines_total = 0;
5954 
5955         if (mic_proxy_fs_root != 0) {
5956             free(mic_proxy_fs_root);
5957             mic_proxy_fs_root = 0;
5958         }
5959 
5960         if (knc_library_path != 0) {
5961             free(knc_library_path);
5962             knc_library_path = 0;
5963         }
5964 
5965         if (knl_library_path != 0) {
5966             free(knl_library_path);
5967             knl_library_path = 0;
5968         }
5969 
5970         // destroy thread key
5971         thread_key_delete(mic_thread_key);
5972     }
5973 
5974     // unload COI library
5975     if (COI::is_available) {
5976         COI::fini();
5977     }
5978 
5979     OFFLOAD_DEBUG_TRACE(2, "Cleanup offload library ... done\n");
5980 }
5981 
5982 typedef std::pair<int, micLcpuMask*> deviceLcpu;
5983 typedef std::list<deviceLcpu> deviceLcpuList;
5984 
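// Parses an OFFLOAD_DEVICES-style value "env_var" and appends to
// device_cpu_list one (physical device index, optional logical CPU mask) pair
// per device specification.  Returns the number of devices specified, or 0 if
// the value could not be parsed.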
5985 static int process_offload_devices(
5986     const char *env_var,
5987     uint32_t num_devices,
5988     deviceLcpuList &device_cpu_list
5989 )
5990 {
5991     // The value is a comma-separated list of physical device indexes, each
5992     // optionally qualified by a logical CPU subset, e.g. 0[60,70-80]
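    // For example (illustrative only): with num_devices == 2 the value
    // "1,0[4-7,12]" yields two list entries:
    //   (1, NULL)              - physical device 1, no CPU restriction
    //   (0, mask{4,5,6,7,12})  - physical device 0, logical CPUs 4-7 and 12
    // and the function returns 2.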
5993     char *buf = strdup(env_var);
5994     if (buf == NULL)
5995         LIBOFFLOAD_ERROR(c_malloc);
5996     char *str = buf;
5997     bool device_set_finished = false;
5998     int num_devices_specified = 0;
5999     do {
6000         char *dev_ptr = str;
6001         int dev_len = strcspn(str, "[,");
6002         micLcpuMask* cpu_mask = 0;
6003         if (str[dev_len] == '[') {
6004             // CPU subset specified
6005             cpu_mask = new micLcpuMask;
6006             cpu_mask->reset();
6007             char *cpu_ptr = str + dev_len + 1;
6008             do {
6009                 int64_t cnum;
6010                 bool cpu_set_finished = false;
6011                 int cpu_len = strcspn(cpu_ptr, ",-]");
6012                 if (cpu_ptr[cpu_len] == ',' || cpu_ptr[cpu_len] == ']') {
6013                     // A single CPU specified
6014                     cpu_set_finished = cpu_ptr[cpu_len] == ']';
6015                     cpu_ptr[cpu_len] = '\0';
6016                     // Convert cpu string to an int
6017                     if (!__offload_parse_int_string(cpu_ptr, cnum)) {
6018                         LIBOFFLOAD_ERROR(c_mic_init7);
6019                         delete cpu_mask;
6020                         free(buf);
6021                         return 0;
6022                     } else {
6023                         OFFLOAD_DEBUG_TRACE(3,
6024                             "Single CPU %d selected\n", cnum);
6025                         cpu_mask->set(cnum);
6026                     }
6027                     cpu_ptr = cpu_ptr + cpu_len + 1;
6028                     if (cpu_set_finished) {
6029                         break;
6030                     }
6031                 } else if (cpu_ptr[cpu_len] == '-') {
6032                     int64_t range_start, range_end;
6033                     // A range of CPUs specified
6034                     cpu_ptr[cpu_len] = '\0';
6035                     // Convert cpu string to an int
6036                     if (!__offload_parse_int_string(cpu_ptr, range_start)) {
6037                         LIBOFFLOAD_ERROR(c_mic_init8);
6038                         delete cpu_mask;
6039                         free(buf);
6040                         return 0;
6041                     } else {
6042                         OFFLOAD_DEBUG_TRACE(3,
6043                             "Start of CPU range specified as %d\n",
6044                             range_start);
6045                         cpu_ptr = cpu_ptr + cpu_len + 1;
6046                         cpu_len = strcspn(cpu_ptr, ",]");
6047                         if (cpu_ptr[cpu_len] == ',' ||
6048                             cpu_ptr[cpu_len] == ']') {
6049                             cpu_set_finished = cpu_ptr[cpu_len] == ']';
6050                             cpu_ptr[cpu_len] = '\0';
6051                             // Convert cpu string to an int
6052                             if (!__offload_parse_int_string(
6053                                 cpu_ptr, range_end)) {
6054                                 LIBOFFLOAD_ERROR(c_mic_init9);
6055                                 delete cpu_mask;
6056                                 free(buf);
6057                                 return 0;
6058                             } else {
6059                                 OFFLOAD_DEBUG_TRACE(3,
6060                                     "End of CPU range specified as %d\n",
6061                                     range_end);
6062                                 if (range_end < range_start) {
6063                                     LIBOFFLOAD_ERROR(c_mic_init10);
6064                                     delete cpu_mask;
6065                                     free(buf);
6066                                     return 0;
6067                                 } else {
6068                                     for (int i=range_start; i<=range_end; i++)
6069                                     {
6070                                         OFFLOAD_DEBUG_TRACE(3,
6071                                           "CPU %d selected as part of range\n",
6072                                           i);
6073                                         cpu_mask->set(i);
6074                                     }
6075                                     cpu_ptr = cpu_ptr + cpu_len + 1;
6076                                     if (cpu_set_finished) {
6077                                         break;
6078                                     }
6079                                 }
6080                             }
6081                         } else {
6082                             LIBOFFLOAD_ERROR(c_mic_init10);
6083                             delete cpu_mask;
6084                             free(buf);
6085                             return 0;
6086                         }
6087                     }
6088                 } else {
6089                     // Error: expected , or - or ]
6090                     LIBOFFLOAD_ERROR(c_mic_init11);
6091                     delete cpu_mask;
6092                     free(buf);
6093                     return 0;
6094                 }
6095             } while (true);
6096             // Point to next device specification
6097             str = cpu_ptr;
6098             if (*str == '\0') {
6099                 device_set_finished = true;
6100             } else {
6101                 // Skip the comma after a device specification
6102                 str++;
6103             }
6104         } else if (str[dev_len] == ',') {
6105             // CPU subset not specified
6106             // Point to next device specification
6107             str = str + dev_len + 1;
6108         } else {
6109             // No more device specifications
6110             device_set_finished = true;
6111         }
6112         dev_ptr[dev_len] = '\0';
6113         // Convert device string to an int
6114         int64_t num;
6115         if (!__offload_parse_int_string(dev_ptr, num)) {
6116             LIBOFFLOAD_ERROR(c_mic_init5);
6117             delete cpu_mask;
6118             free(buf);
6119             return 0;
6120         }
6121         if (num < 0 || num >= num_devices) {
6122             LIBOFFLOAD_ERROR(c_mic_init6, num);
6123             delete cpu_mask;
6124             free(buf);
6125             return 0;
6126         }
6127 	    OFFLOAD_DEBUG_TRACE(3, "Offloadable MIC = %d\n", num);
6128         // Save the specified physical device and cpu mask
6129         device_cpu_list.push_back(make_pair(num, cpu_mask));
6130         num_devices_specified++;
6131 
6132         if (device_set_finished) {
6133             break;
6134         }
6135     } while (true);
6136 
6137     free(buf);
6138     return num_devices_specified;
6139 }
6140 
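// One-time library initialization: processes the trace/report/timer and
// OFFLOAD_*/MIC_* environment variables, initializes COI, determines which
// devices (and which logical CPUs on them) may be used for offloading, and
// builds the logical-to-physical engine mapping in mic_engines.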
6141 static void __offload_init_library_once(void)
6142 {
6143     COIRESULT res;
6144     uint32_t num_devices;
6145     deviceLcpuList device_cpu_list;
6146     prefix = report_get_message_str(c_report_host);
6147 
6148     // initialize trace
6149     const char *env_var = getenv(htrace_envname);
6150     if (env_var != 0 && *env_var != '\0') {
6151         int64_t new_val;
6152         if (__offload_parse_int_string(env_var, new_val)) {
6153             console_enabled = new_val & 0x0f;
6154         }
6155     }
6156 
6157 	OFFLOAD_DEBUG_TRACE(2, "---- Start of environment variable processing\n");
6158     env_var = getenv(offload_report_envname);
6159     if (env_var != 0 && *env_var != '\0') {
6160 	    OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6161             offload_report_envname, env_var);
6162         int64_t env_val;
6163         if (__offload_parse_int_string(env_var, env_val)) {
6164             if (env_val == OFFLOAD_REPORT_1 ||
6165                 env_val == OFFLOAD_REPORT_2 ||
6166                 env_val == OFFLOAD_REPORT_3) {
6167                 offload_report_level = env_val;
6168 	            OFFLOAD_DEBUG_TRACE(2, "Offload report level set to %d\n",
6169                     offload_report_level);
6170             }
6171             else {
6172                 LIBOFFLOAD_ERROR(c_invalid_env_report_value,
6173                                  offload_report_envname);
6174             }
6175         }
6176         else {
6177             LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
6178                              offload_report_envname);
6179         }
6180     }
6181     else if (!offload_report_level) {
6182         env_var = getenv(timer_envname);
6183         if (env_var != 0 && *env_var != '\0') {
6184 	        OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n", timer_envname, env_var);
6185             timer_enabled = atoi(env_var);
6186 	        OFFLOAD_DEBUG_TRACE(2, "Timer enable flag set to %d\n",
6187                 timer_enabled);
6188         }
6189     }
6190 
6191     // initialize COI
6192     if (!COI::init()) {
6193         return;
6194     }
6195 
6196     // Process OFFLOAD_NODES, specification of physical MICs available
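    // For example (illustrative only): OFFLOAD_NODES=0,2 makes physical MICs
    // 0 and 2 available, is forwarded to COI as COI_OFFLOAD_NODES=0,2, and is
    // counted below as 2 offloadable MICs.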
6197     env_var = getenv("OFFLOAD_NODES");
6198     if (env_var != 0 && *env_var != '\0') {
6199 	    OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_NODES=%s\n", env_var);
6200 		// Pass env var on to COI
6201         char * new_env_var =
6202                    (char*) malloc(sizeof("COI_OFFLOAD_NODES=") +
6203                                   strlen(env_var) + 1);
6204         if (new_env_var == NULL)
6205             LIBOFFLOAD_ERROR(c_malloc);
6206         sprintf(new_env_var, "COI_OFFLOAD_NODES=%s", env_var);
6207         putenv(new_env_var);
6208 	    OFFLOAD_DEBUG_TRACE(2, "Setting COI_OFFLOAD_NODES = %s \n", getenv("COI_OFFLOAD_NODES"));
6209 
6210         // The value is a comma-separated list of physical device indexes
6211         char *buf = strdup(env_var);
6212         if (buf == NULL)
6213             LIBOFFLOAD_ERROR(c_malloc);
6214         char *str, *ptr;
6215 		int num_mics = 0;
6216         for (str = strtok_r(buf, ",", &ptr); str != 0;
6217             str = strtok_r(0, ",", &ptr)) {
6218             // count this MIC
6219             num_mics++;
6220         }
6221 	    OFFLOAD_DEBUG_TRACE(2, "Number of offloadable MICs = %d\n", num_mics);
6222         free(buf);
6223     }
6224     else {
6225 	    OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_NODES is not set\n");
6226     }
6227 
6228     // get number of devices installed in the system
6229     res = COI::EngineGetCount(COI_ISA_MIC, &num_devices);
6230     if (res != COI_SUCCESS) {
6231         return;
6232     }
6233 
6234     if (num_devices > MIC_ENGINES_MAX) {
6235         num_devices = MIC_ENGINES_MAX;
6236     }
6237 
6238     // Determine devices & cpus that can be used for offloading
6239     env_var = getenv("OFFLOAD_DEVICES");
6240     if (env_var != 0 && *env_var != '\0') {
6241 	    OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_DEVICES=%s\n", env_var);
6242         if (strcasecmp(env_var, "none") != 0) {
6243             mic_engines_total =
6244                 process_offload_devices(
6245                     env_var, num_devices, device_cpu_list);
6246             if (mic_engines_total > 0) {
6247                 OFFLOAD_DEBUG_TRACE(2, "Valid value, %d device(s) specified\n",
6248                        mic_engines_total);
6249             }
6250             else {
6251                 OFFLOAD_DEBUG_TRACE(2, "Invalid value, will not offload\n");
6252                 return;
6253             }
6254         }
6255         else {
6256             // No need to continue since there are no offload devices
6257             return;
6258         }
6259     }
6260     else {
6261 	    OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_DEVICES is not set\n");
6262     }
6263     if (mic_engines_total == 0) {
6264         // Fallback to using all available devices and all CPUs on each
6265         OFFLOAD_DEBUG_TRACE(2, "Fallback to all devices\n");
6266         device_cpu_list.clear();
6267         mic_engines_total = 0;
6268         for (int i = 0; i < num_devices; i++) {
6269             COIENGINE engine;
6270             res = COI::EngineGetHandle(COI_ISA_MIC, i, &engine);
6271             if (res == COI_SUCCESS) {
6272                 device_cpu_list.push_back(make_pair(i, (micLcpuMask*)0));
6273                 OFFLOAD_DEBUG_TRACE(2, "Device %d is available\n", i);
6274                 mic_engines_total++;
6275             }
6276         }
6277     }
6278 
6279     // no need to continue if there are no devices to offload to
6280     if (mic_engines_total <= 0) {
6281         return;
6282     }
6283 
6284     // Initialize indexes for available devices
6285     mic_engines = new Engine[mic_engines_total];
6286     std::list<deviceLcpu>::iterator deviceIterator;
6287     int l_idx = 0;
6288     for (deviceIterator = device_cpu_list.begin();
6289          deviceIterator != device_cpu_list.end();
6290          deviceIterator++)
6291     {
6292         deviceLcpu device_mask_pair = *deviceIterator;
6293         int device_num = device_mask_pair.first;
6294         micLcpuMask *device_mask = device_mask_pair.second;
6295 
6296         mic_engines[l_idx].set_indexes(l_idx, device_num);
6297         mic_engines[l_idx].set_cpu_mask(device_mask);
6298         OFFLOAD_DEBUG_TRACE(2,
6299             "Logical MIC%d => Physical MIC%d\n", l_idx, device_num);
6300         if (device_mask != NULL) {
6301             std::string cpu_string =
6302                 device_mask->to_string<
6303                     char,
6304                     std::string::traits_type,
6305                     std::string::allocator_type>();
6306             OFFLOAD_DEBUG_TRACE(2, "    CPUs: %s\n", cpu_string.data());
6307         }
6308         else {
6309             OFFLOAD_DEBUG_TRACE(2, "    CPUs: all\n");
6310         }
6311         l_idx++;
6312     }
6313 
6314     // Get DMA channel count to pass it to COI
6315     env_var = getenv("OFFLOAD_DMA_CHANNEL_COUNT");
6316     if (env_var != 0 && *env_var != '\0') {
6317 	    OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_DMA_CHANNEL_COUNT=%s\n", env_var);
6318         int64_t new_val;
6319         if (__offload_parse_int_string(env_var, new_val)) {
6320             mic_dma_channel_count = new_val;
6321 	        OFFLOAD_DEBUG_TRACE(2, "Using %d DMA channels\n",
6322                 mic_dma_channel_count);
6323         }
6324         else {
6325             LIBOFFLOAD_ERROR(c_invalid_env_var_value,
6326                              "OFFLOAD_DMA_CHANNEL_COUNT");
6327         }
6328     }
6329     else {
6330 	    OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_DMA_CHANNEL_COUNT is not set\n");
6331     }
6332 
6333     // Set COI_HOST_THREAD_AFFINITY if OFFLOAD_HOST_THREAD_AFFINITY is set.
6334     // Use putenv instead of setenv as Windows has no setenv.
6335     // Note: putenv requires that its argument is never freed or modified,
6336     // so the string must not be freed after the call to putenv or elsewhere.
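    // For example (illustrative only): OFFLOAD_HOST_THREAD_AFFINITY=1 results
    // in a heap-allocated "COI_HOST_THREAD_AFFINITY=1" string being handed to
    // putenv and kept alive for the rest of the process lifetime.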
6337     env_var = getenv("OFFLOAD_HOST_THREAD_AFFINITY");
6338     if (env_var != 0 && *env_var != '\0') {
6339 	    OFFLOAD_DEBUG_TRACE(2, "---- OFFLOAD_HOST_THREAD_AFFINITY=%s\n", env_var);
6340         char * new_env_var =
6341                    (char*) malloc(sizeof("COI_HOST_THREAD_AFFINITY=") +
6342                                   strlen(env_var) + 1);
6343         if (new_env_var == NULL)
6344             LIBOFFLOAD_ERROR(c_malloc);
6345         sprintf(new_env_var, "COI_HOST_THREAD_AFFINITY=%s", env_var);
6346         putenv(new_env_var);
6347 	    OFFLOAD_DEBUG_TRACE(2, "Setting COI_HOST_THREAD_AFFINITY = %s \n",
6348    	                                         getenv("COI_HOST_THREAD_AFFINITY"));
6349     }
6350     else {
6351 	    OFFLOAD_DEBUG_TRACE(2, "OFFLOAD_HOST_THREAD_AFFINITY is not set\n");
6352     }
6353 
6354     // library search path for KNC device binaries
6355     env_var = getenv("MIC_LD_LIBRARY_PATH");
6356     if (env_var != 0) {
6357         OFFLOAD_DEBUG_TRACE(2, "---- MIC_LD_LIBRARY_PATH=%s\n", env_var);
6358         knc_library_path = strdup(env_var);
6359         if (knc_library_path == NULL)
6360             LIBOFFLOAD_ERROR(c_malloc);
6361         OFFLOAD_DEBUG_TRACE(2, "KNC library path set to %s\n", knc_library_path);
6362     }
6363     else {
6364         OFFLOAD_DEBUG_TRACE(2, "MIC_LD_LIBRARY_PATH is not set\n");
6365     }
6366 
6367     // library search path for KNL device binaries
6368     env_var = getenv("LD_LIBRARY_PATH");
6369     if (env_var != 0) {
6370         OFFLOAD_DEBUG_TRACE(2, "---- LD_LIBRARY_PATH=%s\n", env_var);
6371         knl_library_path = strdup(env_var);
6372         if (knl_library_path == NULL)
6373             LIBOFFLOAD_ERROR(c_malloc);
6374         OFFLOAD_DEBUG_TRACE(2, "KNL library path set to %s\n", knl_library_path);
6375     }
6376     else {
6377         OFFLOAD_DEBUG_TRACE(2, "LD_LIBRARY_PATH is not set\n");
6378     }
6379 
6380     // memory size reserved for COI buffers
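    // Example (hypothetical value): MIC_BUFFERSIZE=256M reserves 256 MB for COI
    // buffers, assuming __offload_parse_size_string accepts K/M/G suffixes.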
6381     env_var = getenv("MIC_BUFFERSIZE");
6382     if (env_var != 0 && *env_var != '\0') {
6383         OFFLOAD_DEBUG_TRACE(2, "---- MIC_BUFFERSIZE=%s\n", env_var);
6384         uint64_t new_size;
6385         if (__offload_parse_size_string(env_var, new_size)) {
6386             mic_buffer_size = new_size;
6387 	        OFFLOAD_DEBUG_TRACE(2,
6388                 "Reserved memory for COI buffers set to %lld bytes\n",
6389                 mic_buffer_size);
6390         }
6391         else {
6392             LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_BUFFERSIZE");
6393         }
6394     }
6395     else {
6396 	    OFFLOAD_DEBUG_TRACE(2, "MIC_BUFFERSIZE is not set\n");
6397     }
6398 
6399     // memory size reserved for 4K pages for COI buffers
6400     env_var = getenv("MIC_4K_BUFFER_RESERVE_SIZE");
6401     if (env_var != 0 && *env_var != '\0') {
6402         OFFLOAD_DEBUG_TRACE(2, "---- MIC_4K_BUFFER_RESERVE_SIZE=%s\n", env_var);
6403         uint64_t new_size;
6404         if (__offload_parse_size_string(env_var, new_size)) {
6405             mic_4k_buffer_size = new_size;
6406 	        OFFLOAD_DEBUG_TRACE(2,
6407                 "Reserved memory for 4K COI buffers set to %lld bytes\n",
6408                 mic_4k_buffer_size);
6409         }
6410         else {
6411             LIBOFFLOAD_ERROR(c_invalid_env_var_value, "MIC_4K_BUFFER_RESERVE_SIZE");
6412         }
6413     }
6414     else {
6415         OFFLOAD_DEBUG_TRACE(2, "MIC_4K_BUFFER_RESERVE_SIZE is not set\n");
6416     }
6417 
6418     // memory size reserved for 2M pages for COI buffers
6419     env_var = getenv("MIC_2M_BUFFER_RESERVE_SIZE");
6420     if (env_var != 0 && *env_var != '\0') {
6421 	    OFFLOAD_DEBUG_TRACE(2, "---- MIC_2M_BUFFER_RESERVE_SIZE=%s\n", env_var);
6422         uint64_t new_size;
6423         if (__offload_parse_size_string(env_var, new_size)) {
6424             mic_2m_buffer_size = new_size;
6425             OFFLOAD_DEBUG_TRACE(2,
6426                 "Reserved memory for 2M COI buffers set to %lld bytes\n",
6427                 mic_2m_buffer_size);
6428         }
6429         else {
6430             LIBOFFLOAD_ERROR(c_invalid_env_var_value,
6431                 "MIC_2M_BUFFER_RESERVE_SIZE");
6432         }
6433     }
6434     else {
6435 	    OFFLOAD_DEBUG_TRACE(2, "MIC_2M_BUFFER_RESERVE_SIZE is not set\n");
6436     }
6437 
6438     // determine stacksize for the pipeline on the device
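    // Example (hypothetical value): MIC_STACKSIZE=12M requests a 12 MB pipeline
    // stack, assuming __offload_parse_size_string accepts K/M/G suffixes.
    // Values below 16 KB or not a multiple of 4 KB are rejected with c_mic_init3.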
6439     env_var = getenv("MIC_STACKSIZE");
6440     if (env_var != 0 && *env_var != '\0') {
6441 	    OFFLOAD_DEBUG_TRACE(2, "---- MIC_STACKSIZE=%s\n", env_var);
6442         uint64_t new_size;
6443         if (__offload_parse_size_string(env_var, new_size) &&
6444             (new_size >= 16384) && ((new_size & 4095) == 0)) {
6445             mic_stack_size = new_size;
6446             OFFLOAD_DEBUG_TRACE(2, "MIC stack size set to %lld bytes\n",
6447                 mic_stack_size);
6448         }
6449         else {
6450             LIBOFFLOAD_ERROR(c_mic_init3);
6451         }
6452     }
6453     else {
6454         OFFLOAD_DEBUG_TRACE(2, "MIC_STACKSIZE is not set\n");
6455     }
6456 
6457     // proxy I/O
6458     env_var = getenv("MIC_PROXY_IO");
6459     if (env_var != 0 && *env_var != '\0') {
6460         OFFLOAD_DEBUG_TRACE(2, "---- MIC_PROXY_IO=%s\n", env_var);
6461         int64_t new_val;
6462         if (__offload_parse_int_string(env_var, new_val)) {
6463             mic_proxy_io = new_val;
6464             OFFLOAD_DEBUG_TRACE(2, "MIC proxy i/o set to %d\n",
6465                 mic_proxy_io);
6466         }
6467         else {
6468             LIBOFFLOAD_ERROR(c_invalid_env_var_int_value, "MIC_PROXY_IO");
6469         }
6470     }
6471     else {
6472         OFFLOAD_DEBUG_TRACE(2, "MIC_PROXY_IO is not set\n");
6473     }
6474 
6475 
6476     env_var = getenv("MIC_PROXY_FS_ROOT");
6477     if (env_var != 0 && *env_var != '\0') {
6478 	    OFFLOAD_DEBUG_TRACE(2, "---- MIC_PROXY_FS_ROOT=%s\n", env_var);
6479         mic_proxy_fs_root = strdup(env_var);
6480         if (mic_proxy_fs_root == NULL)
6481             LIBOFFLOAD_ERROR(c_malloc);
6482         OFFLOAD_DEBUG_TRACE(2, "MIC proxy fs root set to %s\n",
6483             mic_proxy_fs_root);
6484     }
6485     else {
6486         OFFLOAD_DEBUG_TRACE(2, "MIC_PROXY_FS_ROOT is not set\n");
6487     }
6488 
6489     // Prepare environment for the target process using the following
6490     // rules:
6491     // - If MIC_ENV_PREFIX is set, then any environment variable on the
6492     //   host which has that prefix is copied to the device without
6493     //   the prefix.
6494     //   All other host environment variables are ignored.
6495     // - If MIC_ENV_PREFIX is not set or if MIC_ENV_PREFIX="" then the
6496     //   host environment is duplicated.
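    // Example (hypothetical values): with MIC_ENV_PREFIX=MIC_, a host setting of
    //     MIC_OMP_NUM_THREADS=16
    // is forwarded to the target environment as
    //     OMP_NUM_THREADS=16
    // while host variables lacking the MIC_ prefix are not forwarded.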
6497     env_var = getenv("MIC_ENV_PREFIX");
6498     if (env_var != 0 && *env_var != '\0') {
6499 	OFFLOAD_DEBUG_TRACE(2, "---- MIC_ENV_PREFIX=%s\n", env_var);
6500         mic_env_vars.set_prefix(env_var);
6501 
6502         int len = strlen(env_var);
6503         for (int i = 0; environ[i] != 0; i++) {
6504             if (strncmp(environ[i], env_var, len) == 0 &&
6505                 strncmp(environ[i], "MIC_LD_LIBRARY_PATH", 19) != 0 &&
6506                 environ[i][len] != '=') {
6507                 mic_env_vars.analyze_env_var(environ[i]);
6508             }
6509         }
6510     }
6511     else {
6512 	    OFFLOAD_DEBUG_TRACE(2, "MIC_ENV_PREFIX is not set\n");
6513     }
6514 
6515     // create key for thread data
6516     if (thread_key_create(&mic_thread_key, Engine::destroy_thread_data)) {
6517         LIBOFFLOAD_ERROR(c_mic_init4, errno);
6518         return;
6519     }
6520 
6521     // cpu frequency
6522     cpu_frequency = COI::PerfGetCycleFrequency();
6523 
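    // Threshold above which COI buffers are backed by 2 MB pages.
    // Example (hypothetical value): setting the variable named by
    // mic_use_2mb_buffers_envname to 64K makes buffers of 64 KB and larger
    // use 2 MB pages.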
6524     env_var = getenv(mic_use_2mb_buffers_envname);
6525     if (env_var != 0 && *env_var != '\0') {
6526 	    OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6527             mic_use_2mb_buffers_envname, env_var);
6528         uint64_t new_size;
6529         if (__offload_parse_size_string(env_var, new_size)) {
6530             __offload_use_2mb_buffers = new_size;
6531 	        OFFLOAD_DEBUG_TRACE(2,
6532                 "Threshold for use of 2M buffers set to %lld\n",
6533                 __offload_use_2mb_buffers);
6534         }
6535         else {
6536             LIBOFFLOAD_ERROR(c_invalid_env_var_value,
6537                              mic_use_2mb_buffers_envname);
6538         }
6539     }
6540     else {
6541 	    OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", mic_use_2mb_buffers_envname);
6542     }
6543 
6544     env_var = getenv(mic_use_async_buffer_write_envname);
6545     if (env_var != 0 && *env_var != '\0') {
6546 	    OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6547             mic_use_async_buffer_write_envname, env_var);
6548         uint64_t new_size;
6549         if (__offload_parse_size_string(env_var, new_size)) {
6550             __offload_use_async_buffer_write = new_size;
6551 	        OFFLOAD_DEBUG_TRACE(2,
6552                 "Threshold for async buffer write set to %lld\n",
6553                 __offload_use_async_buffer_write);
6554         }
6555     }
6556     else {
6557 	    OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
6558             mic_use_async_buffer_write_envname);
6559     }
6560 
6561     env_var = getenv(mic_use_async_buffer_read_envname);
6562     if (env_var != 0 && *env_var != '\0') {
6563 	    OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6564             mic_use_async_buffer_read_envname, env_var);
6565         uint64_t new_size;
6566         if (__offload_parse_size_string(env_var, new_size)) {
6567             __offload_use_async_buffer_read = new_size;
6568 	        OFFLOAD_DEBUG_TRACE(2,
6569                 "Threshold for async buffer read set to %lld\n",
6570                 __offload_use_async_buffer_read);
6571         }
6572     }
6573     else {
6574 	    OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
6575             mic_use_async_buffer_read_envname);
6576     }
6577 
6578     // mic initialization type
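    // Example (hypothetical value): setting the variable named by
    // offload_init_envname to "on_start" initializes every device at program
    // startup instead of lazily on the first offload to it.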
6579     env_var = getenv(offload_init_envname);
6580     if (env_var != 0 && *env_var != '\0') {
6581 	    OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6582             offload_init_envname, env_var);
6583         if (strcmp(env_var, "on_offload") == 0) {
6584             __offload_init_type = c_init_on_offload;
6585 	        OFFLOAD_DEBUG_TRACE(2,
6586                 "A MIC device will be initialized "
6587                 "on first offload to that device\n");
6588         }
6589         else if (strcmp(env_var, "on_offload_all") == 0) {
6590             __offload_init_type = c_init_on_offload_all;
6591 	        OFFLOAD_DEBUG_TRACE(2,
6592                 "All MIC devices will be initialized "
6593                 "on first offload to any device\n");
6594         }
6595         else if (strcmp(env_var, "on_start") == 0) {
6596             __offload_init_type = c_init_on_start;
6597 	        OFFLOAD_DEBUG_TRACE(2,
6598                 "All MIC devices will be initialized "
6599                 "at program start\n");
6600         }
6601         else {
6602             LIBOFFLOAD_ERROR(c_invalid_env_var_value, offload_init_envname);
6603         }
6604     }
6605     else {
6606 	    OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_init_envname);
6607     }
6608 
6609     // active wait
6610     env_var = getenv(offload_active_wait_envname);
6611     if (env_var != 0 && *env_var != '\0') {
6612 	    OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6613             offload_active_wait_envname, env_var);
6614         int64_t new_val;
6615         if (__offload_parse_int_string(env_var, new_val)) {
6616             __offload_active_wait = new_val;
6617 	        OFFLOAD_DEBUG_TRACE(2,
6618                 "Flag to poll on event completion is set to %d\n",
6619                 __offload_active_wait);
6620         }
6621         else {
6622             LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
6623                              offload_active_wait_envname);
6624         }
6625     }
6626     else {
6627 	    OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_active_wait_envname);
6628     }
6629 
6630     // always wait
6631     env_var = getenv(offload_always_wait_envname);
6632     if (env_var != 0 && *env_var != '\0') {
6633 	    OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6634             offload_always_wait_envname, env_var);
6635         int64_t new_val;
6636         if (__offload_parse_int_string(env_var, new_val)) {
6637             __offload_always_wait = new_val;
6638             OFFLOAD_DEBUG_TRACE(2,
6639                 "Flag to always wait for offload completion is set to %d\n",
6640                 __offload_always_wait);
6641         }
6642         else {
6643             LIBOFFLOAD_ERROR(c_invalid_env_var_int_value,
6644                              offload_always_wait_envname);
6645         }
6646     }
6647     else {
6648 	    OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", offload_always_wait_envname);
6649     }
6650 
6651     // omp device num
6652     env_var = getenv(omp_device_num_envname);
6653     if (env_var != 0 && *env_var != '\0') {
6654 	    OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6655             omp_device_num_envname, env_var);
6656         int64_t new_val;
6657         if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
6658             __omp_device_num = new_val;
6659 	        OFFLOAD_DEBUG_TRACE(2, "OpenMP default device number is set to %d\n",
6660                 __omp_device_num);
6661         }
6662         else {
6663             LIBOFFLOAD_ERROR(c_omp_invalid_device_num_env,
6664                              omp_device_num_envname);
6665         }
6666     }
6667     else {
6668 	    OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", omp_device_num_envname);
6669     }
6670 
6671     // parallel copy of offload_transfer
6672     env_var = getenv(parallel_copy_envname);
6673     if (env_var != 0 && *env_var != '\0') {
6674 	    OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6675             parallel_copy_envname, env_var);
6676         int64_t new_val;
6677         if (__offload_parse_int_string(env_var, new_val) && new_val >= 0) {
6678             __offload_parallel_copy = new_val;
6679 	        OFFLOAD_DEBUG_TRACE(2,
6680                 "Flag for using async buffer copy is set to %d\n",
6681                 __offload_parallel_copy);
6682         }
6683         else {
6684             LIBOFFLOAD_ERROR(c_invalid_env_var_value,
6685                              parallel_copy_envname);
6686         }
6687     }
6688     else {
6689 	    OFFLOAD_DEBUG_TRACE(2, "%s is not set\n", parallel_copy_envname);
6690     }
6691 
6692     // use COI interface for noncontiguous arrays transfer
6693     env_var = getenv(use_coi_noncontiguous_transfer_envname);
6694     if (env_var != 0 && *env_var != '\0') {
6695 	    OFFLOAD_DEBUG_TRACE(2, "---- %s=%s\n",
6696             use_coi_noncontiguous_transfer_envname, env_var);
6697         uint64_t new_size;
6698         if (__offload_parse_size_string(env_var, new_size)) {
6699             __offload_use_coi_noncontiguous_transfer = new_size;
6700 	        OFFLOAD_DEBUG_TRACE(2,
6701                 "Flag for using new COI noncontiguous API is set to %d\n",
6702                 __offload_use_coi_noncontiguous_transfer);
6703         }
6704         else {
6705             LIBOFFLOAD_ERROR(c_invalid_env_var_value,
6706                              use_coi_noncontiguous_transfer_envname);
6707         }
6708     }
6709     else {
6710 	    OFFLOAD_DEBUG_TRACE(2, "%s is not set\n",
6711             use_coi_noncontiguous_transfer_envname);
6712     }
6713 
6714     OFFLOAD_DEBUG_TRACE(2, "---- End of environment variable processing\n");
6715 
6716     // init ORSL
6717     ORSL::init();
6718 }
6719 
6720 extern int __offload_init_library(void)
6721 {
6722     // do one-time initialization
6723     static OffloadOnceControl ctrl = OFFLOAD_ONCE_CONTROL_INIT;
6724     __offload_run_once(&ctrl, __offload_init_library_once);
6725 
6726     // offload is available if COI is available and the number of devices > 0
6727     bool is_available = COI::is_available && (mic_engines_total > 0);
6728 
6729     // register pending libraries if there are any
6730     if (is_available && __target_libs) {
6731         mutex_locker_t locker(__target_libs_lock);
6732 
6733         for (TargetImageList::iterator it = __target_libs_list.begin();
6734              it != __target_libs_list.end(); it++) {
6735             // Register library in COI
6736             COI::ProcessRegisterLibraries(1, &it->data, &it->size,
6737                                           &it->origin, &it->offset);
6738 
6739             // add lib to all engines
6740             for (int i = 0; i < mic_engines_total; i++) {
6741                 mic_engines[i].add_lib(*it);
6742             }
6743         }
6744 
6745         __target_libs = false;
6746         __target_libs_list.clear();
6747     }
6748 
6749     return is_available;
6750 }
6751 
6752 extern "C" bool __offload_target_image_is_executable(const void *target_image)
6753 {
6754     const struct Image *image = static_cast<const struct Image*>(target_image);
6755 
6756     // decode image
6757     const char *name = image->data;
6758     const void *data = image->data + strlen(image->data) + 1;
6759 
6760     // determine image type
6761     const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
6762     return (hdr->e_type == ET_EXEC);
6763 }
6764 
6765 extern "C" bool __offload_register_image(const void *target_image)
6766 {
6767     const struct Image *image = static_cast<const struct Image*>(target_image);
6768     const void *data = image->data + strlen(image->data) + 1;
6769     uint64_t    size = image->size;
6770     uint64_t    offset = 0;
6771 
6772     // decode image
6773     const char *fat_name = image->data;
6774     char *mic_name   = (char *) malloc(strlen(image->data) + 1);
6775     char *host_name  = (char *) malloc(strlen(image->data));
6776     int        i;
6777 
6778     if ((mic_name == NULL) || (host_name == NULL))
6779         LIBOFFLOAD_ERROR(c_malloc);
6780 
6781     // The origin name is the name of the fat binary file on the host.
6782     // It is used by VTune; since this is a fat binary we report the
6783     // host file name of the fat binary.
6784     // The driver appends "?" followed by the host file name to the
6785     // image->data name, so the string has to be split here.
6786     // Name format:  <mic_name>?<origin>
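    // Illustration (hypothetical value): image->data == "a_mic.out?/home/u/a.out"
    // is split below into mic_name = "a_mic.out" and host_name = "/home/u/a.out".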
6787 
6788     // Get <mic_name>
6789     i = 0;
6790     while ((*fat_name != '\0') && (*fat_name != '?')) {
6791        mic_name[i] = *fat_name;
6792        fat_name++;
6793        i++;
6794     }
6795 
6796     // Remove the host file name by inserting end of string marker
6797     mic_name[i]  = '\0';
6798 
6799     // Get <host_name>
6800     if (*fat_name == '?') {
6801        // The string following "?" is the name of the host file name.
6802        fat_name++;
6803        i = 0;
6804        while (*fat_name != '\0') {
6805           host_name[i] = *fat_name;
6806           fat_name++;
6807           i++;
6808        }
6809        host_name[i] = '\0';
6810     }
6811     else {
6812     // Windows currently does not provide a host file name
6813        free(host_name);
6814        host_name = 0;
6815     }
6816 
6817     // our actions depend on the image type
6818     const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
6819     switch (hdr->e_type) {
6820         case ET_EXEC:
6821             __current_image_is_dll = false;
6822             // Each offload application is supposed to have only one target
6823             // image representing target executable.
6824             // No thread synchronization is required here as the initialization
6825             // code is always executed in a single thread.
6826             if (__target_exe != 0) {
6827                 LIBOFFLOAD_ERROR(c_multiple_target_exes);
6828                 exit(1);
6829             }
6830             __target_exe = new TargetImage(mic_name, data, size, host_name, offset);
6831 
6832             // Registration code for execs is always called from the context
6833             // of main and thus we can safely call any function here,
6834             // including LoadLibrary API on windows. This is the place where
6835             // we do the offload library initialization.
6836             if (__offload_init_library()) {
6837                 // initialize engine if init_type is on_start
6838                 if (__offload_init_type == c_init_on_start) {
6839                     for (int i = 0; i < mic_engines_total; i++) {
6840                         mic_engines[i].init();
6841                     }
6842                 }
6843             }
6844             return mic_engines_total > 0;
6845 
6846         case ET_DYN:
6847         {
6848             char * fullname = NULL;
6849             __current_image_is_dll = true;
6850             // We add the library to a list of pending libraries
6851             __target_libs_lock.lock();
6852             __target_libs = true;
6853             __target_libs_list.push_back(
6854                 TargetImage(mic_name, data, size, fullname, offset));
6855             __target_libs_lock.unlock();
6856             // If __target_exe is set, then main has started running
6857             // If not main, then we can't do anything useful here
6858             // because this registration code is called from DllMain
6859             // context (on windows).
6860             if (__target_exe != 0) {
6861                 // There is no need to delay loading the library
6862                 if (!__offload_init_library()) {
6863                     // Couldn't validate library as a fat offload library
6864                     LIBOFFLOAD_ERROR(c_unknown_binary_type);
6865                     exit(1);
6866                 }
6867             }
6868             return true;
6869         }
6870 
6871         default:
6872             // something is definitely wrong, issue an error and exit
6873             LIBOFFLOAD_ERROR(c_unknown_binary_type);
6874             exit(1);
6875     }
6876 }
6877 
6878 // When dlopen is used, dlclose may happen after the COI process
6879 // is destroyed, in which case images cannot be unloaded and should
6880 // be skipped.  So track whether COI may have been unloaded.
6881 static bool coi_may_have_been_unloaded = false;
6882 
6883 extern "C" void __offload_unregister_image(const void *target_image)
6884 {
6885     // Target image is packed as follows:
6886     //      8 bytes                - size of the target binary
6887     //      null-terminated string - binary name
6888     //      <size> bytes           - binary contents
6889     const struct Image {
6890          int64_t size;
6891          char data[];
6892     } *image = static_cast<const struct Image*>(target_image);
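    // Illustration (hypothetical values): an image named "a.out" whose target
    // binary occupies 0x2000 bytes would be packed as
    //     bytes 0..7   : 0x0000000000002000      (size, int64_t)
    //     bytes 8..13  : "a.out\0"               (binary name)
    //     bytes 14..   : the 0x2000 bytes of binary contents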
6893 
6894     // decode image
6895     const char *name = image->data;
6896     const void *data = image->data + strlen(image->data) + 1;
6897 
6898     // our actions depend on the image type
6899     const Elf64_Ehdr *hdr = static_cast<const Elf64_Ehdr*>(data);
6900     if (hdr->e_type == ET_EXEC) {
6901         // We are executing the executable's destructors.
6902         // It is time to do a library cleanup.
6903         if (timer_enabled) {
6904             Offload_Timer_Print();
6905         }
6906 
6907         coi_may_have_been_unloaded = true;
6908 
6909         // Do not unload the MYO library if it was loaded in a dll.
6910         if (!__myo_init_in_so)
6911         {
6912 #ifdef MYO_SUPPORT
6913             __offload_myoFini();
6914 #endif // MYO_SUPPORT
6915 
6916             __offload_fini_library();
6917        }
6918     }
6919     else if ((hdr->e_type == ET_DYN) && !coi_may_have_been_unloaded) {
6920         for (int i = 0; i < mic_engines_total; i++) {
6921            mic_engines[i].unload_library(data, name);
6922         }
6923 
6924     }
6925 }
6926 
6927 extern "C" void __offload_register_task_callback(void (*cb)(void *))
6928 {
6929     task_completion_callback = cb;
6930 }
6931 
6932 // Runtime trace interface for user programs
6933 
6934 void __offload_console_trace(int level)
6935 {
6936     console_enabled = level;
6937 }
6938 
6939 // User-visible offload API
6940 
6941 int _Offload_number_of_devices(void)
6942 {
6943     __offload_init_library();
6944     return mic_engines_total;
6945 }
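// Sketch of typical use from a host program (illustrative only, not part of
// this file):
//
//     int devices = _Offload_number_of_devices();
//     if (devices == 0) {
//         // no coprocessors available - run the host fallback path
//     }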
6946 
6947 int _Offload_get_device_number(void)
6948 {
6949     return -1;
6950 }
6951 
6952 int _Offload_get_physical_device_number(void)
6953 {
6954     return -1;
6955 }
6956 
6957 int _Offload_signaled(int index, void *signal)
6958 {
6959     __offload_init_library();
6960 
6961     // check index value
6962     if (index < 0) {
6963         LIBOFFLOAD_ERROR(c_offload_signaled1, index);
6964         LIBOFFLOAD_ABORT;
6965     }
6966 
6967     index %= mic_engines_total;
6968 
6969     // find associated async task
6970     OffloadDescriptor *task =
6971         mic_engines[index].find_signal(signal, false);
6972     if (task == 0) {
6973         LIBOFFLOAD_ERROR(c_offload_signaled2, signal);
6974         LIBOFFLOAD_ABORT;
6975     }
6976     // if signal is removed by wait completing
6977     else if (task == SIGNAL_HAS_COMPLETED) {
6978         return (true);
6979     }
6980     return task->is_signaled();
6981 }
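// Sketch of typical use (illustrative only; assumes the Intel offload pragma
// syntax with a signal clause, which is handled by the compiler, not here):
//
//     char sig;
//     #pragma offload target(mic:0) signal(&sig)
//     { /* asynchronous offloaded work */ }
//     while (!_Offload_signaled(0, &sig)) {
//         /* overlap host work with the offload */
//     }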
6982 
6983 void _Offload_report(int val)
6984 {
6985     if (val == OFFLOAD_REPORT_ON ||
6986         val == OFFLOAD_REPORT_OFF) {
6987         offload_report_enabled = val;
6988     }
6989 }
6990 
6991 int _Offload_find_associated_mic_memory(
6992     int          target,
6993     const void*  cpu_addr,
6994     void**       cpu_base_addr,
6995     uint64_t*    buf_length,
6996     void**       mic_addr,
6997     uint64_t*    mic_buf_start_offset,
6998     int*         is_static
6999 )
7000 {
7001     __offload_init_library();
7002 
7003     // check target value
7004     if (target < 0) {
7005         LIBOFFLOAD_ERROR(c_offload_signaled1, target);
7006         LIBOFFLOAD_ABORT;
7007     }
7008     target %= mic_engines_total;
7009 
7010     // find existing association in pointer table
7011     PtrData* ptr_data = mic_engines[target].find_ptr_data(cpu_addr);
7012     if (ptr_data == 0) {
7013         OFFLOAD_TRACE(3, "Association does not exist\n");
7014         return 0;
7015     }
7016 
7017     OFFLOAD_TRACE(3, "Found association: base %p, length %lld, is_static %d\n",
7018                   ptr_data->cpu_addr.start(), ptr_data->cpu_addr.length(),
7019                   ptr_data->is_static);
7020 
7021     if (ptr_data->mic_buf != 0 && ptr_data->mic_addr == 0) {
7022         COIRESULT res = COI::BufferGetSinkAddress(ptr_data->mic_buf,
7023                                                   &ptr_data->mic_addr);
7024         if (res != COI_SUCCESS) {
7025             return 0;
7026         }
7027     }
7028     *cpu_base_addr = const_cast<void *>(ptr_data->cpu_addr.start());
7029     *buf_length = ptr_data->cpu_addr.length() - ptr_data->alloc_disp;
7030     *mic_addr = (void *)(ptr_data->mic_addr + ptr_data->mic_offset);
7031     *mic_buf_start_offset = ptr_data->alloc_disp;
7032     *is_static = ptr_data->is_static;
7033     return ptr_data->is_static ? 1 : ptr_data->get_reference();
7034 }
7035 
7036 _Offload_stream _Offload_stream_create(
7037     int device,           // MIC device number
7038     int number_of_cpus    // Cores allocated to the stream
7039     )
7040 {
7041     __offload_init_library();
7042 
7043     // check target value
7044     if (device < 0) {
7045         LIBOFFLOAD_ERROR(c_offload_signaled1, device);
7046         LIBOFFLOAD_ABORT;
7047     }
7048     device %= mic_engines_total;
7049 
7050     // Create new stream and get its handle
7051     _Offload_stream handle = Stream::add_stream(device, number_of_cpus);
7052     if (handle == 0) {
7053         OFFLOAD_TRACE(3, "Can't create stream\n");
7054         return 0;
7055     }
7056 
7057     // create pipeline associated with the new stream
7058     mic_engines[device].get_pipeline(handle);
7059 
7060     return(handle);
7061 }
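// Sketch of typical use (illustrative only; the stream clause syntax is an
// assumption about the compiler front end and is not defined in this file):
//
//     _Offload_stream s = _Offload_stream_create(0, 4);  // 4 cores on MIC 0
//     #pragma offload target(mic:0) stream(s)
//     { /* work queued to the stream */ }
//     _Offload_stream_destroy(0, s);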
7062 
7063 int _Offload_stream_destroy(
7064     int             device,   // MIC device number
7065     _Offload_stream handle    // stream to destroy
7066     )
7067 {
7068     if (Stream::get_streams_count() == 0) {
7069         LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
7070         LIBOFFLOAD_ABORT;
7071     }
7072     // check target value
7073     if (device < 0) {
7074         LIBOFFLOAD_ERROR(c_offload_signaled1, device);
7075         LIBOFFLOAD_ABORT;
7076     }
7077     device %= mic_engines_total;
7078 
7079     mic_engines[device].stream_destroy(handle);
7080 
7081     return(true);
7082 }
7083 
7084 int _Offload_stream_delete(
7085     _Offload_stream handle    // stream to destroy
7086     )
7087 {
7088     int device = -1;   // MIC device number, -1 until the stream is found
7089     Stream * stream;
7090 
7091     if (Stream::get_streams_count() == 0) {
7092         LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
7093         LIBOFFLOAD_ABORT;
7094     }
7095 
7096     stream = Stream::find_stream(handle, false);
7097     // the stream was not created or was destroyed
7098     if (!stream) {
7099         LIBOFFLOAD_ERROR(c_offload_no_stream, device);
7100         LIBOFFLOAD_ABORT;
7101     }
7102 
7103     device = stream->get_device();
7104 
7105     mic_engines[device].stream_destroy(handle);
7106 
7107     return(true);
7108 }
7109 
7110 int _Offload_stream_completed(int device, _Offload_stream handler)
7111 {
7112     if (Stream::get_streams_count() == 0) {
7113         LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
7114         LIBOFFLOAD_ABORT;
7115     }
7116     // check device index value
7117     if (device < -1) {
7118         LIBOFFLOAD_ERROR(c_offload_signaled1, device);
7119         LIBOFFLOAD_ABORT;
7120     }
7121     else if (device > -1) {
7122         device %= mic_engines_total;
7123     }
7124     // get stream
7125     Stream * stream;
7126 
7127     if (handler != 0) {
7128         stream =  Stream::find_stream(handler, false);
7129 
7130         // the stream was not created or was destroyed
7131         if (!stream) {
7132             LIBOFFLOAD_ERROR(c_offload_no_stream, device);
7133             LIBOFFLOAD_ABORT;
7134         }
7135 
7136         if (device != stream->get_device()) {
7137             LIBOFFLOAD_ERROR(c_offload_device_doesnt_match_to_stream,
7138                              stream->get_device());
7139             LIBOFFLOAD_ABORT;
7140         }
7141         // find associated async task
7142         OffloadDescriptor *task = stream->get_last_offload();
7143 
7144         // offload was completed by offload_wait pragma or wait clause
7145         if (task == 0) {
7146             return(true);
7147         }
7148         return task->is_signaled();
7149     }
7150     // A zero handle means: check all streams on the given device,
7150     // or on all devices if device == -1
7151     else {
7152         StreamMap stream_map = Stream::all_streams;
7153         for (StreamMap::iterator it = stream_map.begin();
7154             it != stream_map.end(); it++) {
7155             Stream * stream = it->second;
7156             if (device != -1 && device != stream->get_device()) {
7157                 continue;
7158             }
7159             // find associated async task
7160             OffloadDescriptor *task = stream->get_last_offload();
7161 
7162             // offload was completed by offload_wait pragma or wait clause
7163             if (task == 0) {
7164                 continue;
7165             }
7166             // if even one stream is not completed result is false
7167             if (!task->is_signaled()) {
7168                 return false;
7169             }
7170         }
7171         // no uncompleted streams
7172         return true;
7173     }
7174 }
7175 
7176 int _Offload_stream_is_empty(_Offload_stream handle)
7177 {
7178     int device = -1;   // -1 until the stream's device is known
7179 
7180     if (Stream::get_streams_count() == 0) {
7181         LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
7182         LIBOFFLOAD_ABORT;
7183     }
7184     if (handle != 0) {
7185         Stream * stream =  Stream::find_stream(handle, false);
7186 
7187         // the stream was not created or was destroyed
7188         if (!stream) {
7189             LIBOFFLOAD_ERROR(c_offload_no_stream, device);
7190             LIBOFFLOAD_ABORT;
7191         }
7192         device = stream->get_device();
7193     }
7194     else {
7195         device = -1;
7196     }
7197     // Pass the device index determined above; _Offload_stream_completed
7198     // verifies it against the stream's own device when the handle is non-zero
7199     return _Offload_stream_completed(device, handle);
7200 }
7201 
7202 int _Offload_device_streams_completed(int device)
7203 {
7204     if (Stream::get_streams_count() == 0) {
7205         LIBOFFLOAD_ERROR(c_offload_streams_are_absent);
7206         LIBOFFLOAD_ABORT;
7207     }
7208     // check index value
7209     if (device < -1) {
7210         LIBOFFLOAD_ERROR(c_offload_signaled1, device);
7211         LIBOFFLOAD_ABORT;
7212     }
7213     else if (device > -1) {
7214         device %= mic_engines_total;
7215     }
7216 
7217     StreamMap stream_map = Stream::all_streams;
7218     for (StreamMap::iterator it = stream_map.begin();
7219         it != stream_map.end(); it++)
7220     {
7221         Stream * stream = it->second;
7222 
7223         if (device != -1 && device != stream->get_device()) {
7224             continue;
7225         }
7226         // find associated async task
7227         OffloadDescriptor *task = stream->get_last_offload();
7228 
7229         // offload was completed by offload_wait pragma or wait clause
7230         if (task == 0) {
7231             continue;
7232         }
7233         // if even one stream is not completed result is false
7234         if (!task->is_signaled()) {
7235             return false;
7236         }
7237     }
7238     // no uncompleted streams
7239     return true;
7240 }
7241 
7242 // IDB support
7243 int   __dbg_is_attached = 0;
7244 int   __dbg_target_id = -1;
7245 pid_t __dbg_target_so_pid = -1;
7246 char  __dbg_target_exe_name[MAX_TARGET_NAME] = {0};
7247 const int __dbg_api_major_version = 1;
7248 const int __dbg_api_minor_version = 0;
7249 
7250 void __dbg_target_so_loaded()
7251 {
7252 }
7253 void __dbg_target_so_unloaded()
7254 {
7255 }
7256