1 /*
2     Copyright (c) 2014-2016 Intel Corporation.  All Rights Reserved.
3 
4     Redistribution and use in source and binary forms, with or without
5     modification, are permitted provided that the following conditions
6     are met:
7 
8       * Redistributions of source code must retain the above copyright
9         notice, this list of conditions and the following disclaimer.
10       * Redistributions in binary form must reproduce the above copyright
11         notice, this list of conditions and the following disclaimer in the
12         documentation and/or other materials provided with the distribution.
13       * Neither the name of Intel Corporation nor the names of its
14         contributors may be used to endorse or promote products derived
15         from this software without specific prior written permission.
16 
17     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20     A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21     HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23     LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24     DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25     THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29 
30 
31 /*! \file
32     \brief The parts of the runtime library used only on the host
33 */
34 
35 #ifndef OFFLOAD_HOST_H_INCLUDED
36 #define OFFLOAD_HOST_H_INCLUDED
37 
38 #ifndef TARGET_WINNT
39 #include <unistd.h>
40 #endif // TARGET_WINNT
41 #include "offload_common.h"
42 #include "offload_util.h"
43 #include "offload_engine.h"
44 #include "offload_env.h"
45 #include "offload_orsl.h"
46 #include "coi/coi_client.h"
47 
// MIC engines: mic_engines is indexed as an array by the OffloadDescriptor
// constructor, with the index reduced modulo mic_engines_total.
DLL_LOCAL extern Engine*  mic_engines;
DLL_LOCAL extern uint32_t mic_engines_total;

// DMA channel count used by COI; it is set via the
// OFFLOAD_DMA_CHANNEL_COUNT environment variable.
DLL_LOCAL extern uint32_t mic_dma_channel_count;
55 
/*!
    \brief Packed representation of a target image.

    The target image is packed as follows:
      1. 8 bytes containing the size of the target binary
      2. a null-terminated string which is the binary name
      3. <size> number of bytes that are the contents of the image

    The address of symbol __offload_target_image is the address of
    this structure.
*/
struct Image {
    //! Size in bytes of the target binary name and contents
    int64_t size;

    //! The name and contents of the target image
    char data[];
};
66 
67 // The offload descriptor.
68 class OffloadDescriptor
69 {
70 public:
71     enum  OmpAsyncLastEventType {
72         c_last_not,     // not last event
73         c_last_write,   // the last event that is write
74         c_last_read,    // the last event that is read
75         c_last_runfunc  // the last event that is runfunction
76     };
77 
OffloadDescriptor(int index,_Offload_status * status,bool is_mandatory,bool is_openmp,OffloadHostTimerData * timer_data)78     OffloadDescriptor(
79         int index,
80         _Offload_status *status,
81         bool is_mandatory,
82         bool is_openmp,
83         OffloadHostTimerData * timer_data
84     ) :
85         m_device(mic_engines[index == -1 ? 0 : index % mic_engines_total]),
86         m_is_mandatory(is_mandatory),
87         m_is_openmp(is_openmp),
88         m_inout_buf(0),
89         m_func_desc(0),
90         m_func_desc_size(0),
91         m_num_in_dependencies(0),
92         m_p_in_dependencies(0),
93         m_in_deps(0),
94         m_in_deps_total(0),
95         m_in_deps_allocated(0),
96         m_out_deps(0),
97         m_out_deps_total(0),
98         m_out_deps_allocated(0),
99         m_vars(0),
100         m_vars_extra(0),
101         m_status(status),
102         m_timer_data(timer_data),
103         m_out_with_preallocated(false),
104         m_preallocated_alloc(false),
105         m_traceback_called(false),
106         m_stream(-1),
107         m_signal(0),
108         m_has_signal(0),
109         m_omp_async_last_event_type(c_last_not)
110     {
111         m_wait_all_devices = index == -1;
112     }
113 
~OffloadDescriptor()114     ~OffloadDescriptor()
115     {
116         if (m_in_deps != 0) {
117             free(m_in_deps);
118         }
119         if (m_out_deps != 0) {
120             free(m_out_deps);
121         }
122         if (m_func_desc != 0) {
123             free(m_func_desc);
124         }
125         if (m_vars != 0) {
126             free(m_vars);
127             free(m_vars_extra);
128         }
129     }
130 
131     bool offload(const char *name, bool is_empty,
132                  VarDesc *vars, VarDesc2 *vars2, int vars_total,
133                  const void **waits, int num_waits, const void **signal,
134                  int entry_id, const void *stack_addr,
135                  OffloadFlags offload_flags);
136 
137     bool offload_finish(bool is_traceback);
138 
139     bool is_signaled();
140 
get_timer_data()141     OffloadHostTimerData* get_timer_data() const {
142         return m_timer_data;
143     }
144 
set_stream(_Offload_stream stream)145     void set_stream(_Offload_stream stream) {
146         m_stream = stream;
147     }
148 
get_stream()149     _Offload_stream get_stream() {
150         return(m_stream);
151     }
152 
get_device()153     Engine& get_device() {
154         return m_device;
155     }
156 
get_signal()157     void* get_signal() {
158         return(m_signal);
159     }
160 
set_signal(const void * signal)161     void set_signal(const void* signal) {
162         m_has_signal = 1;
163         m_signal = const_cast<void*>(signal);
164     }
165 
166     void cleanup();
167 
168     uint32_t  m_event_count;
169     bool      m_has_signal;
170 
171 private:
172     bool offload_wrap(const char *name, bool is_empty,
173                  VarDesc *vars, VarDesc2 *vars2, int vars_total,
174                  const void **waits, int num_waits, const void **signal,
175                  int entry_id, const void *stack_addr,
176                  OffloadFlags offload_flags);
177     bool wait_dependencies(const void **waits, int num_waits,
178                            _Offload_stream stream);
179     bool setup_descriptors(VarDesc *vars, VarDesc2 *vars2, int vars_total,
180                            int entry_id, const void *stack_addr);
181     bool setup_misc_data(const char *name);
182     bool send_pointer_data(bool is_async, void* info);
183     bool send_noncontiguous_pointer_data(
184         int i,
185         PtrData* src_buf,
186         PtrData* dst_buf,
187         COIEVENT *event,
188         uint64_t  &sent_data,
189         uint32_t in_deps_amount,
190         COIEVENT *in_deps
191         );
192     bool receive_noncontiguous_pointer_data(
193         int i,
194         COIBUFFER dst_buf,
195         COIEVENT *event,
196         uint64_t  &received_data,
197         uint32_t in_deps_amount,
198         COIEVENT *in_deps
199         );
200 
201     bool gather_copyin_data();
202 
203     bool compute(void *);
204 
205     bool receive_pointer_data(bool is_async, bool first_run, void * info);
206     bool scatter_copyout_data();
207 
208     bool find_ptr_data(PtrData* &ptr_data, void *base, int64_t disp,
209                        int64_t length, bool is_targptr,
210                        bool error_does_not_exist = true);
211 
212     void find_device_ptr( int64_t* &device_ptr,
213                        void *host_ptr);
214 
215     bool alloc_ptr_data(PtrData* &ptr_data, void *base, int64_t disp,
216                         int64_t length, int64_t alloc_disp, int align,
217                         bool is_targptr, bool is_prealloc, bool pin);
218     bool create_preallocated_buffer(PtrData* ptr_data, void *base);
219     bool init_static_ptr_data(PtrData *ptr_data);
220     bool init_mic_address(PtrData *ptr_data);
221     bool offload_stack_memory_manager(
222         const void * stack_begin,
223         int routine_id,
224         int buf_size,
225         int align,
226         bool thread_specific_function_locals,
227         bool *is_new);
228     char *get_this_threads_cpu_stack_addr(
229         const void * stack_begin,
230         int routine_id,
231         bool thread_specific_function_locals);
232     PtrData *get_this_threads_mic_stack_addr(
233         const void * stack_begin,
234         int routine_id,
235         bool thread_specific_function_locals);
236     bool nullify_target_stack(COIBUFFER targ_buf, uint64_t size);
237 
238     bool gen_var_descs_for_pointer_array(int i);
239 
240     void get_stream_in_dependencies(uint32_t &in_deps_amount,
241                                     COIEVENT* &in_deps);
242 
243     void report_coi_error(error_types msg, COIRESULT res);
244     _Offload_result translate_coi_error(COIRESULT res) const;
245 
246     void setup_omp_async_info();
247 
248     void setup_use_device_ptr(int i);
249 
250     void register_event_call_back(void (*)(
251                                       COIEVENT,
252                                       const COIRESULT,
253                                       const void*),
254                                   const COIEVENT *event,
255                                   const void *info);
256 
257     void register_omp_event_call_back(const COIEVENT *event, const void *info);
258 
259 private:
260     typedef std::list<COIBUFFER> BufferList;
261 
262     // extra data associated with each variable descriptor
263     struct VarExtra {
264         PtrData* src_data;
265         PtrData* dst_data;
266         AutoData* auto_data;
267         int64_t cpu_disp;
268         int64_t cpu_offset;
269         void *alloc;
270         union {
271             CeanReadRanges *read_rng_src;
272             NonContigDesc  *noncont_desc;
273         };
274         CeanReadRanges *read_rng_dst;
275         int64_t ptr_arr_offset;
276         bool is_arr_ptr_el;
277         OmpAsyncLastEventType omp_last_event_type;
278         int64_t pointer_offset;
279         uint16_t type_src;
280         uint16_t type_dst;
281     };
282 
283     template<typename T> class ReadArrElements {
284     public:
ReadArrElements()285         ReadArrElements():
286             ranges(NULL),
287             el_size(sizeof(T)),
288             offset(0),
289             count(0),
290             is_empty(true),
291             base(NULL)
292         {}
293 
read_next(bool flag)294         bool read_next(bool flag)
295         {
296             if (flag != 0) {
297                 if (is_empty) {
298                     if (ranges) {
299                         if (!get_next_range(ranges, &offset)) {
300                             // ranges are over
301                             return false;
302                         }
303                     }
304                     // all contiguous elements are over
305                     else if (count != 0) {
306                         return false;
307                     }
308 
309                     length_cur = size;
310                 }
311                 else {
312                     offset += el_size;
313                 }
314                 val = (T)get_el_value(base, offset, el_size);
315                 length_cur -= el_size;
316                 count++;
317                 is_empty = length_cur == 0;
318             }
319             return true;
320         }
321     public:
322         CeanReadRanges * ranges;
323         T       val;
324         int     el_size;
325         int64_t size,
326                 offset,
327                 length_cur;
328         bool    is_empty;
329         int     count;
330         char   *base;
331     };
332 
333     // ptr_data for persistent auto objects
334     PtrData*    m_stack_ptr_data;
335     PtrDataList m_destroy_stack;
336 
337     // Engine
338     Engine& m_device;
339 
340     // true for offload_wait target(mic) stream(0)
341     bool m_wait_all_devices;
342 
343     // if true offload is mandatory
344     bool m_is_mandatory;
345 
346     // if true offload has openmp origin
347     const bool m_is_openmp;
348 
349     // The Marshaller for the inputs of the offloaded region.
350     Marshaller m_in;
351 
352     // The Marshaller for the outputs of the offloaded region.
353     Marshaller m_out;
354 
355     // List of buffers that are passed to dispatch call
356     BufferList m_compute_buffers;
357 
358     // List of buffers that need to be destroyed at the end of offload
359     BufferList m_destroy_buffers;
360 
361     // Variable descriptors
362     VarDesc*  m_vars;
363     VarExtra* m_vars_extra;
364     int       m_vars_total;
365 
366     // Pointer to a user-specified status variable
367     _Offload_status *m_status;
368 
369     // Function descriptor
370     FunctionDescriptor* m_func_desc;
371     uint32_t            m_func_desc_size;
372 
373     // Buffer for transferring copyin/copyout data
374     COIBUFFER m_inout_buf;
375 
376 
377     // Dependencies
378     COIEVENT *m_in_deps;
379     uint32_t  m_in_deps_total;
380     uint32_t  m_in_deps_allocated;
381     COIEVENT *m_out_deps;
382     uint32_t  m_out_deps_total;
383     uint32_t  m_out_deps_allocated;
384 
385     // 2 variables defines input dependencies for current COI API.
386     // The calls to routines as BufferWrite/PipelineRunFunction/BufferRead
387     // is supposed to have input dependencies.
388     // 2 variables below defines the number and vector of dependencies
389     // in every current moment of offload.
390     // So any phase of offload can use its values as input dependencies
391     // for the COI API that the phase calls.
392     // It means that all phases (of Write, RunFunction,Read) must keep
393     // the variables correct to be used by following phase.
394     // If some consequent offloads are connected (i.e. by the same stream)
395     // the final 2 variables of the offload is used as initial inputs
396     // for the next offload.
397     uint32_t  m_num_in_dependencies;
398     COIEVENT *m_p_in_dependencies;
399 
400     // Stream
401     _Offload_stream m_stream;
402 
403     // Signal
404     void* m_signal;
405 
406     // Timer data
407     OffloadHostTimerData *m_timer_data;
408 
409     // copyin/copyout data length
410     uint64_t m_in_datalen;
411     uint64_t m_out_datalen;
412 
413     // a boolean value calculated in setup_descriptors. If true we need to do
414     // a run function on the target. Otherwise it may be optimized away.
415     bool m_need_runfunction;
416 
417     // initialized value of m_need_runfunction;
418     // is used to recognize offload_transfer
419     bool m_initial_need_runfunction;
420 
421     // a Boolean value set to true when OUT clauses with preallocated targetptr
422     // is encountered to indicate that call receive_pointer_data needs to be
423     // invoked again after call to scatter_copyout_data.
424     bool m_out_with_preallocated;
425 
426     // a Boolean value set to true if an alloc_if(1) is used with preallocated
427     // targetptr to indicate the need to scatter_copyout_data even for
428     // async offload
429     bool m_preallocated_alloc;
430 
431     // a Boolean value set to true if traceback routine is called
432     bool m_traceback_called;
433 
434     OmpAsyncLastEventType m_omp_async_last_event_type;
435 };
436 
// Initialization types for MIC
enum OffloadInitType {
    // all devices before entering main
    c_init_on_start = 0,

    // single device before starting the first offload
    c_init_on_offload = 1,

    // all devices before starting the first offload
    c_init_on_offload_all = 2
};
443 
// Determines if MIC code is an executable or a shared library.
// NOTE(review): target_image presumably points to an Image structure as
// described above - confirm against the definition.
extern "C" bool __offload_target_image_is_executable(const void *target_image);

// Initializes library and registers specified offload image.
extern "C" bool __offload_register_image(const void* image);
// Counterpart of __offload_register_image for the same image pointer.
extern "C" void __offload_unregister_image(const void* image);

// Registers asynchronous task completion callback
extern "C" void __offload_register_task_callback(void (*cb)(void *));
453 
// Initializes offload runtime library.
DLL_LOCAL extern int __offload_init_library(void);

// Thread data key for associating pipelines with threads
DLL_LOCAL extern pthread_key_t mic_thread_key;

// Location of the offload_main executable.
// To be used if the main application has no offload and is not built
// with -offload, but a dynamic library linked in has an offload pragma.
DLL_LOCAL extern char* mic_device_main;

// Environment variables for devices
DLL_LOCAL extern MicEnvVar mic_env_vars;

// CPU frequency
DLL_LOCAL extern uint64_t cpu_frequency;

// LD_LIBRARY_PATH for KNC libraries
DLL_LOCAL extern char* knc_library_path;

// LD_LIBRARY_PATH for KNL libraries
DLL_LOCAL extern char* knl_library_path;

// Stack size for target
DLL_LOCAL extern uint32_t mic_stack_size;

// Preallocated memory size for buffers on MIC
DLL_LOCAL extern uint64_t mic_buffer_size;

// Preallocated 4K page memory size for buffers on MIC
DLL_LOCAL extern uint64_t mic_4k_buffer_size;

// Preallocated 2M page memory size for buffers on MIC
DLL_LOCAL extern uint64_t mic_2m_buffer_size;

// Settings controlling the inout proxy
DLL_LOCAL extern bool  mic_proxy_io;
DLL_LOCAL extern char* mic_proxy_fs_root;

// Threshold for creating buffers with large pages
DLL_LOCAL extern uint64_t __offload_use_2mb_buffers;

// Offload initialization type
DLL_LOCAL extern OffloadInitType __offload_init_type;

// Device number to offload to when device is not explicitly specified.
DLL_LOCAL extern int __omp_device_num;

// Target executable
DLL_LOCAL extern TargetImage* __target_exe;

// True if the last loaded image is a DLL
DLL_LOCAL extern bool __current_image_is_dll;
// True if the MYO library is loaded when a DLL is loaded
DLL_LOCAL extern bool __myo_init_in_so;
509 
// IDB support
// The symbols below have C linkage.

// Called by the offload runtime after initialization of offload infrastructure
// has been completed.
extern "C" void  __dbg_target_so_loaded();

// Called by the offload runtime when the offload infrastructure is about to be
// shut down, currently at application exit.
extern "C" void  __dbg_target_so_unloaded();

// Null-terminated string containing path to the process image of the hosting
// application (offload_main).
// MAX_TARGET_NAME is the capacity of that buffer in chars.
#define MAX_TARGET_NAME 512
extern "C" char  __dbg_target_exe_name[MAX_TARGET_NAME];

// Integer specifying the process id
extern "C" pid_t __dbg_target_so_pid;

// Integer specifying the 0-based device number
extern "C" int   __dbg_target_id;

// Set to non-zero by the host-side debugger to enable offload debugging
// support
extern "C" int   __dbg_is_attached;

// Major version of the debugger support API
extern "C" const int __dbg_api_major_version;

// Minor version of the debugger support API
extern "C" const int __dbg_api_minor_version;
540 
541 #endif // OFFLOAD_HOST_H_INCLUDED
542