1 /*
2     Copyright (c) 2014-2016 Intel Corporation.  All Rights Reserved.
3 
4     Redistribution and use in source and binary forms, with or without
5     modification, are permitted provided that the following conditions
6     are met:
7 
8       * Redistributions of source code must retain the above copyright
9         notice, this list of conditions and the following disclaimer.
10       * Redistributions in binary form must reproduce the above copyright
11         notice, this list of conditions and the following disclaimer in the
12         documentation and/or other materials provided with the distribution.
13       * Neither the name of Intel Corporation nor the names of its
14         contributors may be used to endorse or promote products derived
15         from this software without specific prior written permission.
16 
17     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20     A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21     HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23     LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24     DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25     THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29 
30 
31 #ifndef OFFLOAD_ENGINE_H_INCLUDED
32 #define OFFLOAD_ENGINE_H_INCLUDED
33 
34 #include <limits.h>
35 #include <bitset>
36 #include <list>
37 #include <set>
38 #include <map>
39 #include "offload_common.h"
40 #include "coi/coi_client.h"
41 
// Sentinel descriptor value (all bits set, never a real object address).
// Engine stores it in a signal-map entry to mark the signal as completed.
#define SIGNAL_HAS_COMPLETED ((OffloadDescriptor *)-1)
// Stream handle constant with value -1.
// NOTE(review): presumably means "no stream was specified" - confirm at the
// call sites, which are not part of this header.
const int64_t no_stream = -1;
44 
// Half-open range of CPU addresses: [start, start + length).
class MemRange {
public:
    // Empty range located at the null address.
    MemRange() : m_start(0), m_length(0) {}

    // Range of len bytes that begins at addr.
    MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {}

    // Address of the first byte of the range.
    const void* start() const { return m_start; }

    // Address one past the last byte of the range.
    const void* end() const {
        const char *first_byte = static_cast<const char*>(m_start);
        return first_byte + m_length;
    }

    // Size of the range in bytes.
    uint64_t length() const { return m_length; }

    // True when this range and o share at least one byte, i.e. when
    // this->start < o.end and o.start < this->end.
    bool overlaps(const MemRange &o) const {
        if (!(start() < o.end())) {
            return false;
        }
        return o.start() < end();
    }

    // True when o lies entirely inside this range (bounds included).
    bool contains(const MemRange &o) const {
        const bool starts_inside = start() <= o.start();
        const bool ends_inside   = o.end() <= end();
        return starts_inside && ends_inside;
    }

private:
    const void* m_start;   // first byte of the range
    uint64_t    m_length;  // number of bytes in the range
};
79 
80 // Data associated with a pointer variable
81 class PtrData {
82 public:
PtrData(const void * addr,uint64_t len)83     PtrData(const void *addr, uint64_t len) :
84         cpu_addr(addr, len), cpu_buf(0),
85         mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0),
86         ref_count(0), is_static(false), is_omp_associate(false)
87     {}
88 
89     //
90     // Copy constructor
91     //
PtrData(const PtrData & ptr)92     PtrData(const PtrData& ptr):
93         cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf),
94         mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp),
95         mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset),
96         ref_count(ptr.ref_count), is_static(ptr.is_static),
97         is_omp_associate(ptr.is_omp_associate),
98         var_alloc_type(0)
99     {}
100 
101     bool operator<(const PtrData &o) const {
102         // Variables are sorted by the CPU start address.
103         // Overlapping memory ranges are considered equal.
104         return (cpu_addr.start() < o.cpu_addr.start()) &&
105                !cpu_addr.overlaps(o.cpu_addr);
106     }
107 
add_reference()108     long add_reference() {
109         if (is_omp_associate || (is_static && !var_alloc_type)) {
110             return LONG_MAX;
111         }
112 #ifndef TARGET_WINNT
113         return __sync_fetch_and_add(&ref_count, 1);
114 #else // TARGET_WINNT
115         return _InterlockedIncrement(&ref_count) - 1;
116 #endif // TARGET_WINNT
117     }
118 
remove_reference()119     long remove_reference() {
120         if (is_omp_associate || (is_static && !var_alloc_type)) {
121             return LONG_MAX;
122         }
123 #ifndef TARGET_WINNT
124         return __sync_sub_and_fetch(&ref_count, 1);
125 #else // TARGET_WINNT
126         return _InterlockedDecrement(&ref_count);
127 #endif // TARGET_WINNT
128     }
129 
get_reference()130     long get_reference() const {
131         if (is_omp_associate || (is_static && !var_alloc_type)) {
132             return LONG_MAX;
133         }
134         return ref_count;
135     }
136 
137 public:
138     // CPU address range
139     const MemRange  cpu_addr;
140 
141     // CPU and MIC buffers
142     COIBUFFER       cpu_buf;
143     COIBUFFER       mic_buf;
144 
145     // placeholder for buffer address on mic
146     uint64_t        mic_addr;
147 
148     uint64_t        alloc_disp;
149 
150     // additional offset to pointer data on MIC for improving bandwidth for
151     // data which is not 4K aligned
152     uint32_t        mic_offset;
153 
154     // if true buffers are created from static memory
155     bool            is_static;
156 
157     // true if MIC buffer created by omp_target_associate
158     bool            is_omp_associate;
159 
160     bool            var_alloc_type;
161     mutex_t         alloc_ptr_data_lock;
162 
163 private:
164     // reference count for the entry
165     long            ref_count;
166 };
167 
// List of pointers to PtrData entries
typedef std::list<PtrData*> PtrDataList;
169 
170 class PtrDataTable {
171 public:
172     typedef std::set<PtrData> PtrSet;
173 
find_ptr_data(const void * ptr)174     PtrData* find_ptr_data(const void *ptr) {
175         m_ptr_lock.lock();
176         PtrSet::iterator res = list.find(PtrData(ptr, 0));
177 
178         m_ptr_lock.unlock();
179         if (res == list.end()) {
180             return 0;
181         }
182         return const_cast<PtrData*>(res.operator->());
183     }
184 
insert_ptr_data(const void * ptr,uint64_t len,bool & is_new)185     PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
186         m_ptr_lock.lock();
187         std::pair<PtrSet::iterator, bool> res =
188             list.insert(PtrData(ptr, len));
189 
190         PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->());
191         m_ptr_lock.unlock();
192 
193         is_new = res.second;
194         if (is_new) {
195             // It's necessary to lock as soon as possible.
196             // unlock must be done at call site of insert_ptr_data at
197             // branch for is_new
198             ptr_data->alloc_ptr_data_lock.lock();
199         }
200         return ptr_data;
201     }
202 
remove_ptr_data(const void * ptr)203     void remove_ptr_data(const void *ptr) {
204         m_ptr_lock.lock();
205         list.erase(PtrData(ptr, 0));
206         m_ptr_lock.unlock();
207     }
208 private:
209 
210     PtrSet list;
211     mutex_t     m_ptr_lock;
212 };
213 
214 // Data associated with automatic variable
215 class AutoData {
216 public:
AutoData(const void * addr,uint64_t len)217     AutoData(const void *addr, uint64_t len) :
218         cpu_addr(addr, len), ref_count(0)
219     {}
220 
221     bool operator<(const AutoData &o) const {
222         // Variables are sorted by the CPU start address.
223         // Overlapping memory ranges are considered equal.
224         return (cpu_addr.start() < o.cpu_addr.start()) &&
225                !cpu_addr.overlaps(o.cpu_addr);
226     }
227 
add_reference()228     long add_reference() {
229 #ifndef TARGET_WINNT
230         return __sync_fetch_and_add(&ref_count, 1);
231 #else // TARGET_WINNT
232         return _InterlockedIncrement(&ref_count) - 1;
233 #endif // TARGET_WINNT
234     }
235 
remove_reference()236     long remove_reference() {
237 #ifndef TARGET_WINNT
238         return __sync_sub_and_fetch(&ref_count, 1);
239 #else // TARGET_WINNT
240         return _InterlockedDecrement(&ref_count);
241 #endif // TARGET_WINNT
242     }
243 
nullify_reference()244     long nullify_reference() {
245 #ifndef TARGET_WINNT
246         return __sync_lock_test_and_set(&ref_count, 0);
247 #else // TARGET_WINNT
248         return _InterlockedExchange(&ref_count,0);
249 #endif // TARGET_WINNT
250     }
251 
get_reference()252     long get_reference() const {
253         return ref_count;
254     }
255 
256 public:
257     // CPU address range
258     const MemRange cpu_addr;
259 
260 private:
261     // reference count for the entry
262     long ref_count;
263 };
264 
// Set of automatic variables
typedef std::set<AutoData> AutoSet;
267 
// Target image data
//
// Plain record describing one image. The pointers are stored as given;
// nothing is copied or owned by this struct.
struct TargetImage
{
    TargetImage(const char *image_name, const void *image_data,
                uint64_t image_size, const char *origin_file,
                uint64_t origin_offset) :
        name(image_name),
        data(image_data),
        size(image_size),
        origin(origin_file),
        offset(origin_offset)
    {}

    // library name
    const char* name;

    // contents and size
    const void* data;
    uint64_t    size;

    // file of origin and offset within that file
    const char* origin;
    uint64_t    offset;
};
288 
// List of target images, stored by value
typedef std::list<TargetImage> TargetImageList;
290 
291 // dynamic library and Image associated with lib
292 struct DynLib
293 {
DynLibDynLib294     DynLib(const char *_name, const void *_data,
295            COILIBRARY _lib) :
296         name(_name), data(_data), lib(_lib)
297     {}
298     // library name
299     const char* name;
300 
301     // contents
302     const void* data;
303 
304     COILIBRARY lib;
305 };
// List of dynamic library records, stored by value
typedef std::list<DynLib> DynLibList;
307 
308 // Data associated with persistent auto objects
309 struct PersistData
310 {
PersistDataPersistData311     PersistData(const void *addr, uint64_t routine_num,
312                 uint64_t size, uint64_t thread) :
313         stack_cpu_addr(addr), routine_id(routine_num), thread_id(thread)
314     {
315         stack_ptr_data = new PtrData(0, size);
316     }
317     // 1-st key value - beginning of the stack at CPU
318     const void *   stack_cpu_addr;
319     // 2-nd key value - identifier of routine invocation at CPU
320     uint64_t   routine_id;
321     // 3-rd key value - thread identifier
322     uint64_t   thread_id;
323 
324     // corresponded PtrData; only stack_ptr_data->mic_buf is used
325     PtrData * stack_ptr_data;
326     // used to get offset of the variable in stack buffer
327     char * cpu_stack_addr;
328 };
329 
// List of persistent-data records, stored by value
typedef std::list<PersistData> PersistDataList;
331 
332 // Data associated with stream
333 struct Stream
334 {
StreamStream335     Stream(int device, int num_of_cpus) :
336        m_number_of_cpus(num_of_cpus), m_pipeline(0), m_last_offload(0),
337        m_device(device)
338     {}
~StreamStream339     ~Stream() {
340         if (m_pipeline) {
341              COI::PipelineDestroy(m_pipeline);
342         }
343     }
344 
get_pipelineStream345     COIPIPELINE get_pipeline(void) {
346         return(m_pipeline);
347     }
348 
get_deviceStream349     int get_device(void) {
350         return(m_device);
351     }
352 
get_cpu_numberStream353     int get_cpu_number(void) {
354         return(m_number_of_cpus);
355     }
356 
set_pipelineStream357     void set_pipeline(COIPIPELINE pipeline) {
358         m_pipeline = pipeline;
359     }
360 
get_last_offloadStream361     OffloadDescriptor* get_last_offload(void) {
362         return(m_last_offload);
363     }
364 
set_last_offloadStream365     void set_last_offload(OffloadDescriptor*   last_offload) {
366         m_last_offload = last_offload;
367     }
368 
369     static Stream* find_stream(uint64_t handle, bool remove);
370 
add_streamStream371     static _Offload_stream  add_stream(int device, int number_of_cpus) {
372         _Offload_stream result;
373         m_stream_lock.lock();
374         result = ++m_streams_count;
375         all_streams[m_streams_count] = new Stream(device, number_of_cpus);
376         m_stream_lock.unlock();
377         return(result);
378     }
379 
get_streams_countStream380     static uint64_t get_streams_count() {
381         return m_streams_count;
382     }
383 
384     typedef std::map<uint64_t, Stream*> StreamMap;
385 
386     static uint64_t  m_streams_count;
387     static StreamMap all_streams;
388     static mutex_t   m_stream_lock;
389 
390     int m_device;
391 
392     // number of cpus
393     int m_number_of_cpus;
394 
395     // The pipeline associated with the stream
396     COIPIPELINE         m_pipeline;
397 
398     // The last offload occured via the stream
399     OffloadDescriptor*  m_last_offload;
400 
401     // Cpus used by the stream
402     std::bitset<COI_MAX_HW_THREADS> m_stream_cpus;
403 };
404 
// Stream objects keyed by a 64-bit value.
// NOTE(review): presumably the stream handle, as in Stream::all_streams -
// confirm against the code that fills Engine's stream map.
typedef std::map<uint64_t, Stream*> StreamMap;
// Bit set with COI_MAX_HW_THREADS bits; the type of an Engine's cpu mask.
typedef std::bitset<COI_MAX_HW_THREADS> micLcpuMask;
407 
// Node of a doubly linked list of cpus used by streams; the list is kept
// ordered by count.
struct CpuEl {
    // number of streams using the cpu
    uint64_t count;
    // cpu with the same or lesser count
    CpuEl*   prev;
    // cpu with the same or greater count
    CpuEl*   next;
};
414 
415 // class representing a single engine
416 struct Engine {
417     friend void __offload_init_library_once(void);
418     friend void __offload_fini_library(void);
419 
420 #define CPU_INDEX(x) (x - m_cpus)
421 #define check_result(res, tag, ...) \
422     { \
423         if (res == COI_PROCESS_DIED) { \
424             fini_process(true); \
425             exit(1); \
426         } \
427         if (res != COI_SUCCESS) { \
428             __liboffload_error_support(tag, __VA_ARGS__); \
429             exit(1); \
430         } \
431     }
432 
get_logical_indexEngine433     int get_logical_index() const {
434         return m_index;
435     }
436 
get_physical_indexEngine437     int get_physical_index() const {
438         return m_physical_index;
439     }
440 
get_processEngine441     const COIPROCESS& get_process() const {
442         return m_process;
443     }
444 
get_readyEngine445     bool get_ready() {
446         return m_ready;
447     }
448 
449     uint64_t get_thread_id(void);
450 
451     // initialize device
452     void init(void);
453 
454     // unload library
455     void unload_library(const void *data, const char *name);
456 
457     // add new library
add_libEngine458     void add_lib(const TargetImage &lib)
459     {
460         m_lock.lock();
461         m_ready = false;
462         m_images.push_back(lib);
463         m_lock.unlock();
464     }
465 
466     COIRESULT compute(
467         _Offload_stream     stream,
468         const std::list<COIBUFFER> &buffers,
469         const void*         data,
470         uint16_t            data_size,
471         void*               ret,
472         uint16_t            ret_size,
473         uint32_t            num_deps,
474         const COIEVENT*     deps,
475         COIEVENT*           event
476     );
477 
478 #ifdef MYO_SUPPORT
479     // temporary workaround for blocking behavior for myoiLibInit/Fini calls
init_myoEngine480     void init_myo(COIEVENT *event) {
481         COIRESULT res;
482         res = COI::PipelineRunFunction(get_pipeline(),
483                                        m_funcs[c_func_myo_init],
484                                        0, 0, 0, 0, 0, 0, 0, 0, 0,
485                                        event);
486         check_result(res, c_pipeline_run_func, m_index, res);
487     }
488 
fini_myoEngine489     void fini_myo(COIEVENT *event) {
490         COIRESULT res;
491         res = COI::PipelineRunFunction(get_pipeline(),
492                                        m_funcs[c_func_myo_fini],
493                                        0, 0, 0, 0, 0, 0, 0, 0, 0,
494                                        event);
495         check_result(res, c_pipeline_run_func, m_index, res);
496     }
497 #endif // MYO_SUPPORT
498 
499     //
500     // Memory association table
501     //
find_ptr_dataEngine502     PtrData* find_ptr_data(const void *ptr) {
503         return m_ptr_set.find_ptr_data(ptr);
504     }
505 
find_targetptr_dataEngine506     PtrData* find_targetptr_data(const void *ptr) {
507         return m_targetptr_set.find_ptr_data(ptr);
508     }
509 
insert_ptr_dataEngine510     PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
511         return m_ptr_set.insert_ptr_data(ptr, len, is_new);
512     }
513 
insert_targetptr_dataEngine514     PtrData* insert_targetptr_data(const void *ptr, uint64_t len,
515                                    bool &is_new) {
516         return m_targetptr_set.insert_ptr_data(ptr, len, is_new);
517     }
518 
remove_ptr_dataEngine519     void remove_ptr_data(const void *ptr) {
520         m_ptr_set.remove_ptr_data(ptr);
521     }
522 
remove_targetptr_dataEngine523     void remove_targetptr_data(const void *ptr) {
524         m_targetptr_set.remove_ptr_data(ptr);
525     }
526 
527     //
528     // Automatic variables
529     //
find_auto_dataEngine530     AutoData* find_auto_data(const void *ptr) {
531         AutoSet &auto_vars = get_auto_vars();
532         AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0));
533         if (res == auto_vars.end()) {
534             return 0;
535         }
536         return const_cast<AutoData*>(res.operator->());
537     }
538 
insert_auto_dataEngine539     AutoData* insert_auto_data(const void *ptr, uint64_t len) {
540         AutoSet &auto_vars = get_auto_vars();
541         std::pair<AutoSet::iterator, bool> res =
542             auto_vars.insert(AutoData(ptr, len));
543         return const_cast<AutoData*>(res.first.operator->());
544     }
545 
remove_auto_dataEngine546     void remove_auto_data(const void *ptr) {
547         get_auto_vars().erase(AutoData(ptr, 0));
548     }
549 
550     //
551     // Signals
552     //
add_signalEngine553     void add_signal(const void *signal, OffloadDescriptor *desc) {
554         m_signal_lock.lock();
555         m_signal_map[signal] = desc;
556         m_signal_lock.unlock();
557     }
558 
find_signalEngine559     OffloadDescriptor* find_signal(const void *signal, bool remove) {
560         OffloadDescriptor *desc = 0;
561 
562         m_signal_lock.lock();
563         {
564             SignalMap::iterator it = m_signal_map.find(signal);
565             if (it != m_signal_map.end()) {
566                 desc = it->second;
567                 if (remove) {
568                     it->second = SIGNAL_HAS_COMPLETED;
569                 }
570             }
571         }
572         m_signal_lock.unlock();
573 
574         return desc;
575     }
576 
complete_signaled_ofldEngine577     void complete_signaled_ofld(const void *signal) {
578 
579         m_signal_lock.lock();
580         {
581             SignalMap::iterator it = m_signal_map.find(signal);
582             if (it != m_signal_map.end()) {
583                 it->second = SIGNAL_HAS_COMPLETED;
584             }
585         }
586         m_signal_lock.unlock();
587     }
588 
589     void stream_destroy(_Offload_stream handle);
590 
591     void move_cpu_el_after(CpuEl* cpu_what, CpuEl* cpu_after);
592     void print_stream_cpu_list(const char *);
593 
594     COIPIPELINE get_pipeline(_Offload_stream stream);
595 
get_stream_mapEngine596     StreamMap get_stream_map() {
597         return m_stream_map;
598     }
599 
600     // stop device process
601     void fini_process(bool verbose);
602 
603     // list of stacks active at the engine
604     PersistDataList m_persist_list;
605 
606 private:
EngineEngine607     Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false),
608                m_proc_number(0), m_assigned_cpus(0), m_cpus(0), m_cpu_head(0)
609     {}
610 
~EngineEngine611     ~Engine() {
612         m_ready = false;
613         for (StreamMap::iterator it = m_stream_map.begin();
614              it != m_stream_map.end(); it++) {
615             Stream * stream = it->second;
616             delete stream;
617         }
618         if (m_process != 0) {
619             fini_process(false);
620         }
621         if (m_assigned_cpus) {
622             delete m_assigned_cpus;
623         }
624     }
625 
626     // set indexes
set_indexesEngine627     void set_indexes(int logical_index, int physical_index) {
628         m_index = logical_index;
629         m_physical_index = physical_index;
630     }
631 
632     // set CPU mask
set_cpu_maskEngine633     void set_cpu_mask(micLcpuMask *cpu_mask)
634     {
635         m_assigned_cpus = cpu_mask;
636     }
637 
638     // start process on device
639     void init_process();
640 
641     void load_libraries(void);
642     void init_ptr_data(void);
643 
644     // performs library intialization on the device side
645     pid_t init_device(void);
646 
647 private:
648     // get pipeline associated with a calling thread
649     COIPIPELINE get_pipeline(void);
650 
651     // get automatic vars set associated with the calling thread
652     AutoSet& get_auto_vars(void);
653 
654     // destructor for thread data
655     static void destroy_thread_data(void *data);
656 
657 private:
658     typedef std::set<PtrData> PtrSet;
659     typedef std::map<const void*, OffloadDescriptor*> SignalMap;
660 
661     // device indexes
662     int         m_index;
663     int         m_physical_index;
664 
665     // cpu mask
666     micLcpuMask *m_assigned_cpus;
667 
668     // number of COI pipes created for the engine
669     long        m_proc_number;
670 
671     // process handle
672     COIPROCESS  m_process;
673 
674     // If false, device either has not been initialized or new libraries
675     // have been added.
676     bool        m_ready;
677     mutex_t     m_lock;
678 
679     // List of libraries to be loaded
680     TargetImageList m_images;
681 
682     // var tables
683     PtrDataTable m_ptr_set;
684     PtrDataTable m_targetptr_set;
685 
686     // signals
687     SignalMap m_signal_map;
688     mutex_t   m_signal_lock;
689 
690     // streams
691     StreamMap   m_stream_map;
692     mutex_t     m_stream_lock;
693     int         m_num_cores;
694     int         m_num_threads;
695     CpuEl*      m_cpus;
696     CpuEl*      m_cpu_head;
697 
698     // List of dynamic libraries to be registred
699     DynLibList m_dyn_libs;
700 
701     // constants for accessing device function handles
702     enum {
703         c_func_compute = 0,
704 #ifdef MYO_SUPPORT
705         c_func_myo_init,
706         c_func_myo_fini,
707 #endif // MYO_SUPPORT
708         c_func_init,
709         c_func_var_table_size,
710         c_func_var_table_copy,
711         c_func_set_stream_affinity,
712         c_funcs_total
713     };
714     static const char* m_func_names[c_funcs_total];
715 
716     // device function handles
717     COIFUNCTION m_funcs[c_funcs_total];
718 
719     // int -> name mapping for device signals
720     static const int   c_signal_max = 32;
721     static const char* c_signal_names[c_signal_max];
722 };
723 
724 #endif // OFFLOAD_ENGINE_H_INCLUDED
725