1 /*
2     Copyright (c) 2014-2015 Intel Corporation.  All Rights Reserved.
3 
4     Redistribution and use in source and binary forms, with or without
5     modification, are permitted provided that the following conditions
6     are met:
7 
8       * Redistributions of source code must retain the above copyright
9         notice, this list of conditions and the following disclaimer.
10       * Redistributions in binary form must reproduce the above copyright
11         notice, this list of conditions and the following disclaimer in the
12         documentation and/or other materials provided with the distribution.
13       * Neither the name of Intel Corporation nor the names of its
14         contributors may be used to endorse or promote products derived
15         from this software without specific prior written permission.
16 
17     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20     A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21     HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23     LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24     DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25     THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29 
30 
31 #ifndef OFFLOAD_ENGINE_H_INCLUDED
32 #define OFFLOAD_ENGINE_H_INCLUDED
33 
34 #include <limits.h>
35 #include <bitset>
36 #include <list>
37 #include <set>
38 #include <map>
39 #include "offload_common.h"
40 #include "coi/coi_client.h"
41 
// Sentinel descriptor value. Engine::find_signal() stores it in a signal map
// slot when it is asked to remove the signal, instead of erasing the slot.
#define SIGNAL_IS_REMOVED ((OffloadDescriptor *)-1)
// NOTE(review): presumably the stream handle value meaning "no stream";
// it is not referenced anywhere else in this header - confirm against users.
const int64_t no_stream = -1;
44 
// Half-open address range [start, start + length).
class MemRange {
public:
    // Creates an empty range located at the null address.
    MemRange() : m_start(0), m_length(0) {}

    // Creates a range of len bytes that begins at addr.
    MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {}

    // First address of the range.
    const void* start() const { return m_start; }

    // Address one byte past the last byte of the range.
    const void* end() const {
        const char *first_byte = static_cast<const char*>(m_start);
        return first_byte + m_length;
    }

    // Size of the range in bytes.
    uint64_t length() const { return m_length; }

    // Returns true if this range and o share at least one address.
    // Ranges A[start, end) and B[start, end) intersect exactly when
    // A.start < B.end and A.end > B.start.
    bool overlaps(const MemRange &o) const {
        if (!(o.end() > start())) {
            return false;
        }
        return o.start() < end();
    }

    // Returns true if o lies entirely inside this range.
    bool contains(const MemRange &o) const {
        if (!(o.start() >= start())) {
            return false;
        }
        return end() >= o.end();
    }

private:
    const void* m_start;   // first address of the range
    uint64_t    m_length;  // number of bytes in the range
};
79 
80 // Data associated with a pointer variable
81 class PtrData {
82 public:
PtrData(const void * addr,uint64_t len)83     PtrData(const void *addr, uint64_t len) :
84         cpu_addr(addr, len), cpu_buf(0),
85         mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0),
86         ref_count(0), is_static(false)
87     {}
88 
89     //
90     // Copy constructor
91     //
PtrData(const PtrData & ptr)92     PtrData(const PtrData& ptr):
93         cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf),
94         mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp),
95         mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset),
96         ref_count(ptr.ref_count), is_static(ptr.is_static)
97     {}
98 
99     bool operator<(const PtrData &o) const {
100         // Variables are sorted by the CPU start address.
101         // Overlapping memory ranges are considered equal.
102         return (cpu_addr.start() < o.cpu_addr.start()) &&
103                !cpu_addr.overlaps(o.cpu_addr);
104     }
105 
add_reference()106     long add_reference() {
107         if (is_static) {
108             return LONG_MAX;
109         }
110 #ifndef TARGET_WINNT
111         return __sync_fetch_and_add(&ref_count, 1);
112 #else // TARGET_WINNT
113         return _InterlockedIncrement(&ref_count) - 1;
114 #endif // TARGET_WINNT
115     }
116 
remove_reference()117     long remove_reference() {
118         if (is_static) {
119             return LONG_MAX;
120         }
121 #ifndef TARGET_WINNT
122         return __sync_sub_and_fetch(&ref_count, 1);
123 #else // TARGET_WINNT
124         return _InterlockedDecrement(&ref_count);
125 #endif // TARGET_WINNT
126     }
127 
get_reference()128     long get_reference() const {
129         if (is_static) {
130             return LONG_MAX;
131         }
132         return ref_count;
133     }
134 
135 public:
136     // CPU address range
137     const MemRange  cpu_addr;
138 
139     // CPU and MIC buffers
140     COIBUFFER       cpu_buf;
141     COIBUFFER       mic_buf;
142 
143     // placeholder for buffer address on mic
144     uint64_t        mic_addr;
145 
146     uint64_t        alloc_disp;
147 
148     // additional offset to pointer data on MIC for improving bandwidth for
149     // data which is not 4K aligned
150     uint32_t        mic_offset;
151 
152     // if true buffers are created from static memory
153     bool            is_static;
154     mutex_t         alloc_ptr_data_lock;
155 
156 private:
157     // reference count for the entry
158     long            ref_count;
159 };
160 
// List of pointers to PtrData entries.
typedef std::list<PtrData*> PtrDataList;
162 
163 class PtrDataTable {
164 public:
165     typedef std::set<PtrData> PtrSet;
166 
find_ptr_data(const void * ptr)167     PtrData* find_ptr_data(const void *ptr) {
168         m_ptr_lock.lock();
169         PtrSet::iterator res = list.find(PtrData(ptr, 0));
170 
171         m_ptr_lock.unlock();
172         if (res == list.end()) {
173             return 0;
174         }
175         return const_cast<PtrData*>(res.operator->());
176     }
177 
insert_ptr_data(const void * ptr,uint64_t len,bool & is_new)178     PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
179         m_ptr_lock.lock();
180         std::pair<PtrSet::iterator, bool> res =
181             list.insert(PtrData(ptr, len));
182 
183         PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->());
184         m_ptr_lock.unlock();
185 
186         is_new = res.second;
187         if (is_new) {
188             // It's necessary to lock as soon as possible.
189             // unlock must be done at call site of insert_ptr_data at
190             // branch for is_new
191             ptr_data->alloc_ptr_data_lock.lock();
192         }
193         return ptr_data;
194     }
195 
remove_ptr_data(const void * ptr)196     void remove_ptr_data(const void *ptr) {
197         m_ptr_lock.lock();
198         list.erase(PtrData(ptr, 0));
199         m_ptr_lock.unlock();
200     }
201 private:
202 
203     PtrSet list;
204     mutex_t     m_ptr_lock;
205 };
206 
207 // Data associated with automatic variable
208 class AutoData {
209 public:
AutoData(const void * addr,uint64_t len)210     AutoData(const void *addr, uint64_t len) :
211         cpu_addr(addr, len), ref_count(0)
212     {}
213 
214     bool operator<(const AutoData &o) const {
215         // Variables are sorted by the CPU start address.
216         // Overlapping memory ranges are considered equal.
217         return (cpu_addr.start() < o.cpu_addr.start()) &&
218                !cpu_addr.overlaps(o.cpu_addr);
219     }
220 
add_reference()221     long add_reference() {
222 #ifndef TARGET_WINNT
223         return __sync_fetch_and_add(&ref_count, 1);
224 #else // TARGET_WINNT
225         return _InterlockedIncrement(&ref_count) - 1;
226 #endif // TARGET_WINNT
227     }
228 
remove_reference()229     long remove_reference() {
230 #ifndef TARGET_WINNT
231         return __sync_sub_and_fetch(&ref_count, 1);
232 #else // TARGET_WINNT
233         return _InterlockedDecrement(&ref_count);
234 #endif // TARGET_WINNT
235     }
236 
nullify_reference()237     long nullify_reference() {
238 #ifndef TARGET_WINNT
239         return __sync_lock_test_and_set(&ref_count, 0);
240 #else // TARGET_WINNT
241         return _InterlockedExchange(&ref_count,0);
242 #endif // TARGET_WINNT
243     }
244 
get_reference()245     long get_reference() const {
246         return ref_count;
247     }
248 
249 public:
250     // CPU address range
251     const MemRange cpu_addr;
252 
253 private:
254     // reference count for the entry
255     long ref_count;
256 };
257 
// Set of automatic variables, ordered by AutoData::operator<
typedef std::set<AutoData> AutoSet;
260 
// Description of a target image. Only the pointers passed to the constructor
// are stored; nothing is copied.
struct TargetImage
{
    // Records the library name, the image contents with their size, and the
    // file (plus offset inside it) the image originates from.
    TargetImage(const char *image_name, const void *image_data,
                uint64_t image_size, const char *origin_file,
                uint64_t origin_offset) :
        name(image_name),
        data(image_data),
        size(image_size),
        origin(origin_file),
        offset(origin_offset)
    {}

    const char* name;    // library name

    const void* data;    // image contents
    uint64_t    size;    // size of the contents

    const char* origin;  // file of origin
    uint64_t    offset;  // offset of the image within that file
};
281 
// List of target images (stored by value).
typedef std::list<TargetImage> TargetImageList;
283 
284 // dynamic library and Image associated with lib
285 struct DynLib
286 {
DynLibDynLib287     DynLib(const char *_name, const void *_data,
288            COILIBRARY _lib) :
289         name(_name), data(_data), lib(_lib)
290     {}
291     // library name
292     const char* name;
293 
294     // contents
295     const void* data;
296 
297     COILIBRARY lib;
298 };
// List of dynamic libraries (stored by value).
typedef std::list<DynLib> DynLibList;
300 
301 // Data associated with persistent auto objects
302 struct PersistData
303 {
PersistDataPersistData304     PersistData(const void *addr, uint64_t routine_num,
305                 uint64_t size, uint64_t thread) :
306         stack_cpu_addr(addr), routine_id(routine_num), thread_id(thread)
307     {
308         stack_ptr_data = new PtrData(0, size);
309     }
310     // 1-st key value - beginning of the stack at CPU
311     const void *   stack_cpu_addr;
312     // 2-nd key value - identifier of routine invocation at CPU
313     uint64_t   routine_id;
314     // 3-rd key value - thread identifier
315     uint64_t   thread_id;
316 
317     // corresponded PtrData; only stack_ptr_data->mic_buf is used
318     PtrData * stack_ptr_data;
319     // used to get offset of the variable in stack buffer
320     char * cpu_stack_addr;
321 };
322 
// List of persistent stack descriptions (stored by value).
typedef std::list<PersistData> PersistDataList;
324 
325 // Data associated with stream
326 struct Stream
327 {
StreamStream328     Stream(int device, int num_of_cpus) :
329        m_number_of_cpus(num_of_cpus), m_pipeline(0), m_last_offload(0),
330        m_device(device)
331     {}
~StreamStream332     ~Stream() {
333         if (m_pipeline) {
334              COI::PipelineDestroy(m_pipeline);
335         }
336     }
337 
get_pipelineStream338     COIPIPELINE get_pipeline(void) {
339         return(m_pipeline);
340     }
341 
get_deviceStream342     int get_device(void) {
343         return(m_device);
344     }
345 
get_cpu_numberStream346     int get_cpu_number(void) {
347         return(m_number_of_cpus);
348     }
349 
set_pipelineStream350     void set_pipeline(COIPIPELINE pipeline) {
351         m_pipeline = pipeline;
352     }
353 
get_last_offloadStream354     OffloadDescriptor* get_last_offload(void) {
355         return(m_last_offload);
356     }
357 
set_last_offloadStream358     void set_last_offload(OffloadDescriptor*   last_offload) {
359         m_last_offload = last_offload;
360     }
361 
362     static Stream* find_stream(uint64_t handle, bool remove);
363 
add_streamStream364     static _Offload_stream  add_stream(int device, int number_of_cpus) {
365         m_stream_lock.lock();
366         all_streams[++m_streams_count] = new Stream(device, number_of_cpus);
367         m_stream_lock.unlock();
368         return(m_streams_count);
369     }
370 
371     typedef std::map<uint64_t, Stream*> StreamMap;
372 
373     static uint64_t  m_streams_count;
374     static StreamMap all_streams;
375     static mutex_t   m_stream_lock;
376 
377     int m_device;
378 
379     // number of cpus
380     int m_number_of_cpus;
381 
382     // The pipeline associated with the stream
383     COIPIPELINE         m_pipeline;
384 
385     // The last offload occured via the stream
386     OffloadDescriptor*  m_last_offload;
387 
388     // Cpus used by the stream
389     std::bitset<COI_MAX_HW_THREADS> m_stream_cpus;
390 };
391 
// Map from a 64-bit key to a Stream pointer; same type as Stream::StreamMap.
typedef std::map<uint64_t, Stream*> StreamMap;
393 
394 // class representing a single engine
395 struct Engine {
396     friend void __offload_init_library_once(void);
397     friend void __offload_fini_library(void);
398 
399 #define check_result(res, tag, ...) \
400     { \
401         if (res == COI_PROCESS_DIED) { \
402             fini_process(true); \
403             exit(1); \
404         } \
405         if (res != COI_SUCCESS) { \
406             __liboffload_error_support(tag, __VA_ARGS__); \
407             exit(1); \
408         } \
409     }
410 
get_logical_indexEngine411     int get_logical_index() const {
412         return m_index;
413     }
414 
get_physical_indexEngine415     int get_physical_index() const {
416         return m_physical_index;
417     }
418 
get_processEngine419     const COIPROCESS& get_process() const {
420         return m_process;
421     }
422 
423     uint64_t get_thread_id(void);
424 
425     // initialize device
426     void init(void);
427 
428     // unload library
429     void unload_library(const void *data, const char *name);
430 
431     // add new library
add_libEngine432     void add_lib(const TargetImage &lib)
433     {
434         m_lock.lock();
435         m_ready = false;
436         m_images.push_back(lib);
437         m_lock.unlock();
438     }
439 
440     COIRESULT compute(
441         _Offload_stream     stream,
442         const std::list<COIBUFFER> &buffers,
443         const void*         data,
444         uint16_t            data_size,
445         void*               ret,
446         uint16_t            ret_size,
447         uint32_t            num_deps,
448         const COIEVENT*     deps,
449         COIEVENT*           event
450     );
451 
452 #ifdef MYO_SUPPORT
453     // temporary workaround for blocking behavior for myoiLibInit/Fini calls
init_myoEngine454     void init_myo(COIEVENT *event) {
455         COIRESULT res;
456         res = COI::PipelineRunFunction(get_pipeline(),
457                                        m_funcs[c_func_myo_init],
458                                        0, 0, 0, 0, 0, 0, 0, 0, 0,
459                                        event);
460         check_result(res, c_pipeline_run_func, m_index, res);
461     }
462 
fini_myoEngine463     void fini_myo(COIEVENT *event) {
464         COIRESULT res;
465         res = COI::PipelineRunFunction(get_pipeline(),
466                                        m_funcs[c_func_myo_fini],
467                                        0, 0, 0, 0, 0, 0, 0, 0, 0,
468                                        event);
469         check_result(res, c_pipeline_run_func, m_index, res);
470     }
471 #endif // MYO_SUPPORT
472 
473     //
474     // Memory association table
475     //
find_ptr_dataEngine476     PtrData* find_ptr_data(const void *ptr) {
477         return m_ptr_set.find_ptr_data(ptr);
478     }
479 
find_targetptr_dataEngine480     PtrData* find_targetptr_data(const void *ptr) {
481         return m_targetptr_set.find_ptr_data(ptr);
482     }
483 
insert_ptr_dataEngine484     PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) {
485         return m_ptr_set.insert_ptr_data(ptr, len, is_new);
486     }
487 
insert_targetptr_dataEngine488     PtrData* insert_targetptr_data(const void *ptr, uint64_t len,
489                                    bool &is_new) {
490         return m_targetptr_set.insert_ptr_data(ptr, len, is_new);
491     }
492 
remove_ptr_dataEngine493     void remove_ptr_data(const void *ptr) {
494         m_ptr_set.remove_ptr_data(ptr);
495     }
496 
remove_targetptr_dataEngine497     void remove_targetptr_data(const void *ptr) {
498         m_targetptr_set.remove_ptr_data(ptr);
499     }
500 
501     //
502     // Automatic variables
503     //
find_auto_dataEngine504     AutoData* find_auto_data(const void *ptr) {
505         AutoSet &auto_vars = get_auto_vars();
506         AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0));
507         if (res == auto_vars.end()) {
508             return 0;
509         }
510         return const_cast<AutoData*>(res.operator->());
511     }
512 
insert_auto_dataEngine513     AutoData* insert_auto_data(const void *ptr, uint64_t len) {
514         AutoSet &auto_vars = get_auto_vars();
515         std::pair<AutoSet::iterator, bool> res =
516             auto_vars.insert(AutoData(ptr, len));
517         return const_cast<AutoData*>(res.first.operator->());
518     }
519 
remove_auto_dataEngine520     void remove_auto_data(const void *ptr) {
521         get_auto_vars().erase(AutoData(ptr, 0));
522     }
523 
524     //
525     // Signals
526     //
add_signalEngine527     void add_signal(const void *signal, OffloadDescriptor *desc) {
528         m_signal_lock.lock();
529         m_signal_map[signal] = desc;
530         m_signal_lock.unlock();
531     }
532 
find_signalEngine533     OffloadDescriptor* find_signal(const void *signal, bool remove) {
534         OffloadDescriptor *desc = 0;
535 
536         m_signal_lock.lock();
537         {
538             SignalMap::iterator it = m_signal_map.find(signal);
539             if (it != m_signal_map.end()) {
540                 desc = it->second;
541                 if (remove) {
542                     it->second = SIGNAL_IS_REMOVED;
543                 }
544             }
545         }
546         m_signal_lock.unlock();
547 
548         return desc;
549     }
550 
551     void stream_destroy(_Offload_stream handle);
552 
553     COIPIPELINE get_pipeline(_Offload_stream stream);
554 
get_stream_mapEngine555     StreamMap get_stream_map() {
556         return m_stream_map;
557     }
558 
559     // stop device process
560     void fini_process(bool verbose);
561 
562     // list of stacks active at the engine
563     PersistDataList m_persist_list;
564 
565 private:
EngineEngine566     Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false),
567                m_proc_number(0)
568     {}
569 
~EngineEngine570     ~Engine() {
571         for (StreamMap::iterator it = m_stream_map.begin();
572              it != m_stream_map.end(); it++) {
573             Stream * stream = it->second;
574             delete stream;
575         }
576         if (m_process != 0) {
577             fini_process(false);
578         }
579     }
580 
581     // set indexes
set_indexesEngine582     void set_indexes(int logical_index, int physical_index) {
583         m_index = logical_index;
584         m_physical_index = physical_index;
585     }
586 
587     // start process on device
588     void init_process();
589 
590     void load_libraries(void);
591     void init_ptr_data(void);
592 
593     // performs library intialization on the device side
594     pid_t init_device(void);
595 
596 private:
597     // get pipeline associated with a calling thread
598     COIPIPELINE get_pipeline(void);
599 
600     // get automatic vars set associated with the calling thread
601     AutoSet& get_auto_vars(void);
602 
603     // destructor for thread data
604     static void destroy_thread_data(void *data);
605 
606 private:
607     typedef std::set<PtrData> PtrSet;
608     typedef std::map<const void*, OffloadDescriptor*> SignalMap;
609 
610     // device indexes
611     int         m_index;
612     int         m_physical_index;
613 
614     // number of COI pipes created for the engine
615     long        m_proc_number;
616 
617     // process handle
618     COIPROCESS  m_process;
619 
620     // If false, device either has not been initialized or new libraries
621     // have been added.
622     bool        m_ready;
623     mutex_t     m_lock;
624 
625     // List of libraries to be loaded
626     TargetImageList m_images;
627 
628     // var tables
629     PtrDataTable m_ptr_set;
630     PtrDataTable m_targetptr_set;
631 
632     // signals
633     SignalMap m_signal_map;
634     mutex_t   m_signal_lock;
635 
636     // streams
637     StreamMap m_stream_map;
638     mutex_t   m_stream_lock;
639     int       m_num_cores;
640     int       m_num_threads;
641     std::bitset<COI_MAX_HW_THREADS> m_cpus;
642 
643     // List of dynamic libraries to be registred
644     DynLibList m_dyn_libs;
645 
646     // constants for accessing device function handles
647     enum {
648         c_func_compute = 0,
649 #ifdef MYO_SUPPORT
650         c_func_myo_init,
651         c_func_myo_fini,
652 #endif // MYO_SUPPORT
653         c_func_init,
654         c_func_var_table_size,
655         c_func_var_table_copy,
656         c_func_set_stream_affinity,
657         c_funcs_total
658     };
659     static const char* m_func_names[c_funcs_total];
660 
661     // device function handles
662     COIFUNCTION m_funcs[c_funcs_total];
663 
664     // int -> name mapping for device signals
665     static const int   c_signal_max = 32;
666     static const char* c_signal_names[c_signal_max];
667 };
668 
669 #endif // OFFLOAD_ENGINE_H_INCLUDED
670