1 /* 2 Copyright (c) 2014-2015 Intel Corporation. All Rights Reserved. 3 4 Redistribution and use in source and binary forms, with or without 5 modification, are permitted provided that the following conditions 6 are met: 7 8 * Redistributions of source code must retain the above copyright 9 notice, this list of conditions and the following disclaimer. 10 * Redistributions in binary form must reproduce the above copyright 11 notice, this list of conditions and the following disclaimer in the 12 documentation and/or other materials provided with the distribution. 13 * Neither the name of Intel Corporation nor the names of its 14 contributors may be used to endorse or promote products derived 15 from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 21 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 */ 29 30 31 #ifndef OFFLOAD_ENGINE_H_INCLUDED 32 #define OFFLOAD_ENGINE_H_INCLUDED 33 34 #include <limits.h> 35 #include <bitset> 36 #include <list> 37 #include <set> 38 #include <map> 39 #include "offload_common.h" 40 #include "coi/coi_client.h" 41 42 #define SIGNAL_IS_REMOVED ((OffloadDescriptor *)-1) 43 const int64_t no_stream = -1; 44 45 // Address range 46 class MemRange { 47 public: MemRange()48 MemRange() : m_start(0), m_length(0) {} MemRange(const void * addr,uint64_t len)49 MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {} 50 start()51 const void* start() const { 52 return m_start; 53 } 54 end()55 const void* end() const { 56 return static_cast<const char*>(m_start) + m_length; 57 } 58 length()59 uint64_t length() const { 60 return m_length; 61 } 62 63 // returns true if given range overlaps with another one overlaps(const MemRange & o)64 bool overlaps(const MemRange &o) const { 65 // Two address ranges A[start, end) and B[start,end) overlap 66 // if A.start < B.end and A.end > B.start. 67 return start() < o.end() && end() > o.start(); 68 } 69 70 // returns true if given range contains the other range contains(const MemRange & o)71 bool contains(const MemRange &o) const { 72 return start() <= o.start() && o.end() <= end(); 73 } 74 75 private: 76 const void* m_start; 77 uint64_t m_length; 78 }; 79 80 // Data associated with a pointer variable 81 class PtrData { 82 public: PtrData(const void * addr,uint64_t len)83 PtrData(const void *addr, uint64_t len) : 84 cpu_addr(addr, len), cpu_buf(0), 85 mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0), 86 ref_count(0), is_static(false) 87 {} 88 89 // 90 // Copy constructor 91 // PtrData(const PtrData & ptr)92 PtrData(const PtrData& ptr): 93 cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf), 94 mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp), 95 mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset), 96 ref_count(ptr.ref_count), is_static(ptr.is_static) 97 {} 98 99 bool operator<(const PtrData &o) const { 100 // Variables are sorted by the CPU start address. 101 // Overlapping memory ranges are considered equal. 102 return (cpu_addr.start() < o.cpu_addr.start()) && 103 !cpu_addr.overlaps(o.cpu_addr); 104 } 105 add_reference()106 long add_reference() { 107 if (is_static) { 108 return LONG_MAX; 109 } 110 #ifndef TARGET_WINNT 111 return __sync_fetch_and_add(&ref_count, 1); 112 #else // TARGET_WINNT 113 return _InterlockedIncrement(&ref_count) - 1; 114 #endif // TARGET_WINNT 115 } 116 remove_reference()117 long remove_reference() { 118 if (is_static) { 119 return LONG_MAX; 120 } 121 #ifndef TARGET_WINNT 122 return __sync_sub_and_fetch(&ref_count, 1); 123 #else // TARGET_WINNT 124 return _InterlockedDecrement(&ref_count); 125 #endif // TARGET_WINNT 126 } 127 get_reference()128 long get_reference() const { 129 if (is_static) { 130 return LONG_MAX; 131 } 132 return ref_count; 133 } 134 135 public: 136 // CPU address range 137 const MemRange cpu_addr; 138 139 // CPU and MIC buffers 140 COIBUFFER cpu_buf; 141 COIBUFFER mic_buf; 142 143 // placeholder for buffer address on mic 144 uint64_t mic_addr; 145 146 uint64_t alloc_disp; 147 148 // additional offset to pointer data on MIC for improving bandwidth for 149 // data which is not 4K aligned 150 uint32_t mic_offset; 151 152 // if true buffers are created from static memory 153 bool is_static; 154 mutex_t alloc_ptr_data_lock; 155 156 private: 157 // reference count for the entry 158 long ref_count; 159 }; 160 161 typedef std::list<PtrData*> PtrDataList; 162 163 class PtrDataTable { 164 public: 165 typedef std::set<PtrData> PtrSet; 166 find_ptr_data(const void * ptr)167 PtrData* find_ptr_data(const void *ptr) { 168 m_ptr_lock.lock(); 169 PtrSet::iterator res = list.find(PtrData(ptr, 0)); 170 171 m_ptr_lock.unlock(); 172 if (res == list.end()) { 173 return 0; 174 } 175 return const_cast<PtrData*>(res.operator->()); 176 } 177 insert_ptr_data(const void * ptr,uint64_t len,bool & is_new)178 PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) { 179 m_ptr_lock.lock(); 180 std::pair<PtrSet::iterator, bool> res = 181 list.insert(PtrData(ptr, len)); 182 183 PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->()); 184 m_ptr_lock.unlock(); 185 186 is_new = res.second; 187 if (is_new) { 188 // It's necessary to lock as soon as possible. 189 // unlock must be done at call site of insert_ptr_data at 190 // branch for is_new 191 ptr_data->alloc_ptr_data_lock.lock(); 192 } 193 return ptr_data; 194 } 195 remove_ptr_data(const void * ptr)196 void remove_ptr_data(const void *ptr) { 197 m_ptr_lock.lock(); 198 list.erase(PtrData(ptr, 0)); 199 m_ptr_lock.unlock(); 200 } 201 private: 202 203 PtrSet list; 204 mutex_t m_ptr_lock; 205 }; 206 207 // Data associated with automatic variable 208 class AutoData { 209 public: AutoData(const void * addr,uint64_t len)210 AutoData(const void *addr, uint64_t len) : 211 cpu_addr(addr, len), ref_count(0) 212 {} 213 214 bool operator<(const AutoData &o) const { 215 // Variables are sorted by the CPU start address. 216 // Overlapping memory ranges are considered equal. 217 return (cpu_addr.start() < o.cpu_addr.start()) && 218 !cpu_addr.overlaps(o.cpu_addr); 219 } 220 add_reference()221 long add_reference() { 222 #ifndef TARGET_WINNT 223 return __sync_fetch_and_add(&ref_count, 1); 224 #else // TARGET_WINNT 225 return _InterlockedIncrement(&ref_count) - 1; 226 #endif // TARGET_WINNT 227 } 228 remove_reference()229 long remove_reference() { 230 #ifndef TARGET_WINNT 231 return __sync_sub_and_fetch(&ref_count, 1); 232 #else // TARGET_WINNT 233 return _InterlockedDecrement(&ref_count); 234 #endif // TARGET_WINNT 235 } 236 nullify_reference()237 long nullify_reference() { 238 #ifndef TARGET_WINNT 239 return __sync_lock_test_and_set(&ref_count, 0); 240 #else // TARGET_WINNT 241 return _InterlockedExchange(&ref_count,0); 242 #endif // TARGET_WINNT 243 } 244 get_reference()245 long get_reference() const { 246 return ref_count; 247 } 248 249 public: 250 // CPU address range 251 const MemRange cpu_addr; 252 253 private: 254 // reference count for the entry 255 long ref_count; 256 }; 257 258 // Set of autimatic variables 259 typedef std::set<AutoData> AutoSet; 260 261 // Target image data 262 struct TargetImage 263 { TargetImageTargetImage264 TargetImage(const char *_name, const void *_data, uint64_t _size, 265 const char *_origin, uint64_t _offset) : 266 name(_name), data(_data), size(_size), 267 origin(_origin), offset(_offset) 268 {} 269 270 // library name 271 const char* name; 272 273 // contents and size 274 const void* data; 275 uint64_t size; 276 277 // file of origin and offset within that file 278 const char* origin; 279 uint64_t offset; 280 }; 281 282 typedef std::list<TargetImage> TargetImageList; 283 284 // dynamic library and Image associated with lib 285 struct DynLib 286 { DynLibDynLib287 DynLib(const char *_name, const void *_data, 288 COILIBRARY _lib) : 289 name(_name), data(_data), lib(_lib) 290 {} 291 // library name 292 const char* name; 293 294 // contents 295 const void* data; 296 297 COILIBRARY lib; 298 }; 299 typedef std::list<DynLib> DynLibList; 300 301 // Data associated with persistent auto objects 302 struct PersistData 303 { PersistDataPersistData304 PersistData(const void *addr, uint64_t routine_num, 305 uint64_t size, uint64_t thread) : 306 stack_cpu_addr(addr), routine_id(routine_num), thread_id(thread) 307 { 308 stack_ptr_data = new PtrData(0, size); 309 } 310 // 1-st key value - beginning of the stack at CPU 311 const void * stack_cpu_addr; 312 // 2-nd key value - identifier of routine invocation at CPU 313 uint64_t routine_id; 314 // 3-rd key value - thread identifier 315 uint64_t thread_id; 316 317 // corresponded PtrData; only stack_ptr_data->mic_buf is used 318 PtrData * stack_ptr_data; 319 // used to get offset of the variable in stack buffer 320 char * cpu_stack_addr; 321 }; 322 323 typedef std::list<PersistData> PersistDataList; 324 325 // Data associated with stream 326 struct Stream 327 { StreamStream328 Stream(int device, int num_of_cpus) : 329 m_number_of_cpus(num_of_cpus), m_pipeline(0), m_last_offload(0), 330 m_device(device) 331 {} ~StreamStream332 ~Stream() { 333 if (m_pipeline) { 334 COI::PipelineDestroy(m_pipeline); 335 } 336 } 337 get_pipelineStream338 COIPIPELINE get_pipeline(void) { 339 return(m_pipeline); 340 } 341 get_deviceStream342 int get_device(void) { 343 return(m_device); 344 } 345 get_cpu_numberStream346 int get_cpu_number(void) { 347 return(m_number_of_cpus); 348 } 349 set_pipelineStream350 void set_pipeline(COIPIPELINE pipeline) { 351 m_pipeline = pipeline; 352 } 353 get_last_offloadStream354 OffloadDescriptor* get_last_offload(void) { 355 return(m_last_offload); 356 } 357 set_last_offloadStream358 void set_last_offload(OffloadDescriptor* last_offload) { 359 m_last_offload = last_offload; 360 } 361 362 static Stream* find_stream(uint64_t handle, bool remove); 363 add_streamStream364 static _Offload_stream add_stream(int device, int number_of_cpus) { 365 m_stream_lock.lock(); 366 all_streams[++m_streams_count] = new Stream(device, number_of_cpus); 367 m_stream_lock.unlock(); 368 return(m_streams_count); 369 } 370 371 typedef std::map<uint64_t, Stream*> StreamMap; 372 373 static uint64_t m_streams_count; 374 static StreamMap all_streams; 375 static mutex_t m_stream_lock; 376 377 int m_device; 378 379 // number of cpus 380 int m_number_of_cpus; 381 382 // The pipeline associated with the stream 383 COIPIPELINE m_pipeline; 384 385 // The last offload occured via the stream 386 OffloadDescriptor* m_last_offload; 387 388 // Cpus used by the stream 389 std::bitset<COI_MAX_HW_THREADS> m_stream_cpus; 390 }; 391 392 typedef std::map<uint64_t, Stream*> StreamMap; 393 394 // class representing a single engine 395 struct Engine { 396 friend void __offload_init_library_once(void); 397 friend void __offload_fini_library(void); 398 399 #define check_result(res, tag, ...) \ 400 { \ 401 if (res == COI_PROCESS_DIED) { \ 402 fini_process(true); \ 403 exit(1); \ 404 } \ 405 if (res != COI_SUCCESS) { \ 406 __liboffload_error_support(tag, __VA_ARGS__); \ 407 exit(1); \ 408 } \ 409 } 410 get_logical_indexEngine411 int get_logical_index() const { 412 return m_index; 413 } 414 get_physical_indexEngine415 int get_physical_index() const { 416 return m_physical_index; 417 } 418 get_processEngine419 const COIPROCESS& get_process() const { 420 return m_process; 421 } 422 423 uint64_t get_thread_id(void); 424 425 // initialize device 426 void init(void); 427 428 // unload library 429 void unload_library(const void *data, const char *name); 430 431 // add new library add_libEngine432 void add_lib(const TargetImage &lib) 433 { 434 m_lock.lock(); 435 m_ready = false; 436 m_images.push_back(lib); 437 m_lock.unlock(); 438 } 439 440 COIRESULT compute( 441 _Offload_stream stream, 442 const std::list<COIBUFFER> &buffers, 443 const void* data, 444 uint16_t data_size, 445 void* ret, 446 uint16_t ret_size, 447 uint32_t num_deps, 448 const COIEVENT* deps, 449 COIEVENT* event 450 ); 451 452 #ifdef MYO_SUPPORT 453 // temporary workaround for blocking behavior for myoiLibInit/Fini calls init_myoEngine454 void init_myo(COIEVENT *event) { 455 COIRESULT res; 456 res = COI::PipelineRunFunction(get_pipeline(), 457 m_funcs[c_func_myo_init], 458 0, 0, 0, 0, 0, 0, 0, 0, 0, 459 event); 460 check_result(res, c_pipeline_run_func, m_index, res); 461 } 462 fini_myoEngine463 void fini_myo(COIEVENT *event) { 464 COIRESULT res; 465 res = COI::PipelineRunFunction(get_pipeline(), 466 m_funcs[c_func_myo_fini], 467 0, 0, 0, 0, 0, 0, 0, 0, 0, 468 event); 469 check_result(res, c_pipeline_run_func, m_index, res); 470 } 471 #endif // MYO_SUPPORT 472 473 // 474 // Memory association table 475 // find_ptr_dataEngine476 PtrData* find_ptr_data(const void *ptr) { 477 return m_ptr_set.find_ptr_data(ptr); 478 } 479 find_targetptr_dataEngine480 PtrData* find_targetptr_data(const void *ptr) { 481 return m_targetptr_set.find_ptr_data(ptr); 482 } 483 insert_ptr_dataEngine484 PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) { 485 return m_ptr_set.insert_ptr_data(ptr, len, is_new); 486 } 487 insert_targetptr_dataEngine488 PtrData* insert_targetptr_data(const void *ptr, uint64_t len, 489 bool &is_new) { 490 return m_targetptr_set.insert_ptr_data(ptr, len, is_new); 491 } 492 remove_ptr_dataEngine493 void remove_ptr_data(const void *ptr) { 494 m_ptr_set.remove_ptr_data(ptr); 495 } 496 remove_targetptr_dataEngine497 void remove_targetptr_data(const void *ptr) { 498 m_targetptr_set.remove_ptr_data(ptr); 499 } 500 501 // 502 // Automatic variables 503 // find_auto_dataEngine504 AutoData* find_auto_data(const void *ptr) { 505 AutoSet &auto_vars = get_auto_vars(); 506 AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0)); 507 if (res == auto_vars.end()) { 508 return 0; 509 } 510 return const_cast<AutoData*>(res.operator->()); 511 } 512 insert_auto_dataEngine513 AutoData* insert_auto_data(const void *ptr, uint64_t len) { 514 AutoSet &auto_vars = get_auto_vars(); 515 std::pair<AutoSet::iterator, bool> res = 516 auto_vars.insert(AutoData(ptr, len)); 517 return const_cast<AutoData*>(res.first.operator->()); 518 } 519 remove_auto_dataEngine520 void remove_auto_data(const void *ptr) { 521 get_auto_vars().erase(AutoData(ptr, 0)); 522 } 523 524 // 525 // Signals 526 // add_signalEngine527 void add_signal(const void *signal, OffloadDescriptor *desc) { 528 m_signal_lock.lock(); 529 m_signal_map[signal] = desc; 530 m_signal_lock.unlock(); 531 } 532 find_signalEngine533 OffloadDescriptor* find_signal(const void *signal, bool remove) { 534 OffloadDescriptor *desc = 0; 535 536 m_signal_lock.lock(); 537 { 538 SignalMap::iterator it = m_signal_map.find(signal); 539 if (it != m_signal_map.end()) { 540 desc = it->second; 541 if (remove) { 542 it->second = SIGNAL_IS_REMOVED; 543 } 544 } 545 } 546 m_signal_lock.unlock(); 547 548 return desc; 549 } 550 551 void stream_destroy(_Offload_stream handle); 552 553 COIPIPELINE get_pipeline(_Offload_stream stream); 554 get_stream_mapEngine555 StreamMap get_stream_map() { 556 return m_stream_map; 557 } 558 559 // stop device process 560 void fini_process(bool verbose); 561 562 // list of stacks active at the engine 563 PersistDataList m_persist_list; 564 565 private: EngineEngine566 Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false), 567 m_proc_number(0) 568 {} 569 ~EngineEngine570 ~Engine() { 571 for (StreamMap::iterator it = m_stream_map.begin(); 572 it != m_stream_map.end(); it++) { 573 Stream * stream = it->second; 574 delete stream; 575 } 576 if (m_process != 0) { 577 fini_process(false); 578 } 579 } 580 581 // set indexes set_indexesEngine582 void set_indexes(int logical_index, int physical_index) { 583 m_index = logical_index; 584 m_physical_index = physical_index; 585 } 586 587 // start process on device 588 void init_process(); 589 590 void load_libraries(void); 591 void init_ptr_data(void); 592 593 // performs library intialization on the device side 594 pid_t init_device(void); 595 596 private: 597 // get pipeline associated with a calling thread 598 COIPIPELINE get_pipeline(void); 599 600 // get automatic vars set associated with the calling thread 601 AutoSet& get_auto_vars(void); 602 603 // destructor for thread data 604 static void destroy_thread_data(void *data); 605 606 private: 607 typedef std::set<PtrData> PtrSet; 608 typedef std::map<const void*, OffloadDescriptor*> SignalMap; 609 610 // device indexes 611 int m_index; 612 int m_physical_index; 613 614 // number of COI pipes created for the engine 615 long m_proc_number; 616 617 // process handle 618 COIPROCESS m_process; 619 620 // If false, device either has not been initialized or new libraries 621 // have been added. 622 bool m_ready; 623 mutex_t m_lock; 624 625 // List of libraries to be loaded 626 TargetImageList m_images; 627 628 // var tables 629 PtrDataTable m_ptr_set; 630 PtrDataTable m_targetptr_set; 631 632 // signals 633 SignalMap m_signal_map; 634 mutex_t m_signal_lock; 635 636 // streams 637 StreamMap m_stream_map; 638 mutex_t m_stream_lock; 639 int m_num_cores; 640 int m_num_threads; 641 std::bitset<COI_MAX_HW_THREADS> m_cpus; 642 643 // List of dynamic libraries to be registred 644 DynLibList m_dyn_libs; 645 646 // constants for accessing device function handles 647 enum { 648 c_func_compute = 0, 649 #ifdef MYO_SUPPORT 650 c_func_myo_init, 651 c_func_myo_fini, 652 #endif // MYO_SUPPORT 653 c_func_init, 654 c_func_var_table_size, 655 c_func_var_table_copy, 656 c_func_set_stream_affinity, 657 c_funcs_total 658 }; 659 static const char* m_func_names[c_funcs_total]; 660 661 // device function handles 662 COIFUNCTION m_funcs[c_funcs_total]; 663 664 // int -> name mapping for device signals 665 static const int c_signal_max = 32; 666 static const char* c_signal_names[c_signal_max]; 667 }; 668 669 #endif // OFFLOAD_ENGINE_H_INCLUDED 670