/*
    Copyright (c) 2014-2016 Intel Corporation.  All Rights Reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions
    are met:

      * Redistributions of source code must retain the above copyright
        notice, this list of conditions and the following disclaimer.
      * Redistributions in binary form must reproduce the above copyright
        notice, this list of conditions and the following disclaimer in the
        documentation and/or other materials provided with the distribution.
      * Neither the name of Intel Corporation nor the names of its
        contributors may be used to endorse or promote products derived
        from this software without specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
    A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
    HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
    SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
    LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
    DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
    THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
    OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */ 29 30 31 #ifndef OFFLOAD_ENGINE_H_INCLUDED 32 #define OFFLOAD_ENGINE_H_INCLUDED 33 34 #include <limits.h> 35 #include <bitset> 36 #include <list> 37 #include <set> 38 #include <map> 39 #include "offload_common.h" 40 #include "coi/coi_client.h" 41 42 #define SIGNAL_HAS_COMPLETED ((OffloadDescriptor *)-1) 43 const int64_t no_stream = -1; 44 45 // Address range 46 class MemRange { 47 public: MemRange()48 MemRange() : m_start(0), m_length(0) {} MemRange(const void * addr,uint64_t len)49 MemRange(const void *addr, uint64_t len) : m_start(addr), m_length(len) {} 50 start()51 const void* start() const { 52 return m_start; 53 } 54 end()55 const void* end() const { 56 return static_cast<const char*>(m_start) + m_length; 57 } 58 length()59 uint64_t length() const { 60 return m_length; 61 } 62 63 // returns true if given range overlaps with another one overlaps(const MemRange & o)64 bool overlaps(const MemRange &o) const { 65 // Two address ranges A[start, end) and B[start,end) overlap 66 // if A.start < B.end and A.end > B.start. 
67 return start() < o.end() && end() > o.start(); 68 } 69 70 // returns true if given range contains the other range contains(const MemRange & o)71 bool contains(const MemRange &o) const { 72 return start() <= o.start() && o.end() <= end(); 73 } 74 75 private: 76 const void* m_start; 77 uint64_t m_length; 78 }; 79 80 // Data associated with a pointer variable 81 class PtrData { 82 public: PtrData(const void * addr,uint64_t len)83 PtrData(const void *addr, uint64_t len) : 84 cpu_addr(addr, len), cpu_buf(0), 85 mic_addr(0), alloc_disp(0), mic_buf(0), mic_offset(0), 86 ref_count(0), is_static(false), is_omp_associate(false) 87 {} 88 89 // 90 // Copy constructor 91 // PtrData(const PtrData & ptr)92 PtrData(const PtrData& ptr): 93 cpu_addr(ptr.cpu_addr), cpu_buf(ptr.cpu_buf), 94 mic_addr(ptr.mic_addr), alloc_disp(ptr.alloc_disp), 95 mic_buf(ptr.mic_buf), mic_offset(ptr.mic_offset), 96 ref_count(ptr.ref_count), is_static(ptr.is_static), 97 is_omp_associate(ptr.is_omp_associate), 98 var_alloc_type(0) 99 {} 100 101 bool operator<(const PtrData &o) const { 102 // Variables are sorted by the CPU start address. 103 // Overlapping memory ranges are considered equal. 
104 return (cpu_addr.start() < o.cpu_addr.start()) && 105 !cpu_addr.overlaps(o.cpu_addr); 106 } 107 add_reference()108 long add_reference() { 109 if (is_omp_associate || (is_static && !var_alloc_type)) { 110 return LONG_MAX; 111 } 112 #ifndef TARGET_WINNT 113 return __sync_fetch_and_add(&ref_count, 1); 114 #else // TARGET_WINNT 115 return _InterlockedIncrement(&ref_count) - 1; 116 #endif // TARGET_WINNT 117 } 118 remove_reference()119 long remove_reference() { 120 if (is_omp_associate || (is_static && !var_alloc_type)) { 121 return LONG_MAX; 122 } 123 #ifndef TARGET_WINNT 124 return __sync_sub_and_fetch(&ref_count, 1); 125 #else // TARGET_WINNT 126 return _InterlockedDecrement(&ref_count); 127 #endif // TARGET_WINNT 128 } 129 get_reference()130 long get_reference() const { 131 if (is_omp_associate || (is_static && !var_alloc_type)) { 132 return LONG_MAX; 133 } 134 return ref_count; 135 } 136 137 public: 138 // CPU address range 139 const MemRange cpu_addr; 140 141 // CPU and MIC buffers 142 COIBUFFER cpu_buf; 143 COIBUFFER mic_buf; 144 145 // placeholder for buffer address on mic 146 uint64_t mic_addr; 147 148 uint64_t alloc_disp; 149 150 // additional offset to pointer data on MIC for improving bandwidth for 151 // data which is not 4K aligned 152 uint32_t mic_offset; 153 154 // if true buffers are created from static memory 155 bool is_static; 156 157 // true if MIC buffer created by omp_target_associate 158 bool is_omp_associate; 159 160 bool var_alloc_type; 161 mutex_t alloc_ptr_data_lock; 162 163 private: 164 // reference count for the entry 165 long ref_count; 166 }; 167 168 typedef std::list<PtrData*> PtrDataList; 169 170 class PtrDataTable { 171 public: 172 typedef std::set<PtrData> PtrSet; 173 find_ptr_data(const void * ptr)174 PtrData* find_ptr_data(const void *ptr) { 175 m_ptr_lock.lock(); 176 PtrSet::iterator res = list.find(PtrData(ptr, 0)); 177 178 m_ptr_lock.unlock(); 179 if (res == list.end()) { 180 return 0; 181 } 182 return 
const_cast<PtrData*>(res.operator->()); 183 } 184 insert_ptr_data(const void * ptr,uint64_t len,bool & is_new)185 PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) { 186 m_ptr_lock.lock(); 187 std::pair<PtrSet::iterator, bool> res = 188 list.insert(PtrData(ptr, len)); 189 190 PtrData* ptr_data = const_cast<PtrData*>(res.first.operator->()); 191 m_ptr_lock.unlock(); 192 193 is_new = res.second; 194 if (is_new) { 195 // It's necessary to lock as soon as possible. 196 // unlock must be done at call site of insert_ptr_data at 197 // branch for is_new 198 ptr_data->alloc_ptr_data_lock.lock(); 199 } 200 return ptr_data; 201 } 202 remove_ptr_data(const void * ptr)203 void remove_ptr_data(const void *ptr) { 204 m_ptr_lock.lock(); 205 list.erase(PtrData(ptr, 0)); 206 m_ptr_lock.unlock(); 207 } 208 private: 209 210 PtrSet list; 211 mutex_t m_ptr_lock; 212 }; 213 214 // Data associated with automatic variable 215 class AutoData { 216 public: AutoData(const void * addr,uint64_t len)217 AutoData(const void *addr, uint64_t len) : 218 cpu_addr(addr, len), ref_count(0) 219 {} 220 221 bool operator<(const AutoData &o) const { 222 // Variables are sorted by the CPU start address. 223 // Overlapping memory ranges are considered equal. 
224 return (cpu_addr.start() < o.cpu_addr.start()) && 225 !cpu_addr.overlaps(o.cpu_addr); 226 } 227 add_reference()228 long add_reference() { 229 #ifndef TARGET_WINNT 230 return __sync_fetch_and_add(&ref_count, 1); 231 #else // TARGET_WINNT 232 return _InterlockedIncrement(&ref_count) - 1; 233 #endif // TARGET_WINNT 234 } 235 remove_reference()236 long remove_reference() { 237 #ifndef TARGET_WINNT 238 return __sync_sub_and_fetch(&ref_count, 1); 239 #else // TARGET_WINNT 240 return _InterlockedDecrement(&ref_count); 241 #endif // TARGET_WINNT 242 } 243 nullify_reference()244 long nullify_reference() { 245 #ifndef TARGET_WINNT 246 return __sync_lock_test_and_set(&ref_count, 0); 247 #else // TARGET_WINNT 248 return _InterlockedExchange(&ref_count,0); 249 #endif // TARGET_WINNT 250 } 251 get_reference()252 long get_reference() const { 253 return ref_count; 254 } 255 256 public: 257 // CPU address range 258 const MemRange cpu_addr; 259 260 private: 261 // reference count for the entry 262 long ref_count; 263 }; 264 265 // Set of autimatic variables 266 typedef std::set<AutoData> AutoSet; 267 268 // Target image data 269 struct TargetImage 270 { TargetImageTargetImage271 TargetImage(const char *_name, const void *_data, uint64_t _size, 272 const char *_origin, uint64_t _offset) : 273 name(_name), data(_data), size(_size), 274 origin(_origin), offset(_offset) 275 {} 276 277 // library name 278 const char* name; 279 280 // contents and size 281 const void* data; 282 uint64_t size; 283 284 // file of origin and offset within that file 285 const char* origin; 286 uint64_t offset; 287 }; 288 289 typedef std::list<TargetImage> TargetImageList; 290 291 // dynamic library and Image associated with lib 292 struct DynLib 293 { DynLibDynLib294 DynLib(const char *_name, const void *_data, 295 COILIBRARY _lib) : 296 name(_name), data(_data), lib(_lib) 297 {} 298 // library name 299 const char* name; 300 301 // contents 302 const void* data; 303 304 COILIBRARY lib; 305 }; 306 typedef 
std::list<DynLib> DynLibList; 307 308 // Data associated with persistent auto objects 309 struct PersistData 310 { PersistDataPersistData311 PersistData(const void *addr, uint64_t routine_num, 312 uint64_t size, uint64_t thread) : 313 stack_cpu_addr(addr), routine_id(routine_num), thread_id(thread) 314 { 315 stack_ptr_data = new PtrData(0, size); 316 } 317 // 1-st key value - beginning of the stack at CPU 318 const void * stack_cpu_addr; 319 // 2-nd key value - identifier of routine invocation at CPU 320 uint64_t routine_id; 321 // 3-rd key value - thread identifier 322 uint64_t thread_id; 323 324 // corresponded PtrData; only stack_ptr_data->mic_buf is used 325 PtrData * stack_ptr_data; 326 // used to get offset of the variable in stack buffer 327 char * cpu_stack_addr; 328 }; 329 330 typedef std::list<PersistData> PersistDataList; 331 332 // Data associated with stream 333 struct Stream 334 { StreamStream335 Stream(int device, int num_of_cpus) : 336 m_number_of_cpus(num_of_cpus), m_pipeline(0), m_last_offload(0), 337 m_device(device) 338 {} ~StreamStream339 ~Stream() { 340 if (m_pipeline) { 341 COI::PipelineDestroy(m_pipeline); 342 } 343 } 344 get_pipelineStream345 COIPIPELINE get_pipeline(void) { 346 return(m_pipeline); 347 } 348 get_deviceStream349 int get_device(void) { 350 return(m_device); 351 } 352 get_cpu_numberStream353 int get_cpu_number(void) { 354 return(m_number_of_cpus); 355 } 356 set_pipelineStream357 void set_pipeline(COIPIPELINE pipeline) { 358 m_pipeline = pipeline; 359 } 360 get_last_offloadStream361 OffloadDescriptor* get_last_offload(void) { 362 return(m_last_offload); 363 } 364 set_last_offloadStream365 void set_last_offload(OffloadDescriptor* last_offload) { 366 m_last_offload = last_offload; 367 } 368 369 static Stream* find_stream(uint64_t handle, bool remove); 370 add_streamStream371 static _Offload_stream add_stream(int device, int number_of_cpus) { 372 _Offload_stream result; 373 m_stream_lock.lock(); 374 result = ++m_streams_count; 375 
all_streams[m_streams_count] = new Stream(device, number_of_cpus); 376 m_stream_lock.unlock(); 377 return(result); 378 } 379 get_streams_countStream380 static uint64_t get_streams_count() { 381 return m_streams_count; 382 } 383 384 typedef std::map<uint64_t, Stream*> StreamMap; 385 386 static uint64_t m_streams_count; 387 static StreamMap all_streams; 388 static mutex_t m_stream_lock; 389 390 int m_device; 391 392 // number of cpus 393 int m_number_of_cpus; 394 395 // The pipeline associated with the stream 396 COIPIPELINE m_pipeline; 397 398 // The last offload occured via the stream 399 OffloadDescriptor* m_last_offload; 400 401 // Cpus used by the stream 402 std::bitset<COI_MAX_HW_THREADS> m_stream_cpus; 403 }; 404 405 typedef std::map<uint64_t, Stream*> StreamMap; 406 typedef std::bitset<COI_MAX_HW_THREADS> micLcpuMask; 407 408 // ordered by count double linked list of cpus used by streams 409 typedef struct CpuEl{ 410 uint64_t count; // number of streams using the cpu 411 struct CpuEl* prev; // cpu with the same or lesser count 412 struct CpuEl* next; // cpu with the same or greater count 413 } CpuEl; 414 415 // class representing a single engine 416 struct Engine { 417 friend void __offload_init_library_once(void); 418 friend void __offload_fini_library(void); 419 420 #define CPU_INDEX(x) (x - m_cpus) 421 #define check_result(res, tag, ...) 
\ 422 { \ 423 if (res == COI_PROCESS_DIED) { \ 424 fini_process(true); \ 425 exit(1); \ 426 } \ 427 if (res != COI_SUCCESS) { \ 428 __liboffload_error_support(tag, __VA_ARGS__); \ 429 exit(1); \ 430 } \ 431 } 432 get_logical_indexEngine433 int get_logical_index() const { 434 return m_index; 435 } 436 get_physical_indexEngine437 int get_physical_index() const { 438 return m_physical_index; 439 } 440 get_processEngine441 const COIPROCESS& get_process() const { 442 return m_process; 443 } 444 get_readyEngine445 bool get_ready() { 446 return m_ready; 447 } 448 449 uint64_t get_thread_id(void); 450 451 // initialize device 452 void init(void); 453 454 // unload library 455 void unload_library(const void *data, const char *name); 456 457 // add new library add_libEngine458 void add_lib(const TargetImage &lib) 459 { 460 m_lock.lock(); 461 m_ready = false; 462 m_images.push_back(lib); 463 m_lock.unlock(); 464 } 465 466 COIRESULT compute( 467 _Offload_stream stream, 468 const std::list<COIBUFFER> &buffers, 469 const void* data, 470 uint16_t data_size, 471 void* ret, 472 uint16_t ret_size, 473 uint32_t num_deps, 474 const COIEVENT* deps, 475 COIEVENT* event 476 ); 477 478 #ifdef MYO_SUPPORT 479 // temporary workaround for blocking behavior for myoiLibInit/Fini calls init_myoEngine480 void init_myo(COIEVENT *event) { 481 COIRESULT res; 482 res = COI::PipelineRunFunction(get_pipeline(), 483 m_funcs[c_func_myo_init], 484 0, 0, 0, 0, 0, 0, 0, 0, 0, 485 event); 486 check_result(res, c_pipeline_run_func, m_index, res); 487 } 488 fini_myoEngine489 void fini_myo(COIEVENT *event) { 490 COIRESULT res; 491 res = COI::PipelineRunFunction(get_pipeline(), 492 m_funcs[c_func_myo_fini], 493 0, 0, 0, 0, 0, 0, 0, 0, 0, 494 event); 495 check_result(res, c_pipeline_run_func, m_index, res); 496 } 497 #endif // MYO_SUPPORT 498 499 // 500 // Memory association table 501 // find_ptr_dataEngine502 PtrData* find_ptr_data(const void *ptr) { 503 return m_ptr_set.find_ptr_data(ptr); 504 } 505 
find_targetptr_dataEngine506 PtrData* find_targetptr_data(const void *ptr) { 507 return m_targetptr_set.find_ptr_data(ptr); 508 } 509 insert_ptr_dataEngine510 PtrData* insert_ptr_data(const void *ptr, uint64_t len, bool &is_new) { 511 return m_ptr_set.insert_ptr_data(ptr, len, is_new); 512 } 513 insert_targetptr_dataEngine514 PtrData* insert_targetptr_data(const void *ptr, uint64_t len, 515 bool &is_new) { 516 return m_targetptr_set.insert_ptr_data(ptr, len, is_new); 517 } 518 remove_ptr_dataEngine519 void remove_ptr_data(const void *ptr) { 520 m_ptr_set.remove_ptr_data(ptr); 521 } 522 remove_targetptr_dataEngine523 void remove_targetptr_data(const void *ptr) { 524 m_targetptr_set.remove_ptr_data(ptr); 525 } 526 527 // 528 // Automatic variables 529 // find_auto_dataEngine530 AutoData* find_auto_data(const void *ptr) { 531 AutoSet &auto_vars = get_auto_vars(); 532 AutoSet::iterator res = auto_vars.find(AutoData(ptr, 0)); 533 if (res == auto_vars.end()) { 534 return 0; 535 } 536 return const_cast<AutoData*>(res.operator->()); 537 } 538 insert_auto_dataEngine539 AutoData* insert_auto_data(const void *ptr, uint64_t len) { 540 AutoSet &auto_vars = get_auto_vars(); 541 std::pair<AutoSet::iterator, bool> res = 542 auto_vars.insert(AutoData(ptr, len)); 543 return const_cast<AutoData*>(res.first.operator->()); 544 } 545 remove_auto_dataEngine546 void remove_auto_data(const void *ptr) { 547 get_auto_vars().erase(AutoData(ptr, 0)); 548 } 549 550 // 551 // Signals 552 // add_signalEngine553 void add_signal(const void *signal, OffloadDescriptor *desc) { 554 m_signal_lock.lock(); 555 m_signal_map[signal] = desc; 556 m_signal_lock.unlock(); 557 } 558 find_signalEngine559 OffloadDescriptor* find_signal(const void *signal, bool remove) { 560 OffloadDescriptor *desc = 0; 561 562 m_signal_lock.lock(); 563 { 564 SignalMap::iterator it = m_signal_map.find(signal); 565 if (it != m_signal_map.end()) { 566 desc = it->second; 567 if (remove) { 568 it->second = SIGNAL_HAS_COMPLETED; 569 } 
570 } 571 } 572 m_signal_lock.unlock(); 573 574 return desc; 575 } 576 complete_signaled_ofldEngine577 void complete_signaled_ofld(const void *signal) { 578 579 m_signal_lock.lock(); 580 { 581 SignalMap::iterator it = m_signal_map.find(signal); 582 if (it != m_signal_map.end()) { 583 it->second = SIGNAL_HAS_COMPLETED; 584 } 585 } 586 m_signal_lock.unlock(); 587 } 588 589 void stream_destroy(_Offload_stream handle); 590 591 void move_cpu_el_after(CpuEl* cpu_what, CpuEl* cpu_after); 592 void print_stream_cpu_list(const char *); 593 594 COIPIPELINE get_pipeline(_Offload_stream stream); 595 get_stream_mapEngine596 StreamMap get_stream_map() { 597 return m_stream_map; 598 } 599 600 // stop device process 601 void fini_process(bool verbose); 602 603 // list of stacks active at the engine 604 PersistDataList m_persist_list; 605 606 private: EngineEngine607 Engine() : m_index(-1), m_physical_index(-1), m_process(0), m_ready(false), 608 m_proc_number(0), m_assigned_cpus(0), m_cpus(0), m_cpu_head(0) 609 {} 610 ~EngineEngine611 ~Engine() { 612 m_ready = false; 613 for (StreamMap::iterator it = m_stream_map.begin(); 614 it != m_stream_map.end(); it++) { 615 Stream * stream = it->second; 616 delete stream; 617 } 618 if (m_process != 0) { 619 fini_process(false); 620 } 621 if (m_assigned_cpus) { 622 delete m_assigned_cpus; 623 } 624 } 625 626 // set indexes set_indexesEngine627 void set_indexes(int logical_index, int physical_index) { 628 m_index = logical_index; 629 m_physical_index = physical_index; 630 } 631 632 // set CPU mask set_cpu_maskEngine633 void set_cpu_mask(micLcpuMask *cpu_mask) 634 { 635 m_assigned_cpus = cpu_mask; 636 } 637 638 // start process on device 639 void init_process(); 640 641 void load_libraries(void); 642 void init_ptr_data(void); 643 644 // performs library intialization on the device side 645 pid_t init_device(void); 646 647 private: 648 // get pipeline associated with a calling thread 649 COIPIPELINE get_pipeline(void); 650 651 // get automatic 
vars set associated with the calling thread 652 AutoSet& get_auto_vars(void); 653 654 // destructor for thread data 655 static void destroy_thread_data(void *data); 656 657 private: 658 typedef std::set<PtrData> PtrSet; 659 typedef std::map<const void*, OffloadDescriptor*> SignalMap; 660 661 // device indexes 662 int m_index; 663 int m_physical_index; 664 665 // cpu mask 666 micLcpuMask *m_assigned_cpus; 667 668 // number of COI pipes created for the engine 669 long m_proc_number; 670 671 // process handle 672 COIPROCESS m_process; 673 674 // If false, device either has not been initialized or new libraries 675 // have been added. 676 bool m_ready; 677 mutex_t m_lock; 678 679 // List of libraries to be loaded 680 TargetImageList m_images; 681 682 // var tables 683 PtrDataTable m_ptr_set; 684 PtrDataTable m_targetptr_set; 685 686 // signals 687 SignalMap m_signal_map; 688 mutex_t m_signal_lock; 689 690 // streams 691 StreamMap m_stream_map; 692 mutex_t m_stream_lock; 693 int m_num_cores; 694 int m_num_threads; 695 CpuEl* m_cpus; 696 CpuEl* m_cpu_head; 697 698 // List of dynamic libraries to be registred 699 DynLibList m_dyn_libs; 700 701 // constants for accessing device function handles 702 enum { 703 c_func_compute = 0, 704 #ifdef MYO_SUPPORT 705 c_func_myo_init, 706 c_func_myo_fini, 707 #endif // MYO_SUPPORT 708 c_func_init, 709 c_func_var_table_size, 710 c_func_var_table_copy, 711 c_func_set_stream_affinity, 712 c_funcs_total 713 }; 714 static const char* m_func_names[c_funcs_total]; 715 716 // device function handles 717 COIFUNCTION m_funcs[c_funcs_total]; 718 719 // int -> name mapping for device signals 720 static const int c_signal_max = 32; 721 static const char* c_signal_names[c_signal_max]; 722 }; 723 724 #endif // OFFLOAD_ENGINE_H_INCLUDED 725