1 /*
2     Copyright (c) 2014-2016 Intel Corporation.  All Rights Reserved.
3 
4     Redistribution and use in source and binary forms, with or without
5     modification, are permitted provided that the following conditions
6     are met:
7 
8       * Redistributions of source code must retain the above copyright
9         notice, this list of conditions and the following disclaimer.
10       * Redistributions in binary form must reproduce the above copyright
11         notice, this list of conditions and the following disclaimer in the
12         documentation and/or other materials provided with the distribution.
13       * Neither the name of Intel Corporation nor the names of its
14         contributors may be used to endorse or promote products derived
15         from this software without specific prior written permission.
16 
17     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20     A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21     HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23     LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24     DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25     THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29 
30 
31 #include "offload_engine.h"
32 #include <signal.h>
33 #include <errno.h>
34 #include <sys/stat.h>
35 #include <sys/types.h>
36 
37 #include <algorithm>
38 #include <vector>
39 
40 #include "offload_host.h"
41 #include "offload_table.h"
42 #include "offload_iterator.h"
43 
// Separator between the entries of a search-path list: ";" when built for a
// Windows host, ":" otherwise. Engine::init_process() uses it to split
// mic_library_path into individual directories.
#if defined(HOST_WINNT)
#define PATH_SEPARATOR ";"
#else
#define PATH_SEPARATOR ":"
#endif
49 
// Static members of the Stream class must be defined somewhere.
// These members describe the list of all streams defined in the program
// via calls to _Offload_stream_create.
uint64_t  Stream::m_streams_count = 0;
StreamMap Stream::all_streams;
mutex_t   Stream::m_stream_lock;
// Library search path passed to COI when creating the target process and
// loading libraries. If still unset, Engine::init_process() picks it from
// the detected device type (KNC or KNL).
char*     mic_library_path = 0;
57 
// Names of the target-side entry points. Engine::init_process() passes this
// table to COI::ProcessGetFunctionHandles to fill m_funcs.
// NOTE(review): the order presumably has to match the c_func_* indices used
// to address m_funcs (e.g. c_func_compute, c_func_init) - confirm against
// the enum in offload_engine.h.
const char* Engine::m_func_names[Engine::c_funcs_total] =
{
    "server_compute",
#ifdef MYO_SUPPORT
    "server_myoinit",
    "server_myofini",
#endif // MYO_SUPPORT
    "server_init",
    "server_var_table_size",
    "server_var_table_copy",
    "server_set_stream_affinity"
};
70 
// Symbolic representation of system signals. Fix for CQ233593
// The table is indexed by signal number. Entry 0 doubles as the fallback
// name: Engine::fini_process() selects it for any signal value that is not
// below c_signal_max.
const char* Engine::c_signal_names[Engine::c_signal_max] =
{
    "Unknown SIGNAL",
    "SIGHUP",    /*  1, Hangup (POSIX).  */
    "SIGINT",    /*  2, Interrupt (ANSI).  */
    "SIGQUIT",   /*  3, Quit (POSIX).  */
    "SIGILL",    /*  4, Illegal instruction (ANSI).  */
    "SIGTRAP",   /*  5, Trace trap (POSIX).  */
    "SIGABRT",   /*  6, Abort (ANSI).  */
    "SIGBUS",    /*  7, BUS error (4.2 BSD).  */
    "SIGFPE",    /*  8, Floating-point exception (ANSI).  */
    "SIGKILL",   /*  9, Kill, unblockable (POSIX).  */
    "SIGUSR1",   /* 10, User-defined signal 1 (POSIX).  */
    "SIGSEGV",   /* 11, Segmentation violation (ANSI).  */
    "SIGUSR2",   /* 12, User-defined signal 2 (POSIX).  */
    "SIGPIPE",   /* 13, Broken pipe (POSIX).  */
    "SIGALRM",   /* 14, Alarm clock (POSIX).  */
    "SIGTERM",   /* 15, Termination (ANSI).  */
    "SIGSTKFLT", /* 16, Stack fault.  */
    "SIGCHLD",   /* 17, Child status has changed (POSIX).  */
    "SIGCONT",   /* 18, Continue (POSIX).  */
    "SIGSTOP",   /* 19, Stop, unblockable (POSIX).  */
    "SIGTSTP",   /* 20, Keyboard stop (POSIX).  */
    "SIGTTIN",   /* 21, Background read from tty (POSIX).  */
    "SIGTTOU",   /* 22, Background write to tty (POSIX).  */
    "SIGURG",    /* 23, Urgent condition on socket (4.2 BSD).  */
    "SIGXCPU",   /* 24, CPU limit exceeded (4.2 BSD).  */
    "SIGXFSZ",   /* 25, File size limit exceeded (4.2 BSD).  */
    "SIGVTALRM", /* 26, Virtual alarm clock (4.2 BSD).  */
    "SIGPROF",   /* 27, Profiling alarm clock (4.2 BSD).  */
    "SIGWINCH",  /* 28, Window size change (4.3 BSD, Sun).  */
    "SIGIO",     /* 29, I/O now possible (4.2 BSD).  */
    "SIGPWR",    /* 30, Power failure restart (System V).  */
    "SIGSYS"     /* 31, Bad system call.  */
};
107 
init(void)108 void Engine::init(void)
109 {
110     if (!m_ready) {
111         mutex_locker_t locker(m_lock);
112 
113         if (!m_ready) {
114             // start process if not done yet
115             if (m_process == 0) {
116                 init_process();
117             }
118 
119             // load penging images
120             load_libraries();
121 
122             // and (re)build pointer table
123             init_ptr_data();
124 
125             // it is ready now
126             m_ready = true;
127 
128             //  Inform the debugger
129             if (__dbg_is_attached) {
130                 __dbg_target_so_loaded();
131             }
132         }
133     }
134 }
135 
print_stream_cpu_list(const char * str)136 void Engine::print_stream_cpu_list(const char * str)
137 {
138     int count = 0;
139     char buffer[1024];
140     CpuEl* cpu_el = m_cpu_head;
141 
142     OFFLOAD_DEBUG_TRACE(3,
143                   "%s : cpu list as Index(Count) for the streams is :\n", str);
144     buffer[0] = 0;
145     for (int i = 0; i < m_num_threads; i++) {
146          cpu_el = m_cpus + i;
147          if (m_assigned_cpus == 0 || (*m_assigned_cpus)[i]) {
148              count++;
149              sprintf(buffer + strlen(buffer), "%d(%d) ", CPU_INDEX(cpu_el), cpu_el->count);
150              if (count % 20 == 0) {
151                  OFFLOAD_DEBUG_TRACE(3, "%s\n", buffer);
152                  buffer[0] = 0;
153              }
154          }
155     }
156     if (count % 20 != 0) {
157         OFFLOAD_DEBUG_TRACE(3, "%s\n", buffer);
158     }
159 }
160 
init_process(void)161 void Engine::init_process(void)
162 {
163     COIENGINE engine;
164     COIRESULT res;
165     const char **environ;
166     char buf[4096];  // For exe path name
167     char* mic_device_main = 0;
168 
169     // create environment for the target process
170     environ = (const char**) mic_env_vars.create_environ_for_card(m_index);
171     if (environ != 0) {
172         for (const char **p = environ; *p != 0; p++) {
173             OFFLOAD_DEBUG_TRACE(3, "Env Var for card %d: %s\n", m_index, *p);
174         }
175     }
176 
177     // Create execution context in the specified device
178     OFFLOAD_DEBUG_TRACE(2, "Getting device %d (engine %d) handle\n", m_index,
179                         m_physical_index);
180     res = COI::EngineGetHandle(COI_ISA_MIC, m_physical_index, &engine);
181     check_result(res, c_get_engine_handle, m_index, res);
182 
183     // Get engine info on threads and cores.
184     // The values of core number and thread number will be used later at stream
185     // creation by call to _Offload_stream_create(device,number_of_cpus).
186 
187     COI_ENGINE_INFO engine_info;
188 
189     res = COI::EngineGetInfo(engine, sizeof(COI_ENGINE_INFO), &engine_info);
190     check_result(res, c_get_engine_info, m_index, res);
191     if (mic_library_path == 0 ) {
192        if (engine_info.ISA == COI_DEVICE_KNC) {
193           mic_library_path = knc_library_path;
194        }
195        else if (engine_info.ISA == COI_DEVICE_KNL) {
196           mic_library_path = knl_library_path;
197        }
198        else {
199           LIBOFFLOAD_ERROR(c_unknown_mic_device_type);
200        }
201     }
202 
203     // m_cpus is the list of all available threads.
204     // At the begining all threads made available through OFFLOAD_DEVICES
205     // or all threads existed at the engine if OFFLOAD_DEVICES isn't set.
206     // m_cpu_head points to the head of the m_cpus list.
207     // m_cpus is ordered by number of streams using the thread.
208     // m_cpu_head points to the least used thread.
209     // After creating and destroying a stream the m_cpus list must be fixed
210     // to be ordered.
211 
212     m_cpus = (CpuEl*)malloc(engine_info.NumThreads * sizeof(CpuEl));
213     if (m_cpus == NULL)
214         LIBOFFLOAD_ERROR(c_malloc);
215     memset(m_cpus, 0, engine_info.NumThreads * sizeof(CpuEl));
216     CpuEl* prev_cpu = NULL;
217 
218     for (int i = 0; i < engine_info.NumThreads; i++) {
219          if (m_assigned_cpus == 0 || (*m_assigned_cpus)[i]) {
220              if (prev_cpu) {
221                  prev_cpu->next = m_cpus + i;
222              }
223              else {
224                  m_cpu_head = m_cpus + i;
225              }
226              m_cpus[i].prev = prev_cpu;
227              m_cpus[i].count = 0;
228              prev_cpu = m_cpus + i;
229          }
230     }
231 
232     // The following values will be used at pipeline creation for streams
233     m_num_cores = engine_info.NumCores;
234     m_num_threads = engine_info.NumThreads;
235 
236     print_stream_cpu_list("init_process");
237 
238     // Check if OFFLOAD_DMA_CHANNEL_COUNT is set to 2
239     // Only the value 2 is supported in 16.0
240     if (mic_dma_channel_count == 2) {
241         if (COI::ProcessConfigureDMA) {
242             // Set DMA channels using COI API
243             COI::ProcessConfigureDMA(2, COI::DMA_MODE_READ_WRITE);
244         }
245         else {
246             // Set environment variable COI_DMA_CHANNEL_COUNT
247             // use putenv instead of setenv as Windows has no setenv.
248             // Note: putenv requires its argument can't be freed or modified.
249             // So no free after call to putenv or elsewhere.
250             char * env_var = strdup("COI_DMA_CHANNEL_COUNT=2");
251             if (env_var == NULL)
252                 LIBOFFLOAD_ERROR(c_malloc);
253             putenv(env_var);
254         }
255     }
256 
257     // Target executable is not available then use compiler provided offload_main
258     if (__target_exe == 0) {
259        // find target executable to be used if main application is not an
260        // offload build application.
261        const char *base_name = "offload_main";
262        if (mic_library_path != 0) {
263           char *buf = strdup(mic_library_path);
264           if (buf == NULL)
265               LIBOFFLOAD_ERROR(c_malloc);
266           char *try_name = (char*) alloca(strlen(mic_library_path) +
267                                           strlen(base_name) + 2);
268           char *dir, *ptr;
269 
270           for (dir = strtok_r(buf, PATH_SEPARATOR, &ptr); dir != 0;
271                dir = strtok_r(0, PATH_SEPARATOR, &ptr)) {
272               // compose a full path
273               sprintf(try_name, "%s/%s", dir, base_name);
274 
275               // check if such file exists
276               struct stat st;
277               if (stat(try_name, &st) == 0 && S_ISREG(st.st_mode)) {
278                   mic_device_main = strdup(try_name);
279                   if (mic_device_main == NULL)
280                       LIBOFFLOAD_ERROR(c_malloc);
281                   break;
282               }
283           }
284           free(buf);
285        }
286        if (mic_device_main == 0) {
287           LIBOFFLOAD_ERROR(c_report_no_target_exe, "offload_main");
288           exit(1);
289        }
290 
291        OFFLOAD_DEBUG_TRACE(2,
292            "Loading target executable %s\n",mic_device_main);
293 
294        res = COI::ProcessCreateFromFile(
295            engine,                 // in_Engine
296            mic_device_main,        // in_pBinaryName
297            0,                      // in_Argc
298            0,                      // in_ppArgv
299            environ == 0,           // in_DupEnv
300            environ,                // in_ppAdditionalEnv
301            mic_proxy_io,           // in_ProxyActive
302            mic_proxy_fs_root,      // in_ProxyfsRoot
303            mic_buffer_size,        // in_BufferSpace
304            mic_library_path,       // in_LibrarySearchPath
305            &m_process              // out_pProcess
306        );
307     }
308     else {
309     // Target executable should be available by the time when we
310     // attempt to initialize the device
311 
312        //  Need the full path of the FAT exe for VTUNE
313        {
314 #ifndef TARGET_WINNT
315           ssize_t len = readlink("/proc/self/exe", buf,1000);
316 #else
317           int len = GetModuleFileName(NULL, buf,1000);
318 #endif // TARGET_WINNT
319           if  (len == -1) {
320              LIBOFFLOAD_ERROR(c_report_no_host_exe);
321              exit(1);
322           }
323           else if (len > 999) {
324              LIBOFFLOAD_ERROR(c_report_path_buff_overflow);
325              exit(1);
326           }
327           buf[len] = '\0';
328        }
329 
330        OFFLOAD_DEBUG_TRACE(2,
331            "Loading target executable \"%s\" from %p, size %lld, host file %s\n",
332            __target_exe->name, __target_exe->data, __target_exe->size,
333            buf);
334 
335        res = COI::ProcessCreateFromMemory(
336            engine,                 // in_Engine
337            __target_exe->name,     // in_pBinaryName
338            __target_exe->data,     // in_pBinaryBuffer
339            __target_exe->size,     // in_BinaryBufferLength,
340            0,                      // in_Argc
341            0,                      // in_ppArgv
342            environ == 0,           // in_DupEnv
343            environ,                // in_ppAdditionalEnv
344            mic_proxy_io,           // in_ProxyActive
345            mic_proxy_fs_root,      // in_ProxyfsRoot
346            mic_buffer_size,        // in_BufferSpace
347            mic_library_path,       // in_LibrarySearchPath
348            buf,                    // in_FileOfOrigin
349            -1,                     // in_FileOfOriginOffset use -1 to indicate to
350                                    // COI that is is a FAT binary
351            &m_process              // out_pProcess
352        );
353     }
354     check_result(res, c_process_create, m_index, res);
355 
356     if ((mic_4k_buffer_size != 0) || (mic_2m_buffer_size !=0)) {
357        // available only in MPSS 4.2 and greater
358        if (COI::ProcessSetCacheSize != 0 ) {
359           int flags;
360           //  Need compiler to use MPSS 3.2 or greater to get these
361           // definition so currently hardcoding it
362           //  COI_CACHE_ACTION_GROW_NOW && COI_CACHE_MODE_ONDEMAND_SYNC;
363           flags = 0x00020002;
364           res = COI::ProcessSetCacheSize(
365                m_process,             // in_Process
366                mic_2m_buffer_size,    // in_HugePagePoolSize
367                flags,                 // inHugeFlags
368                mic_4k_buffer_size,    // in_SmallPagePoolSize
369                flags,                 // inSmallFlags
370                0,                     // in_NumDependencies
371                0,                     // in_pDependencies
372                0                      // out_PCompletion
373           );
374           OFFLOAD_DEBUG_TRACE(2,
375               "Reserve target buffers 4K pages = %d  2M pages = %d\n",
376                   mic_4k_buffer_size, mic_2m_buffer_size);
377            check_result(res, c_process_set_cache_size, m_index, res);
378        }
379        else {
380              OFFLOAD_DEBUG_TRACE(2,
381                  "Reserve target buffers not supported in current MPSS\n");
382        }
383     }
384 
385     // get function handles
386     res = COI::ProcessGetFunctionHandles(m_process, c_funcs_total,
387                                          m_func_names, m_funcs);
388     check_result(res, c_process_get_func_handles, m_index, res);
389 
390     // initialize device side
391     pid_t pid = init_device();
392 
393     // For IDB
394     if (__dbg_is_attached) {
395         // TODO: we have in-memory executable now.
396         // Check with IDB team what should we provide them now?
397         if (__target_exe == 0) {
398             strcpy(__dbg_target_exe_name, "offload_main");
399         }
400         else {
401             if (strlen(__target_exe->name) < MAX_TARGET_NAME) {
402                 strcpy(__dbg_target_exe_name, __target_exe->name);
403             }
404         }
405         __dbg_target_so_pid = pid;
406         __dbg_target_id = m_physical_index;
407        // The call to __dbg_target_so_loaded() is moved
408        // to Engine:init so all the libraries are loaded before
409        // informing debugger so debugger can access them.
410        // __dbg_target_so_loaded();
411     }
412 }
413 
fini_process(bool verbose)414 void Engine::fini_process(bool verbose)
415 {
416     if (m_process != 0) {
417         uint32_t sig;
418         int8_t ret;
419 
420         // destroy target process
421         OFFLOAD_DEBUG_TRACE(2, "Destroying process on the device %d\n",
422                             m_index);
423 
424         COIRESULT res = COI::ProcessDestroy(m_process, -1, 0, &ret, &sig);
425         m_process = 0;
426 
427         if (res == COI_SUCCESS) {
428             OFFLOAD_DEBUG_TRACE(3, "Device process: signal %d, exit code %d\n",
429                                 sig, ret);
430             if (verbose) {
431                 if (sig != 0) {
432                     LIBOFFLOAD_ERROR(
433                         c_mic_process_exit_sig, m_index, sig,
434                         c_signal_names[sig >= c_signal_max ? 0 : sig]);
435                 }
436                 else {
437                     LIBOFFLOAD_ERROR(c_mic_process_exit_ret, m_index, ret);
438                 }
439             }
440 
441             // for idb
442             if (__dbg_is_attached) {
443                 __dbg_target_so_unloaded();
444             }
445         }
446         else {
447             if (verbose) {
448                 LIBOFFLOAD_ERROR(c_mic_process_exit, m_index);
449             }
450         }
451     }
452 }
453 
load_libraries()454 void Engine::load_libraries()
455 {
456     // load libraries collected so far
457     for (TargetImageList::iterator it = m_images.begin();
458          it != m_images.end(); it++) {
459         OFFLOAD_DEBUG_TRACE(2,
460             "Loading library \"%s\" from %p, size %llu, host file %s\n",
461                                     it->name, it->data, it->size, it->origin);
462 
463         // load library to the device
464         COILIBRARY lib;
465         COIRESULT res;
466         res = COI::ProcessLoadLibraryFromMemory(m_process,
467                                                 it->data,
468                                                 it->size,
469                                                 it->name,
470                                                 mic_library_path,
471                                                 it->origin,
472                                                 (it->origin) ? -1 : 0,
473                                                 COI_LOADLIBRARY_V1_FLAGS,
474                                                 &lib);
475         m_dyn_libs.push_front(DynLib(it->name, it->data, lib));
476 
477         if (res != COI_SUCCESS && res != COI_ALREADY_EXISTS) {
478             check_result(res, c_load_library, it->origin, m_index, res);
479         }
480     }
481     m_images.clear();
482 }
483 
unload_library(const void * data,const char * name)484 void Engine::unload_library(const void *data, const char *name)
485 {
486     if (m_process == 0) {
487        return;
488     }
489     for (DynLibList::iterator it = m_dyn_libs.begin();
490          it != m_dyn_libs.end(); it++) {
491          if (it->data == data) {
492             COIRESULT res;
493             OFFLOAD_DEBUG_TRACE(2,
494                "Unloading library \"%s\"\n",name);
495             res = COI::ProcessUnloadLibrary(m_process,it->lib);
496             m_dyn_libs.erase(it);
497             if (res != COI_SUCCESS) {
498                 check_result(res, c_unload_library, m_index, res);
499             }
500             return;
501          }
502     }
503 }
504 
target_entry_cmp(const VarList::BufEntry & l,const VarList::BufEntry & r)505 static bool target_entry_cmp(
506     const VarList::BufEntry &l,
507     const VarList::BufEntry &r
508 )
509 {
510     const char *l_name = reinterpret_cast<const char*>(l.name);
511     const char *r_name = reinterpret_cast<const char*>(r.name);
512     return strcmp(l_name, r_name) < 0;
513 }
514 
host_entry_cmp(const VarTable::Entry * l,const VarTable::Entry * r)515 static bool host_entry_cmp(
516     const VarTable::Entry *l,
517     const VarTable::Entry *r
518 )
519 {
520     return strcmp(l->name, r->name) < 0;
521 }
522 
init_ptr_data(void)523 void Engine::init_ptr_data(void)
524 {
525     COIRESULT res;
526     COIEVENT event;
527 
528     // Prepare table of host entries
529     std::vector<const VarTable::Entry*> host_table(
530                                          Iterator(__offload_vars.get_head()),
531                                          Iterator());
532 
533     // no need to do anything further is host table is empty
534     if (host_table.size() <= 0) {
535         return;
536     }
537 
538     // Get var table entries from the target.
539     // First we need to get size for the buffer to copy data
540     struct {
541         int64_t nelems;
542         int64_t length;
543     } params;
544 
545     res = COI::PipelineRunFunction(get_pipeline(),
546                                    m_funcs[c_func_var_table_size],
547                                    0, 0, 0,
548                                    0, 0,
549                                    0, 0,
550                                    &params, sizeof(params),
551                                    &event);
552     check_result(res, c_pipeline_run_func, m_index, res);
553 
554     res = COI::EventWait(1, &event, -1, 1, 0, 0);
555     check_result(res, c_event_wait, res);
556 
557     if (params.length == 0) {
558         return;
559     }
560 
561     // create buffer for target entries and copy data to host
562     COIBUFFER buffer;
563     res = COI::BufferCreate(params.length, COI_BUFFER_NORMAL, 0, 0, 1,
564                             &m_process, &buffer);
565     check_result(res, c_buf_create, m_index, res);
566 
567     COI_ACCESS_FLAGS flags = COI_SINK_WRITE;
568     res = COI::PipelineRunFunction(get_pipeline(),
569                                    m_funcs[c_func_var_table_copy],
570                                    1, &buffer, &flags,
571                                    0, 0,
572                                    &params.nelems, sizeof(params.nelems),
573                                    0, 0,
574                                    &event);
575     check_result(res, c_pipeline_run_func, m_index, res);
576 
577     res = COI::EventWait(1, &event, -1, 1, 0, 0);
578     check_result(res, c_event_wait, res);
579 
580     // patch names in target data
581     VarList::BufEntry *target_table;
582     COIMAPINSTANCE map_inst;
583     res = COI::BufferMap(buffer, 0, params.length, COI_MAP_READ_ONLY, 0, 0,
584                          0, &map_inst,
585                          reinterpret_cast<void**>(&target_table));
586     check_result(res, c_buf_map, res);
587 
588     VarList::table_patch_names(target_table, params.nelems);
589 
590     // and sort entries
591     std::sort(target_table, target_table + params.nelems, target_entry_cmp);
592     std::sort(host_table.begin(), host_table.end(), host_entry_cmp);
593 
594     // merge host and target entries and enter matching vars map
595     std::vector<const VarTable::Entry*>::const_iterator hi =
596         host_table.begin();
597     std::vector<const VarTable::Entry*>::const_iterator he =
598         host_table.end();
599     const VarList::BufEntry *ti = target_table;
600     const VarList::BufEntry *te = target_table + params.nelems;
601 
602     while (hi != he && ti != te) {
603         int res = strcmp((*hi)->name, reinterpret_cast<const char*>(ti->name));
604         if (res == 0) {
605             bool is_new;
606             // add matching entry to var map
607             PtrData *ptr = insert_ptr_data((*hi)->addr, (*hi)->size, is_new);
608 
609             // store address for new entries
610             if (is_new) {
611                 ptr->mic_addr = ti->addr;
612                 ptr->is_static = true;
613                 ptr->var_alloc_type = (*hi)->var_alloc_type;
614             }
615             ptr->alloc_ptr_data_lock.unlock();
616             hi++;
617             ti++;
618         }
619         else if (res < 0) {
620             hi++;
621         }
622         else {
623             ti++;
624         }
625     }
626 
627     // cleanup
628     res = COI::BufferUnmap(map_inst, 0, 0, 0);
629     check_result(res, c_buf_unmap, res);
630 
631     res = COI::BufferDestroy(buffer);
632     check_result(res, c_buf_destroy, res);
633 }
634 
compute(_Offload_stream stream,const std::list<COIBUFFER> & buffers,const void * data,uint16_t data_size,void * ret,uint16_t ret_size,uint32_t num_deps,const COIEVENT * deps,COIEVENT * event)635 COIRESULT Engine::compute(
636     _Offload_stream stream,
637     const std::list<COIBUFFER> &buffers,
638     const void*         data,
639     uint16_t            data_size,
640     void*               ret,
641     uint16_t            ret_size,
642     uint32_t            num_deps,
643     const COIEVENT*     deps,
644     COIEVENT*           event
645 ) /* const */
646 {
647     COIBUFFER *bufs;
648     COI_ACCESS_FLAGS *flags;
649     COIRESULT res;
650 
651     // convert buffers list to array
652     int num_bufs = buffers.size();
653     if (num_bufs > 0) {
654         bufs = (COIBUFFER*) alloca(num_bufs * sizeof(COIBUFFER));
655         flags = (COI_ACCESS_FLAGS*) alloca(num_bufs *
656                                            sizeof(COI_ACCESS_FLAGS));
657 
658         int i = 0;
659         for (std::list<COIBUFFER>::const_iterator it = buffers.begin();
660              it != buffers.end(); it++) {
661             bufs[i] = *it;
662 
663             // TODO: this should be fixed
664             flags[i++] = COI_SINK_WRITE;
665         }
666     }
667     else {
668         bufs = 0;
669         flags = 0;
670     }
671     COIPIPELINE pipeline = (stream == no_stream) ?
672                            get_pipeline() :
673                            get_pipeline(stream);
674     // start computation
675     res = COI::PipelineRunFunction(pipeline,
676                                    m_funcs[c_func_compute],
677                                    num_bufs, bufs, flags,
678                                    num_deps, deps,
679                                    data, data_size,
680                                    ret, ret_size,
681                                    event);
682     return res;
683 }
684 
init_device(void)685 pid_t Engine::init_device(void)
686 {
687     struct init_data {
688         int  device_index;
689         int  devices_total;
690         int  console_level;
691         int  offload_report_level;
692     } data;
693     COIRESULT res;
694     COIEVENT event;
695     pid_t pid;
696 
697     OFFLOAD_DEBUG_TRACE_1(2, 0, c_offload_init,
698                           "Initializing device with logical index %d "
699                           "and physical index %d\n",
700                            m_index, m_physical_index);
701 
702     // setup misc data
703     data.device_index = m_index;
704     data.devices_total = mic_engines_total;
705     data.console_level = console_enabled;
706     data.offload_report_level = offload_report_level;
707 
708     res = COI::PipelineRunFunction(get_pipeline(),
709                                    m_funcs[c_func_init],
710                                    0, 0, 0, 0, 0,
711                                    &data, sizeof(data),
712                                    &pid, sizeof(pid),
713                                    &event);
714     check_result(res, c_pipeline_run_func, m_index, res);
715 
716     res = COI::EventWait(1, &event, -1, 1, 0, 0);
717     check_result(res, c_event_wait, res);
718 
719     OFFLOAD_DEBUG_TRACE(2, "Device process pid is %d\n", pid);
720 
721     return pid;
722 }
723 
724 // data associated with each thread
725 struct Thread {
ThreadThread726     Thread(long* addr_coipipe_counter) {
727         m_addr_coipipe_counter = addr_coipipe_counter;
728         memset(m_pipelines, 0, sizeof(m_pipelines));
729     }
730 
~ThreadThread731     ~Thread() {
732 #ifndef TARGET_WINNT
733         __sync_sub_and_fetch(m_addr_coipipe_counter, 1);
734 #else // TARGET_WINNT
735         _InterlockedDecrement(m_addr_coipipe_counter);
736 #endif // TARGET_WINNT
737         for (int i = 0; i < mic_engines_total; i++) {
738             if (m_pipelines[i] != 0) {
739                 COI::PipelineDestroy(m_pipelines[i]);
740             }
741         }
742     }
743 
get_pipelineThread744     COIPIPELINE get_pipeline(int index) const {
745         return m_pipelines[index];
746     }
747 
set_pipelineThread748     void set_pipeline(int index, COIPIPELINE pipeline) {
749         m_pipelines[index] = pipeline;
750     }
751 
get_auto_varsThread752     AutoSet& get_auto_vars() {
753         return m_auto_vars;
754     }
755 
756 private:
757     long*       m_addr_coipipe_counter;
758     AutoSet     m_auto_vars;
759     COIPIPELINE m_pipelines[MIC_ENGINES_MAX];
760 };
761 
get_pipeline(void)762 COIPIPELINE Engine::get_pipeline(void)
763 {
764     Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
765     if (thread == 0) {
766         thread = new Thread(&m_proc_number);
767         thread_setspecific(mic_thread_key, thread);
768     }
769 
770     COIPIPELINE pipeline = thread->get_pipeline(m_index);
771     if (pipeline == 0) {
772         COIRESULT res;
773         int proc_num;
774 
775 #ifndef TARGET_WINNT
776         proc_num = __sync_fetch_and_add(&m_proc_number, 1);
777 #else // TARGET_WINNT
778         proc_num = _InterlockedIncrement(&m_proc_number);
779 #endif // TARGET_WINNT
780 
781         if (proc_num > COI_PIPELINE_MAX_PIPELINES) {
782             LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES);
783             LIBOFFLOAD_ABORT;
784         }
785 
786         // Create pipeline for this thread
787         if (m_assigned_cpus == 0) {
788             // If m_assigned_cpus is NULL, it implies all threads
789             // Create the pipeline with no CPU mask
790             res = COI::PipelineCreate(m_process, 0, mic_stack_size, &pipeline);
791         } else {
792             // Create COI CPU mask
793             COI_CPU_MASK  in_Mask;
794             res = COI::PipelineClearCPUMask(in_Mask);
795             check_result(res, c_clear_cpu_mask, m_index, res);
796 
797             int threads_per_core = m_num_threads / m_num_cores;
798 
799             // Available threads are defined by examining of m_assigned_cpus bitset.
800             // We skip thread 0.
801             for (int i = 1; i < m_num_threads; i++) {
802                 // For available thread i m_assigned_cpus[i] is equal to 1
803                 if ((*m_assigned_cpus)[i]) {
804                     COI_CPU_MASK_SET(i, in_Mask);
805                 }
806             }
807             OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask for this CPU thread\n"
808                                "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n"
809                                "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n",
810                                in_Mask[0], in_Mask[1], in_Mask[2], in_Mask[3],
811                                in_Mask[4], in_Mask[5], in_Mask[6], in_Mask[7],
812                                in_Mask[8], in_Mask[9], in_Mask[10], in_Mask[11],
813                                in_Mask[12], in_Mask[13], in_Mask[14], in_Mask[15]);
814 
815             // Create the pipeline with allowable CPUs
816             res = COI::PipelineCreate(m_process, in_Mask, mic_stack_size, &pipeline);
817         }
818         check_result(res, c_pipeline_create, m_index, res);
819         thread->set_pipeline(m_index, pipeline);
820     }
821     return pipeline;
822 }
823 
find_stream(uint64_t handle,bool remove)824 Stream* Stream::find_stream(uint64_t handle, bool remove)
825 {
826     Stream *stream = 0;
827 
828     m_stream_lock.lock();
829     {
830         StreamMap::iterator it = all_streams.find(handle);
831         if (it != all_streams.end()) {
832             stream = it->second;
833             if (remove) {
834                 all_streams.erase(it);
835             }
836         }
837     }
838     m_stream_lock.unlock();
839     return stream;
840 }
841 
move_cpu_el_after(CpuEl * cpu_what,CpuEl * cpu_after)842 void Engine::move_cpu_el_after(CpuEl* cpu_what, CpuEl* cpu_after)
843 {
844     if (cpu_what == cpu_after) {
845         return;
846     }
847     CpuEl* cpu_prev = cpu_what->prev;
848 
849     // remove cpu_what
850     if (!cpu_prev) {
851         m_cpu_head = cpu_what->next;
852     }
853     else {
854         cpu_prev->next = cpu_what->next;
855     }
856     if (cpu_what->next) {
857         cpu_what->next->prev = cpu_prev;
858     }
859 
860     // insert cpu_what after cpu_after
861     cpu_what->prev = cpu_after;
862     cpu_what->next = cpu_after->next;
863     if (cpu_after->next) {
864         cpu_after->next->prev = cpu_what;
865     }
866     cpu_after->next = cpu_what;
867 }
868 
get_pipeline(_Offload_stream handle)869 COIPIPELINE Engine::get_pipeline(_Offload_stream handle)
870 {
871     Stream * stream = Stream::find_stream(handle, false);
872 
873     if (!stream) {
874         LIBOFFLOAD_ERROR(c_offload_no_stream, m_index);
875         LIBOFFLOAD_ABORT;
876     }
877 
878     COIPIPELINE pipeline = stream->get_pipeline();
879 
880     if (pipeline == 0) {
881         COIRESULT     res;
882         int           proc_num;
883         COI_CPU_MASK  in_Mask ;
884 
885 #ifndef TARGET_WINNT
886         proc_num = __sync_fetch_and_add(&m_proc_number, 1);
887 #else // TARGET_WINNT
888         proc_num = _InterlockedIncrement(&m_proc_number);
889 #endif // TARGET_WINNT
890 
891         if (proc_num > COI_PIPELINE_MAX_PIPELINES) {
892             LIBOFFLOAD_ERROR(c_coipipe_max_number, COI_PIPELINE_MAX_PIPELINES);
893             LIBOFFLOAD_ABORT;
894         }
895 
896         m_stream_lock.lock();
897 
898         // start process if not done yet
899         if (m_process == 0) {
900             init_process();
901         }
902 
903         // create CPUmask
904         res = COI::PipelineClearCPUMask(in_Mask);
905         check_result(res, c_clear_cpu_mask, m_index, res);
906 
907         int stream_cpu_num = stream->get_cpu_number();
908 
909         stream->m_stream_cpus.reset();
910 
911         int threads_per_core = m_num_threads / m_num_cores;
912 
913 
914         // Available threads is taken from m_cpus list.
915         // m_cpu_head points to the head of m_cpus.
916         // the elements of m_cpus is ordered by the number of usage in streams.
917 
918         CpuEl *cpu_el = m_cpu_head;
919         CpuEl *cpu_used_el, *cpu_used_prev, *cpu_prev;
920 
921         for (int i = 0; i < stream_cpu_num; i++) {
922             COI_CPU_MASK_SET(CPU_INDEX(cpu_el), in_Mask);
923             stream->m_stream_cpus.set(CPU_INDEX(cpu_el));
924             //If the number of availabale threads is less than stream_cpu_num,
925             // the stream_cpu_num is restricted to this number.
926             if (!cpu_el->next) {
927                 break;
928             }
929             if (i + 1 < stream_cpu_num) {
930                 cpu_el = cpu_el->next;
931             }
932         }
933 
934         // assertion : cpu_el points to the last used thread
935         cpu_used_el = cpu_el;
936         while (cpu_used_el) {
937             cpu_used_el->count++;
938             cpu_el = cpu_prev = cpu_used_el;
939             cpu_used_prev = cpu_used_el->prev;
940             if (!cpu_el->next) {
941                 cpu_used_el = cpu_used_prev;
942                 continue;
943             }
944 
945             while (cpu_el) {
946                 if (cpu_used_el->count < cpu_el->count) {
947                     break;
948                 }
949                 // Equal used threads are ordered by thread number to
950                 // assign to a stream as contiguous threads as possible.
951                 else if (cpu_used_el->count == cpu_el->count &&
952                          CPU_INDEX(cpu_used_el) <  CPU_INDEX(cpu_el)) {
953                      break;
954                 }
955                 cpu_prev = cpu_el;
956                 cpu_el = cpu_el->next;
957             }
958             if (cpu_used_el != cpu_prev) {
959                 move_cpu_el_after(cpu_used_el, cpu_prev);
960             }
961             cpu_used_el = cpu_used_prev;
962         }
963         print_stream_cpu_list("get_pipeline");
964 
965         // create pipeline for this thread
966         OFFLOAD_DEBUG_TRACE(2, "COIPipelineCreate Mask for this Stream\n"
967                                "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n"
968                                "%016lx %016lx %016lx %016lx\n%016lx %016lx %016lx %016lx\n",
969                                in_Mask[0], in_Mask[1], in_Mask[2], in_Mask[3],
970                                in_Mask[4], in_Mask[5], in_Mask[6], in_Mask[7],
971                                in_Mask[8], in_Mask[9], in_Mask[10], in_Mask[11],
972                                in_Mask[12], in_Mask[13], in_Mask[14], in_Mask[15]);
973         res = COI::PipelineCreate(m_process, in_Mask,
974                                   mic_stack_size, &pipeline);
975         check_result(res, c_pipeline_create, m_index, res);
976 
977         // Set stream's affinities
978         {
979             struct affinity_spec affinity_spec;
980             char* affinity_type;
981             int i;
982 
983             // "compact" by default
984             affinity_spec.affinity_type = affinity_compact;
985 
986             // Check if user has specified type of affinity
987             if ((affinity_type = getenv("OFFLOAD_STREAM_AFFINITY")) !=
988                                         NULL)
989             {
990                 char affinity_str[16];
991                 int affinity_str_len;
992 
993                 OFFLOAD_DEBUG_TRACE(2,
994                     "User has specified OFFLOAD_STREAM_AFFINITY=%s\n",
995                     affinity_type);
996 
997                 // Set type of affinity requested
998                 affinity_str_len = strlen(affinity_type);
999                 for (i=0; i<affinity_str_len && i<15; i++)
1000                 {
1001                     affinity_str[i] = tolower(affinity_type[i]);
1002                 }
1003                 affinity_str[i] = '\0';
1004                 if (strcmp(affinity_str, "compact") == 0) {
1005                     affinity_spec.affinity_type = affinity_compact;
1006                     OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n");
1007                 } else if (strcmp(affinity_str, "scatter") == 0) {
1008                     affinity_spec.affinity_type = affinity_scatter;
1009                     OFFLOAD_DEBUG_TRACE(2, "Setting affinity=scatter\n");
1010                 } else {
1011                     LIBOFFLOAD_ERROR(c_incorrect_affinity, affinity_str);
1012                     affinity_spec.affinity_type = affinity_compact;
1013                     OFFLOAD_DEBUG_TRACE(2, "Setting affinity=compact\n");
1014                 }
1015             }
1016             // Make flat copy of sink mask because COI's mask is opaque
1017             for (i=0; i<16; i++) {
1018                 affinity_spec.sink_mask[i] = in_Mask[i];
1019             }
1020             // Set number of cores and threads
1021             affinity_spec.num_cores = m_num_cores;
1022             affinity_spec.num_threads = m_num_threads;
1023 
1024             COIEVENT event;
1025             res = COI::PipelineRunFunction(pipeline,
1026                                    m_funcs[c_func_set_stream_affinity],
1027                                    0, 0, 0,
1028                                    0, 0,
1029                                    &affinity_spec, sizeof(affinity_spec),
1030                                    0, 0,
1031                                    &event);
1032             check_result(res, c_pipeline_run_func, m_index, res);
1033 
1034             res = COI::EventWait(1, &event, -1, 1, 0, 0);
1035             check_result(res, c_event_wait, res);
1036         }
1037 
1038         m_stream_lock.unlock();
1039         stream->set_pipeline(pipeline);
1040     }
1041     return pipeline;
1042 }
1043 
stream_destroy(_Offload_stream handle)1044 void Engine::stream_destroy(_Offload_stream handle)
1045 {
1046     // get stream
1047     Stream * stream =  Stream::find_stream(handle, true);
1048 
1049     if (stream) {
1050         // return cpus for future use
1051         for (int i = 0; i < m_num_threads; i++) {
1052             if (stream->m_stream_cpus.test(i)) {
1053                 CpuEl *cpu_el = m_cpus + i;
1054                 CpuEl *cpu_first_el = cpu_el;
1055                 // decrease count of thread "i" and move its CpuEl to the
1056                 // proper place into the ordered list
1057                 cpu_el->count--;
1058                 while (cpu_el->prev) {
1059                     if (cpu_first_el->count > cpu_el->prev->count) {
1060                         break;
1061                     }
1062                     else if (cpu_first_el->count == cpu_el->prev->count &&
1063                              CPU_INDEX(cpu_first_el) > CPU_INDEX(cpu_el->prev)) {
1064                         break;
1065                     }
1066                     cpu_el = cpu_el->prev;
1067                 }
1068                 cpu_el = cpu_el->prev;
1069                 // If cpu_el for thread "i" must be moved in the list
1070                 if (cpu_first_el != cpu_el) {
1071                     // Thread "i" is used the least times. It must be set as
1072                     // the m_cpu_head.
1073                     if (!cpu_el) {
1074                         if (!cpu_first_el->prev) {
1075                             continue;
1076                         }
1077                         // remove cpu_el.
1078                         cpu_first_el->prev->next = cpu_first_el->next;
1079                         if (cpu_first_el->next) {
1080                             cpu_first_el->next->prev = cpu_first_el->prev;
1081                         }
1082                         // make cpu_first_el as new m_cpu_head
1083                         cpu_first_el->prev = NULL;
1084                         cpu_first_el->next = m_cpu_head;
1085                         m_cpu_head->prev = cpu_first_el;
1086                         m_cpu_head = cpu_first_el;
1087                     }
1088                     else {
1089                         move_cpu_el_after(cpu_first_el, cpu_el);
1090                     }
1091                 }
1092             }
1093         }
1094         print_stream_cpu_list("stream_destroy");
1095         delete stream;
1096     }
1097     else {
1098         LIBOFFLOAD_ERROR(c_offload_no_stream, m_index);
1099         LIBOFFLOAD_ABORT;
1100     }
1101 }
1102 
get_thread_id(void)1103 uint64_t Engine::get_thread_id(void)
1104 {
1105     Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
1106     if (thread == 0) {
1107         thread = new Thread(&m_proc_number);
1108         thread_setspecific(mic_thread_key, thread);
1109     }
1110 
1111     return reinterpret_cast<uint64_t>(thread);
1112 }
1113 
get_auto_vars(void)1114 AutoSet& Engine::get_auto_vars(void)
1115 {
1116     Thread* thread = (Thread*) thread_getspecific(mic_thread_key);
1117     if (thread == 0) {
1118         thread = new Thread(&m_proc_number);
1119         thread_setspecific(mic_thread_key, thread);
1120     }
1121 
1122     return thread->get_auto_vars();
1123 }
1124 
destroy_thread_data(void * data)1125 void Engine::destroy_thread_data(void *data)
1126 {
1127     delete static_cast<Thread*>(data);
1128 }
1129