1 /*
2     Copyright (c) 2014-2016 Intel Corporation.  All Rights Reserved.
3 
4     Redistribution and use in source and binary forms, with or without
5     modification, are permitted provided that the following conditions
6     are met:
7 
8       * Redistributions of source code must retain the above copyright
9         notice, this list of conditions and the following disclaimer.
10       * Redistributions in binary form must reproduce the above copyright
11         notice, this list of conditions and the following disclaimer in the
12         documentation and/or other materials provided with the distribution.
13       * Neither the name of Intel Corporation nor the names of its
14         contributors may be used to endorse or promote products derived
15         from this software without specific prior written permission.
16 
17     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20     A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
21     HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
22     SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
23     LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
24     DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
25     THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
26     (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27     OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 */
29 
30 
31 #include <omp.h>
32 //#include <stdlib.h>
33 //#include "offload.h"
34 #include "compiler_if_host.h"
35 
36 
37 // OpenMP API
38 
omp_set_default_device(int num)39 void omp_set_default_device(int num) __GOMP_NOTHROW
40 {
41     if (num >= 0) {
42         __omp_device_num = num;
43     }
44 }
45 
omp_get_default_device(void)46 int omp_get_default_device(void) __GOMP_NOTHROW
47 {
48     return __omp_device_num;
49 }
50 
omp_get_num_devices()51 int omp_get_num_devices() __GOMP_NOTHROW
52 {
53     __offload_init_library();
54     return mic_engines_total;
55 }
56 
57 // OpenMP 4.5 APIs
58 
// COI supports 3-dim multiD transfers.
// This is the largest num_dims accepted by omp_target_memcpy_rect, the value
// that routine returns when called with both dst and src null, and the
// size used for its per-dimension scratch arrays.
#define MAX_ARRAY_RANK 3
61 
omp_get_initial_device(void)62 int omp_get_initial_device(
63     void
64 ) __GOMP_NOTHROW
65 {
66     return -1;
67 }
68 
omp_target_alloc(size_t size,int device_num)69 void* omp_target_alloc(
70     size_t size,
71     int    device_num
72 ) __GOMP_NOTHROW
73 {
74     __offload_init_library();
75 
76     OFFLOAD_TRACE(2, "omp_target_alloc(%lld, %d)\n", size, device_num);
77 
78     if (device_num < -1) {
79         LIBOFFLOAD_ERROR(c_invalid_device_number);
80         exit(1);
81     }
82 
83     void* result = 0;
84 
85     // malloc on CPU
86     if (device_num == -1) {
87         // We do not check for malloc returning NULL because the
88         // specification of this API includes the possibility of failure.
89         // The user will check the returned result
90         result = malloc(size);
91         return result;
92     }
93 
94     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(
95                        TARGET_MIC, device_num, 0, NULL, __func__, 0);
96     if (ofld != 0) {
97         VarDesc vars[2] = {0};
98 
99         vars[0].type.src = c_data;
100         vars[0].type.dst = c_data;
101         vars[0].direction.bits = c_parameter_in;
102         vars[0].size = sizeof(size);
103         vars[0].count = 1;
104         vars[0].ptr = &size;
105 
106         vars[1].type.src = c_data;
107         vars[1].type.dst = c_data;
108         vars[1].direction.bits = c_parameter_out;
109         vars[1].size = sizeof(result);
110         vars[1].count = 1;
111         vars[1].ptr = &result;
112 
113         OFFLOAD_OFFLOAD(ofld, "omp_target_alloc_target",
114                         0, 2, vars, NULL, 0, 0, 0);
115     }
116     return result;
117 }
118 
omp_target_free(void * device_ptr,int device_num)119 void omp_target_free(
120     void *device_ptr,
121     int   device_num
122 ) __GOMP_NOTHROW
123 {
124     __offload_init_library();
125 
126     OFFLOAD_TRACE(2, "omp_target_free(%p, %d)\n", device_ptr, device_num);
127 
128     if (device_num < -1) {
129         LIBOFFLOAD_ERROR(c_invalid_device_number);
130         exit(1);
131     }
132 
133     // free on CPU
134     if (device_num == -1) {
135         free(device_ptr);
136         return;
137     }
138 
139     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(
140                        TARGET_MIC, device_num, 0, NULL, __func__, 0);
141     if (ofld) {
142         VarDesc vars[1] = {0};
143 
144         vars[0].type.src = c_data;
145         vars[0].type.dst = c_data;
146         vars[0].direction.bits = c_parameter_in;
147         vars[0].size = sizeof(device_ptr);
148         vars[0].count = 1;
149         vars[0].ptr = &device_ptr;
150 
151         OFFLOAD_OFFLOAD(ofld, "omp_target_free_target",
152                         0, 1, vars, NULL, 0, 0, 0);
153     }
154 }
155 
omp_target_is_present(void * ptr,int device_num)156 int omp_target_is_present(
157     void *ptr,
158     int device_num
159 ) __GOMP_NOTHROW
160 {
161     __offload_init_library();
162 
163     OFFLOAD_TRACE(2, "omp_target_is_present(%p, %d)\n", ptr, device_num);
164 
165     if (device_num < -1) {
166         LIBOFFLOAD_ERROR(c_invalid_device_number);
167         exit(1);
168     }
169 
170     if (device_num == -1) {
171         return false;
172     }
173 
174     // If OpenMP allows wrap-around for device numbers, enable next line
175     //device_num %= mic_engines_total;
176 
177     // lookup existing association in pointer table
178     PtrData* ptr_data = mic_engines[device_num].find_ptr_data(ptr);
179     if (ptr_data == 0) {
180         OFFLOAD_TRACE(3, "Address %p is not mapped on device %d\n",
181                       ptr, device_num);
182         return false;
183     }
184 
185     OFFLOAD_TRACE(3, "Address %p found mapped on device %d\n",
186                   ptr, device_num);
187     return true;
188 }
189 
omp_target_memcpy(void * dst,void * src,size_t length,size_t dst_offset,size_t src_offset,int dst_device,int src_device)190 int omp_target_memcpy(
191     void   *dst,
192     void   *src,
193     size_t  length,
194     size_t  dst_offset,
195     size_t  src_offset,
196     int     dst_device,
197     int     src_device
198 ) __GOMP_NOTHROW
199 {
200     __offload_init_library();
201 
202     OFFLOAD_TRACE(2, "omp_target_memcpy(%p, %p, %lld, %lld, %lld, %d, %d)\n",
203                   dst, src, length, dst_offset, src_offset, dst_device, src_device);
204 
205     if (dst_device < -1 || src_device < -1) {
206         LIBOFFLOAD_ERROR(c_invalid_device_number);
207         exit(1);
208     }
209 
210     char* srcp = (char *)src + src_offset;
211     char* dstp = (char *)dst + dst_offset;
212 
213     if (src_device == -1) {
214         // Source is CPU
215         if (dst_device == -1) {
216             // CPU -> CPU
217             memcpy(dstp, srcp, length);
218             return 0;
219         } else {
220             // CPU -> MIC
221             // COIBufferWrite
222             // If OpenMP allows wrap-around for device numbers, enable next line
223             //dst_device %= mic_engines_total;
224 
225             OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n", dstp);
226             COIBUFFER mic_buf;
227             COIRESULT res = COI::BufferCreateFromMemory(length,
228                                 COI_BUFFER_NORMAL, COI_SINK_MEMORY, dstp,
229                                 1, &mic_engines[dst_device].get_process(),
230                                 &mic_buf);
231             if (res != COI_SUCCESS) {
232                 LIBOFFLOAD_ERROR(c_buf_create_from_mem, res);
233                 return 1;
234             }
235             res = COI::BufferWrite(mic_buf, 0, srcp, length,
236                       COI_COPY_UNSPECIFIED, 0, 0, 0);
237             if (res != COI_SUCCESS) {
238                 LIBOFFLOAD_ERROR(c_buf_write, res);
239                 return 1;
240             }
241             res = COI::BufferDestroy(mic_buf);
242             if (res != COI_SUCCESS) {
243                 LIBOFFLOAD_ERROR(c_buf_destroy, res);
244                 return 1;
245             }
246             return 0;
247         }
248     } else {
249         // Source is device
250         if (dst_device == -1) {
251             // MIC -> CPU
252             // COIBufferRead
253 
254             // If OpenMP allows wrap-around for device numbers, enable next line
255             //src_device %= mic_engines_total;
256 
257             OFFLOAD_TRACE(3, "Creating buffer from sink memory %llx\n", srcp);
258             COIBUFFER mic_buf;
259             COIRESULT res = COI::BufferCreateFromMemory(length,
260                                 COI_BUFFER_NORMAL, COI_SINK_MEMORY, srcp,
261                                 1, &mic_engines[src_device].get_process(),
262                                 &mic_buf);
263             if (res != COI_SUCCESS) {
264                 LIBOFFLOAD_ERROR(c_buf_create_from_mem, res);
265                 return 1;
266             }
267             res = COI::BufferRead(mic_buf, 0, dstp, length,
268                       COI_COPY_UNSPECIFIED, 0, 0, 0);
269             if (res != COI_SUCCESS) {
270                 LIBOFFLOAD_ERROR(c_buf_read, res);
271                 return 1;
272             }
273             res = COI::BufferDestroy(mic_buf);
274             if (res != COI_SUCCESS) {
275                 LIBOFFLOAD_ERROR(c_buf_destroy, res);
276                 return 1;
277             }
278             return 0;
279         } else {
280             // some MIC -> some MIC
281             if (src_device == dst_device) {
282                 // MIC local copy will be done as remote memcpy
283 
284                 OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(TARGET_MIC, src_device,
285                                                       0, NULL, __func__, 0);
286                 if (ofld) {
287                     VarDesc vars[3] = {0};
288 
289                     vars[0].type.src = c_data;
290                     vars[0].type.dst = c_data;
291                     vars[0].direction.bits = c_parameter_in;
292                     vars[0].size = sizeof(dstp);
293                     vars[0].count = 1;
294                     vars[0].ptr = &dstp;
295 
296                     vars[1].type.src = c_data;
297                     vars[1].type.dst = c_data;
298                     vars[1].direction.bits = c_parameter_in;
299                     vars[1].size = sizeof(srcp);
300                     vars[1].count = 1;
301                     vars[1].ptr = &srcp;
302 
303                     vars[2].type.src = c_data;
304                     vars[2].type.dst = c_data;
305                     vars[2].direction.bits = c_parameter_in;
306                     vars[2].size = sizeof(length);
307                     vars[2].count = 1;
308                     vars[2].ptr = &length;
309 
310                     OFFLOAD_OFFLOAD(ofld, "omp_target_memcpy_target",
311                                     0, 3, vars, NULL, 0, 0, 0);
312                     return 0;
313                 } else {
314                     return 1;
315                 }
316             } else {
317                 // MICx -> MICy
318                 // Allocate CPU buffer
319                 char *cpu_mem = (char *)malloc(length);
320                 if (cpu_mem == 0) {
321                     LIBOFFLOAD_ERROR(c_malloc);
322                     return 1;
323                 }
324                 int retval = 1;
325                 if (omp_target_memcpy(
326                         cpu_mem, srcp, length, 0, 0, -1, src_device) == 0) {
327                     retval = omp_target_memcpy(
328                                  dstp, cpu_mem, length, 0, 0, dst_device, -1);
329                 }
330                 free(cpu_mem);
331                 return retval;
332             }
333         }
334     }
335 }
336 
// Computes the byte size of one index step along the leading dimension of
// an array: element_size multiplied by dimensions[1] through
// dimensions[num_dims - 1].
//
//   element_size  Size of one array element in bytes.
//   num_dims      Number of entries of dimensions to consider.
//   dimensions    Extent of each dimension; dimensions[0] is never read.
//
// With num_dims <= 1 the result is element_size and dimensions is not
// accessed at all.
static size_t bytesize_at_this_dimension(
    size_t element_size,
    int num_dims,
    const size_t* dimensions
)
{
    size_t bytes = element_size;
    for (int d = 1; d < num_dims; d++) {
        bytes *= dimensions[d];
    }
    return bytes;
}
351 
memcpy_rect(char * dst,char * src,size_t element_size,int num_dims,const size_t * volume,const size_t * dst_offsets,const size_t * src_offsets,const size_t * dst_dimensions,const size_t * src_dimensions)352 static void memcpy_rect(
353     char         *dst,
354     char         *src,
355     size_t        element_size,
356     int           num_dims,
357     const size_t *volume,
358     const size_t *dst_offsets,
359     const size_t *src_offsets,
360     const size_t *dst_dimensions,
361     const size_t *src_dimensions
362 )
363 {
364     if (num_dims > 1) {
365         int count = volume[0];
366         int dst_index = dst_offsets[0];
367         int src_index = src_offsets[0];
368         size_t dst_element_size =
369             bytesize_at_this_dimension(element_size, num_dims, dst_dimensions);
370         size_t src_element_size =
371             bytesize_at_this_dimension(element_size, num_dims, src_dimensions);
372         for (; count>0; dst_index++, src_index++, count--) {
373             memcpy_rect(dst+dst_element_size*dst_index,
374                         src+src_element_size*src_index,
375                         element_size, num_dims-1, volume+1,
376                         dst_offsets+1, src_offsets+1,
377                         dst_dimensions+1, src_dimensions+1);
378         }
379     } else {
380         memcpy(dst+dst_offsets[0]*element_size,
381                src+src_offsets[0]*element_size,
382                element_size * volume[0]);
383     }
384 }
385 
omp_target_memcpy_rect(void * dst_,void * src_,size_t element_size,int num_dims,const size_t * volume,const size_t * dst_offsets,const size_t * src_offsets,const size_t * dst_dimensions,const size_t * src_dimensions,int dst_device,int src_device)386 int omp_target_memcpy_rect(
387     void         *dst_,
388     void         *src_,
389     size_t        element_size,
390     int           num_dims,
391     const size_t *volume,
392     const size_t *dst_offsets,
393     const size_t *src_offsets,
394     const size_t *dst_dimensions,
395     const size_t *src_dimensions,
396     int           dst_device,
397     int           src_device
398 ) __GOMP_NOTHROW
399 {
400     char *dst = (char *)dst_;
401     char *src = (char *)src_;
402 
403     __offload_init_library();
404 
405     OFFLOAD_TRACE(2, "omp_target_memcpy_rect(%p, %p, %lld, %d, "
406                   "%p, %p, %p, %p, %p, %d, %d)\n",
407                   dst, src, element_size, num_dims,
408                   volume, dst_offsets, src_offsets,
409                   dst_dimensions, src_dimensions, dst_device, src_device);
410 
411     // MAX_ARRAY_RANK dimensions are supported
412     if (dst == 0 && src == 0) {
413         return MAX_ARRAY_RANK;
414     }
415 
416     if (num_dims < 1 || num_dims > MAX_ARRAY_RANK ||
417         element_size < 1 ||
418         volume == 0 || dst_offsets == 0 || src_offsets == 0 ||
419         dst_dimensions == 0 || src_dimensions == 0) {
420         return 1;
421     }
422 
423     if (dst_device < -1 || src_device < -1) {
424         LIBOFFLOAD_ERROR(c_invalid_device_number);
425         exit(1);
426     }
427 
428     if (src_device == -1) {
429         // Source is CPU
430         if (dst_device == -1) {
431             // CPU -> CPU
432             memcpy_rect((char*)dst, (char*)src, element_size, num_dims, volume,
433                         dst_offsets, src_offsets,
434                         dst_dimensions, src_dimensions);
435             return 0;
436         } else {
437             // CPU -> MIC
438             // COIBufferWriteMultiD
439             struct arr_desc dst_desc;
440             struct arr_desc src_desc;
441 
442             dst_desc.base = (int64_t)dst;
443             dst_desc.rank = num_dims;
444 
445             src_desc.base = (int64_t)src;
446             src_desc.rank = num_dims;
447 
448             for (int i=0; i<num_dims; i++)
449             {
450                 dst_desc.dim[i].size   = bytesize_at_this_dimension(
451                                              element_size,
452                                              num_dims - i,
453                                              dst_dimensions + i);
454                 dst_desc.dim[i].lindex = 0;
455                 dst_desc.dim[i].lower  = dst_offsets[i];
456                 dst_desc.dim[i].upper  = dst_offsets[i] + volume[i] - 1;
457                 dst_desc.dim[i].stride = 1;
458 
459                 src_desc.dim[i].size   = bytesize_at_this_dimension(
460                                              element_size,
461                                              num_dims - i,
462                                              src_dimensions + i);
463                 src_desc.dim[i].lindex = 0;
464                 src_desc.dim[i].lower  = src_offsets[i];
465                 src_desc.dim[i].upper  = src_offsets[i] + volume[i] - 1;
466                 src_desc.dim[i].stride = 1;
467             }
468             __arr_desc_dump("", "dst", (const Arr_Desc*)&dst_desc, false, false);
469             __arr_desc_dump("", "src", (const Arr_Desc*)&src_desc, false, false);
470 
471             // If OpenMP allows wrap-around for device numbers, enable next line
472             //dst_device %= mic_engines_total;
473 
474             // Compute MIC buffer size
475             size_t dst_length = dst_dimensions[0] * bytesize_at_this_dimension(
476                                                         element_size,
477                                                         num_dims,
478                                                         dst_dimensions);
479 
480             OFFLOAD_TRACE(3,
481                 "Creating buffer from sink memory %llx of size %lld\n",
482                 dst, dst_length);
483             COIBUFFER mic_buf;
484             COIRESULT res = COI::BufferCreateFromMemory(dst_length,
485                                 COI_BUFFER_NORMAL, COI_SINK_MEMORY, dst,
486                                 1, &mic_engines[dst_device].get_process(),
487                                 &mic_buf);
488             if (res != COI_SUCCESS) {
489                 LIBOFFLOAD_ERROR(c_buf_create_from_mem, res);
490                 return 1;
491             }
492             res = COI::BufferWriteMultiD(mic_buf,
493                       mic_engines[dst_device].get_process(),
494                       0, &dst_desc, &src_desc,
495                       COI_COPY_UNSPECIFIED, 0, 0, 0);
496             if (res != COI_SUCCESS) {
497                 LIBOFFLOAD_ERROR(c_buf_write, res);
498                 return 1;
499             }
500             res = COI::BufferDestroy(mic_buf);
501             if (res != COI_SUCCESS) {
502                 LIBOFFLOAD_ERROR(c_buf_destroy, res);
503                 return 1;
504             }
505             return 0;
506         }
507     } else {
508         // Source is device
509         if (dst_device == -1) {
510             // COIBufferReadMultiD
511             struct arr_desc dst_desc;
512             struct arr_desc src_desc;
513 
514             dst_desc.base = (int64_t)dst;
515             dst_desc.rank = num_dims;
516 
517             src_desc.base = (int64_t)src;
518             src_desc.rank = num_dims;
519 
520             for (int i=0; i<num_dims; i++)
521             {
522                 dst_desc.dim[i].size   = bytesize_at_this_dimension(
523                                              element_size,
524                                              num_dims - i,
525                                              dst_dimensions + i);
526                 dst_desc.dim[i].lindex = 0;
527                 dst_desc.dim[i].lower  = dst_offsets[i];
528                 dst_desc.dim[i].upper  = dst_offsets[i] + volume[i] - 1;
529                 dst_desc.dim[i].stride = 1;
530 
531                 src_desc.dim[i].size   = bytesize_at_this_dimension(
532                                              element_size,
533                                              num_dims - i,
534                                              src_dimensions + i);
535                 src_desc.dim[i].lindex = 0;
536                 src_desc.dim[i].lower  = src_offsets[i];
537                 src_desc.dim[i].upper  = src_offsets[i] + volume[i] - 1;
538                 src_desc.dim[i].stride = 1;
539             }
540             __arr_desc_dump("", "dst", (const Arr_Desc*)&dst_desc, false, false);
541             __arr_desc_dump("", "src", (const Arr_Desc*)&src_desc, false, false);
542 
543             // If OpenMP allows wrap-around for device numbers, enable next line
544             //src_device %= mic_engines_total;
545 
546             // Compute MIC buffer size
547             size_t src_length = src_dimensions[0] * bytesize_at_this_dimension(
548                                                         element_size,
549                                                         num_dims,
550                                                         src_dimensions);
551 
552             OFFLOAD_TRACE(3,
553                 "Creating buffer from sink memory %llx of size %lld\n",
554                 src, src_length);
555             COIBUFFER mic_buf;
556             COIRESULT res = COI::BufferCreateFromMemory(src_length,
557                                 COI_BUFFER_NORMAL, COI_SINK_MEMORY, src,
558                                 1, &mic_engines[src_device].get_process(),
559                                 &mic_buf);
560             if (res != COI_SUCCESS) {
561                 LIBOFFLOAD_ERROR(c_buf_create_from_mem, res);
562                 return 1;
563             }
564             res = COI::BufferReadMultiD(mic_buf, 0,
565                       &dst_desc, &src_desc,
566                       COI_COPY_UNSPECIFIED, 0, 0, 0);
567             if (res != COI_SUCCESS) {
568                 LIBOFFLOAD_ERROR(c_buf_write, res);
569                 return 1;
570             }
571             res = COI::BufferDestroy(mic_buf);
572             if (res != COI_SUCCESS) {
573                 LIBOFFLOAD_ERROR(c_buf_destroy, res);
574                 return 1;
575             }
576             return 0;
577         } else {
578             // some MIC -> some MIC
579             if (src_device == dst_device) {
580                 // MIC local copy will be done as remote memcpy_rect
581                 struct parameters {
582                     void   *dst;
583                     void   *src;
584                     size_t element_size;
585                     int    num_dims;
586                     size_t array_info[MAX_ARRAY_RANK*5];
587                 } parameters = {dst, src, element_size, num_dims};
588                 int result;
589 
590                 for (int i=0; i<num_dims; i++)
591                 {
592                     parameters.array_info[i]            = volume[i];
593                     parameters.array_info[i+num_dims]   = dst_offsets[i];
594                     parameters.array_info[i+num_dims*2] = src_offsets[i];
595                     parameters.array_info[i+num_dims*3] = dst_dimensions[i];
596                     parameters.array_info[i+num_dims*4] = src_dimensions[i];
597                 }
598 
599                 OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(TARGET_MIC, src_device,
600                                                       0, NULL, __func__, 0);
601                 if (ofld) {
602                     VarDesc vars[1] = {0};
603 
604                     vars[0].type.src = c_data;
605                     vars[0].type.dst = c_data;
606                     vars[0].direction.bits = c_parameter_in;
607                     vars[0].size = sizeof(parameters) -
608                                        (MAX_ARRAY_RANK - num_dims) *
609                                        5 * sizeof(size_t);
610                     vars[0].count = 1;
611                     vars[0].ptr = &parameters;
612 
613                     OFFLOAD_OFFLOAD(ofld, "omp_target_memcpy_rect_target",
614                                     0, 1, vars, NULL, 0, 0, 0);
615                     return 0;
616                 } else {
617                     return 1;
618                 }
619             } else {
620                 // MICx -> MICy
621 
622                 // Compute transfer byte-count
623                 size_t dst_length = element_size;
624                 for (int i=0; i<num_dims; i++) {
625                     dst_length *= volume[i];
626                 }
627 
628                 // Allocate CPU buffer
629                 char *cpu_mem = (char *)malloc(dst_length);
630                 if (cpu_mem == 0) {
631                     LIBOFFLOAD_ERROR(c_malloc);
632                     return 1;
633                 }
634 
635                 // Create CPU offset and dimension arrays
636                 // The CPU array collects the data in a contiguous block
637                 size_t cpu_offsets[MAX_ARRAY_RANK];
638                 size_t cpu_dimensions[MAX_ARRAY_RANK];
639                 for (int i=0; i<num_dims; i++) {
640                     cpu_offsets[i] = 0;
641                     cpu_dimensions[i] = volume[i];
642                 }
643 
644                 int retval = 1;
645                 if (omp_target_memcpy_rect(
646                         cpu_mem, src, element_size, num_dims, volume,
647                         cpu_offsets, src_offsets,
648                         cpu_dimensions, src_dimensions,
649                         -1, src_device) == 0) {
650                     retval = omp_target_memcpy_rect(
651                                  dst, cpu_mem, element_size, num_dims, volume,
652                                  dst_offsets, cpu_offsets,
653                                  dst_dimensions, cpu_dimensions,
654                                  dst_device, -1);
655                 }
656                 free(cpu_mem);
657                 return retval;
658             }
659         }
660     }
661 }
662 
663 // host_ptr is key in table that yields association on device
664 // A COIBUFFER of specified size is created from the memory at
665 //     device_ptr+device_offset on device_num
omp_target_associate_ptr(void * host_ptr,void * device_ptr,size_t size,size_t device_offset,int device_num)666 int omp_target_associate_ptr(
667     void   *host_ptr,
668     void   *device_ptr,
669     size_t  size,
670     size_t  device_offset,
671     int     device_num
672 ) __GOMP_NOTHROW
673 {
674     COIRESULT res;
675 
676     __offload_init_library();
677 
678     OFFLOAD_TRACE(2, "omp_target_associate_ptr(%p, %p, %lld, %lld, %d)\n",
679                   host_ptr, device_ptr, size, device_offset, device_num);
680 
681     if (device_num < -1) {
682         LIBOFFLOAD_ERROR(c_invalid_device_number);
683         exit(1);
684     }
685 
686     // Associating to CPU is treated as failure
687     if (device_num == -1) {
688         return 1;
689     }
690 
691     // An incorrect size is treated as failure
692     if (size < 0) {
693         return 1;
694     }
695 
696     // If OpenMP allows wrap-around for device numbers, enable next line
697     //Engine& device = mic_engines[device_num % mic_engines_total];
698     Engine& device = mic_engines[device_num];
699 
700     // Does host pointer have association already?
701     // lookup existing association in pointer table
702     PtrData* ptr_data = device.find_ptr_data(host_ptr);
703     if (ptr_data != 0) {
704         OFFLOAD_TRACE(3, "Address %p is already mapped on device %d\n",
705                       host_ptr, device_num);
706         // Is current device pointer and offset same as existing?
707         if ((void*)ptr_data->mic_addr == device_ptr &&
708             (size_t)ptr_data->alloc_disp == device_offset) {
709             return 0;
710         } else {
711             return 1;
712         }
713     }
714 
715     // Create association
716     OFFLOAD_TRACE(3, "Creating association for data: addr %p, length %lld\n",
717                   host_ptr, size);
718 
719     bool is_new;
720     ptr_data = device.insert_ptr_data(host_ptr, size, is_new);
721     ptr_data->is_omp_associate = true;
722 
723     // create CPU buffer
724     OFFLOAD_TRACE(3,
725               "Creating buffer from source memory %p, length %lld\n",
726               host_ptr, size);
727 
728     // result is not checked because we can continue without cpu
729     // buffer. In this case we will use COIBufferRead/Write
730     // instead of COIBufferCopy.
731 
732     COI::BufferCreateFromMemory(size,
733                             COI_BUFFER_OPENCL,
734                             0,
735                             host_ptr,
736                             1,
737                             &device.get_process(),
738                             &ptr_data->cpu_buf);
739 
740     // create MIC buffer
741     OFFLOAD_TRACE(3,
742               "Creating buffer from sink memory: addr %p, size %lld\n",
743               (char *)device_ptr + device_offset, size);
744     res = COI::BufferCreateFromMemory(size,
745                                       COI_BUFFER_NORMAL,
746                                       COI_SINK_MEMORY,
747                                       device_ptr,
748                                       1,
749                                       &device.get_process(),
750                                       &ptr_data->mic_buf);
751     if (res != COI_SUCCESS) {
752         ptr_data->alloc_ptr_data_lock.unlock();
753         return 1;
754     }
755 
756     // make buffer valid on the device.
757     res = COI::BufferSetState(ptr_data->mic_buf,
758         device.get_process(),
759         COI_BUFFER_VALID,
760         COI_BUFFER_NO_MOVE,
761         0, 0, 0);
762     if (res != COI_SUCCESS) {
763         ptr_data->alloc_ptr_data_lock.unlock();
764         return 1;
765     }
766 
767     res = COI::BufferSetState(ptr_data->mic_buf,
768         COI_PROCESS_SOURCE,
769         COI_BUFFER_INVALID,
770         COI_BUFFER_NO_MOVE,
771         0, 0, 0);
772     if (res != COI_SUCCESS) {
773         ptr_data->alloc_ptr_data_lock.unlock();
774         return 1;
775     }
776     ptr_data->alloc_disp = device_offset;
777     ptr_data->alloc_ptr_data_lock.unlock();
778 
779     return 0;
780 }
781 
omp_target_disassociate_ptr(void * host_ptr,int device_num)782 int omp_target_disassociate_ptr(
783     void   *host_ptr,
784     int     device_num
785 ) __GOMP_NOTHROW
786 {
787     COIRESULT res;
788 
789     __offload_init_library();
790 
791     OFFLOAD_TRACE(2, "omp_target_disassociate_ptr(%p, %d)\n",
792                   host_ptr, device_num);
793 
794     if (device_num < -1) {
795         LIBOFFLOAD_ERROR(c_invalid_device_number);
796         exit(1);
797     }
798 
799     // Dissociating from CPU is treated as failure
800     if (device_num == -1) {
801         return 1;
802     }
803 
804     // If OpenMP allows wrap-around for device numbers, enable next line
805     //Engine& device = mic_engines[device_num % mic_engines_total];
806     Engine& device = mic_engines[device_num];
807 
808     // Lookup existing association in pointer table
809     PtrData* ptr_data = device.find_ptr_data(host_ptr);
810 
811     // Attempt to disassociate unassociated pointer is a failure
812     if (ptr_data == 0) {
813         return 1;
814     }
815 
816     // Destroy buffers
817     if (ptr_data->cpu_buf != 0) {
818         OFFLOAD_TRACE(3, "Destroying CPU buffer %p\n", ptr_data->cpu_buf);
819         COI::BufferDestroy(ptr_data->cpu_buf);
820     }
821     if (ptr_data->mic_buf != 0) {
822         OFFLOAD_TRACE(3, "Destroying MIC buffer %p\n", ptr_data->mic_buf);
823         COI::BufferDestroy(ptr_data->mic_buf);
824     }
825 
826     // Remove association from map
827     OFFLOAD_TRACE(3, "Removing association for addr %p\n",
828                   ptr_data->cpu_addr.start());
829     device.remove_ptr_data(ptr_data->cpu_addr.start());
830 
831     return 0;
832 }
833 
834 // End of OpenMP 4.5 APIs
835 
836 
837 // OpenMP API wrappers
838 
omp_set_int_target(TARGET_TYPE target_type,int target_number,int setting,const char * f_name)839 static void omp_set_int_target(
840     TARGET_TYPE target_type,
841     int target_number,
842     int setting,
843     const char* f_name
844 )
845 {
846     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
847                                           f_name, 0);
848     if (ofld) {
849         VarDesc vars[1] = {0};
850 
851         vars[0].type.src = c_data;
852         vars[0].type.dst = c_data;
853         vars[0].direction.bits = c_parameter_in;
854         vars[0].size = sizeof(int);
855         vars[0].count = 1;
856         vars[0].ptr = &setting;
857 
858         OFFLOAD_OFFLOAD(ofld, f_name, 0, 1, vars, NULL, 0, 0, 0);
859     }
860 }
861 
omp_get_int_target(TARGET_TYPE target_type,int target_number,const char * f_name)862 static int omp_get_int_target(
863     TARGET_TYPE target_type,
864     int target_number,
865     const char * f_name
866 )
867 {
868     int setting = 0;
869 
870     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
871                                           f_name, 0);
872     if (ofld) {
873         VarDesc vars[1] = {0};
874 
875         vars[0].type.src = c_data;
876         vars[0].type.dst = c_data;
877         vars[0].direction.bits = c_parameter_out;
878         vars[0].size = sizeof(int);
879         vars[0].count = 1;
880         vars[0].ptr = &setting;
881 
882         OFFLOAD_OFFLOAD(ofld, f_name, 0, 1, vars, NULL, 0, 0, 0);
883     }
884     return setting;
885 }
886 
omp_set_num_threads_target(TARGET_TYPE target_type,int target_number,int num_threads)887 void omp_set_num_threads_target(
888     TARGET_TYPE target_type,
889     int target_number,
890     int num_threads
891 )
892 {
893     omp_set_int_target(target_type, target_number, num_threads,
894                        "omp_set_num_threads_target");
895 }
896 
omp_get_max_threads_target(TARGET_TYPE target_type,int target_number)897 int omp_get_max_threads_target(
898     TARGET_TYPE target_type,
899     int target_number
900 )
901 {
902     return omp_get_int_target(target_type, target_number,
903                               "omp_get_max_threads_target");
904 }
905 
omp_get_num_procs_target(TARGET_TYPE target_type,int target_number)906 int omp_get_num_procs_target(
907     TARGET_TYPE target_type,
908     int target_number
909 )
910 {
911     return omp_get_int_target(target_type, target_number,
912                               "omp_get_num_procs_target");
913 }
914 
omp_set_dynamic_target(TARGET_TYPE target_type,int target_number,int num_threads)915 void omp_set_dynamic_target(
916     TARGET_TYPE target_type,
917     int target_number,
918     int num_threads
919 )
920 {
921     omp_set_int_target(target_type, target_number, num_threads,
922                        "omp_set_dynamic_target");
923 }
924 
omp_get_dynamic_target(TARGET_TYPE target_type,int target_number)925 int omp_get_dynamic_target(
926     TARGET_TYPE target_type,
927     int target_number
928 )
929 {
930     return omp_get_int_target(target_type, target_number,
931                               "omp_get_dynamic_target");
932 }
933 
omp_set_nested_target(TARGET_TYPE target_type,int target_number,int nested)934 void omp_set_nested_target(
935     TARGET_TYPE target_type,
936     int target_number,
937     int nested
938 )
939 {
940     omp_set_int_target(target_type, target_number, nested,
941                        "omp_set_nested_target");
942 }
943 
omp_get_nested_target(TARGET_TYPE target_type,int target_number)944 int omp_get_nested_target(
945     TARGET_TYPE target_type,
946     int target_number
947 )
948 {
949     return omp_get_int_target(target_type, target_number,
950                               "omp_get_nested_target");
951 }
952 
omp_set_schedule_target(TARGET_TYPE target_type,int target_number,omp_sched_t kind,int modifier)953 void omp_set_schedule_target(
954     TARGET_TYPE target_type,
955     int target_number,
956     omp_sched_t kind,
957     int modifier
958 )
959 {
960     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
961                                           __func__, 0);
962     if (ofld != 0) {
963         VarDesc vars[2] = {0};
964 
965         vars[0].type.src = c_data;
966         vars[0].type.dst = c_data;
967         vars[0].direction.bits = c_parameter_in;
968         vars[0].size = sizeof(omp_sched_t);
969         vars[0].count = 1;
970         vars[0].ptr = &kind;
971 
972         vars[1].type.src = c_data;
973         vars[1].type.dst = c_data;
974         vars[1].direction.bits = c_parameter_in;
975         vars[1].size = sizeof(int);
976         vars[1].count = 1;
977         vars[1].ptr = &modifier;
978 
979         OFFLOAD_OFFLOAD(ofld, "omp_set_schedule_target",
980                         0, 2, vars, NULL, 0, 0, 0);
981     }
982 }
983 
omp_get_schedule_target(TARGET_TYPE target_type,int target_number,omp_sched_t * kind,int * modifier)984 void omp_get_schedule_target(
985     TARGET_TYPE target_type,
986     int target_number,
987     omp_sched_t *kind,
988     int *modifier
989 )
990 {
991     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
992                                           __func__, 0);
993     if (ofld != 0) {
994         VarDesc vars[2] = {0};
995 
996         vars[0].type.src = c_data;
997         vars[0].type.dst = c_data;
998         vars[0].direction.bits = c_parameter_out;
999         vars[0].size = sizeof(omp_sched_t);
1000         vars[0].count = 1;
1001         vars[0].ptr = kind;
1002 
1003         vars[1].type.src = c_data;
1004         vars[1].type.dst = c_data;
1005         vars[1].direction.bits = c_parameter_out;
1006         vars[1].size = sizeof(int);
1007         vars[1].count = 1;
1008         vars[1].ptr = modifier;
1009 
1010         OFFLOAD_OFFLOAD(ofld, "omp_get_schedule_target",
1011                         0, 2, vars, NULL, 0, 0, 0);
1012     }
1013 }
1014 
1015 // lock API functions
1016 
omp_init_lock_target(TARGET_TYPE target_type,int target_number,omp_lock_target_t * lock)1017 void omp_init_lock_target(
1018     TARGET_TYPE target_type,
1019     int target_number,
1020     omp_lock_target_t *lock
1021 )
1022 {
1023     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
1024                                           __func__, 0);
1025     if (ofld != 0) {
1026         VarDesc vars[1] = {0};
1027 
1028         vars[0].type.src = c_data;
1029         vars[0].type.dst = c_data;
1030         vars[0].direction.bits = c_parameter_out;
1031         vars[0].size = sizeof(omp_lock_target_t);
1032         vars[0].count = 1;
1033         vars[0].ptr = lock;
1034 
1035         OFFLOAD_OFFLOAD(ofld, "omp_init_lock_target",
1036                         0, 1, vars, NULL, 0, 0, 0);
1037     }
1038 }
1039 
omp_destroy_lock_target(TARGET_TYPE target_type,int target_number,omp_lock_target_t * lock)1040 void omp_destroy_lock_target(
1041     TARGET_TYPE target_type,
1042     int target_number,
1043     omp_lock_target_t *lock
1044 )
1045 {
1046     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
1047                                           __func__, 0);
1048     if (ofld != 0) {
1049         VarDesc vars[1] = {0};
1050 
1051         vars[0].type.src = c_data;
1052         vars[0].type.dst = c_data;
1053         vars[0].direction.bits = c_parameter_in;
1054         vars[0].size = sizeof(omp_lock_target_t);
1055         vars[0].count = 1;
1056         vars[0].ptr = lock;
1057 
1058         OFFLOAD_OFFLOAD(ofld, "omp_destroy_lock_target",
1059                         0, 1, vars, NULL, 0, 0, 0);
1060     }
1061 }
1062 
omp_set_lock_target(TARGET_TYPE target_type,int target_number,omp_lock_target_t * lock)1063 void omp_set_lock_target(
1064     TARGET_TYPE target_type,
1065     int target_number,
1066     omp_lock_target_t *lock
1067 )
1068 {
1069     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
1070                                           __func__, 0);
1071     if (ofld != 0) {
1072         VarDesc vars[1] = {0};
1073 
1074         vars[0].type.src = c_data;
1075         vars[0].type.dst = c_data;
1076         vars[0].direction.bits = c_parameter_inout;
1077         vars[0].size = sizeof(omp_lock_target_t);
1078         vars[0].count = 1;
1079         vars[0].ptr = lock;
1080 
1081         OFFLOAD_OFFLOAD(ofld, "omp_set_lock_target",
1082                         0, 1, vars, NULL, 0, 0, 0);
1083     }
1084 }
1085 
omp_unset_lock_target(TARGET_TYPE target_type,int target_number,omp_lock_target_t * lock)1086 void omp_unset_lock_target(
1087     TARGET_TYPE target_type,
1088     int target_number,
1089     omp_lock_target_t *lock
1090 )
1091 {
1092     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
1093                                           __func__, 0);
1094     if (ofld != 0) {
1095         VarDesc vars[1] = {0};
1096 
1097         vars[0].type.src = c_data;
1098         vars[0].type.dst = c_data;
1099         vars[0].direction.bits = c_parameter_inout;
1100         vars[0].size = sizeof(omp_lock_target_t);
1101         vars[0].count = 1;
1102         vars[0].ptr = lock;
1103 
1104         OFFLOAD_OFFLOAD(ofld, "omp_unset_lock_target",
1105                         0, 1, vars, NULL, 0, 0, 0);
1106     }
1107 }
1108 
omp_test_lock_target(TARGET_TYPE target_type,int target_number,omp_lock_target_t * lock)1109 int omp_test_lock_target(
1110     TARGET_TYPE target_type,
1111     int target_number,
1112     omp_lock_target_t *lock
1113 )
1114 {
1115     int result = 0;
1116 
1117     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
1118                                           __func__, 0);
1119     if (ofld != 0) {
1120         VarDesc vars[2] = {0};
1121 
1122         vars[0].type.src = c_data;
1123         vars[0].type.dst = c_data;
1124         vars[0].direction.bits = c_parameter_inout;
1125         vars[0].size = sizeof(omp_lock_target_t);
1126         vars[0].count = 1;
1127         vars[0].ptr = lock;
1128 
1129         vars[1].type.src = c_data;
1130         vars[1].type.dst = c_data;
1131         vars[1].direction.bits = c_parameter_out;
1132         vars[1].size = sizeof(int);
1133         vars[1].count = 1;
1134         vars[1].ptr = &result;
1135 
1136         OFFLOAD_OFFLOAD(ofld, "omp_test_lock_target",
1137                         0, 2, vars, NULL, 0, 0, 0);
1138     }
1139     return result;
1140 }
1141 
1142 // nested lock API functions
1143 
omp_init_nest_lock_target(TARGET_TYPE target_type,int target_number,omp_nest_lock_target_t * lock)1144 void omp_init_nest_lock_target(
1145     TARGET_TYPE target_type,
1146     int target_number,
1147     omp_nest_lock_target_t *lock
1148 )
1149 {
1150     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
1151                                           __func__, 0);
1152     if (ofld != 0) {
1153         VarDesc vars[1] = {0};
1154 
1155         vars[0].type.src = c_data;
1156         vars[0].type.dst = c_data;
1157         vars[0].direction.bits = c_parameter_out;
1158         vars[0].size = sizeof(omp_nest_lock_target_t);
1159         vars[0].count = 1;
1160         vars[0].ptr = lock;
1161 
1162         OFFLOAD_OFFLOAD(ofld, "omp_init_nest_lock_target",
1163                         0, 1, vars, NULL, 0, 0, 0);
1164     }
1165 }
1166 
omp_destroy_nest_lock_target(TARGET_TYPE target_type,int target_number,omp_nest_lock_target_t * lock)1167 void omp_destroy_nest_lock_target(
1168     TARGET_TYPE target_type,
1169     int target_number,
1170     omp_nest_lock_target_t *lock
1171 )
1172 {
1173     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
1174                                           __func__, 0);
1175     if (ofld != 0) {
1176         VarDesc vars[1] = {0};
1177 
1178         vars[0].type.src = c_data;
1179         vars[0].type.dst = c_data;
1180         vars[0].direction.bits = c_parameter_in;
1181         vars[0].size = sizeof(omp_nest_lock_target_t);
1182         vars[0].count = 1;
1183         vars[0].ptr = lock;
1184 
1185         OFFLOAD_OFFLOAD(ofld, "omp_destroy_nest_lock_target",
1186                         0, 1, vars, NULL, 0, 0, 0);
1187     }
1188 }
1189 
omp_set_nest_lock_target(TARGET_TYPE target_type,int target_number,omp_nest_lock_target_t * lock)1190 void omp_set_nest_lock_target(
1191     TARGET_TYPE target_type,
1192     int target_number,
1193     omp_nest_lock_target_t *lock
1194 )
1195 {
1196     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
1197                                           __func__, 0);
1198     if (ofld != 0) {
1199         VarDesc vars[1] = {0};
1200 
1201         vars[0].type.src = c_data;
1202         vars[0].type.dst = c_data;
1203         vars[0].direction.bits = c_parameter_inout;
1204         vars[0].size = sizeof(omp_nest_lock_target_t);
1205         vars[0].count = 1;
1206         vars[0].ptr = lock;
1207 
1208         OFFLOAD_OFFLOAD(ofld, "omp_set_nest_lock_target",
1209                         0, 1, vars, NULL, 0, 0, 0);
1210     }
1211 }
1212 
omp_unset_nest_lock_target(TARGET_TYPE target_type,int target_number,omp_nest_lock_target_t * lock)1213 void omp_unset_nest_lock_target(
1214     TARGET_TYPE target_type,
1215     int target_number,
1216     omp_nest_lock_target_t *lock
1217 )
1218 {
1219     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
1220                                           __func__, 0);
1221     if (ofld != 0) {
1222         VarDesc vars[1] = {0};
1223 
1224         vars[0].type.src = c_data;
1225         vars[0].type.dst = c_data;
1226         vars[0].direction.bits = c_parameter_inout;
1227         vars[0].size = sizeof(omp_nest_lock_target_t);
1228         vars[0].count = 1;
1229         vars[0].ptr = lock;
1230 
1231         OFFLOAD_OFFLOAD(ofld, "omp_unset_nest_lock_target",
1232                         0, 1, vars, NULL, 0, 0, 0);
1233     }
1234 }
1235 
omp_test_nest_lock_target(TARGET_TYPE target_type,int target_number,omp_nest_lock_target_t * lock)1236 int omp_test_nest_lock_target(
1237     TARGET_TYPE target_type,
1238     int target_number,
1239     omp_nest_lock_target_t *lock
1240 )
1241 {
1242     int result = 0;
1243 
1244     OFFLOAD ofld = OFFLOAD_TARGET_ACQUIRE(target_type, target_number, 0, NULL,
1245                                           __func__, 0);
1246     if (ofld != 0) {
1247         VarDesc vars[2] = {0};
1248 
1249         vars[0].type.src = c_data;
1250         vars[0].type.dst = c_data;
1251         vars[0].direction.bits = c_parameter_inout;
1252         vars[0].size = sizeof(omp_nest_lock_target_t);
1253         vars[0].count = 1;
1254         vars[0].ptr = lock;
1255 
1256         vars[1].type.src = c_data;
1257         vars[1].type.dst = c_data;
1258         vars[1].direction.bits = c_parameter_out;
1259         vars[1].size = sizeof(int);
1260         vars[1].count = 1;
1261         vars[1].ptr = &result;
1262 
1263         OFFLOAD_OFFLOAD(ofld, "omp_test_nest_lock_target",
1264                         0, 2, vars, NULL, 0, 0, 0);
1265     }
1266     return result;
1267 }
1268