/* pocl-cuda.c - driver for CUDA devices

   Copyright (c) 2016-2017 James Price / University of Bristol

   Permission is hereby granted, free of charge, to any person obtaining a copy
   of this software and associated documentation files (the "Software"), to
   deal
   in the Software without restriction, including without limitation the rights
   to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
   copies of the Software, and to permit persons to whom the Software is
   furnished to do so, subject to the following conditions:

   The above copyright notice and this permission notice shall be included in
   all copies or substantial portions of the Software.

   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   FROM,
   OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
   THE SOFTWARE.
*/

#include "config.h"

#include "common.h"
#include "common_driver.h"
#include "devices.h"
#include "pocl.h"
#include "pocl-cuda.h"
#include "pocl-ptx-gen.h"
#include "pocl_cache.h"
#include "pocl_file_util.h"
#include "pocl_llvm.h"
#include "pocl_mem_management.h"
#include "pocl_runtime_config.h"
#include "pocl_timing.h"
#include "pocl_util.h"

#include <string.h>

#include <cuda.h>
#include <cuda_runtime.h>

typedef struct pocl_cuda_device_data_s
{
  CUdevice device;
  CUcontext context;
  CUevent epoch_event;
  cl_ulong epoch;
  char libdevice[PATH_MAX];
  pocl_lock_t compile_lock;
} pocl_cuda_device_data_t;

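/* Data associated with a command queue.  When queue threads are enabled,
   commands are appended to pending_queue and issued to the CUDA stream by
   the submit thread; once issued they move to running_queue, where the
   finalize thread waits on their completion events. */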
typedef struct pocl_cuda_queue_data_s
{
  CUstream stream;
  int use_threads;
  pthread_t submit_thread;
  pthread_t finalize_thread;
  pthread_mutex_t lock;
  pthread_cond_t pending_cond;
  pthread_cond_t running_cond;
  _cl_command_node *volatile pending_queue;
  _cl_command_node *volatile running_queue;
  cl_command_queue queue;
} pocl_cuda_queue_data_t;

typedef struct pocl_cuda_kernel_data_s
{
  CUmodule module;
  CUmodule module_offsets;
  CUfunction kernel;
  CUfunction kernel_offsets;
  size_t *alignments;
} pocl_cuda_kernel_data_t;

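/* Data associated with an event.  The start/end CUDA events provide
   profiling timestamps and cross-queue synchronization; ext_event_flag is
   a host-mapped word the stream can wait on when a command depends on
   non-CUDA events (e.g. user events). */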
typedef struct pocl_cuda_event_data_s
{
  CUevent start;
  CUevent end;
  volatile int events_ready;
  cl_int *ext_event_flag;
  pthread_cond_t event_cond;
  volatile unsigned num_ext_events;
} pocl_cuda_event_data_t;

extern unsigned int pocl_num_devices;

void *pocl_cuda_submit_thread (void *);
void *pocl_cuda_finalize_thread (void *);

static void
pocl_cuda_abort_on_error (CUresult result, unsigned line, const char *func,
                          const char *code, const char *api)
{
  if (result != CUDA_SUCCESS)
    {
      const char *err_name;
      const char *err_string;
      cuGetErrorName (result, &err_name);
      cuGetErrorString (result, &err_string);
      POCL_MSG_PRINT2 (CUDA, func, line, "Error during %s\n", api);
      POCL_ABORT ("%s: %s\n", err_name, err_string);
    }
}

static int
pocl_cuda_error (CUresult result, unsigned line, const char *func,
                 const char *code, const char *api)
{
  int err = (result != CUDA_SUCCESS);
  if (err)
    {
      const char *err_name;
      const char *err_string;
      cuGetErrorName (result, &err_name);
      cuGetErrorString (result, &err_string);
      POCL_MSG_ERR ("CUDA error during %s. %s: %s\n", api, err_name, err_string);
    }
  return err;
}

#define CUDA_CHECK(result, api)                                               \
  pocl_cuda_abort_on_error (result, __LINE__, __FUNCTION__, #result, api)

#define CUDA_CHECK_ERROR(result, api)                                         \
  pocl_cuda_error (result, __LINE__, __FUNCTION__, #result, api)
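
/* CUDA_CHECK aborts on any CUDA API failure, while CUDA_CHECK_ERROR logs
   the failure and evaluates to non-zero so the caller can recover, e.g.:

     CUresult res = cuCtxSetCurrent (ctx);
     if (CUDA_CHECK_ERROR (res, "cuCtxSetCurrent"))
       return CL_OUT_OF_RESOURCES;
*/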

cl_int pocl_cuda_handle_cl_nv_device_attribute_query(cl_device_id   device,
                                                     cl_device_info param_name,
                                                     size_t         param_value_size,
                                                     void *         param_value,
                                                     size_t *       param_value_size_ret)
{
  CUdevice cudaDev = ((pocl_cuda_device_data_t *)device->data)->device;
  unsigned int value;
  CUresult res;

  switch(param_name) {
    case CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_uint, value);
    case CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_uint, value);
    case CL_DEVICE_REGISTERS_PER_BLOCK_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_uint, value);
    case CL_DEVICE_WARP_SIZE_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_uint, value);
    case CL_DEVICE_GPU_OVERLAP_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_bool, value);
    case CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_bool, value);
    case CL_DEVICE_INTEGRATED_MEMORY_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_INTEGRATED, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_bool, value);
    case CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_uint, value);
    case CL_DEVICE_PCI_BUS_ID_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_uint, value);
    case CL_DEVICE_PCI_SLOT_ID_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_uint, value);
    case CL_DEVICE_PCI_DOMAIN_ID_NV:
      res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cudaDev);
      CUDA_CHECK(res, "cuDeviceGetAttribute");
      POCL_RETURN_GETINFO(cl_uint, value);
    default:
      return CL_INVALID_VALUE;
  }
}

void
pocl_cuda_init_device_ops (struct pocl_device_ops *ops)
{
  ops->device_name = "CUDA";
  ops->build_hash = pocl_cuda_build_hash;
  ops->probe = pocl_cuda_probe;
  ops->uninit = pocl_cuda_uninit;
  ops->reinit = NULL;
  ops->init = pocl_cuda_init;
  ops->init_queue = pocl_cuda_init_queue;
  ops->free_queue = pocl_cuda_free_queue;

  ops->alloc_mem_obj = pocl_cuda_alloc_mem_obj;
  ops->free = pocl_cuda_free;

  ops->submit = pocl_cuda_submit;
  ops->notify = pocl_cuda_notify;
  ops->broadcast = pocl_broadcast;
  ops->wait_event = pocl_cuda_wait_event;
  ops->update_event = pocl_cuda_update_event;
  ops->free_event_data = pocl_cuda_free_event_data;
  ops->join = pocl_cuda_join;
  ops->flush = pocl_cuda_flush;
  ops->init_build = pocl_cuda_init_build;
  // TODO
  ops->notify_event_finished = pocl_cuda_notify_event_finished;

  ops->get_device_info_ext = pocl_cuda_handle_cl_nv_device_attribute_query;
  ops->build_source = pocl_driver_build_source;
  ops->link_program = pocl_driver_link_program;
  ops->build_binary = pocl_driver_build_binary;
  ops->free_program = pocl_driver_free_program;
  ops->setup_metadata = pocl_driver_setup_metadata;
  ops->supports_binary = pocl_driver_supports_binary;
  ops->build_poclbinary = pocl_driver_build_poclbinary;
  ops->compile_kernel = pocl_cuda_compile_kernel;

  // TODO
  ops->get_mapping_ptr = pocl_driver_get_mapping_ptr;
  ops->free_mapping_ptr = pocl_driver_free_mapping_ptr;

  ops->can_migrate_d2d = NULL;
  ops->migrate_d2d = NULL;
  ops->read = NULL;
  ops->read_rect = NULL;
  ops->write = NULL;
  ops->write_rect = NULL;
  ops->copy = NULL;
  ops->copy_rect = NULL;
  ops->map_mem = NULL;
  ops->unmap_mem = NULL;
  ops->run = NULL;
}

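/* Initializes a single CUDA device: fills in the cl_device_id properties
   from the corresponding CUDA device attributes, creates the driver
   context, and records the epoch event used as the zero point for
   profiling timestamps. */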
cl_int
pocl_cuda_init (unsigned j, cl_device_id dev, const char *parameters)
{
  CUresult result;
  int ret = CL_SUCCESS;

  if (dev->data)
    return ret;

  pocl_init_default_device_infos (dev);
  dev->extensions = CUDA_DEVICE_EXTENSIONS;

  dev->vendor = "NVIDIA Corporation";
  dev->vendor_id = 0x10de; /* the PCI ID for NVIDIA */

  dev->type = CL_DEVICE_TYPE_GPU;
  dev->address_bits = (sizeof (void *) * 8);

  dev->llvm_target_triplet = (sizeof (void *) == 8) ? "nvptx64" : "nvptx";

  dev->spmd = CL_TRUE;
  dev->workgroup_pass = CL_FALSE;
  dev->execution_capabilities = CL_EXEC_KERNEL;

  dev->global_as_id = 1;
  dev->local_as_id = 3;
  dev->constant_as_id = 1;

  /* TODO: Get images working */
  dev->image_support = CL_FALSE;

  dev->autolocals_to_args
      = POCL_AUTOLOCALS_TO_ARGS_ONLY_IF_DYNAMIC_LOCALS_PRESENT;

  dev->has_64bit_long = 1;

  pocl_cuda_device_data_t *data = calloc (1, sizeof (pocl_cuda_device_data_t));
  result = cuDeviceGet (&data->device, j);
  if (CUDA_CHECK_ERROR (result, "cuDeviceGet"))
    ret = CL_INVALID_DEVICE;

  /* Get specific device name */
  dev->long_name = dev->short_name = calloc (256, sizeof (char));

  if (ret != CL_INVALID_DEVICE)
    cuDeviceGetName (dev->long_name, 256, data->device);
  else
    snprintf (dev->long_name, 255, "Unavailable CUDA device #%d", j);

  SETUP_DEVICE_CL_VERSION (CUDA_DEVICE_CL_VERSION_MAJOR,
                           CUDA_DEVICE_CL_VERSION_MINOR);

  /* Get other device properties */
  if (ret != CL_INVALID_DEVICE)
    {
      /* CUDA device attributes (as fetched by cuDeviceGetAttribute) are
       * always integers, whereas the OpenCL counterparts use a variety of
       * integer types.  Fetch each value into an int and copy it over.
       * We also OR together all return values of cuDeviceGetAttribute and
       * only check the combined result at the end.  This loses the exact
       * attribute fetch that failed, but is faster than checking after
       * each one.
       */
      int value = 0;
#define GET_CU_PROP(key, target) do { \
  result |= cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_##key, data->device); \
  target = value; \
} while (0)

      GET_CU_PROP (MAX_THREADS_PER_BLOCK, dev->max_work_group_size);
      GET_CU_PROP (MAX_BLOCK_DIM_X, dev->max_work_item_sizes[0]);
      GET_CU_PROP (MAX_BLOCK_DIM_Y, dev->max_work_item_sizes[1]);
      GET_CU_PROP (MAX_BLOCK_DIM_Z, dev->max_work_item_sizes[2]);
      GET_CU_PROP (MAX_SHARED_MEMORY_PER_BLOCK, dev->local_mem_size);
      GET_CU_PROP (MULTIPROCESSOR_COUNT, dev->max_compute_units);
      GET_CU_PROP (ECC_ENABLED, dev->error_correction_support);
      GET_CU_PROP (INTEGRATED, dev->host_unified_memory);
      GET_CU_PROP (TOTAL_CONSTANT_MEMORY, dev->max_constant_buffer_size);
      GET_CU_PROP (CLOCK_RATE, dev->max_clock_frequency);
      dev->max_clock_frequency /= 1000;
      GET_CU_PROP (TEXTURE_ALIGNMENT, dev->mem_base_addr_align);
    }
  if (CUDA_CHECK_ERROR (result, "cuDeviceGetAttribute"))
    ret = CL_INVALID_DEVICE;

  dev->preferred_wg_size_multiple = 32;
  dev->preferred_vector_width_char = 1;
  dev->preferred_vector_width_short = 1;
  dev->preferred_vector_width_int = 1;
  dev->preferred_vector_width_long = 1;
  dev->preferred_vector_width_float = 1;
  dev->preferred_vector_width_double = 1;
  dev->preferred_vector_width_half = 0;
  dev->native_vector_width_char = 1;
  dev->native_vector_width_short = 1;
  dev->native_vector_width_int = 1;
  dev->native_vector_width_long = 1;
  dev->native_vector_width_float = 1;
  dev->native_vector_width_double = 1;
  dev->native_vector_width_half = 0;

  dev->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
                          | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
                          | CL_FP_DENORM;
  dev->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
                          | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
                          | CL_FP_DENORM;

  dev->local_mem_type = CL_LOCAL;

  /* Get GPU architecture name */
  int sm_maj = 0, sm_min = 0;
  if (ret != CL_INVALID_DEVICE)
    {
      cuDeviceGetAttribute (&sm_maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                            data->device);
      cuDeviceGetAttribute (&sm_min, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                            data->device);
    }
  char *gpu_arch = calloc (16, sizeof (char));
  snprintf (gpu_arch, 16, "sm_%d%d", sm_maj, sm_min);
  dev->llvm_cpu = pocl_get_string_option ("POCL_CUDA_GPU_ARCH", gpu_arch);
  POCL_MSG_PRINT_INFO ("[CUDA] GPU architecture = %s\n", dev->llvm_cpu);

  /* Find libdevice library */
  if (findLibDevice (data->libdevice, dev->llvm_cpu))
    {
      if (ret != CL_INVALID_DEVICE)
        {
          POCL_MSG_ERR ("[CUDA] failed to find libdevice library\n");
          dev->compiler_available = dev->linker_available = 0;
        }
    }

  dev->device_side_printf = 0;

  /* Create context */
  if (ret != CL_INVALID_DEVICE)
    {
      result = cuCtxCreate (&data->context, CU_CTX_MAP_HOST, data->device);
      if (CUDA_CHECK_ERROR (result, "cuCtxCreate"))
        ret = CL_INVALID_DEVICE;
    }

  /* Create epoch event for timing info */
  if (ret != CL_INVALID_DEVICE)
    {
      result = cuEventCreate (&data->epoch_event, CU_EVENT_DEFAULT);
      CUDA_CHECK_ERROR (result, "cuEventCreate");

      data->epoch = pocl_gettimemono_ns ();

      result = cuEventRecord (data->epoch_event, 0);
      result = cuEventSynchronize (data->epoch_event);
      if (CUDA_CHECK_ERROR (result, "cuEventSynchronize"))
        ret = CL_INVALID_DEVICE;
    }

  /* Get global memory size */
  size_t memfree = 0, memtotal = 0;
  if (ret != CL_INVALID_DEVICE)
    result = cuMemGetInfo (&memfree, &memtotal);
  dev->max_mem_alloc_size = max (memtotal / 4, 128 * 1024 * 1024);
  dev->global_mem_size = memtotal;

  dev->data = data;

  POCL_INIT_LOCK (data->compile_lock);
  return ret;
}

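/* Creates a non-blocking CUDA stream for the queue and, when queue
   threads are enabled (see POCL_CUDA_DISABLE_QUEUE_THREADS), spawns the
   submit/finalize thread pair that services it. */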
cl_int
pocl_cuda_init_queue (cl_device_id device, cl_command_queue queue)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)queue->device->data)->context);

  pocl_cuda_queue_data_t *queue_data
      = calloc (1, sizeof (pocl_cuda_queue_data_t));
  queue->data = queue_data;
  queue_data->queue = queue;

  CUresult result
      = cuStreamCreate (&queue_data->stream, CU_STREAM_NON_BLOCKING);
  if (CUDA_CHECK_ERROR (result, "cuStreamCreate"))
    return CL_OUT_OF_RESOURCES;

  queue_data->use_threads
      = !pocl_get_bool_option ("POCL_CUDA_DISABLE_QUEUE_THREADS", 1);

  if (queue_data->use_threads)
    {
      pthread_mutex_init (&queue_data->lock, NULL);
      pthread_cond_init (&queue_data->pending_cond, NULL);
      pthread_cond_init (&queue_data->running_cond, NULL);
      int err = pthread_create (&queue_data->submit_thread, NULL,
                                pocl_cuda_submit_thread, queue_data);
      if (err)
        {
          POCL_MSG_ERR ("[CUDA] Error creating submit thread: %d\n", err);
          return CL_OUT_OF_RESOURCES;
        }

      err = pthread_create (&queue_data->finalize_thread, NULL,
                            pocl_cuda_finalize_thread, queue_data);
      if (err)
        {
          POCL_MSG_ERR ("[CUDA] Error creating finalize thread: %d\n", err);
          return CL_OUT_OF_RESOURCES;
        }
    }

  return CL_SUCCESS;
}

int
pocl_cuda_free_queue (cl_device_id device, cl_command_queue queue)
{
  pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)queue->data;

  cuCtxSetCurrent (((pocl_cuda_device_data_t *)queue->device->data)->context);
  cuStreamDestroy (queue_data->stream);

  assert (queue_data->pending_queue == NULL);
  assert (queue_data->running_queue == NULL);

  /* Kill queue threads */
  if (queue_data->use_threads)
    {
      pthread_mutex_lock (&queue_data->lock);
      queue_data->queue = NULL;
      pthread_cond_signal (&queue_data->pending_cond);
      pthread_cond_signal (&queue_data->running_cond);
      pthread_mutex_unlock (&queue_data->lock);
      pthread_join (queue_data->submit_thread, NULL);
      pthread_join (queue_data->finalize_thread, NULL);
    }
  return CL_SUCCESS;
}

char *
pocl_cuda_build_hash (cl_device_id device)
{
  char *res = calloc (1000, sizeof (char));
  snprintf (res, 1000, "CUDA-%s", device->llvm_cpu);
  return res;
}

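/* Returns the number of CUDA devices to expose, clamped to the count
   requested through the POCL_DEVICES environment mechanism; aborts if
   more devices were requested than the driver reports. */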
unsigned int
pocl_cuda_probe (struct pocl_device_ops *ops)
{
  int env_count = pocl_device_get_env_count (ops->device_name);

  int probe_count = 0;
  CUresult ret = cuInit (0);
  if (ret == CUDA_SUCCESS)
    {
      ret = cuDeviceGetCount (&probe_count);
      if (ret != CUDA_SUCCESS)
        probe_count = 0;
    }

  /* If the user requested a specific number of CUDA devices,
   * pretend we only have that many, if we can. If they requested
   * more than there are, abort informing the user of the issue.
   */
  if (env_count >= 0)
    {
      if (env_count > probe_count)
        POCL_ABORT ("[CUDA] %d devices requested, but only %d are available\n",
          env_count, probe_count);
      probe_count = env_count;
    }

  return probe_count;
}

cl_int
pocl_cuda_uninit (unsigned j, cl_device_id device)
{
  pocl_cuda_device_data_t *data = device->data;

  if (device->available)
    cuCtxDestroy (data->context);

  POCL_MEM_FREE (data);
  device->data = NULL;

  POCL_MEM_FREE (device->long_name);
  return CL_SUCCESS;
}

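/* Allocates (or registers) the device memory backing a cl_mem.
   CL_MEM_USE_HOST_PTR buffers are page-locked in place with
   cuMemHostRegister (except on ARM, where a separate device allocation is
   kept in sync by explicit copies); CL_MEM_ALLOC_HOST_PTR buffers get
   pinned host-visible memory from cuMemHostAlloc; anything else is a
   plain cuMemAlloc. */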
cl_int
pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem, void *host_ptr)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
  pocl_mem_identifier *p = &mem->device_ptrs[device->global_mem_id];
  int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;

  CUresult result;
  void *b = NULL;

  p->extra_ptr = NULL;
  p->version = 0;
  cl_mem_flags flags = mem->flags;

  if (flags & CL_MEM_USE_HOST_PTR)
    {
#if defined __arm__
      /* cuMemHostRegister is not supported on ARM.
       * Allocate device memory and perform explicit copies
       * before and after running a kernel */
      result = cuMemAlloc ((CUdeviceptr *)&b, mem->size);
      CUDA_CHECK (result, "cuMemAlloc");
#else
      POCL_RETURN_ERROR_ON ((pocl_alloc_or_retain_mem_host_ptr (mem) != 0),
                            CL_OUT_OF_HOST_MEMORY,
                            "Cannot allocate backing memory!\n");

      result = cuMemHostRegister (mem->mem_host_ptr, mem->size,
                                  CU_MEMHOSTREGISTER_DEVICEMAP);
      if (result != CUDA_SUCCESS
          && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
        CUDA_CHECK (result, "cuMemHostRegister");
      result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, mem->mem_host_ptr,
                                          0);
      CUDA_CHECK (result, "cuMemHostGetDevicePointer");

      /* TODO: can we assume cuMemHostRegister copies
       * the content of host memory to the device? For now, let's not. */
      p->version = 0;
#endif
    }
  /* preallocate host visible memory */
  else if ((flags & CL_MEM_ALLOC_HOST_PTR) && (mem->mem_host_ptr == NULL))
    {
      result = cuMemHostAlloc (&p->extra_ptr, mem->size,
                               CU_MEMHOSTREGISTER_DEVICEMAP);
      CUDA_CHECK (result, "cuMemHostAlloc");
      result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, p->extra_ptr, 0);
      CUDA_CHECK (result, "cuMemHostGetDevicePointer");
      mem->mem_host_ptr = p->extra_ptr;
      mem->mem_host_ptr_refcount = 1;
      mem->mem_host_ptr_version = 0;

      if (flags & CL_MEM_COPY_HOST_PTR)
        {
          result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem->size);
          CUDA_CHECK (result, "cuMemcpyHtoD");

          result = cuStreamSynchronize (0);
          CUDA_CHECK (result, "cuStreamSynchronize");

          mem->mem_host_ptr_version = 1;
          mem->latest_version = 1;
          p->version = 1;
        }
    }
  else
    {
      result = cuMemAlloc ((CUdeviceptr *)&b, mem->size);
      if (result != CUDA_SUCCESS)
        {
          const char *err;
          cuGetErrorName (result, &err);
          POCL_MSG_PRINT2 (CUDA, __FUNCTION__, __LINE__,
                           "-> Failed to allocate memory: %s\n", err);
          return CL_MEM_OBJECT_ALLOCATION_FAILURE;
        }
    }
  p->mem_ptr = b;
  err = CL_SUCCESS;

  return err;
}

void
pocl_cuda_free (cl_device_id device, cl_mem mem_obj)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
  pocl_mem_identifier *p = &mem_obj->device_ptrs[device->global_mem_id];

  if (mem_obj->flags & CL_MEM_USE_HOST_PTR)
    {
#if defined __arm__
      cuMemFree ((CUdeviceptr)p->mem_ptr);
#else
      assert (p->extra_ptr == NULL);
      cuMemHostUnregister (mem_obj->mem_host_ptr);
#endif
    }
  else if (p->extra_ptr)
    {
      mem_obj->mem_host_ptr = NULL;
      mem_obj->mem_host_ptr_refcount = 0;
      mem_obj->mem_host_ptr_version = 0;
      cuMemFreeHost (p->extra_ptr);
      p->extra_ptr = NULL;
    }
  else
    {
      assert (p->extra_ptr == NULL);
      assert (p->mem_ptr != NULL);
      cuMemFree ((CUdeviceptr)p->mem_ptr);
    }
  p->mem_ptr = NULL;
  p->version = 0;
}

void
pocl_cuda_submit_read (CUstream stream, void *host_ptr, const void *device_ptr,
                       size_t offset, size_t cb)
{
  POCL_MSG_PRINT_CUDA ("cuMemcpyDtoHAsync %p -> %p / %zu B \n", device_ptr, host_ptr, cb);
  CUresult result = cuMemcpyDtoHAsync (
      host_ptr, (CUdeviceptr) (device_ptr + offset), cb, stream);
  CUDA_CHECK (result, "cuMemcpyDtoHAsync");
}

void
pocl_cuda_submit_memfill (CUstream stream, void *mem_ptr, size_t size_in_bytes,
                          size_t offset, const void *pattern,
                          size_t pattern_size)
{
  CUresult result;
  switch (pattern_size)
    {
    case 1:
      result
          = cuMemsetD8Async ((CUdeviceptr) (((char *)mem_ptr) + offset),
                             *(unsigned char *)pattern, size_in_bytes, stream);
      break;
    case 2:
      result = cuMemsetD16Async ((CUdeviceptr) (((char *)mem_ptr) + offset),
                                 *(unsigned short *)pattern, size_in_bytes / 2,
                                 stream);
      break;
    case 4:
      result = cuMemsetD32Async ((CUdeviceptr) (((char *)mem_ptr) + offset),
                                 *(unsigned int *)pattern, size_in_bytes / 4,
                                 stream);
      break;
    case 8:
    case 16:
    case 32:
    case 64:
    case 128:
      POCL_ABORT_UNIMPLEMENTED ("fill_kernel with pattern_size >= 8");
    default:
      POCL_ABORT ("unrecognized pattern_size");
    }
  CUDA_CHECK (result, "cuMemset*Async");
}

void
pocl_cuda_submit_write (CUstream stream, const void *host_ptr,
                        void *device_ptr, size_t offset, size_t cb)
{
  POCL_MSG_PRINT_CUDA ("cuMemcpyHtoDAsync %p -> %p / %zu B \n", host_ptr, device_ptr, cb);
  CUresult result = cuMemcpyHtoDAsync ((CUdeviceptr) (device_ptr + offset),
                                       host_ptr, cb, stream);
  CUDA_CHECK (result, "cuMemcpyHtoDAsync");
}

void
pocl_cuda_submit_copy (CUstream stream, void *__restrict__ src_mem_ptr,
                       size_t src_offset, void *__restrict__ dst_mem_ptr,
                       size_t dst_offset, size_t cb)
{
  void *src_ptr = src_mem_ptr + src_offset;
  void *dst_ptr = dst_mem_ptr + dst_offset;

  if (src_ptr == dst_ptr)
    return;

  CUresult result;
  POCL_MSG_PRINT_CUDA ("cuMemcpyDtoDAsync %p -> %p / %zu B \n", src_ptr, dst_ptr, cb);
  result = cuMemcpyDtoDAsync ((CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr,
                              cb, stream);
  CUDA_CHECK (result, "cuMemcpyDtoDAsync");
}

void
pocl_cuda_submit_read_rect (CUstream stream, void *__restrict__ const host_ptr,
                            void *__restrict__ const device_ptr,
                            const size_t *__restrict__ const buffer_origin,
                            const size_t *__restrict__ const host_origin,
                            const size_t *__restrict__ const region,
                            size_t const buffer_row_pitch,
                            size_t const buffer_slice_pitch,
                            size_t const host_row_pitch,
                            size_t const host_slice_pitch)
{
  CUDA_MEMCPY3D params = { 0 };

  POCL_MSG_PRINT_CUDA ("cuMemcpy3D / READ_RECT %p -> %p \n", device_ptr, host_ptr);

  params.WidthInBytes = region[0];
  params.Height = region[1];
  params.Depth = region[2];

  params.dstMemoryType = CU_MEMORYTYPE_HOST;
  params.dstHost = host_ptr;
  params.dstXInBytes = host_origin[0];
  params.dstY = host_origin[1];
  params.dstZ = host_origin[2];
  params.dstPitch = host_row_pitch;
  params.dstHeight = host_slice_pitch / host_row_pitch;

  params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
  params.srcDevice = (CUdeviceptr)device_ptr;
  params.srcXInBytes = buffer_origin[0];
  params.srcY = buffer_origin[1];
  params.srcZ = buffer_origin[2];
  params.srcPitch = buffer_row_pitch;
  params.srcHeight = buffer_slice_pitch / buffer_row_pitch;

  CUresult result = cuMemcpy3DAsync (&params, stream);
  CUDA_CHECK (result, "cuMemcpy3DAsync");
}

void
pocl_cuda_submit_write_rect (CUstream stream,
                             const void *__restrict__ const host_ptr,
                             void *__restrict__ const device_ptr,
                             const size_t *__restrict__ const buffer_origin,
                             const size_t *__restrict__ const host_origin,
                             const size_t *__restrict__ const region,
                             size_t const buffer_row_pitch,
                             size_t const buffer_slice_pitch,
                             size_t const host_row_pitch,
                             size_t const host_slice_pitch)
{
  CUDA_MEMCPY3D params = { 0 };

  POCL_MSG_PRINT_CUDA ("cuMemcpy3D / WRITE_RECT %p -> %p \n", host_ptr, device_ptr);

  params.WidthInBytes = region[0];
  params.Height = region[1];
  params.Depth = region[2];

  params.srcMemoryType = CU_MEMORYTYPE_HOST;
  params.srcHost = host_ptr;
  params.srcXInBytes = host_origin[0];
  params.srcY = host_origin[1];
  params.srcZ = host_origin[2];
  params.srcPitch = host_row_pitch;
  params.srcHeight = host_slice_pitch / host_row_pitch;

  params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
  params.dstDevice = (CUdeviceptr)device_ptr;
  params.dstXInBytes = buffer_origin[0];
  params.dstY = buffer_origin[1];
  params.dstZ = buffer_origin[2];
  params.dstPitch = buffer_row_pitch;
  params.dstHeight = buffer_slice_pitch / buffer_row_pitch;

  CUresult result = cuMemcpy3DAsync (&params, stream);
  CUDA_CHECK (result, "cuMemcpy3DAsync");
}

void
pocl_cuda_submit_copy_rect (CUstream stream,
                            cl_device_id dev,
                            void *src_ptr,
                            void *dst_ptr,
                            const size_t *__restrict__ const src_origin,
                            const size_t *__restrict__ const dst_origin,
                            const size_t *__restrict__ const region,
                            size_t const src_row_pitch,
                            size_t const src_slice_pitch,
                            size_t const dst_row_pitch,
                            size_t const dst_slice_pitch)
{
  CUDA_MEMCPY3D params = { 0 };

  POCL_MSG_PRINT_CUDA ("cuMemcpy3D / COPY_RECT %p -> %p \n", src_ptr, dst_ptr);

  params.WidthInBytes = region[0];
  params.Height = region[1];
  params.Depth = region[2];

  params.srcDevice = (CUdeviceptr)src_ptr;
  params.srcXInBytes = src_origin[0];
  params.srcY = src_origin[1];
  params.srcZ = src_origin[2];
  params.srcPitch = src_row_pitch;
  params.srcHeight = src_slice_pitch / src_row_pitch;

  params.dstDevice = (CUdeviceptr)dst_ptr;
  params.dstXInBytes = dst_origin[0];
  params.dstY = dst_origin[1];
  params.dstZ = dst_origin[2];
  params.dstPitch = dst_row_pitch;
  params.dstHeight = dst_slice_pitch / dst_row_pitch;

  params.srcMemoryType = params.dstMemoryType = CU_MEMORYTYPE_DEVICE;

  CUresult result = cuMemcpy3DAsync (&params, stream);
  CUDA_CHECK (result, "cuMemcpy3DAsync");
}

void
pocl_cuda_submit_map_mem (CUstream stream, cl_mem mem,
                          pocl_mem_identifier *p,
                          size_t offset, size_t size, void *host_ptr)
{
  assert (host_ptr != NULL);

  if ((mem->flags & CL_MEM_USE_HOST_PTR)
      || (p->extra_ptr))
    return;

  POCL_MSG_PRINT_CUDA ("cuMemcpyDtoHAsync %p / %zu B \n", host_ptr, size);

  void *buf_ptr = p->mem_ptr;

  CUresult result = cuMemcpyDtoHAsync (
      host_ptr, (CUdeviceptr) (buf_ptr + offset), size, stream);
  CUDA_CHECK (result, "cuMemcpyDtoHAsync");
}

void *
pocl_cuda_submit_unmap_mem (CUstream stream, pocl_mem_identifier *dst_mem_id,
                            size_t offset, size_t size, void *host_ptr,
                            cl_map_flags map_flags)
{
  /* Only copy back if mapped for writing */
  if (map_flags == CL_MAP_READ)
    return NULL;

  if (host_ptr)
    {
      CUresult result = cuMemcpyHtoDAsync (
          (CUdeviceptr) (dst_mem_id->mem_ptr + offset), host_ptr, size, stream);
      CUDA_CHECK (result, "cuMemcpyHtoDAsync");
    }
  return NULL;
}

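/* Returns the CUDA module/function pair for a kernel, compiling and
   caching it on first use.  A separate specialization is kept for
   launches with non-zero global work offsets, which take the offsets as
   extra trailing arguments. */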
static pocl_cuda_kernel_data_t *
load_or_generate_kernel (cl_kernel kernel, cl_device_id device,
                         int has_offsets, unsigned device_i,
                         _cl_command_node *command, int specialized)
{
  CUresult result;
  pocl_kernel_metadata_t *meta = kernel->meta;
  /* Check if we already have a compiled kernel function */
  pocl_cuda_kernel_data_t *kdata
      = (pocl_cuda_kernel_data_t *)meta->data[device_i];
  if (kdata)
    {
      if ((has_offsets && kdata->kernel_offsets)
          || (!has_offsets && kdata->kernel))
        return kdata;
    }
  else
    {
      /* TODO: when can we release this? */
      kdata = meta->data[device_i]
          = (void *)calloc (1, sizeof (pocl_cuda_kernel_data_t));
    }

  pocl_cuda_device_data_t *ddata = (pocl_cuda_device_data_t *)device->data;
  cuCtxSetCurrent (ddata->context);

  POCL_LOCK (ddata->compile_lock);

  /* Generate the parallel bitcode file linked with the kernel library */
  int error = pocl_llvm_generate_workgroup_function (device_i, device, kernel,
                                                     command, specialized);
  if (error)
    {
      POCL_MSG_PRINT_GENERAL ("pocl_llvm_generate_workgroup_function() failed"
                              " for kernel %s\n", kernel->name);
      assert (error == 0);
    }

  char bc_filename[POCL_FILENAME_LENGTH];
  pocl_cache_work_group_function_path (bc_filename, kernel->program, device_i,
                                       kernel, command, specialized);

  char ptx_filename[POCL_FILENAME_LENGTH];
  snprintf (ptx_filename, POCL_FILENAME_LENGTH, "%s.ptx", bc_filename);

  if (!pocl_exists (ptx_filename))
    {
      /* Generate PTX from LLVM bitcode */
      if (pocl_ptx_gen (bc_filename, ptx_filename, kernel->name,
                        device->llvm_cpu,
                        ((pocl_cuda_device_data_t *)device->data)->libdevice,
                        has_offsets))
        POCL_ABORT ("pocl-cuda: failed to generate PTX\n");
    }

  /* Load PTX module */
  /* TODO: When can we unload the module? */
  CUmodule module;
  result = cuModuleLoad (&module, ptx_filename);
  CUDA_CHECK (result, "cuModuleLoad");

  /* Get kernel function */
  CUfunction function;
  result = cuModuleGetFunction (&function, module, kernel->name);
  CUDA_CHECK (result, "cuModuleGetFunction");

  /* Get pointer alignments */
  if (!kdata->alignments)
    {
      kdata->alignments
          = calloc (meta->num_args + meta->num_locals + 4, sizeof (size_t));
      pocl_cuda_get_ptr_arg_alignment (bc_filename, kernel->name,
                                       kdata->alignments);
    }

  if (has_offsets)
    {
      kdata->module_offsets = module;
      kdata->kernel_offsets = function;
    }
  else
    {
      kdata->module = module;
      kdata->kernel = function;
    }

  POCL_UNLOCK (ddata->compile_lock);

  return kdata;
}

void
pocl_cuda_compile_kernel (_cl_command_node *cmd, cl_kernel kernel,
                          cl_device_id device, int specialize)
{
  load_or_generate_kernel (kernel, device, 0, cmd->device_i, cmd,
                           specialize);
}

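/* Builds the parameter array for a kernel launch: global buffers pass a
   pointer to their device address, __local arguments become byte offsets
   into the dynamic shared-memory block, __constant arguments are staged
   into the module's constant buffer, and the implicit work-dim (and, if
   needed, global offset) arguments are appended last. */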
void
pocl_cuda_submit_kernel (CUstream stream, _cl_command_node *cmd,
                         cl_device_id device, cl_event event)
{
  _cl_command_run run = cmd->command.run;
  cl_kernel kernel = run.kernel;
  pocl_argument *arguments = run.arguments;
  struct pocl_context pc = run.pc;
  pocl_kernel_metadata_t *meta = kernel->meta;

  /* Check if we need to handle global work offsets */
  int has_offsets =
    (pc.global_offset[0] || pc.global_offset[1] || pc.global_offset[2]);

  /* Get kernel function */
  pocl_cuda_kernel_data_t *kdata = load_or_generate_kernel (
      kernel, device, has_offsets, cmd->device_i, cmd, 1);
  CUmodule module = has_offsets ? kdata->module_offsets : kdata->module;
  CUfunction function = has_offsets ? kdata->kernel_offsets : kdata->kernel;

  /* Prepare kernel arguments */
  void *null = NULL;
  unsigned sharedMemBytes = 0;
  void *params[meta->num_args + meta->num_locals + 4];
  unsigned sharedMemOffsets[meta->num_args + meta->num_locals];
  unsigned constantMemBytes = 0;
  unsigned constantMemOffsets[meta->num_args];
  unsigned globalOffsets[3];

  /* Get handle to constant memory buffer */
  size_t constant_mem_size;
  CUdeviceptr constant_mem_base = 0;
  cuModuleGetGlobal (&constant_mem_base, &constant_mem_size, module,
                     "_constant_memory_region_");

  CUresult result;
  unsigned i;
  for (i = 0; i < meta->num_args; i++)
    {
      pocl_argument_type type = meta->arg_info[i].type;
      switch (type)
        {
        case POCL_ARG_TYPE_NONE:
          params[i] = arguments[i].value;
          break;
        case POCL_ARG_TYPE_POINTER:
          {
            if (ARG_IS_LOCAL (meta->arg_info[i]))
              {
                size_t size = arguments[i].size;
                size_t align = kdata->alignments[i];

                /* Pad offset to align memory */
                if (sharedMemBytes % align)
                  sharedMemBytes += align - (sharedMemBytes % align);

                sharedMemOffsets[i] = sharedMemBytes;
                params[i] = sharedMemOffsets + i;

                sharedMemBytes += size;
              }
            else if (meta->arg_info[i].address_qualifier
                     == CL_KERNEL_ARG_ADDRESS_CONSTANT)
              {
                assert (constant_mem_base);
                assert (arguments[i].is_svm == 0);

                /* Get device pointer */
                cl_mem mem = *(void **)arguments[i].value;
                CUdeviceptr src
                    = (CUdeviceptr)mem->device_ptrs[device->global_mem_id].mem_ptr
                      + arguments[i].offset;

                size_t align = kdata->alignments[i];
                if (constantMemBytes % align)
                  {
                    constantMemBytes += align - (constantMemBytes % align);
                  }

                /* Copy to constant buffer at current offset */
                result
                    = cuMemcpyDtoDAsync (constant_mem_base + constantMemBytes,
                                         src, mem->size, stream);
                CUDA_CHECK (result, "cuMemcpyDtoDAsync");

                constantMemOffsets[i] = constantMemBytes;
                params[i] = constantMemOffsets + i;

                constantMemBytes += mem->size;
              }
            else
              {
                assert (arguments[i].is_svm == 0);
                if (arguments[i].value)
                  {
                    cl_mem mem = *(void **)arguments[i].value;
                    params[i] = &mem->device_ptrs[device->global_mem_id].mem_ptr
                                + arguments[i].offset;

#if defined __arm__
                    /* On ARM with USE_HOST_PTR, perform explicit copy to
                     * device */
                    if (mem->flags & CL_MEM_USE_HOST_PTR)
                      {
                        cuMemcpyHtoD (*(CUdeviceptr *)(params[i]),
                                      mem->mem_host_ptr, mem->size);
                        cuStreamSynchronize (0);
                      }
#endif
                  }
                else
                  {
                    params[i] = &null;
                  }
              }
            break;
          }
        case POCL_ARG_TYPE_IMAGE:
        case POCL_ARG_TYPE_SAMPLER:
          POCL_ABORT ("Unhandled argument type for CUDA\n");
          break;
        }
    }

  if (constantMemBytes > constant_mem_size)
    POCL_ABORT ("[CUDA] Total constant buffer size %u exceeds %zu allocated\n",
                constantMemBytes, constant_mem_size);

  unsigned arg_index = meta->num_args;

  if (sharedMemBytes != 0)
    {
      /* Deal with automatic local allocations if there are local function args
       */
      /* TODO: Would be better to remove arguments and make these static GEPs
       */
      for (i = 0; i < meta->num_locals; ++i, ++arg_index)
        {
          size_t size = meta->local_sizes[i];
          size_t align = kdata->alignments[arg_index];

          /* Pad offset to align memory */
          if (sharedMemBytes % align)
            sharedMemBytes += align - (sharedMemBytes % align);

          sharedMemOffsets[arg_index] = sharedMemBytes;
          sharedMemBytes += size;
          params[arg_index] = sharedMemOffsets + arg_index;
        }
    }

  /* Add global work dimensionality */
  params[arg_index++] = &pc.work_dim;

  /* Add global offsets if necessary */
  if (has_offsets)
    {
      globalOffsets[0] = pc.global_offset[0];
      globalOffsets[1] = pc.global_offset[1];
      globalOffsets[2] = pc.global_offset[2];
      params[arg_index++] = globalOffsets + 0;
      params[arg_index++] = globalOffsets + 1;
      params[arg_index++] = globalOffsets + 2;
    }

  /* Launch kernel */
  result = cuLaunchKernel (function, pc.num_groups[0], pc.num_groups[1],
                           pc.num_groups[2], pc.local_size[0],
                           pc.local_size[1], pc.local_size[2], sharedMemBytes,
                           stream, params, NULL);
  CUDA_CHECK (result, "cuLaunchKernel");
}

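/* Issues a command node to the queue's CUDA stream: blocks the stream on
   the CUDA events of dependencies from other queues, arms a host-mapped
   flag for non-CUDA dependencies, records the start/end events, and
   dispatches on the command type. */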
void
pocl_cuda_submit_node (_cl_command_node *node, cl_command_queue cq, int locked)
{
  CUresult result;
  CUstream stream = ((pocl_cuda_queue_data_t *)cq->data)->stream;

  if (!locked)
    POCL_LOCK_OBJ (node->event);

  pocl_cuda_event_data_t *event_data
      = (pocl_cuda_event_data_t *)node->event->data;

  /* Process event dependencies */
  event_node *dep = NULL;
  LL_FOREACH (node->event->wait_list, dep)
    {
      /* If it is in the process of completing, just skip it */
      if (dep->event->status <= CL_COMPLETE)
        continue;

      /* Add CUDA event dependency */
      if (dep->event->command_type != CL_COMMAND_USER
          && dep->event->queue->device->ops == cq->device->ops)
        {
          /* Block stream on event, but only for different queues */
          if (dep->event->queue != node->event->queue)
            {
              pocl_cuda_event_data_t *dep_data
                  = (pocl_cuda_event_data_t *)dep->event->data;

              /* Wait until dependency has finished being submitted */
              while (!dep_data->events_ready)
                ;

              result = cuStreamWaitEvent (stream, dep_data->end, 0);
              CUDA_CHECK (result, "cuStreamWaitEvent");
            }
        }
      else
        {
          if (!((pocl_cuda_queue_data_t *)cq->data)->use_threads)
            POCL_ABORT (
                "Can't handle non-CUDA dependencies without queue threads\n");

          event_data->num_ext_events++;
        }
    }

  /* Wait on flag for external events */
  if (event_data->num_ext_events)
    {
      CUdeviceptr dev_ext_event_flag;
      result = cuMemHostAlloc ((void **)&event_data->ext_event_flag, 4,
                               CU_MEMHOSTALLOC_DEVICEMAP);
      CUDA_CHECK (result, "cuMemHostAlloc");

      *event_data->ext_event_flag = 0;

      result = cuMemHostGetDevicePointer (&dev_ext_event_flag,
                                          event_data->ext_event_flag, 0);
      CUDA_CHECK (result, "cuMemHostGetDevicePointer");
      result = cuStreamWaitValue32 (stream, dev_ext_event_flag, 1,
                                    CU_STREAM_WAIT_VALUE_GEQ);
      CUDA_CHECK (result, "cuStreamWaitValue32");
    }

  /* Create and record event for command start if profiling enabled */
  if (cq->properties & CL_QUEUE_PROFILING_ENABLE)
    {
      result = cuEventCreate (&event_data->start, CU_EVENT_DEFAULT);
      CUDA_CHECK (result, "cuEventCreate");
      result = cuEventRecord (event_data->start, stream);
      CUDA_CHECK (result, "cuEventRecord");
    }

  pocl_update_event_submitted (node->event);

  POCL_UNLOCK_OBJ (node->event);

  cl_event event = node->event;
  cl_device_id dev = node->device;
  _cl_command_t *cmd = &node->command;

  switch (node->type)
    {
    case CL_COMMAND_READ_BUFFER:
      pocl_cuda_submit_read (
          stream, cmd->read.dst_host_ptr, cmd->read.src_mem_id->mem_ptr,
          node->command.read.offset, node->command.read.size);
      break;
    case CL_COMMAND_WRITE_BUFFER:
      pocl_cuda_submit_write (
          stream, cmd->write.src_host_ptr, cmd->write.dst_mem_id->mem_ptr,
          node->command.write.offset, node->command.write.size);
      break;
    case CL_COMMAND_COPY_BUFFER:
      {
        pocl_cuda_submit_copy (
            stream, cmd->copy.src_mem_id->mem_ptr, cmd->copy.src_offset,
            cmd->copy.dst_mem_id->mem_ptr, cmd->copy.dst_offset, cmd->copy.size);
        break;
      }
    case CL_COMMAND_READ_BUFFER_RECT:
      pocl_cuda_submit_read_rect (
          stream,
          cmd->read_rect.dst_host_ptr,
          cmd->read_rect.src_mem_id->mem_ptr,
          cmd->read_rect.buffer_origin,
          cmd->read_rect.host_origin,
          cmd->read_rect.region,
          cmd->read_rect.buffer_row_pitch,
          cmd->read_rect.buffer_slice_pitch,
          cmd->read_rect.host_row_pitch,
          cmd->read_rect.host_slice_pitch);
      break;
    case CL_COMMAND_WRITE_BUFFER_RECT:
      pocl_cuda_submit_write_rect (
          stream,
          cmd->write_rect.src_host_ptr,
          cmd->write_rect.dst_mem_id->mem_ptr,
          cmd->write_rect.buffer_origin,
          cmd->write_rect.host_origin,
          cmd->write_rect.region,
          cmd->write_rect.buffer_row_pitch,
          cmd->write_rect.buffer_slice_pitch,
          cmd->write_rect.host_row_pitch,
          cmd->write_rect.host_slice_pitch);
      break;
    case CL_COMMAND_COPY_BUFFER_RECT:
      {
        pocl_cuda_submit_copy_rect (
          stream, dev,
          cmd->copy_rect.src_mem_id->mem_ptr,
          cmd->copy_rect.dst_mem_id->mem_ptr,
          cmd->copy_rect.src_origin,
          cmd->copy_rect.dst_origin,
          cmd->copy_rect.region,
          cmd->copy_rect.src_row_pitch,
          cmd->copy_rect.src_slice_pitch,
          cmd->copy_rect.dst_row_pitch,
          cmd->copy_rect.dst_slice_pitch);
        break;
      }
    case CL_COMMAND_MAP_BUFFER:
      {
        cl_mem buffer = event->mem_objs[0];
        pocl_cuda_submit_map_mem (
            stream, buffer, cmd->map.mem_id, cmd->map.mapping->offset,
            cmd->map.mapping->size, cmd->map.mapping->host_ptr);
        break;
      }
    case CL_COMMAND_UNMAP_MEM_OBJECT:
      {
        cl_mem buffer = event->mem_objs[0];
        assert (buffer->is_image == CL_FALSE);
        pocl_cuda_submit_unmap_mem (
            stream,
            cmd->unmap.mem_id,
            cmd->unmap.mapping->offset,
            cmd->unmap.mapping->size,
            cmd->unmap.mapping->host_ptr,
            cmd->unmap.mapping->map_flags);
        break;
      }
    case CL_COMMAND_NDRANGE_KERNEL:
      pocl_cuda_submit_kernel (stream, node, node->device, node->event);
      break;

    case CL_COMMAND_MIGRATE_MEM_OBJECTS:
      switch (cmd->migrate.type)
        {
        case ENQUEUE_MIGRATE_TYPE_D2H:
          {
            cl_mem mem = event->mem_objs[0];
            pocl_cuda_submit_read (stream, mem->mem_host_ptr,
                                   cmd->migrate.mem_id->mem_ptr, 0, mem->size);
            break;
          }
        case ENQUEUE_MIGRATE_TYPE_H2D:
          {
            cl_mem mem = event->mem_objs[0];
            pocl_cuda_submit_write (stream, mem->mem_host_ptr,
                                    cmd->migrate.mem_id->mem_ptr, 0,
                                    mem->size);
            break;
          }
        case ENQUEUE_MIGRATE_TYPE_D2D:
          {
            POCL_ABORT_UNIMPLEMENTED (
                "CUDA does not support D2D migration.\n");
          }
        case ENQUEUE_MIGRATE_TYPE_NOP:
          {
            break;
          }
        }
      break;

    case CL_COMMAND_MARKER:
    case CL_COMMAND_BARRIER:
      break;

    case CL_COMMAND_FILL_BUFFER:
      pocl_cuda_submit_memfill (stream, cmd->memfill.dst_mem_id->mem_ptr,
                                cmd->memfill.size, cmd->memfill.offset,
                                cmd->memfill.pattern,
                                cmd->memfill.pattern_size);
      break;
    case CL_COMMAND_READ_IMAGE:
    case CL_COMMAND_WRITE_IMAGE:
    case CL_COMMAND_COPY_IMAGE:
    case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
    case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
    case CL_COMMAND_FILL_IMAGE:
    case CL_COMMAND_MAP_IMAGE:
    case CL_COMMAND_NATIVE_KERNEL:
    case CL_COMMAND_SVM_FREE:
    case CL_COMMAND_SVM_MAP:
    case CL_COMMAND_SVM_UNMAP:
    case CL_COMMAND_SVM_MEMCPY:
    case CL_COMMAND_SVM_MEMFILL:
    default:
      POCL_ABORT_UNIMPLEMENTED (pocl_command_to_str (node->type));
      break;
    }

  /* Create and record event for command end */
  if (cq->properties & CL_QUEUE_PROFILING_ENABLE)
    result = cuEventCreate (&event_data->end, CU_EVENT_DEFAULT);
  else
    result = cuEventCreate (&event_data->end, CU_EVENT_DISABLE_TIMING);
  CUDA_CHECK (result, "cuEventCreate");
  result = cuEventRecord (event_data->end, stream);
  CUDA_CHECK (result, "cuEventRecord");

  event_data->events_ready = 1;
}

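/* Entry point for command submission: with queue threads enabled, the
   node is handed to the submit thread via the pending queue; otherwise it
   is issued inline on the calling thread. */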
1397 void
pocl_cuda_submit(_cl_command_node * node,cl_command_queue cq)1398 pocl_cuda_submit (_cl_command_node *node, cl_command_queue cq)
1399 {
1400   /* Allocate CUDA event data */
1401   pocl_cuda_event_data_t *p
1402       = (pocl_cuda_event_data_t *)calloc (1, sizeof (pocl_cuda_event_data_t));
1403   node->event->data = p;
1404 
  if (((pocl_cuda_queue_data_t *)cq->data)->use_threads)
    {
      pthread_cond_init (&p->event_cond, NULL);
      /* Add command to work queue */
      POCL_UNLOCK_OBJ (node->event);
      pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)cq->data;
      pthread_mutex_lock (&queue_data->lock);
      DL_APPEND (queue_data->pending_queue, node);
      pthread_cond_signal (&queue_data->pending_cond);
      pthread_mutex_unlock (&queue_data->lock);
    }
  else
    {
      /* Submit command in this thread */
      cuCtxSetCurrent (((pocl_cuda_device_data_t *)cq->device->data)->context);
      pocl_cuda_submit_node (node, cq, 1);
    }
}

void
pocl_cuda_notify (cl_device_id device, cl_event event, cl_event finished)
{
  /* Ignore CUDA device events, we've already handled these dependencies */
  if (finished->queue && finished->queue->device->ops == device->ops)
    return;

  if (event->status == CL_QUEUED)
    return;

  pocl_cuda_event_data_t *event_data = (pocl_cuda_event_data_t *)event->data;

  assert (event_data);
  assert (event_data->num_ext_events > 0);
  assert (event_data->ext_event_flag);

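  /* This event depends on work from a non-CUDA device.  ext_event_flag is
   * a page-locked host allocation (freed with cuMemFreeHost in
   * pocl_cuda_free_event_data) that the stream was set up to wait on at
   * submit time; writing 1 to it once the last external dependency
   * completes releases that wait. */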
  /* If dependency failed, so should we */
  /* TODO: This isn't true if this is an implicit dependency */
  if (finished->status < 0)
    event->status = -1;

  /* Decrement external event counter */
  /* Trigger flag if none left */
  if (!--event_data->num_ext_events)
    *event_data->ext_event_flag = 1;
}

void
pocl_cuda_flush (cl_device_id device, cl_command_queue cq)
{
  /* TODO: Something here? */
}

void
pocl_cuda_finalize_command (cl_device_id device, cl_event event)
{
  CUresult result;
  pocl_cuda_event_data_t *event_data = (pocl_cuda_event_data_t *)event->data;

  /* Wait for command to finish */
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
  result = cuEventSynchronize (event_data->end);
  CUDA_CHECK (result, "cuEventSynchronize");

  /* Clean up mapped memory allocations */
  if (event->command_type == CL_COMMAND_UNMAP_MEM_OBJECT)
    {
      pocl_unmap_command_finished2 (event, &event->command->command);
    }

  if (event->command_type == CL_COMMAND_NDRANGE_KERNEL
      || event->command_type == CL_COMMAND_TASK)
    {
#if defined __arm__
      /* On ARM, cuMemHostRegister is not supported, so buffers created
       * with CL_MEM_USE_HOST_PTR have a separate device allocation;
       * perform explicit copies back from the device. */
      cl_kernel kernel = event->command->command.run.kernel;
      pocl_kernel_metadata_t *meta = kernel->meta;
      pocl_argument *arguments = event->command->command.run.arguments;
      unsigned i;
      for (i = 0; i < meta->num_args; i++)
        {
          if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
            {
              if (!ARG_IS_LOCAL (meta->arg_info[i]) && arguments[i].value)
                {
                  cl_mem mem = *(void **)arguments[i].value;
                  if (mem->flags & CL_MEM_USE_HOST_PTR)
                    {
                      CUdeviceptr ptr
                          = (CUdeviceptr)mem->device_ptrs[device->global_mem_id]
                                .mem_ptr;
                      cuMemcpyDtoH (mem->mem_host_ptr, ptr, mem->size);
                      cuStreamSynchronize (0);
                    }
                }
            }
        }
#endif

      pocl_ndrange_node_cleanup (event->command);
    }
  else
    {
      pocl_mem_manager_free_command (event->command);
    }

  pocl_update_event_running (event);

  /* Handle failed events */
  if (event->status < 0)
    pocl_update_event_failed (event);
  else
    POCL_UPDATE_EVENT_COMPLETE_MSG (event, "CUDA event");
}

void
pocl_cuda_update_event (cl_device_id device, cl_event event)
{
  if ((event->status == CL_COMPLETE)
      && (event->queue->properties & CL_QUEUE_PROFILING_ENABLE))
    {
      /* Update timing info with CUDA event timers if profiling enabled */
      /* CUDA doesn't provide a way to get event timestamps directly,
       * only the elapsed time between two events. We use the elapsed
       * time from the epoch event enqueued on device creation to get
       * the actual timestamps.
       *
       * Since the CUDA timer resolution is lower than the host timer,
       * this can sometimes result in the start time being before the
       * submit time, so we use max() to ensure the timestamps are
       * sane. */

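      /* cuEventElapsedTime reports milliseconds; OpenCL profiling info is
       * in nanoseconds, hence the 1e6 scaling below. */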
      float diff;
      CUresult result;
      pocl_cuda_event_data_t *event_data
          = (pocl_cuda_event_data_t *)event->data;
      cl_ulong epoch = ((pocl_cuda_device_data_t *)device->data)->epoch;

      result = cuEventElapsedTime (
          &diff, ((pocl_cuda_device_data_t *)device->data)->epoch_event,
          event_data->start);
      CUDA_CHECK (result, "cuEventElapsedTime");
      event->time_start = (cl_ulong) (epoch + diff * 1e6);
      event->time_start = max (event->time_start, epoch + 1);

      result = cuEventElapsedTime (
          &diff, ((pocl_cuda_device_data_t *)device->data)->epoch_event,
          event_data->end);
      CUDA_CHECK (result, "cuEventElapsedTime");
      event->time_end = (cl_ulong) (epoch + diff * 1e6);
      event->time_end = max (event->time_end, event->time_start + 1);
    }
}

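/* In-thread waiting: walk the event's dependency list depth-first,
 * finalizing every command that has not yet completed.  Used when the
 * queue runs without background threads. */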
void
pocl_cuda_wait_event_recurse (cl_device_id device, cl_event event)
{
  while (event->wait_list)
    pocl_cuda_wait_event_recurse (device, event->wait_list->event);

  if (event->status > CL_COMPLETE)
    pocl_cuda_finalize_command (device, event);
}

void
pocl_cuda_notify_event_finished (cl_event event)
{
  pocl_cuda_event_data_t *e_d = (pocl_cuda_event_data_t *)event->data;

  if (((pocl_cuda_queue_data_t *)event->queue->data)->use_threads)
    pthread_cond_broadcast (&e_d->event_cond);
}

void
pocl_cuda_wait_event (cl_device_id device, cl_event event)
{
  pocl_cuda_event_data_t *e_d = (pocl_cuda_event_data_t *)event->data;

  if (((pocl_cuda_queue_data_t *)event->queue->data)->use_threads)
    {
      /* Wait until background thread marks command as complete */
      POCL_LOCK_OBJ (event);
      while (event->status > CL_COMPLETE)
        {
          pthread_cond_wait (&e_d->event_cond, &event->pocl_lock);
        }
      POCL_UNLOCK_OBJ (event);
    }
  else
    {
      /* Recursively finalize commands in this thread */
      pocl_cuda_wait_event_recurse (device, event);
    }
}

void
pocl_cuda_free_event_data (cl_event event)
{
  if (event->data)
    {
      pocl_cuda_event_data_t *event_data
          = (pocl_cuda_event_data_t *)event->data;
      pthread_cond_destroy (&event_data->event_cond);
      if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
        cuEventDestroy (event_data->start);
      cuEventDestroy (event_data->end);
      if (event_data->ext_event_flag)
        {
          CUresult result = cuMemFreeHost (event_data->ext_event_flag);
          CUDA_CHECK (result, "cuMemFreeHost");
        }
      free (event->data);
    }
}

void
pocl_cuda_join (cl_device_id device, cl_command_queue cq)
{
  /* Grab event at end of queue */
  POCL_LOCK_OBJ (cq);
  cl_event event = cq->last_event.event;
  if (!event)
    {
      POCL_UNLOCK_OBJ (cq);
      return;
    }
  POname (clRetainEvent) (event);
  POCL_UNLOCK_OBJ (cq);

  pocl_cuda_wait_event (device, event);

  POname (clReleaseEvent) (event);
}

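/* Background submit thread: consumes nodes from pending_queue, submits
 * them to the CUDA stream, then moves them to running_queue for the
 * finalize thread to pick up. */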
void *
pocl_cuda_submit_thread (void *data)
{
  pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)data;

  cl_command_queue queue = queue_data->queue;
  if (queue)
    cuCtxSetCurrent (
        ((pocl_cuda_device_data_t *)queue->device->data)->context);
  else
    /* This queue has already been released */
    return NULL;

  while (1)
    {
      /* Attempt to get next command from work queue */
      _cl_command_node *node = NULL;
      pthread_mutex_lock (&queue_data->lock);
      if (!queue_data->queue)
        {
          pthread_mutex_unlock (&queue_data->lock);
          break;
        }
      if (!queue_data->pending_queue)
        {
          pthread_cond_wait (&queue_data->pending_cond, &queue_data->lock);
        }
      if (queue_data->pending_queue)
        {
          node = queue_data->pending_queue;
          DL_DELETE (queue_data->pending_queue, node);
        }
      pthread_mutex_unlock (&queue_data->lock);

      /* Submit command, if we found one */
      if (node)
        {
          pocl_cuda_submit_node (node, queue_data->queue, 0);

          /* Add command to running queue */
          pthread_mutex_lock (&queue_data->lock);
          DL_APPEND (queue_data->running_queue, node);
          pthread_cond_signal (&queue_data->running_cond);
          pthread_mutex_unlock (&queue_data->lock);
        }
    }

  return NULL;
}

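/* Background finalize thread: consumes nodes from running_queue in
 * submission order, blocks until each command's end event has been
 * reached, and marks the corresponding cl_event complete. */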
void *
pocl_cuda_finalize_thread (void *data)
{
  pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)data;

  cl_command_queue queue = queue_data->queue;
  if (queue)
    cuCtxSetCurrent (
        ((pocl_cuda_device_data_t *)queue->device->data)->context);
  else
    /* This queue has already been released */
    return NULL;

  while (1)
    {
      /* Attempt to get next node from running queue */
      _cl_command_node *node = NULL;
      pthread_mutex_lock (&queue_data->lock);
      if (!queue_data->queue)
        {
          pthread_mutex_unlock (&queue_data->lock);
          break;
        }
      if (!queue_data->running_queue)
        {
          pthread_cond_wait (&queue_data->running_cond, &queue_data->lock);
        }
      if (queue_data->running_queue)
        {
          node = queue_data->running_queue;
          DL_DELETE (queue_data->running_queue, node);
        }
      pthread_mutex_unlock (&queue_data->lock);

      /* Wait for command to finish, if we found one */
      if (node)
        pocl_cuda_finalize_command (queue->device, node->event);
    }

  return NULL;
}

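/* Extra flags passed to the LLVM-based kernel compiler.  The
 * --nvptx-short-ptr option (available from LLVM 7 onwards, hence the
 * version guard) makes the NVPTX backend use 32-bit pointers for the
 * shared, constant and local address spaces. */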
char *
pocl_cuda_init_build (void *data)
{
#ifdef LLVM_OLDER_THAN_7_0
  return strdup ("");
#else
  return strdup ("-mllvm --nvptx-short-ptr");
#endif
}