1 /* pocl-cuda.c - driver for CUDA devices
2
3 Copyright (c) 2016-2017 James Price / University of Bristol
4
5 Permission is hereby granted, free of charge, to any person obtaining a copy
6 of this software and associated documentation files (the "Software"), to
7 deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 FROM,
22 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23 THE SOFTWARE.
24 */
25
26 #include "config.h"
27
28 #include "common.h"
29 #include "common_driver.h"
30 #include "devices.h"
31 #include "pocl.h"
32 #include "pocl-cuda.h"
33 #include "pocl-ptx-gen.h"
34 #include "pocl_cache.h"
35 #include "pocl_file_util.h"
36 #include "pocl_llvm.h"
37 #include "pocl_mem_management.h"
38 #include "pocl_runtime_config.h"
39 #include "pocl_timing.h"
40 #include "pocl_util.h"
41
42 #include <string.h>
43
44 #include <cuda.h>
45 #include <cuda_runtime.h>
46
/* Per-device driver state, stored in cl_device_id->data. */
typedef struct pocl_cuda_device_data_s
{
  CUdevice device;          /* CUDA driver API device handle */
  CUcontext context;        /* context created for this device at init */
  CUevent epoch_event;      /* event recorded at init; timing reference */
  cl_ulong epoch;           /* host monotonic time (ns) taken when
                               epoch_event was recorded */
  char libdevice[PATH_MAX]; /* libdevice library path found at init */
  pocl_lock_t compile_lock; /* serializes kernel compilation per device */
} pocl_cuda_device_data_t;

/* Per-command-queue state, stored in cl_command_queue->data. */
typedef struct pocl_cuda_queue_data_s
{
  CUstream stream;           /* CUDA stream backing this queue */
  int use_threads;           /* nonzero when the background submit/finalize
                                threads are in use */
  pthread_t submit_thread;   /* background thread (pocl_cuda_submit_thread) */
  pthread_t finalize_thread; /* background thread (pocl_cuda_finalize_thread) */
  pthread_mutex_t lock;          /* guards the two command lists below */
  pthread_cond_t pending_cond;   /* signalled on pending_queue changes */
  pthread_cond_t running_cond;   /* signalled on running_queue changes */
  _cl_command_node *volatile pending_queue; /* commands awaiting submission */
  _cl_command_node *volatile running_queue; /* commands in flight */
  cl_command_queue queue; /* owning queue; reset to NULL at teardown to tell
                             the worker threads to exit */
} pocl_cuda_queue_data_t;

/* Per-kernel compilation results, stored in kernel->meta->data[device_i]. */
typedef struct pocl_cuda_kernel_data_s
{
  CUmodule module;           /* PTX module built without global offsets */
  CUmodule module_offsets;   /* PTX module built with global-offset support */
  CUfunction kernel;         /* entry point within 'module' */
  CUfunction kernel_offsets; /* entry point within 'module_offsets' */
  size_t *alignments;        /* per-argument pointer alignments */
} pocl_cuda_kernel_data_t;

/* Per-event state, stored in cl_event->data. */
typedef struct pocl_cuda_event_data_s
{
  CUevent start; /* presumably recorded when the command starts — confirm */
  CUevent end;   /* presumably recorded when the command completes */
  volatile int events_ready; /* nonzero once the CUevents are valid */
  cl_int *ext_event_flag;    /* NOTE(review): appears to track non-CUDA
                                (external) event dependencies — confirm */
  pthread_cond_t event_cond; /* used to block waiters on this event */
  volatile unsigned num_ext_events; /* count of outstanding external events */
} pocl_cuda_event_data_t;

extern unsigned int pocl_num_devices;

/* Queue worker-thread entry points (defined later in this file). */
void *pocl_cuda_submit_thread (void *);
void *pocl_cuda_finalize_thread (void *);
94
95 static void
pocl_cuda_abort_on_error(CUresult result,unsigned line,const char * func,const char * code,const char * api)96 pocl_cuda_abort_on_error (CUresult result, unsigned line, const char *func,
97 const char *code, const char *api)
98 {
99 if (result != CUDA_SUCCESS)
100 {
101 const char *err_name;
102 const char *err_string;
103 cuGetErrorName (result, &err_name);
104 cuGetErrorString (result, &err_string);
105 POCL_MSG_PRINT2 (CUDA, func, line, "Error during %s\n", api);
106 POCL_ABORT ("%s: %s\n", err_name, err_string);
107 }
108 }
109
110 static int
pocl_cuda_error(CUresult result,unsigned line,const char * func,const char * code,const char * api)111 pocl_cuda_error (CUresult result, unsigned line, const char *func,
112 const char *code, const char *api)
113 {
114 int err = (result != CUDA_SUCCESS);
115 if (err)
116 {
117 const char *err_name;
118 const char *err_string;
119 cuGetErrorName (result, &err_name);
120 cuGetErrorString (result, &err_string);
121 POCL_MSG_ERR ("CUDA error during %s. %s: %s\n", api, err_name, err_string);
122 }
123 return err;
124 }
125
/* Abort the process if 'result' is not CUDA_SUCCESS (for fatal errors). */
#define CUDA_CHECK(result, api)                                               \
  pocl_cuda_abort_on_error (result, __LINE__, __FUNCTION__, #result, api)

/* Log an error and evaluate to nonzero if 'result' is not CUDA_SUCCESS. */
#define CUDA_CHECK_ERROR(result, api)                                         \
  pocl_cuda_error (result, __LINE__, __FUNCTION__, #result, api)
131
pocl_cuda_handle_cl_nv_device_attribute_query(cl_device_id device,cl_device_info param_name,size_t param_value_size,void * param_value,size_t * param_value_size_ret)132 cl_int pocl_cuda_handle_cl_nv_device_attribute_query(cl_device_id device,
133 cl_device_info param_name,
134 size_t param_value_size,
135 void * param_value,
136 size_t * param_value_size_ret)
137 {
138 CUdevice cudaDev = ((pocl_cuda_device_data_t *)device->data)->device;
139 unsigned int value;
140 CUresult res;
141
142 switch(param_name) {
143 case CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV:
144 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cudaDev);
145 CUDA_CHECK(res, "cuDeviceGetAttribute");
146 POCL_RETURN_GETINFO(cl_uint, value);
147 case CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV:
148 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cudaDev);
149 CUDA_CHECK(res, "cuDeviceGetAttribute");
150 POCL_RETURN_GETINFO(cl_uint, value);
151 case CL_DEVICE_REGISTERS_PER_BLOCK_NV:
152 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, cudaDev);
153 CUDA_CHECK(res, "cuDeviceGetAttribute");
154 POCL_RETURN_GETINFO(cl_uint, value);
155 case CL_DEVICE_WARP_SIZE_NV:
156 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_WARP_SIZE, cudaDev);
157 CUDA_CHECK(res, "cuDeviceGetAttribute");
158 POCL_RETURN_GETINFO(cl_uint, value);
159 case CL_DEVICE_GPU_OVERLAP_NV:
160 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, cudaDev);
161 CUDA_CHECK(res, "cuDeviceGetAttribute");
162 POCL_RETURN_GETINFO(cl_bool, value);
163 case CL_DEVICE_KERNEL_EXEC_TIMEOUT_NV:
164 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, cudaDev);
165 CUDA_CHECK(res, "cuDeviceGetAttribute");
166 POCL_RETURN_GETINFO(cl_bool, value);
167 case CL_DEVICE_INTEGRATED_MEMORY_NV:
168 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_INTEGRATED, cudaDev);
169 CUDA_CHECK(res, "cuDeviceGetAttribute");
170 POCL_RETURN_GETINFO(cl_bool, value);
171 case CL_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT_NV:
172 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, cudaDev);
173 CUDA_CHECK(res, "cuDeviceGetAttribute");
174 POCL_RETURN_GETINFO(cl_uint, value);
175 case CL_DEVICE_PCI_BUS_ID_NV:
176 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cudaDev);
177 CUDA_CHECK(res, "cuDeviceGetAttribute");
178 POCL_RETURN_GETINFO(cl_uint, value);
179 case CL_DEVICE_PCI_SLOT_ID_NV:
180 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cudaDev);
181 CUDA_CHECK(res, "cuDeviceGetAttribute");
182 POCL_RETURN_GETINFO(cl_uint, value);
183 case CL_DEVICE_PCI_DOMAIN_ID_NV:
184 res = cuDeviceGetAttribute(&value, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cudaDev);
185 CUDA_CHECK(res, "cuDeviceGetAttribute");
186 POCL_RETURN_GETINFO(cl_uint, value);
187 default:
188 return CL_INVALID_VALUE;
189 }
190
191 }
192
193 void
pocl_cuda_init_device_ops(struct pocl_device_ops * ops)194 pocl_cuda_init_device_ops (struct pocl_device_ops *ops)
195 {
196 ops->device_name = "CUDA";
197 ops->build_hash = pocl_cuda_build_hash;
198 ops->probe = pocl_cuda_probe;
199 ops->uninit = pocl_cuda_uninit;
200 ops->reinit = NULL;
201 ops->init = pocl_cuda_init;
202 ops->init_queue = pocl_cuda_init_queue;
203 ops->free_queue = pocl_cuda_free_queue;
204
205 ops->alloc_mem_obj = pocl_cuda_alloc_mem_obj;
206 ops->free = pocl_cuda_free;
207
208 ops->submit = pocl_cuda_submit;
209 ops->notify = pocl_cuda_notify;
210 ops->broadcast = pocl_broadcast;
211 ops->wait_event = pocl_cuda_wait_event;
212 ops->update_event = pocl_cuda_update_event;
213 ops->free_event_data = pocl_cuda_free_event_data;
214 ops->join = pocl_cuda_join;
215 ops->flush = pocl_cuda_flush;
216 ops->init_build = pocl_cuda_init_build;
217 // TODO
218 ops->notify_event_finished = pocl_cuda_notify_event_finished;
219
220 ops->get_device_info_ext = pocl_cuda_handle_cl_nv_device_attribute_query;
221 ops->build_source = pocl_driver_build_source;
222 ops->link_program = pocl_driver_link_program;
223 ops->build_binary = pocl_driver_build_binary;
224 ops->free_program = pocl_driver_free_program;
225 ops->setup_metadata = pocl_driver_setup_metadata;
226 ops->supports_binary = pocl_driver_supports_binary;
227 ops->build_poclbinary = pocl_driver_build_poclbinary;
228 ops->compile_kernel = pocl_cuda_compile_kernel;
229
230 // TODO
231 ops->get_mapping_ptr = pocl_driver_get_mapping_ptr;
232 ops->free_mapping_ptr = pocl_driver_free_mapping_ptr;
233
234 ops->can_migrate_d2d = NULL;
235 ops->migrate_d2d = NULL;
236 ops->read = NULL;
237 ops->read_rect = NULL;
238 ops->write = NULL;
239 ops->write_rect = NULL;
240 ops->copy = NULL;
241 ops->copy_rect = NULL;
242 ops->map_mem = NULL;
243 ops->unmap_mem = NULL;
244 ops->run = NULL;
245 }
246
247 cl_int
pocl_cuda_init(unsigned j,cl_device_id dev,const char * parameters)248 pocl_cuda_init (unsigned j, cl_device_id dev, const char *parameters)
249 {
250 CUresult result;
251 int ret = CL_SUCCESS;
252
253 if (dev->data)
254 return ret;
255
256 pocl_init_default_device_infos (dev);
257 dev->extensions = CUDA_DEVICE_EXTENSIONS;
258
259 dev->vendor = "NVIDIA Corporation";
260 dev->vendor_id = 0x10de; /* the PCIID for NVIDIA */
261
262 dev->type = CL_DEVICE_TYPE_GPU;
263 dev->address_bits = (sizeof (void *) * 8);
264
265 dev->llvm_target_triplet = (sizeof (void *) == 8) ? "nvptx64" : "nvptx";
266
267 dev->spmd = CL_TRUE;
268 dev->workgroup_pass = CL_FALSE;
269 dev->execution_capabilities = CL_EXEC_KERNEL;
270
271 dev->global_as_id = 1;
272 dev->local_as_id = 3;
273 dev->constant_as_id = 1;
274
275 /* TODO: Get images working */
276 dev->image_support = CL_FALSE;
277
278 dev->autolocals_to_args
279 = POCL_AUTOLOCALS_TO_ARGS_ONLY_IF_DYNAMIC_LOCALS_PRESENT;
280
281 dev->has_64bit_long = 1;
282
283 pocl_cuda_device_data_t *data = calloc (1, sizeof (pocl_cuda_device_data_t));
284 result = cuDeviceGet (&data->device, j);
285 if (CUDA_CHECK_ERROR (result, "cuDeviceGet"))
286 ret = CL_INVALID_DEVICE;
287
288 /* Get specific device name */
289 dev->long_name = dev->short_name = calloc (256, sizeof (char));
290
291 if (ret != CL_INVALID_DEVICE)
292 cuDeviceGetName (dev->long_name, 256, data->device);
293 else
294 snprintf (dev->long_name, 255, "Unavailable CUDA device #%d", j);
295
296 SETUP_DEVICE_CL_VERSION (CUDA_DEVICE_CL_VERSION_MAJOR,
297 CUDA_DEVICE_CL_VERSION_MINOR);
298
299 /* Get other device properties */
300 if (ret != CL_INVALID_DEVICE)
301 {
302 /* CUDA device attributes (as fetched by cuDeviceGetAttribute) are always (unsigned)
303 * integers, where the OpenCL counterparts are of a variety of (other) integer types.
304 * Fetch the values in an unsigned int and copy it over.
305 * We also OR all return values of cuDeviceGetAttribute, and at the end we will check
306 * if it's not CL_SUCCESS. We miss the exact line that failed this way, but it's
307 * faster than checking after each attribute fetch.
308 */
309 int value = 0;
310 #define GET_CU_PROP(key, target) do { \
311 result |= cuDeviceGetAttribute (&value, CU_DEVICE_ATTRIBUTE_##key, data->device); \
312 target = value; \
313 } while (0)
314
315 GET_CU_PROP (MAX_THREADS_PER_BLOCK, dev->max_work_group_size);
316 GET_CU_PROP (MAX_BLOCK_DIM_X, dev->max_work_item_sizes[0]);
317 GET_CU_PROP (MAX_BLOCK_DIM_Y, dev->max_work_item_sizes[1]);
318 GET_CU_PROP (MAX_BLOCK_DIM_Z, dev->max_work_item_sizes[2]);
319 GET_CU_PROP (MAX_SHARED_MEMORY_PER_BLOCK, dev->local_mem_size);
320 GET_CU_PROP (MULTIPROCESSOR_COUNT, dev->max_compute_units);
321 GET_CU_PROP (ECC_ENABLED, dev->error_correction_support);
322 GET_CU_PROP (INTEGRATED, dev->host_unified_memory);
323 GET_CU_PROP (TOTAL_CONSTANT_MEMORY, dev->max_constant_buffer_size);
324 GET_CU_PROP (CLOCK_RATE, dev->max_clock_frequency);
325 dev->max_clock_frequency /= 1000;
326 GET_CU_PROP (TEXTURE_ALIGNMENT, dev->mem_base_addr_align);
327 GET_CU_PROP (INTEGRATED, dev->host_unified_memory);
328 }
329 if (CUDA_CHECK_ERROR (result, "cuDeviceGetAttribute"))
330 ret = CL_INVALID_DEVICE;
331
332 dev->preferred_wg_size_multiple = 32;
333 dev->preferred_vector_width_char = 1;
334 dev->preferred_vector_width_short = 1;
335 dev->preferred_vector_width_int = 1;
336 dev->preferred_vector_width_long = 1;
337 dev->preferred_vector_width_float = 1;
338 dev->preferred_vector_width_double = 1;
339 dev->preferred_vector_width_half = 0;
340 dev->native_vector_width_char = 1;
341 dev->native_vector_width_short = 1;
342 dev->native_vector_width_int = 1;
343 dev->native_vector_width_long = 1;
344 dev->native_vector_width_float = 1;
345 dev->native_vector_width_double = 1;
346 dev->native_vector_width_half = 0;
347
348 dev->single_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
349 | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
350 | CL_FP_DENORM;
351 dev->double_fp_config = CL_FP_ROUND_TO_NEAREST | CL_FP_ROUND_TO_ZERO
352 | CL_FP_ROUND_TO_INF | CL_FP_FMA | CL_FP_INF_NAN
353 | CL_FP_DENORM;
354
355 dev->local_mem_type = CL_LOCAL;
356
357 /* Get GPU architecture name */
358 int sm_maj = 0, sm_min = 0;
359 if (ret != CL_INVALID_DEVICE)
360 {
361 cuDeviceGetAttribute (&sm_maj, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
362 data->device);
363 cuDeviceGetAttribute (&sm_min, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
364 data->device);
365 }
366 char *gpu_arch = calloc (16, sizeof (char));
367 snprintf (gpu_arch, 16, "sm_%d%d", sm_maj, sm_min);
368 dev->llvm_cpu = pocl_get_string_option ("POCL_CUDA_GPU_ARCH", gpu_arch);
369 POCL_MSG_PRINT_INFO ("[CUDA] GPU architecture = %s\n", dev->llvm_cpu);
370
371 /* Find libdevice library */
372 if (findLibDevice (data->libdevice, dev->llvm_cpu))
373 {
374 if (ret != CL_INVALID_DEVICE)
375 {
376 POCL_MSG_ERR ("[CUDA] failed to find libdevice library\n");
377 dev->compiler_available = dev->linker_available = 0;
378 }
379 }
380
381 dev->device_side_printf = 0;
382
383 /* Create context */
384 if (ret != CL_INVALID_DEVICE)
385 {
386 result = cuCtxCreate (&data->context, CU_CTX_MAP_HOST, data->device);
387 if (CUDA_CHECK_ERROR (result, "cuCtxCreate"))
388 ret = CL_INVALID_DEVICE;
389 }
390
391 /* Create epoch event for timing info */
392 if (ret != CL_INVALID_DEVICE)
393 {
394 result = cuEventCreate (&data->epoch_event, CU_EVENT_DEFAULT);
395 CUDA_CHECK_ERROR (result, "cuEventCreate");
396
397 data->epoch = pocl_gettimemono_ns ();
398
399 result = cuEventRecord (data->epoch_event, 0);
400 result = cuEventSynchronize (data->epoch_event);
401 if (CUDA_CHECK_ERROR (result, "cuEventSynchronize"))
402 ret = CL_INVALID_DEVICE;
403 }
404
405 /* Get global memory size */
406 size_t memfree = 0, memtotal = 0;
407 if (ret != CL_INVALID_DEVICE)
408 result = cuMemGetInfo (&memfree, &memtotal);
409 dev->max_mem_alloc_size = max (memtotal / 4, 128 * 1024 * 1024);
410 dev->global_mem_size = memtotal;
411
412 dev->data = data;
413
414 POCL_INIT_LOCK (data->compile_lock);
415 return ret;
416 }
417
418 cl_int
pocl_cuda_init_queue(cl_device_id device,cl_command_queue queue)419 pocl_cuda_init_queue (cl_device_id device, cl_command_queue queue)
420 {
421 cuCtxSetCurrent (((pocl_cuda_device_data_t *)queue->device->data)->context);
422
423 pocl_cuda_queue_data_t *queue_data
424 = calloc (1, sizeof (pocl_cuda_queue_data_t));
425 queue->data = queue_data;
426 queue_data->queue = queue;
427
428 CUresult result
429 = cuStreamCreate (&queue_data->stream, CU_STREAM_NON_BLOCKING);
430 if (CUDA_CHECK_ERROR (result, "cuStreamCreate"))
431 return CL_OUT_OF_RESOURCES;
432
433 queue_data->use_threads
434 = !pocl_get_bool_option ("POCL_CUDA_DISABLE_QUEUE_THREADS", 1);
435
436 if (queue_data->use_threads)
437 {
438 pthread_mutex_init (&queue_data->lock, NULL);
439 pthread_cond_init (&queue_data->pending_cond, NULL);
440 pthread_cond_init (&queue_data->running_cond, NULL);
441 int err = pthread_create (&queue_data->submit_thread, NULL,
442 pocl_cuda_submit_thread, queue_data);
443 if (err)
444 {
445 POCL_MSG_ERR ("[CUDA] Error creating submit thread: %d\n", err);
446 return CL_OUT_OF_RESOURCES;
447 }
448
449 err = pthread_create (&queue_data->finalize_thread, NULL,
450 pocl_cuda_finalize_thread, queue_data);
451 if (err)
452 {
453 POCL_MSG_ERR ("[CUDA] Error creating finalize thread: %d\n", err);
454 return CL_OUT_OF_RESOURCES;
455 }
456 }
457
458 return CL_SUCCESS;
459 }
460
461 int
pocl_cuda_free_queue(cl_device_id device,cl_command_queue queue)462 pocl_cuda_free_queue (cl_device_id device, cl_command_queue queue)
463 {
464 pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)queue->data;
465
466 cuCtxSetCurrent (((pocl_cuda_device_data_t *)queue->device->data)->context);
467 cuStreamDestroy (queue_data->stream);
468
469 assert (queue_data->pending_queue == NULL);
470 assert (queue_data->running_queue == NULL);
471
472 /* Kill queue threads */
473 if (queue_data->use_threads)
474 {
475 pthread_mutex_lock (&queue_data->lock);
476 queue_data->queue = NULL;
477 pthread_cond_signal (&queue_data->pending_cond);
478 pthread_cond_signal (&queue_data->running_cond);
479 pthread_mutex_unlock (&queue_data->lock);
480 pthread_join (queue_data->submit_thread, NULL);
481 pthread_join (queue_data->finalize_thread, NULL);
482 }
483 return CL_SUCCESS;
484 }
485
486 char *
pocl_cuda_build_hash(cl_device_id device)487 pocl_cuda_build_hash (cl_device_id device)
488 {
489 char *res = calloc (1000, sizeof (char));
490 snprintf (res, 1000, "CUDA-%s", device->llvm_cpu);
491 return res;
492 }
493
494 unsigned int
pocl_cuda_probe(struct pocl_device_ops * ops)495 pocl_cuda_probe (struct pocl_device_ops *ops)
496 {
497 int env_count = pocl_device_get_env_count (ops->device_name);
498
499 int probe_count = 0;
500 CUresult ret = cuInit (0);
501 if (ret == CUDA_SUCCESS)
502 {
503 ret = cuDeviceGetCount (&probe_count);
504 if (ret != CUDA_SUCCESS)
505 probe_count = 0;
506 }
507
508 /* If the user requested a specific number of CUDA devices,
509 * pretend we only have that many, if we can. If they requested
510 * more than there are, abort informing the user of the issue.
511 */
512 if (env_count >= 0)
513 {
514 if (env_count > probe_count)
515 POCL_ABORT ("[CUDA] %d devices requested, but only %d are available\n",
516 env_count, probe_count);
517 probe_count = env_count;
518 }
519
520 return probe_count;
521 }
522
523 cl_int
pocl_cuda_uninit(unsigned j,cl_device_id device)524 pocl_cuda_uninit (unsigned j, cl_device_id device)
525 {
526 pocl_cuda_device_data_t *data = device->data;
527
528 if (device->available)
529 cuCtxDestroy (data->context);
530
531 POCL_MEM_FREE (data);
532 device->data = NULL;
533
534 POCL_MEM_FREE (device->long_name);
535 return CL_SUCCESS;
536 }
537
538 cl_int
pocl_cuda_alloc_mem_obj(cl_device_id device,cl_mem mem,void * host_ptr)539 pocl_cuda_alloc_mem_obj (cl_device_id device, cl_mem mem, void *host_ptr)
540 {
541 cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
542 pocl_mem_identifier *p = &mem->device_ptrs[device->global_mem_id];
543 int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
544
545 CUresult result;
546 void *b = NULL;
547
548 p->extra_ptr = NULL;
549 p->version = 0;
550 cl_mem_flags flags = mem->flags;
551
552 if (flags & CL_MEM_USE_HOST_PTR)
553 {
554 #if defined __arm__
555 /* cuMemHostRegister is not supported on ARM.
556 * Allocate device memory and perform explicit copies
557 * before and after running a kernel */
558 result = cuMemAlloc ((CUdeviceptr *)&b, mem_obj->size);
559 CUDA_CHECK (result, "cuMemAlloc");
560 #else
561 POCL_RETURN_ERROR_ON ((pocl_alloc_or_retain_mem_host_ptr (mem) != 0),
562 CL_OUT_OF_HOST_MEMORY,
563 "Cannot allocate backing memory!\n");
564
565 result = cuMemHostRegister (mem->mem_host_ptr, mem->size,
566 CU_MEMHOSTREGISTER_DEVICEMAP);
567 if (result != CUDA_SUCCESS
568 && result != CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED)
569 CUDA_CHECK (result, "cuMemHostRegister");
570 result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, mem->mem_host_ptr,
571 0);
572 CUDA_CHECK (result, "cuMemHostGetDevicePointer");
573
574 /* TODO can we assume cuMemHostRegister copies
575 * the content of host memory to the device ? for now, lets not */
576 p->version = 0;
577 #endif
578 }
579 /* preallocate host visible memory */
580 else if ((flags & CL_MEM_ALLOC_HOST_PTR) && (mem->mem_host_ptr == NULL))
581 {
582 result = cuMemHostAlloc (&p->extra_ptr, mem->size,
583 CU_MEMHOSTREGISTER_DEVICEMAP);
584 CUDA_CHECK (result, "cuMemHostAlloc");
585 result = cuMemHostGetDevicePointer ((CUdeviceptr *)&b, p->extra_ptr, 0);
586 CUDA_CHECK (result, "cuMemHostGetDevicePointer");
587 mem->mem_host_ptr = p->extra_ptr;
588 mem->mem_host_ptr_refcount = 1;
589 mem->mem_host_ptr_version = 0;
590
591 if (flags & CL_MEM_COPY_HOST_PTR)
592 {
593 result = cuMemcpyHtoD ((CUdeviceptr)b, host_ptr, mem->size);
594 CUDA_CHECK (result, "cuMemcpyHtoD");
595
596 result = cuStreamSynchronize (0);
597 CUDA_CHECK (result, "cuStreamSynchronize");
598
599 mem->mem_host_ptr_version = 1;
600 mem->latest_version = 1;
601 p->version = 1;
602 }
603 }
604 else
605 {
606 result = cuMemAlloc ((CUdeviceptr *)&b, mem->size);
607 if (result != CUDA_SUCCESS)
608 {
609 const char *err;
610 cuGetErrorName (result, &err);
611 POCL_MSG_PRINT2 (CUDA, __FUNCTION__, __LINE__,
612 "-> Failed to allocate memory: %s\n", err);
613 return CL_MEM_OBJECT_ALLOCATION_FAILURE;
614 }
615 }
616 p->mem_ptr = b;
617 err = CL_SUCCESS;
618
619 return err;
620 }
621
/* Release the device-side backing store for MEM_OBJ on DEVICE.
 * Each branch mirrors one allocation path in pocl_cuda_alloc_mem_obj. */
void
pocl_cuda_free (cl_device_id device, cl_mem mem_obj)
{
  cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
  pocl_mem_identifier *p = &mem_obj->device_ptrs[device->global_mem_id];

  if (mem_obj->flags & CL_MEM_USE_HOST_PTR)
    {
#if defined __arm__
      /* ARM path allocated plain device memory instead of pinning. */
      cuMemFree ((CUdeviceptr)p->mem_ptr);
#else
      /* Host memory was pinned with cuMemHostRegister; unpin it. */
      assert (p->extra_ptr == NULL);
      cuMemHostUnregister (mem_obj->mem_host_ptr);
#endif
    }
  else if (p->extra_ptr)
    {
      /* Pinned host-visible allocation made for CL_MEM_ALLOC_HOST_PTR;
       * clear the aliasing mem_host_ptr bookkeeping before freeing. */
      mem_obj->mem_host_ptr = NULL;
      mem_obj->mem_host_ptr_refcount = 0;
      mem_obj->mem_host_ptr_version = 0;
      cuMemFreeHost (p->extra_ptr);
      p->extra_ptr = NULL;
    }
  else
    {
      /* Plain cuMemAlloc allocation. */
      assert (p->extra_ptr == NULL);
      assert (p->mem_ptr != NULL);
      cuMemFree ((CUdeviceptr)p->mem_ptr);
    }
  p->mem_ptr = NULL;
  p->version = 0;
}
654
655 void
pocl_cuda_submit_read(CUstream stream,void * host_ptr,const void * device_ptr,size_t offset,size_t cb)656 pocl_cuda_submit_read (CUstream stream, void *host_ptr, const void *device_ptr,
657 size_t offset, size_t cb)
658 {
659 POCL_MSG_PRINT_CUDA ("cuMemcpyDtoHAsync %p -> %p / %zu B \n", device_ptr, host_ptr, cb);
660 CUresult result = cuMemcpyDtoHAsync (
661 host_ptr, (CUdeviceptr) (device_ptr + offset), cb, stream);
662 CUDA_CHECK (result, "cuMemcpyDtoHAsync");
663 }
664
665 void
pocl_cuda_submit_memfill(CUstream stream,void * mem_ptr,size_t size_in_bytes,size_t offset,const void * pattern,size_t pattern_size)666 pocl_cuda_submit_memfill (CUstream stream, void *mem_ptr, size_t size_in_bytes,
667 size_t offset, const void *pattern,
668 size_t pattern_size)
669 {
670 CUresult result;
671 switch (pattern_size)
672 {
673 case 1:
674 result
675 = cuMemsetD8Async ((CUdeviceptr) (((char *)mem_ptr) + offset),
676 *(unsigned char *)pattern, size_in_bytes, stream);
677 break;
678 case 2:
679 result = cuMemsetD16Async ((CUdeviceptr) (((char *)mem_ptr) + offset),
680 *(unsigned short *)pattern, size_in_bytes / 2,
681 stream);
682 break;
683 case 4:
684 result = cuMemsetD32Async ((CUdeviceptr) (((char *)mem_ptr) + offset),
685 *(unsigned int *)pattern, size_in_bytes / 4,
686 stream);
687 break;
688 case 8:
689 case 16:
690 case 32:
691 case 64:
692 case 128:
693 POCL_ABORT_UNIMPLEMENTED ("fill_kernel with pattern_size >=8");
694 default:
695 POCL_ABORT ("unrecognized pattern_size");
696 }
697 CUDA_CHECK (result, "cuMemset*Async");
698 }
699
700 void
pocl_cuda_submit_write(CUstream stream,const void * host_ptr,void * device_ptr,size_t offset,size_t cb)701 pocl_cuda_submit_write (CUstream stream, const void *host_ptr,
702 void *device_ptr, size_t offset, size_t cb)
703 {
704 POCL_MSG_PRINT_CUDA ("cuMemcpyHtoDAsync %p -> %p / %zu B \n", host_ptr, device_ptr, cb);
705 CUresult result = cuMemcpyHtoDAsync ((CUdeviceptr) (device_ptr + offset),
706 host_ptr, cb, stream);
707 CUDA_CHECK (result, "cuMemcpyHtoDAsync");
708 }
709
710 void
pocl_cuda_submit_copy(CUstream stream,void * __restrict__ src_mem_ptr,size_t src_offset,void * __restrict__ dst_mem_ptr,size_t dst_offset,size_t cb)711 pocl_cuda_submit_copy (CUstream stream, void*__restrict__ src_mem_ptr,
712 size_t src_offset, void *__restrict__ dst_mem_ptr,
713 size_t dst_offset, size_t cb)
714 {
715 void *src_ptr = src_mem_ptr + src_offset;
716 void *dst_ptr = dst_mem_ptr + dst_offset;
717
718 if (src_ptr == dst_ptr)
719 return;
720
721 CUresult result;
722 POCL_MSG_PRINT_CUDA ("cuMemcpyDtoDAsync %p -> %p / %zu B \n", src_ptr, dst_ptr, cb);
723 result = cuMemcpyDtoDAsync ((CUdeviceptr)dst_ptr, (CUdeviceptr)src_ptr,
724 cb, stream);
725 CUDA_CHECK (result, "cuMemcpyDtoDAsync");
726 }
727
728 void
pocl_cuda_submit_read_rect(CUstream stream,void * __restrict__ const host_ptr,void * __restrict__ const device_ptr,const size_t * __restrict__ const buffer_origin,const size_t * __restrict__ const host_origin,const size_t * __restrict__ const region,size_t const buffer_row_pitch,size_t const buffer_slice_pitch,size_t const host_row_pitch,size_t const host_slice_pitch)729 pocl_cuda_submit_read_rect (CUstream stream, void *__restrict__ const host_ptr,
730 void *__restrict__ const device_ptr,
731 const size_t *__restrict__ const buffer_origin,
732 const size_t *__restrict__ const host_origin,
733 const size_t *__restrict__ const region,
734 size_t const buffer_row_pitch,
735 size_t const buffer_slice_pitch,
736 size_t const host_row_pitch,
737 size_t const host_slice_pitch)
738 {
739 CUDA_MEMCPY3D params = { 0 };
740
741 POCL_MSG_PRINT_CUDA ("cuMemcpy3D / READ_RECT %p -> %p \n", device_ptr, host_ptr);
742
743 params.WidthInBytes = region[0];
744 params.Height = region[1];
745 params.Depth = region[2];
746
747 params.dstMemoryType = CU_MEMORYTYPE_HOST;
748 params.dstHost = host_ptr;
749 params.dstXInBytes = host_origin[0];
750 params.dstY = host_origin[1];
751 params.dstZ = host_origin[2];
752 params.dstPitch = host_row_pitch;
753 params.dstHeight = host_slice_pitch / host_row_pitch;
754
755 params.srcMemoryType = CU_MEMORYTYPE_DEVICE;
756 params.srcDevice = (CUdeviceptr)device_ptr;
757 params.srcXInBytes = buffer_origin[0];
758 params.srcY = buffer_origin[1];
759 params.srcZ = buffer_origin[2];
760 params.srcPitch = buffer_row_pitch;
761 params.srcHeight = buffer_slice_pitch / buffer_row_pitch;
762
763 CUresult result = cuMemcpy3DAsync (¶ms, stream);
764 CUDA_CHECK (result, "cuMemcpy3DAsync");
765 }
766
767 void
pocl_cuda_submit_write_rect(CUstream stream,const void * __restrict__ const host_ptr,void * __restrict__ const device_ptr,const size_t * __restrict__ const buffer_origin,const size_t * __restrict__ const host_origin,const size_t * __restrict__ const region,size_t const buffer_row_pitch,size_t const buffer_slice_pitch,size_t const host_row_pitch,size_t const host_slice_pitch)768 pocl_cuda_submit_write_rect (CUstream stream,
769 const void *__restrict__ const host_ptr,
770 void *__restrict__ const device_ptr,
771 const size_t *__restrict__ const buffer_origin,
772 const size_t *__restrict__ const host_origin,
773 const size_t *__restrict__ const region,
774 size_t const buffer_row_pitch,
775 size_t const buffer_slice_pitch,
776 size_t const host_row_pitch,
777 size_t const host_slice_pitch)
778 {
779 CUDA_MEMCPY3D params = { 0 };
780
781 POCL_MSG_PRINT_CUDA ("cuMemcpy3D / WRITE_RECT %p -> %p \n", host_ptr, device_ptr);
782
783 params.WidthInBytes = region[0];
784 params.Height = region[1];
785 params.Depth = region[2];
786
787 params.srcMemoryType = CU_MEMORYTYPE_HOST;
788 params.srcHost = host_ptr;
789 params.srcXInBytes = host_origin[0];
790 params.srcY = host_origin[1];
791 params.srcZ = host_origin[2];
792 params.srcPitch = host_row_pitch;
793 params.srcHeight = host_slice_pitch / host_row_pitch;
794
795 params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
796 params.dstDevice = (CUdeviceptr)device_ptr;
797 params.dstXInBytes = buffer_origin[0];
798 params.dstY = buffer_origin[1];
799 params.dstZ = buffer_origin[2];
800 params.dstPitch = buffer_row_pitch;
801 params.dstHeight = buffer_slice_pitch / buffer_row_pitch;
802
803 CUresult result = cuMemcpy3DAsync (¶ms, stream);
804 CUDA_CHECK (result, "cuMemcpy3DAsync");
805 }
806
807 void
pocl_cuda_submit_copy_rect(CUstream stream,cl_device_id dev,void * src_ptr,void * dst_ptr,const size_t * __restrict__ const src_origin,const size_t * __restrict__ const dst_origin,const size_t * __restrict__ const region,size_t const src_row_pitch,size_t const src_slice_pitch,size_t const dst_row_pitch,size_t const dst_slice_pitch)808 pocl_cuda_submit_copy_rect (CUstream stream,
809 cl_device_id dev,
810 void* src_ptr,
811 void* dst_ptr,
812 const size_t *__restrict__ const src_origin,
813 const size_t *__restrict__ const dst_origin,
814 const size_t *__restrict__ const region,
815 size_t const src_row_pitch,
816 size_t const src_slice_pitch,
817 size_t const dst_row_pitch,
818 size_t const dst_slice_pitch)
819 {
820 CUDA_MEMCPY3D params = { 0 };
821
822 POCL_MSG_PRINT_CUDA ("cuMemcpy3D / COPY_RECT %p -> %p \n", src_ptr, dst_ptr);
823
824 params.WidthInBytes = region[0];
825 params.Height = region[1];
826 params.Depth = region[2];
827
828 params.srcDevice = (CUdeviceptr)src_ptr;
829 params.srcXInBytes = src_origin[0];
830 params.srcY = src_origin[1];
831 params.srcZ = src_origin[2];
832 params.srcPitch = src_row_pitch;
833 params.srcHeight = src_slice_pitch / src_row_pitch;
834
835 params.dstDevice = (CUdeviceptr)dst_ptr;
836 params.dstXInBytes = dst_origin[0];
837 params.dstY = dst_origin[1];
838 params.dstZ = dst_origin[2];
839 params.dstPitch = dst_row_pitch;
840 params.dstHeight = dst_slice_pitch / dst_row_pitch;
841
842 params.srcMemoryType = params.dstMemoryType = CU_MEMORYTYPE_DEVICE;
843
844 CUresult result = cuMemcpy3DAsync (¶ms, stream);
845 CUDA_CHECK (result, "cuMemcpy3DAsync");
846 }
847
848 void
pocl_cuda_submit_map_mem(CUstream stream,cl_mem mem,pocl_mem_identifier * p,size_t offset,size_t size,void * host_ptr)849 pocl_cuda_submit_map_mem (CUstream stream, cl_mem mem,
850 pocl_mem_identifier *p,
851 size_t offset, size_t size, void *host_ptr)
852 {
853 assert (host_ptr != NULL);
854
855 if ((mem->flags & CL_MEM_USE_HOST_PTR)
856 || (p->extra_ptr))
857 return;
858
859 POCL_MSG_PRINT_CUDA ("cuMemcpyDtoHAsync %p / %zu B \n", host_ptr, size);
860
861 void *buf_ptr = p->mem_ptr;
862
863 CUresult result = cuMemcpyDtoHAsync (
864 host_ptr, (CUdeviceptr) (buf_ptr + offset), size, stream);
865 CUDA_CHECK (result, "cuMemcpyDtoHAsync");
866 }
867
868 void *
pocl_cuda_submit_unmap_mem(CUstream stream,pocl_mem_identifier * dst_mem_id,size_t offset,size_t size,void * host_ptr,cl_map_flags map_flags)869 pocl_cuda_submit_unmap_mem (CUstream stream, pocl_mem_identifier *dst_mem_id,
870 size_t offset, size_t size, void *host_ptr,
871 cl_map_flags map_flags)
872 {
873 /* Only copy back if mapped for writing */
874 if (map_flags == CL_MAP_READ)
875 return NULL;
876
877 if (host_ptr)
878 {
879 CUresult result = cuMemcpyHtoDAsync (
880 (CUdeviceptr) (dst_mem_id->mem_ptr + offset), host_ptr, size, stream);
881 CUDA_CHECK (result, "cuMemcpyHtoDAsync");
882 }
883 return NULL;
884 }
885
886 static pocl_cuda_kernel_data_t *
load_or_generate_kernel(cl_kernel kernel,cl_device_id device,int has_offsets,unsigned device_i,_cl_command_node * command,int specialized)887 load_or_generate_kernel (cl_kernel kernel, cl_device_id device,
888 int has_offsets, unsigned device_i,
889 _cl_command_node *command, int specialized)
890 {
891 CUresult result;
892 pocl_kernel_metadata_t *meta = kernel->meta;
893 /* Check if we already have a compiled kernel function */
894 pocl_cuda_kernel_data_t *kdata
895 = (pocl_cuda_kernel_data_t *)meta->data[device_i];
896 if (kdata)
897 {
898 if ((has_offsets && kdata->kernel_offsets)
899 || (!has_offsets && kdata->kernel))
900 return kdata;
901 }
902 else
903 {
904 /* TODO: when can we release this? */
905 kdata = meta->data[device_i]
906 = (void *)calloc (1, sizeof (pocl_cuda_kernel_data_t));
907 }
908
909 pocl_cuda_device_data_t *ddata = (pocl_cuda_device_data_t *)device->data;
910 cuCtxSetCurrent (ddata->context);
911
912 POCL_LOCK(ddata->compile_lock);
913
914 /* Generate the parallel bitcode file linked with the kernel library */
915 int error = pocl_llvm_generate_workgroup_function (device_i, device, kernel,
916 command, specialized);
917 if (error)
918 {
919 POCL_MSG_PRINT_GENERAL ("pocl_llvm_generate_workgroup_function() failed"
920 " for kernel %s\n", kernel->name);
921 assert (error == 0);
922 }
923
924 char bc_filename[POCL_FILENAME_LENGTH];
925 pocl_cache_work_group_function_path (bc_filename, kernel->program, device_i,
926 kernel, command, specialized);
927
928 char ptx_filename[POCL_FILENAME_LENGTH];
929 strcpy (ptx_filename, bc_filename);
930 strncat (ptx_filename, ".ptx", POCL_FILENAME_LENGTH - 1);
931
932 if (!pocl_exists (ptx_filename))
933 {
934 /* Generate PTX from LLVM bitcode */
935 if (pocl_ptx_gen (bc_filename, ptx_filename, kernel->name,
936 device->llvm_cpu,
937 ((pocl_cuda_device_data_t *)device->data)->libdevice,
938 has_offsets))
939 POCL_ABORT ("pocl-cuda: failed to generate PTX\n");
940 }
941
942 /* Load PTX module */
943 /* TODO: When can we unload the module? */
944 CUmodule module;
945 result = cuModuleLoad (&module, ptx_filename);
946 CUDA_CHECK (result, "cuModuleLoad");
947
948 /* Get kernel function */
949 CUfunction function;
950 result = cuModuleGetFunction (&function, module, kernel->name);
951 CUDA_CHECK (result, "cuModuleGetFunction");
952
953 /* Get pointer aligment */
954 if (!kdata->alignments)
955 {
956 kdata->alignments
957 = calloc (meta->num_args + meta->num_locals + 4, sizeof (size_t));
958 pocl_cuda_get_ptr_arg_alignment (bc_filename, kernel->name,
959 kdata->alignments);
960 }
961
962 if (has_offsets)
963 {
964 kdata->module_offsets = module;
965 kdata->kernel_offsets = function;
966 }
967 else
968 {
969 kdata->module = module;
970 kdata->kernel = function;
971 }
972
973 POCL_UNLOCK (ddata->compile_lock);
974
975 return kdata;
976 }
977
978 void
pocl_cuda_compile_kernel(_cl_command_node * cmd,cl_kernel kernel,cl_device_id device,int specialize)979 pocl_cuda_compile_kernel (_cl_command_node *cmd, cl_kernel kernel,
980 cl_device_id device, int specialize)
981 {
982 load_or_generate_kernel (kernel, device, 0, cmd->device_i, cmd,
983 specialize);
984 }
985
986 void
pocl_cuda_submit_kernel(CUstream stream,_cl_command_node * cmd,cl_device_id device,cl_event event)987 pocl_cuda_submit_kernel (CUstream stream, _cl_command_node *cmd,
988 cl_device_id device, cl_event event)
989 {
990 _cl_command_run run = cmd->command.run;
991 cl_kernel kernel = run.kernel;
992 pocl_argument *arguments = run.arguments;
993 struct pocl_context pc = run.pc;
994 pocl_kernel_metadata_t *meta = kernel->meta;
995
996 /* Check if we need to handle global work offsets */
997 int has_offsets =
998 (pc.global_offset[0] || pc.global_offset[1] || pc.global_offset[2]);
999
1000 /* Get kernel function */
1001 pocl_cuda_kernel_data_t *kdata = load_or_generate_kernel (
1002 kernel, device, has_offsets, cmd->device_i, cmd, 1);
1003 CUmodule module = has_offsets ? kdata->module_offsets : kdata->module;
1004 CUfunction function = has_offsets ? kdata->kernel_offsets : kdata->kernel;
1005
1006 /* Prepare kernel arguments */
1007 void *null = NULL;
1008 unsigned sharedMemBytes = 0;
1009 void *params[meta->num_args + meta->num_locals + 4];
1010 unsigned sharedMemOffsets[meta->num_args + meta->num_locals];
1011 unsigned constantMemBytes = 0;
1012 unsigned constantMemOffsets[meta->num_args];
1013 unsigned globalOffsets[3];
1014
1015 /* Get handle to constant memory buffer */
1016 size_t constant_mem_size;
1017 CUdeviceptr constant_mem_base = 0;
1018 cuModuleGetGlobal (&constant_mem_base, &constant_mem_size, module,
1019 "_constant_memory_region_");
1020
1021 CUresult result;
1022 unsigned i;
1023 for (i = 0; i < meta->num_args; i++)
1024 {
1025 pocl_argument_type type = meta->arg_info[i].type;
1026 switch (type)
1027 {
1028 case POCL_ARG_TYPE_NONE:
1029 params[i] = arguments[i].value;
1030 break;
1031 case POCL_ARG_TYPE_POINTER:
1032 {
1033 if (ARG_IS_LOCAL (meta->arg_info[i]))
1034 {
1035 size_t size = arguments[i].size;
1036 size_t align = kdata->alignments[i];
1037
1038 /* Pad offset to align memory */
1039 if (sharedMemBytes % align)
1040 sharedMemBytes += align - (sharedMemBytes % align);
1041
1042 sharedMemOffsets[i] = sharedMemBytes;
1043 params[i] = sharedMemOffsets + i;
1044
1045 sharedMemBytes += size;
1046 }
1047 else if (meta->arg_info[i].address_qualifier
1048 == CL_KERNEL_ARG_ADDRESS_CONSTANT)
1049 {
1050 assert (constant_mem_base);
1051 assert (arguments[i].is_svm == 0);
1052
1053 /* Get device pointer */
1054 cl_mem mem = *(void **)arguments[i].value;
1055 CUdeviceptr src
1056 = (CUdeviceptr)mem->device_ptrs[device->global_mem_id].mem_ptr
1057 + arguments[i].offset;
1058
1059 size_t align = kdata->alignments[i];
1060 if (constantMemBytes % align)
1061 {
1062 constantMemBytes += align - (constantMemBytes % align);
1063 }
1064
1065 /* Copy to constant buffer at current offset */
1066 result
1067 = cuMemcpyDtoDAsync (constant_mem_base + constantMemBytes,
1068 src, mem->size, stream);
1069 CUDA_CHECK (result, "cuMemcpyDtoDAsync");
1070
1071 constantMemOffsets[i] = constantMemBytes;
1072 params[i] = constantMemOffsets + i;
1073
1074 constantMemBytes += mem->size;
1075 }
1076 else
1077 {
1078 assert (arguments[i].is_svm == 0);
1079 if (arguments[i].value)
1080 {
1081 cl_mem mem = *(void **)arguments[i].value;
1082 params[i] = &mem->device_ptrs[device->global_mem_id].mem_ptr
1083 + arguments[i].offset;
1084
1085 #if defined __arm__
1086 /* On ARM with USE_HOST_PTR, perform explicit copy to
1087 * device */
1088 if (mem->flags & CL_MEM_USE_HOST_PTR)
1089 {
1090 cuMemcpyHtoD (*(CUdeviceptr *)(params[i]),
1091 mem->mem_host_ptr, mem->size);
1092 cuStreamSynchronize (0);
1093 }
1094 #endif
1095 }
1096 else
1097 {
1098 params[i] = &null;
1099 }
1100 }
1101 break;
1102 }
1103 case POCL_ARG_TYPE_IMAGE:
1104 case POCL_ARG_TYPE_SAMPLER:
1105 POCL_ABORT ("Unhandled argument type for CUDA\n");
1106 break;
1107 }
1108 }
1109
1110 if (constantMemBytes > constant_mem_size)
1111 POCL_ABORT ("[CUDA] Total constant buffer size %u exceeds %lu allocated\n",
1112 constantMemBytes, constant_mem_size);
1113
1114 unsigned arg_index = meta->num_args;
1115
1116 if (sharedMemBytes != 0)
1117 {
1118 /* Deal with automatic local allocations if there are local function args
1119 */
1120 /* TODO: Would be better to remove arguments and make these static GEPs
1121 */
1122 for (i = 0; i < meta->num_locals; ++i, ++arg_index)
1123 {
1124 size_t size = meta->local_sizes[i];
1125 size_t align = kdata->alignments[arg_index];
1126
1127 /* Pad offset to align memory */
1128 if (sharedMemBytes % align)
1129 sharedMemBytes += align - (sharedMemBytes % align);
1130
1131 sharedMemOffsets[arg_index] = sharedMemBytes;
1132 sharedMemBytes += size;
1133 params[arg_index] = sharedMemOffsets + arg_index;
1134 }
1135 }
1136
1137 /* Add global work dimensionality */
1138 params[arg_index++] = &pc.work_dim;
1139
1140 /* Add global offsets if necessary */
1141 if (has_offsets)
1142 {
1143 globalOffsets[0] = pc.global_offset[0];
1144 globalOffsets[1] = pc.global_offset[1];
1145 globalOffsets[2] = pc.global_offset[2];
1146 params[arg_index++] = globalOffsets + 0;
1147 params[arg_index++] = globalOffsets + 1;
1148 params[arg_index++] = globalOffsets + 2;
1149 }
1150
1151 /* Launch kernel */
1152 result = cuLaunchKernel (function, pc.num_groups[0], pc.num_groups[1],
1153 pc.num_groups[2], pc.local_size[0],
1154 pc.local_size[1], pc.local_size[2], sharedMemBytes,
1155 stream, params, NULL);
1156 CUDA_CHECK (result, "cuLaunchKernel");
1157 }
1158
1159 void
pocl_cuda_submit_node(_cl_command_node * node,cl_command_queue cq,int locked)1160 pocl_cuda_submit_node (_cl_command_node *node, cl_command_queue cq, int locked)
1161 {
1162 CUresult result;
1163 CUstream stream = ((pocl_cuda_queue_data_t *)cq->data)->stream;
1164
1165 if (!locked)
1166 POCL_LOCK_OBJ (node->event);
1167
1168 pocl_cuda_event_data_t *event_data
1169 = (pocl_cuda_event_data_t *)node->event->data;
1170
1171 /* Process event dependencies */
1172 event_node *dep = NULL;
1173 LL_FOREACH (node->event->wait_list, dep)
1174 {
1175 /* If it is in the process of completing, just skip it */
1176 if (dep->event->status <= CL_COMPLETE)
1177 continue;
1178
1179 /* Add CUDA event dependency */
1180 if (dep->event->command_type != CL_COMMAND_USER
1181 && dep->event->queue->device->ops == cq->device->ops)
1182 {
1183 /* Block stream on event, but only for different queues */
1184 if (dep->event->queue != node->event->queue)
1185 {
1186 pocl_cuda_event_data_t *dep_data
1187 = (pocl_cuda_event_data_t *)dep->event->data;
1188
1189 /* Wait until dependency has finished being submitted */
1190 while (!dep_data->events_ready)
1191 ;
1192
1193 result = cuStreamWaitEvent (stream, dep_data->end, 0);
1194 CUDA_CHECK (result, "cuStreamWaitEvent");
1195 }
1196 }
1197 else
1198 {
1199 if (!((pocl_cuda_queue_data_t *)cq->data)->use_threads)
1200 POCL_ABORT (
1201 "Can't handle non-CUDA dependencies without queue threads\n");
1202
1203 event_data->num_ext_events++;
1204 }
1205 }
1206
1207 /* Wait on flag for external events */
1208 if (event_data->num_ext_events)
1209 {
1210 CUdeviceptr dev_ext_event_flag;
1211 result = cuMemHostAlloc ((void **)&event_data->ext_event_flag, 4,
1212 CU_MEMHOSTALLOC_DEVICEMAP);
1213 CUDA_CHECK (result, "cuMemAllocHost");
1214
1215 *event_data->ext_event_flag = 0;
1216
1217 result = cuMemHostGetDevicePointer (&dev_ext_event_flag,
1218 event_data->ext_event_flag, 0);
1219 CUDA_CHECK (result, "cuMemHostGetDevicePointer");
1220 result = cuStreamWaitValue32 (stream, dev_ext_event_flag, 1,
1221 CU_STREAM_WAIT_VALUE_GEQ);
1222 CUDA_CHECK (result, "cuStreamWaitValue32");
1223 }
1224
1225 /* Create and record event for command start if profiling enabled */
1226 if (cq->properties & CL_QUEUE_PROFILING_ENABLE)
1227 {
1228 result = cuEventCreate (&event_data->start, CU_EVENT_DEFAULT);
1229 CUDA_CHECK (result, "cuEventCreate");
1230 result = cuEventRecord (event_data->start, stream);
1231 CUDA_CHECK (result, "cuEventRecord");
1232 }
1233
1234 pocl_update_event_submitted (node->event);
1235
1236 POCL_UNLOCK_OBJ (node->event);
1237
1238 cl_event event = node->event;
1239 cl_device_id dev = node->device;
1240 _cl_command_t *cmd = &node->command;
1241
1242 switch (node->type)
1243 {
1244 case CL_COMMAND_READ_BUFFER:
1245 pocl_cuda_submit_read (
1246 stream, cmd->read.dst_host_ptr, cmd->read.src_mem_id->mem_ptr,
1247 node->command.read.offset, node->command.read.size);
1248 break;
1249 case CL_COMMAND_WRITE_BUFFER:
1250 pocl_cuda_submit_write (
1251 stream, cmd->write.src_host_ptr, cmd->write.dst_mem_id->mem_ptr,
1252 node->command.write.offset, node->command.write.size);
1253 break;
1254 case CL_COMMAND_COPY_BUFFER:
1255 {
1256 pocl_cuda_submit_copy (
1257 stream, cmd->copy.src_mem_id->mem_ptr, cmd->copy.src_offset,
1258 cmd->copy.dst_mem_id->mem_ptr, cmd->copy.dst_offset, cmd->copy.size);
1259 break;
1260 }
1261 case CL_COMMAND_READ_BUFFER_RECT:
1262 pocl_cuda_submit_read_rect (
1263 stream,
1264 cmd->read_rect.dst_host_ptr,
1265 cmd->read_rect.src_mem_id->mem_ptr,
1266 cmd->read_rect.buffer_origin,
1267 cmd->read_rect.host_origin,
1268 cmd->read_rect.region,
1269 cmd->read_rect.buffer_row_pitch,
1270 cmd->read_rect.buffer_slice_pitch,
1271 cmd->read_rect.host_row_pitch,
1272 cmd->read_rect.host_slice_pitch);
1273 break;
1274 case CL_COMMAND_WRITE_BUFFER_RECT:
1275 pocl_cuda_submit_write_rect (
1276 stream,
1277 cmd->write_rect.src_host_ptr,
1278 cmd->write_rect.dst_mem_id->mem_ptr,
1279 cmd->write_rect.buffer_origin,
1280 cmd->write_rect.host_origin,
1281 cmd->write_rect.region,
1282 cmd->read_rect.buffer_row_pitch,
1283 cmd->read_rect.buffer_slice_pitch,
1284 cmd->read_rect.host_row_pitch,
1285 cmd->read_rect.host_slice_pitch);
1286 break;
1287 case CL_COMMAND_COPY_BUFFER_RECT:
1288 {
1289 pocl_cuda_submit_copy_rect (
1290 stream, dev,
1291 cmd->copy_rect.src_mem_id->mem_ptr,
1292 cmd->copy_rect.dst_mem_id->mem_ptr,
1293 cmd->copy_rect.src_origin,
1294 cmd->copy_rect.dst_origin,
1295 cmd->copy_rect.region,
1296 cmd->copy_rect.src_row_pitch,
1297 cmd->copy_rect.src_slice_pitch,
1298 cmd->copy_rect.dst_row_pitch,
1299 cmd->copy_rect.dst_slice_pitch);
1300 break;
1301 }
1302 case CL_COMMAND_MAP_BUFFER:
1303 {
1304 cl_mem buffer = event->mem_objs[0];
1305 pocl_cuda_submit_map_mem (
1306 stream, buffer, cmd->map.mem_id, cmd->map.mapping->offset,
1307 cmd->map.mapping->size, cmd->map.mapping->host_ptr);
1308 break;
1309 }
1310 case CL_COMMAND_UNMAP_MEM_OBJECT:
1311 {
1312 cl_mem buffer = event->mem_objs[0];
1313 assert (buffer->is_image == CL_FALSE);
1314 pocl_cuda_submit_unmap_mem (
1315 stream,
1316 cmd->unmap.mem_id,
1317 cmd->unmap.mapping->offset,
1318 cmd->unmap.mapping->size,
1319 cmd->unmap.mapping->host_ptr,
1320 cmd->unmap.mapping->map_flags);
1321 break;
1322 }
1323 case CL_COMMAND_NDRANGE_KERNEL:
1324 pocl_cuda_submit_kernel (stream, node, node->device, node->event);
1325 break;
1326
1327 case CL_COMMAND_MIGRATE_MEM_OBJECTS:
1328 switch (cmd->migrate.type)
1329 {
1330 case ENQUEUE_MIGRATE_TYPE_D2H:
1331 {
1332 cl_mem mem = event->mem_objs[0];
1333 pocl_cuda_submit_read (stream, mem->mem_host_ptr,
1334 cmd->migrate.mem_id->mem_ptr, 0, mem->size);
1335 break;
1336 }
1337 case ENQUEUE_MIGRATE_TYPE_H2D:
1338 {
1339 cl_mem mem = event->mem_objs[0];
1340 pocl_cuda_submit_write (stream, mem->mem_host_ptr,
1341 cmd->migrate.mem_id->mem_ptr, 0,
1342 mem->size);
1343 break;
1344 }
1345 case ENQUEUE_MIGRATE_TYPE_D2D:
1346 {
1347 POCL_ABORT_UNIMPLEMENTED (
1348 "CUDA does not support D2D migration.\n");
1349 }
1350 case ENQUEUE_MIGRATE_TYPE_NOP:
1351 {
1352 break;
1353 }
1354 }
1355 break;
1356
1357 case CL_COMMAND_MARKER:
1358 case CL_COMMAND_BARRIER:
1359 break;
1360
1361 case CL_COMMAND_FILL_BUFFER:
1362 pocl_cuda_submit_memfill (stream, cmd->memfill.dst_mem_id->mem_ptr,
1363 cmd->memfill.size, cmd->memfill.offset,
1364 cmd->memfill.pattern,
1365 cmd->memfill.pattern_size);
1366 break;
1367 case CL_COMMAND_READ_IMAGE:
1368 case CL_COMMAND_WRITE_IMAGE:
1369 case CL_COMMAND_COPY_IMAGE:
1370 case CL_COMMAND_COPY_BUFFER_TO_IMAGE:
1371 case CL_COMMAND_COPY_IMAGE_TO_BUFFER:
1372 case CL_COMMAND_FILL_IMAGE:
1373 case CL_COMMAND_MAP_IMAGE:
1374 case CL_COMMAND_NATIVE_KERNEL:
1375 case CL_COMMAND_SVM_FREE:
1376 case CL_COMMAND_SVM_MAP:
1377 case CL_COMMAND_SVM_UNMAP:
1378 case CL_COMMAND_SVM_MEMCPY:
1379 case CL_COMMAND_SVM_MEMFILL:
1380 default:
1381 POCL_ABORT_UNIMPLEMENTED (pocl_command_to_str (node->type));
1382 break;
1383 }
1384
1385 /* Create and record event for command end */
1386 if (cq->properties & CL_QUEUE_PROFILING_ENABLE)
1387 result = cuEventCreate (&event_data->end, CU_EVENT_DEFAULT);
1388 else
1389 result = cuEventCreate (&event_data->end, CU_EVENT_DISABLE_TIMING);
1390 CUDA_CHECK (result, "cuEventCreate");
1391 result = cuEventRecord (event_data->end, stream);
1392 CUDA_CHECK (result, "cuEventRecord");
1393
1394 event_data->events_ready = 1;
1395 }
1396
1397 void
pocl_cuda_submit(_cl_command_node * node,cl_command_queue cq)1398 pocl_cuda_submit (_cl_command_node *node, cl_command_queue cq)
1399 {
1400 /* Allocate CUDA event data */
1401 pocl_cuda_event_data_t *p
1402 = (pocl_cuda_event_data_t *)calloc (1, sizeof (pocl_cuda_event_data_t));
1403 node->event->data = p;
1404
1405 if (((pocl_cuda_queue_data_t *)cq->data)->use_threads)
1406 {
1407
1408 pthread_cond_init (&p->event_cond, NULL);
1409 /* Add command to work queue */
1410 POCL_UNLOCK_OBJ (node->event);
1411 pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)cq->data;
1412 pthread_mutex_lock (&queue_data->lock);
1413 DL_APPEND (queue_data->pending_queue, node);
1414 pthread_cond_signal (&queue_data->pending_cond);
1415 pthread_mutex_unlock (&queue_data->lock);
1416 }
1417 else
1418 {
1419 /* Submit command in this thread */
1420 cuCtxSetCurrent (((pocl_cuda_device_data_t *)cq->device->data)->context);
1421 pocl_cuda_submit_node (node, cq, 1);
1422 }
1423 }
1424
1425 void
pocl_cuda_notify(cl_device_id device,cl_event event,cl_event finished)1426 pocl_cuda_notify (cl_device_id device, cl_event event, cl_event finished)
1427 {
1428 /* Ignore CUDA device events, we've already handled these dependencies */
1429 if (finished->queue && finished->queue->device->ops == device->ops)
1430 return;
1431
1432 if (event->status == CL_QUEUED)
1433 return;
1434
1435 pocl_cuda_event_data_t *event_data = (pocl_cuda_event_data_t *)event->data;
1436
1437 assert (event_data);
1438 assert (event_data->num_ext_events > 0);
1439 assert (event_data->ext_event_flag);
1440
1441 /* If dependency failed, so should we */
1442 /* TODO: This isn't true if this is an implicit dependency */
1443 if (finished->status < 0)
1444 event->status = -1;
1445
1446 /* Decrement external event counter */
1447 /* Trigger flag if none left */
1448 if (!--event_data->num_ext_events)
1449 *event_data->ext_event_flag = 1;
1450 }
1451
void
pocl_cuda_flush (cl_device_id device, cl_command_queue cq)
{
  /* Commands are pushed to the CUDA stream as they are submitted, so there
   * appears to be no batched work to flush here. */
  /* TODO: Something here? */
}
1457
1458 void
pocl_cuda_finalize_command(cl_device_id device,cl_event event)1459 pocl_cuda_finalize_command (cl_device_id device, cl_event event)
1460 {
1461 CUresult result;
1462 pocl_cuda_event_data_t *event_data = (pocl_cuda_event_data_t *)event->data;
1463
1464 /* Wait for command to finish */
1465 cuCtxSetCurrent (((pocl_cuda_device_data_t *)device->data)->context);
1466 result = cuEventSynchronize (event_data->end);
1467 CUDA_CHECK (result, "cuEventSynchronize");
1468
1469 /* Clean up mapped memory allocations */
1470 if (event->command_type == CL_COMMAND_UNMAP_MEM_OBJECT)
1471 {
1472 pocl_unmap_command_finished2 (event, &event->command->command);
1473 }
1474
1475 if (event->command_type == CL_COMMAND_NDRANGE_KERNEL
1476 || event->command_type == CL_COMMAND_TASK)
1477 {
1478 #if defined __arm__
1479 /* On ARM with USE_HOST_PTR, perform explict copies back from device */
1480 cl_kernel kernel = event->command.run.kernel;
1481 pocl_argument *arguments = event->command.run.arguments;
1482 unsigned i;
1483 for (i = 0; i < meta->num_args; i++)
1484 {
1485 if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
1486 {
1487 if (!ARG_IS_LOCAL (meta->arg_info[i]) && arguments[i].value)
1488 {
1489 cl_mem mem = *(void **)arguments[i].value;
1490 if (mem->flags & CL_MEM_USE_HOST_PTR)
1491 {
1492 CUdeviceptr ptr
1493 = (CUdeviceptr)mem->device_ptrs[device->global_mem_id]
1494 .mem_ptr;
1495 cuMemcpyDtoH (mem->mem_host_ptr, ptr, mem->size);
1496 cuStreamSynchronize (0);
1497 }
1498 }
1499 }
1500 }
1501 #endif
1502
1503 pocl_ndrange_node_cleanup (event->command);
1504 }
1505 else
1506 {
1507 pocl_mem_manager_free_command (event->command);
1508 }
1509
1510 /* Handle failed events */
1511
1512
1513 pocl_update_event_running (event);
1514 if (event->status < 0)
1515 pocl_update_event_failed (event);
1516 else
1517 POCL_UPDATE_EVENT_COMPLETE_MSG (event, "CUDA event");
1518 }
1519
1520 void
pocl_cuda_update_event(cl_device_id device,cl_event event)1521 pocl_cuda_update_event (cl_device_id device, cl_event event)
1522 {
1523 if ((event->status == CL_COMPLETE)
1524 && (event->queue->properties & CL_QUEUE_PROFILING_ENABLE))
1525 {
1526 /* Update timing info with CUDA event timers if profiling enabled */
1527 /* CUDA doesn't provide a way to get event timestamps directly,
1528 * only the elapsed time between two events. We use the elapsed
1529 * time from the epoch event enqueued on device creation to get
1530 * the actual timestamps.
1531 *
1532 * Since the CUDA timer resolution is lower than the host timer,
1533 * this can sometimes result in the start time being before the
1534 * submit time, so we use max() to ensure the timestamps are
1535 * sane. */
1536
1537 float diff;
1538 CUresult result;
1539 pocl_cuda_event_data_t *event_data
1540 = (pocl_cuda_event_data_t *)event->data;
1541 cl_ulong epoch = ((pocl_cuda_device_data_t *)device->data)->epoch;
1542
1543 result = cuEventElapsedTime (
1544 &diff, ((pocl_cuda_device_data_t *)device->data)->epoch_event,
1545 event_data->start);
1546 CUDA_CHECK (result, "cuEventElapsedTime");
1547 event->time_start = (cl_ulong) (epoch + diff * 1e6);
1548 event->time_start = max (event->time_start, epoch + 1);
1549
1550 result = cuEventElapsedTime (
1551 &diff, ((pocl_cuda_device_data_t *)device->data)->epoch_event,
1552 event_data->end);
1553 CUDA_CHECK (result, "cuEventElapsedTime");
1554 event->time_end = (cl_ulong) (epoch + diff * 1e6);
1555 event->time_end = max (event->time_end, event->time_start + 1);
1556 }
1557 }
1558
1559 void
pocl_cuda_wait_event_recurse(cl_device_id device,cl_event event)1560 pocl_cuda_wait_event_recurse (cl_device_id device, cl_event event)
1561 {
1562 while (event->wait_list)
1563 pocl_cuda_wait_event_recurse (device, event->wait_list->event);
1564
1565 if (event->status > CL_COMPLETE)
1566 pocl_cuda_finalize_command (device, event);
1567 }
1568
1569 void
pocl_cuda_notify_event_finished(cl_event event)1570 pocl_cuda_notify_event_finished (cl_event event)
1571 {
1572 pocl_cuda_event_data_t *e_d = (pocl_cuda_event_data_t *)event->data;
1573
1574 if (((pocl_cuda_queue_data_t *)event->queue->data)->use_threads)
1575 pthread_cond_broadcast (&e_d->event_cond);
1576 }
1577
1578 void
pocl_cuda_wait_event(cl_device_id device,cl_event event)1579 pocl_cuda_wait_event (cl_device_id device, cl_event event)
1580 {
1581 pocl_cuda_event_data_t *e_d = (pocl_cuda_event_data_t *)event->data;
1582
1583 if (((pocl_cuda_queue_data_t *)event->queue->data)->use_threads)
1584 {
1585 /* Wait until background thread marks command as complete */
1586 POCL_LOCK_OBJ (event);
1587 while (event->status > CL_COMPLETE)
1588 {
1589 pthread_cond_wait (&e_d->event_cond, &event->pocl_lock);
1590 }
1591 POCL_UNLOCK_OBJ (event);
1592 }
1593 else
1594 {
1595 /* Recursively finalize commands in this thread */
1596 pocl_cuda_wait_event_recurse (device, event);
1597 }
1598 }
1599
/* Releases all CUDA resources attached to EVENT's driver data. */
void
pocl_cuda_free_event_data (cl_event event)
{
  if (event->data)
    {
      pocl_cuda_event_data_t *event_data
          = (pocl_cuda_event_data_t *)event->data;
      /* NOTE(review): event_cond is only pthread_cond_init'd for threaded
       * queues (see pocl_cuda_submit); destroying the calloc-zeroed cond
       * here relies on zero bytes being a valid condition variable —
       * TODO confirm this holds on all supported platforms. */
      pthread_cond_destroy (&event_data->event_cond);
      /* The start event only exists when profiling was enabled at submit
       * time (see pocl_cuda_submit_node). */
      if (event->queue->properties & CL_QUEUE_PROFILING_ENABLE)
        cuEventDestroy (event_data->start);
      cuEventDestroy (event_data->end);
      /* Host flag allocated with cuMemHostAlloc for external-event waits. */
      if (event_data->ext_event_flag)
        {
          CUresult result = cuMemFreeHost (event_data->ext_event_flag);
          CUDA_CHECK (result, "cuMemFreeHost");
        }
      free (event->data);
    }
}
1619
1620 void
pocl_cuda_join(cl_device_id device,cl_command_queue cq)1621 pocl_cuda_join (cl_device_id device, cl_command_queue cq)
1622 {
1623 /* Grab event at end of queue */
1624 POCL_LOCK_OBJ (cq);
1625 cl_event event = cq->last_event.event;
1626 if (!event)
1627 {
1628 POCL_UNLOCK_OBJ (cq);
1629 return;
1630 }
1631 POname (clRetainEvent) (event);
1632 POCL_UNLOCK_OBJ (cq);
1633
1634 pocl_cuda_wait_event (device, event);
1635
1636 POname (clReleaseEvent) (event);
1637 }
1638
/* Background submit thread: pops commands from the queue's pending list,
 * issues each to the CUDA stream via pocl_cuda_submit_node, then moves it
 * to the running list for the finalize thread.  Exits once the queue is
 * released (queue_data->queue becomes NULL). */
void *
pocl_cuda_submit_thread (void *data)
{
  pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)data;

  cl_command_queue queue = queue_data->queue;
  if (queue)
    cuCtxSetCurrent (
        ((pocl_cuda_device_data_t *)queue->device->data)->context);
  else
    /* This queue has already been released */
    return NULL;

  while (1)
    {
      /* Attempt to get next command from work queue */
      _cl_command_node *node = NULL;
      pthread_mutex_lock (&queue_data->lock);
      if (!queue_data->queue)
        {
          /* Queue released while we held the lock: shut down. */
          pthread_mutex_unlock (&queue_data->lock);
          break;
        }
      if (!queue_data->pending_queue)
        {
          /* Sleep until pocl_cuda_submit signals new work (or release). */
          pthread_cond_wait (&queue_data->pending_cond, &queue_data->lock);
        }
      /* Re-check: the wait may return without the list being non-empty. */
      if (queue_data->pending_queue)
        {
          node = queue_data->pending_queue;
          DL_DELETE (queue_data->pending_queue, node);
        }
      pthread_mutex_unlock (&queue_data->lock);

      /* Submit command, if we found one */
      if (node)
        {
          pocl_cuda_submit_node (node, queue_data->queue, 0);

          /* Add command to running queue */
          pthread_mutex_lock (&queue_data->lock);
          DL_APPEND (queue_data->running_queue, node);
          pthread_cond_signal (&queue_data->running_cond);
          pthread_mutex_unlock (&queue_data->lock);
        }
    }

  return NULL;
}
1688
/* Background finalize thread: pops commands from the queue's running list
 * (populated by pocl_cuda_submit_thread) and blocks until each finishes on
 * the device, completing its event.  Exits once the queue is released. */
void *
pocl_cuda_finalize_thread (void *data)
{
  pocl_cuda_queue_data_t *queue_data = (pocl_cuda_queue_data_t *)data;

  cl_command_queue queue = queue_data->queue;
  if (queue)
    cuCtxSetCurrent (
        ((pocl_cuda_device_data_t *)queue->device->data)->context);
  else
    /* This queue has already been released */
    return NULL;

  while (1)
    {
      /* Attempt to get next node from running queue */
      _cl_command_node *node = NULL;
      pthread_mutex_lock (&queue_data->lock);
      if (!queue_data->queue)
        {
          /* Queue released while we held the lock: shut down. */
          pthread_mutex_unlock (&queue_data->lock);
          break;
        }
      if (!queue_data->running_queue)
        {
          /* Sleep until the submit thread signals newly running work. */
          pthread_cond_wait (&queue_data->running_cond, &queue_data->lock);
        }
      /* Re-check: the wait may return without the list being non-empty. */
      if (queue_data->running_queue)
        {
          node = queue_data->running_queue;
          DL_DELETE (queue_data->running_queue, node);
        }
      pthread_mutex_unlock (&queue_data->lock);

      /* Wait for command to finish, if we found one */
      if (node)
        pocl_cuda_finalize_command (queue->device, node->event);
    }

  return NULL;
}
1730
pocl_cuda_init_build(void * data)1731 char* pocl_cuda_init_build(void *data)
1732 {
1733 #ifdef LLVM_OLDER_THAN_7_0
1734 return strdup("");
1735 #else
1736 return strdup("-mllvm --nvptx-short-ptr");
1737 #endif
1738 }
1739