1 /* basic.c - a minimalistic single core pocl device driver layer implementation
2
3 Copyright (c) 2011-2013 Universidad Rey Juan Carlos and
4 2011-2020 Pekka Jääskeläinen
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to
8 deal in the Software without restriction, including without limitation the
9 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 sell copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 IN THE SOFTWARE.
23 */
24
25 #include "basic.h"
26 #include "common.h"
27 #include "config.h"
28 #include "config2.h"
29 #include "cpuinfo.h"
30 #include "devices.h"
31 #include "pocl_local_size.h"
32 #include "pocl_util.h"
33 #include "topology/pocl_topology.h"
34 #include "utlist.h"
35
36 #include <assert.h>
37 #include <limits.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <utlist.h>
42
43 #include "pocl_cache.h"
44 #include "pocl_file_util.h"
45 #include "pocl_mem_management.h"
46 #include "pocl_timing.h"
47 #include "pocl_workgroup_func.h"
48
49 #include "common_driver.h"
50
51 #ifdef ENABLE_LLVM
52 #include "pocl_llvm.h"
53 #endif
54
55 struct data {
56 /* List of commands ready to be executed */
57 _cl_command_node *ready_list;
58 /* List of commands not yet ready to be executed */
59 _cl_command_node *command_list;
60 /* Lock for command list related operations */
61 pocl_lock_t cq_lock;
62
63 /* Currently loaded kernel. */
64 cl_kernel current_kernel;
65
66 /* printf buffer */
67 void *printf_buffer;
68 };
69
70 void
pocl_basic_init_device_ops(struct pocl_device_ops * ops)71 pocl_basic_init_device_ops(struct pocl_device_ops *ops)
72 {
73 ops->device_name = "basic";
74
75 ops->probe = pocl_basic_probe;
76 ops->uninit = pocl_basic_uninit;
77 ops->reinit = pocl_basic_reinit;
78 ops->init = pocl_basic_init;
79
80 ops->alloc_mem_obj = pocl_basic_alloc_mem_obj;
81 ops->free = pocl_basic_free;
82
83 ops->read = pocl_driver_read;
84 ops->read_rect = pocl_driver_read_rect;
85 ops->write = pocl_driver_write;
86 ops->write_rect = pocl_driver_write_rect;
87 ops->copy = pocl_driver_copy;
88 ops->copy_with_size = pocl_driver_copy_with_size;
89 ops->copy_rect = pocl_driver_copy_rect;
90 ops->memfill = pocl_driver_memfill;
91 ops->map_mem = pocl_driver_map_mem;
92 ops->unmap_mem = pocl_driver_unmap_mem;
93 ops->get_mapping_ptr = pocl_driver_get_mapping_ptr;
94 ops->free_mapping_ptr = pocl_driver_free_mapping_ptr;
95
96 ops->can_migrate_d2d = NULL;
97 ops->migrate_d2d = NULL;
98
99 ops->run = pocl_basic_run;
100 ops->run_native = pocl_basic_run_native;
101
102 ops->build_source = pocl_driver_build_source;
103 ops->link_program = pocl_driver_link_program;
104 ops->build_binary = pocl_driver_build_binary;
105 ops->free_program = pocl_driver_free_program;
106 ops->setup_metadata = pocl_driver_setup_metadata;
107 ops->supports_binary = pocl_driver_supports_binary;
108 ops->build_poclbinary = pocl_driver_build_poclbinary;
109 ops->compile_kernel = pocl_basic_compile_kernel;
110
111 ops->join = pocl_basic_join;
112 ops->submit = pocl_basic_submit;
113 ops->broadcast = pocl_broadcast;
114 ops->notify = pocl_basic_notify;
115 ops->flush = pocl_basic_flush;
116 ops->build_hash = pocl_basic_build_hash;
117 ops->compute_local_size = pocl_default_local_size_optimizer;
118
119 ops->get_device_info_ext = NULL;
120
121 ops->svm_free = pocl_basic_svm_free;
122 ops->svm_alloc = pocl_basic_svm_alloc;
123 /* no need to implement these two as they're noop
124 * and pocl_exec_command takes care of it */
125 ops->svm_map = NULL;
126 ops->svm_unmap = NULL;
127 ops->svm_copy = pocl_basic_svm_copy;
128 ops->svm_fill = pocl_basic_svm_fill;
129
130 ops->create_kernel = NULL;
131 ops->free_kernel = NULL;
132 ops->create_sampler = NULL;
133 ops->free_sampler = NULL;
134 ops->copy_image_rect = pocl_basic_copy_image_rect;
135 ops->write_image_rect = pocl_basic_write_image_rect;
136 ops->read_image_rect = pocl_basic_read_image_rect;
137 ops->map_image = pocl_basic_map_image;
138 ops->unmap_image = pocl_basic_unmap_image;
139 ops->fill_image = pocl_basic_fill_image;
140 }
141
142 char *
pocl_basic_build_hash(cl_device_id device)143 pocl_basic_build_hash (cl_device_id device)
144 {
145 char* res = calloc(1000, sizeof(char));
146 #ifdef KERNELLIB_HOST_DISTRO_VARIANTS
147 char *name = get_llvm_cpu_name ();
148 snprintf (res, 1000, "basic-%s-%s", HOST_DEVICE_BUILD_HASH, name);
149 POCL_MEM_FREE (name);
150 #else
151 snprintf (res, 1000, "basic-%s", HOST_DEVICE_BUILD_HASH);
152 #endif
153 return res;
154 }
155
156 unsigned int
pocl_basic_probe(struct pocl_device_ops * ops)157 pocl_basic_probe(struct pocl_device_ops *ops)
158 {
159 int env_count = pocl_device_get_env_count(ops->device_name);
160
161 /* No env specified, so pthread will be used instead of basic */
162 if(env_count < 0)
163 return 0;
164
165 return env_count;
166 }
167
168 cl_int
pocl_basic_init(unsigned j,cl_device_id device,const char * parameters)169 pocl_basic_init (unsigned j, cl_device_id device, const char* parameters)
170 {
171 struct data *d;
172 cl_int ret = CL_SUCCESS;
173 int err;
174 static int first_basic_init = 1;
175
176 if (first_basic_init)
177 {
178 POCL_MSG_WARN ("INIT dlcache DOTO delete\n");
179 pocl_init_dlhandle_cache();
180 first_basic_init = 0;
181 }
182
183 d = (struct data *) calloc (1, sizeof (struct data));
184 if (d == NULL)
185 return CL_OUT_OF_HOST_MEMORY;
186
187 d->current_kernel = NULL;
188 device->data = d;
189
190 pocl_init_default_device_infos (device);
191 /* 0 is the host memory shared with all drivers that use it */
192 device->global_mem_id = 0;
193 device->extensions = HOST_DEVICE_EXTENSIONS;
194
195 /* full memory consistency model for atomic memory and fence operations
196 except CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES. see
197 https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#opencl-3.0-backwards-compatibility*/
198 device->atomic_memory_capabilities = CL_DEVICE_ATOMIC_ORDER_RELAXED
199 | CL_DEVICE_ATOMIC_ORDER_ACQ_REL
200 | CL_DEVICE_ATOMIC_ORDER_SEQ_CST
201 | CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP
202 | CL_DEVICE_ATOMIC_SCOPE_DEVICE;
203 device->atomic_fence_capabilities = CL_DEVICE_ATOMIC_ORDER_RELAXED
204 | CL_DEVICE_ATOMIC_ORDER_ACQ_REL
205 | CL_DEVICE_ATOMIC_ORDER_SEQ_CST
206 | CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM
207 | CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP
208 | CL_DEVICE_ATOMIC_SCOPE_DEVICE;
209
210 device->svm_allocation_priority = 1;
211 /* OpenCL 2.0 properties */
212 device->svm_caps = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER
213 | CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS;
214
215 /* hwloc probes OpenCL device info at its initialization in case
216 the OpenCL extension is enabled. This causes to printout
217 an unimplemented property error because hwloc is used to
218 initialize global_mem_size which it is not yet. Just put
219 a nonzero there for now. */
220 device->global_mem_size = 1;
221 err = pocl_topology_detect_device_info(device);
222 if (err)
223 ret = CL_INVALID_DEVICE;
224
225 POCL_INIT_LOCK (d->cq_lock);
226
227 assert (device->printf_buffer_size > 0);
228 d->printf_buffer = pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT,
229 device->printf_buffer_size);
230 assert (d->printf_buffer != NULL);
231
232 pocl_cpuinfo_detect_device_info(device);
233 pocl_set_buffer_image_limits(device);
234
235 if (device->vendor_id == 0)
236 device->vendor_id = CL_KHRONOS_VENDOR_ID_POCL;
237
238 /* The basic driver represents only one "compute unit" as
239 it doesn't exploit multiple hardware threads. Multiple
240 basic devices can be still used for task level parallelism
241 using multiple OpenCL devices. */
242 device->max_compute_units = 1;
243
244 return ret;
245 }
246
247
248 cl_int
pocl_basic_alloc_mem_obj(cl_device_id device,cl_mem mem,void * host_ptr)249 pocl_basic_alloc_mem_obj (cl_device_id device, cl_mem mem, void* host_ptr)
250 {
251 pocl_mem_identifier *p = &mem->device_ptrs[device->global_mem_id];
252
253 /* let other drivers preallocate */
254 if ((mem->flags & CL_MEM_ALLOC_HOST_PTR) && (mem->mem_host_ptr == NULL))
255 return CL_MEM_OBJECT_ALLOCATION_FAILURE;
256
257 /* malloc mem_host_ptr then increase refcount */
258 pocl_alloc_or_retain_mem_host_ptr (mem);
259
260 cl_device_id svm_dev = mem->context->svm_allocdev;
261 /* if we have a device which shares global memory with host,
262 * and it needs to do anything to make allocations accessible
263 * to itself, do it here */
264 if (svm_dev && svm_dev->global_mem_id == 0 && svm_dev->ops->svm_register)
265 svm_dev->ops->svm_register (svm_dev, mem->mem_host_ptr, mem->size);
266
267 p->version = mem->mem_host_ptr_version;
268 p->mem_ptr = mem->mem_host_ptr;
269
270 POCL_MSG_PRINT_MEMORY ("Basic device ALLOC %p / size %zu \n", p->mem_ptr,
271 mem->size);
272
273 return CL_SUCCESS;
274 }
275
276
277 void
pocl_basic_free(cl_device_id device,cl_mem mem)278 pocl_basic_free (cl_device_id device, cl_mem mem)
279 {
280 cl_device_id svm_dev = mem->context->svm_allocdev;
281 if (svm_dev && svm_dev->global_mem_id == 0 && svm_dev->ops->svm_unregister)
282 svm_dev->ops->svm_unregister (svm_dev, mem->mem_host_ptr, mem->size);
283
284 pocl_mem_identifier *p = &mem->device_ptrs[device->global_mem_id];
285 pocl_release_mem_host_ptr (mem);
286 p->mem_ptr = NULL;
287 p->version = 0;
288 }
289
290 void
pocl_basic_run(void * data,_cl_command_node * cmd)291 pocl_basic_run (void *data, _cl_command_node *cmd)
292 {
293 struct data *d;
294 struct pocl_argument *al;
295 size_t x, y, z;
296 unsigned i;
297 cl_kernel kernel = cmd->command.run.kernel;
298 pocl_kernel_metadata_t *meta = kernel->meta;
299 struct pocl_context *pc = &cmd->command.run.pc;
300
301 assert (data != NULL);
302 d = (struct data *) data;
303
304 d->current_kernel = kernel;
305
306 void **arguments = (void **)malloc (sizeof (void *)
307 * (meta->num_args + meta->num_locals));
308
309 /* Process the kernel arguments. Convert the opaque buffer
310 pointers to real device pointers, allocate dynamic local
311 memory buffers, etc. */
312 for (i = 0; i < meta->num_args; ++i)
313 {
314 al = &(cmd->command.run.arguments[i]);
315 if (ARG_IS_LOCAL (meta->arg_info[i]))
316 {
317 if (cmd->device->device_alloca_locals)
318 {
319 /* Local buffers are allocated in the device side work-group
320 launcher. Let's pass only the sizes of the local args in
321 the arg buffer. */
322 assert (sizeof (size_t) == sizeof (void *));
323 arguments[i] = (void *)al->size;
324 }
325 else
326 {
327 arguments[i] = malloc (sizeof (void *));
328 *(void **)(arguments[i]) =
329 pocl_aligned_malloc(MAX_EXTENDED_ALIGNMENT, al->size);
330 }
331 }
332 else if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
333 {
334 /* It's legal to pass a NULL pointer to clSetKernelArguments. In
335 that case we must pass the same NULL forward to the kernel.
336 Otherwise, the user must have created a buffer with per device
337 pointers stored in the cl_mem. */
338 arguments[i] = malloc (sizeof (void *));
339 if (al->value == NULL)
340 {
341 *(void **)arguments[i] = NULL;
342 }
343 else
344 {
345 void *ptr = NULL;
346 if (al->is_svm)
347 {
348 ptr = *(void **)al->value;
349 }
350 else
351 {
352 cl_mem m = (*(cl_mem *)(al->value));
353 ptr = m->device_ptrs[cmd->device->global_mem_id].mem_ptr;
354 }
355 *(void **)arguments[i] = (char *)ptr + al->offset;
356 }
357 }
358 else if (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
359 {
360 dev_image_t di;
361 pocl_fill_dev_image_t (&di, al, cmd->device);
362
363 void *devptr = pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT,
364 sizeof (dev_image_t));
365 arguments[i] = malloc (sizeof (void *));
366 *(void **)(arguments[i]) = devptr;
367 memcpy (devptr, &di, sizeof (dev_image_t));
368 }
369 else if (meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
370 {
371 dev_sampler_t ds;
372 pocl_fill_dev_sampler_t (&ds, al);
373 arguments[i] = malloc (sizeof (void *));
374 *(void **)(arguments[i]) = (void *)ds;
375 }
376 else
377 {
378 arguments[i] = al->value;
379 }
380 }
381
382 if (!cmd->device->device_alloca_locals)
383 for (i = 0; i < meta->num_locals; ++i)
384 {
385 size_t s = meta->local_sizes[i];
386 size_t j = meta->num_args + i;
387 arguments[j] = malloc (sizeof (void *));
388 void *pp = pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT, s);
389 *(void **)(arguments[j]) = pp;
390 }
391
392 pc->printf_buffer = d->printf_buffer;
393 assert (pc->printf_buffer != NULL);
394 pc->printf_buffer_capacity = cmd->device->printf_buffer_size;
395 assert (pc->printf_buffer_capacity > 0);
396 uint32_t position = 0;
397 pc->printf_buffer_position = &position;
398
399 unsigned rm = pocl_save_rm ();
400 pocl_set_default_rm ();
401 unsigned ftz = pocl_save_ftz ();
402 pocl_set_ftz (kernel->program->flush_denorms);
403
404 for (z = 0; z < pc->num_groups[2]; ++z)
405 for (y = 0; y < pc->num_groups[1]; ++y)
406 for (x = 0; x < pc->num_groups[0]; ++x)
407 ((pocl_workgroup_func) cmd->command.run.wg)
408 ((uint8_t *)arguments, (uint8_t *)pc, x, y, z);
409
410 pocl_restore_rm (rm);
411 pocl_restore_ftz (ftz);
412
413 if (position > 0)
414 {
415 write (STDOUT_FILENO, pc->printf_buffer, position);
416 position = 0;
417 }
418
419 for (i = 0; i < meta->num_args; ++i)
420 {
421 if (ARG_IS_LOCAL (meta->arg_info[i]))
422 {
423 if (!cmd->device->device_alloca_locals)
424 {
425 POCL_MEM_FREE(*(void **)(arguments[i]));
426 POCL_MEM_FREE(arguments[i]);
427 }
428 else
429 {
430 /* Device side local space allocation has deallocation via stack
431 unwind. */
432 }
433 }
434 else if (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE
435 || meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
436 {
437 if (meta->arg_info[i].type != POCL_ARG_TYPE_SAMPLER)
438 POCL_MEM_FREE (*(void **)(arguments[i]));
439 POCL_MEM_FREE(arguments[i]);
440 }
441 else if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
442 {
443 POCL_MEM_FREE(arguments[i]);
444 }
445 }
446
447 if (!cmd->device->device_alloca_locals)
448 for (i = 0; i < meta->num_locals; ++i)
449 {
450 POCL_MEM_FREE (*(void **)(arguments[meta->num_args + i]));
451 POCL_MEM_FREE (arguments[meta->num_args + i]);
452 }
453 free(arguments);
454
455 pocl_release_dlhandle_cache (cmd);
456 }
457
458 void
pocl_basic_run_native(void * data,_cl_command_node * cmd)459 pocl_basic_run_native (void *data, _cl_command_node *cmd)
460 {
461 cl_event ev = cmd->event;
462 cl_device_id dev = cmd->device;
463 size_t i;
464 for (i = 0; i < ev->num_buffers; i++)
465 {
466 void *arg_loc = cmd->command.native.arg_locs[i];
467 void *buf = ev->mem_objs[i]->device_ptrs[dev->global_mem_id].mem_ptr;
468 if (dev->address_bits == 32)
469 *((uint32_t *)arg_loc) = (uint32_t) (((uintptr_t)buf) & 0xFFFFFFFF);
470 else
471 *((uint64_t *)arg_loc) = (uint64_t) (uintptr_t)buf;
472 }
473
474 cmd->command.native.user_func(cmd->command.native.args);
475
476 POCL_MEM_FREE (cmd->command.native.arg_locs);
477 }
478
479 cl_int
pocl_basic_uninit(unsigned j,cl_device_id device)480 pocl_basic_uninit (unsigned j, cl_device_id device)
481 {
482 struct data *d = (struct data*)device->data;
483 POCL_DESTROY_LOCK (d->cq_lock);
484 pocl_aligned_free (d->printf_buffer);
485 POCL_MEM_FREE(d);
486 device->data = NULL;
487 return CL_SUCCESS;
488 }
489
490 cl_int
pocl_basic_reinit(unsigned j,cl_device_id device)491 pocl_basic_reinit (unsigned j, cl_device_id device)
492 {
493 struct data *d = (struct data *)calloc (1, sizeof (struct data));
494 if (d == NULL)
495 return CL_OUT_OF_HOST_MEMORY;
496
497 d->current_kernel = NULL;
498
499 assert (device->printf_buffer_size > 0);
500 d->printf_buffer = pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT,
501 device->printf_buffer_size);
502 assert (d->printf_buffer != NULL);
503
504 POCL_INIT_LOCK (d->cq_lock);
505 device->data = d;
506 return CL_SUCCESS;
507 }
508
509
basic_command_scheduler(struct data * d)510 static void basic_command_scheduler (struct data *d)
511 {
512 _cl_command_node *node;
513
514 /* execute commands from ready list */
515 while ((node = d->ready_list))
516 {
517 assert (pocl_command_is_ready(node->event));
518 assert (node->event->status == CL_SUBMITTED);
519 CDL_DELETE (d->ready_list, node);
520 POCL_UNLOCK (d->cq_lock);
521 pocl_exec_command (node);
522 POCL_LOCK (d->cq_lock);
523 }
524
525 return;
526 }
527
528 void
pocl_basic_submit(_cl_command_node * node,cl_command_queue cq)529 pocl_basic_submit (_cl_command_node *node, cl_command_queue cq)
530 {
531 struct data *d = node->device->data;
532
533 if (node != NULL && node->type == CL_COMMAND_NDRANGE_KERNEL)
534 pocl_check_kernel_dlhandle_cache (node, 1, 1);
535
536 node->ready = 1;
537 POCL_LOCK (d->cq_lock);
538 pocl_command_push(node, &d->ready_list, &d->command_list);
539
540 POCL_UNLOCK_OBJ (node->event);
541 basic_command_scheduler (d);
542 POCL_UNLOCK (d->cq_lock);
543
544 return;
545 }
546
/* Runs all commands currently on DEVICE's ready list while holding the
   device's command list lock.  CQ is not used.  */
void
pocl_basic_flush (cl_device_id device, cl_command_queue cq)
{
  struct data *dev_data = (struct data *)device->data;

  POCL_LOCK (dev_data->cq_lock);
  basic_command_scheduler (dev_data);
  POCL_UNLOCK (dev_data->cq_lock);
}
555
556 void
pocl_basic_join(cl_device_id device,cl_command_queue cq)557 pocl_basic_join (cl_device_id device, cl_command_queue cq)
558 {
559 struct data *d = (struct data*)device->data;
560
561 POCL_LOCK (d->cq_lock);
562 basic_command_scheduler (d);
563 POCL_UNLOCK (d->cq_lock);
564
565 return;
566 }
567
568 void
pocl_basic_notify(cl_device_id device,cl_event event,cl_event finished)569 pocl_basic_notify (cl_device_id device, cl_event event, cl_event finished)
570 {
571 struct data *d = (struct data*)device->data;
572 _cl_command_node * volatile node = event->command;
573
574 if (finished->status < CL_COMPLETE)
575 {
576 pocl_update_event_failed (event);
577 return;
578 }
579
580 if (!node->ready)
581 return;
582
583 if (pocl_command_is_ready (event))
584 {
585 if (event->status == CL_QUEUED)
586 {
587 pocl_update_event_submitted (event);
588 POCL_LOCK (d->cq_lock);
589 CDL_DELETE (d->command_list, node);
590 CDL_PREPEND (d->ready_list, node);
591 basic_command_scheduler (d);
592 POCL_UNLOCK (d->cq_lock);
593 }
594 return;
595 }
596 }
597
598 void
pocl_basic_compile_kernel(_cl_command_node * cmd,cl_kernel kernel,cl_device_id device,int specialize)599 pocl_basic_compile_kernel (_cl_command_node *cmd, cl_kernel kernel,
600 cl_device_id device, int specialize)
601 {
602 if (cmd != NULL && cmd->type == CL_COMMAND_NDRANGE_KERNEL)
603 pocl_check_kernel_dlhandle_cache (cmd, 0, specialize);
604 }
605
606 /*********************** IMAGES ********************************/
607
pocl_basic_copy_image_rect(void * data,cl_mem src_image,cl_mem dst_image,pocl_mem_identifier * src_mem_id,pocl_mem_identifier * dst_mem_id,const size_t * src_origin,const size_t * dst_origin,const size_t * region)608 cl_int pocl_basic_copy_image_rect( void *data,
609 cl_mem src_image,
610 cl_mem dst_image,
611 pocl_mem_identifier *src_mem_id,
612 pocl_mem_identifier *dst_mem_id,
613 const size_t *src_origin,
614 const size_t *dst_origin,
615 const size_t *region)
616 {
617
618 size_t px = src_image->image_elem_size * src_image->image_channels;
619 const size_t adj_src_origin[3]
620 = { src_origin[0] * px, src_origin[1], src_origin[2] };
621 const size_t adj_dst_origin[3]
622 = { dst_origin[0] * px, dst_origin[1], dst_origin[2] };
623 const size_t adj_region[3] = { region[0] * px, region[1], region[2] };
624
625 POCL_MSG_PRINT_MEMORY (
626 " BASIC COPY IMAGE RECT \n"
627 "dst_image %p dst_mem_id %p \n"
628 "src_image %p src_mem_id %p \n"
629 "dst_origin [0,1,2] %zu %zu %zu \n"
630 "src_origin [0,1,2] %zu %zu %zu \n"
631 "region [0,1,2] %zu %zu %zu \n"
632 "px %zu\n",
633 dst_image, dst_mem_id,
634 src_image, src_mem_id,
635 dst_origin[0], dst_origin[1], dst_origin[2],
636 src_origin[0], src_origin[1], src_origin[2],
637 region[0], region[1], region[2],
638 px);
639
640 pocl_driver_copy_rect (
641 data, dst_mem_id, NULL, src_mem_id, NULL, adj_dst_origin, adj_src_origin,
642 adj_region, dst_image->image_row_pitch, dst_image->image_slice_pitch,
643 src_image->image_row_pitch, src_image->image_slice_pitch);
644
645 return CL_SUCCESS;
646 }
647
648 /* copies a region from host or device buffer to device image */
pocl_basic_write_image_rect(void * data,cl_mem dst_image,pocl_mem_identifier * dst_mem_id,const void * __restrict__ src_host_ptr,pocl_mem_identifier * src_mem_id,const size_t * origin,const size_t * region,size_t src_row_pitch,size_t src_slice_pitch,size_t src_offset)649 cl_int pocl_basic_write_image_rect ( void *data,
650 cl_mem dst_image,
651 pocl_mem_identifier *dst_mem_id,
652 const void *__restrict__ src_host_ptr,
653 pocl_mem_identifier *src_mem_id,
654 const size_t *origin,
655 const size_t *region,
656 size_t src_row_pitch,
657 size_t src_slice_pitch,
658 size_t src_offset)
659 {
660 POCL_MSG_PRINT_MEMORY (
661 "BASIC WRITE IMAGE RECT \n"
662 "dst_image %p dst_mem_id %p \n"
663 "src_hostptr %p src_mem_id %p \n"
664 "origin [0,1,2] %zu %zu %zu \n"
665 "region [0,1,2] %zu %zu %zu \n"
666 "row %zu slice %zu offset %zu \n",
667 dst_image, dst_mem_id,
668 src_host_ptr, src_mem_id,
669 origin[0], origin[1], origin[2],
670 region[0], region[1], region[2],
671 src_row_pitch, src_slice_pitch, src_offset);
672
673 const void *__restrict__ ptr
674 = src_host_ptr ? src_host_ptr : src_mem_id->mem_ptr;
675 ptr += src_offset;
676 const size_t zero_origin[3] = { 0 };
677 size_t px = dst_image->image_elem_size * dst_image->image_channels;
678 if (src_row_pitch == 0)
679 src_row_pitch = px * region[0];
680 if (src_slice_pitch == 0)
681 src_slice_pitch = src_row_pitch * region[1];
682
683 const size_t adj_origin[3] = { origin[0] * px, origin[1], origin[2] };
684 const size_t adj_region[3] = { region[0] * px, region[1], region[2] };
685
686 pocl_driver_write_rect (data, ptr, dst_mem_id, NULL, adj_origin, zero_origin,
687 adj_region, dst_image->image_row_pitch,
688 dst_image->image_slice_pitch, src_row_pitch,
689 src_slice_pitch);
690 return CL_SUCCESS;
691 }
692
693 /* copies a region from device image to host or device buffer */
pocl_basic_read_image_rect(void * data,cl_mem src_image,pocl_mem_identifier * src_mem_id,void * __restrict__ dst_host_ptr,pocl_mem_identifier * dst_mem_id,const size_t * origin,const size_t * region,size_t dst_row_pitch,size_t dst_slice_pitch,size_t dst_offset)694 cl_int pocl_basic_read_image_rect( void *data,
695 cl_mem src_image,
696 pocl_mem_identifier *src_mem_id,
697 void *__restrict__ dst_host_ptr,
698 pocl_mem_identifier *dst_mem_id,
699 const size_t *origin,
700 const size_t *region,
701 size_t dst_row_pitch,
702 size_t dst_slice_pitch,
703 size_t dst_offset)
704 {
705 POCL_MSG_PRINT_MEMORY (
706 "BASIC READ IMAGE RECT \n"
707 "src_image %p src_mem_id %p \n"
708 "dst_hostptr %p dst_mem_id %p \n"
709 "origin [0,1,2] %zu %zu %zu \n"
710 "region [0,1,2] %zu %zu %zu \n"
711 "row %zu slice %zu offset %zu \n",
712 src_image, src_mem_id,
713 dst_host_ptr, dst_mem_id,
714 origin[0], origin[1], origin[2],
715 region[0], region[1], region[2],
716 dst_row_pitch, dst_slice_pitch, dst_offset);
717
718 void *__restrict__ ptr = dst_host_ptr ? dst_host_ptr : dst_mem_id->mem_ptr;
719 ptr += dst_offset;
720 const size_t zero_origin[3] = { 0 };
721 size_t px = src_image->image_elem_size * src_image->image_channels;
722 if (dst_row_pitch == 0)
723 dst_row_pitch = px * region[0];
724 if (dst_slice_pitch == 0)
725 dst_slice_pitch = dst_row_pitch * region[1];
726 const size_t adj_origin[3] = { origin[0] * px, origin[1], origin[2] };
727 const size_t adj_region[3] = { region[0] * px, region[1], region[2] };
728
729 pocl_driver_read_rect (data, ptr, src_mem_id, NULL, adj_origin, zero_origin,
730 adj_region, src_image->image_row_pitch,
731 src_image->image_slice_pitch, dst_row_pitch,
732 dst_slice_pitch);
733 return CL_SUCCESS;
734 }
735
736
pocl_basic_map_image(void * data,pocl_mem_identifier * mem_id,cl_mem src_image,mem_mapping_t * map)737 cl_int pocl_basic_map_image (void *data,
738 pocl_mem_identifier *mem_id,
739 cl_mem src_image,
740 mem_mapping_t *map)
741 {
742 assert (map->host_ptr != NULL);
743
744 if (map->map_flags & CL_MAP_WRITE_INVALIDATE_REGION)
745 return CL_SUCCESS;
746
747 if (map->host_ptr != ((char *)mem_id->mem_ptr + map->offset))
748 {
749 pocl_basic_read_image_rect (data, src_image, mem_id, map->host_ptr,
750 NULL, map->origin, map->region,
751 map->row_pitch, map->slice_pitch, 0);
752 }
753 return CL_SUCCESS;
754 }
755
pocl_basic_unmap_image(void * data,pocl_mem_identifier * mem_id,cl_mem dst_image,mem_mapping_t * map)756 cl_int pocl_basic_unmap_image(void *data,
757 pocl_mem_identifier *mem_id,
758 cl_mem dst_image,
759 mem_mapping_t *map)
760 {
761 if (map->map_flags == CL_MAP_READ)
762 return CL_SUCCESS;
763
764 if (map->host_ptr != ((char *)mem_id->mem_ptr + map->offset))
765 {
766 pocl_basic_write_image_rect (data, dst_image, mem_id, map->host_ptr,
767 NULL, map->origin, map->region,
768 map->row_pitch, map->slice_pitch, 0);
769 }
770 return CL_SUCCESS;
771 }
772
773 cl_int
pocl_basic_fill_image(void * data,cl_mem image,pocl_mem_identifier * image_data,const size_t * origin,const size_t * region,cl_uint4 orig_pixel,pixel_t fill_pixel,size_t pixel_size)774 pocl_basic_fill_image (void *data, cl_mem image,
775 pocl_mem_identifier *image_data, const size_t *origin,
776 const size_t *region, cl_uint4 orig_pixel,
777 pixel_t fill_pixel, size_t pixel_size)
778 {
779 POCL_MSG_PRINT_MEMORY ("BASIC / FILL IMAGE \n"
780 "image %p data %p \n"
781 "origin [0,1,2] %zu %zu %zu \n"
782 "region [0,1,2] %zu %zu %zu \n"
783 "pixel %p size %zu \n",
784 image, image_data,
785 origin[0], origin[1], origin[2],
786 region[0], region[1], region[2],
787 fill_pixel, pixel_size);
788
789 size_t row_pitch = image->image_row_pitch;
790 size_t slice_pitch = image->image_slice_pitch;
791 char *__restrict const adjusted_device_ptr
792 = (char *)image_data->mem_ptr
793 + origin[0] * pixel_size
794 + row_pitch * origin[1]
795 + slice_pitch * origin[2];
796
797 size_t i, j, k;
798
799 for (k = 0; k < region[2]; ++k)
800 for (j = 0; j < region[1]; ++j)
801 for (i = 0; i < region[0]; ++i)
802 memcpy (adjusted_device_ptr
803 + pixel_size * i
804 + row_pitch * j
805 + slice_pitch * k,
806 fill_pixel,
807 pixel_size);
808 return CL_SUCCESS;
809 }
810
811 /***************************************************************************/
812 void
pocl_basic_svm_free(cl_device_id dev,void * svm_ptr)813 pocl_basic_svm_free (cl_device_id dev, void *svm_ptr)
814 {
815 /* TODO we should somehow figure out the size argument
816 * and call pocl_free_global_mem */
817 pocl_aligned_free (svm_ptr);
818 }
819
820 void *
pocl_basic_svm_alloc(cl_device_id dev,cl_svm_mem_flags flags,size_t size)821 pocl_basic_svm_alloc (cl_device_id dev, cl_svm_mem_flags flags, size_t size)
822
823 {
824 return pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT, size);
825 }
826
827 void
pocl_basic_svm_copy(cl_device_id dev,void * __restrict__ dst,const void * __restrict__ src,size_t size)828 pocl_basic_svm_copy (cl_device_id dev, void *__restrict__ dst,
829 const void *__restrict__ src, size_t size)
830 {
831 memcpy (dst, src, size);
832 }
833
834 void
pocl_basic_svm_fill(cl_device_id dev,void * __restrict__ svm_ptr,size_t size,void * __restrict__ pattern,size_t pattern_size)835 pocl_basic_svm_fill (cl_device_id dev, void *__restrict__ svm_ptr, size_t size,
836 void *__restrict__ pattern, size_t pattern_size)
837 {
838 pocl_mem_identifier temp;
839 temp.mem_ptr = svm_ptr;
840 pocl_driver_memfill (dev->data, &temp, NULL, size, 0, pattern, pattern_size);
841 }
842