1 /* basic.c - a minimalistic single core pocl device driver layer implementation
2 
3    Copyright (c) 2011-2013 Universidad Rey Juan Carlos and
4                  2011-2020 Pekka Jääskeläinen
5 
6    Permission is hereby granted, free of charge, to any person obtaining a copy
7    of this software and associated documentation files (the "Software"), to
8    deal in the Software without restriction, including without limitation the
9    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10    sell copies of the Software, and to permit persons to whom the Software is
11    furnished to do so, subject to the following conditions:
12 
13    The above copyright notice and this permission notice shall be included in
14    all copies or substantial portions of the Software.
15 
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21    FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22    IN THE SOFTWARE.
23 */
24 
25 #include "basic.h"
26 #include "common.h"
27 #include "config.h"
28 #include "config2.h"
29 #include "cpuinfo.h"
30 #include "devices.h"
31 #include "pocl_local_size.h"
32 #include "pocl_util.h"
33 #include "topology/pocl_topology.h"
34 #include "utlist.h"
35 
36 #include <assert.h>
37 #include <limits.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <utlist.h>
42 
43 #include "pocl_cache.h"
44 #include "pocl_file_util.h"
45 #include "pocl_mem_management.h"
46 #include "pocl_timing.h"
47 #include "pocl_workgroup_func.h"
48 
49 #include "common_driver.h"
50 
51 #ifdef ENABLE_LLVM
52 #include "pocl_llvm.h"
53 #endif
54 
55 struct data {
56   /* List of commands ready to be executed */
57   _cl_command_node *ready_list;
58   /* List of commands not yet ready to be executed */
59   _cl_command_node *command_list;
60   /* Lock for command list related operations */
61   pocl_lock_t cq_lock;
62 
63   /* Currently loaded kernel. */
64   cl_kernel current_kernel;
65 
66   /* printf buffer */
67   void *printf_buffer;
68 };
69 
70 void
pocl_basic_init_device_ops(struct pocl_device_ops * ops)71 pocl_basic_init_device_ops(struct pocl_device_ops *ops)
72 {
73   ops->device_name = "basic";
74 
75   ops->probe = pocl_basic_probe;
76   ops->uninit = pocl_basic_uninit;
77   ops->reinit = pocl_basic_reinit;
78   ops->init = pocl_basic_init;
79 
80   ops->alloc_mem_obj = pocl_basic_alloc_mem_obj;
81   ops->free = pocl_basic_free;
82 
83   ops->read = pocl_driver_read;
84   ops->read_rect = pocl_driver_read_rect;
85   ops->write = pocl_driver_write;
86   ops->write_rect = pocl_driver_write_rect;
87   ops->copy = pocl_driver_copy;
88   ops->copy_with_size = pocl_driver_copy_with_size;
89   ops->copy_rect = pocl_driver_copy_rect;
90   ops->memfill = pocl_driver_memfill;
91   ops->map_mem = pocl_driver_map_mem;
92   ops->unmap_mem = pocl_driver_unmap_mem;
93   ops->get_mapping_ptr = pocl_driver_get_mapping_ptr;
94   ops->free_mapping_ptr = pocl_driver_free_mapping_ptr;
95 
96   ops->can_migrate_d2d = NULL;
97   ops->migrate_d2d = NULL;
98 
99   ops->run = pocl_basic_run;
100   ops->run_native = pocl_basic_run_native;
101 
102   ops->build_source = pocl_driver_build_source;
103   ops->link_program = pocl_driver_link_program;
104   ops->build_binary = pocl_driver_build_binary;
105   ops->free_program = pocl_driver_free_program;
106   ops->setup_metadata = pocl_driver_setup_metadata;
107   ops->supports_binary = pocl_driver_supports_binary;
108   ops->build_poclbinary = pocl_driver_build_poclbinary;
109   ops->compile_kernel = pocl_basic_compile_kernel;
110 
111   ops->join = pocl_basic_join;
112   ops->submit = pocl_basic_submit;
113   ops->broadcast = pocl_broadcast;
114   ops->notify = pocl_basic_notify;
115   ops->flush = pocl_basic_flush;
116   ops->build_hash = pocl_basic_build_hash;
117   ops->compute_local_size = pocl_default_local_size_optimizer;
118 
119   ops->get_device_info_ext = NULL;
120 
121   ops->svm_free = pocl_basic_svm_free;
122   ops->svm_alloc = pocl_basic_svm_alloc;
123   /* no need to implement these two as they're noop
124    * and pocl_exec_command takes care of it */
125   ops->svm_map = NULL;
126   ops->svm_unmap = NULL;
127   ops->svm_copy = pocl_basic_svm_copy;
128   ops->svm_fill = pocl_basic_svm_fill;
129 
130   ops->create_kernel = NULL;
131   ops->free_kernel = NULL;
132   ops->create_sampler = NULL;
133   ops->free_sampler = NULL;
134   ops->copy_image_rect = pocl_basic_copy_image_rect;
135   ops->write_image_rect = pocl_basic_write_image_rect;
136   ops->read_image_rect = pocl_basic_read_image_rect;
137   ops->map_image = pocl_basic_map_image;
138   ops->unmap_image = pocl_basic_unmap_image;
139   ops->fill_image = pocl_basic_fill_image;
140 }
141 
142 char *
pocl_basic_build_hash(cl_device_id device)143 pocl_basic_build_hash (cl_device_id device)
144 {
145   char* res = calloc(1000, sizeof(char));
146 #ifdef KERNELLIB_HOST_DISTRO_VARIANTS
147   char *name = get_llvm_cpu_name ();
148   snprintf (res, 1000, "basic-%s-%s", HOST_DEVICE_BUILD_HASH, name);
149   POCL_MEM_FREE (name);
150 #else
151   snprintf (res, 1000, "basic-%s", HOST_DEVICE_BUILD_HASH);
152 #endif
153   return res;
154 }
155 
156 unsigned int
pocl_basic_probe(struct pocl_device_ops * ops)157 pocl_basic_probe(struct pocl_device_ops *ops)
158 {
159   int env_count = pocl_device_get_env_count(ops->device_name);
160 
161   /* No env specified, so pthread will be used instead of basic */
162   if(env_count < 0)
163     return 0;
164 
165   return env_count;
166 }
167 
168 cl_int
pocl_basic_init(unsigned j,cl_device_id device,const char * parameters)169 pocl_basic_init (unsigned j, cl_device_id device, const char* parameters)
170 {
171   struct data *d;
172   cl_int ret = CL_SUCCESS;
173   int err;
174   static int first_basic_init = 1;
175 
176   if (first_basic_init)
177     {
178       POCL_MSG_WARN ("INIT dlcache DOTO delete\n");
179       pocl_init_dlhandle_cache();
180       first_basic_init = 0;
181     }
182 
183   d = (struct data *) calloc (1, sizeof (struct data));
184   if (d == NULL)
185     return CL_OUT_OF_HOST_MEMORY;
186 
187   d->current_kernel = NULL;
188   device->data = d;
189 
190   pocl_init_default_device_infos (device);
191   /* 0 is the host memory shared with all drivers that use it */
192   device->global_mem_id = 0;
193   device->extensions = HOST_DEVICE_EXTENSIONS;
194 
195   /* full memory consistency model for atomic memory and fence operations
196   except CL_DEVICE_ATOMIC_SCOPE_ALL_DEVICES. see
197   https://www.khronos.org/registry/OpenCL/specs/3.0-unified/html/OpenCL_API.html#opencl-3.0-backwards-compatibility*/
198   device->atomic_memory_capabilities = CL_DEVICE_ATOMIC_ORDER_RELAXED
199                                        | CL_DEVICE_ATOMIC_ORDER_ACQ_REL
200                                        | CL_DEVICE_ATOMIC_ORDER_SEQ_CST
201                                        | CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP
202                                        | CL_DEVICE_ATOMIC_SCOPE_DEVICE;
203   device->atomic_fence_capabilities = CL_DEVICE_ATOMIC_ORDER_RELAXED
204                                       | CL_DEVICE_ATOMIC_ORDER_ACQ_REL
205                                       | CL_DEVICE_ATOMIC_ORDER_SEQ_CST
206                                       | CL_DEVICE_ATOMIC_SCOPE_WORK_ITEM
207                                       | CL_DEVICE_ATOMIC_SCOPE_WORK_GROUP
208                                       | CL_DEVICE_ATOMIC_SCOPE_DEVICE;
209 
210   device->svm_allocation_priority = 1;
211   /* OpenCL 2.0 properties */
212   device->svm_caps = CL_DEVICE_SVM_COARSE_GRAIN_BUFFER
213                      | CL_DEVICE_SVM_FINE_GRAIN_BUFFER | CL_DEVICE_SVM_ATOMICS;
214 
215   /* hwloc probes OpenCL device info at its initialization in case
216      the OpenCL extension is enabled. This causes to printout
217      an unimplemented property error because hwloc is used to
218      initialize global_mem_size which it is not yet. Just put
219      a nonzero there for now. */
220   device->global_mem_size = 1;
221   err = pocl_topology_detect_device_info(device);
222   if (err)
223     ret = CL_INVALID_DEVICE;
224 
225   POCL_INIT_LOCK (d->cq_lock);
226 
227   assert (device->printf_buffer_size > 0);
228   d->printf_buffer = pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT,
229                                           device->printf_buffer_size);
230   assert (d->printf_buffer != NULL);
231 
232   pocl_cpuinfo_detect_device_info(device);
233   pocl_set_buffer_image_limits(device);
234 
235   if (device->vendor_id == 0)
236     device->vendor_id = CL_KHRONOS_VENDOR_ID_POCL;
237 
238   /* The basic driver represents only one "compute unit" as
239      it doesn't exploit multiple hardware threads. Multiple
240      basic devices can be still used for task level parallelism
241      using multiple OpenCL devices. */
242   device->max_compute_units = 1;
243 
244   return ret;
245 }
246 
247 
248 cl_int
pocl_basic_alloc_mem_obj(cl_device_id device,cl_mem mem,void * host_ptr)249 pocl_basic_alloc_mem_obj (cl_device_id device, cl_mem mem, void* host_ptr)
250 {
251   pocl_mem_identifier *p = &mem->device_ptrs[device->global_mem_id];
252 
253   /* let other drivers preallocate */
254   if ((mem->flags & CL_MEM_ALLOC_HOST_PTR) && (mem->mem_host_ptr == NULL))
255     return CL_MEM_OBJECT_ALLOCATION_FAILURE;
256 
257   /* malloc mem_host_ptr then increase refcount */
258   pocl_alloc_or_retain_mem_host_ptr (mem);
259 
260   cl_device_id svm_dev = mem->context->svm_allocdev;
261   /* if we have a device which shares global memory with host,
262    * and it needs to do anything to make allocations accessible
263    * to itself, do it here */
264   if (svm_dev && svm_dev->global_mem_id == 0 && svm_dev->ops->svm_register)
265     svm_dev->ops->svm_register (svm_dev, mem->mem_host_ptr, mem->size);
266 
267   p->version = mem->mem_host_ptr_version;
268   p->mem_ptr = mem->mem_host_ptr;
269 
270   POCL_MSG_PRINT_MEMORY ("Basic device ALLOC %p / size %zu \n", p->mem_ptr,
271                          mem->size);
272 
273   return CL_SUCCESS;
274 }
275 
276 
277 void
pocl_basic_free(cl_device_id device,cl_mem mem)278 pocl_basic_free (cl_device_id device, cl_mem mem)
279 {
280   cl_device_id svm_dev = mem->context->svm_allocdev;
281   if (svm_dev && svm_dev->global_mem_id == 0 && svm_dev->ops->svm_unregister)
282     svm_dev->ops->svm_unregister (svm_dev, mem->mem_host_ptr, mem->size);
283 
284   pocl_mem_identifier *p = &mem->device_ptrs[device->global_mem_id];
285   pocl_release_mem_host_ptr (mem);
286   p->mem_ptr = NULL;
287   p->version = 0;
288 }
289 
290 void
pocl_basic_run(void * data,_cl_command_node * cmd)291 pocl_basic_run (void *data, _cl_command_node *cmd)
292 {
293   struct data *d;
294   struct pocl_argument *al;
295   size_t x, y, z;
296   unsigned i;
297   cl_kernel kernel = cmd->command.run.kernel;
298   pocl_kernel_metadata_t *meta = kernel->meta;
299   struct pocl_context *pc = &cmd->command.run.pc;
300 
301   assert (data != NULL);
302   d = (struct data *) data;
303 
304   d->current_kernel = kernel;
305 
306   void **arguments = (void **)malloc (sizeof (void *)
307                                       * (meta->num_args + meta->num_locals));
308 
309   /* Process the kernel arguments. Convert the opaque buffer
310      pointers to real device pointers, allocate dynamic local
311      memory buffers, etc. */
312   for (i = 0; i < meta->num_args; ++i)
313     {
314       al = &(cmd->command.run.arguments[i]);
315       if (ARG_IS_LOCAL (meta->arg_info[i]))
316         {
317           if (cmd->device->device_alloca_locals)
318             {
319               /* Local buffers are allocated in the device side work-group
320                  launcher. Let's pass only the sizes of the local args in
321                  the arg buffer. */
322               assert (sizeof (size_t) == sizeof (void *));
323               arguments[i] = (void *)al->size;
324             }
325           else
326             {
327               arguments[i] = malloc (sizeof (void *));
328               *(void **)(arguments[i]) =
329                 pocl_aligned_malloc(MAX_EXTENDED_ALIGNMENT, al->size);
330             }
331         }
332       else if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
333         {
334           /* It's legal to pass a NULL pointer to clSetKernelArguments. In
335              that case we must pass the same NULL forward to the kernel.
336              Otherwise, the user must have created a buffer with per device
337              pointers stored in the cl_mem. */
338           arguments[i] = malloc (sizeof (void *));
339           if (al->value == NULL)
340             {
341               *(void **)arguments[i] = NULL;
342             }
343           else
344             {
345               void *ptr = NULL;
346               if (al->is_svm)
347                 {
348                   ptr = *(void **)al->value;
349                 }
350               else
351                 {
352                   cl_mem m = (*(cl_mem *)(al->value));
353                   ptr = m->device_ptrs[cmd->device->global_mem_id].mem_ptr;
354                 }
355               *(void **)arguments[i] = (char *)ptr + al->offset;
356             }
357         }
358       else if (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
359         {
360           dev_image_t di;
361           pocl_fill_dev_image_t (&di, al, cmd->device);
362 
363           void *devptr = pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT,
364                                               sizeof (dev_image_t));
365           arguments[i] = malloc (sizeof (void *));
366           *(void **)(arguments[i]) = devptr;
367           memcpy (devptr, &di, sizeof (dev_image_t));
368         }
369       else if (meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
370         {
371           dev_sampler_t ds;
372           pocl_fill_dev_sampler_t (&ds, al);
373           arguments[i] = malloc (sizeof (void *));
374           *(void **)(arguments[i]) = (void *)ds;
375         }
376       else
377         {
378           arguments[i] = al->value;
379         }
380     }
381 
382   if (!cmd->device->device_alloca_locals)
383     for (i = 0; i < meta->num_locals; ++i)
384       {
385         size_t s = meta->local_sizes[i];
386         size_t j = meta->num_args + i;
387         arguments[j] = malloc (sizeof (void *));
388         void *pp = pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT, s);
389         *(void **)(arguments[j]) = pp;
390       }
391 
392   pc->printf_buffer = d->printf_buffer;
393   assert (pc->printf_buffer != NULL);
394   pc->printf_buffer_capacity = cmd->device->printf_buffer_size;
395   assert (pc->printf_buffer_capacity > 0);
396   uint32_t position = 0;
397   pc->printf_buffer_position = &position;
398 
399   unsigned rm = pocl_save_rm ();
400   pocl_set_default_rm ();
401   unsigned ftz = pocl_save_ftz ();
402   pocl_set_ftz (kernel->program->flush_denorms);
403 
404   for (z = 0; z < pc->num_groups[2]; ++z)
405     for (y = 0; y < pc->num_groups[1]; ++y)
406       for (x = 0; x < pc->num_groups[0]; ++x)
407         ((pocl_workgroup_func) cmd->command.run.wg)
408 	  ((uint8_t *)arguments, (uint8_t *)pc, x, y, z);
409 
410   pocl_restore_rm (rm);
411   pocl_restore_ftz (ftz);
412 
413   if (position > 0)
414     {
415       write (STDOUT_FILENO, pc->printf_buffer, position);
416       position = 0;
417     }
418 
419   for (i = 0; i < meta->num_args; ++i)
420     {
421       if (ARG_IS_LOCAL (meta->arg_info[i]))
422         {
423           if (!cmd->device->device_alloca_locals)
424             {
425               POCL_MEM_FREE(*(void **)(arguments[i]));
426               POCL_MEM_FREE(arguments[i]);
427             }
428           else
429             {
430               /* Device side local space allocation has deallocation via stack
431                  unwind. */
432             }
433         }
434       else if (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE
435                || meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)
436         {
437           if (meta->arg_info[i].type != POCL_ARG_TYPE_SAMPLER)
438             POCL_MEM_FREE (*(void **)(arguments[i]));
439           POCL_MEM_FREE(arguments[i]);
440         }
441       else if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
442         {
443           POCL_MEM_FREE(arguments[i]);
444         }
445     }
446 
447   if (!cmd->device->device_alloca_locals)
448     for (i = 0; i < meta->num_locals; ++i)
449       {
450         POCL_MEM_FREE (*(void **)(arguments[meta->num_args + i]));
451         POCL_MEM_FREE (arguments[meta->num_args + i]);
452       }
453   free(arguments);
454 
455   pocl_release_dlhandle_cache (cmd);
456 }
457 
458 void
pocl_basic_run_native(void * data,_cl_command_node * cmd)459 pocl_basic_run_native (void *data, _cl_command_node *cmd)
460 {
461   cl_event ev = cmd->event;
462   cl_device_id dev = cmd->device;
463   size_t i;
464   for (i = 0; i < ev->num_buffers; i++)
465     {
466       void *arg_loc = cmd->command.native.arg_locs[i];
467       void *buf = ev->mem_objs[i]->device_ptrs[dev->global_mem_id].mem_ptr;
468       if (dev->address_bits == 32)
469         *((uint32_t *)arg_loc) = (uint32_t) (((uintptr_t)buf) & 0xFFFFFFFF);
470       else
471         *((uint64_t *)arg_loc) = (uint64_t) (uintptr_t)buf;
472     }
473 
474   cmd->command.native.user_func(cmd->command.native.args);
475 
476   POCL_MEM_FREE (cmd->command.native.arg_locs);
477 }
478 
479 cl_int
pocl_basic_uninit(unsigned j,cl_device_id device)480 pocl_basic_uninit (unsigned j, cl_device_id device)
481 {
482   struct data *d = (struct data*)device->data;
483   POCL_DESTROY_LOCK (d->cq_lock);
484   pocl_aligned_free (d->printf_buffer);
485   POCL_MEM_FREE(d);
486   device->data = NULL;
487   return CL_SUCCESS;
488 }
489 
490 cl_int
pocl_basic_reinit(unsigned j,cl_device_id device)491 pocl_basic_reinit (unsigned j, cl_device_id device)
492 {
493   struct data *d = (struct data *)calloc (1, sizeof (struct data));
494   if (d == NULL)
495     return CL_OUT_OF_HOST_MEMORY;
496 
497   d->current_kernel = NULL;
498 
499   assert (device->printf_buffer_size > 0);
500   d->printf_buffer = pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT,
501                                           device->printf_buffer_size);
502   assert (d->printf_buffer != NULL);
503 
504   POCL_INIT_LOCK (d->cq_lock);
505   device->data = d;
506   return CL_SUCCESS;
507 }
508 
509 
basic_command_scheduler(struct data * d)510 static void basic_command_scheduler (struct data *d)
511 {
512   _cl_command_node *node;
513 
514   /* execute commands from ready list */
515   while ((node = d->ready_list))
516     {
517       assert (pocl_command_is_ready(node->event));
518       assert (node->event->status == CL_SUBMITTED);
519       CDL_DELETE (d->ready_list, node);
520       POCL_UNLOCK (d->cq_lock);
521       pocl_exec_command (node);
522       POCL_LOCK (d->cq_lock);
523     }
524 
525   return;
526 }
527 
528 void
pocl_basic_submit(_cl_command_node * node,cl_command_queue cq)529 pocl_basic_submit (_cl_command_node *node, cl_command_queue cq)
530 {
531   struct data *d = node->device->data;
532 
533   if (node != NULL && node->type == CL_COMMAND_NDRANGE_KERNEL)
534     pocl_check_kernel_dlhandle_cache (node, 1, 1);
535 
536   node->ready = 1;
537   POCL_LOCK (d->cq_lock);
538   pocl_command_push(node, &d->ready_list, &d->command_list);
539 
540   POCL_UNLOCK_OBJ (node->event);
541   basic_command_scheduler (d);
542   POCL_UNLOCK (d->cq_lock);
543 
544   return;
545 }
546 
/* Executes all commands currently on the ready list of DEVICE while holding
   the command list lock. CQ is not used. */
void
pocl_basic_flush (cl_device_id device, cl_command_queue cq)
{
  struct data *state = (struct data *)device->data;

  POCL_LOCK (state->cq_lock);
  basic_command_scheduler (state);
  POCL_UNLOCK (state->cq_lock);
}
555 
556 void
pocl_basic_join(cl_device_id device,cl_command_queue cq)557 pocl_basic_join (cl_device_id device, cl_command_queue cq)
558 {
559   struct data *d = (struct data*)device->data;
560 
561   POCL_LOCK (d->cq_lock);
562   basic_command_scheduler (d);
563   POCL_UNLOCK (d->cq_lock);
564 
565   return;
566 }
567 
568 void
pocl_basic_notify(cl_device_id device,cl_event event,cl_event finished)569 pocl_basic_notify (cl_device_id device, cl_event event, cl_event finished)
570 {
571   struct data *d = (struct data*)device->data;
572   _cl_command_node * volatile node = event->command;
573 
574   if (finished->status < CL_COMPLETE)
575     {
576       pocl_update_event_failed (event);
577       return;
578     }
579 
580   if (!node->ready)
581     return;
582 
583   if (pocl_command_is_ready (event))
584     {
585       if (event->status == CL_QUEUED)
586         {
587           pocl_update_event_submitted (event);
588           POCL_LOCK (d->cq_lock);
589           CDL_DELETE (d->command_list, node);
590           CDL_PREPEND (d->ready_list, node);
591           basic_command_scheduler (d);
592           POCL_UNLOCK (d->cq_lock);
593         }
594       return;
595     }
596 }
597 
598 void
pocl_basic_compile_kernel(_cl_command_node * cmd,cl_kernel kernel,cl_device_id device,int specialize)599 pocl_basic_compile_kernel (_cl_command_node *cmd, cl_kernel kernel,
600                            cl_device_id device, int specialize)
601 {
602   if (cmd != NULL && cmd->type == CL_COMMAND_NDRANGE_KERNEL)
603     pocl_check_kernel_dlhandle_cache (cmd, 0, specialize);
604 }
605 
606 /*********************** IMAGES ********************************/
607 
pocl_basic_copy_image_rect(void * data,cl_mem src_image,cl_mem dst_image,pocl_mem_identifier * src_mem_id,pocl_mem_identifier * dst_mem_id,const size_t * src_origin,const size_t * dst_origin,const size_t * region)608 cl_int pocl_basic_copy_image_rect( void *data,
609                                    cl_mem src_image,
610                                    cl_mem dst_image,
611                                    pocl_mem_identifier *src_mem_id,
612                                    pocl_mem_identifier *dst_mem_id,
613                                    const size_t *src_origin,
614                                    const size_t *dst_origin,
615                                    const size_t *region)
616 {
617 
618   size_t px = src_image->image_elem_size * src_image->image_channels;
619   const size_t adj_src_origin[3]
620       = { src_origin[0] * px, src_origin[1], src_origin[2] };
621   const size_t adj_dst_origin[3]
622       = { dst_origin[0] * px, dst_origin[1], dst_origin[2] };
623   const size_t adj_region[3] = { region[0] * px, region[1], region[2] };
624 
625   POCL_MSG_PRINT_MEMORY (
626       " BASIC COPY IMAGE RECT \n"
627       "dst_image %p dst_mem_id %p \n"
628       "src_image %p src_mem_id %p \n"
629       "dst_origin [0,1,2] %zu %zu %zu \n"
630       "src_origin [0,1,2] %zu %zu %zu \n"
631       "region [0,1,2] %zu %zu %zu \n"
632       "px %zu\n",
633       dst_image, dst_mem_id,
634       src_image, src_mem_id,
635       dst_origin[0], dst_origin[1], dst_origin[2],
636       src_origin[0], src_origin[1], src_origin[2],
637       region[0], region[1], region[2],
638       px);
639 
640   pocl_driver_copy_rect (
641       data, dst_mem_id, NULL, src_mem_id, NULL, adj_dst_origin, adj_src_origin,
642       adj_region, dst_image->image_row_pitch, dst_image->image_slice_pitch,
643       src_image->image_row_pitch, src_image->image_slice_pitch);
644 
645   return CL_SUCCESS;
646 }
647 
648 /* copies a region from host or device buffer to device image */
pocl_basic_write_image_rect(void * data,cl_mem dst_image,pocl_mem_identifier * dst_mem_id,const void * __restrict__ src_host_ptr,pocl_mem_identifier * src_mem_id,const size_t * origin,const size_t * region,size_t src_row_pitch,size_t src_slice_pitch,size_t src_offset)649 cl_int pocl_basic_write_image_rect (  void *data,
650                                       cl_mem dst_image,
651                                       pocl_mem_identifier *dst_mem_id,
652                                       const void *__restrict__ src_host_ptr,
653                                       pocl_mem_identifier *src_mem_id,
654                                       const size_t *origin,
655                                       const size_t *region,
656                                       size_t src_row_pitch,
657                                       size_t src_slice_pitch,
658                                       size_t src_offset)
659 {
660   POCL_MSG_PRINT_MEMORY (
661       "BASIC WRITE IMAGE RECT \n"
662       "dst_image %p dst_mem_id %p \n"
663       "src_hostptr %p src_mem_id %p \n"
664       "origin [0,1,2] %zu %zu %zu \n"
665       "region [0,1,2] %zu %zu %zu \n"
666       "row %zu slice %zu offset %zu \n",
667       dst_image, dst_mem_id,
668       src_host_ptr, src_mem_id,
669       origin[0], origin[1], origin[2],
670       region[0], region[1], region[2],
671       src_row_pitch, src_slice_pitch, src_offset);
672 
673   const void *__restrict__ ptr
674       = src_host_ptr ? src_host_ptr : src_mem_id->mem_ptr;
675   ptr += src_offset;
676   const size_t zero_origin[3] = { 0 };
677   size_t px = dst_image->image_elem_size * dst_image->image_channels;
678   if (src_row_pitch == 0)
679     src_row_pitch = px * region[0];
680   if (src_slice_pitch == 0)
681     src_slice_pitch = src_row_pitch * region[1];
682 
683   const size_t adj_origin[3] = { origin[0] * px, origin[1], origin[2] };
684   const size_t adj_region[3] = { region[0] * px, region[1], region[2] };
685 
686   pocl_driver_write_rect (data, ptr, dst_mem_id, NULL, adj_origin, zero_origin,
687                           adj_region, dst_image->image_row_pitch,
688                           dst_image->image_slice_pitch, src_row_pitch,
689                           src_slice_pitch);
690   return CL_SUCCESS;
691 }
692 
693 /* copies a region from device image to host or device buffer */
pocl_basic_read_image_rect(void * data,cl_mem src_image,pocl_mem_identifier * src_mem_id,void * __restrict__ dst_host_ptr,pocl_mem_identifier * dst_mem_id,const size_t * origin,const size_t * region,size_t dst_row_pitch,size_t dst_slice_pitch,size_t dst_offset)694 cl_int pocl_basic_read_image_rect(  void *data,
695                                     cl_mem src_image,
696                                     pocl_mem_identifier *src_mem_id,
697                                     void *__restrict__ dst_host_ptr,
698                                     pocl_mem_identifier *dst_mem_id,
699                                     const size_t *origin,
700                                     const size_t *region,
701                                     size_t dst_row_pitch,
702                                     size_t dst_slice_pitch,
703                                     size_t dst_offset)
704 {
705   POCL_MSG_PRINT_MEMORY (
706       "BASIC READ IMAGE RECT \n"
707       "src_image %p src_mem_id %p \n"
708       "dst_hostptr %p dst_mem_id %p \n"
709       "origin [0,1,2] %zu %zu %zu \n"
710       "region [0,1,2] %zu %zu %zu \n"
711       "row %zu slice %zu offset %zu \n",
712       src_image, src_mem_id,
713       dst_host_ptr, dst_mem_id,
714       origin[0], origin[1], origin[2],
715       region[0], region[1], region[2],
716       dst_row_pitch, dst_slice_pitch, dst_offset);
717 
718   void *__restrict__ ptr = dst_host_ptr ? dst_host_ptr : dst_mem_id->mem_ptr;
719   ptr += dst_offset;
720   const size_t zero_origin[3] = { 0 };
721   size_t px = src_image->image_elem_size * src_image->image_channels;
722   if (dst_row_pitch == 0)
723     dst_row_pitch = px * region[0];
724   if (dst_slice_pitch == 0)
725     dst_slice_pitch = dst_row_pitch * region[1];
726   const size_t adj_origin[3] = { origin[0] * px, origin[1], origin[2] };
727   const size_t adj_region[3] = { region[0] * px, region[1], region[2] };
728 
729   pocl_driver_read_rect (data, ptr, src_mem_id, NULL, adj_origin, zero_origin,
730                          adj_region, src_image->image_row_pitch,
731                          src_image->image_slice_pitch, dst_row_pitch,
732                          dst_slice_pitch);
733   return CL_SUCCESS;
734 }
735 
736 
pocl_basic_map_image(void * data,pocl_mem_identifier * mem_id,cl_mem src_image,mem_mapping_t * map)737 cl_int pocl_basic_map_image (void *data,
738                              pocl_mem_identifier *mem_id,
739                              cl_mem src_image,
740                              mem_mapping_t *map)
741 {
742   assert (map->host_ptr != NULL);
743 
744   if (map->map_flags & CL_MAP_WRITE_INVALIDATE_REGION)
745     return CL_SUCCESS;
746 
747   if (map->host_ptr != ((char *)mem_id->mem_ptr + map->offset))
748     {
749       pocl_basic_read_image_rect (data, src_image, mem_id, map->host_ptr,
750                                   NULL, map->origin, map->region,
751                                   map->row_pitch, map->slice_pitch, 0);
752     }
753   return CL_SUCCESS;
754 }
755 
pocl_basic_unmap_image(void * data,pocl_mem_identifier * mem_id,cl_mem dst_image,mem_mapping_t * map)756 cl_int pocl_basic_unmap_image(void *data,
757                               pocl_mem_identifier *mem_id,
758                               cl_mem dst_image,
759                               mem_mapping_t *map)
760 {
761   if (map->map_flags == CL_MAP_READ)
762     return CL_SUCCESS;
763 
764   if (map->host_ptr != ((char *)mem_id->mem_ptr + map->offset))
765     {
766       pocl_basic_write_image_rect (data, dst_image, mem_id, map->host_ptr,
767                                    NULL, map->origin, map->region,
768                                    map->row_pitch, map->slice_pitch, 0);
769     }
770   return CL_SUCCESS;
771 }
772 
773 cl_int
pocl_basic_fill_image(void * data,cl_mem image,pocl_mem_identifier * image_data,const size_t * origin,const size_t * region,cl_uint4 orig_pixel,pixel_t fill_pixel,size_t pixel_size)774 pocl_basic_fill_image (void *data, cl_mem image,
775                        pocl_mem_identifier *image_data, const size_t *origin,
776                        const size_t *region, cl_uint4 orig_pixel,
777                        pixel_t fill_pixel, size_t pixel_size)
778 {
779    POCL_MSG_PRINT_MEMORY ("BASIC / FILL IMAGE \n"
780                           "image %p data %p \n"
781                           "origin [0,1,2] %zu %zu %zu \n"
782                           "region [0,1,2] %zu %zu %zu \n"
783                           "pixel %p size %zu \n",
784                           image, image_data,
785                           origin[0], origin[1], origin[2],
786                           region[0], region[1], region[2],
787                           fill_pixel, pixel_size);
788 
789   size_t row_pitch = image->image_row_pitch;
790   size_t slice_pitch = image->image_slice_pitch;
791   char *__restrict const adjusted_device_ptr
792       = (char *)image_data->mem_ptr
793         + origin[0] * pixel_size
794         + row_pitch * origin[1]
795         + slice_pitch * origin[2];
796 
797   size_t i, j, k;
798 
799   for (k = 0; k < region[2]; ++k)
800     for (j = 0; j < region[1]; ++j)
801       for (i = 0; i < region[0]; ++i)
802         memcpy (adjusted_device_ptr
803                   + pixel_size * i
804                   + row_pitch * j
805                   + slice_pitch * k,
806                 fill_pixel,
807                 pixel_size);
808   return CL_SUCCESS;
809 }
810 
811 /***************************************************************************/
812 void
pocl_basic_svm_free(cl_device_id dev,void * svm_ptr)813 pocl_basic_svm_free (cl_device_id dev, void *svm_ptr)
814 {
815   /* TODO we should somehow figure out the size argument
816    * and call pocl_free_global_mem */
817   pocl_aligned_free (svm_ptr);
818 }
819 
820 void *
pocl_basic_svm_alloc(cl_device_id dev,cl_svm_mem_flags flags,size_t size)821 pocl_basic_svm_alloc (cl_device_id dev, cl_svm_mem_flags flags, size_t size)
822 
823 {
824   return pocl_aligned_malloc (MAX_EXTENDED_ALIGNMENT, size);
825 }
826 
827 void
pocl_basic_svm_copy(cl_device_id dev,void * __restrict__ dst,const void * __restrict__ src,size_t size)828 pocl_basic_svm_copy (cl_device_id dev, void *__restrict__ dst,
829                      const void *__restrict__ src, size_t size)
830 {
831   memcpy (dst, src, size);
832 }
833 
834 void
pocl_basic_svm_fill(cl_device_id dev,void * __restrict__ svm_ptr,size_t size,void * __restrict__ pattern,size_t pattern_size)835 pocl_basic_svm_fill (cl_device_id dev, void *__restrict__ svm_ptr, size_t size,
836                      void *__restrict__ pattern, size_t pattern_size)
837 {
838   pocl_mem_identifier temp;
839   temp.mem_ptr = svm_ptr;
840   pocl_driver_memfill (dev->data, &temp, NULL, size, 0, pattern, pattern_size);
841 }
842