1 #include "config.h"
2 
3 #include <assert.h>
4 #include <stdlib.h>
5 #include <string.h>
6 #include <unistd.h>
7 
8 #include "pocl_cl.h"
9 #include "utlist.h"
10 
11 // for pocl_aligned_malloc
12 #include "pocl_util.h"
13 
14 #ifdef ENABLE_LLVM
15 #include "pocl_llvm.h"
16 #endif
17 
18 #include "common_driver.h"
19 
/* Formats a printf-style message, appends it to
 * program->build_log[device_i], and then hands `err` and the same message
 * to POCL_RETURN_ERROR_ON with a constant-true condition.
 *
 * `program` and `device_i` must be in scope where the macro is expanded.
 *
 * - At most sizeof (temp) - 1 bytes of formatted text are appended; snprintf
 *   reports the untruncated length, so the value is clamped before copying.
 * - The log is grown by one extra byte for the terminating NUL.
 * - A NULL (not yet allocated) log is treated as an empty string.
 * - If the log cannot be grown, the existing log is left untouched and the
 *   error is still reported. */
#define APPEND_TO_BUILD_LOG_RET(err, ...)                                     \
  do                                                                          \
    {                                                                         \
      char temp[1024];                                                        \
      int written = snprintf (temp, sizeof (temp), __VA_ARGS__);              \
      if (written > 0)                                                        \
        {                                                                     \
          size_t add = ((size_t)written < sizeof (temp))                      \
                           ? (size_t)written                                  \
                           : sizeof (temp) - 1;                               \
          char *oldp = program->build_log[device_i];                          \
          size_t l = (oldp != NULL) ? strlen (oldp) : 0;                      \
          char *newp = realloc (oldp, l + add + 1);                           \
          if (newp != NULL)                                                   \
            {                                                                 \
              memcpy (newp + l, temp, add);                                   \
              newp[l + add] = 0;                                              \
              program->build_log[device_i] = newp;                            \
            }                                                                 \
        }                                                                     \
      POCL_RETURN_ERROR_ON (1, err, __VA_ARGS__);                             \
    }                                                                         \
  while (0)
38 
39 void
pocl_driver_read(void * data,void * __restrict__ host_ptr,pocl_mem_identifier * src_mem_id,cl_mem src_buf,size_t offset,size_t size)40 pocl_driver_read (void *data, void *__restrict__ host_ptr,
41                   pocl_mem_identifier *src_mem_id, cl_mem src_buf,
42                   size_t offset, size_t size)
43 {
44   void *__restrict__ device_ptr = src_mem_id->mem_ptr;
45   if (host_ptr == device_ptr)
46     return;
47 
48   memcpy (host_ptr, (char *)device_ptr + offset, size);
49 }
50 
51 void
pocl_driver_write(void * data,const void * __restrict__ host_ptr,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,size_t offset,size_t size)52 pocl_driver_write (void *data, const void *__restrict__ host_ptr,
53                    pocl_mem_identifier *dst_mem_id, cl_mem dst_buf,
54                    size_t offset, size_t size)
55 {
56   void *__restrict__ device_ptr = dst_mem_id->mem_ptr;
57   if (host_ptr == device_ptr)
58     return;
59 
60   memcpy ((char *)device_ptr + offset, host_ptr, size);
61 }
62 
63 void
pocl_driver_copy(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,pocl_mem_identifier * src_mem_id,cl_mem src_buf,size_t dst_offset,size_t src_offset,size_t size)64 pocl_driver_copy (void *data, pocl_mem_identifier *dst_mem_id, cl_mem dst_buf,
65                   pocl_mem_identifier *src_mem_id, cl_mem src_buf,
66                   size_t dst_offset, size_t src_offset, size_t size)
67 {
68   char *__restrict__ src_ptr = (char *)src_mem_id->mem_ptr;
69   char *__restrict__ dst_ptr = (char *)dst_mem_id->mem_ptr;
70   if ((src_ptr + src_offset) == (dst_ptr + dst_offset))
71     return;
72 
73   memcpy (dst_ptr + dst_offset, src_ptr + src_offset, size);
74 }
75 
76 void
pocl_driver_copy_with_size(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,pocl_mem_identifier * src_mem_id,cl_mem src_buf,pocl_mem_identifier * content_size_buf_mem_id,cl_mem content_size_buf,size_t dst_offset,size_t src_offset,size_t size)77 pocl_driver_copy_with_size (void *data, pocl_mem_identifier *dst_mem_id,
78                             cl_mem dst_buf, pocl_mem_identifier *src_mem_id,
79                             cl_mem src_buf,
80                             pocl_mem_identifier *content_size_buf_mem_id,
81                             cl_mem content_size_buf, size_t dst_offset,
82                             size_t src_offset, size_t size)
83 {
84   char *__restrict__ src_ptr = (char *)src_mem_id->mem_ptr;
85   char *__restrict__ dst_ptr = (char *)dst_mem_id->mem_ptr;
86   if ((src_ptr + src_offset) == (dst_ptr + dst_offset))
87     return;
88 
89   uint64_t *content_size = (uint64_t *)content_size_buf_mem_id->mem_ptr;
90   if (*content_size < (src_offset + size))
91     {
92       if (*content_size > src_offset)
93         {
94           size_t real_bytes = *content_size - src_offset;
95           size_t to_copy = real_bytes < size ? real_bytes : size;
96           memcpy (dst_ptr + dst_offset, src_ptr + src_offset, to_copy);
97         }
98     }
99   else
100     memcpy (dst_ptr + dst_offset, src_ptr + src_offset, size);
101 }
102 
103 void
pocl_driver_copy_rect(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,pocl_mem_identifier * src_mem_id,cl_mem src_buf,const size_t * __restrict__ const dst_origin,const size_t * __restrict__ const src_origin,const size_t * __restrict__ const region,size_t const dst_row_pitch,size_t const dst_slice_pitch,size_t const src_row_pitch,size_t const src_slice_pitch)104 pocl_driver_copy_rect (void *data, pocl_mem_identifier *dst_mem_id,
105                        cl_mem dst_buf, pocl_mem_identifier *src_mem_id,
106                        cl_mem src_buf,
107                        const size_t *__restrict__ const dst_origin,
108                        const size_t *__restrict__ const src_origin,
109                        const size_t *__restrict__ const region,
110                        size_t const dst_row_pitch,
111                        size_t const dst_slice_pitch,
112                        size_t const src_row_pitch,
113                        size_t const src_slice_pitch)
114 {
115 
116   void *__restrict__ src_ptr = src_mem_id->mem_ptr;
117   void *__restrict__ dst_ptr = dst_mem_id->mem_ptr;
118   char const *__restrict const adjusted_src_ptr
119       = (char const *)src_ptr + src_origin[0] + src_row_pitch * src_origin[1]
120         + src_slice_pitch * src_origin[2];
121   char *__restrict__ const adjusted_dst_ptr
122       = (char *)dst_ptr + dst_origin[0] + dst_row_pitch * dst_origin[1]
123         + dst_slice_pitch * dst_origin[2];
124 
125   POCL_MSG_PRINT_MEMORY (
126       "COPY RECT \n"
127       "SRC %p DST %p SIZE %zu\n"
128       "src origin %u %u %u dst origin %u %u %u \n"
129       "src_row_pitch %lu src_slice pitch %lu\n"
130       "dst_row_pitch %lu dst_slice_pitch %lu\n"
131       "reg[0] %lu reg[1] %lu reg[2] %lu\n",
132       adjusted_src_ptr, adjusted_dst_ptr, region[0] * region[1] * region[2],
133       (unsigned)src_origin[0], (unsigned)src_origin[1],
134       (unsigned)src_origin[2], (unsigned)dst_origin[0],
135       (unsigned)dst_origin[1], (unsigned)dst_origin[2],
136       (unsigned long)src_row_pitch, (unsigned long)src_slice_pitch,
137       (unsigned long)dst_row_pitch, (unsigned long)dst_slice_pitch,
138       (unsigned long)region[0], (unsigned long)region[1],
139       (unsigned long)region[2]);
140 
141   size_t j, k;
142 
143   /* TODO: handle overlaping regions */
144   if ((src_row_pitch == dst_row_pitch && dst_row_pitch == region[0])
145       && (src_slice_pitch == dst_slice_pitch
146           && dst_slice_pitch == (region[1] * region[0])))
147     {
148       memcpy (adjusted_dst_ptr, adjusted_src_ptr,
149               region[2] * region[1] * region[0]);
150     }
151   else
152     {
153       for (k = 0; k < region[2]; ++k)
154         for (j = 0; j < region[1]; ++j)
155           memcpy (adjusted_dst_ptr + dst_row_pitch * j + dst_slice_pitch * k,
156                   adjusted_src_ptr + src_row_pitch * j + src_slice_pitch * k,
157                   region[0]);
158     }
159 }
160 
161 void
pocl_driver_write_rect(void * data,const void * __restrict__ const host_ptr,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,const size_t * __restrict__ const buffer_origin,const size_t * __restrict__ const host_origin,const size_t * __restrict__ const region,size_t const buffer_row_pitch,size_t const buffer_slice_pitch,size_t const host_row_pitch,size_t const host_slice_pitch)162 pocl_driver_write_rect (void *data, const void *__restrict__ const host_ptr,
163                         pocl_mem_identifier *dst_mem_id, cl_mem dst_buf,
164                         const size_t *__restrict__ const buffer_origin,
165                         const size_t *__restrict__ const host_origin,
166                         const size_t *__restrict__ const region,
167                         size_t const buffer_row_pitch,
168                         size_t const buffer_slice_pitch,
169                         size_t const host_row_pitch,
170                         size_t const host_slice_pitch)
171 {
172   void *__restrict__ device_ptr = dst_mem_id->mem_ptr;
173 
174   char *__restrict const adjusted_device_ptr
175       = (char *)device_ptr + buffer_origin[0]
176         + buffer_row_pitch * buffer_origin[1]
177         + buffer_slice_pitch * buffer_origin[2];
178   char const *__restrict__ const adjusted_host_ptr
179       = (char const *)host_ptr + host_origin[0]
180         + host_row_pitch * host_origin[1] + host_slice_pitch * host_origin[2];
181 
182   POCL_MSG_PRINT_MEMORY (
183       "WRITE RECT \n"
184       "SRC HOST %p DST DEV %p SIZE %zu\n"
185       "borigin %u %u %u horigin %u %u %u \n"
186       "row_pitch %lu slice pitch \n"
187       "%lu host_row_pitch %lu host_slice_pitch %lu\n"
188       "reg[0] %lu reg[1] %lu reg[2] %lu\n",
189       adjusted_host_ptr, adjusted_device_ptr,
190       region[0] * region[1] * region[2], (unsigned)buffer_origin[0],
191       (unsigned)buffer_origin[1], (unsigned)buffer_origin[2],
192       (unsigned)host_origin[0], (unsigned)host_origin[1],
193       (unsigned)host_origin[2], (unsigned long)buffer_row_pitch,
194       (unsigned long)buffer_slice_pitch, (unsigned long)host_row_pitch,
195       (unsigned long)host_slice_pitch, (unsigned long)region[0],
196       (unsigned long)region[1], (unsigned long)region[2]);
197 
198   size_t j, k;
199 
200   /* TODO: handle overlaping regions */
201   if ((buffer_row_pitch == host_row_pitch && host_row_pitch == region[0])
202       && (buffer_slice_pitch == host_slice_pitch
203           && host_slice_pitch == (region[1] * region[0])))
204     {
205       memcpy (adjusted_device_ptr, adjusted_host_ptr,
206               region[2] * region[1] * region[0]);
207     }
208   else
209     {
210       for (k = 0; k < region[2]; ++k)
211         for (j = 0; j < region[1]; ++j)
212           memcpy (adjusted_device_ptr + buffer_row_pitch * j
213                       + buffer_slice_pitch * k,
214                   adjusted_host_ptr + host_row_pitch * j
215                       + host_slice_pitch * k,
216                   region[0]);
217     }
218 }
219 
220 void
pocl_driver_read_rect(void * data,void * __restrict__ const host_ptr,pocl_mem_identifier * src_mem_id,cl_mem src_buf,const size_t * __restrict__ const buffer_origin,const size_t * __restrict__ const host_origin,const size_t * __restrict__ const region,size_t const buffer_row_pitch,size_t const buffer_slice_pitch,size_t const host_row_pitch,size_t const host_slice_pitch)221 pocl_driver_read_rect (void *data, void *__restrict__ const host_ptr,
222                        pocl_mem_identifier *src_mem_id, cl_mem src_buf,
223                        const size_t *__restrict__ const buffer_origin,
224                        const size_t *__restrict__ const host_origin,
225                        const size_t *__restrict__ const region,
226                        size_t const buffer_row_pitch,
227                        size_t const buffer_slice_pitch,
228                        size_t const host_row_pitch,
229                        size_t const host_slice_pitch)
230 {
231   void *__restrict__ device_ptr = src_mem_id->mem_ptr;
232 
233   char const *__restrict const adjusted_device_ptr
234       = (char const *)device_ptr + buffer_origin[2] * buffer_slice_pitch
235         + buffer_origin[1] * buffer_row_pitch + buffer_origin[0];
236   char *__restrict__ const adjusted_host_ptr
237       = (char *)host_ptr + host_origin[2] * host_slice_pitch
238         + host_origin[1] * host_row_pitch + host_origin[0];
239 
240   POCL_MSG_PRINT_MEMORY (
241       "READ RECT \n"
242       "SRC DEV %p DST HOST %p SIZE %zu\n"
243       "borigin %u %u %u horigin %u %u %u row_pitch %lu slice pitch "
244       "%lu host_row_pitch %lu host_slice_pitch %lu\n"
245       "reg[0] %lu reg[1] %lu reg[2] %lu\n",
246       adjusted_device_ptr, adjusted_host_ptr,
247       region[0] * region[1] * region[2], (unsigned)buffer_origin[0],
248       (unsigned)buffer_origin[1], (unsigned)buffer_origin[2],
249       (unsigned)host_origin[0], (unsigned)host_origin[1],
250       (unsigned)host_origin[2], (unsigned long)buffer_row_pitch,
251       (unsigned long)buffer_slice_pitch, (unsigned long)host_row_pitch,
252       (unsigned long)host_slice_pitch, (unsigned long)region[0],
253       (unsigned long)region[1], (unsigned long)region[2]);
254 
255   size_t j, k;
256 
257   /* TODO: handle overlaping regions */
258   if ((buffer_row_pitch == host_row_pitch && host_row_pitch == region[0])
259       && (buffer_slice_pitch == host_slice_pitch
260           && host_slice_pitch == (region[1] * region[0])))
261     {
262       memcpy (adjusted_host_ptr, adjusted_device_ptr,
263               region[2] * region[1] * region[0]);
264     }
265   else
266     {
267       for (k = 0; k < region[2]; ++k)
268         for (j = 0; j < region[1]; ++j)
269           memcpy (adjusted_host_ptr + host_row_pitch * j
270                       + host_slice_pitch * k,
271                   adjusted_device_ptr + buffer_row_pitch * j
272                       + buffer_slice_pitch * k,
273                   region[0]);
274     }
275 }
276 
277 void
pocl_driver_memfill(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,size_t size,size_t offset,const void * __restrict__ pattern,size_t pattern_size)278 pocl_driver_memfill (void *data, pocl_mem_identifier *dst_mem_id,
279                      cl_mem dst_buf, size_t size, size_t offset,
280                      const void *__restrict__ pattern, size_t pattern_size)
281 {
282   void *__restrict__ ptr = dst_mem_id->mem_ptr;
283   size_t i;
284   unsigned j;
285 
286   /* memfill size is in bytes, we wanto make it into elements */
287   size /= pattern_size;
288   offset /= pattern_size;
289 
290   switch (pattern_size)
291     {
292     case 1:
293       {
294         uint8_t *p = (uint8_t *)ptr + offset;
295         for (i = 0; i < size; i++)
296           p[i] = *(uint8_t *)pattern;
297       }
298       break;
299     case 2:
300       {
301         uint16_t *p = (uint16_t *)ptr + offset;
302         for (i = 0; i < size; i++)
303           p[i] = *(uint16_t *)pattern;
304       }
305       break;
306     case 4:
307       {
308         uint32_t *p = (uint32_t *)ptr + offset;
309         for (i = 0; i < size; i++)
310           p[i] = *(uint32_t *)pattern;
311       }
312       break;
313     case 8:
314       {
315         uint64_t *p = (uint64_t *)ptr + offset;
316         for (i = 0; i < size; i++)
317           p[i] = *(uint64_t *)pattern;
318       }
319       break;
320     case 16:
321       {
322         uint64_t *p = (uint64_t *)ptr + (offset << 1);
323         for (i = 0; i < size; i++)
324           for (j = 0; j < 2; j++)
325             p[(i << 1) + j] = *((uint64_t *)pattern + j);
326       }
327       break;
328     case 32:
329       {
330         uint64_t *p = (uint64_t *)ptr + (offset << 2);
331         for (i = 0; i < size; i++)
332           for (j = 0; j < 4; j++)
333             p[(i << 2) + j] = *((uint64_t *)pattern + j);
334       }
335       break;
336     case 64:
337       {
338         uint64_t *p = (uint64_t *)ptr + (offset << 3);
339         for (i = 0; i < size; i++)
340           for (j = 0; j < 8; j++)
341             p[(i << 3) + j] = *((uint64_t *)pattern + j);
342       }
343       break;
344     case 128:
345       {
346         uint64_t *p = (uint64_t *)ptr + (offset << 4);
347         for (i = 0; i < size; i++)
348           for (j = 0; j < 16; j++)
349             p[(i << 4) + j] = *((uint64_t *)pattern + j);
350       }
351       break;
352     default:
353       assert (0 && "Invalid pattern size");
354       break;
355     }
356 }
357 
358 cl_int
pocl_driver_map_mem(void * data,pocl_mem_identifier * src_mem_id,cl_mem src_buf,mem_mapping_t * map)359 pocl_driver_map_mem (void *data, pocl_mem_identifier *src_mem_id,
360                      cl_mem src_buf, mem_mapping_t *map)
361 {
362   char *__restrict__ src_device_ptr = (char *)src_mem_id->mem_ptr;
363   assert (map->host_ptr);
364 
365   if (map->map_flags & CL_MAP_WRITE_INVALIDATE_REGION)
366     {
367       return CL_SUCCESS;
368     }
369 
370   if (map->host_ptr == (src_device_ptr + map->offset))
371     NULL;
372   else
373     memcpy (map->host_ptr, src_device_ptr + map->offset, map->size);
374 
375   return CL_SUCCESS;
376 }
377 
378 cl_int
pocl_driver_unmap_mem(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,mem_mapping_t * map)379 pocl_driver_unmap_mem (void *data, pocl_mem_identifier *dst_mem_id,
380                        cl_mem dst_buf, mem_mapping_t *map)
381 {
382   char *__restrict__ dst_device_ptr = (char *)dst_mem_id->mem_ptr;
383   assert (map->host_ptr);
384 
385   if (map->host_ptr == (dst_device_ptr + map->offset))
386     NULL;
387   else
388     {
389       if (map->map_flags != CL_MAP_READ)
390         memcpy (dst_device_ptr + map->offset, map->host_ptr, map->size);
391     }
392 
393   return CL_SUCCESS;
394 }
395 
396 cl_int
pocl_driver_get_mapping_ptr(void * data,pocl_mem_identifier * mem_id,cl_mem mem,mem_mapping_t * map)397 pocl_driver_get_mapping_ptr (void *data, pocl_mem_identifier *mem_id,
398                              cl_mem mem, mem_mapping_t *map)
399 {
400   char *__restrict__ src_device_ptr = (char *)mem_id->mem_ptr;
401   assert (mem->size > 0);
402   assert (map->size > 0);
403 
404   if (mem->mem_host_ptr != NULL)
405     {
406       map->host_ptr = mem->mem_host_ptr + map->offset;
407     }
408   else
409     {
410       map->host_ptr = pocl_aligned_malloc (16, map->size);
411     }
412 
413   assert (map->host_ptr);
414   return CL_SUCCESS;
415 }
416 
417 cl_int
pocl_driver_free_mapping_ptr(void * data,pocl_mem_identifier * mem_id,cl_mem mem,mem_mapping_t * map)418 pocl_driver_free_mapping_ptr (void *data, pocl_mem_identifier *mem_id,
419                               cl_mem mem, mem_mapping_t *map)
420 {
421   char *__restrict__ src_device_ptr = (char *)mem_id->mem_ptr;
422   if (map->host_ptr == NULL)
423     return CL_SUCCESS;
424 
425   if ((mem->mem_host_ptr != NULL)
426       && map->host_ptr != (mem->mem_host_ptr + map->offset))
427     pocl_aligned_free (map->host_ptr);
428 
429   map->host_ptr = NULL;
430   return CL_SUCCESS;
431 }
432 
433 /* These are implementations of compilation callbacks for all devices
434  * that support compilation via LLVM. They take care of compilation/linking
435  * of source/binary/spir down to parallel.bc level.
436  *
437  * The driver only has to provide the "device->ops->compile_kernel" callback,
438  * which compiles parallel.bc to whatever final binary format is needed.
439  *
440  * Devices that support compilation by other means than LLVM,
441  * must reimplement these callbacks.
442  */
443 
444 #ifdef ENABLE_LLVM
445 /* Converts SPIR to LLVM IR, and links it to pocl's kernel library. */
446 static int
pocl_llvm_link_and_convert_spir(cl_program program,cl_uint device_i,int link_program,int spir_build)447 pocl_llvm_link_and_convert_spir (cl_program program, cl_uint device_i,
448                                  int link_program, int spir_build)
449 {
450   cl_device_id device = program->devices[device_i];
451   int error;
452 
453   /* SPIR-V was handled; bitcode is now either plain LLVM IR or SPIR IR */
454   int spir_binary
455       = bitcode_is_triple ((char *)program->binaries[device_i],
456                            program->binary_sizes[device_i], "spir");
457   if (spir_binary)
458     POCL_MSG_PRINT_LLVM ("LLVM-SPIR binary detected\n");
459   else
460     POCL_MSG_PRINT_LLVM ("building from a BC binary for device %d\n",
461                          device_i);
462 
463   if (spir_binary)
464     {
465 #ifdef ENABLE_SPIR
466       if (!strstr (device->extensions, "cl_khr_spir"))
467         {
468           APPEND_TO_BUILD_LOG_RET (CL_LINK_PROGRAM_FAILURE,
469                                    "SPIR support is not available"
470                                    "for device %s\n",
471                                    device->short_name);
472         }
473       if (!spir_build)
474         POCL_MSG_WARN ("SPIR binary provided, but no spir in build options\n");
475 
476       /* SPIR binaries need to be explicitly linked to the kernel
477        * library. For non-SPIR binaries this happens as part of build
478        * process when program.bc is generated. */
479       error = pocl_llvm_link_program (
480           program, device_i, 1, &program->binaries[device_i],
481           &program->binary_sizes[device_i], NULL, link_program, 1);
482 
483       POCL_RETURN_ERROR_ON (error, CL_LINK_PROGRAM_FAILURE,
484                             "Failed to link SPIR program.bc\n");
485 #else
486       APPEND_TO_BUILD_LOG_RET (CL_LINK_PROGRAM_FAILURE,
487                                "SPIR support is not available"
488                                "for device %s\n",
489                                device->short_name);
490 #endif
491     }
492   return CL_SUCCESS;
493 }
494 #endif
495 
496 int
pocl_driver_build_source(cl_program program,cl_uint device_i,cl_uint num_input_headers,const cl_program * input_headers,const char ** header_include_names,int link_program)497 pocl_driver_build_source (cl_program program, cl_uint device_i,
498                           cl_uint num_input_headers,
499                           const cl_program *input_headers,
500                           const char **header_include_names, int link_program)
501 {
502   assert (program->devices[device_i]->compiler_available == CL_TRUE);
503   assert (program->devices[device_i]->linker_available == CL_TRUE);
504 
505 #ifdef ENABLE_LLVM
506 
507   POCL_MSG_PRINT_LLVM ("building from sources for device %d\n", device_i);
508 
509   return pocl_llvm_build_program (program, device_i, num_input_headers,
510                                   input_headers, header_include_names,
511                                   link_program);
512 
513 #else
514   POCL_RETURN_ERROR_ON (1, CL_BUILD_PROGRAM_FAILURE,
515                         "This device requires LLVM to build from sources\n");
516 #endif
517 }
518 
519 int
pocl_driver_build_binary(cl_program program,cl_uint device_i,int link_program,int spir_build)520 pocl_driver_build_binary (cl_program program, cl_uint device_i,
521                           int link_program, int spir_build)
522 {
523 
524 #ifdef ENABLE_LLVM
525   /* poclbinary doesn't need special handling */
526   if (program->pocl_binaries[device_i])
527     {
528       /* program.bc must be either NULL or unpacked by now */
529       if (program->binaries[device_i] == NULL)
530         POCL_MSG_WARN ("pocl-binary for this device doesn't contain "
531                        "program.bc - you won't be able to rebuild it\n");
532       else
533         pocl_llvm_read_program_llvm_irs (program, device_i, NULL);
534     }
535   else /* program->binaries but not poclbinary */
536     {
537       assert (program->binaries[device_i]);
538       int err = pocl_llvm_link_and_convert_spir (program, device_i,
539                                                  link_program, spir_build);
540       if (err != CL_SUCCESS)
541         return err;
542       pocl_llvm_read_program_llvm_irs (program, device_i, NULL);
543     }
544   return CL_SUCCESS;
545 #else
546   POCL_RETURN_ERROR_ON ((program->pocl_binaries[device_i] == NULL),
547                         CL_BUILD_PROGRAM_FAILURE,
548                         "This device requires LLVM to "
549                         "build from SPIR/LLVM bitcode\n");
550   return CL_SUCCESS;
551 #endif
552 }
553 
554 int
pocl_driver_link_program(cl_program program,cl_uint device_i,cl_uint num_input_programs,const cl_program * input_programs,int create_library)555 pocl_driver_link_program (cl_program program, cl_uint device_i,
556                           cl_uint num_input_programs,
557                           const cl_program *input_programs, int create_library)
558 {
559   assert (program->devices[device_i]->linker_available == CL_TRUE);
560 
561 #ifdef ENABLE_LLVM
562   cl_device_id device = program->devices[device_i];
563   /* just link binaries. */
564   unsigned char **cur_device_binaries = (unsigned char **)alloca (
565       num_input_programs * sizeof (unsigned char *));
566   size_t *cur_device_binary_sizes
567       = (size_t *)alloca (num_input_programs * sizeof (size_t));
568   void **cur_device_llvm_irs
569       = (void **)alloca (num_input_programs * sizeof (void *));
570 
571   cl_uint i;
572   for (i = 0; i < num_input_programs; i++)
573     {
574       assert (device == input_programs[i]->devices[device_i]);
575       POCL_LOCK_OBJ (input_programs[i]);
576 
577       cur_device_binaries[i] = input_programs[i]->binaries[device_i];
578       assert (cur_device_binaries[i]);
579       cur_device_binary_sizes[i] = input_programs[i]->binary_sizes[device_i];
580       assert (cur_device_binary_sizes[i] > 0);
581 
582       pocl_llvm_read_program_llvm_irs (input_programs[i], device_i, NULL);
583 
584       cur_device_llvm_irs[i] = input_programs[i]->data[device_i];
585       assert (cur_device_llvm_irs[i]);
586       POCL_UNLOCK_OBJ (input_programs[i]);
587     }
588 
589   int err = pocl_llvm_link_program (
590       program, device_i, num_input_programs, cur_device_binaries,
591       cur_device_binary_sizes, cur_device_llvm_irs, !create_library, 0);
592 
593   POCL_RETURN_ERROR_ON ((err != CL_SUCCESS), CL_LINK_PROGRAM_FAILURE,
594                         "This device requires LLVM to link binaries\n");
595   return CL_SUCCESS;
596 #else
597   POCL_RETURN_ERROR_ON (1, CL_BUILD_PROGRAM_FAILURE,
598                         "This device cannot link anything\n");
599 
600 #endif
601 }
602 
603 int
pocl_driver_free_program(cl_device_id device,cl_program program,unsigned program_device_i)604 pocl_driver_free_program (cl_device_id device, cl_program program,
605                           unsigned program_device_i)
606 {
607 #ifdef ENABLE_LLVM
608   pocl_llvm_free_llvm_irs (program, program_device_i);
609 #endif
610   return 0;
611 }
612 
613 int
pocl_driver_setup_metadata(cl_device_id device,cl_program program,unsigned program_device_i)614 pocl_driver_setup_metadata (cl_device_id device, cl_program program,
615                             unsigned program_device_i)
616 {
617 #ifdef ENABLE_LLVM
618   unsigned num_kernels
619       = pocl_llvm_get_kernel_count (program, program_device_i);
620 
621   /* TODO zero kernels in program case */
622   if (num_kernels)
623     {
624       program->num_kernels = num_kernels;
625       program->kernel_meta
626           = calloc (program->num_kernels, sizeof (pocl_kernel_metadata_t));
627       pocl_llvm_get_kernels_metadata (program, program_device_i);
628     }
629   return 1;
630 #else
631   return 0;
632 #endif
633 }
634 
635 int
pocl_driver_supports_binary(cl_device_id device,const size_t length,const char * binary)636 pocl_driver_supports_binary (cl_device_id device, const size_t length,
637                              const char *binary)
638 {
639 #ifdef ENABLE_LLVM
640 
641   /* SPIR binary is supported */
642   if (bitcode_is_triple (binary, length, "spir"))
643     {
644       POCL_RETURN_ERROR_ON (
645           (strstr (device->extensions, "cl_khr_spir") == NULL),
646           CL_BUILD_PROGRAM_FAILURE,
647           "SPIR binary provided, but device has no SPIR support");
648       return 1;
649     }
650 
651   /* LLVM IR can be supported by the driver, if the triple matches */
652   if (device->llvm_target_triplet
653       && bitcode_is_triple (binary, length, device->llvm_target_triplet))
654     return 1;
655 
656   POCL_MSG_ERR ("Unknown binary type.\n");
657   return 0;
658 #else
659   POCL_MSG_ERR (
660       "This driver was not build with LLVM support, so "
661       "don't support loading SPIR or LLVM IR binaries, only poclbinaries.\n");
662   return 0;
663 #endif
664 }
665 
666 /* Build the dynamic WG sized parallel.bc and device specific code,
667    for each kernel. This must be called *after* metadata has been setup  */
668 int
pocl_driver_build_poclbinary(cl_program program,cl_uint device_i)669 pocl_driver_build_poclbinary (cl_program program, cl_uint device_i)
670 {
671   unsigned i;
672   _cl_command_node cmd;
673   cl_device_id device = program->devices[device_i];
674 
675   assert (program->build_status == CL_BUILD_SUCCESS);
676   if (program->num_kernels == 0)
677     return CL_SUCCESS;
678 
679   /* For binaries of other than Executable type (libraries, compiled but
680    * not linked programs, etc), do not attempt to compile the kernels. */
681   if (program->binary_type != CL_PROGRAM_BINARY_TYPE_EXECUTABLE)
682     return CL_SUCCESS;
683 
684   memset (&cmd, 0, sizeof (_cl_command_node));
685   cmd.type = CL_COMMAND_NDRANGE_KERNEL;
686 
687   POCL_LOCK_OBJ (program);
688 
689   assert (program->binaries[device_i]);
690 
691   cmd.device = device;
692   cmd.device_i = device_i;
693 
694   struct _cl_kernel fake_k;
695   memset (&fake_k, 0, sizeof (fake_k));
696   fake_k.context = program->context;
697   fake_k.program = program;
698   fake_k.next = NULL;
699   cl_kernel kernel = &fake_k;
700 
701   for (i = 0; i < program->num_kernels; i++)
702     {
703       fake_k.meta = &program->kernel_meta[i];
704       fake_k.name = fake_k.meta->name;
705       cmd.command.run.hash = fake_k.meta->build_hash[device_i];
706 
707       size_t local_x = 0, local_y = 0, local_z = 0;
708 
709       if (kernel->meta->reqd_wg_size[0] > 0
710           && kernel->meta->reqd_wg_size[1] > 0
711           && kernel->meta->reqd_wg_size[2] > 0)
712         {
713           local_x = kernel->meta->reqd_wg_size[0];
714           local_y = kernel->meta->reqd_wg_size[1];
715           local_z = kernel->meta->reqd_wg_size[2];
716         }
717 
718       cmd.command.run.pc.local_size[0] = local_x;
719       cmd.command.run.pc.local_size[1] = local_y;
720       cmd.command.run.pc.local_size[2] = local_z;
721 
722       cmd.command.run.kernel = kernel;
723 
724       cmd.command.run.pc.global_offset[0] = cmd.command.run.pc.global_offset[1]
725           = cmd.command.run.pc.global_offset[2] = 0;
726 
727       /* Force generate a generic WG function to ensure generality. */
728       device->ops->compile_kernel (&cmd, kernel, device, 0);
729       /* Then generate a specialized one with goffset 0 since it's a very
730          common case. */
731       device->ops->compile_kernel (&cmd, kernel, device, 1);
732     }
733 
734   POCL_UNLOCK_OBJ (program);
735 
736   return CL_SUCCESS;
737 }
738