1 /* tce_common.cc - common functionality over the different TCE/TTA device
2    drivers.
3 
4    Copyright (c) 2012-2019 Pekka Jääskeläinen / Tampere University of Technology
5 
6    Permission is hereby granted, free of charge, to any person obtaining a copy
7    of this software and associated documentation files (the "Software"), to deal
8    in the Software without restriction, including without limitation the rights
9    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10    copies of the Software, and to permit persons to whom the Software is
11    furnished to do so, subject to the following conditions:
12 
13    The above copyright notice and this permission notice shall be included in
14    all copies or substantial portions of the Software.
15 
16    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22    THE SOFTWARE.
23 */
24 #include "config.h"
25 
26 #include "tce_common.h"
27 #include "pocl_util.h"
28 #include "pocl_cache.h"
29 #include "pocl_llvm.h"
30 #include "pocl_device.h"
31 #include "utlist.h"
32 #include "common.h"
33 
34 #include "pocl_runtime_config.h"
35 #include "pocl_hash.h"
36 #include "pocl_cache.h"
37 
38 #ifndef _MSC_VER
39 #  include <unistd.h>
40 #else
41 #  include "vccompat.hpp"
42 #endif
43 
/* Suppress some warnings caused by including tce_config.h after pocl's config.h. */
45 #undef PACKAGE
46 #undef PACKAGE_BUGREPORT
47 #undef PACKAGE_NAME
48 #undef PACKAGE_STRING
49 #undef PACKAGE_TARNAME
50 #undef PACKAGE_VERSION
51 #undef VERSION
52 #undef SIZEOF_DOUBLE
53 
54 #include <Machine.hh>
55 #include <Program.hh>
56 #include <DataLabel.hh>
57 #include <AddressSpace.hh>
58 #include <GlobalScope.hh>
59 #include <Environment.hh>
60 
61 using namespace TTAMachine;
62 
63 #include <algorithm>
64 #include <sstream>
65 
66 #define ALIGNMENT MAX_EXTENDED_ALIGNMENT
67 
68 //#define DEBUG_TTA_DRIVER
69 
TCEDevice(cl_device_id dev,const char * adfName)70 TCEDevice::TCEDevice(cl_device_id dev, const char* adfName) :
71   local_as(NULL), global_as(NULL), private_as(NULL), machine_file(adfName), parent(dev),
72   currentProgram(NULL), curKernelAddr(0), curKernel(NULL), globalCycleCount(0),
73   ready_list(NULL), command_list(NULL) {
74   parent->data = this;
75   pthread_mutex_init (&cq_lock, NULL);
76   POCL_INIT_LOCK(tce_compile_lock);
77   dev->address_bits = 32;
78   dev->autolocals_to_args = POCL_AUTOLOCALS_TO_ARGS_ALWAYS;
79   /* This assumes TCE is always Little-endian;
80    * needsByteSwap is set up again in TTASimDevice
81    * after we know whether ADF is big- or little-endian. */
82 #if defined(WORDS_BIGENDIAN) && WORDS_BIGENDIAN == 1
83   needsByteSwap = true;
84 #else
85   needsByteSwap = false;
86 #endif
87 }
88 
~TCEDevice()89 TCEDevice::~TCEDevice() {
90   parent->data = NULL;
91 }
92 
93 bool
isMultiCoreMachine() const94 TCEDevice::isMultiCoreMachine() const {
95 #ifdef TCEMC_AVAILABLE
96   assert (machine_ != NULL);
97   return machine_->coreCount() > 1;
98 #else
99   return false;
100 #endif
101 }
102 
103 /**
104  * This should be called by the derived classes at the point the
105  * TTA machine description is loaded. It loads additional device
106  * properties from the parsed ADF.
107  */
108 void
setMachine(const TTAMachine::Machine & machine)109 TCEDevice::setMachine(const TTAMachine::Machine& machine) {
110   machine_ = &machine;
111 }
112 
113 void
writeWordToDevice(uint32_t dest_addr,uint32_t word)114 TCEDevice::writeWordToDevice(uint32_t dest_addr, uint32_t word) {
115   uint32_t swapped = byteswap_uint32_t(word, needsByteSwap);
116   copyHostToDevice(&swapped, dest_addr, sizeof (swapped));
117 }
118 
119 uint32_t
readWordFromDevice(uint32_t addr)120 TCEDevice::readWordFromDevice(uint32_t addr) {
121   uint32_t result;
122   copyDeviceToHost(addr, &result, sizeof(result));
123   return byteswap_uint32_t(result, needsByteSwap);
124 }
125 
126 void
findDataMemoryAddresses()127 TCEDevice::findDataMemoryAddresses() {
128   /* Figure out the locations of the shared data structures in
129      the device memories from the fully-linked program. */
130   const TTAProgram::Program* prog = currentProgram;
131   assert (prog != NULL);
132   commandQueueAddr = global_as->start() + TTA_UNALLOCATED_GLOBAL_SPACE;
133 }
134 
135 void
initDataMemory()136 TCEDevice::initDataMemory() {
137   findDataMemoryAddresses();
138   writeWordToDevice(commandQueueAddr, POCL_KST_FREE);
139 }
140 
141 void
initMemoryManagement(const TTAMachine::Machine & mach)142 TCEDevice::initMemoryManagement(const TTAMachine::Machine& mach) {
143   /* Create the memory allocation book keeping structures based on
144      the machine's address spaces (see tta.txt). */
145   Machine::AddressSpaceNavigator nav = mach.addressSpaceNavigator();
146 
147   for (int i = 0; i < nav.count(); ++i) {
148     AddressSpace *as = nav.item(i);
149     if (as->hasNumericalId(TTA_ASID_LOCAL)) {
150       local_as = as;
151     }
152     if (as->hasNumericalId(TTA_ASID_PRIVATE)) {
153       private_as = as;
154     }
155     if (as->hasNumericalId(TTA_ASID_GLOBAL) &&
156         as->hasNumericalId(TTA_ASID_CONSTANT)) {
157       global_as = as;
158     }
159   }
160   if (local_as == NULL)
161     POCL_ABORT("local address space not found in the ADF. "
162                "Mark it by adding numerical id 4 to the AS.\n"
163 	       "Local address space can be same as private AS.\n");
164 
165 
166   if (isMultiCoreMachine() && local_as->isShared())
167     POCL_ABORT("The local address space is marked as shared!\n");
168 
169   if (private_as == NULL)
170     POCL_ABORT("private address space not found in the ADF. "
171                "Mark it by adding numerical id 0 to the AS.\n"
172 	       "Private address space can be same as local AS.\n");
173 
174   if (isMultiCoreMachine() && private_as->isShared())
175     POCL_ABORT("The private address space is marked as shared!\n");
176 
177   if (global_as == NULL)
178     POCL_ABORT("global address space not found in the ADF. "
179                "Mark it by adding numerical ids 3 and 5 to the AS.\n");
180 
181   if (isMultiCoreMachine() && !global_as->isShared())
182     POCL_ABORT("The global address space is not marked as shared!\n");
183 
184   int local_size = (private_as == local_as) ?
185     local_as->end() - local_as->start() - TTA_UNALLOCATED_LOCAL_SPACE:
186     local_as->end() - local_as->start();
187   if (local_size < 0)
188     POCL_ABORT("Not enough space in the local memory with the assumed unallocated space.\n");
189 
190   parent->local_mem_size = local_size;
191   int global_size = global_as->end() - local_as->start() - TTA_UNALLOCATED_GLOBAL_SPACE;
192   if (global_size < 0)
193     POCL_ABORT("Not enough space in the global memory with the assumed unallocated space.\n");
194   parent->global_mem_size = global_size;
195   parent->max_mem_alloc_size = global_size;
196 
197   init_mem_region
198     (&local_mem, (memory_address_t)local_as->start(), parent->local_mem_size);
199   init_mem_region
200     (&global_mem, (memory_address_t)global_as->start() + TTA_UNALLOCATED_GLOBAL_SPACE + sizeof(__kernel_exec_cmd),
201      parent->global_mem_size);
202 }
203 
204 #define SUBST(x) "  -DKERNEL_EXE_CMD_OFFSET=" # x
205 #define OFFSET_ARG(c) SUBST(c)
206 
tceccCommandLine(_cl_command_run * run_cmd,const TCEString & tempDir,const TCEString & inputSrc,const TCEString & outputTpef,const TCEString extraParams)207 TCEString TCEDevice::tceccCommandLine(_cl_command_run *run_cmd,
208                                       const TCEString &tempDir,
209                                       const TCEString &inputSrc,
210                                       const TCEString &outputTpef,
211                                       const TCEString extraParams) {
212 
213   TCEString mainC;
214   if (isMultiCoreMachine())
215     mainC = "tta_device_main_dthread.c";
216   else
217     mainC = "tta_device_main.c";
218 
219   TCEString deviceMainSrc;
220   TCEString poclIncludePathSwitch;
221   if (pocl_get_bool_option("POCL_BUILDING", 0))
222     {
223       deviceMainSrc = TCEString(SRCDIR) + "/lib/CL/devices/tce/" + mainC;
224       poclIncludePathSwitch = " -I " SRCDIR "/include";
225     }
226   else
227     {
228       deviceMainSrc = TCEString(POCL_INSTALL_PRIVATE_DATADIR) + "/" + mainC;
229       assert(access(deviceMainSrc.c_str(), R_OK) == 0);
230       poclIncludePathSwitch = " -I " POCL_INSTALL_PRIVATE_DATADIR "/include";
231     }
232 
233   TCEString extraFlags = extraParams;
234   if (isMultiCoreMachine())
235     extraFlags += " -ldthread -lsync-lu -llockunit";
236 
237   extraFlags += OFFSET_ARG(TTA_UNALLOCATED_GLOBAL_SPACE);
238 
239   std::string kernelObjSrc = "";
240   kernelObjSrc += tempDir;
241   kernelObjSrc += "/../descriptor.so.kernel_obj.c";
242 
243   if (pocl_is_option_set("POCL_TCECC_EXTRA_FLAGS"))
244     extraFlags += " " +
245       TCEString(pocl_get_string_option("POCL_TCECC_EXTRA_FLAGS", ""));
246   if (parent->endian_little) {
247     extraFlags += " --little-endian";
248   }
249 
250   std::string kernelMdSymbolName = "_";
251   kernelMdSymbolName += run_cmd->kernel->name;
252   kernelMdSymbolName += "_md";
253 
254   TCEString programBcFile = tempDir + "/program.bc";
255   /* Compile in steps to save the program.bc for automated exploration
256      use case when producing the kernel capture scripts. */
257   TCEString cmdLine;
258   cmdLine << "tcecc -llwpr " + poclIncludePathSwitch + " " + deviceMainSrc + " " +
259     " " + kernelObjSrc + " " + inputSrc +
260     " -k " + kernelMdSymbolName +
261     " -g -O3 --emit-llvm -o " + programBcFile + " " + extraFlags + ";";
262 
263   cmdLine << "tcecc $* -a " << machine_file << " " << programBcFile
264           << " -O3 -o " << outputTpef << + " " + extraFlags + "\n";
265   return cmdLine;
266 }
267 
isNewKernel(const _cl_command_run * runCmd)268 bool TCEDevice::isNewKernel(const _cl_command_run *runCmd) {
269   if (curKernel == NULL || runCmd->kernel != curKernel)
270     return true;
271 
272   bool newKernel = true;
273   if (runCmd->pc.local_size[0] != curLocalX ||
274       runCmd->pc.local_size[1] != curLocalY ||
275       runCmd->pc.local_size[2] != curLocalZ)
276     newKernel = true;
277   else
278     newKernel = false;
279   return newKernel;
280 }
281 
updateCurrentKernel(const _cl_command_run * runCmd,uint32_t kernelAddr)282 void TCEDevice::updateCurrentKernel(const _cl_command_run *runCmd,
283                                     uint32_t kernelAddr) {
284   curKernelAddr = kernelAddr;
285   curKernel = runCmd->kernel;
286   curLocalX = runCmd->pc.local_size[0];
287   curLocalY = runCmd->pc.local_size[1];
288   curLocalZ = runCmd->pc.local_size[2];
289 }
290 
291 cl_int
pocl_tce_alloc_mem_obj(cl_device_id device,cl_mem mem,void * host_ptr)292 pocl_tce_alloc_mem_obj (cl_device_id device, cl_mem mem, void* host_ptr)
293 {
294   TCEDevice *d = (TCEDevice*)device->data;
295   pocl_mem_identifier *p = &mem->device_ptrs[device->dev_id];
296   assert (p->mem_ptr == NULL);
297   chunk_info_t *chunk = NULL;
298   int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
299 
300   /* TCE driver doesn't preallocate */
301   if ((mem->flags & CL_MEM_ALLOC_HOST_PTR) && (mem->mem_host_ptr == NULL))
302     goto ERROR;
303 
304   chunk = alloc_buffer_from_region(&d->global_mem, mem->size);
305   if (chunk == NULL)
306     goto ERROR;
307 
308   POCL_MSG_PRINT_MEMORY ("TCE: alloc 0x%zu bytes from 0x%zu\n",
309                           mem->size, chunk->start_address);
310 
311   p->mem_ptr = chunk;
312   p->version = 0;
313   err = CL_SUCCESS;
314 
315 ERROR:
316   return err;
317 }
318 
319 void
pocl_tce_free(cl_device_id device,cl_mem mem)320 pocl_tce_free (cl_device_id device, cl_mem mem) {
321 
322   TCEDevice *d = (TCEDevice*)device->data;
323   pocl_mem_identifier *p = &mem->device_ptrs[device->dev_id];
324   assert (p->mem_ptr != NULL);
325 
326   chunk_info_t *chunk =
327       (chunk_info_t *)p->mem_ptr;
328 
329   POCL_MSG_PRINT_MEMORY ("TCE: freed 0x%zu bytes from 0x%zu\n",
330                           mem->size, chunk->start_address);
331 
332   free_chunk (chunk);
333 
334   p->mem_ptr = NULL;
335   p->version = 0;
336 }
337 
338 
339 void
pocl_tce_write(void * data,const void * __restrict__ src_host_ptr,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,size_t offset,size_t size)340 pocl_tce_write (void *data,
341                 const void *__restrict__  src_host_ptr,
342                 pocl_mem_identifier * dst_mem_id,
343                 cl_mem dst_buf,
344                 size_t offset, size_t size)
345 {
346   void *__restrict__ device_ptr = dst_mem_id->mem_ptr;
347   TCEDevice *d = (TCEDevice*)data;
348   chunk_info_t *chunk = (chunk_info_t*)device_ptr;
349 #ifdef DEBUG_TTA_DRIVER
350   printf ("host: write %p <- %lx / %zu\n", src_host_ptr, chunk->start_address + offset,
351           size);
352 #endif
353   d->copyHostToDevice(src_host_ptr, chunk->start_address + offset, size);
354 }
355 
pocl_tce_read(void * data,void * __restrict__ dst_host_ptr,pocl_mem_identifier * src_mem_id,cl_mem src_buf,size_t offset,size_t size)356 void pocl_tce_read(void *data, void *__restrict__ dst_host_ptr,
357                    pocl_mem_identifier *src_mem_id, cl_mem src_buf,
358                    size_t offset, size_t size) {
359   void *__restrict__ device_ptr = src_mem_id->mem_ptr;
360   TCEDevice* d = (TCEDevice*)data;
361   chunk_info_t *chunk = (chunk_info_t*)device_ptr;
362 #ifdef DEBUG_TTA_DRIVER
363   printf ("host: read %p -> %lx / %zu\n", dst_host_ptr,
364           chunk->start_address + offset, size);
365 #endif
366   d->copyDeviceToHost(chunk->start_address + offset, dst_host_ptr, size);
367 }
368 
pocl_tce_copy(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,pocl_mem_identifier * src_mem_id,cl_mem src_buf,size_t dst_offset,size_t src_offset,size_t size)369 void pocl_tce_copy(void *data, pocl_mem_identifier *dst_mem_id, cl_mem dst_buf,
370                    pocl_mem_identifier *src_mem_id, cl_mem src_buf,
371                    size_t dst_offset, size_t src_offset, size_t size) {
372   void *__restrict__ dst_device_ptr = dst_mem_id->mem_ptr;
373   void *__restrict__ src_device_ptr = src_mem_id->mem_ptr;
374   TCEDevice *d = (TCEDevice *)data;
375   chunk_info_t *src_chunk = (chunk_info_t *)src_device_ptr;
376   chunk_info_t *dst_chunk = (chunk_info_t *)dst_device_ptr;
377 #ifdef DEBUG_TTA_DRIVER
378   printf("device: copy %x %x %zu\n", src_chunk, dst_chunk, size);
379 #endif
380   d->copyDeviceToDevice(src_chunk->start_address + src_offset,
381                         dst_chunk->start_address + dst_offset, size);
382 }
383 
384 chunk_info_t*
pocl_tce_malloc_local(void * device_data,size_t size)385 pocl_tce_malloc_local (void *device_data, size_t size)
386 {
387   TCEDevice *d = (TCEDevice*)device_data;
388   return alloc_buffer_from_region(&d->local_mem, size);
389 }
390 
391 
pocl_tce_write_kernel_descriptor(cl_device_id device,unsigned device_i,cl_kernel kernel)392 static void pocl_tce_write_kernel_descriptor(cl_device_id device,
393                                              unsigned device_i,
394                                              cl_kernel kernel) {
395   // Generate the kernel_obj.c file. This should be optional
396   // and generated only for the heterogeneous standalone devices which
397   // need the definitions to accompany the kernels, for the launcher
398   // code.
399   // TODO: the scripts use a generated kernel.h header file that
400   // gets added to this file. No checks seem to fail if that file
401   // is missing though, so it is left out from there for now
402 
403   std::stringstream content;
404   pocl_kernel_metadata_t *meta = kernel->meta;
405 
406   content << std::endl
407           << "#include <pocl_device.h>" << std::endl
408           << "void _pocl_kernel_" << meta->name
409           << "_workgroup(uint8_t* args, uint8_t*, "
410           << "uint32_t, uint32_t, uint32_t);" << std::endl
411           << "void _pocl_kernel_" << meta->name
412           << "_workgroup_fast(uint8_t* args, uint8_t*, "
413           << "uint32_t, uint32_t, uint32_t);" << std::endl;
414 
415   if (device->global_as_id != 0)
416     content << "__attribute__((address_space(" << device->global_as_id << ")))"
417             << std::endl;
418 
419   content << "__kernel_metadata _" << meta->name << "_md = {" << std::endl
420           << "     \"" << meta->name << "\"," << std::endl
421           << "     " << meta->num_args << "," << std::endl
422           << "     " << meta->num_locals << "," << std::endl
423           << "     _pocl_kernel_" << meta->name << "_workgroup_fast"
424           << std::endl
425           << " };" << std::endl;
426 
427   pocl_cache_write_descriptor(kernel->program, device_i, meta->name,
428                               content.str().c_str(), content.str().size());
429 }
430 
pocl_tce_compile_kernel(_cl_command_node * Command,cl_kernel Kernel,cl_device_id Device,int Specialize)431 void pocl_tce_compile_kernel(_cl_command_node *Command, cl_kernel Kernel,
432                              cl_device_id Device, int Specialize) {
433   if (Command->type != CL_COMMAND_NDRANGE_KERNEL)
434     return;
435   _cl_command_run *RunCommand = &Command->command.run;
436 
437   void *Data = Command->device->data;
438   TCEDevice *Dev = (TCEDevice *)Data;
439 
440   if (!Kernel)
441     Kernel = Command->command.run.kernel;
442   if (!Device)
443     Device = Command->device;
444 
445   POCL_LOCK(Dev->tce_compile_lock);
446   int Error = pocl_llvm_generate_workgroup_function(
447       Command->device_i, Device, Kernel, Command, Specialize);
448 
449   if (Error) {
450     POCL_UNLOCK(Dev->tce_compile_lock);
451     POCL_MSG_PRINT_GENERAL("TCE: pocl_llvm_generate_workgroup_function()"
452                            " failed for kernel %s\n",
453                            Kernel->name);
454     assert(Error == 0);
455   }
456 
457   // 12 == strlen (POCL_PARALLEL_BC_FILENAME)
458   char ByteCode[POCL_FILENAME_LENGTH + 13];
459 
460   assert(Dev != NULL);
461   assert(Command->command.run.kernel);
462 
463   char CacheDir[POCL_FILENAME_LENGTH];
464   pocl_cache_kernel_cachedir_path(CacheDir, Kernel->program, Command->device_i,
465                                   Kernel, "", Command, 1);
466   RunCommand->device_data = strdup(CacheDir);
467 
468   if (Dev->isNewKernel(RunCommand)) {
469 
470     pocl_tce_write_kernel_descriptor(Device, Command->device_i, Kernel);
471 
472     std::string AssemblyFileName(CacheDir);
473     TCEString TempDir(CacheDir);
474     AssemblyFileName += "/parallel.tpef";
475 
476     if (access(AssemblyFileName.c_str(), F_OK) != 0) {
477       Error = snprintf(ByteCode, POCL_FILENAME_LENGTH + 13, "%s%s", CacheDir,
478                        POCL_PARALLEL_BC_FILENAME);
479       TCEString BuildCmd = Dev->tceccCommandLine(RunCommand, TempDir, ByteCode,
480                                                  AssemblyFileName);
481 
482 #ifdef DEBUG_TTA_DRIVER
483       std::cerr << "CMD: " << BuildCmd << std::endl;
484 #endif
485       Error = system(BuildCmd.c_str());
486       if (Error != 0)
487         POCL_ABORT("Error while running tcecc.\n");
488     }
489   }
490 
491   POCL_UNLOCK(Dev->tce_compile_lock);
492 }
493 
/**
 * Runs an NDRange kernel command on the TCE device and waits for it to
 * finish.
 *
 * Steps: (1) load the kernel's parallel.tpef to the device unless the same
 * kernel is already recorded as the current one, (2) build a
 * __kernel_exec_cmd with the argument addresses and the launch geometry,
 * (3) wait for the device command queue entry to be free, write the
 * command and flip its status to POCL_KST_READY, (4) poll until the status
 * is POCL_KST_FINISHED, (5) free the temporary chunks and the device_data
 * string of the command.
 *
 * @param data The TCEDevice to run on.
 * @param cmd  The command node; must be of type CL_COMMAND_NDRANGE_KERNEL
 *             with the kernel and device_data set.
 */
void
pocl_tce_run(void *data, _cl_command_node* cmd)
{
  assert(cmd->type == CL_COMMAND_NDRANGE_KERNEL);

  TCEDevice *d = (TCEDevice*)data;
  uint32_t kernelAddr;
  unsigned i;

  assert(d != NULL);
  assert(cmd->command.run.kernel);
  assert(cmd->command.run.device_data);

  if (d->isNewKernel(&(cmd->command.run))) {
    /* device_data holds the kernel cache directory path as a C string. */
    std::string assemblyFileName((const char*)cmd->command.run.device_data);
    assemblyFileName += "/parallel.tpef";

    /* Name of the kernel metadata object in the device binary. */
    std::string kernelMdSymbolName = "_";
    kernelMdSymbolName += cmd->command.run.kernel->name;
    kernelMdSymbolName += "_md";

    try {
      d->loadProgramToDevice(assemblyFileName);
      d->restartProgram();
    } catch (Exception &e) {
      std::cerr << "error: " << e.errorMessage() << std::endl;
      POCL_ABORT("error: Failed to load program to the TTA.\n");
    }

    const TTAProgram::Program* prog = d->currentProgram;
    assert (prog != NULL);

    const TTAProgram::GlobalScope& globalScope = prog->globalScopeConst();

    /* Look up the device address of the kernel metadata by its label. */
    try {
      kernelAddr = globalScope.dataLabel(kernelMdSymbolName).address().location();
    } catch (const KeyNotFound& e) {
      POCL_ABORT("Could not find the shared data structures from the device binary.\n");
    }
    // cache the currently device loaded kernel info
    d->updateCurrentKernel(&(cmd->command.run), kernelAddr);
  } else {
    // Same kernel as the one already recorded: no need to load the program
    // again, only restart it and reuse the recorded metadata address.
    d->restartProgram();
    kernelAddr = d->curKernelAddr;
  }
  /* The command is assembled on the host side with each 32-bit field
     passed through byteswap_uint32_t() and then copied to the device as a
     whole. */
  __kernel_exec_cmd dev_cmd;
  dev_cmd.kernel = byteswap_uint32_t (kernelAddr, d->needsByteSwap);

  struct pocl_argument *al;

  typedef std::vector<chunk_info_t*> ChunkVector;
  /* Chunks to be freed after the kernel finishes. */
  ChunkVector tempChunks;

  cl_kernel kernel = cmd->command.run.kernel;
  pocl_kernel_metadata_t *meta = kernel->meta;

  /* NOTE(review): nothing visible here checks num_args + num_locals against
     the capacity of dev_cmd.args -- confirm it is enforced elsewhere. */
  for (i = 0; i < meta->num_args; ++i)
    {
      al = &(cmd->command.run.arguments[i]);
      if (ARG_IS_LOCAL (meta->arg_info[i]))
        {
          /* Local arguments get a temporary buffer from the local memory. */
          chunk_info_t* local_chunk = pocl_tce_malloc_local (d, al->size);
          if (local_chunk == NULL)
            POCL_ABORT ("Could not allocate memory for a local argument. Out of local mem?\n");

          dev_cmd.args[i] = byteswap_uint32_t (local_chunk->start_address, d->needsByteSwap);
#ifdef DEBUG_TTA_DRIVER
          printf ("host: allocated %zu bytes of local memory for arg %u @ %lu\n",
                  al->size, i, local_chunk->start_address);
#endif
          tempChunks.push_back(local_chunk);
        }
      else if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
        {
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            dev_cmd.args[i] = 0;
          else {
            assert(al->is_svm == 0);
            cl_mem m = (*(cl_mem *)(al->value));
            chunk_info_t *p =
                (chunk_info_t *)m->device_ptrs[d->parent->dev_id].mem_ptr;
            dev_cmd.args[i] = byteswap_uint32_t(p->start_address + al->offset,
                                                d->needsByteSwap);
          }
        }
      else /* The scalar values should be byteswapped by the user. */
        {
          /* Copy the scalar argument data to the shared memory. */
          chunk_info_t* arg_space =
              alloc_buffer (&d->global_mem, al->size);
          if (arg_space == NULL)
            POCL_ABORT ("Could not allocate memory from the device argument space. Out of global mem?\n");
          d->copyHostToDevice (al->value, arg_space->start_address, al->size );
#ifdef DEBUG_TTA_DRIVER
          printf ("host: copied value from %p to global argument memory\n", al->value);
#endif
          /* The kernel receives the address of the copied value. */
          dev_cmd.args[i] = byteswap_uint32_t (arg_space->start_address, d->needsByteSwap);
          tempChunks.push_back(arg_space);
        }
    }

  /* Allocate the automatic local buffers. Their addresses are placed after
     the explicit kernel arguments. */
  for (i = 0; i < meta->num_locals; ++i)
    {
      size_t s = meta->local_sizes[i];
      chunk_info_t* local_chunk = pocl_tce_malloc_local (d, s);
      if (local_chunk == NULL)
        POCL_ABORT ("Could not allocate memory for an automatic local argument. Out of local mem?\n");

      dev_cmd.args[meta->num_args + i] = byteswap_uint32_t (local_chunk->start_address, d->needsByteSwap);
#ifdef DEBUG_TTA_DRIVER
      printf ("host: allocated %zu bytes of local memory for automated local arg %u @ %lu\n",
              s, (meta->num_args + i), local_chunk->start_address);
#endif
      tempChunks.push_back(local_chunk);
    }

  /* Launch geometry. */
  dev_cmd.work_dim = byteswap_uint32_t (cmd->command.run.pc.work_dim, d->needsByteSwap);
  dev_cmd.num_groups[0] = byteswap_uint32_t (cmd->command.run.pc.num_groups[0], d->needsByteSwap);
  dev_cmd.num_groups[1] = byteswap_uint32_t (cmd->command.run.pc.num_groups[1], d->needsByteSwap);
  dev_cmd.num_groups[2] = byteswap_uint32_t (cmd->command.run.pc.num_groups[2], d->needsByteSwap);

  dev_cmd.global_offset[0] = byteswap_uint32_t (cmd->command.run.pc.global_offset[0], d->needsByteSwap);
  dev_cmd.global_offset[1] = byteswap_uint32_t (cmd->command.run.pc.global_offset[1], d->needsByteSwap);
  dev_cmd.global_offset[2] = byteswap_uint32_t (cmd->command.run.pc.global_offset[2], d->needsByteSwap);

  /* The status stays FREE while the whole command is copied; it is flipped
     to READY with a separate write below. */
  dev_cmd.status = byteswap_uint32_t (POCL_KST_FREE, d->needsByteSwap);

#ifdef DEBUG_TTA_DRIVER
  printf("host: waiting for the device command queue (@ %x) to get room.\n",
         d->commandQueueAddr);
  printf("host: command queue status: %d\n",
         d->readWordFromDevice (d->commandQueueAddr));
#endif
  /* Wait until the device command queue has room. This is a busy loop
     without any sleep between the reads. */
  do {}
  while (d->readWordFromDevice (d->commandQueueAddr) != POCL_KST_FREE);

#ifdef DEBUG_TTA_DRIVER
  printf( "host: writing the command.\n");
#endif
  d->copyHostToDevice (&dev_cmd, d->commandQueueAddr, sizeof(__kernel_exec_cmd) );

  /* Ensure the READY status is written the last so the device doesn't
     start executing before all the cmd data has been written. We
     need a flush or similar mechanism to ensure all the data has
     been really written, in case the data transfers are not guaranteed
     to be ordered. */
  /* NOTE(review): the status word is written at commandQueueAddr itself,
     which presumably relies on status being the first field of
     __kernel_exec_cmd -- confirm against its declaration. */
  d->writeWordToDevice(d->commandQueueAddr, POCL_KST_READY);

  /* Keep the host side copy in sync before handing it to the device class. */
  dev_cmd.status = byteswap_uint32_t (POCL_KST_READY, d->needsByteSwap);

  d->notifyKernelRunCommandSent(dev_cmd, &cmd->command.run);

#ifdef DEBUG_TTA_DRIVER
  printf("host: commmand queue status: %x\n",
         d->readWordFromDevice(d->commandQueueAddr));

  printf("host: waiting for the command to get executed.\n");
#endif
  /* Wait until the command has executed, polling the status word every
     20 ms. */
  unsigned long ticks = 0;
  do {
#ifdef DEBUG_TTA_DRIVER
      /* NOTE(review): "ticks & 50" is a bit mask, not a period; it looks
         like "ticks % 50" was intended -- confirm. */
      if ((ticks & 50) == 0)
        printf("host: commmand queue status: %x\n",
             d->readWordFromDevice(d->commandQueueAddr));
#endif
      usleep(20000);
      ++ticks;
  } while (d->readWordFromDevice(d->commandQueueAddr) != POCL_KST_FINISHED);

#ifdef DEBUG_TTA_DRIVER
  printf( "host: done. Freeing the command queue entry.\n");
#endif
  /* We are done with this kernel, free the command queue entry. */
  d->writeWordToDevice(d->commandQueueAddr, POCL_KST_FREE);

  /* Release the temporary local buffers and scalar argument copies. */
  for (ChunkVector::iterator i = tempChunks.begin();
       i != tempChunks.end(); ++i)
    free_chunk (*i);

  /* Release the cache directory string stored for this command. */
  POCL_MEM_FREE(cmd->command.run.device_data);

#ifdef DEBUG_TTA_DRIVER
  printf("host: local memory allocations:\n");
  print_chunks (d->local_mem.chunks);

  printf("host: global memory allocations:\n");
  print_chunks (d->global_mem.chunks);
#endif
}
692 
693 cl_int
pocl_tce_map_mem(void * data,pocl_mem_identifier * src_mem_id,cl_mem src_buf,mem_mapping_t * map)694 pocl_tce_map_mem (void *data,
695                   pocl_mem_identifier * src_mem_id,
696                   cl_mem src_buf,
697                   mem_mapping_t *map)
698 {
699   /* Synch the device global region to the host memory. */
700   pocl_tce_read(data, map->host_ptr, src_mem_id, src_buf, map->offset,
701                 map->size);
702 
703   return CL_SUCCESS;
704 }
705 
706 cl_int
pocl_tce_unmap_mem(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,mem_mapping_t * map)707 pocl_tce_unmap_mem (void *data,
708                     pocl_mem_identifier *dst_mem_id,
709                     cl_mem dst_buf,
710                     mem_mapping_t *map)
711 {
712   if (map->map_flags != CL_MAP_READ) {
713     /* Synch the device global region to the host memory. */
714     pocl_tce_write (data, map->host_ptr, dst_mem_id, dst_buf, map->offset, map->size);
715   }
716 
717   return CL_SUCCESS;
718 }
719 
720 
721 char*
pocl_tce_init_build(void * data)722 pocl_tce_init_build(void *data)
723 {
724   TCEDevice *tce_dev = (TCEDevice*)data;
725   TCEString mach_tmpdir =
726       Environment::llvmtceCachePath();
727 
728   TCEString mach_header_base =
729       mach_tmpdir + "/" + tce_dev->machine_->hash();
730 
731   int error = 0;
732 
733   std::string devextHeaderFn =
734     std::string(mach_header_base) + std::string("_opencl_devext.h");
735 
736   /* Generate the vendor extensions header to provide explicit
737      access to the (custom) hardware operations. */
738   std::string tceopgenCmd =
739       std::string("tceopgen > ") + devextHeaderFn;
740 
741   error = system (tceopgenCmd.c_str());
742   if (error == -1) return NULL;
743 
744   std::string extgenCmd =
745     std::string("tceoclextgen ") + tce_dev->machine_file +
746       std::string(" >> ") + devextHeaderFn;
747 
748   error = system (extgenCmd.c_str());
749   if (error == -1) return NULL;
750 
751   // gnu-keywords needed to support the inline asm blocks
752   // -fasm doesn't work in the frontend
753 
754   std::string includeSwitch =
755     std::string("-fgnu-keywords -Dasm=__asm__ -include ") + devextHeaderFn;
756 
757   char *include_switch = strdup(includeSwitch.c_str());
758 
759   return include_switch;
760 }
761 
762 char *
pocl_tce_build_hash(cl_device_id device)763 pocl_tce_build_hash (cl_device_id device)
764 {
765   TCEDevice *tce_dev = (TCEDevice*)device->data;
766   FILE* adf_file = fopen (tce_dev->machine_file.c_str(), "r");
767   size_t size;
768   uint8_t* adf_data = 0;
769   const char *extra_flags = NULL;
770 
771   fseek (adf_file, 0 , SEEK_END);
772   size = ftell (adf_file);
773   fseek (adf_file, 0, SEEK_SET);
774   adf_data = (uint8_t*)malloc (size);
775   if (fread (adf_data, 1, size, adf_file) == 0)
776       POCL_ABORT("Could not read ADF.\n");
777 
778   SHA1_CTX ctx;
779   uint8_t bin_dig[SHA1_DIGEST_SIZE];
780   pocl_SHA1_Init(&ctx);
781   pocl_SHA1_Update(&ctx, adf_data, size);
782   pocl_SHA1_Final(&ctx, bin_dig);
783 
784   char *result = (char *)calloc(1000, sizeof(char));
785   strcpy(result, device->llvm_target_triplet);
786   char *temp = result + strlen(result);
787   *temp++ = '-';
788   unsigned i;
789   for (i=0; i < SHA1_DIGEST_SIZE; i++)
790     {
791       *temp++ = (bin_dig[i] & 0x0F) + 65;
792       *temp++ = ((bin_dig[i] & 0xF0) >> 4) + 65;
793     }
794   *temp++ = '_';
795   *temp = 0;
796 
797   if (pocl_is_option_set("POCL_TCECC_EXTRA_FLAGS"))
798     {
799       extra_flags = pocl_get_string_option("POCL_TCECC_EXTRA_FLAGS", "");
800       strncpy(temp, extra_flags, (1000-(temp-result)) );
801     }
802 
803   return result;
804 }
805 
806 void
pocl_tce_copy_rect(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,pocl_mem_identifier * src_mem_id,cl_mem src_buf,const size_t * __restrict__ const dst_origin,const size_t * __restrict__ const src_origin,const size_t * __restrict__ const region,size_t const dst_row_pitch,size_t const dst_slice_pitch,size_t const src_row_pitch,size_t const src_slice_pitch)807 pocl_tce_copy_rect (void *data,
808                     pocl_mem_identifier * dst_mem_id,
809                     cl_mem dst_buf,
810                     pocl_mem_identifier * src_mem_id,
811                     cl_mem src_buf,
812                     const size_t *__restrict__ const dst_origin,
813                     const size_t *__restrict__ const src_origin,
814                     const size_t *__restrict__ const region,
815                     size_t const dst_row_pitch,
816                     size_t const dst_slice_pitch,
817                     size_t const src_row_pitch,
818                     size_t const src_slice_pitch)
819 {
820   TCEDevice *d = (TCEDevice*)data;
821   chunk_info_t *src_chunk = (chunk_info_t*)src_mem_id->mem_ptr;
822   chunk_info_t *dst_chunk = (chunk_info_t*)dst_mem_id->mem_ptr;
823 
824   size_t src_offset = src_origin[0] + src_row_pitch * src_origin[1] + src_slice_pitch * src_origin[2];
825   size_t dst_offset = dst_origin[0] + dst_row_pitch * dst_origin[1] + dst_slice_pitch * dst_origin[2];
826 
827   size_t j, k;
828 
829   /* TODO: handle overlaping regions */
830 
831   for (k = 0; k < region[2]; ++k)
832     for (j = 0; j < region[1]; ++j)
833       d->copyDeviceToDevice(src_chunk->start_address + src_offset + src_row_pitch * j + src_slice_pitch * k,
834                             dst_chunk->start_address + dst_offset + dst_row_pitch * j + dst_slice_pitch * k,
835                             region[0]);
836 
837 }
838 
839 void
pocl_tce_write_rect(void * data,const void * __restrict__ src_host_ptr,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,const size_t * __restrict__ const buffer_origin,const size_t * __restrict__ const host_origin,const size_t * __restrict__ const region,size_t const buffer_row_pitch,size_t const buffer_slice_pitch,size_t const host_row_pitch,size_t const host_slice_pitch)840 pocl_tce_write_rect (void *data,
841                      const void *__restrict__ src_host_ptr,
842                      pocl_mem_identifier * dst_mem_id,
843                      cl_mem dst_buf,
844                      const size_t *__restrict__ const buffer_origin,
845                      const size_t *__restrict__ const host_origin,
846                      const size_t *__restrict__ const region,
847                      size_t const buffer_row_pitch,
848                      size_t const buffer_slice_pitch,
849                      size_t const host_row_pitch,
850                      size_t const host_slice_pitch)
851 {
852   TCEDevice *d = (TCEDevice *)data;
853   chunk_info_t *dst_chunk = (chunk_info_t *)dst_mem_id->mem_ptr;
854   size_t adjusted_dst_ptr = dst_chunk->start_address + buffer_origin[0] +
855                             buffer_row_pitch * buffer_origin[1] +
856                             buffer_slice_pitch * buffer_origin[2];
857 
858   char const *__restrict__ const adjusted_host_ptr =
859       (char const *)src_host_ptr + host_origin[0] +
860       host_row_pitch * host_origin[1] + host_slice_pitch * host_origin[2];
861 
862   size_t j, k;
863 
864   /* TODO: handle overlaping regions */
865 
866   for (k = 0; k < region[2]; ++k)
867     for (j = 0; j < region[1]; ++j)
868       {
869       size_t s_offset = host_row_pitch * j + host_slice_pitch * k;
870 
871       size_t d_offset = buffer_row_pitch * j + buffer_slice_pitch * k;
872 
873       d->copyHostToDevice(adjusted_host_ptr + s_offset,
874                           adjusted_dst_ptr + d_offset, region[0]);
875       }
876 }
877 
878 void
pocl_tce_read_rect(void * data,void * __restrict__ dst_host_ptr,pocl_mem_identifier * src_mem_id,cl_mem src_buf,const size_t * __restrict__ const buffer_origin,const size_t * __restrict__ const host_origin,const size_t * __restrict__ const region,size_t const buffer_row_pitch,size_t const buffer_slice_pitch,size_t const host_row_pitch,size_t const host_slice_pitch)879 pocl_tce_read_rect (void *data,
880                     void *__restrict__ dst_host_ptr,
881                     pocl_mem_identifier * src_mem_id,
882                     cl_mem src_buf,
883                     const size_t *__restrict__ const buffer_origin,
884                     const size_t *__restrict__ const host_origin,
885                     const size_t *__restrict__ const region,
886                     size_t const buffer_row_pitch,
887                     size_t const buffer_slice_pitch,
888                     size_t const host_row_pitch,
889                     size_t const host_slice_pitch)
890 {
891   TCEDevice *d = (TCEDevice *)data;
892   chunk_info_t *src_chunk = (chunk_info_t *)src_mem_id->mem_ptr;
893   size_t adjusted_src_ptr = src_chunk->start_address + buffer_origin[0] +
894                             buffer_row_pitch * buffer_origin[1] +
895                             buffer_slice_pitch * buffer_origin[2];
896 
897   char const *__restrict__ const adjusted_host_ptr =
898       (char const *)dst_host_ptr + host_origin[0] +
899       host_row_pitch * host_origin[1] + host_slice_pitch * host_origin[2];
900 
901   size_t j, k;
902 
903   /* TODO: handle overlaping regions */
904 
905   for (k = 0; k < region[2]; ++k)
906     for (j = 0; j < region[1]; ++j)
907       {
908       size_t d_offset = host_row_pitch * j + host_slice_pitch * k;
909       size_t s_offset = buffer_row_pitch * j + buffer_slice_pitch * k;
910 
911       d->copyDeviceToHost(adjusted_src_ptr + s_offset,
912                           adjusted_host_ptr + d_offset, region[0]);
913       }
914 }
915 
tce_command_scheduler(TCEDevice * d)916 static void tce_command_scheduler (TCEDevice *d)
917 {
918   _cl_command_node *node;
919 
920   /* execute commands from ready list */
921   while ((node = d->ready_list))
922     {
923       assert (pocl_command_is_ready(node->event));
924       CDL_DELETE (d->ready_list, node);
925       POCL_UNLOCK(d->cq_lock);
926       assert (node->event->status == CL_SUBMITTED);
927       if (node->type == CL_COMMAND_NDRANGE_KERNEL)
928         pocl_tce_compile_kernel(node, NULL, NULL, 1);
929       pocl_exec_command(node);
930       POCL_LOCK(d->cq_lock);
931     }
932 
933   return;
934 }
935 
936 void
pocl_tce_submit(_cl_command_node * node,cl_command_queue)937 pocl_tce_submit (_cl_command_node *node, cl_command_queue /*cq*/)
938 {
939   TCEDevice *d = (TCEDevice*)node->device->data;
940 
941   node->ready = 1;
942   POCL_LOCK(d->cq_lock);
943   pocl_command_push(node, &d->ready_list, &d->command_list);
944   POCL_UNLOCK_OBJ(node->event);
945 
946   tce_command_scheduler (d);
947   POCL_UNLOCK(d->cq_lock);
948 
949   return;
950 }
951 
pocl_tce_flush(cl_device_id device,cl_command_queue)952 void pocl_tce_flush (cl_device_id device, cl_command_queue /*cq*/)
953 {
954   TCEDevice *d = (TCEDevice*)device->data;
955 
956   POCL_LOCK (d->cq_lock);
957   tce_command_scheduler (d);
958   POCL_UNLOCK (d->cq_lock);
959 }
960 
961 
962 void
pocl_tce_join(cl_device_id device,cl_command_queue)963 pocl_tce_join(cl_device_id device, cl_command_queue /*cq*/)
964 {
965   TCEDevice *d = (TCEDevice*)device->data;
966 
967   POCL_LOCK (d->cq_lock);
968   tce_command_scheduler (d);
969   POCL_UNLOCK (d->cq_lock);
970 
971   return;
972 }
973 
974 void
pocl_tce_notify(cl_device_id device,cl_event event,cl_event finished)975 pocl_tce_notify (cl_device_id device, cl_event event, cl_event finished)
976 {
977   TCEDevice *d = (TCEDevice*)device->data;
978   _cl_command_node *node = event->command;
979 
980   if (finished->status < CL_COMPLETE) {
981     pocl_update_event_failed(event);
982     return;
983   }
984 
985   if (!node->ready)
986     return;
987 
988   if (pocl_command_is_ready(event)) {
989     if (event->status == CL_QUEUED) {
990       pocl_update_event_submitted(event);
991       POCL_LOCK(d->cq_lock);
992       CDL_DELETE(d->command_list, node);
993       CDL_PREPEND(d->ready_list, node);
994       POCL_UNLOCK(d->cq_lock);
995     }
996   }
997 }
998