1 /* tce_common.cc - common functionality over the different TCE/TTA device
2 drivers.
3
4 Copyright (c) 2012-2019 Pekka Jääskeläinen / Tampere University of Technology
5
6 Permission is hereby granted, free of charge, to any person obtaining a copy
7 of this software and associated documentation files (the "Software"), to deal
8 in the Software without restriction, including without limitation the rights
9 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 copies of the Software, and to permit persons to whom the Software is
11 furnished to do so, subject to the following conditions:
12
13 The above copyright notice and this permission notice shall be included in
14 all copies or substantial portions of the Software.
15
16 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 THE SOFTWARE.
23 */
24 #include "config.h"
25
26 #include "tce_common.h"
27 #include "pocl_util.h"
28 #include "pocl_cache.h"
29 #include "pocl_llvm.h"
30 #include "pocl_device.h"
31 #include "utlist.h"
32 #include "common.h"
33
34 #include "pocl_runtime_config.h"
35 #include "pocl_hash.h"
36 #include "pocl_cache.h"
37
38 #ifndef _MSC_VER
39 # include <unistd.h>
40 #else
41 # include "vccompat.hpp"
42 #endif
43
/* Suppress some warnings caused by including tce_config.h after pocl's config.h. */
45 #undef PACKAGE
46 #undef PACKAGE_BUGREPORT
47 #undef PACKAGE_NAME
48 #undef PACKAGE_STRING
49 #undef PACKAGE_TARNAME
50 #undef PACKAGE_VERSION
51 #undef VERSION
52 #undef SIZEOF_DOUBLE
53
54 #include <Machine.hh>
55 #include <Program.hh>
56 #include <DataLabel.hh>
57 #include <AddressSpace.hh>
58 #include <GlobalScope.hh>
59 #include <Environment.hh>
60
61 using namespace TTAMachine;
62
63 #include <algorithm>
64 #include <sstream>
65
66 #define ALIGNMENT MAX_EXTENDED_ALIGNMENT
67
68 //#define DEBUG_TTA_DRIVER
69
TCEDevice(cl_device_id dev,const char * adfName)70 TCEDevice::TCEDevice(cl_device_id dev, const char* adfName) :
71 local_as(NULL), global_as(NULL), private_as(NULL), machine_file(adfName), parent(dev),
72 currentProgram(NULL), curKernelAddr(0), curKernel(NULL), globalCycleCount(0),
73 ready_list(NULL), command_list(NULL) {
74 parent->data = this;
75 pthread_mutex_init (&cq_lock, NULL);
76 POCL_INIT_LOCK(tce_compile_lock);
77 dev->address_bits = 32;
78 dev->autolocals_to_args = POCL_AUTOLOCALS_TO_ARGS_ALWAYS;
79 /* This assumes TCE is always Little-endian;
80 * needsByteSwap is set up again in TTASimDevice
81 * after we know whether ADF is big- or little-endian. */
82 #if defined(WORDS_BIGENDIAN) && WORDS_BIGENDIAN == 1
83 needsByteSwap = true;
84 #else
85 needsByteSwap = false;
86 #endif
87 }
88
~TCEDevice()89 TCEDevice::~TCEDevice() {
90 parent->data = NULL;
91 }
92
93 bool
isMultiCoreMachine() const94 TCEDevice::isMultiCoreMachine() const {
95 #ifdef TCEMC_AVAILABLE
96 assert (machine_ != NULL);
97 return machine_->coreCount() > 1;
98 #else
99 return false;
100 #endif
101 }
102
103 /**
104 * This should be called by the derived classes at the point the
105 * TTA machine description is loaded. It loads additional device
106 * properties from the parsed ADF.
107 */
108 void
setMachine(const TTAMachine::Machine & machine)109 TCEDevice::setMachine(const TTAMachine::Machine& machine) {
110 machine_ = &machine;
111 }
112
113 void
writeWordToDevice(uint32_t dest_addr,uint32_t word)114 TCEDevice::writeWordToDevice(uint32_t dest_addr, uint32_t word) {
115 uint32_t swapped = byteswap_uint32_t(word, needsByteSwap);
116 copyHostToDevice(&swapped, dest_addr, sizeof (swapped));
117 }
118
119 uint32_t
readWordFromDevice(uint32_t addr)120 TCEDevice::readWordFromDevice(uint32_t addr) {
121 uint32_t result;
122 copyDeviceToHost(addr, &result, sizeof(result));
123 return byteswap_uint32_t(result, needsByteSwap);
124 }
125
126 void
findDataMemoryAddresses()127 TCEDevice::findDataMemoryAddresses() {
128 /* Figure out the locations of the shared data structures in
129 the device memories from the fully-linked program. */
130 const TTAProgram::Program* prog = currentProgram;
131 assert (prog != NULL);
132 commandQueueAddr = global_as->start() + TTA_UNALLOCATED_GLOBAL_SPACE;
133 }
134
135 void
initDataMemory()136 TCEDevice::initDataMemory() {
137 findDataMemoryAddresses();
138 writeWordToDevice(commandQueueAddr, POCL_KST_FREE);
139 }
140
141 void
initMemoryManagement(const TTAMachine::Machine & mach)142 TCEDevice::initMemoryManagement(const TTAMachine::Machine& mach) {
143 /* Create the memory allocation book keeping structures based on
144 the machine's address spaces (see tta.txt). */
145 Machine::AddressSpaceNavigator nav = mach.addressSpaceNavigator();
146
147 for (int i = 0; i < nav.count(); ++i) {
148 AddressSpace *as = nav.item(i);
149 if (as->hasNumericalId(TTA_ASID_LOCAL)) {
150 local_as = as;
151 }
152 if (as->hasNumericalId(TTA_ASID_PRIVATE)) {
153 private_as = as;
154 }
155 if (as->hasNumericalId(TTA_ASID_GLOBAL) &&
156 as->hasNumericalId(TTA_ASID_CONSTANT)) {
157 global_as = as;
158 }
159 }
160 if (local_as == NULL)
161 POCL_ABORT("local address space not found in the ADF. "
162 "Mark it by adding numerical id 4 to the AS.\n"
163 "Local address space can be same as private AS.\n");
164
165
166 if (isMultiCoreMachine() && local_as->isShared())
167 POCL_ABORT("The local address space is marked as shared!\n");
168
169 if (private_as == NULL)
170 POCL_ABORT("private address space not found in the ADF. "
171 "Mark it by adding numerical id 0 to the AS.\n"
172 "Private address space can be same as local AS.\n");
173
174 if (isMultiCoreMachine() && private_as->isShared())
175 POCL_ABORT("The private address space is marked as shared!\n");
176
177 if (global_as == NULL)
178 POCL_ABORT("global address space not found in the ADF. "
179 "Mark it by adding numerical ids 3 and 5 to the AS.\n");
180
181 if (isMultiCoreMachine() && !global_as->isShared())
182 POCL_ABORT("The global address space is not marked as shared!\n");
183
184 int local_size = (private_as == local_as) ?
185 local_as->end() - local_as->start() - TTA_UNALLOCATED_LOCAL_SPACE:
186 local_as->end() - local_as->start();
187 if (local_size < 0)
188 POCL_ABORT("Not enough space in the local memory with the assumed unallocated space.\n");
189
190 parent->local_mem_size = local_size;
191 int global_size = global_as->end() - local_as->start() - TTA_UNALLOCATED_GLOBAL_SPACE;
192 if (global_size < 0)
193 POCL_ABORT("Not enough space in the global memory with the assumed unallocated space.\n");
194 parent->global_mem_size = global_size;
195 parent->max_mem_alloc_size = global_size;
196
197 init_mem_region
198 (&local_mem, (memory_address_t)local_as->start(), parent->local_mem_size);
199 init_mem_region
200 (&global_mem, (memory_address_t)global_as->start() + TTA_UNALLOCATED_GLOBAL_SPACE + sizeof(__kernel_exec_cmd),
201 parent->global_mem_size);
202 }
203
204 #define SUBST(x) " -DKERNEL_EXE_CMD_OFFSET=" # x
205 #define OFFSET_ARG(c) SUBST(c)
206
tceccCommandLine(_cl_command_run * run_cmd,const TCEString & tempDir,const TCEString & inputSrc,const TCEString & outputTpef,const TCEString extraParams)207 TCEString TCEDevice::tceccCommandLine(_cl_command_run *run_cmd,
208 const TCEString &tempDir,
209 const TCEString &inputSrc,
210 const TCEString &outputTpef,
211 const TCEString extraParams) {
212
213 TCEString mainC;
214 if (isMultiCoreMachine())
215 mainC = "tta_device_main_dthread.c";
216 else
217 mainC = "tta_device_main.c";
218
219 TCEString deviceMainSrc;
220 TCEString poclIncludePathSwitch;
221 if (pocl_get_bool_option("POCL_BUILDING", 0))
222 {
223 deviceMainSrc = TCEString(SRCDIR) + "/lib/CL/devices/tce/" + mainC;
224 poclIncludePathSwitch = " -I " SRCDIR "/include";
225 }
226 else
227 {
228 deviceMainSrc = TCEString(POCL_INSTALL_PRIVATE_DATADIR) + "/" + mainC;
229 assert(access(deviceMainSrc.c_str(), R_OK) == 0);
230 poclIncludePathSwitch = " -I " POCL_INSTALL_PRIVATE_DATADIR "/include";
231 }
232
233 TCEString extraFlags = extraParams;
234 if (isMultiCoreMachine())
235 extraFlags += " -ldthread -lsync-lu -llockunit";
236
237 extraFlags += OFFSET_ARG(TTA_UNALLOCATED_GLOBAL_SPACE);
238
239 std::string kernelObjSrc = "";
240 kernelObjSrc += tempDir;
241 kernelObjSrc += "/../descriptor.so.kernel_obj.c";
242
243 if (pocl_is_option_set("POCL_TCECC_EXTRA_FLAGS"))
244 extraFlags += " " +
245 TCEString(pocl_get_string_option("POCL_TCECC_EXTRA_FLAGS", ""));
246 if (parent->endian_little) {
247 extraFlags += " --little-endian";
248 }
249
250 std::string kernelMdSymbolName = "_";
251 kernelMdSymbolName += run_cmd->kernel->name;
252 kernelMdSymbolName += "_md";
253
254 TCEString programBcFile = tempDir + "/program.bc";
255 /* Compile in steps to save the program.bc for automated exploration
256 use case when producing the kernel capture scripts. */
257 TCEString cmdLine;
258 cmdLine << "tcecc -llwpr " + poclIncludePathSwitch + " " + deviceMainSrc + " " +
259 " " + kernelObjSrc + " " + inputSrc +
260 " -k " + kernelMdSymbolName +
261 " -g -O3 --emit-llvm -o " + programBcFile + " " + extraFlags + ";";
262
263 cmdLine << "tcecc $* -a " << machine_file << " " << programBcFile
264 << " -O3 -o " << outputTpef << + " " + extraFlags + "\n";
265 return cmdLine;
266 }
267
isNewKernel(const _cl_command_run * runCmd)268 bool TCEDevice::isNewKernel(const _cl_command_run *runCmd) {
269 if (curKernel == NULL || runCmd->kernel != curKernel)
270 return true;
271
272 bool newKernel = true;
273 if (runCmd->pc.local_size[0] != curLocalX ||
274 runCmd->pc.local_size[1] != curLocalY ||
275 runCmd->pc.local_size[2] != curLocalZ)
276 newKernel = true;
277 else
278 newKernel = false;
279 return newKernel;
280 }
281
updateCurrentKernel(const _cl_command_run * runCmd,uint32_t kernelAddr)282 void TCEDevice::updateCurrentKernel(const _cl_command_run *runCmd,
283 uint32_t kernelAddr) {
284 curKernelAddr = kernelAddr;
285 curKernel = runCmd->kernel;
286 curLocalX = runCmd->pc.local_size[0];
287 curLocalY = runCmd->pc.local_size[1];
288 curLocalZ = runCmd->pc.local_size[2];
289 }
290
291 cl_int
pocl_tce_alloc_mem_obj(cl_device_id device,cl_mem mem,void * host_ptr)292 pocl_tce_alloc_mem_obj (cl_device_id device, cl_mem mem, void* host_ptr)
293 {
294 TCEDevice *d = (TCEDevice*)device->data;
295 pocl_mem_identifier *p = &mem->device_ptrs[device->dev_id];
296 assert (p->mem_ptr == NULL);
297 chunk_info_t *chunk = NULL;
298 int err = CL_MEM_OBJECT_ALLOCATION_FAILURE;
299
300 /* TCE driver doesn't preallocate */
301 if ((mem->flags & CL_MEM_ALLOC_HOST_PTR) && (mem->mem_host_ptr == NULL))
302 goto ERROR;
303
304 chunk = alloc_buffer_from_region(&d->global_mem, mem->size);
305 if (chunk == NULL)
306 goto ERROR;
307
308 POCL_MSG_PRINT_MEMORY ("TCE: alloc 0x%zu bytes from 0x%zu\n",
309 mem->size, chunk->start_address);
310
311 p->mem_ptr = chunk;
312 p->version = 0;
313 err = CL_SUCCESS;
314
315 ERROR:
316 return err;
317 }
318
319 void
pocl_tce_free(cl_device_id device,cl_mem mem)320 pocl_tce_free (cl_device_id device, cl_mem mem) {
321
322 TCEDevice *d = (TCEDevice*)device->data;
323 pocl_mem_identifier *p = &mem->device_ptrs[device->dev_id];
324 assert (p->mem_ptr != NULL);
325
326 chunk_info_t *chunk =
327 (chunk_info_t *)p->mem_ptr;
328
329 POCL_MSG_PRINT_MEMORY ("TCE: freed 0x%zu bytes from 0x%zu\n",
330 mem->size, chunk->start_address);
331
332 free_chunk (chunk);
333
334 p->mem_ptr = NULL;
335 p->version = 0;
336 }
337
338
339 void
pocl_tce_write(void * data,const void * __restrict__ src_host_ptr,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,size_t offset,size_t size)340 pocl_tce_write (void *data,
341 const void *__restrict__ src_host_ptr,
342 pocl_mem_identifier * dst_mem_id,
343 cl_mem dst_buf,
344 size_t offset, size_t size)
345 {
346 void *__restrict__ device_ptr = dst_mem_id->mem_ptr;
347 TCEDevice *d = (TCEDevice*)data;
348 chunk_info_t *chunk = (chunk_info_t*)device_ptr;
349 #ifdef DEBUG_TTA_DRIVER
350 printf ("host: write %p <- %lx / %zu\n", src_host_ptr, chunk->start_address + offset,
351 size);
352 #endif
353 d->copyHostToDevice(src_host_ptr, chunk->start_address + offset, size);
354 }
355
pocl_tce_read(void * data,void * __restrict__ dst_host_ptr,pocl_mem_identifier * src_mem_id,cl_mem src_buf,size_t offset,size_t size)356 void pocl_tce_read(void *data, void *__restrict__ dst_host_ptr,
357 pocl_mem_identifier *src_mem_id, cl_mem src_buf,
358 size_t offset, size_t size) {
359 void *__restrict__ device_ptr = src_mem_id->mem_ptr;
360 TCEDevice* d = (TCEDevice*)data;
361 chunk_info_t *chunk = (chunk_info_t*)device_ptr;
362 #ifdef DEBUG_TTA_DRIVER
363 printf ("host: read %p -> %lx / %zu\n", dst_host_ptr,
364 chunk->start_address + offset, size);
365 #endif
366 d->copyDeviceToHost(chunk->start_address + offset, dst_host_ptr, size);
367 }
368
pocl_tce_copy(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,pocl_mem_identifier * src_mem_id,cl_mem src_buf,size_t dst_offset,size_t src_offset,size_t size)369 void pocl_tce_copy(void *data, pocl_mem_identifier *dst_mem_id, cl_mem dst_buf,
370 pocl_mem_identifier *src_mem_id, cl_mem src_buf,
371 size_t dst_offset, size_t src_offset, size_t size) {
372 void *__restrict__ dst_device_ptr = dst_mem_id->mem_ptr;
373 void *__restrict__ src_device_ptr = src_mem_id->mem_ptr;
374 TCEDevice *d = (TCEDevice *)data;
375 chunk_info_t *src_chunk = (chunk_info_t *)src_device_ptr;
376 chunk_info_t *dst_chunk = (chunk_info_t *)dst_device_ptr;
377 #ifdef DEBUG_TTA_DRIVER
378 printf("device: copy %x %x %zu\n", src_chunk, dst_chunk, size);
379 #endif
380 d->copyDeviceToDevice(src_chunk->start_address + src_offset,
381 dst_chunk->start_address + dst_offset, size);
382 }
383
384 chunk_info_t*
pocl_tce_malloc_local(void * device_data,size_t size)385 pocl_tce_malloc_local (void *device_data, size_t size)
386 {
387 TCEDevice *d = (TCEDevice*)device_data;
388 return alloc_buffer_from_region(&d->local_mem, size);
389 }
390
391
pocl_tce_write_kernel_descriptor(cl_device_id device,unsigned device_i,cl_kernel kernel)392 static void pocl_tce_write_kernel_descriptor(cl_device_id device,
393 unsigned device_i,
394 cl_kernel kernel) {
395 // Generate the kernel_obj.c file. This should be optional
396 // and generated only for the heterogeneous standalone devices which
397 // need the definitions to accompany the kernels, for the launcher
398 // code.
399 // TODO: the scripts use a generated kernel.h header file that
400 // gets added to this file. No checks seem to fail if that file
401 // is missing though, so it is left out from there for now
402
403 std::stringstream content;
404 pocl_kernel_metadata_t *meta = kernel->meta;
405
406 content << std::endl
407 << "#include <pocl_device.h>" << std::endl
408 << "void _pocl_kernel_" << meta->name
409 << "_workgroup(uint8_t* args, uint8_t*, "
410 << "uint32_t, uint32_t, uint32_t);" << std::endl
411 << "void _pocl_kernel_" << meta->name
412 << "_workgroup_fast(uint8_t* args, uint8_t*, "
413 << "uint32_t, uint32_t, uint32_t);" << std::endl;
414
415 if (device->global_as_id != 0)
416 content << "__attribute__((address_space(" << device->global_as_id << ")))"
417 << std::endl;
418
419 content << "__kernel_metadata _" << meta->name << "_md = {" << std::endl
420 << " \"" << meta->name << "\"," << std::endl
421 << " " << meta->num_args << "," << std::endl
422 << " " << meta->num_locals << "," << std::endl
423 << " _pocl_kernel_" << meta->name << "_workgroup_fast"
424 << std::endl
425 << " };" << std::endl;
426
427 pocl_cache_write_descriptor(kernel->program, device_i, meta->name,
428 content.str().c_str(), content.str().size());
429 }
430
pocl_tce_compile_kernel(_cl_command_node * Command,cl_kernel Kernel,cl_device_id Device,int Specialize)431 void pocl_tce_compile_kernel(_cl_command_node *Command, cl_kernel Kernel,
432 cl_device_id Device, int Specialize) {
433 if (Command->type != CL_COMMAND_NDRANGE_KERNEL)
434 return;
435 _cl_command_run *RunCommand = &Command->command.run;
436
437 void *Data = Command->device->data;
438 TCEDevice *Dev = (TCEDevice *)Data;
439
440 if (!Kernel)
441 Kernel = Command->command.run.kernel;
442 if (!Device)
443 Device = Command->device;
444
445 POCL_LOCK(Dev->tce_compile_lock);
446 int Error = pocl_llvm_generate_workgroup_function(
447 Command->device_i, Device, Kernel, Command, Specialize);
448
449 if (Error) {
450 POCL_UNLOCK(Dev->tce_compile_lock);
451 POCL_MSG_PRINT_GENERAL("TCE: pocl_llvm_generate_workgroup_function()"
452 " failed for kernel %s\n",
453 Kernel->name);
454 assert(Error == 0);
455 }
456
457 // 12 == strlen (POCL_PARALLEL_BC_FILENAME)
458 char ByteCode[POCL_FILENAME_LENGTH + 13];
459
460 assert(Dev != NULL);
461 assert(Command->command.run.kernel);
462
463 char CacheDir[POCL_FILENAME_LENGTH];
464 pocl_cache_kernel_cachedir_path(CacheDir, Kernel->program, Command->device_i,
465 Kernel, "", Command, 1);
466 RunCommand->device_data = strdup(CacheDir);
467
468 if (Dev->isNewKernel(RunCommand)) {
469
470 pocl_tce_write_kernel_descriptor(Device, Command->device_i, Kernel);
471
472 std::string AssemblyFileName(CacheDir);
473 TCEString TempDir(CacheDir);
474 AssemblyFileName += "/parallel.tpef";
475
476 if (access(AssemblyFileName.c_str(), F_OK) != 0) {
477 Error = snprintf(ByteCode, POCL_FILENAME_LENGTH + 13, "%s%s", CacheDir,
478 POCL_PARALLEL_BC_FILENAME);
479 TCEString BuildCmd = Dev->tceccCommandLine(RunCommand, TempDir, ByteCode,
480 AssemblyFileName);
481
482 #ifdef DEBUG_TTA_DRIVER
483 std::cerr << "CMD: " << BuildCmd << std::endl;
484 #endif
485 Error = system(BuildCmd.c_str());
486 if (Error != 0)
487 POCL_ABORT("Error while running tcecc.\n");
488 }
489 }
490
491 POCL_UNLOCK(Dev->tce_compile_lock);
492 }
493
/**
 * Executes an NDRange kernel command on the TCE device and waits for it
 * to finish.
 *
 * The steps are:
 *  1. If the kernel is not the one recorded as current on the device, the
 *     parallel.tpef from the directory stored in the command's device_data
 *     is loaded and the address of the kernel's metadata symbol is looked
 *     up; otherwise the program is only restarted.
 *  2. A __kernel_exec_cmd is filled with the kernel address, the argument
 *     values/addresses and the NDRange geometry. Each 32-bit field goes
 *     through byteswap_uint32_t() with the device's needsByteSwap flag.
 *  3. The command is written to the command queue address of the device
 *     and the word at that address is then set to POCL_KST_READY.
 *  4. The word is polled until it reads POCL_KST_FINISHED, after which it
 *     is set back to POCL_KST_FREE and the temporary chunks are freed.
 *
 * The process is aborted if the program cannot be loaded, the metadata
 * symbol is not found, or a device memory allocation fails.
 *
 * @param data The TCEDevice to run the command on.
 * @param cmd The command; must be of type CL_COMMAND_NDRANGE_KERNEL and
 *            have its kernel and device_data set.
 */
void
pocl_tce_run(void *data, _cl_command_node* cmd)
{
  assert(cmd->type == CL_COMMAND_NDRANGE_KERNEL);

  TCEDevice *d = (TCEDevice*)data;
  uint32_t kernelAddr;
  unsigned i;

  assert(d != NULL);
  assert(cmd->command.run.kernel);
  assert(cmd->command.run.device_data);

  if (d->isNewKernel(&(cmd->command.run))) {
    /* device_data holds the path of the directory with the compiled kernel. */
    std::string assemblyFileName((const char*)cmd->command.run.device_data);
    assemblyFileName += "/parallel.tpef";

    /* Name of the data label of the kernel metadata: _<kernel name>_md. */
    std::string kernelMdSymbolName = "_";
    kernelMdSymbolName += cmd->command.run.kernel->name;
    kernelMdSymbolName += "_md";

    try {
      d->loadProgramToDevice(assemblyFileName);
      d->restartProgram();
    } catch (Exception &e) {
      std::cerr << "error: " << e.errorMessage() << std::endl;
      POCL_ABORT("error: Failed to load program to the TTA.\n");
    }

    const TTAProgram::Program* prog = d->currentProgram;
    assert (prog != NULL);

    const TTAProgram::GlobalScope& globalScope = prog->globalScopeConst();

    try {
      kernelAddr = globalScope.dataLabel(kernelMdSymbolName).address().location();
    } catch (const KeyNotFound& e) {
      POCL_ABORT("Could not find the shared data structures from the device binary.\n");
    }
    // cache the info of the kernel currently loaded to the device
    d->updateCurrentKernel(&(cmd->command.run), kernelAddr);
  } else {
    // Same kernel, no need to reload the program
    d->restartProgram();
    kernelAddr = d->curKernelAddr;
  }
  __kernel_exec_cmd dev_cmd;
  dev_cmd.kernel = byteswap_uint32_t (kernelAddr, d->needsByteSwap);

  struct pocl_argument *al;

  typedef std::vector<chunk_info_t*> ChunkVector;
  /* Chunks to be freed after the kernel finishes. */
  ChunkVector tempChunks;

  cl_kernel kernel = cmd->command.run.kernel;
  pocl_kernel_metadata_t *meta = kernel->meta;

  /* Convert each kernel argument to a 32-bit word of the device command.
     NOTE(review): the indices written to dev_cmd.args are not checked
     against the capacity of the array in this function -- confirm that
     the limit is enforced elsewhere. */
  for (i = 0; i < meta->num_args; ++i)
    {
      al = &(cmd->command.run.arguments[i]);
      if (ARG_IS_LOCAL (meta->arg_info[i]))
        {
          /* Local arguments get a temporary chunk from the local memory;
             the device receives the address of the chunk. */
          chunk_info_t* local_chunk = pocl_tce_malloc_local (d, al->size);
          if (local_chunk == NULL)
            POCL_ABORT ("Could not allocate memory for a local argument. Out of local mem?\n");

          dev_cmd.args[i] = byteswap_uint32_t (local_chunk->start_address, d->needsByteSwap);
#ifdef DEBUG_TTA_DRIVER
          printf ("host: allocated %zu bytes of local memory for arg %u @ %lu\n",
                  al->size, i, local_chunk->start_address);
#endif
          tempChunks.push_back(local_chunk);
        }
      else if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
        {
          /* It's legal to pass a NULL pointer to clSetKernelArguments. In
             that case we must pass the same NULL forward to the kernel.
             Otherwise, the user must have created a buffer with per device
             pointers stored in the cl_mem. */
          if (al->value == NULL)
            dev_cmd.args[i] = 0;
          else {
            assert(al->is_svm == 0);
            cl_mem m = (*(cl_mem *)(al->value));
            chunk_info_t *p =
              (chunk_info_t *)m->device_ptrs[d->parent->dev_id].mem_ptr;
            dev_cmd.args[i] = byteswap_uint32_t(p->start_address + al->offset,
                                                d->needsByteSwap);
          }
        }
      else /* The scalar values should be byteswapped by the user. */
        {
          /* Copy the scalar argument data to the shared memory. The device
             receives the address of the copy, not the value itself. */
          chunk_info_t* arg_space =
            alloc_buffer (&d->global_mem, al->size);
          if (arg_space == NULL)
            POCL_ABORT ("Could not allocate memory from the device argument space. Out of global mem?\n");
          d->copyHostToDevice (al->value, arg_space->start_address, al->size );
#ifdef DEBUG_TTA_DRIVER
          printf ("host: copied value from %p to global argument memory\n", al->value);
#endif
          dev_cmd.args[i] = byteswap_uint32_t (arg_space->start_address, d->needsByteSwap);
          tempChunks.push_back(arg_space);
        }
    }

  /* Allocate the automatic local buffers. Their addresses are appended
     after the explicit kernel arguments. */
  for (i = 0; i < meta->num_locals; ++i)
    {
      size_t s = meta->local_sizes[i];
      chunk_info_t* local_chunk = pocl_tce_malloc_local (d, s);
      if (local_chunk == NULL)
        POCL_ABORT ("Could not allocate memory for an automatic local argument. Out of local mem?\n");

      dev_cmd.args[meta->num_args + i] = byteswap_uint32_t (local_chunk->start_address, d->needsByteSwap);
#ifdef DEBUG_TTA_DRIVER
      printf ("host: allocated %zu bytes of local memory for automated local arg %u @ %lu\n",
              s, (meta->num_args + i), local_chunk->start_address);
#endif
      tempChunks.push_back(local_chunk);
    }

  dev_cmd.work_dim = byteswap_uint32_t (cmd->command.run.pc.work_dim, d->needsByteSwap);
  dev_cmd.num_groups[0] = byteswap_uint32_t (cmd->command.run.pc.num_groups[0], d->needsByteSwap);
  dev_cmd.num_groups[1] = byteswap_uint32_t (cmd->command.run.pc.num_groups[1], d->needsByteSwap);
  dev_cmd.num_groups[2] = byteswap_uint32_t (cmd->command.run.pc.num_groups[2], d->needsByteSwap);

  dev_cmd.global_offset[0] = byteswap_uint32_t (cmd->command.run.pc.global_offset[0], d->needsByteSwap);
  dev_cmd.global_offset[1] = byteswap_uint32_t (cmd->command.run.pc.global_offset[1], d->needsByteSwap);
  dev_cmd.global_offset[2] = byteswap_uint32_t (cmd->command.run.pc.global_offset[2], d->needsByteSwap);

  /* The command is first written with the FREE status; the status is
     switched to READY only after the whole command has been written. */
  dev_cmd.status = byteswap_uint32_t (POCL_KST_FREE, d->needsByteSwap);

#ifdef DEBUG_TTA_DRIVER
  printf("host: waiting for the device command queue (@ %x) to get room.\n",
         d->commandQueueAddr);
  printf("host: command queue status: %d\n",
         d->readWordFromDevice (d->commandQueueAddr));
#endif
  /* Wait until the device command queue has room. This loop polls the
     status word without sleeping in between. */
  do {}
  while (d->readWordFromDevice (d->commandQueueAddr) != POCL_KST_FREE);

#ifdef DEBUG_TTA_DRIVER
  printf( "host: writing the command.\n");
#endif
  d->copyHostToDevice (&dev_cmd, d->commandQueueAddr, sizeof(__kernel_exec_cmd) );

  /* Ensure the READY status is written the last so the device doesn't
     start executing before all the cmd data has been written. We
     need a flush or similar mechanism to ensure all the data has
     been really written, in case the data transfers are not guaranteed
     to be ordered.
     NOTE(review): the status word is written to the address of the
     command itself, which presumes status is the first member of
     __kernel_exec_cmd -- confirm against its definition. */
  d->writeWordToDevice(d->commandQueueAddr, POCL_KST_READY);

  /* Keep the host side copy in sync with what was written to the device. */
  dev_cmd.status = byteswap_uint32_t (POCL_KST_READY, d->needsByteSwap);

  d->notifyKernelRunCommandSent(dev_cmd, &cmd->command.run);

#ifdef DEBUG_TTA_DRIVER
  printf("host: commmand queue status: %x\n",
         d->readWordFromDevice(d->commandQueueAddr));

  printf("host: waiting for the command to get executed.\n");
#endif
  /* Wait until the command has executed. The status word is polled with
     a 20000 microsecond sleep between the reads. */
  unsigned long ticks = 0;
  do {
#ifdef DEBUG_TTA_DRIVER
    if ((ticks & 50) == 0)
      printf("host: commmand queue status: %x\n",
             d->readWordFromDevice(d->commandQueueAddr));
#endif
    usleep(20000);
    ++ticks;
  } while (d->readWordFromDevice(d->commandQueueAddr) != POCL_KST_FINISHED);

#ifdef DEBUG_TTA_DRIVER
  printf( "host: done. Freeing the command queue entry.\n");
#endif
  /* We are done with this kernel, free the command queue entry. */
  d->writeWordToDevice(d->commandQueueAddr, POCL_KST_FREE);

  /* Release the chunks allocated above for the local and scalar arguments. */
  for (ChunkVector::iterator i = tempChunks.begin();
       i != tempChunks.end(); ++i)
    free_chunk (*i);

  /* Release the path string stored to the command for this run. */
  POCL_MEM_FREE(cmd->command.run.device_data);

#ifdef DEBUG_TTA_DRIVER
  printf("host: local memory allocations:\n");
  print_chunks (d->local_mem.chunks);

  printf("host: global memory allocations:\n");
  print_chunks (d->global_mem.chunks);
#endif
}
692
693 cl_int
pocl_tce_map_mem(void * data,pocl_mem_identifier * src_mem_id,cl_mem src_buf,mem_mapping_t * map)694 pocl_tce_map_mem (void *data,
695 pocl_mem_identifier * src_mem_id,
696 cl_mem src_buf,
697 mem_mapping_t *map)
698 {
699 /* Synch the device global region to the host memory. */
700 pocl_tce_read(data, map->host_ptr, src_mem_id, src_buf, map->offset,
701 map->size);
702
703 return CL_SUCCESS;
704 }
705
706 cl_int
pocl_tce_unmap_mem(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,mem_mapping_t * map)707 pocl_tce_unmap_mem (void *data,
708 pocl_mem_identifier *dst_mem_id,
709 cl_mem dst_buf,
710 mem_mapping_t *map)
711 {
712 if (map->map_flags != CL_MAP_READ) {
713 /* Synch the device global region to the host memory. */
714 pocl_tce_write (data, map->host_ptr, dst_mem_id, dst_buf, map->offset, map->size);
715 }
716
717 return CL_SUCCESS;
718 }
719
720
721 char*
pocl_tce_init_build(void * data)722 pocl_tce_init_build(void *data)
723 {
724 TCEDevice *tce_dev = (TCEDevice*)data;
725 TCEString mach_tmpdir =
726 Environment::llvmtceCachePath();
727
728 TCEString mach_header_base =
729 mach_tmpdir + "/" + tce_dev->machine_->hash();
730
731 int error = 0;
732
733 std::string devextHeaderFn =
734 std::string(mach_header_base) + std::string("_opencl_devext.h");
735
736 /* Generate the vendor extensions header to provide explicit
737 access to the (custom) hardware operations. */
738 std::string tceopgenCmd =
739 std::string("tceopgen > ") + devextHeaderFn;
740
741 error = system (tceopgenCmd.c_str());
742 if (error == -1) return NULL;
743
744 std::string extgenCmd =
745 std::string("tceoclextgen ") + tce_dev->machine_file +
746 std::string(" >> ") + devextHeaderFn;
747
748 error = system (extgenCmd.c_str());
749 if (error == -1) return NULL;
750
751 // gnu-keywords needed to support the inline asm blocks
752 // -fasm doesn't work in the frontend
753
754 std::string includeSwitch =
755 std::string("-fgnu-keywords -Dasm=__asm__ -include ") + devextHeaderFn;
756
757 char *include_switch = strdup(includeSwitch.c_str());
758
759 return include_switch;
760 }
761
762 char *
pocl_tce_build_hash(cl_device_id device)763 pocl_tce_build_hash (cl_device_id device)
764 {
765 TCEDevice *tce_dev = (TCEDevice*)device->data;
766 FILE* adf_file = fopen (tce_dev->machine_file.c_str(), "r");
767 size_t size;
768 uint8_t* adf_data = 0;
769 const char *extra_flags = NULL;
770
771 fseek (adf_file, 0 , SEEK_END);
772 size = ftell (adf_file);
773 fseek (adf_file, 0, SEEK_SET);
774 adf_data = (uint8_t*)malloc (size);
775 if (fread (adf_data, 1, size, adf_file) == 0)
776 POCL_ABORT("Could not read ADF.\n");
777
778 SHA1_CTX ctx;
779 uint8_t bin_dig[SHA1_DIGEST_SIZE];
780 pocl_SHA1_Init(&ctx);
781 pocl_SHA1_Update(&ctx, adf_data, size);
782 pocl_SHA1_Final(&ctx, bin_dig);
783
784 char *result = (char *)calloc(1000, sizeof(char));
785 strcpy(result, device->llvm_target_triplet);
786 char *temp = result + strlen(result);
787 *temp++ = '-';
788 unsigned i;
789 for (i=0; i < SHA1_DIGEST_SIZE; i++)
790 {
791 *temp++ = (bin_dig[i] & 0x0F) + 65;
792 *temp++ = ((bin_dig[i] & 0xF0) >> 4) + 65;
793 }
794 *temp++ = '_';
795 *temp = 0;
796
797 if (pocl_is_option_set("POCL_TCECC_EXTRA_FLAGS"))
798 {
799 extra_flags = pocl_get_string_option("POCL_TCECC_EXTRA_FLAGS", "");
800 strncpy(temp, extra_flags, (1000-(temp-result)) );
801 }
802
803 return result;
804 }
805
806 void
pocl_tce_copy_rect(void * data,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,pocl_mem_identifier * src_mem_id,cl_mem src_buf,const size_t * __restrict__ const dst_origin,const size_t * __restrict__ const src_origin,const size_t * __restrict__ const region,size_t const dst_row_pitch,size_t const dst_slice_pitch,size_t const src_row_pitch,size_t const src_slice_pitch)807 pocl_tce_copy_rect (void *data,
808 pocl_mem_identifier * dst_mem_id,
809 cl_mem dst_buf,
810 pocl_mem_identifier * src_mem_id,
811 cl_mem src_buf,
812 const size_t *__restrict__ const dst_origin,
813 const size_t *__restrict__ const src_origin,
814 const size_t *__restrict__ const region,
815 size_t const dst_row_pitch,
816 size_t const dst_slice_pitch,
817 size_t const src_row_pitch,
818 size_t const src_slice_pitch)
819 {
820 TCEDevice *d = (TCEDevice*)data;
821 chunk_info_t *src_chunk = (chunk_info_t*)src_mem_id->mem_ptr;
822 chunk_info_t *dst_chunk = (chunk_info_t*)dst_mem_id->mem_ptr;
823
824 size_t src_offset = src_origin[0] + src_row_pitch * src_origin[1] + src_slice_pitch * src_origin[2];
825 size_t dst_offset = dst_origin[0] + dst_row_pitch * dst_origin[1] + dst_slice_pitch * dst_origin[2];
826
827 size_t j, k;
828
829 /* TODO: handle overlaping regions */
830
831 for (k = 0; k < region[2]; ++k)
832 for (j = 0; j < region[1]; ++j)
833 d->copyDeviceToDevice(src_chunk->start_address + src_offset + src_row_pitch * j + src_slice_pitch * k,
834 dst_chunk->start_address + dst_offset + dst_row_pitch * j + dst_slice_pitch * k,
835 region[0]);
836
837 }
838
839 void
pocl_tce_write_rect(void * data,const void * __restrict__ src_host_ptr,pocl_mem_identifier * dst_mem_id,cl_mem dst_buf,const size_t * __restrict__ const buffer_origin,const size_t * __restrict__ const host_origin,const size_t * __restrict__ const region,size_t const buffer_row_pitch,size_t const buffer_slice_pitch,size_t const host_row_pitch,size_t const host_slice_pitch)840 pocl_tce_write_rect (void *data,
841 const void *__restrict__ src_host_ptr,
842 pocl_mem_identifier * dst_mem_id,
843 cl_mem dst_buf,
844 const size_t *__restrict__ const buffer_origin,
845 const size_t *__restrict__ const host_origin,
846 const size_t *__restrict__ const region,
847 size_t const buffer_row_pitch,
848 size_t const buffer_slice_pitch,
849 size_t const host_row_pitch,
850 size_t const host_slice_pitch)
851 {
852 TCEDevice *d = (TCEDevice *)data;
853 chunk_info_t *dst_chunk = (chunk_info_t *)dst_mem_id->mem_ptr;
854 size_t adjusted_dst_ptr = dst_chunk->start_address + buffer_origin[0] +
855 buffer_row_pitch * buffer_origin[1] +
856 buffer_slice_pitch * buffer_origin[2];
857
858 char const *__restrict__ const adjusted_host_ptr =
859 (char const *)src_host_ptr + host_origin[0] +
860 host_row_pitch * host_origin[1] + host_slice_pitch * host_origin[2];
861
862 size_t j, k;
863
864 /* TODO: handle overlaping regions */
865
866 for (k = 0; k < region[2]; ++k)
867 for (j = 0; j < region[1]; ++j)
868 {
869 size_t s_offset = host_row_pitch * j + host_slice_pitch * k;
870
871 size_t d_offset = buffer_row_pitch * j + buffer_slice_pitch * k;
872
873 d->copyHostToDevice(adjusted_host_ptr + s_offset,
874 adjusted_dst_ptr + d_offset, region[0]);
875 }
876 }
877
878 void
pocl_tce_read_rect(void * data,void * __restrict__ dst_host_ptr,pocl_mem_identifier * src_mem_id,cl_mem src_buf,const size_t * __restrict__ const buffer_origin,const size_t * __restrict__ const host_origin,const size_t * __restrict__ const region,size_t const buffer_row_pitch,size_t const buffer_slice_pitch,size_t const host_row_pitch,size_t const host_slice_pitch)879 pocl_tce_read_rect (void *data,
880 void *__restrict__ dst_host_ptr,
881 pocl_mem_identifier * src_mem_id,
882 cl_mem src_buf,
883 const size_t *__restrict__ const buffer_origin,
884 const size_t *__restrict__ const host_origin,
885 const size_t *__restrict__ const region,
886 size_t const buffer_row_pitch,
887 size_t const buffer_slice_pitch,
888 size_t const host_row_pitch,
889 size_t const host_slice_pitch)
890 {
891 TCEDevice *d = (TCEDevice *)data;
892 chunk_info_t *src_chunk = (chunk_info_t *)src_mem_id->mem_ptr;
893 size_t adjusted_src_ptr = src_chunk->start_address + buffer_origin[0] +
894 buffer_row_pitch * buffer_origin[1] +
895 buffer_slice_pitch * buffer_origin[2];
896
897 char const *__restrict__ const adjusted_host_ptr =
898 (char const *)dst_host_ptr + host_origin[0] +
899 host_row_pitch * host_origin[1] + host_slice_pitch * host_origin[2];
900
901 size_t j, k;
902
903 /* TODO: handle overlaping regions */
904
905 for (k = 0; k < region[2]; ++k)
906 for (j = 0; j < region[1]; ++j)
907 {
908 size_t d_offset = host_row_pitch * j + host_slice_pitch * k;
909 size_t s_offset = buffer_row_pitch * j + buffer_slice_pitch * k;
910
911 d->copyDeviceToHost(adjusted_src_ptr + s_offset,
912 adjusted_host_ptr + d_offset, region[0]);
913 }
914 }
915
tce_command_scheduler(TCEDevice * d)916 static void tce_command_scheduler (TCEDevice *d)
917 {
918 _cl_command_node *node;
919
920 /* execute commands from ready list */
921 while ((node = d->ready_list))
922 {
923 assert (pocl_command_is_ready(node->event));
924 CDL_DELETE (d->ready_list, node);
925 POCL_UNLOCK(d->cq_lock);
926 assert (node->event->status == CL_SUBMITTED);
927 if (node->type == CL_COMMAND_NDRANGE_KERNEL)
928 pocl_tce_compile_kernel(node, NULL, NULL, 1);
929 pocl_exec_command(node);
930 POCL_LOCK(d->cq_lock);
931 }
932
933 return;
934 }
935
936 void
pocl_tce_submit(_cl_command_node * node,cl_command_queue)937 pocl_tce_submit (_cl_command_node *node, cl_command_queue /*cq*/)
938 {
939 TCEDevice *d = (TCEDevice*)node->device->data;
940
941 node->ready = 1;
942 POCL_LOCK(d->cq_lock);
943 pocl_command_push(node, &d->ready_list, &d->command_list);
944 POCL_UNLOCK_OBJ(node->event);
945
946 tce_command_scheduler (d);
947 POCL_UNLOCK(d->cq_lock);
948
949 return;
950 }
951
pocl_tce_flush(cl_device_id device,cl_command_queue)952 void pocl_tce_flush (cl_device_id device, cl_command_queue /*cq*/)
953 {
954 TCEDevice *d = (TCEDevice*)device->data;
955
956 POCL_LOCK (d->cq_lock);
957 tce_command_scheduler (d);
958 POCL_UNLOCK (d->cq_lock);
959 }
960
961
962 void
pocl_tce_join(cl_device_id device,cl_command_queue)963 pocl_tce_join(cl_device_id device, cl_command_queue /*cq*/)
964 {
965 TCEDevice *d = (TCEDevice*)device->data;
966
967 POCL_LOCK (d->cq_lock);
968 tce_command_scheduler (d);
969 POCL_UNLOCK (d->cq_lock);
970
971 return;
972 }
973
974 void
pocl_tce_notify(cl_device_id device,cl_event event,cl_event finished)975 pocl_tce_notify (cl_device_id device, cl_event event, cl_event finished)
976 {
977 TCEDevice *d = (TCEDevice*)device->data;
978 _cl_command_node *node = event->command;
979
980 if (finished->status < CL_COMPLETE) {
981 pocl_update_event_failed(event);
982 return;
983 }
984
985 if (!node->ready)
986 return;
987
988 if (pocl_command_is_ready(event)) {
989 if (event->status == CL_QUEUED) {
990 pocl_update_event_submitted(event);
991 POCL_LOCK(d->cq_lock);
992 CDL_DELETE(d->command_list, node);
993 CDL_PREPEND(d->ready_list, node);
994 POCL_UNLOCK(d->cq_lock);
995 }
996 }
997 }
998