1 /*
2
3 HyPhy - Hypothesis Testing Using Phylogenies.
4
5 Copyright (C) 1997-now
6 Core Developers:
7 Sergei L Kosakovsky Pond (sergeilkp@icloud.com)
8 Art FY Poon (apoon42@uwo.ca)
9 Steven Weaver (sweaver@temple.edu)
10
11 Module Developers:
12 Lance Hepler (nlhepler@gmail.com)
13 Martin Smith (martin.audacis@gmail.com)
14
15 Significant contributions from:
16 Spencer V Muse (muse@stat.ncsu.edu)
17 Simon DW Frost (sdf22@cam.ac.uk)
18
19 Permission is hereby granted, free of charge, to any person obtaining a
20 copy of this software and associated documentation files (the
21 "Software"), to deal in the Software without restriction, including
22 without limitation the rights to use, copy, modify, merge, publish,
23 distribute, sublicense, and/or sell copies of the Software, and to
24 permit persons to whom the Software is furnished to do so, subject to
25 the following conditions:
26
27 The above copyright notice and this permission notice shall be included
28 in all copies or substantial portions of the Software.
29
30 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
31 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
33 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
34 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
35 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
36 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37
38 */
39
40 // *********************************************************************
41 // OpenCL likelihood function Notes:
42 //
43 // Runs computations with OpenCL on the GPU device and then checks results
44 // against basic host CPU/C++ computation.
45 //
46 //
47 // *********************************************************************
48
49 #ifdef MDSOCL
50
51 #include <string>
52 #include <stdio.h>
53 #include <assert.h>
54 #include <sys/sysctl.h>
55 #include <sys/stat.h>
56 #include <stdlib.h>
57 #include <stdio.h>
58 #include <time.h>
59 #include <math.h>
60 #include "calcnode.h"
61
62 #include "opencl_kernels.h"
63
64 //#define FLOAT
65 //#define OCLVERBOSE
66
67 #if defined(__APPLE__)
68 #include <OpenCL/OpenCL.h>
69 typedef float fpoint;
70 typedef cl_float clfp;
71 #define FLOATPREC "typedef float fpoint; \n"
72 //#define PRAGMADEF "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n"
73 #define PRAGMADEF " \n"
74 //#pragma OPENCL EXTENSION cl_khr_fp64: enable
75 #elif defined(NVIDIA)
76 #define __GPUResults__
77 #define __OCLPOSIX__
78 #include <oclUtils.h>
79 typedef double fpoint;
80 typedef cl_double clfp;
81 #define FLOATPREC "typedef double fpoint; \n"
82 #define PRAGMADEF "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n"
83 #pragma OPENCL EXTENSION cl_khr_fp64: enable
84 #elif defined(AMD)
85 #define __GPUResults__
86 #define __OCLPOSIX__
87 #include <CL/opencl.h>
88 typedef double fpoint;
89 typedef cl_double clfp;
90 #define FLOATPREC "typedef double fpoint; \n"
91 #define PRAGMADEF "#pragma OPENCL EXTENSION cl_amd_fp64: enable \n"
92 #pragma OPENCL EXTENSION cl_amd_fp64: enable
93 #elif defined(FLOAT)
94 #include <CL/opencl.h>
95 typedef float fpoint;
96 typedef cl_float clfp;
97 #define FLOATPREC "typedef float fpoint; \n"
98 #define PRAGMADEF " \n"
99 #endif
100
101 //#define __VERBOSE__
102 #define OCLGPU
103 #ifdef OCLGPU
104 #define OCLTARGET " #define BLOCK_SIZE 16 \n"
105 #else
106 #define OCLTARGET " #define BLOCK_SIZE 1 \n"
107 #endif
108
109 #ifdef __GPUResults__
110 #define OCLGPUResults " #define __GPUResults__ \n"
111 #else
112 #define OCLGPUResults " \n"
113 #endif
114
115
116 // #define MIN(a,b) ((a)>(b)?(b):(a))
117
118 // time stuff:
119 #define BILLION 1E9
120 struct timespec mainStart, mainEnd, bufferStart, bufferEnd, queueStart, queueEnd, setupStart, setupEnd;
121 double mainSecs;
122 double buffSecs;
123 double queueSecs;
124 double setupSecs;
125
126 bool clean;
127
128 cl_context cxGPUContext; // OpenCL context
129 cl_command_queue cqCommandQueue;// OpenCL command que
130 cl_platform_id cpPlatform; // OpenCL platform
131 cl_device_id cdDevice; // OpenCL device
132 cl_program cpMLProgram;
133 cl_program cpLeafProgram;
134 cl_program cpInternalProgram;
135 cl_program cpAmbigProgram;
136 cl_program cpResultProgram;
137 cl_kernel ckLeafKernel;
138 cl_kernel ckInternalKernel;
139 cl_kernel ckAmbigKernel;
140 cl_kernel ckResultKernel;
141 cl_kernel ckReductionKernel;
142 size_t szGlobalWorkSize[2]; // 1D var for Total # of work items
143 size_t szLocalWorkSize[2]; // 1D var for # of work items in the work group
144 size_t localMemorySize; // size of local memory buffer for kernel scratch
145 size_t szParmDataBytes; // Byte size of context information
146 size_t szKernelLength; // Byte size of kernel code
147 cl_int ciErr1, ciErr2; // Error code var
148
149 cl_mem cmNode_cache;
150 cl_mem cmModel_cache;
151 cl_mem cmNodRes_cache;
152 cl_mem cmNodFlag_cache;
153 cl_mem cmroot_cache;
154 cl_mem cmroot_scalings;
155 cl_mem cmScalings_cache;
156 cl_mem cmFreq_cache;
157 cl_mem cmProb_cache;
158 cl_mem cmResult_cache;
159 long siteCount, alphabetDimension;
160 long* lNodeFlags;
161 _SimpleList updateNodes,
162 flatParents,
163 flatNodes,
164 flatCLeaves,
165 flatLeaves,
166 flatTree,
167 theFrequencies;
168 hyFloat *iNodeCache,
169 *theProbs;
170 _SimpleList taggedInternals;
171 _Vector* lNodeResolutions;
172 float scalar;
173
174 void *node_cache, *nodRes_cache, *nodFlag_cache, *scalings_cache, *prob_cache, *freq_cache, *root_cache, *result_cache, *root_scalings, *model;
175
init(long esiteCount,long ealphabetDimension,hyFloat * eiNodeCache)176 void _OCLEvaluator::init( long esiteCount,
177 long ealphabetDimension,
178 hyFloat* eiNodeCache)
179 {
180 clean = false;
181 contextSet = false;
182 siteCount = esiteCount;
183 alphabetDimension = ealphabetDimension;
184 iNodeCache = eiNodeCache;
185 mainSecs = 0.0;
186 buffSecs = 0.0;
187 queueSecs = 0.0;
188 setupSecs = 0.0;
189 scalar = 10.0;
190 }
191
192 // So the two interfacing functions will be the constructor, called in SetupLFCaches, and launchmdsocl, called in ComputeBlock.
193 // Therefore all of these functions need to be finished, the context needs to be setup separately from the execution, the data needs
194 // to be passed piecewise, and a pointer needs to be passed around in likefunc2.cpp. After that things should be going a bit faster,
195 // though honestly this solution is geared towards analyses with a larger number of sites.
196
197 // *********************************************************************
setupContext(void)198 int _OCLEvaluator::setupContext(void)
199 {
200 #ifdef __OCLPOSIX__
201 clock_gettime(CLOCK_MONOTONIC, &setupStart);
202 #endif
203 //printf("Made it to the oclmain() function!\n");
204
205 //long nodeResCount = sizeof(lNodeResolutions->theData)/sizeof(lNodeResolutions->theData[0]);
206 long nodeFlagCount = flatLeaves.lLength*siteCount;
207 long nodeResCount = lNodeResolutions->get_used();
208 int roundCharacters = roundUpToNextPowerOfTwo(alphabetDimension);
209 // long nodeCount = flatLeaves.lLength + flatNodes.lLength + 1;
210 // long iNodeCount = flatNodes.lLength + 1;
211
212 bool ambiguousNodes = true;
213 if (nodeResCount == 0)
214 {
215 nodeResCount++;
216 ambiguousNodes = false;
217 }
218
219 //printf("Got the sizes of nodeRes and nodeFlag: %i, %i\n", nodeResCount, nodeFlagCount);
220
221 // Make transitionMatrixArray, do other host stuff:
222 node_cache = (void*)malloc(sizeof(cl_float)*roundCharacters*siteCount*(flatNodes.lLength));
223 nodRes_cache = (void*)malloc(sizeof(cl_float)*roundUpToNextPowerOfTwo(nodeResCount));
224 nodFlag_cache = (void*)malloc(sizeof(cl_long)*roundUpToNextPowerOfTwo(nodeFlagCount));
225 scalings_cache = (void*)malloc(sizeof(cl_int)*roundCharacters*siteCount*(flatNodes.lLength));
226 prob_cache = (void*)malloc(sizeof(cl_float)*roundCharacters);
227 freq_cache = (void*)malloc(sizeof(cl_int)*siteCount);
228 freq_cache = (void*)malloc(sizeof(cl_int)*siteCount);
229 root_cache = (void*)malloc(sizeof(cl_float)*siteCount*roundCharacters);
230 root_scalings = (void*)malloc(sizeof(cl_int)*siteCount*roundCharacters);
231 /*
232 #ifdef __GPUResults__
233 result_cache = (void*)malloc(sizeof(cl_double)*roundUpToNextPowerOfTwo(siteCount));
234 #else
235 result_cache = (void*)malloc(sizeof(cl_float)*roundUpToNextPowerOfTwo(siteCount));
236 #endif
237 */
238 result_cache = (void*)malloc(sizeof(clfp)*roundUpToNextPowerOfTwo(siteCount));
239 model = (void*)malloc(sizeof(cl_float)*roundCharacters*roundCharacters*(flatParents.lLength-1));
240
241 //printf("Allocated all of the arrays!\n");
242 //printf("setup the model, fixed tagged internals!\n");
243 printf("flatleaves: %ld\n", flatLeaves.lLength);
244 printf("flatParents: %ld\n", flatParents.lLength);
245 //printf("flatCleaves: %i\n", flatCLeaves.lLength);
246 printf("flatNodes: %ld\n", flatNodes.lLength);
247 printf("updateNodes: %ld\n", updateNodes.lLength);
248 printf("flatTree: %ld\n", flatTree.lLength);
249 //printf("nodeFlagCount: %i\n", nodeFlagCount);
250 //printf("nodeResCount: %i\n", nodeResCount);
251
252 //for (int i = 0; i < nodeCount*siteCount*alphabetDimension; i++)
253 printf("siteCount: %ld, alphabetDimension: %ld \n", siteCount, alphabetDimension);
254 if (ambiguousNodes)
255 for (int i = 0; i < nodeResCount; i++)
256 ((float*)nodRes_cache)[i] = (float)(lNodeResolutions->theData[i]);
257 for (int i = 0; i < nodeFlagCount; i++)
258 ((long*)nodFlag_cache)[i] = lNodeFlags[i];
259 for (int i = 0; i < siteCount; i++)
260 ((int*)freq_cache)[i] = theFrequencies[i];
261 for (int i = 0; i < alphabetDimension; i++)
262 ((float*)prob_cache)[i] = theProbs[i];
263
264 //printf("Created all of the arrays!\n");
265
266 // alright, by now taggedInternals have been taken care of, and model has
267 // been filled with all of the transition matrices.
268
269 #ifdef __OCLPOSIX__
270 clock_gettime(CLOCK_MONOTONIC, &setupEnd);
271 setupSecs += (setupEnd.tv_sec - setupStart.tv_sec)+(setupEnd.tv_nsec - setupStart.tv_nsec)/BILLION;
272 #endif
273
274
275
276 //**************************************************
277
278 //Get an OpenCL platform
279 ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);
280
281 // printf("clGetPlatformID...\n");
282 if (ciErr1 != CL_SUCCESS)
283 {
284 printf("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
285 Cleanup(EXIT_FAILURE);
286 }
287
288
289 //Get the devices
290 #ifdef OCLGPU
291 ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
292 #else
293 ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, &cdDevice, NULL);
294 #endif
295 // printf("clGetDeviceIDs...\n");
296 if (ciErr1 != CL_SUCCESS)
297 {
298 printf("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
299 Cleanup(EXIT_FAILURE);
300 }
301
302 size_t maxWorkGroupSize;
303 ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE,
304 sizeof(size_t), &maxWorkGroupSize, NULL);
305 if (ciErr1 != CL_SUCCESS)
306 {
307 printf("Getting max work group size failed!\n");
308 }
309 printf("Max work group size: %lu\n", (unsigned long)maxWorkGroupSize);
310
311 size_t maxLocalSize;
312 ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_LOCAL_MEM_SIZE,
313 sizeof(size_t), &maxLocalSize, NULL);
314 size_t maxConstSize;
315 ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
316 sizeof(size_t), &maxConstSize, NULL);
317 printf("LocalSize: %ld, Const size: %ld\n", (long unsigned) maxLocalSize, (long unsigned) maxConstSize);
318
319 printf("sites: %ld\n", siteCount);
320
321 // set and log Global and Local work size dimensions
322
323 #ifdef OCLGPU
324 szLocalWorkSize[0] = 16; // All of these will have to be generalized.
325 szLocalWorkSize[1] = 16;
326 #else
327 szLocalWorkSize[0] = 1; // All of these will have to be generalized.
328 szLocalWorkSize[1] = 1;
329 #endif
330 szGlobalWorkSize[0] = 64;
331 //szGlobalWorkSize[1] = ((siteCount + 16)/16)*16;
332 szGlobalWorkSize[1] = roundUpToNextPowerOfTwo(siteCount);
333 //szGlobalWorkSize[1] = roundUpToNextPowerOfTwo(siteCount);
334 printf("Global Work Size \t\t= %ld, %ld\nLocal Work Size \t\t= %ld, %ld\n# of Work Groups \t\t= %ld\n\n",
335 (long unsigned) szGlobalWorkSize[0],
336 (long unsigned) szGlobalWorkSize[1],
337 (long unsigned) szLocalWorkSize[0],
338 (long unsigned) szLocalWorkSize[1],
339 (long unsigned) ((szGlobalWorkSize[0]*szGlobalWorkSize[1])/(szLocalWorkSize[0]*szLocalWorkSize[1])));
340
341
342 size_t returned_size = 0;
343 cl_char vendor_name[1024] = {0};
344 cl_char device_name[1024] = {0};
345 ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_VENDOR, sizeof(vendor_name),
346 vendor_name, &returned_size);
347 ciErr1 |= clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(device_name),
348 device_name, &returned_size);
349 assert(ciErr1 == CL_SUCCESS);
350 // printf("Connecting to %s %s...\n", vendor_name, device_name);
351
352 //Create the context
353 cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
354 // printf("clCreateContext...\n");
355 if (ciErr1 != CL_SUCCESS)
356 {
357 printf("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
358 Cleanup(EXIT_FAILURE);
359 }
360
361 // Create a command-queue
362 cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1);
363 // printf("clCreateCommandQueue...\n");
364 if (ciErr1 != CL_SUCCESS)
365 {
366 printf("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
367 Cleanup(EXIT_FAILURE);
368 }
369
370
371 printf("Setup all of the OpenCL stuff!\n");
372
373 // Allocate the OpenCL buffer memory objects for the input and output on the
374 // device GMEM
375 cmNode_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
376 sizeof(cl_float)*roundCharacters*siteCount*(flatNodes.lLength), node_cache,
377 &ciErr1);
378 cmModel_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY,
379 sizeof(cl_float)*roundCharacters*roundCharacters*(flatParents.lLength-1),
380 NULL, &ciErr2);
381 ciErr1 |= ciErr2;
382 cmScalings_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
383 sizeof(cl_int)*roundCharacters*siteCount*flatNodes.lLength, scalings_cache, &ciErr2);
384 ciErr1 |= ciErr2;
385 cmNodRes_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
386 sizeof(cl_float)*roundUpToNextPowerOfTwo(nodeResCount), nodRes_cache, &ciErr2);
387 ciErr1 |= ciErr2;
388 cmNodFlag_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
389 sizeof(cl_long)*roundUpToNextPowerOfTwo(nodeFlagCount), nodFlag_cache, &ciErr2);
390 ciErr1 |= ciErr2;
391 cmroot_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
392 sizeof(cl_float)*siteCount*roundCharacters, NULL, &ciErr2);
393 ciErr1 |= ciErr2;
394 cmroot_scalings = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
395 sizeof(cl_int)*siteCount*roundCharacters, NULL, &ciErr2);
396 ciErr1 |= ciErr2;
397 cmProb_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
398 sizeof(cl_float)*roundCharacters, prob_cache, &ciErr2);
399 ciErr1 |= ciErr2;
400 cmFreq_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY,
401 sizeof(cl_float)*siteCount, NULL, &ciErr2);
402 ciErr1 |= ciErr2;
403 //cmResult_cache = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
404 // sizeof(cl_float)*siteCount, NULL, &ciErr2);
405 /*
406 #ifdef __GPUResults__
407 cmResult_cache = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
408 sizeof(cl_double)*roundUpToNextPowerOfTwo(siteCount), NULL, &ciErr2);
409 #else
410 cmResult_cache = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
411 sizeof(cl_float)*roundUpToNextPowerOfTwo(siteCount), NULL, &ciErr2);
412 #endif
413 */
414 cmResult_cache = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
415 sizeof(clfp)*roundUpToNextPowerOfTwo(siteCount), NULL, &ciErr2);
416 ciErr1 |= ciErr2;
417 // printf("clCreateBuffer...\n");
418 if (ciErr1 != CL_SUCCESS)
419 {
420 printf("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
421 switch(ciErr1)
422 {
423 case CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
424 case CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
425 case CL_INVALID_BUFFER_SIZE: printf("CL_INVALID_BUFFER_SIZE\n"); break;
426 case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
427 case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
428 default: printf("Strange error\n");
429 }
430 Cleanup(EXIT_FAILURE);
431 }
432
433
434 #ifdef __OCLPOSIX__
435 clock_gettime(CLOCK_MONOTONIC, &setupStart);
436 #endif
437 /*
438 for (int i = 0; i < siteCount*roundCharacters; i++)
439 {
440 (root_cache)[i] = 0.0;
441 (root_scalings)[i] = 1;
442 }
443 */
444 #ifdef __OCLPOSIX__
445 clock_gettime(CLOCK_MONOTONIC, &setupEnd);
446 setupSecs += (setupEnd.tv_sec - setupStart.tv_sec)+(setupEnd.tv_nsec - setupStart.tv_nsec)/BILLION;
447 #endif
448
449 printf("Made all of the buffers on the device!\n");
450
451 // printf("clCreateBuffer...\n");
452 if (ciErr1 != CL_SUCCESS)
453 {
454 printf("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
455 Cleanup(EXIT_FAILURE);
456 }
457
458 // "" FLOATPREC
459 // Create the program
460 const char * program_source = "" OCLTARGET PRAGMADEF FLOATPREC OCLGPUResults KERNEL_STRING;
461
462 // TODO: result_cache size can be reduced to siteCount/BLOCK_SIZE
463 cpMLProgram = clCreateProgramWithSource(cxGPUContext, 1, &program_source,
464 NULL, &ciErr1);
465 if (ciErr1 != CL_SUCCESS)
466 {
467 printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
468 Cleanup(EXIT_FAILURE);
469 }
470
471 ciErr1 = clBuildProgram(cpMLProgram, 1, &cdDevice, "-cl-mad-enable -cl-fast-relaxed-math", NULL, NULL);
472 //ciErr1 = clBuildProgram(cpMLProgram, 1, &cdDevice, NULL, NULL, NULL);
473 if (ciErr1 != CL_SUCCESS)
474 {
475 printf("%i\n", ciErr1); //prints "1"
476 switch(ciErr1)
477 {
478 case CL_INVALID_PROGRAM: printf("CL_INVALID_PROGRAM\n"); break;
479 case CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
480 case CL_INVALID_DEVICE: printf("CL_INVALID_DEVICE\n"); break;
481 case CL_INVALID_BINARY: printf("CL_INVALID_BINARY\n"); break;
482 case CL_INVALID_BUILD_OPTIONS: printf("CL_INVALID_BUILD_OPTIONS\n"); break;
483 case CL_COMPILER_NOT_AVAILABLE: printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
484 case CL_BUILD_PROGRAM_FAILURE: printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
485 case CL_INVALID_OPERATION: printf("CL_INVALID_OPERATION\n"); break;
486 case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
487 default: printf("Strange error\n"); //This is printed
488 }
489 printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
490 Cleanup(EXIT_FAILURE);
491 }
492
493
494 // Shows the log
495 char* build_log;
496 size_t log_size;
497 // First call to know the proper size
498 clGetProgramBuildInfo(cpMLProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
499 build_log = new char[log_size+1];
500 // Second call to get the log
501 clGetProgramBuildInfo(cpMLProgram, cdDevice, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
502 build_log[log_size] = '\0';
503 printf("%s", build_log);
504 delete[] build_log;
505
506 if (ciErr1 != CL_SUCCESS)
507 {
508 printf("%i\n", ciErr1); //prints "1"
509 switch(ciErr1)
510 {
511 case CL_INVALID_PROGRAM: printf("CL_INVALID_PROGRAM\n"); break;
512 case CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
513 case CL_INVALID_DEVICE: printf("CL_INVALID_DEVICE\n"); break;
514 case CL_INVALID_BINARY: printf("CL_INVALID_BINARY\n"); break;
515 case CL_INVALID_BUILD_OPTIONS: printf("CL_INVALID_BUILD_OPTIONS\n"); break;
516 case CL_COMPILER_NOT_AVAILABLE: printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
517 case CL_BUILD_PROGRAM_FAILURE: printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
518 case CL_INVALID_OPERATION: printf("CL_INVALID_OPERATION\n"); break;
519 case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
520 default: printf("Strange error\n"); //This is printed
521 }
522 printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
523 Cleanup(EXIT_FAILURE);
524 }
525
526 // Create the kernel
527 //ckKernel = clCreateKernel(cpProgram, "FirstLoop", &ciErr1);
528 ckLeafKernel = clCreateKernel(cpMLProgram, "LeafKernel", &ciErr1);
529 printf("clCreateKernel (LeafKernel)...\n");
530 if (ciErr1 != CL_SUCCESS)
531 {
532 printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
533 Cleanup(EXIT_FAILURE);
534 }
535 ckAmbigKernel = clCreateKernel(cpMLProgram, "AmbigKernel", &ciErr1);
536 printf("clCreateKernel (AmbigKernel)...\n");
537 if (ciErr1 != CL_SUCCESS)
538 {
539 printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
540 Cleanup(EXIT_FAILURE);
541 }
542 ckInternalKernel = clCreateKernel(cpMLProgram, "InternalKernel", &ciErr1);
543 printf("clCreateKernel (InternalKernel)...\n");
544 if (ciErr1 != CL_SUCCESS)
545 {
546 printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
547 Cleanup(EXIT_FAILURE);
548 }
549 ckResultKernel = clCreateKernel(cpMLProgram, "ResultKernel", &ciErr1);
550 printf("clCreateKernel (ResultKernel)...\n");
551 if (ciErr1 != CL_SUCCESS)
552 {
553 printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
554 Cleanup(EXIT_FAILURE);
555 }
556 ckReductionKernel = clCreateKernel(cpMLProgram, "ReductionKernel", &ciErr1);
557 printf("clCreateKernel (ReductionKernel)...\n");
558 if (ciErr1 != CL_SUCCESS)
559 {
560 printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
561 Cleanup(EXIT_FAILURE);
562 }
563
564 size_t maxKernelSize;
565 ciErr1 = clGetKernelWorkGroupInfo(ckLeafKernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE,
566 sizeof(size_t), &maxKernelSize, NULL);
567 printf("Max Leaf Kernel Work Group Size: %ld \n", (long unsigned) maxKernelSize);
568 ciErr1 = clGetKernelWorkGroupInfo(ckAmbigKernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE,
569 sizeof(size_t), &maxKernelSize, NULL);
570 printf("Max Ambig Kernel Work Group Size: %ld \n", (long unsigned) maxKernelSize);
571 ciErr1 = clGetKernelWorkGroupInfo(ckInternalKernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE,
572 sizeof(size_t), &maxKernelSize, NULL);
573 printf("Max Internal Kernel Work Group Size: %ld \n", (long unsigned) maxKernelSize);
574 ciErr1 = clGetKernelWorkGroupInfo(ckResultKernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE,
575 sizeof(size_t), &maxKernelSize, NULL);
576 printf("Max Result Kernel Work Group Size: %ld \n", (long unsigned) maxKernelSize);
577 ciErr1 = clGetKernelWorkGroupInfo(ckReductionKernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE,
578 sizeof(size_t), &maxKernelSize, NULL);
579 printf("Max Reduction Kernel Work Group Size: %ld \n", (long unsigned) maxKernelSize);
580
581 long tempLeafState = 1;
582 long tempSiteCount = siteCount;
583 long tempCharCount = alphabetDimension;
584 long tempChildNodeIndex = 0;
585 long tempParentNodeIndex = 0;
586 long tempRoundCharCount = roundUpToNextPowerOfTwo(alphabetDimension);
587 int tempTagIntState = 0;
588 int tempNodeID = 0;
589 float tempScalar = scalar;
590 float tempuFlowThresh = 0.000000001f;
591
592 ciErr1 = clSetKernelArg(ckLeafKernel, 0, sizeof(cl_mem), (void*)&cmNode_cache);
593 ciErr1 |= clSetKernelArg(ckLeafKernel, 1, sizeof(cl_mem), (void*)&cmModel_cache);
594 ciErr1 |= clSetKernelArg(ckLeafKernel, 2, sizeof(cl_mem), (void*)&cmNodRes_cache);
595 ciErr1 |= clSetKernelArg(ckLeafKernel, 3, sizeof(cl_mem), (void*)&cmNodFlag_cache);
596 ciErr1 |= clSetKernelArg(ckLeafKernel, 4, sizeof(cl_long), (void*)&tempSiteCount);
597 ciErr1 |= clSetKernelArg(ckLeafKernel, 5, sizeof(cl_long), (void*)&tempCharCount);
598 ciErr1 |= clSetKernelArg(ckLeafKernel, 6, sizeof(cl_long), (void*)&tempChildNodeIndex); // reset this in the loop
599 ciErr1 |= clSetKernelArg(ckLeafKernel, 7, sizeof(cl_long), (void*)&tempParentNodeIndex); // reset this in the loop
600 ciErr1 |= clSetKernelArg(ckLeafKernel, 8, sizeof(cl_long), (void*)&tempRoundCharCount);
601 ciErr1 |= clSetKernelArg(ckLeafKernel, 9, sizeof(cl_int), (void*)&tempTagIntState); // reset this in the loop
602 ciErr1 |= clSetKernelArg(ckLeafKernel, 10, sizeof(cl_int), (void*)&tempNodeID); // reset this in the loop
603 ciErr1 |= clSetKernelArg(ckLeafKernel, 11, sizeof(cl_mem), (void*)&cmScalings_cache);
604 ciErr1 |= clSetKernelArg(ckLeafKernel, 12, sizeof(cl_float), (void*)&tempScalar);
605 ciErr1 |= clSetKernelArg(ckLeafKernel, 13, sizeof(cl_float), (void*)&tempuFlowThresh);
606
607 ciErr1 |= clSetKernelArg(ckAmbigKernel, 0, sizeof(cl_mem), (void*)&cmNode_cache);
608 ciErr1 |= clSetKernelArg(ckAmbigKernel, 1, sizeof(cl_mem), (void*)&cmModel_cache);
609 ciErr1 |= clSetKernelArg(ckAmbigKernel, 2, sizeof(cl_mem), (void*)&cmNodRes_cache);
610 ciErr1 |= clSetKernelArg(ckAmbigKernel, 3, sizeof(cl_mem), (void*)&cmNodFlag_cache);
611 ciErr1 |= clSetKernelArg(ckAmbigKernel, 4, sizeof(cl_long), (void*)&tempSiteCount);
612 ciErr1 |= clSetKernelArg(ckAmbigKernel, 5, sizeof(cl_long), (void*)&tempCharCount);
613 ciErr1 |= clSetKernelArg(ckAmbigKernel, 6, sizeof(cl_long), (void*)&tempChildNodeIndex); // reset this in the loop
614 ciErr1 |= clSetKernelArg(ckAmbigKernel, 7, sizeof(cl_long), (void*)&tempParentNodeIndex); // reset this in the loop
615 ciErr1 |= clSetKernelArg(ckAmbigKernel, 8, sizeof(cl_long), (void*)&tempRoundCharCount);
616 ciErr1 |= clSetKernelArg(ckAmbigKernel, 9, sizeof(cl_int), (void*)&tempTagIntState);
617 ciErr1 |= clSetKernelArg(ckAmbigKernel, 10, sizeof(cl_int), (void*)&tempNodeID);
618 ciErr1 |= clSetKernelArg(ckAmbigKernel, 11, sizeof(cl_mem), (void*)&cmScalings_cache);
619 ciErr1 |= clSetKernelArg(ckAmbigKernel, 12, sizeof(cl_float), (void*)&tempScalar);
620 ciErr1 |= clSetKernelArg(ckAmbigKernel, 13, sizeof(cl_float), (void*)&tempuFlowThresh);
621
622 ciErr1 |= clSetKernelArg(ckInternalKernel, 0, sizeof(cl_mem), (void*)&cmNode_cache);
623 ciErr1 |= clSetKernelArg(ckInternalKernel, 1, sizeof(cl_mem), (void*)&cmModel_cache);
624 ciErr1 |= clSetKernelArg(ckInternalKernel, 2, sizeof(cl_mem), (void*)&cmNodRes_cache);
625 ciErr1 |= clSetKernelArg(ckInternalKernel, 3, sizeof(cl_long), (void*)&tempSiteCount);
626 ciErr1 |= clSetKernelArg(ckInternalKernel, 4, sizeof(cl_long), (void*)&tempCharCount);
627 ciErr1 |= clSetKernelArg(ckInternalKernel, 5, sizeof(cl_long), (void*)&tempChildNodeIndex); // reset this in the loop
628 ciErr1 |= clSetKernelArg(ckInternalKernel, 6, sizeof(cl_long), (void*)&tempParentNodeIndex); // reset this in the loop
629 ciErr1 |= clSetKernelArg(ckInternalKernel, 7, sizeof(cl_long), (void*)&tempRoundCharCount);
630 ciErr1 |= clSetKernelArg(ckInternalKernel, 8, sizeof(cl_int), (void*)&tempTagIntState); // reset this in the loop
631 ciErr1 |= clSetKernelArg(ckInternalKernel, 9, sizeof(cl_int), (void*)&tempNodeID); // reset this in the loop
632 ciErr1 |= clSetKernelArg(ckInternalKernel, 10, sizeof(cl_mem), (void*)&cmroot_cache);
633 ciErr1 |= clSetKernelArg(ckInternalKernel, 11, sizeof(cl_mem), (void*)&cmScalings_cache);
634 ciErr1 |= clSetKernelArg(ckInternalKernel, 12, sizeof(cl_float), (void*)&tempScalar);
635 ciErr1 |= clSetKernelArg(ckInternalKernel, 13, sizeof(cl_float), (void*)&tempuFlowThresh);
636 ciErr1 |= clSetKernelArg(ckInternalKernel, 14, sizeof(cl_mem), (void*)&cmroot_scalings);
637
638 ciErr1 |= clSetKernelArg(ckResultKernel, 0, sizeof(cl_mem), (void*)&cmFreq_cache);
639 ciErr1 |= clSetKernelArg(ckResultKernel, 1, sizeof(cl_mem), (void*)&cmProb_cache);
640 ciErr1 |= clSetKernelArg(ckResultKernel, 2, sizeof(cl_mem), (void*)&cmResult_cache);
641 ciErr1 |= clSetKernelArg(ckResultKernel, 3, sizeof(cl_mem), (void*)&cmroot_cache);
642 ciErr1 |= clSetKernelArg(ckResultKernel, 4, sizeof(cl_mem), (void*)&cmroot_scalings);
643 ciErr1 |= clSetKernelArg(ckResultKernel, 5, sizeof(cl_long), (void*)&tempSiteCount);
644 ciErr1 |= clSetKernelArg(ckResultKernel, 6, sizeof(cl_long), (void*)&tempRoundCharCount);
645 ciErr1 |= clSetKernelArg(ckResultKernel, 7, sizeof(cl_float), (void*)&tempScalar);
646 ciErr1 |= clSetKernelArg(ckResultKernel, 8, sizeof(cl_long), (void*)&tempCharCount);
647
648 ciErr1 |= clSetKernelArg(ckReductionKernel, 0, sizeof(cl_mem), (void*)&cmResult_cache);
649
650 //printf("clSetKernelArg 0 - 12...\n\n");
651 if (ciErr1 != CL_SUCCESS)
652 {
653 printf("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
654 Cleanup(EXIT_FAILURE);
655 }
656
657 // --------------------------------------------------------
658 // Start Core sequence... copy input data to GPU, compute, copy results back
659 // Asynchronous write of data to GPU device
660 ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmFreq_cache, CL_FALSE, 0,
661 sizeof(cl_int)*siteCount, freq_cache, 0, NULL, NULL);
662 printf("clEnqueueWriteBuffer (root_cache, etc.)...");
663 if (ciErr1 != CL_SUCCESS)
664 {
665 printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
666 Cleanup(EXIT_FAILURE);
667 }
668 printf(" Done!\n");
669 }
670
oclmain(void)671 double _OCLEvaluator::oclmain(void)
672 {
673 //printf("newLF!\n");
674 //printf("LF");
675 // so far this wholebuffer rebuild takes almost no time at all. Perhaps not true re:queue
676 // Fix the model cache
677 #ifdef __OCLPOSIX__
678 clock_gettime(CLOCK_MONOTONIC, &bufferStart);
679 #endif
680 int roundCharacters = roundUpToNextPowerOfTwo(alphabetDimension);
681 /*
682 printf("Update Nodes:");
683 for (int i = 0; i < updateNodes.lLength; i++)
684 {
685 printf(" %i ", updateNodes.list_data[i]);
686 }
687 printf("\n");
688
689 printf("Tagged Internals:");
690 for (int i = 0; i < taggedInternals.lLength; i++)
691 {
692 printf(" %i", taggedInternals.list_data[i]);
693 }
694 printf("\n");
695 */
696 long nodeCode, parentCode;
697 bool isLeaf;
698 hyFloat* tMatrix;
699 int a1, a2;
700 //printf("updateNodes.lLength: %i", updateNodes.lLength);
701 //#pragma omp parallel for default(none) shared(updateNodes, flatParents, flatLeaves, flatCLeaves, flatTree, alphabetDimension, model, roundCharacters) private(nodeCode, parentCode, isLeaf, tMatrix, a1, a2)
702 for (int nodeID = 0; nodeID < updateNodes.lLength; nodeID++)
703 {
704 nodeCode = updateNodes.list_data[nodeID];
705 parentCode = flatParents.list_data[nodeCode];
706
707 isLeaf = nodeCode < flatLeaves.lLength;
708
709 if (!isLeaf) nodeCode -= flatLeaves.lLength;
710
711 tMatrix = (isLeaf? ((_CalcNode*) flatCLeaves (nodeCode)):
712 ((_CalcNode*) flatTree (nodeCode)))->GetCompExp(0)->theData;
713
714 for (a1 = 0; a1 < alphabetDimension; a1++)
715 {
716 for (a2 = 0; a2 < alphabetDimension; a2++)
717 {
718 ((float*)model)[nodeID*roundCharacters*roundCharacters+a2*roundCharacters+a1] =
719 (float)(tMatrix[a1*alphabetDimension+a2]);
720 }
721 }
722 }
723
724 // enqueueing the read and write buffers takes 1/2 the time, the kernel takes the other 1/2.
725 // with no queueing, however, we still only see ~700lf/s, which isn't much better than the threaded CPU code.
726 ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmModel_cache, CL_TRUE, 0,
727 sizeof(cl_float)*roundCharacters*roundCharacters*(flatParents.lLength-1),
728 model, 0, NULL, NULL);
729 //clFinish(cqCommandQueue);
730 if (ciErr1 != CL_SUCCESS)
731 {
732 printf("%i\n", ciErr1); //prints "1"
733 switch(ciErr1)
734 {
735 case CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
736 case CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
737 case CL_INVALID_MEM_OBJECT: printf("CL_INVALID_MEM_OBJECT\n"); break;
738 case CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
739 case CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
740 // case CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
741 // case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
742 case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
743 case CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
744 case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
745 default: printf("Strange error\n"); //This is printed
746 }
747 printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
748 Cleanup(EXIT_FAILURE);
749 }
750 /*
751 if (ciErr1 != CL_SUCCESS)
752 {
753 printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
754 Cleanup(EXIT_FAILURE);
755 }
756 */
757 #ifdef __OCLPOSIX__
758 clock_gettime(CLOCK_MONOTONIC, &bufferEnd);
759 buffSecs += (bufferEnd.tv_sec - bufferStart.tv_sec)+(bufferEnd.tv_nsec - bufferStart.tv_nsec)/BILLION;
760
761 clock_gettime(CLOCK_MONOTONIC, &queueStart);
762 #endif
763 //printf("Finished writing the model stuff\n");
764 // Launch kernel
765 for (int nodeIndex = 0; nodeIndex < updateNodes.lLength; nodeIndex++)
766 {
767 //printf("NewNode\n");
768 long nodeCode = updateNodes.list_data[nodeIndex],
769 parentCode = flatParents.list_data[nodeCode];
770
771 //printf("NewNode: %i, NodeCode: %i\n", nodeIndex, nodeCode);
772 bool isLeaf = nodeCode < flatLeaves.lLength;
773
774 if (isLeaf)
775 {
776 long nodeCodeTemp = nodeCode;
777 int tempIntTagState = taggedInternals.list_data[parentCode];
778 int ambig = 0;
779 for (int aI = 0; aI < siteCount; aI++)
780 if (lNodeFlags[nodeCode*siteCount + aI] < 0)
781 {
782 ambig = 1;
783 }
784 if (!ambig)
785 {
786 ciErr1 |= clSetKernelArg(ckLeafKernel, 6, sizeof(cl_long), (void*)&nodeCodeTemp);
787 ciErr1 |= clSetKernelArg(ckLeafKernel, 7, sizeof(cl_long), (void*)&parentCode);
788 ciErr1 |= clSetKernelArg(ckLeafKernel, 9, sizeof(cl_int), (void*)&tempIntTagState);
789 ciErr1 |= clSetKernelArg(ckLeafKernel, 10, sizeof(cl_int), (void*)&nodeIndex);
790 taggedInternals.list_data[parentCode] = 1;
791
792 //printf("Leaf!\n");
793 #ifdef __VERBOSE__
794 printf("Leaf/Ambig Started (ParentCode: %i)...", parentCode);
795 #endif
796 ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckLeafKernel, 2, NULL,
797 szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
798 }
799 else
800 {
801 ciErr1 |= clSetKernelArg(ckAmbigKernel, 6, sizeof(cl_long), (void*)&nodeCodeTemp);
802 ciErr1 |= clSetKernelArg(ckAmbigKernel, 7, sizeof(cl_long), (void*)&parentCode);
803 ciErr1 |= clSetKernelArg(ckAmbigKernel, 9, sizeof(cl_int), (void*)&tempIntTagState);
804 ciErr1 |= clSetKernelArg(ckAmbigKernel, 10, sizeof(cl_int), (void*)&nodeIndex);
805 taggedInternals.list_data[parentCode] = 1;
806
807 //printf("ambig!\n");
808 #ifdef __VERBOSE__
809 printf("Leaf/Ambig Started ...");
810 #endif
811 ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckAmbigKernel, 2, NULL,
812 szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
813 }
814 ciErr1 |= clFlush(cqCommandQueue);
815 clFinish(cqCommandQueue);
816 #ifdef __VERBOSE__
817 printf("Finished\n");
818 #endif
819 }
820 else
821 {
822 long tempLeafState = 0;
823 nodeCode -= flatLeaves.lLength;
824 long nodeCodeTemp = nodeCode;
825 int tempIntTagState = taggedInternals.list_data[parentCode];
826 ciErr1 |= clSetKernelArg(ckInternalKernel, 5, sizeof(cl_long), (void*)&nodeCodeTemp);
827 ciErr1 |= clSetKernelArg(ckInternalKernel, 6, sizeof(cl_long), (void*)&parentCode);
828 ciErr1 |= clSetKernelArg(ckInternalKernel, 8, sizeof(cl_int), (void*)&tempIntTagState);
829 ciErr1 |= clSetKernelArg(ckInternalKernel, 9, sizeof(cl_int), (void*)&nodeIndex);
830 taggedInternals.list_data[parentCode] = 1;
831 #ifdef __VERBOSE__
832 printf("Internal Started (ParentCode: %i)...", parentCode);
833 #endif
834 ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckInternalKernel, 2, NULL,
835 szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
836
837 //printf("internal!\n");
838 ciErr1 |= clFlush(cqCommandQueue);
839 clFinish(cqCommandQueue);
840 #ifdef __VERBOSE__
841 printf("Finished\n");
842 #endif
843 }
844 if (ciErr1 != CL_SUCCESS)
845 {
846 printf("%i\n", ciErr1); //prints "1"
847 switch(ciErr1)
848 {
849 case CL_INVALID_PROGRAM_EXECUTABLE: printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
850 case CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
851 case CL_INVALID_KERNEL: printf("CL_INVALID_KERNEL\n"); break;
852 case CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
853 case CL_INVALID_KERNEL_ARGS: printf("CL_INVALID_KERNEL_ARGS\n"); break;
854 case CL_INVALID_WORK_DIMENSION: printf("CL_INVALID_WORK_DIMENSION\n"); break;
855 case CL_INVALID_GLOBAL_WORK_SIZE: printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
856 case CL_INVALID_GLOBAL_OFFSET: printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
857 case CL_INVALID_WORK_GROUP_SIZE: printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
858 case CL_INVALID_WORK_ITEM_SIZE: printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
859 case CL_INVALID_IMAGE_SIZE: printf("CL_INVALID_IMAGE_SIZE\n"); break;
860 case CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
861 case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
862 case CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
863 case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
864 default: printf("Strange error\n"); //This is printed
865 }
866 printf("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
867 Cleanup(EXIT_FAILURE);
868 }
869 }
870 #ifdef __GPUResults__
871 size_t szGlobalWorkSize2 = 256;
872 size_t szLocalWorkSize2 = 256;
873 //size_t szLocalWorkSize2 = roundUpToNextPowerOfTwo(siteCount);
874 //size_t szLocalWorkSize2 = MIN(roundUpToNextPowerOfTwo(siteCount), 256*256);
875 size_t szGlobalWorkSize3 = 256;
876 size_t szLocalWorkSize3 = 256;
877
878 ciErr1 |= clEnqueueNDRangeKernel(cqCommandQueue, ckResultKernel, 1, NULL,
879 &szGlobalWorkSize2, &szLocalWorkSize2, 0, NULL, NULL);
880 ciErr1 |= clEnqueueNDRangeKernel(cqCommandQueue, ckReductionKernel, 1, NULL,
881 &szGlobalWorkSize3, &szLocalWorkSize3, 0, NULL, NULL);
882 #else
883 ciErr1 |= clEnqueueNDRangeKernel(cqCommandQueue, ckResultKernel, 2, NULL,
884 szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
885 #endif
886 /*
887 */
888 /*
889 ciErr1 |= clEnqueueNDRangeKernel(cqCommandQueue, ckReductionKernel, 2, NULL,
890 szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
891 */
892
893 if (ciErr1 != CL_SUCCESS)
894 {
895 printf("%i\n", ciErr1); //prints "1"
896 switch(ciErr1)
897 {
898 case CL_INVALID_PROGRAM_EXECUTABLE: printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
899 case CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
900 case CL_INVALID_KERNEL: printf("CL_INVALID_KERNEL\n"); break;
901 case CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
902 case CL_INVALID_KERNEL_ARGS: printf("CL_INVALID_KERNEL_ARGS\n"); break;
903 case CL_INVALID_WORK_DIMENSION: printf("CL_INVALID_WORK_DIMENSION\n"); break;
904 case CL_INVALID_GLOBAL_WORK_SIZE: printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
905 case CL_INVALID_GLOBAL_OFFSET: printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
906 case CL_INVALID_WORK_GROUP_SIZE: printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
907 case CL_INVALID_WORK_ITEM_SIZE: printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
908 // case CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
909 case CL_INVALID_IMAGE_SIZE: printf("CL_INVALID_IMAGE_SIZE\n"); break;
910 case CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
911 case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
912 case CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
913 case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
914 default: printf("Strange error\n"); //This is printed
915 }
916 printf("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
917 Cleanup(EXIT_FAILURE);
918 }
919 // Synchronous/blocking read of results, and check accumulated errors
920 /*
921 #ifdef __GPUResults_
922 ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmResult_cache, CL_FALSE, 0,
923 sizeof(cl_double)*roundUpToNextPowerOfTwo(siteCount), result_cache, 0,
924 NULL, NULL);
925 //ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmResult_cache, CL_FALSE, 0,
926 // sizeof(cl_double)*1, result_cache, 0,
927 // NULL, NULL);
928 #else
929 //ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmResult_cache, CL_FALSE, 0,
930 // sizeof(cl_float)*roundUpToNextPowerOfTwo(siteCount), result_cache, 0,
931 // NULL, NULL);
932 ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmResult_cache, CL_FALSE, 0,
933 sizeof(cl_float)*roundUpToNextPowerOfTwo(siteCount), result_cache, 0,
934 NULL, NULL);
935
936 #endif
937 */
938 ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmResult_cache, CL_FALSE, 0,
939 sizeof(clfp)*roundUpToNextPowerOfTwo(siteCount), result_cache, 0,
940 NULL, NULL);
941
942 if (ciErr1 != CL_SUCCESS)
943 {
944 printf("%i\n", ciErr1); //prints "1"
945 switch(ciErr1)
946 {
947 case CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
948 case CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
949 case CL_INVALID_MEM_OBJECT: printf("CL_INVALID_MEM_OBJECT\n"); break;
950 case CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
951 case CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
952 // case CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
953 // case CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
954 case CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
955 case CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
956 case CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
957 default: printf("Strange error\n"); //This is printed
958 }
959 printf("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
960 Cleanup(EXIT_FAILURE);
961 }
962 //--------------------------------------------------------
963
964
965 clFinish(cqCommandQueue);
966 double oResult = 0.0;
967
968 #ifdef __GPUResults__
969 /*
970 for (int i = 0; i < siteCount; i++)
971 {
972 oResult += ((fpoint*)result_cache)[i];
973 }
974 #ifdef __VERBOSE__
975 printf("Result_Cache: \n");
976 for (int i = 0; i < siteCount; i++)
977 printf("%4.10g ", ((fpoint*)result_cache)[i]);
978 printf("\n\n");
979 #endif
980 for (int i = 0; i < siteCount; i++)
981 {
982 oResult += ((float*)result_cache)[i];
983 }
984 //oResult = ((double*)result_cache)[0];
985 printf("Result_Cache: \n");
986 for (int i = 0; i < siteCount; i++)
987 printf("%4.10g ", ((fpoint*)result_cache)[i]);
988 printf("\n\n");
989 */
990 oResult = ((fpoint*)result_cache)[0];
991 #else
992 #ifdef __OCLPOSIX__
993 clock_gettime(CLOCK_MONOTONIC, &queueEnd);
994 queueSecs += (queueEnd.tv_sec - queueStart.tv_sec)+(queueEnd.tv_nsec - queueStart.tv_nsec)/BILLION;
995 clock_gettime(CLOCK_MONOTONIC, &mainStart);
996 #endif
997 //#pragma omp parallel for reduction (+:oResult) schedule(static)
998 for (int i = 0; i < siteCount; i++)
999 {
1000 oResult += ((fpoint*)result_cache)[i];
1001 }
1002 #ifdef __VERBOSE__
1003 printf("Result_Cache: \n");
1004 for (int i = 0; i < siteCount; i++)
1005 printf("%4.10g ", ((float*)result_cache)[i]);
1006 printf("\n\n");
1007 #endif
1008 #ifdef __OCLPOSIX__
1009 clock_gettime(CLOCK_MONOTONIC, &mainEnd);
1010 mainSecs += (mainEnd.tv_sec - mainStart.tv_sec)+(mainEnd.tv_nsec - mainStart.tv_nsec)/BILLION;
1011 #endif
1012 #endif
1013 /*
1014 */
1015 /*
1016 //#pragma omp parallel for reduction (+:oResult) schedule(static)
1017 for (int i = 0; i < siteCount; i++)
1018 {
1019 oResult += ((float*)result_cache)[i];
1020 }
1021 printf("Result_Cache: \n");
1022 for (int i = 0; i < siteCount; i++)
1023 printf("%4.10g ", ((float*)result_cache)[i]);
1024 printf("\n\n");
1025 */
1026 /*
1027 //printf("! ");
1028 //return result;
1029 printf("oResult: %4.10g, gpuResult: %4.10g\n", oResult, ((double*)result_cache)[4]);
1030 if (oResult != ((double*)result_cache)[4])
1031 {
1032 printf("Result_Cache: \n");
1033 //for (int i = 0; i < roundUpToNextPowerOfTwo(siteCount); i++)
1034 for (int i = 0; i < 5; i++)
1035 printf("%4.10g ", ((double*)result_cache)[i]);
1036 printf("\n\n");
1037 }
1038 */
1039 return oResult;
1040 }
1041
1042
launchmdsocl(_SimpleList & eupdateNodes,_SimpleList & eflatParents,_SimpleList & eflatNodes,_SimpleList & eflatCLeaves,_SimpleList & eflatLeaves,_SimpleList & eflatTree,hyFloat * etheProbs,_SimpleList & etheFrequencies,long * elNodeFlags,_SimpleList & etaggedInternals,_Vector * elNodeResolutions)1043 double _OCLEvaluator::launchmdsocl( _SimpleList& eupdateNodes,
1044 _SimpleList& eflatParents,
1045 _SimpleList& eflatNodes,
1046 _SimpleList& eflatCLeaves,
1047 _SimpleList& eflatLeaves,
1048 _SimpleList& eflatTree,
1049 hyFloat* etheProbs,
1050 _SimpleList& etheFrequencies,
1051 long* elNodeFlags,
1052 _SimpleList& etaggedInternals,
1053 _Vector* elNodeResolutions)
1054 {
1055 #ifdef __OCLPOSIX__
1056 clock_gettime(CLOCK_MONOTONIC, &mainStart);
1057 #endif
1058
1059
1060 updateNodes = eupdateNodes;
1061 taggedInternals = etaggedInternals;
1062 theFrequencies = etheFrequencies;
1063
1064
1065 if (!contextSet)
1066 {
1067 theProbs = etheProbs;
1068 flatNodes = eflatNodes;
1069 flatCLeaves = eflatCLeaves;
1070 flatLeaves = eflatLeaves;
1071 flatTree = eflatTree;
1072 flatParents = eflatParents;
1073 lNodeFlags = elNodeFlags;
1074 lNodeResolutions = elNodeResolutions;
1075 setupContext();
1076 contextSet = true;
1077 }
1078
1079 #ifdef __OCLPOSIX__
1080 clock_gettime(CLOCK_MONOTONIC, &mainEnd);
1081 mainSecs += (mainEnd.tv_sec - mainStart.tv_sec)+(mainEnd.tv_nsec - mainStart.tv_nsec)/BILLION;
1082 #endif
1083
1084 return oclmain();
1085 }
1086
1087
Cleanup(int iExitCode)1088 void _OCLEvaluator::Cleanup (int iExitCode)
1089 {
1090 if (!clean)
1091 {
1092 printf("Time in main: %.4lf seconds\n", mainSecs);
1093 printf("Time in updating transition buffer: %.4lf seconds\n", buffSecs);
1094 printf("Time in queue: %.4lf seconds\n", queueSecs);
1095 printf("Time in Setup: %.4lf seconds\n", setupSecs);
1096 // Cleanup allocated objects
1097 printf("Starting Cleanup...\n\n");
1098 if(ckLeafKernel)clReleaseKernel(ckLeafKernel);
1099 if(ckInternalKernel)clReleaseKernel(ckInternalKernel);
1100 if(ckAmbigKernel)clReleaseKernel(ckAmbigKernel);
1101 if(ckResultKernel)clReleaseKernel(ckResultKernel);
1102 if(cpLeafProgram)clReleaseProgram(cpLeafProgram);
1103 if(cpInternalProgram)clReleaseProgram(cpInternalProgram);
1104 if(cpAmbigProgram)clReleaseProgram(cpAmbigProgram);
1105 if(cpResultProgram)clReleaseProgram(cpResultProgram);
1106 if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
1107 printf("Halfway...\n\n");
1108 if(cxGPUContext)clReleaseContext(cxGPUContext);
1109 if(cmNode_cache)clReleaseMemObject(cmNode_cache);
1110 if(cmModel_cache)clReleaseMemObject(cmModel_cache);
1111 if(cmNodRes_cache)clReleaseMemObject(cmNodRes_cache);
1112 if(cmNodFlag_cache)clReleaseMemObject(cmNodFlag_cache);
1113 if(cmroot_cache)clReleaseMemObject(cmroot_cache);
1114 if(cmroot_scalings)clReleaseMemObject(cmroot_scalings);
1115 if(cmScalings_cache)clReleaseMemObject(cmScalings_cache);
1116 if(cmFreq_cache)clReleaseMemObject(cmFreq_cache);
1117 if(cmProb_cache)clReleaseMemObject(cmProb_cache);
1118 if(cmResult_cache)clReleaseMemObject(cmResult_cache);
1119 printf("Done with ocl stuff...\n\n");
1120 // Free host memory
1121 free(node_cache);
1122 free(model);
1123 free(nodRes_cache);
1124 free(nodFlag_cache);
1125 free(scalings_cache);
1126 free(prob_cache);
1127 free(freq_cache);
1128 free(root_cache);
1129 free(result_cache);
1130 free(root_scalings);
1131 printf("Done!\n\n");
1132 clean = true;
1133 exit(0);
1134
1135 if (iExitCode = EXIT_FAILURE)
1136 exit (iExitCode);
1137 }
1138 }
1139
roundUpToNextPowerOfTwo(unsigned int x)1140 unsigned int _OCLEvaluator::roundUpToNextPowerOfTwo(unsigned int x)
1141 {
1142 x--;
1143 x |= x >> 1;
1144 x |= x >> 2;
1145 x |= x >> 4;
1146 x |= x >> 8;
1147 x |= x >> 16;
1148 x++;
1149
1150 return x;
1151 }
1152
roundDoubleUpToNextPowerOfTwo(double x)1153 double _OCLEvaluator::roundDoubleUpToNextPowerOfTwo(double x)
1154 {
1155 return pow(2, ceil(log2(x)));
1156 }
1157 #endif
1158
1159