1 /*
2 
3  HyPhy - Hypothesis Testing Using Phylogenies.
4 
5  Copyright (C) 1997-now
6  Core Developers:
7  Sergei L Kosakovsky Pond (sergeilkp@icloud.com)
8  Art FY Poon    (apoon42@uwo.ca)
9  Steven Weaver (sweaver@temple.edu)
10 
11  Module Developers:
12  Lance Hepler (nlhepler@gmail.com)
13  Martin Smith (martin.audacis@gmail.com)
14 
15  Significant contributions from:
16  Spencer V Muse (muse@stat.ncsu.edu)
17  Simon DW Frost (sdf22@cam.ac.uk)
18 
19  Permission is hereby granted, free of charge, to any person obtaining a
20  copy of this software and associated documentation files (the
21  "Software"), to deal in the Software without restriction, including
22  without limitation the rights to use, copy, modify, merge, publish,
23  distribute, sublicense, and/or sell copies of the Software, and to
24  permit persons to whom the Software is furnished to do so, subject to
25  the following conditions:
26 
27  The above copyright notice and this permission notice shall be included
28  in all copies or substantial portions of the Software.
29 
30  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
31  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
32  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
33  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
34  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
35  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
36  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
37 
38  */
39 
40 // *********************************************************************
41 // OpenCL likelihood function Notes:
42 //
43 // Runs computations with OpenCL on the GPU device and then checks results
44 // against basic host CPU/C++ computation.
45 //
46 //
47 // *********************************************************************
48 
49 #ifdef MDSOCL
50 
51 #include <string>
52 #include <stdio.h>
53 #include <assert.h>
54 #include <sys/sysctl.h>
55 #include <sys/stat.h>
56 #include <stdlib.h>
57 #include <stdio.h>
58 #include <time.h>
59 #include <math.h>
60 #include "calcnode.h"
61 
62 #include "opencl_kernels.h"
63 
64 //#define FLOAT
65 //#define OCLVERBOSE
66 
67 #if defined(__APPLE__)
68 #include <OpenCL/OpenCL.h>
69 typedef float fpoint;
70 typedef cl_float clfp;
71 #define FLOATPREC "typedef float fpoint; \n"
72 //#define PRAGMADEF "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n"
73 #define PRAGMADEF " \n"
74 //#pragma OPENCL EXTENSION cl_khr_fp64: enable
75 #elif defined(NVIDIA)
76 #define __GPUResults__
77 #define __OCLPOSIX__
78 #include <oclUtils.h>
79 typedef double fpoint;
80 typedef cl_double clfp;
81 #define FLOATPREC "typedef double fpoint; \n"
82 #define PRAGMADEF "#pragma OPENCL EXTENSION cl_khr_fp64: enable \n"
83 #pragma OPENCL EXTENSION cl_khr_fp64: enable
84 #elif defined(AMD)
85 #define __GPUResults__
86 #define __OCLPOSIX__
87 #include <CL/opencl.h>
88 typedef double fpoint;
89 typedef cl_double clfp;
90 #define FLOATPREC "typedef double fpoint; \n"
91 #define PRAGMADEF "#pragma OPENCL EXTENSION cl_amd_fp64: enable \n"
92 #pragma OPENCL EXTENSION cl_amd_fp64: enable
93 #elif defined(FLOAT)
94 #include <CL/opencl.h>
95 typedef float fpoint;
96 typedef cl_float clfp;
97 #define FLOATPREC "typedef float fpoint; \n"
98 #define PRAGMADEF " \n"
99 #endif
100 
101 //#define __VERBOSE__
102 #define OCLGPU
103 #ifdef OCLGPU
104 #define OCLTARGET " #define BLOCK_SIZE 16 \n"
105 #else
106 #define OCLTARGET " #define BLOCK_SIZE 1 \n"
107 #endif
108 
109 #ifdef __GPUResults__
110 #define OCLGPUResults " #define __GPUResults__ \n"
111 #else
112 #define OCLGPUResults " \n"
113 #endif
114 
115 
116 // #define MIN(a,b) ((a)>(b)?(b):(a))
117 
118 // time stuff:
119 #define BILLION 1E9
120 struct timespec mainStart, mainEnd, bufferStart, bufferEnd, queueStart, queueEnd, setupStart, setupEnd;
121 double mainSecs;
122 double buffSecs;
123 double queueSecs;
124 double setupSecs;
125 
126 bool clean;
127 
128 cl_context cxGPUContext;        // OpenCL context
129 cl_command_queue cqCommandQueue;// OpenCL command que
130 cl_platform_id cpPlatform;      // OpenCL platform
131 cl_device_id cdDevice;          // OpenCL device
132 cl_program cpMLProgram;
133 cl_program cpLeafProgram;
134 cl_program cpInternalProgram;
135 cl_program cpAmbigProgram;
136 cl_program cpResultProgram;
137 cl_kernel ckLeafKernel;
138 cl_kernel ckInternalKernel;
139 cl_kernel ckAmbigKernel;
140 cl_kernel ckResultKernel;
141 cl_kernel ckReductionKernel;
142 size_t szGlobalWorkSize[2];        // 1D var for Total # of work items
143 size_t szLocalWorkSize[2];         // 1D var for # of work items in the work group
144 size_t localMemorySize;         // size of local memory buffer for kernel scratch
145 size_t szParmDataBytes;         // Byte size of context information
146 size_t szKernelLength;          // Byte size of kernel code
147 cl_int ciErr1, ciErr2;          // Error code var
148 
149 cl_mem cmNode_cache;
150 cl_mem cmModel_cache;
151 cl_mem cmNodRes_cache;
152 cl_mem cmNodFlag_cache;
153 cl_mem cmroot_cache;
154 cl_mem cmroot_scalings;
155 cl_mem cmScalings_cache;
156 cl_mem cmFreq_cache;
157 cl_mem cmProb_cache;
158 cl_mem cmResult_cache;
159 long siteCount, alphabetDimension;
160 long* lNodeFlags;
161 _SimpleList     updateNodes,
162                 flatParents,
163                 flatNodes,
164                 flatCLeaves,
165                 flatLeaves,
166                 flatTree,
167                 theFrequencies;
168 hyFloat      *iNodeCache,
169                 *theProbs;
170 _SimpleList taggedInternals;
171 _Vector* lNodeResolutions;
172 float scalar;
173 
174 void *node_cache, *nodRes_cache, *nodFlag_cache, *scalings_cache, *prob_cache, *freq_cache, *root_cache, *result_cache, *root_scalings, *model;
175 
init(long esiteCount,long ealphabetDimension,hyFloat * eiNodeCache)176 void _OCLEvaluator::init(   long esiteCount,
177                                     long ealphabetDimension,
178                                     hyFloat* eiNodeCache)
179 {
180     clean = false;
181     contextSet = false;
182     siteCount = esiteCount;
183     alphabetDimension = ealphabetDimension;
184     iNodeCache = eiNodeCache;
185     mainSecs = 0.0;
186     buffSecs = 0.0;
187     queueSecs = 0.0;
188     setupSecs = 0.0;
189     scalar = 10.0;
190 }
191 
192 // So the two interfacing functions will be the constructor, called in SetupLFCaches, and launchmdsocl, called in ComputeBlock.
193 // Therefore all of these functions need to be finished, the context needs to be setup separately from the execution, the data needs
194 // to be passed piecewise, and a pointer needs to be passed around in likefunc2.cpp. After that things should be going a bit faster,
195 // though honestly this solution is geared towards analyses with a larger number of sites.
196 
197 // *********************************************************************
setupContext(void)198 int _OCLEvaluator::setupContext(void)
199 {
200 #ifdef __OCLPOSIX__
201     clock_gettime(CLOCK_MONOTONIC, &setupStart);
202 #endif
203     //printf("Made it to the oclmain() function!\n");
204 
205     //long nodeResCount = sizeof(lNodeResolutions->theData)/sizeof(lNodeResolutions->theData[0]);
206     long nodeFlagCount = flatLeaves.lLength*siteCount;
207     long nodeResCount = lNodeResolutions->get_used();
208     int roundCharacters = roundUpToNextPowerOfTwo(alphabetDimension);
209 //    long nodeCount = flatLeaves.lLength + flatNodes.lLength + 1;
210 //    long iNodeCount = flatNodes.lLength + 1;
211 
212     bool ambiguousNodes = true;
213     if (nodeResCount == 0)
214     {
215         nodeResCount++;
216         ambiguousNodes = false;
217     }
218 
219     //printf("Got the sizes of nodeRes and nodeFlag: %i, %i\n", nodeResCount, nodeFlagCount);
220 
221     // Make transitionMatrixArray, do other host stuff:
222     node_cache      = (void*)malloc(sizeof(cl_float)*roundCharacters*siteCount*(flatNodes.lLength));
223     nodRes_cache    = (void*)malloc(sizeof(cl_float)*roundUpToNextPowerOfTwo(nodeResCount));
224     nodFlag_cache   = (void*)malloc(sizeof(cl_long)*roundUpToNextPowerOfTwo(nodeFlagCount));
225     scalings_cache  = (void*)malloc(sizeof(cl_int)*roundCharacters*siteCount*(flatNodes.lLength));
226     prob_cache      = (void*)malloc(sizeof(cl_float)*roundCharacters);
227     freq_cache      = (void*)malloc(sizeof(cl_int)*siteCount);
228     freq_cache      = (void*)malloc(sizeof(cl_int)*siteCount);
229     root_cache      = (void*)malloc(sizeof(cl_float)*siteCount*roundCharacters);
230     root_scalings   = (void*)malloc(sizeof(cl_int)*siteCount*roundCharacters);
231 /*
232 #ifdef __GPUResults__
233     result_cache    = (void*)malloc(sizeof(cl_double)*roundUpToNextPowerOfTwo(siteCount));
234 #else
235     result_cache    = (void*)malloc(sizeof(cl_float)*roundUpToNextPowerOfTwo(siteCount));
236 #endif
237 */
238     result_cache    = (void*)malloc(sizeof(clfp)*roundUpToNextPowerOfTwo(siteCount));
239     model           = (void*)malloc(sizeof(cl_float)*roundCharacters*roundCharacters*(flatParents.lLength-1));
240 
241     //printf("Allocated all of the arrays!\n");
242     //printf("setup the model, fixed tagged internals!\n");
243     printf("flatleaves: %ld\n", flatLeaves.lLength);
244     printf("flatParents: %ld\n", flatParents.lLength);
245     //printf("flatCleaves: %i\n", flatCLeaves.lLength);
246     printf("flatNodes: %ld\n", flatNodes.lLength);
247     printf("updateNodes: %ld\n", updateNodes.lLength);
248     printf("flatTree: %ld\n", flatTree.lLength);
249     //printf("nodeFlagCount: %i\n", nodeFlagCount);
250     //printf("nodeResCount: %i\n", nodeResCount);
251 
252     //for (int i = 0; i < nodeCount*siteCount*alphabetDimension; i++)
253     printf("siteCount: %ld, alphabetDimension: %ld \n", siteCount, alphabetDimension);
254     if (ambiguousNodes)
255         for (int i = 0; i < nodeResCount; i++)
256             ((float*)nodRes_cache)[i] = (float)(lNodeResolutions->theData[i]);
257     for (int i = 0; i < nodeFlagCount; i++)
258         ((long*)nodFlag_cache)[i] = lNodeFlags[i];
259     for (int i = 0; i < siteCount; i++)
260         ((int*)freq_cache)[i] = theFrequencies[i];
261     for (int i = 0; i < alphabetDimension; i++)
262         ((float*)prob_cache)[i] = theProbs[i];
263 
264     //printf("Created all of the arrays!\n");
265 
266     // alright, by now taggedInternals have been taken care of, and model has
267     // been filled with all of the transition matrices.
268 
269 #ifdef __OCLPOSIX__
270     clock_gettime(CLOCK_MONOTONIC, &setupEnd);
271     setupSecs += (setupEnd.tv_sec - setupStart.tv_sec)+(setupEnd.tv_nsec - setupStart.tv_nsec)/BILLION;
272 #endif
273 
274 
275 
276     //**************************************************
277 
278     //Get an OpenCL platform
279     ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL);
280 
281 //    printf("clGetPlatformID...\n");
282     if (ciErr1 != CL_SUCCESS)
283     {
284         printf("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
285         Cleanup(EXIT_FAILURE);
286     }
287 
288 
289     //Get the devices
290 #ifdef OCLGPU
291     ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL);
292 #else
293     ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_CPU, 1, &cdDevice, NULL);
294 #endif
295  //   printf("clGetDeviceIDs...\n");
296     if (ciErr1 != CL_SUCCESS)
297     {
298         printf("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
299         Cleanup(EXIT_FAILURE);
300     }
301 
302     size_t maxWorkGroupSize;
303     ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE,
304                              sizeof(size_t), &maxWorkGroupSize, NULL);
305     if (ciErr1 != CL_SUCCESS)
306     {
307         printf("Getting max work group size failed!\n");
308     }
309     printf("Max work group size: %lu\n", (unsigned long)maxWorkGroupSize);
310 
311     size_t maxLocalSize;
312     ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_LOCAL_MEM_SIZE,
313                              sizeof(size_t), &maxLocalSize, NULL);
314     size_t maxConstSize;
315     ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
316                              sizeof(size_t), &maxConstSize, NULL);
317     printf("LocalSize: %ld, Const size: %ld\n", (long unsigned) maxLocalSize, (long unsigned) maxConstSize);
318 
319     printf("sites: %ld\n", siteCount);
320 
321     // set and log Global and Local work size dimensions
322 
323 #ifdef OCLGPU
324     szLocalWorkSize[0] = 16; // All of these will have to be generalized.
325     szLocalWorkSize[1] = 16;
326 #else
327     szLocalWorkSize[0] = 1; // All of these will have to be generalized.
328     szLocalWorkSize[1] = 1;
329 #endif
330     szGlobalWorkSize[0] = 64;
331     //szGlobalWorkSize[1] = ((siteCount + 16)/16)*16;
332     szGlobalWorkSize[1] = roundUpToNextPowerOfTwo(siteCount);
333     //szGlobalWorkSize[1] = roundUpToNextPowerOfTwo(siteCount);
334     printf("Global Work Size \t\t= %ld, %ld\nLocal Work Size \t\t= %ld, %ld\n# of Work Groups \t\t= %ld\n\n",
335            (long unsigned) szGlobalWorkSize[0],
336            (long unsigned) szGlobalWorkSize[1],
337            (long unsigned) szLocalWorkSize[0],
338            (long unsigned) szLocalWorkSize[1],
339            (long unsigned) ((szGlobalWorkSize[0]*szGlobalWorkSize[1])/(szLocalWorkSize[0]*szLocalWorkSize[1])));
340 
341 
342     size_t returned_size = 0;
343     cl_char vendor_name[1024] = {0};
344     cl_char device_name[1024] = {0};
345     ciErr1 = clGetDeviceInfo(cdDevice, CL_DEVICE_VENDOR, sizeof(vendor_name),
346                              vendor_name, &returned_size);
347     ciErr1 |= clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(device_name),
348                               device_name, &returned_size);
349     assert(ciErr1 == CL_SUCCESS);
350 //    printf("Connecting to %s %s...\n", vendor_name, device_name);
351 
352     //Create the context
353     cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1);
354 //    printf("clCreateContext...\n");
355     if (ciErr1 != CL_SUCCESS)
356     {
357         printf("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
358         Cleanup(EXIT_FAILURE);
359     }
360 
361     // Create a command-queue
362     cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, 0, &ciErr1);
363 //    printf("clCreateCommandQueue...\n");
364     if (ciErr1 != CL_SUCCESS)
365     {
366         printf("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
367         Cleanup(EXIT_FAILURE);
368     }
369 
370 
371     printf("Setup all of the OpenCL stuff!\n");
372 
373     // Allocate the OpenCL buffer memory objects for the input and output on the
374     // device GMEM
375     cmNode_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
376                     sizeof(cl_float)*roundCharacters*siteCount*(flatNodes.lLength), node_cache,
377                     &ciErr1);
378     cmModel_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY,
379                     sizeof(cl_float)*roundCharacters*roundCharacters*(flatParents.lLength-1),
380                     NULL, &ciErr2);
381     ciErr1 |= ciErr2;
382     cmScalings_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
383                     sizeof(cl_int)*roundCharacters*siteCount*flatNodes.lLength, scalings_cache, &ciErr2);
384     ciErr1 |= ciErr2;
385     cmNodRes_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
386                     sizeof(cl_float)*roundUpToNextPowerOfTwo(nodeResCount), nodRes_cache, &ciErr2);
387     ciErr1 |= ciErr2;
388     cmNodFlag_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
389                     sizeof(cl_long)*roundUpToNextPowerOfTwo(nodeFlagCount), nodFlag_cache, &ciErr2);
390     ciErr1 |= ciErr2;
391     cmroot_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
392                     sizeof(cl_float)*siteCount*roundCharacters, NULL, &ciErr2);
393     ciErr1 |= ciErr2;
394     cmroot_scalings = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE,
395                     sizeof(cl_int)*siteCount*roundCharacters, NULL, &ciErr2);
396     ciErr1 |= ciErr2;
397     cmProb_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
398                     sizeof(cl_float)*roundCharacters, prob_cache, &ciErr2);
399     ciErr1 |= ciErr2;
400     cmFreq_cache = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY,
401                     sizeof(cl_float)*siteCount, NULL, &ciErr2);
402     ciErr1 |= ciErr2;
403     //cmResult_cache = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
404      //               sizeof(cl_float)*siteCount, NULL, &ciErr2);
405 /*
406 #ifdef __GPUResults__
407     cmResult_cache = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
408                     sizeof(cl_double)*roundUpToNextPowerOfTwo(siteCount), NULL, &ciErr2);
409 #else
410     cmResult_cache = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
411                     sizeof(cl_float)*roundUpToNextPowerOfTwo(siteCount), NULL, &ciErr2);
412 #endif
413 */
414     cmResult_cache = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY,
415                     sizeof(clfp)*roundUpToNextPowerOfTwo(siteCount), NULL, &ciErr2);
416     ciErr1 |= ciErr2;
417 //    printf("clCreateBuffer...\n");
418     if (ciErr1 != CL_SUCCESS)
419     {
420         printf("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
421         switch(ciErr1)
422         {
423             case   CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
424             case   CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
425             case   CL_INVALID_BUFFER_SIZE: printf("CL_INVALID_BUFFER_SIZE\n"); break;
426             case   CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
427             case   CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
428             default: printf("Strange error\n");
429         }
430         Cleanup(EXIT_FAILURE);
431     }
432 
433 
434 #ifdef __OCLPOSIX__
435     clock_gettime(CLOCK_MONOTONIC, &setupStart);
436 #endif
437 /*
438     for (int i = 0; i < siteCount*roundCharacters; i++)
439     {
440         (root_cache)[i] = 0.0;
441         (root_scalings)[i] = 1;
442     }
443 */
444 #ifdef __OCLPOSIX__
445     clock_gettime(CLOCK_MONOTONIC, &setupEnd);
446     setupSecs += (setupEnd.tv_sec - setupStart.tv_sec)+(setupEnd.tv_nsec - setupStart.tv_nsec)/BILLION;
447 #endif
448 
449     printf("Made all of the buffers on the device!\n");
450 
451 //    printf("clCreateBuffer...\n");
452     if (ciErr1 != CL_SUCCESS)
453     {
454         printf("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
455         Cleanup(EXIT_FAILURE);
456     }
457 
458     //  "" FLOATPREC
459     // Create the program
460     const char * program_source = "" OCLTARGET PRAGMADEF FLOATPREC OCLGPUResults KERNEL_STRING;
461 
462 // TODO: result_cache size can be reduced to siteCount/BLOCK_SIZE
463     cpMLProgram = clCreateProgramWithSource(cxGPUContext, 1, &program_source,
464                                           NULL, &ciErr1);
465     if (ciErr1 != CL_SUCCESS)
466     {
467         printf("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
468         Cleanup(EXIT_FAILURE);
469     }
470 
471     ciErr1 = clBuildProgram(cpMLProgram, 1, &cdDevice, "-cl-mad-enable -cl-fast-relaxed-math", NULL, NULL);
472     //ciErr1 = clBuildProgram(cpMLProgram, 1, &cdDevice, NULL, NULL, NULL);
473     if (ciErr1 != CL_SUCCESS)
474     {
475         printf("%i\n", ciErr1); //prints "1"
476         switch(ciErr1)
477         {
478             case   CL_INVALID_PROGRAM: printf("CL_INVALID_PROGRAM\n"); break;
479             case   CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
480             case   CL_INVALID_DEVICE: printf("CL_INVALID_DEVICE\n"); break;
481             case   CL_INVALID_BINARY: printf("CL_INVALID_BINARY\n"); break;
482             case   CL_INVALID_BUILD_OPTIONS: printf("CL_INVALID_BUILD_OPTIONS\n"); break;
483             case   CL_COMPILER_NOT_AVAILABLE: printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
484             case   CL_BUILD_PROGRAM_FAILURE: printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
485             case   CL_INVALID_OPERATION: printf("CL_INVALID_OPERATION\n"); break;
486             case   CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
487             default: printf("Strange error\n"); //This is printed
488         }
489         printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
490         Cleanup(EXIT_FAILURE);
491     }
492 
493 
494     // Shows the log
495     char* build_log;
496     size_t log_size;
497     // First call to know the proper size
498     clGetProgramBuildInfo(cpMLProgram, cdDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
499     build_log = new char[log_size+1];
500     // Second call to get the log
501     clGetProgramBuildInfo(cpMLProgram, cdDevice, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL);
502     build_log[log_size] = '\0';
503     printf("%s", build_log);
504     delete[] build_log;
505 
506     if (ciErr1 != CL_SUCCESS)
507     {
508         printf("%i\n", ciErr1); //prints "1"
509         switch(ciErr1)
510         {
511             case   CL_INVALID_PROGRAM: printf("CL_INVALID_PROGRAM\n"); break;
512             case   CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
513             case   CL_INVALID_DEVICE: printf("CL_INVALID_DEVICE\n"); break;
514             case   CL_INVALID_BINARY: printf("CL_INVALID_BINARY\n"); break;
515             case   CL_INVALID_BUILD_OPTIONS: printf("CL_INVALID_BUILD_OPTIONS\n"); break;
516             case   CL_COMPILER_NOT_AVAILABLE: printf("CL_COMPILER_NOT_AVAILABLE\n"); break;
517             case   CL_BUILD_PROGRAM_FAILURE: printf("CL_BUILD_PROGRAM_FAILURE\n"); break;
518             case   CL_INVALID_OPERATION: printf("CL_INVALID_OPERATION\n"); break;
519             case   CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
520             default: printf("Strange error\n"); //This is printed
521         }
522         printf("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
523         Cleanup(EXIT_FAILURE);
524     }
525 
526     // Create the kernel
527     //ckKernel = clCreateKernel(cpProgram, "FirstLoop", &ciErr1);
528     ckLeafKernel = clCreateKernel(cpMLProgram, "LeafKernel", &ciErr1);
529     printf("clCreateKernel (LeafKernel)...\n");
530     if (ciErr1 != CL_SUCCESS)
531     {
532         printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
533         Cleanup(EXIT_FAILURE);
534     }
535     ckAmbigKernel = clCreateKernel(cpMLProgram, "AmbigKernel", &ciErr1);
536     printf("clCreateKernel (AmbigKernel)...\n");
537     if (ciErr1 != CL_SUCCESS)
538     {
539         printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
540         Cleanup(EXIT_FAILURE);
541     }
542     ckInternalKernel = clCreateKernel(cpMLProgram, "InternalKernel", &ciErr1);
543     printf("clCreateKernel (InternalKernel)...\n");
544     if (ciErr1 != CL_SUCCESS)
545     {
546         printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
547         Cleanup(EXIT_FAILURE);
548     }
549     ckResultKernel = clCreateKernel(cpMLProgram, "ResultKernel", &ciErr1);
550     printf("clCreateKernel (ResultKernel)...\n");
551     if (ciErr1 != CL_SUCCESS)
552     {
553         printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
554         Cleanup(EXIT_FAILURE);
555     }
556     ckReductionKernel = clCreateKernel(cpMLProgram, "ReductionKernel", &ciErr1);
557     printf("clCreateKernel (ReductionKernel)...\n");
558     if (ciErr1 != CL_SUCCESS)
559     {
560         printf("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
561         Cleanup(EXIT_FAILURE);
562     }
563 
564     size_t maxKernelSize;
565     ciErr1 = clGetKernelWorkGroupInfo(ckLeafKernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE,
566                              sizeof(size_t), &maxKernelSize, NULL);
567     printf("Max Leaf Kernel Work Group Size: %ld \n", (long unsigned) maxKernelSize);
568     ciErr1 = clGetKernelWorkGroupInfo(ckAmbigKernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE,
569                              sizeof(size_t), &maxKernelSize, NULL);
570     printf("Max Ambig Kernel Work Group Size: %ld \n", (long unsigned) maxKernelSize);
571     ciErr1 = clGetKernelWorkGroupInfo(ckInternalKernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE,
572                              sizeof(size_t), &maxKernelSize, NULL);
573     printf("Max Internal Kernel Work Group Size: %ld \n", (long unsigned) maxKernelSize);
574     ciErr1 = clGetKernelWorkGroupInfo(ckResultKernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE,
575                              sizeof(size_t), &maxKernelSize, NULL);
576     printf("Max Result Kernel Work Group Size: %ld \n", (long unsigned) maxKernelSize);
577     ciErr1 = clGetKernelWorkGroupInfo(ckReductionKernel, cdDevice, CL_KERNEL_WORK_GROUP_SIZE,
578                              sizeof(size_t), &maxKernelSize, NULL);
579     printf("Max Reduction Kernel Work Group Size: %ld \n", (long unsigned) maxKernelSize);
580 
581     long tempLeafState = 1;
582     long tempSiteCount = siteCount;
583     long tempCharCount = alphabetDimension;
584     long tempChildNodeIndex = 0;
585     long tempParentNodeIndex = 0;
586     long tempRoundCharCount = roundUpToNextPowerOfTwo(alphabetDimension);
587     int  tempTagIntState = 0;
588     int   tempNodeID = 0;
589     float tempScalar = scalar;
590     float tempuFlowThresh = 0.000000001f;
591 
592     ciErr1  = clSetKernelArg(ckLeafKernel, 0, sizeof(cl_mem), (void*)&cmNode_cache);
593     ciErr1 |= clSetKernelArg(ckLeafKernel, 1, sizeof(cl_mem), (void*)&cmModel_cache);
594     ciErr1 |= clSetKernelArg(ckLeafKernel, 2, sizeof(cl_mem), (void*)&cmNodRes_cache);
595     ciErr1 |= clSetKernelArg(ckLeafKernel, 3, sizeof(cl_mem), (void*)&cmNodFlag_cache);
596     ciErr1 |= clSetKernelArg(ckLeafKernel, 4, sizeof(cl_long), (void*)&tempSiteCount);
597     ciErr1 |= clSetKernelArg(ckLeafKernel, 5, sizeof(cl_long), (void*)&tempCharCount);
598     ciErr1 |= clSetKernelArg(ckLeafKernel, 6, sizeof(cl_long), (void*)&tempChildNodeIndex); // reset this in the loop
599     ciErr1 |= clSetKernelArg(ckLeafKernel, 7, sizeof(cl_long), (void*)&tempParentNodeIndex); // reset this in the loop
600     ciErr1 |= clSetKernelArg(ckLeafKernel, 8, sizeof(cl_long), (void*)&tempRoundCharCount);
601     ciErr1 |= clSetKernelArg(ckLeafKernel, 9, sizeof(cl_int), (void*)&tempTagIntState); // reset this in the loop
602     ciErr1 |= clSetKernelArg(ckLeafKernel, 10, sizeof(cl_int), (void*)&tempNodeID); // reset this in the loop
603     ciErr1 |= clSetKernelArg(ckLeafKernel, 11, sizeof(cl_mem), (void*)&cmScalings_cache);
604     ciErr1 |= clSetKernelArg(ckLeafKernel, 12, sizeof(cl_float), (void*)&tempScalar);
605     ciErr1 |= clSetKernelArg(ckLeafKernel, 13, sizeof(cl_float), (void*)&tempuFlowThresh);
606 
607     ciErr1 |= clSetKernelArg(ckAmbigKernel, 0, sizeof(cl_mem), (void*)&cmNode_cache);
608     ciErr1 |= clSetKernelArg(ckAmbigKernel, 1, sizeof(cl_mem), (void*)&cmModel_cache);
609     ciErr1 |= clSetKernelArg(ckAmbigKernel, 2, sizeof(cl_mem), (void*)&cmNodRes_cache);
610     ciErr1 |= clSetKernelArg(ckAmbigKernel, 3, sizeof(cl_mem), (void*)&cmNodFlag_cache);
611     ciErr1 |= clSetKernelArg(ckAmbigKernel, 4, sizeof(cl_long), (void*)&tempSiteCount);
612     ciErr1 |= clSetKernelArg(ckAmbigKernel, 5, sizeof(cl_long), (void*)&tempCharCount);
613     ciErr1 |= clSetKernelArg(ckAmbigKernel, 6, sizeof(cl_long), (void*)&tempChildNodeIndex); // reset this in the loop
614     ciErr1 |= clSetKernelArg(ckAmbigKernel, 7, sizeof(cl_long), (void*)&tempParentNodeIndex); // reset this in the loop
615     ciErr1 |= clSetKernelArg(ckAmbigKernel, 8, sizeof(cl_long), (void*)&tempRoundCharCount);
616     ciErr1 |= clSetKernelArg(ckAmbigKernel, 9, sizeof(cl_int), (void*)&tempTagIntState);
617     ciErr1 |= clSetKernelArg(ckAmbigKernel, 10, sizeof(cl_int), (void*)&tempNodeID);
618     ciErr1 |= clSetKernelArg(ckAmbigKernel, 11, sizeof(cl_mem), (void*)&cmScalings_cache);
619     ciErr1 |= clSetKernelArg(ckAmbigKernel, 12, sizeof(cl_float), (void*)&tempScalar);
620     ciErr1 |= clSetKernelArg(ckAmbigKernel, 13, sizeof(cl_float), (void*)&tempuFlowThresh);
621 
622     ciErr1 |= clSetKernelArg(ckInternalKernel, 0, sizeof(cl_mem), (void*)&cmNode_cache);
623     ciErr1 |= clSetKernelArg(ckInternalKernel, 1, sizeof(cl_mem), (void*)&cmModel_cache);
624     ciErr1 |= clSetKernelArg(ckInternalKernel, 2, sizeof(cl_mem), (void*)&cmNodRes_cache);
625     ciErr1 |= clSetKernelArg(ckInternalKernel, 3, sizeof(cl_long), (void*)&tempSiteCount);
626     ciErr1 |= clSetKernelArg(ckInternalKernel, 4, sizeof(cl_long), (void*)&tempCharCount);
627     ciErr1 |= clSetKernelArg(ckInternalKernel, 5, sizeof(cl_long), (void*)&tempChildNodeIndex); // reset this in the loop
628     ciErr1 |= clSetKernelArg(ckInternalKernel, 6, sizeof(cl_long), (void*)&tempParentNodeIndex); // reset this in the loop
629     ciErr1 |= clSetKernelArg(ckInternalKernel, 7, sizeof(cl_long), (void*)&tempRoundCharCount);
630     ciErr1 |= clSetKernelArg(ckInternalKernel, 8, sizeof(cl_int), (void*)&tempTagIntState); // reset this in the loop
631     ciErr1 |= clSetKernelArg(ckInternalKernel, 9, sizeof(cl_int), (void*)&tempNodeID); // reset this in the loop
632     ciErr1 |= clSetKernelArg(ckInternalKernel, 10, sizeof(cl_mem), (void*)&cmroot_cache);
633     ciErr1 |= clSetKernelArg(ckInternalKernel, 11, sizeof(cl_mem), (void*)&cmScalings_cache);
634     ciErr1 |= clSetKernelArg(ckInternalKernel, 12, sizeof(cl_float), (void*)&tempScalar);
635     ciErr1 |= clSetKernelArg(ckInternalKernel, 13, sizeof(cl_float), (void*)&tempuFlowThresh);
636     ciErr1 |= clSetKernelArg(ckInternalKernel, 14, sizeof(cl_mem), (void*)&cmroot_scalings);
637 
638     ciErr1 |= clSetKernelArg(ckResultKernel, 0, sizeof(cl_mem), (void*)&cmFreq_cache);
639     ciErr1 |= clSetKernelArg(ckResultKernel, 1, sizeof(cl_mem), (void*)&cmProb_cache);
640     ciErr1 |= clSetKernelArg(ckResultKernel, 2, sizeof(cl_mem), (void*)&cmResult_cache);
641     ciErr1 |= clSetKernelArg(ckResultKernel, 3, sizeof(cl_mem), (void*)&cmroot_cache);
642     ciErr1 |= clSetKernelArg(ckResultKernel, 4, sizeof(cl_mem), (void*)&cmroot_scalings);
643     ciErr1 |= clSetKernelArg(ckResultKernel, 5, sizeof(cl_long), (void*)&tempSiteCount);
644     ciErr1 |= clSetKernelArg(ckResultKernel, 6, sizeof(cl_long), (void*)&tempRoundCharCount);
645     ciErr1 |= clSetKernelArg(ckResultKernel, 7, sizeof(cl_float), (void*)&tempScalar);
646     ciErr1 |= clSetKernelArg(ckResultKernel, 8, sizeof(cl_long), (void*)&tempCharCount);
647 
648     ciErr1 |= clSetKernelArg(ckReductionKernel, 0, sizeof(cl_mem), (void*)&cmResult_cache);
649 
650     //printf("clSetKernelArg 0 - 12...\n\n");
651     if (ciErr1 != CL_SUCCESS)
652     {
653         printf("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
654         Cleanup(EXIT_FAILURE);
655     }
656 
657     // --------------------------------------------------------
658     // Start Core sequence... copy input data to GPU, compute, copy results back
659     // Asynchronous write of data to GPU device
660     ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmFreq_cache, CL_FALSE, 0,
661                 sizeof(cl_int)*siteCount, freq_cache, 0, NULL, NULL);
662     printf("clEnqueueWriteBuffer (root_cache, etc.)...");
663     if (ciErr1 != CL_SUCCESS)
664     {
665         printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
666         Cleanup(EXIT_FAILURE);
667     }
668     printf(" Done!\n");
669 }
670 
oclmain(void)671 double _OCLEvaluator::oclmain(void)
672 {
673     //printf("newLF!\n");
674     //printf("LF");
675     // so far this wholebuffer rebuild takes almost no time at all. Perhaps not true re:queue
676     // Fix the model cache
677 #ifdef __OCLPOSIX__
678     clock_gettime(CLOCK_MONOTONIC, &bufferStart);
679 #endif
680     int roundCharacters = roundUpToNextPowerOfTwo(alphabetDimension);
681 /*
682     printf("Update Nodes:");
683     for (int i = 0; i < updateNodes.lLength; i++)
684     {
685         printf(" %i ", updateNodes.list_data[i]);
686     }
687     printf("\n");
688 
689     printf("Tagged Internals:");
690     for (int i = 0; i < taggedInternals.lLength; i++)
691     {
692         printf(" %i", taggedInternals.list_data[i]);
693     }
694     printf("\n");
695 */
696     long nodeCode, parentCode;
697     bool isLeaf;
698     hyFloat* tMatrix;
699     int a1, a2;
700     //printf("updateNodes.lLength: %i", updateNodes.lLength);
701     //#pragma omp parallel for default(none) shared(updateNodes, flatParents, flatLeaves, flatCLeaves, flatTree, alphabetDimension, model, roundCharacters) private(nodeCode, parentCode, isLeaf, tMatrix, a1, a2)
702     for (int nodeID = 0; nodeID < updateNodes.lLength; nodeID++)
703     {
704         nodeCode = updateNodes.list_data[nodeID];
705         parentCode = flatParents.list_data[nodeCode];
706 
707         isLeaf = nodeCode < flatLeaves.lLength;
708 
709         if (!isLeaf) nodeCode -= flatLeaves.lLength;
710 
711         tMatrix = (isLeaf? ((_CalcNode*) flatCLeaves (nodeCode)):
712                    ((_CalcNode*) flatTree    (nodeCode)))->GetCompExp(0)->theData;
713 
714         for (a1 = 0; a1 < alphabetDimension; a1++)
715         {
716             for (a2 = 0; a2 < alphabetDimension; a2++)
717             {
718                 ((float*)model)[nodeID*roundCharacters*roundCharacters+a2*roundCharacters+a1] =
719                    (float)(tMatrix[a1*alphabetDimension+a2]);
720             }
721         }
722     }
723 
724     // enqueueing the read and write buffers takes 1/2 the time, the kernel takes the other 1/2.
725     // with no queueing, however, we still only see ~700lf/s, which isn't much better than the threaded CPU code.
726     ciErr1 |= clEnqueueWriteBuffer(cqCommandQueue, cmModel_cache, CL_TRUE, 0,
727                 sizeof(cl_float)*roundCharacters*roundCharacters*(flatParents.lLength-1),
728                 model, 0, NULL, NULL);
729     //clFinish(cqCommandQueue);
730     if (ciErr1 != CL_SUCCESS)
731     {
732         printf("%i\n", ciErr1); //prints "1"
733         switch(ciErr1)
734         {
735             case   CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
736             case   CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
737             case   CL_INVALID_MEM_OBJECT: printf("CL_INVALID_MEM_OBJECT\n"); break;
738             case   CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
739             case   CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
740                 //          case   CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
741                 //          case   CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
742             case   CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
743             case   CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
744             case   CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
745             default: printf("Strange error\n"); //This is printed
746         }
747         printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
748         Cleanup(EXIT_FAILURE);
749     }
750     /*
751     if (ciErr1 != CL_SUCCESS)
752     {
753         printf("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
754         Cleanup(EXIT_FAILURE);
755     }
756     */
757 #ifdef __OCLPOSIX__
758     clock_gettime(CLOCK_MONOTONIC, &bufferEnd);
759     buffSecs += (bufferEnd.tv_sec - bufferStart.tv_sec)+(bufferEnd.tv_nsec - bufferStart.tv_nsec)/BILLION;
760 
761     clock_gettime(CLOCK_MONOTONIC, &queueStart);
762 #endif
763     //printf("Finished writing the model stuff\n");
764     // Launch kernel
765     for (int nodeIndex = 0; nodeIndex < updateNodes.lLength; nodeIndex++)
766     {
767         //printf("NewNode\n");
768         long    nodeCode = updateNodes.list_data[nodeIndex],
769                 parentCode = flatParents.list_data[nodeCode];
770 
771         //printf("NewNode: %i, NodeCode: %i\n", nodeIndex, nodeCode);
772         bool isLeaf = nodeCode < flatLeaves.lLength;
773 
774         if (isLeaf)
775         {
776             long nodeCodeTemp = nodeCode;
777             int tempIntTagState = taggedInternals.list_data[parentCode];
778             int ambig = 0;
779             for (int aI = 0; aI < siteCount; aI++)
780                 if (lNodeFlags[nodeCode*siteCount + aI] < 0)
781                     {
782                         ambig = 1;
783                     }
784             if (!ambig)
785             {
786                 ciErr1 |= clSetKernelArg(ckLeafKernel, 6, sizeof(cl_long), (void*)&nodeCodeTemp);
787                 ciErr1 |= clSetKernelArg(ckLeafKernel, 7, sizeof(cl_long), (void*)&parentCode);
788                 ciErr1 |= clSetKernelArg(ckLeafKernel, 9, sizeof(cl_int), (void*)&tempIntTagState);
789                 ciErr1 |= clSetKernelArg(ckLeafKernel, 10, sizeof(cl_int), (void*)&nodeIndex);
790                 taggedInternals.list_data[parentCode] = 1;
791 
792                 //printf("Leaf!\n");
793 #ifdef __VERBOSE__
794                 printf("Leaf/Ambig Started (ParentCode: %i)...", parentCode);
795 #endif
796                 ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckLeafKernel, 2, NULL,
797                                                 szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
798             }
799             else
800             {
801                 ciErr1 |= clSetKernelArg(ckAmbigKernel, 6, sizeof(cl_long), (void*)&nodeCodeTemp);
802                 ciErr1 |= clSetKernelArg(ckAmbigKernel, 7, sizeof(cl_long), (void*)&parentCode);
803                 ciErr1 |= clSetKernelArg(ckAmbigKernel, 9, sizeof(cl_int), (void*)&tempIntTagState);
804                 ciErr1 |= clSetKernelArg(ckAmbigKernel, 10, sizeof(cl_int), (void*)&nodeIndex);
805                 taggedInternals.list_data[parentCode] = 1;
806 
807                 //printf("ambig!\n");
808 #ifdef __VERBOSE__
809                 printf("Leaf/Ambig Started ...");
810 #endif
811                 ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckAmbigKernel, 2, NULL,
812                                                 szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
813             }
814             ciErr1 |= clFlush(cqCommandQueue);
815             clFinish(cqCommandQueue);
816 #ifdef __VERBOSE__
817             printf("Finished\n");
818 #endif
819         }
820         else
821         {
822             long tempLeafState = 0;
823             nodeCode -= flatLeaves.lLength;
824             long nodeCodeTemp = nodeCode;
825             int tempIntTagState = taggedInternals.list_data[parentCode];
826             ciErr1 |= clSetKernelArg(ckInternalKernel, 5, sizeof(cl_long), (void*)&nodeCodeTemp);
827             ciErr1 |= clSetKernelArg(ckInternalKernel, 6, sizeof(cl_long), (void*)&parentCode);
828             ciErr1 |= clSetKernelArg(ckInternalKernel, 8, sizeof(cl_int), (void*)&tempIntTagState);
829             ciErr1 |= clSetKernelArg(ckInternalKernel, 9, sizeof(cl_int), (void*)&nodeIndex);
830             taggedInternals.list_data[parentCode] = 1;
831 #ifdef __VERBOSE__
832             printf("Internal Started (ParentCode: %i)...", parentCode);
833 #endif
834             ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckInternalKernel, 2, NULL,
835                                             szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
836 
837             //printf("internal!\n");
838             ciErr1 |= clFlush(cqCommandQueue);
839             clFinish(cqCommandQueue);
840 #ifdef __VERBOSE__
841             printf("Finished\n");
842 #endif
843         }
844         if (ciErr1 != CL_SUCCESS)
845         {
846             printf("%i\n", ciErr1); //prints "1"
847             switch(ciErr1)
848             {
849                 case   CL_INVALID_PROGRAM_EXECUTABLE: printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
850                 case   CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
851                 case   CL_INVALID_KERNEL: printf("CL_INVALID_KERNEL\n"); break;
852                 case   CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
853                 case   CL_INVALID_KERNEL_ARGS: printf("CL_INVALID_KERNEL_ARGS\n"); break;
854                 case   CL_INVALID_WORK_DIMENSION: printf("CL_INVALID_WORK_DIMENSION\n"); break;
855                 case   CL_INVALID_GLOBAL_WORK_SIZE: printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
856                 case   CL_INVALID_GLOBAL_OFFSET: printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
857                 case   CL_INVALID_WORK_GROUP_SIZE: printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
858                 case   CL_INVALID_WORK_ITEM_SIZE: printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
859                 case   CL_INVALID_IMAGE_SIZE: printf("CL_INVALID_IMAGE_SIZE\n"); break;
860                 case   CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
861                 case   CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
862                 case   CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
863                 case   CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
864                 default: printf("Strange error\n"); //This is printed
865             }
866             printf("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
867             Cleanup(EXIT_FAILURE);
868         }
869     }
870 #ifdef __GPUResults__
871 	size_t szGlobalWorkSize2 = 256;
872 	size_t szLocalWorkSize2 = 256;
873 	//size_t szLocalWorkSize2 = roundUpToNextPowerOfTwo(siteCount);
874 	//size_t szLocalWorkSize2 = MIN(roundUpToNextPowerOfTwo(siteCount), 256*256);
875 	size_t szGlobalWorkSize3 = 256;
876 	size_t szLocalWorkSize3 = 256;
877 
878     ciErr1 |= clEnqueueNDRangeKernel(cqCommandQueue, ckResultKernel, 1, NULL,
879         &szGlobalWorkSize2, &szLocalWorkSize2, 0, NULL, NULL);
880     ciErr1 |= clEnqueueNDRangeKernel(cqCommandQueue, ckReductionKernel, 1, NULL,
881         &szGlobalWorkSize3, &szLocalWorkSize3, 0, NULL, NULL);
882 #else
883     ciErr1 |= clEnqueueNDRangeKernel(cqCommandQueue, ckResultKernel, 2, NULL,
884         szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
885 #endif
886 /*
887 */
888 	/*
889     ciErr1 |= clEnqueueNDRangeKernel(cqCommandQueue, ckReductionKernel, 2, NULL,
890         szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL);
891 	*/
892 
893     if (ciErr1 != CL_SUCCESS)
894     {
895         printf("%i\n", ciErr1); //prints "1"
896         switch(ciErr1)
897         {
898             case   CL_INVALID_PROGRAM_EXECUTABLE: printf("CL_INVALID_PROGRAM_EXECUTABLE\n"); break;
899             case   CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
900             case   CL_INVALID_KERNEL: printf("CL_INVALID_KERNEL\n"); break;
901             case   CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
902             case   CL_INVALID_KERNEL_ARGS: printf("CL_INVALID_KERNEL_ARGS\n"); break;
903             case   CL_INVALID_WORK_DIMENSION: printf("CL_INVALID_WORK_DIMENSION\n"); break;
904             case   CL_INVALID_GLOBAL_WORK_SIZE: printf("CL_INVALID_GLOBAL_WORK_SIZE\n"); break;
905             case   CL_INVALID_GLOBAL_OFFSET: printf("CL_INVALID_GLOBAL_OFFSET\n"); break;
906             case   CL_INVALID_WORK_GROUP_SIZE: printf("CL_INVALID_WORK_GROUP_SIZE\n"); break;
907             case   CL_INVALID_WORK_ITEM_SIZE: printf("CL_INVALID_WORK_ITEM_SIZE\n"); break;
908                 //          case   CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
909             case   CL_INVALID_IMAGE_SIZE: printf("CL_INVALID_IMAGE_SIZE\n"); break;
910             case   CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
911             case   CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
912             case   CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
913             case   CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
914             default: printf("Strange error\n"); //This is printed
915         }
916         printf("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
917         Cleanup(EXIT_FAILURE);
918     }
919     // Synchronous/blocking read of results, and check accumulated errors
920 /*
921 #ifdef __GPUResults_
922     ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmResult_cache, CL_FALSE, 0,
923             sizeof(cl_double)*roundUpToNextPowerOfTwo(siteCount), result_cache, 0,
924             NULL, NULL);
925     //ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmResult_cache, CL_FALSE, 0,
926      //       sizeof(cl_double)*1, result_cache, 0,
927       //      NULL, NULL);
928 #else
929     //ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmResult_cache, CL_FALSE, 0,
930      //       sizeof(cl_float)*roundUpToNextPowerOfTwo(siteCount), result_cache, 0,
931       //      NULL, NULL);
932     ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmResult_cache, CL_FALSE, 0,
933             sizeof(cl_float)*roundUpToNextPowerOfTwo(siteCount), result_cache, 0,
934             NULL, NULL);
935 
936 #endif
937 */
938     ciErr1 = clEnqueueReadBuffer(cqCommandQueue, cmResult_cache, CL_FALSE, 0,
939             sizeof(clfp)*roundUpToNextPowerOfTwo(siteCount), result_cache, 0,
940             NULL, NULL);
941 
942     if (ciErr1 != CL_SUCCESS)
943     {
944         printf("%i\n", ciErr1); //prints "1"
945         switch(ciErr1)
946         {
947             case   CL_INVALID_COMMAND_QUEUE: printf("CL_INVALID_COMMAND_QUEUE\n"); break;
948             case   CL_INVALID_CONTEXT: printf("CL_INVALID_CONTEXT\n"); break;
949             case   CL_INVALID_MEM_OBJECT: printf("CL_INVALID_MEM_OBJECT\n"); break;
950             case   CL_INVALID_VALUE: printf("CL_INVALID_VALUE\n"); break;
951             case   CL_INVALID_EVENT_WAIT_LIST: printf("CL_INVALID_EVENT_WAIT_LIST\n"); break;
952                 //          case   CL_MISALIGNED_SUB_BUFFER_OFFSET: printf("CL_MISALIGNED_SUB_BUFFER_OFFSET\n"); break;
953                 //          case   CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST: printf("CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST\n"); break;
954             case   CL_MEM_OBJECT_ALLOCATION_FAILURE: printf("CL_MEM_OBJECT_ALLOCATION_FAILURE\n"); break;
955             case   CL_OUT_OF_RESOURCES: printf("CL_OUT_OF_RESOURCES\n"); break;
956             case   CL_OUT_OF_HOST_MEMORY: printf("CL_OUT_OF_HOST_MEMORY\n"); break;
957             default: printf("Strange error\n"); //This is printed
958         }
959         printf("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__);
960         Cleanup(EXIT_FAILURE);
961     }
962     //--------------------------------------------------------
963 
964 
965     clFinish(cqCommandQueue);
966     double oResult = 0.0;
967 
968 #ifdef __GPUResults__
969 	/*
970     for (int i = 0; i < siteCount; i++)
971     {
972         oResult += ((fpoint*)result_cache)[i];
973     }
974 #ifdef __VERBOSE__
975     printf("Result_Cache: \n");
976     for (int i = 0; i < siteCount; i++)
977         printf("%4.10g ", ((fpoint*)result_cache)[i]);
978     printf("\n\n");
979 #endif
980     for (int i = 0; i < siteCount; i++)
981     {
982         oResult += ((float*)result_cache)[i];
983     }
984     //oResult = ((double*)result_cache)[0];
985     printf("Result_Cache: \n");
986     for (int i = 0; i < siteCount; i++)
987         printf("%4.10g ", ((fpoint*)result_cache)[i]);
988     printf("\n\n");
989 	*/
990     oResult = ((fpoint*)result_cache)[0];
991 #else
992 #ifdef __OCLPOSIX__
993     clock_gettime(CLOCK_MONOTONIC, &queueEnd);
994     queueSecs += (queueEnd.tv_sec - queueStart.tv_sec)+(queueEnd.tv_nsec - queueStart.tv_nsec)/BILLION;
995     clock_gettime(CLOCK_MONOTONIC, &mainStart);
996 #endif
997     //#pragma omp parallel for reduction (+:oResult) schedule(static)
998     for (int i = 0; i < siteCount; i++)
999     {
1000         oResult += ((fpoint*)result_cache)[i];
1001     }
1002 #ifdef __VERBOSE__
1003     printf("Result_Cache: \n");
1004     for (int i = 0; i < siteCount; i++)
1005         printf("%4.10g ", ((float*)result_cache)[i]);
1006     printf("\n\n");
1007 #endif
1008 #ifdef __OCLPOSIX__
1009     clock_gettime(CLOCK_MONOTONIC, &mainEnd);
1010     mainSecs += (mainEnd.tv_sec - mainStart.tv_sec)+(mainEnd.tv_nsec - mainStart.tv_nsec)/BILLION;
1011 #endif
1012 #endif
1013 /*
1014 */
1015 /*
1016     //#pragma omp parallel for reduction (+:oResult) schedule(static)
1017     for (int i = 0; i < siteCount; i++)
1018     {
1019         oResult += ((float*)result_cache)[i];
1020     }
1021     printf("Result_Cache: \n");
1022     for (int i = 0; i < siteCount; i++)
1023         printf("%4.10g ", ((float*)result_cache)[i]);
1024     printf("\n\n");
1025 */
1026 /*
1027     //printf("! ");
1028     //return result;
1029     printf("oResult: %4.10g, gpuResult: %4.10g\n", oResult, ((double*)result_cache)[4]);
1030     if (oResult != ((double*)result_cache)[4])
1031     {
1032         printf("Result_Cache: \n");
1033         //for (int i = 0; i < roundUpToNextPowerOfTwo(siteCount); i++)
1034         for (int i = 0; i < 5; i++)
1035             printf("%4.10g ", ((double*)result_cache)[i]);
1036         printf("\n\n");
1037     }
1038 */
1039     return oResult;
1040 }
1041 
1042 
launchmdsocl(_SimpleList & eupdateNodes,_SimpleList & eflatParents,_SimpleList & eflatNodes,_SimpleList & eflatCLeaves,_SimpleList & eflatLeaves,_SimpleList & eflatTree,hyFloat * etheProbs,_SimpleList & etheFrequencies,long * elNodeFlags,_SimpleList & etaggedInternals,_Vector * elNodeResolutions)1043 double _OCLEvaluator::launchmdsocl( _SimpleList& eupdateNodes,
1044                                     _SimpleList& eflatParents,
1045                                     _SimpleList& eflatNodes,
1046                                     _SimpleList& eflatCLeaves,
1047                                     _SimpleList& eflatLeaves,
1048                                     _SimpleList& eflatTree,
1049                                     hyFloat* etheProbs,
1050                                     _SimpleList& etheFrequencies,
1051                                     long* elNodeFlags,
1052                                     _SimpleList& etaggedInternals,
1053                                     _Vector* elNodeResolutions)
1054 {
1055 #ifdef __OCLPOSIX__
1056     clock_gettime(CLOCK_MONOTONIC, &mainStart);
1057 #endif
1058 
1059 
1060     updateNodes = eupdateNodes;
1061     taggedInternals = etaggedInternals;
1062     theFrequencies = etheFrequencies;
1063 
1064 
1065     if (!contextSet)
1066     {
1067         theProbs = etheProbs;
1068         flatNodes = eflatNodes;
1069         flatCLeaves = eflatCLeaves;
1070         flatLeaves = eflatLeaves;
1071         flatTree = eflatTree;
1072         flatParents = eflatParents;
1073         lNodeFlags = elNodeFlags;
1074         lNodeResolutions = elNodeResolutions;
1075         setupContext();
1076         contextSet = true;
1077     }
1078 
1079 #ifdef __OCLPOSIX__
1080     clock_gettime(CLOCK_MONOTONIC, &mainEnd);
1081     mainSecs += (mainEnd.tv_sec - mainStart.tv_sec)+(mainEnd.tv_nsec - mainStart.tv_nsec)/BILLION;
1082 #endif
1083 
1084     return oclmain();
1085 }
1086 
1087 
Cleanup(int iExitCode)1088 void _OCLEvaluator::Cleanup (int iExitCode)
1089 {
1090     if (!clean)
1091     {
1092         printf("Time in main: %.4lf seconds\n", mainSecs);
1093         printf("Time in updating transition buffer: %.4lf seconds\n", buffSecs);
1094         printf("Time in queue: %.4lf seconds\n", queueSecs);
1095         printf("Time in Setup: %.4lf seconds\n", setupSecs);
1096         // Cleanup allocated objects
1097         printf("Starting Cleanup...\n\n");
1098         if(ckLeafKernel)clReleaseKernel(ckLeafKernel);
1099         if(ckInternalKernel)clReleaseKernel(ckInternalKernel);
1100         if(ckAmbigKernel)clReleaseKernel(ckAmbigKernel);
1101         if(ckResultKernel)clReleaseKernel(ckResultKernel);
1102         if(cpLeafProgram)clReleaseProgram(cpLeafProgram);
1103         if(cpInternalProgram)clReleaseProgram(cpInternalProgram);
1104         if(cpAmbigProgram)clReleaseProgram(cpAmbigProgram);
1105         if(cpResultProgram)clReleaseProgram(cpResultProgram);
1106         if(cqCommandQueue)clReleaseCommandQueue(cqCommandQueue);
1107         printf("Halfway...\n\n");
1108         if(cxGPUContext)clReleaseContext(cxGPUContext);
1109         if(cmNode_cache)clReleaseMemObject(cmNode_cache);
1110         if(cmModel_cache)clReleaseMemObject(cmModel_cache);
1111         if(cmNodRes_cache)clReleaseMemObject(cmNodRes_cache);
1112         if(cmNodFlag_cache)clReleaseMemObject(cmNodFlag_cache);
1113         if(cmroot_cache)clReleaseMemObject(cmroot_cache);
1114         if(cmroot_scalings)clReleaseMemObject(cmroot_scalings);
1115         if(cmScalings_cache)clReleaseMemObject(cmScalings_cache);
1116         if(cmFreq_cache)clReleaseMemObject(cmFreq_cache);
1117         if(cmProb_cache)clReleaseMemObject(cmProb_cache);
1118         if(cmResult_cache)clReleaseMemObject(cmResult_cache);
1119         printf("Done with ocl stuff...\n\n");
1120         // Free host memory
1121         free(node_cache);
1122         free(model);
1123         free(nodRes_cache);
1124         free(nodFlag_cache);
1125         free(scalings_cache);
1126         free(prob_cache);
1127         free(freq_cache);
1128         free(root_cache);
1129         free(result_cache);
1130         free(root_scalings);
1131         printf("Done!\n\n");
1132         clean = true;
1133         exit(0);
1134 
1135         if (iExitCode = EXIT_FAILURE)
1136             exit (iExitCode);
1137     }
1138 }
1139 
roundUpToNextPowerOfTwo(unsigned int x)1140 unsigned int _OCLEvaluator::roundUpToNextPowerOfTwo(unsigned int x)
1141 {
1142     x--;
1143     x |= x >> 1;
1144     x |= x >> 2;
1145     x |= x >> 4;
1146     x |= x >> 8;
1147     x |= x >> 16;
1148     x++;
1149 
1150     return x;
1151 }
1152 
roundDoubleUpToNextPowerOfTwo(double x)1153 double _OCLEvaluator::roundDoubleUpToNextPowerOfTwo(double x)
1154 {
1155     return pow(2, ceil(log2(x)));
1156 }
1157 #endif
1158 
1159