1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
#include <cstdlib>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>

#include <clBLAS.h>
21 
22 cl_int gemm_err;
23 
prettyPrintClStatus(const cl_int & status)24 std::string prettyPrintClStatus( const cl_int& status )
25 {
26 	switch( status )
27 	{
28 		case CL_INVALID_GLOBAL_WORK_SIZE:
29 			return "CL_INVALID_GLOBAL_WORK_SIZE";
30 		case CL_INVALID_MIP_LEVEL:
31 			return "CL_INVALID_MIP_LEVEL";
32 		case CL_INVALID_BUFFER_SIZE:
33 			return "CL_INVALID_BUFFER_SIZE";
34 		case CL_INVALID_GL_OBJECT:
35 			return "CL_INVALID_GL_OBJECT";
36 		case CL_INVALID_OPERATION:
37 			return "CL_INVALID_OPERATION";
38 		case CL_INVALID_EVENT:
39 			return "CL_INVALID_EVENT";
40 		case CL_INVALID_EVENT_WAIT_LIST:
41 			return "CL_INVALID_EVENT_WAIT_LIST";
42 		case CL_INVALID_GLOBAL_OFFSET:
43 			return "CL_INVALID_GLOBAL_OFFSET";
44 		case CL_INVALID_WORK_ITEM_SIZE:
45 			return "CL_INVALID_WORK_ITEM_SIZE";
46 		case CL_INVALID_WORK_GROUP_SIZE:
47 			return "CL_INVALID_WORK_GROUP_SIZE";
48 		case CL_INVALID_WORK_DIMENSION:
49 			return "CL_INVALID_WORK_DIMENSION";
50 		case CL_INVALID_KERNEL_ARGS:
51 			return "CL_INVALID_KERNEL_ARGS";
52 		case CL_INVALID_ARG_SIZE:
53 			return "CL_INVALID_ARG_SIZE";
54 		case CL_INVALID_ARG_VALUE:
55 			return "CL_INVALID_ARG_VALUE";
56 		case CL_INVALID_ARG_INDEX:
57 			return "CL_INVALID_ARG_INDEX";
58 		case CL_INVALID_KERNEL:
59 			return "CL_INVALID_KERNEL";
60 		case CL_INVALID_KERNEL_DEFINITION:
61 			return "CL_INVALID_KERNEL_DEFINITION";
62 		case CL_INVALID_KERNEL_NAME:
63 			return "CL_INVALID_KERNEL_NAME";
64 		case CL_INVALID_PROGRAM_EXECUTABLE:
65 			return "CL_INVALID_PROGRAM_EXECUTABLE";
66 		case CL_INVALID_PROGRAM:
67 			return "CL_INVALID_PROGRAM";
68 		case CL_INVALID_BUILD_OPTIONS:
69 			return "CL_INVALID_BUILD_OPTIONS";
70 		case CL_INVALID_BINARY:
71 			return "CL_INVALID_BINARY";
72 		case CL_INVALID_SAMPLER:
73 			return "CL_INVALID_SAMPLER";
74 		case CL_INVALID_IMAGE_SIZE:
75 			return "CL_INVALID_IMAGE_SIZE";
76 		case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
77 			return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
78 		case CL_INVALID_MEM_OBJECT:
79 			return "CL_INVALID_MEM_OBJECT";
80 		case CL_INVALID_HOST_PTR:
81 			return "CL_INVALID_HOST_PTR";
82 		case CL_INVALID_COMMAND_QUEUE:
83 			return "CL_INVALID_COMMAND_QUEUE";
84 		case CL_INVALID_QUEUE_PROPERTIES:
85 			return "CL_INVALID_QUEUE_PROPERTIES";
86 		case CL_INVALID_CONTEXT:
87 			return "CL_INVALID_CONTEXT";
88 		case CL_INVALID_DEVICE:
89 			return "CL_INVALID_DEVICE";
90 		case CL_INVALID_PLATFORM:
91 			return "CL_INVALID_PLATFORM";
92 		case CL_INVALID_DEVICE_TYPE:
93 			return "CL_INVALID_DEVICE_TYPE";
94 		case CL_INVALID_VALUE:
95 			return "CL_INVALID_VALUE";
96 		case CL_MAP_FAILURE:
97 			return "CL_MAP_FAILURE";
98 		case CL_BUILD_PROGRAM_FAILURE:
99 			return "CL_BUILD_PROGRAM_FAILURE";
100 		case CL_IMAGE_FORMAT_NOT_SUPPORTED:
101 			return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
102 		case CL_IMAGE_FORMAT_MISMATCH:
103 			return "CL_IMAGE_FORMAT_MISMATCH";
104 		case CL_MEM_COPY_OVERLAP:
105 			return "CL_MEM_COPY_OVERLAP";
106 		case CL_PROFILING_INFO_NOT_AVAILABLE:
107 			return "CL_PROFILING_INFO_NOT_AVAILABLE";
108 		case CL_OUT_OF_HOST_MEMORY:
109 			return "CL_OUT_OF_HOST_MEMORY";
110 		case CL_OUT_OF_RESOURCES:
111 			return "CL_OUT_OF_RESOURCES";
112 		case CL_MEM_OBJECT_ALLOCATION_FAILURE:
113 			return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
114 		case CL_COMPILER_NOT_AVAILABLE:
115 			return "CL_COMPILER_NOT_AVAILABLE";
116 		case CL_DEVICE_NOT_AVAILABLE:
117 			return "CL_DEVICE_NOT_AVAILABLE";
118 		case CL_DEVICE_NOT_FOUND:
119 			return "CL_DEVICE_NOT_FOUND";
120 		case CL_SUCCESS:
121 			return "CL_SUCCESS";
122 		default:
123 			return "Error code not defined";
124 		break;
125 	}
126 }
127 
128 //	This is used to either wrap an OpenCL function call, or to explicitly check a variable for an OpenCL error condition.
129 //	If an error occurs, we throw.
130 //	Note: std::runtime_error does not take unicode strings as input, so only strings supported
OpenCL_V_Throw(cl_int res,const std::string & msg,size_t lineno)131 inline cl_int OpenCL_V_Throw( cl_int res, const std::string& msg, size_t lineno )
132 {
133 	switch( res )
134 	{
135 		case CL_SUCCESS: /**< No error */
136 			break;
137 		default:
138 		{
139 			std::stringstream tmp;
140 			tmp << "OPENCL_V_THROWERROR< ";
141 			tmp << prettyPrintClStatus(res) ;
142 			tmp << " > (";
143 			tmp << lineno;
144 			tmp << "): ";
145 			tmp << msg;
146 			std::string errorm(tmp.str());
147 			std::cout << errorm<< std::endl;
148 			throw	std::runtime_error( errorm );
149 		}
150 	}
151 
152 	return	res;
153 }
154 #define OPENCL_V_THROW(_status,_message) OpenCL_V_Throw(_status, _message, __LINE__)
155 
// Distinguishes real from complex element layouts. The numeric value is used
// directly as a multiplier on element counts in this file (1 scalar per real
// element, 2 scalars per complex element), so the values must stay 1 and 2.
enum complexity_t
{
    not_complex = 1,
    yes_complex = 2
};
157 
158 //can be cl_float, cl_double
159 //TODO should be cl_float2 and cl_double2 instead of using float/double * complexity?
160 template< class T >
161 class buffers
162 {
163 public:
164     size_t M, N, K;
165     size_t lda, ldb, ldc;
166     complexity_t complexity;
167     T* A;
168     T* B;
169     T* C;
170     cl_mem bufA, bufB, bufC;
171     cl_command_queue queue;
172     std::map<std::string, T*> buffer_map;
173     std::map<std::string, size_t> rows_map;
174     std::map<std::string, size_t> ldx_map;
175 
buffers(cl_context ctx,cl_command_queue _queue,size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,complexity_t _complexity)176     buffers( cl_context ctx, cl_command_queue _queue,
177              size_t _M, size_t _N, size_t _K,
178              size_t _lda, size_t _ldb, size_t _ldc,
179              complexity_t _complexity )
180     : M(_M)
181     , N(_N)
182     , K(_K)
183     , lda(_lda)
184     , ldb(_ldb)
185     , ldc(_ldc)
186     , complexity(_complexity)
187     , A(new T[M*lda*sizeof(T)*complexity])
188     , B(new T[K*ldb*sizeof(T)*complexity])
189     , C(new T[M*ldc*sizeof(T)*complexity])
190     , queue(_queue)
191     {
192         // request and initialize openCL memory
193         bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * lda * sizeof(*A) * complexity,
194                               NULL, &gemm_err);
195         OPENCL_V_THROW( gemm_err, "creating buffer A" );
196         bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * ldb * sizeof(*B) * complexity,
197                               NULL, &gemm_err);
198         OPENCL_V_THROW( gemm_err, "creating buffer B" );
199         bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * ldc * sizeof(*C) * complexity,
200                               NULL, &gemm_err);
201         OPENCL_V_THROW( gemm_err, "creating buffer C" );
202 
203         buffer_map.insert(std::pair<std::string,T*>("A",A));
204         buffer_map.insert(std::pair<std::string,T*>("B",B));
205         buffer_map.insert(std::pair<std::string,T*>("C",C));
206         rows_map.insert(std::pair<std::string,size_t>("A",M));
207         rows_map.insert(std::pair<std::string,size_t>("B",K));
208         rows_map.insert(std::pair<std::string,size_t>("C",M));
209         ldx_map.insert(std::pair<std::string,size_t>("A",lda));
210         ldx_map.insert(std::pair<std::string,size_t>("B",ldb));
211         ldx_map.insert(std::pair<std::string,size_t>("C",ldc));
212 
213         initialize_data();
214     }
215 
~buffers()216     ~buffers()
217     {
218         OPENCL_V_THROW( clReleaseMemObject(bufC), "releasing buffer A");
219         OPENCL_V_THROW( clReleaseMemObject(bufB), "releasing buffer B");
220         OPENCL_V_THROW( clReleaseMemObject(bufA), "releasing buffer C");
221         delete[] A;
222         delete[] B;
223         delete[] C;
224     }
225 
initialize_data()226     void initialize_data()
227     {
228         initializeLocalMatrix("A");
229         initializeLocalMatrix("B");
230         initializeLocalMatrix("C");
231 
232         gemm_err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
233             M * K * sizeof(*A) * complexity, A, 0, NULL, NULL);
234         OPENCL_V_THROW( gemm_err, "writing to buffer A" );
235         gemm_err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
236             K * N * sizeof(*B) * complexity, A, 0, NULL, NULL);
237         OPENCL_V_THROW( gemm_err, "writing to buffer B" );
238         gemm_err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
239             M * N * sizeof(*C) * complexity, C, 0, NULL, NULL);
240         OPENCL_V_THROW( gemm_err, "writing to buffer C" );
241     }
242 
read_back_result()243     void read_back_result()
244     {
245         OPENCL_V_THROW( clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C) * complexity, C, 0, NULL, NULL),
246                         "reading from buffer C" );
247     }
248 
initializeLocalMatrix(std::string matrix)249     void initializeLocalMatrix(std::string matrix)
250     {
251         for (size_t i = 0; i < rows_map[matrix]*complexity; i++) {
252             for (size_t j = 0; j < ldx_map[matrix]; j++) {
253                 buffer_map[matrix][i * ldx_map[matrix] + j] = (i+1)*10 + (j+1);
254             }
255         }
256     }
257 
printLocalMatrix(std::string matrix)258     void printLocalMatrix(std::string matrix)
259     {
260         for (size_t i = 0; i < rows_map[matrix]*complexity; i++) {
261             for (size_t j = 0; j < ldx_map[matrix]; j++) {
262                 std::cout << (int)buffer_map[matrix][i * ldx_map[matrix] + j] << " ";
263             }
264             std::cout << std::endl;
265         }
266         std::cout << std::endl;
267     }
268 };
269 
270 class clGemm
271 {
272 public:
273     size_t M;
274     size_t N;
275     size_t K;
276     size_t lda;
277     size_t ldb;
278     size_t ldc;
279     clblasOrder order;
280     clblasTranspose transA;
281     clblasTranspose transB;
282     cl_context_properties props[3];
283     cl_platform_id platform;
284     cl_device_id device;
285     cl_context ctx;
286     cl_device_type deviceType;
287     cl_command_queue queue;
288     cl_event event;
289     cl_uint commandQueueFlags;
290     bool useimages;
291     cl_ulong imgA;
292     cl_ulong imgB;
293     StatisticalTimer& timer;
294 	StatisticalTimer::sTimerID gemm_timer_id;
295 
clGemm(size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,bool _useimages,clblasOrder _order,clblasTranspose _transA,clblasTranspose _transB,cl_device_type _deviceType,cl_uint _commandQueueFlags,StatisticalTimer & _timer)296     clGemm( size_t _M, size_t _N, size_t _K,
297             size_t _lda, size_t _ldb, size_t _ldc,
298             bool _useimages,
299             clblasOrder _order,
300             clblasTranspose _transA, clblasTranspose _transB,
301             cl_device_type _deviceType, cl_uint _commandQueueFlags,
302             StatisticalTimer& _timer )
303     : M(_M)
304     , N(_N)
305     , K(_K)
306     , lda(_lda)
307     , ldb(_ldb)
308     , ldc(_ldc)
309     , order(_order)
310     , transA(_transA)
311     , transB(_transB)
312     , deviceType(_deviceType)
313     , event(NULL)
314     , commandQueueFlags(_commandQueueFlags)
315     , useimages(_useimages)
316     , imgA(0)
317     , imgB(0)
318     , timer(_timer)
319     {
320         props[0] = CL_CONTEXT_PLATFORM;
321         props[1] = 0;
322         props[2] = 0;
323         OPENCL_V_THROW( clGetPlatformIDs(1, &platform, NULL), "getting platform IDs" );
324         OPENCL_V_THROW( clGetDeviceIDs(platform, deviceType, 1, &device, NULL), "getting device IDs" );
325         props[1] = (cl_context_properties)platform;
326         ctx = clCreateContext(props, 1, &device, NULL, NULL, &gemm_err);
327         OPENCL_V_THROW( gemm_err, "creating context" );
328         queue = clCreateCommandQueue(ctx, device, commandQueueFlags, &gemm_err);
329         OPENCL_V_THROW( gemm_err, "creating command queue" );
330 
331         gemm_err = clblasSetup();
332         if (gemm_err != CL_SUCCESS) {
333             std::cout << "clblasSetup() failed with " << gemm_err << std::endl;
334             clReleaseCommandQueue(queue);
335             clReleaseContext(ctx);
336             exit(1);
337         }
338 
339         if (useimages) {
340             imgA = clblasAddScratchImage(ctx, 16, 64, NULL);
341             imgB = clblasAddScratchImage(ctx, 16, 64, NULL);
342         }
343 
344 	    gemm_timer_id = timer.getUniqueID( "clGemm", 0 );
345     }
346 
~clGemm()347     ~clGemm()
348     {
349         if (useimages) {
350             clblasRemoveScratchImage(imgA);
351             clblasRemoveScratchImage(imgB);
352         }
353 
354         clblasTeardown();
355         OPENCL_V_THROW( clReleaseCommandQueue(queue), "releasing command queue" );
356         OPENCL_V_THROW( clReleaseContext(ctx), "releasing context" );
357     }
358 
wait_and_check()359     void wait_and_check()
360     {
361         cl_int wait_status = clWaitForEvents(1, &event);
362 
363         if( wait_status != CL_SUCCESS )
364         {
365     	    if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST )
366     	    {
367     	    	clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &gemm_err, NULL );
368     	    	std::cout << "blas function execution status error: " << gemm_err << std::endl;
369                 exit(1);
370     	    }
371             else
372             {
373     	    	std::cout << "blas function wait status error: " << wait_status << std::endl;
374                 exit(1);
375             }
376         }
377     }
378 
time_in_ns()379     double time_in_ns()
380     {
381 	    StatisticalTimer& timer = StatisticalTimer::getInstance( );
382         return timer.getAverageTime( gemm_timer_id ) * 1e9;
383     }
384 
385     virtual void call_gemm() = 0;
386     virtual void clear_buffers() = 0;
387     virtual double gflops() = 0;
388     virtual std::string gflops_formula() = 0;
389 };
390 
391 class clSgemm : public clGemm
392 {
393 public:
394     cl_float alpha;
395     cl_float beta;
396     buffers<cl_float> mybuffers;
397 
clSgemm(size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,bool _useimages,clblasOrder _order,clblasTranspose _transA,clblasTranspose _transB,cl_float _alpha,cl_float _beta,cl_device_type _deviceType,cl_uint _commandQueueFlags,StatisticalTimer & _timer)398     clSgemm( size_t _M, size_t _N, size_t _K,
399             size_t _lda, size_t _ldb, size_t _ldc,
400             bool _useimages,
401             clblasOrder _order,
402             clblasTranspose _transA, clblasTranspose _transB,
403             cl_float _alpha, cl_float _beta,
404             cl_device_type _deviceType, cl_uint _commandQueueFlags,
405             StatisticalTimer& _timer)
406     : clGemm( _M, _N, _K,
407               _lda, _ldb, _ldc,
408               _useimages, _order, _transA, _transB,
409               _deviceType, _commandQueueFlags, _timer )
410     , alpha(_alpha)
411     , beta(_beta)
412     , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, not_complex )
413     {}
414 
call_gemm()415     void call_gemm()
416     {
417 	    timer.Start(gemm_timer_id);
418         OPENCL_V_THROW( clblasSgemm(order, transA, transB,
419                                        M, N, K,
420                                        alpha,
421                                        mybuffers.bufA, lda,
422                                        mybuffers.bufB, ldb,
423                                        beta,
424                                        mybuffers.bufC, ldc,
425                                        1, &queue, 0, NULL, &event),
426                         "clblasSgemm" );
427         wait_and_check();
428 	    timer.Stop(gemm_timer_id);
429         //mybuffers.read_back_result();
430         //mybuffers.printLocalMatrix("C");
431     }
432 
clear_buffers()433     void clear_buffers()
434     {
435         mybuffers.initialize_data();
436     }
437 
gflops()438     double gflops()
439     {
440         return (2*M*N*K)/time_in_ns();
441     }
442 
gflops_formula()443     std::string gflops_formula()
444     {
445         return "(2*M*N*K)/time_in_ns";
446     }
447 };
448 
449 class clDgemm : public clGemm
450 {
451 public:
452     cl_double alpha;
453     cl_double beta;
454     buffers<cl_double> mybuffers;
455 
clDgemm(size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,bool _useimages,clblasOrder _order,clblasTranspose _transA,clblasTranspose _transB,cl_double _alpha,cl_double _beta,cl_device_type _deviceType,cl_uint _commandQueueFlags,StatisticalTimer & _timer)456     clDgemm( size_t _M, size_t _N, size_t _K,
457             size_t _lda, size_t _ldb, size_t _ldc,
458             bool _useimages,
459             clblasOrder _order,
460             clblasTranspose _transA, clblasTranspose _transB,
461             cl_double _alpha, cl_double _beta,
462             cl_device_type _deviceType, cl_uint _commandQueueFlags,
463             StatisticalTimer& _timer)
464     : clGemm( _M, _N, _K,
465               _lda, _ldb, _ldc,
466               _useimages, _order, _transA, _transB,
467               _deviceType, _commandQueueFlags, _timer )
468     , alpha(_alpha)
469     , beta(_beta)
470     , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, not_complex )
471     {}
472 
call_gemm()473     void call_gemm()
474     {
475 	    timer.Start(gemm_timer_id);
476         OPENCL_V_THROW( clblasDgemm(order, transA, transB,
477                                        M, N, K,
478                                        alpha,
479                                        mybuffers.bufA, lda,
480                                        mybuffers.bufB, ldb,
481                                        beta,
482                                        mybuffers.bufC, ldc,
483                                        1, &queue, 0, NULL, &event),
484                         "clblasDgemm" );
485         wait_and_check();
486 	    timer.Stop(gemm_timer_id);
487         //mybuffers.read_back_result();
488         //mybuffers.printLocalMatrix("C");
489     }
490 
clear_buffers()491     void clear_buffers()
492     {
493         mybuffers.initialize_data();
494     }
495 
gflops()496     double gflops()
497     {
498         return (2*M*N*K)/time_in_ns();
499     }
500 
gflops_formula()501     std::string gflops_formula()
502     {
503         return "(2*M*N*K)/time_in_ns";
504     }
505 };
506 
507 class clCgemm : public clGemm
508 {
509 public:
510     cl_float2 alpha;
511     cl_float2 beta;
512     buffers<cl_float> mybuffers;
513 
clCgemm(size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,bool _useimages,clblasOrder _order,clblasTranspose _transA,clblasTranspose _transB,cl_float _alpha,cl_float _beta,cl_device_type _deviceType,cl_uint _commandQueueFlags,StatisticalTimer & _timer)514     clCgemm( size_t _M, size_t _N, size_t _K,
515             size_t _lda, size_t _ldb, size_t _ldc,
516             bool _useimages,
517             clblasOrder _order,
518             clblasTranspose _transA, clblasTranspose _transB,
519             cl_float _alpha, cl_float _beta,
520             cl_device_type _deviceType, cl_uint _commandQueueFlags,
521             StatisticalTimer& _timer)
522     : clGemm( _M, _N, _K,
523               _lda, _ldb, _ldc,
524               _useimages, _order, _transA, _transB,
525               _deviceType, _commandQueueFlags, _timer )
526     , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, yes_complex )
527     {
528         alpha.s[0] = _alpha;
529         alpha.s[1] = _alpha;
530         beta.s[0] = _beta;
531         beta.s[1] = _beta;
532     }
533 
call_gemm()534     void call_gemm()
535     {
536 	    timer.Start(gemm_timer_id);
537         OPENCL_V_THROW( clblasCgemm(order, transA, transB,
538                                        M, N, K,
539                                        alpha,
540                                        mybuffers.bufA, lda,
541                                        mybuffers.bufB, ldb,
542                                        beta,
543                                        mybuffers.bufC, ldc,
544                                        1, &queue, 0, NULL, &event),
545                         "clblasCgemm" );
546         wait_and_check();
547 	    timer.Stop(gemm_timer_id);
548         //mybuffers.read_back_result();
549         //mybuffers.printLocalMatrix("C");
550     }
551 
clear_buffers()552     void clear_buffers()
553     {
554         mybuffers.initialize_data();
555     }
556 
gflops()557     double gflops()
558     {
559         return (8*M*N*K)/time_in_ns();
560     }
561 
gflops_formula()562     std::string gflops_formula()
563     {
564         return "(8*M*N*K)/time_in_ns";
565     }
566 };
567 
568 class clZgemm : public clGemm
569 {
570 public:
571     cl_double2 alpha;
572     cl_double2 beta;
573     buffers<cl_double> mybuffers;
574 
clZgemm(size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,bool _useimages,clblasOrder _order,clblasTranspose _transA,clblasTranspose _transB,cl_double _alpha,cl_double _beta,cl_device_type _deviceType,cl_uint _commandQueueFlags,StatisticalTimer & _timer)575     clZgemm( size_t _M, size_t _N, size_t _K,
576             size_t _lda, size_t _ldb, size_t _ldc,
577             bool _useimages,
578             clblasOrder _order,
579             clblasTranspose _transA, clblasTranspose _transB,
580             cl_double _alpha, cl_double _beta,
581             cl_device_type _deviceType, cl_uint _commandQueueFlags,
582             StatisticalTimer& _timer)
583     : clGemm( _M, _N, _K,
584               _lda, _ldb, _ldc,
585               _useimages, _order, _transA, _transB,
586               _deviceType, _commandQueueFlags, _timer )
587     , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, yes_complex )
588     {
589         alpha.s[0] = _alpha;
590         alpha.s[1] = _alpha;
591         beta.s[0] = _beta;
592         beta.s[1] = _beta;
593     }
594 
call_gemm()595     void call_gemm()
596     {
597 	    timer.Start(gemm_timer_id);
598         OPENCL_V_THROW( clblasZgemm(order, transA, transB,
599                                        M, N, K,
600                                        alpha,
601                                        mybuffers.bufA, lda,
602                                        mybuffers.bufB, ldb,
603                                        beta,
604                                        mybuffers.bufC, ldc,
605                                        1, &queue, 0, NULL, &event),
606                         "clblasZgemm" );
607         wait_and_check();
608 	    timer.Stop(gemm_timer_id);
609         //mybuffers.read_back_result();
610         //mybuffers.printLocalMatrix("C");
611     }
612 
clear_buffers()613     void clear_buffers()
614     {
615         mybuffers.initialize_data();
616     }
617 
gflops()618     double gflops()
619     {
620         return (8*M*N*K)/time_in_ns();
621     }
622 
gflops_formula()623     std::string gflops_formula()
624     {
625         return "(8*M*N*K)/time_in_ns";
626     }
627 };
628