1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
#include <cstdlib>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>

#include <clBLAS.h>
21
22 cl_int gemm_err;
23
prettyPrintClStatus(const cl_int & status)24 std::string prettyPrintClStatus( const cl_int& status )
25 {
26 switch( status )
27 {
28 case CL_INVALID_GLOBAL_WORK_SIZE:
29 return "CL_INVALID_GLOBAL_WORK_SIZE";
30 case CL_INVALID_MIP_LEVEL:
31 return "CL_INVALID_MIP_LEVEL";
32 case CL_INVALID_BUFFER_SIZE:
33 return "CL_INVALID_BUFFER_SIZE";
34 case CL_INVALID_GL_OBJECT:
35 return "CL_INVALID_GL_OBJECT";
36 case CL_INVALID_OPERATION:
37 return "CL_INVALID_OPERATION";
38 case CL_INVALID_EVENT:
39 return "CL_INVALID_EVENT";
40 case CL_INVALID_EVENT_WAIT_LIST:
41 return "CL_INVALID_EVENT_WAIT_LIST";
42 case CL_INVALID_GLOBAL_OFFSET:
43 return "CL_INVALID_GLOBAL_OFFSET";
44 case CL_INVALID_WORK_ITEM_SIZE:
45 return "CL_INVALID_WORK_ITEM_SIZE";
46 case CL_INVALID_WORK_GROUP_SIZE:
47 return "CL_INVALID_WORK_GROUP_SIZE";
48 case CL_INVALID_WORK_DIMENSION:
49 return "CL_INVALID_WORK_DIMENSION";
50 case CL_INVALID_KERNEL_ARGS:
51 return "CL_INVALID_KERNEL_ARGS";
52 case CL_INVALID_ARG_SIZE:
53 return "CL_INVALID_ARG_SIZE";
54 case CL_INVALID_ARG_VALUE:
55 return "CL_INVALID_ARG_VALUE";
56 case CL_INVALID_ARG_INDEX:
57 return "CL_INVALID_ARG_INDEX";
58 case CL_INVALID_KERNEL:
59 return "CL_INVALID_KERNEL";
60 case CL_INVALID_KERNEL_DEFINITION:
61 return "CL_INVALID_KERNEL_DEFINITION";
62 case CL_INVALID_KERNEL_NAME:
63 return "CL_INVALID_KERNEL_NAME";
64 case CL_INVALID_PROGRAM_EXECUTABLE:
65 return "CL_INVALID_PROGRAM_EXECUTABLE";
66 case CL_INVALID_PROGRAM:
67 return "CL_INVALID_PROGRAM";
68 case CL_INVALID_BUILD_OPTIONS:
69 return "CL_INVALID_BUILD_OPTIONS";
70 case CL_INVALID_BINARY:
71 return "CL_INVALID_BINARY";
72 case CL_INVALID_SAMPLER:
73 return "CL_INVALID_SAMPLER";
74 case CL_INVALID_IMAGE_SIZE:
75 return "CL_INVALID_IMAGE_SIZE";
76 case CL_INVALID_IMAGE_FORMAT_DESCRIPTOR:
77 return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR";
78 case CL_INVALID_MEM_OBJECT:
79 return "CL_INVALID_MEM_OBJECT";
80 case CL_INVALID_HOST_PTR:
81 return "CL_INVALID_HOST_PTR";
82 case CL_INVALID_COMMAND_QUEUE:
83 return "CL_INVALID_COMMAND_QUEUE";
84 case CL_INVALID_QUEUE_PROPERTIES:
85 return "CL_INVALID_QUEUE_PROPERTIES";
86 case CL_INVALID_CONTEXT:
87 return "CL_INVALID_CONTEXT";
88 case CL_INVALID_DEVICE:
89 return "CL_INVALID_DEVICE";
90 case CL_INVALID_PLATFORM:
91 return "CL_INVALID_PLATFORM";
92 case CL_INVALID_DEVICE_TYPE:
93 return "CL_INVALID_DEVICE_TYPE";
94 case CL_INVALID_VALUE:
95 return "CL_INVALID_VALUE";
96 case CL_MAP_FAILURE:
97 return "CL_MAP_FAILURE";
98 case CL_BUILD_PROGRAM_FAILURE:
99 return "CL_BUILD_PROGRAM_FAILURE";
100 case CL_IMAGE_FORMAT_NOT_SUPPORTED:
101 return "CL_IMAGE_FORMAT_NOT_SUPPORTED";
102 case CL_IMAGE_FORMAT_MISMATCH:
103 return "CL_IMAGE_FORMAT_MISMATCH";
104 case CL_MEM_COPY_OVERLAP:
105 return "CL_MEM_COPY_OVERLAP";
106 case CL_PROFILING_INFO_NOT_AVAILABLE:
107 return "CL_PROFILING_INFO_NOT_AVAILABLE";
108 case CL_OUT_OF_HOST_MEMORY:
109 return "CL_OUT_OF_HOST_MEMORY";
110 case CL_OUT_OF_RESOURCES:
111 return "CL_OUT_OF_RESOURCES";
112 case CL_MEM_OBJECT_ALLOCATION_FAILURE:
113 return "CL_MEM_OBJECT_ALLOCATION_FAILURE";
114 case CL_COMPILER_NOT_AVAILABLE:
115 return "CL_COMPILER_NOT_AVAILABLE";
116 case CL_DEVICE_NOT_AVAILABLE:
117 return "CL_DEVICE_NOT_AVAILABLE";
118 case CL_DEVICE_NOT_FOUND:
119 return "CL_DEVICE_NOT_FOUND";
120 case CL_SUCCESS:
121 return "CL_SUCCESS";
122 default:
123 return "Error code not defined";
124 break;
125 }
126 }
127
128 // This is used to either wrap an OpenCL function call, or to explicitly check a variable for an OpenCL error condition.
129 // If an error occurs, we throw.
130 // Note: std::runtime_error does not take unicode strings as input, so only strings supported
OpenCL_V_Throw(cl_int res,const std::string & msg,size_t lineno)131 inline cl_int OpenCL_V_Throw( cl_int res, const std::string& msg, size_t lineno )
132 {
133 switch( res )
134 {
135 case CL_SUCCESS: /**< No error */
136 break;
137 default:
138 {
139 std::stringstream tmp;
140 tmp << "OPENCL_V_THROWERROR< ";
141 tmp << prettyPrintClStatus(res) ;
142 tmp << " > (";
143 tmp << lineno;
144 tmp << "): ";
145 tmp << msg;
146 std::string errorm(tmp.str());
147 std::cout << errorm<< std::endl;
148 throw std::runtime_error( errorm );
149 }
150 }
151
152 return res;
153 }
154 #define OPENCL_V_THROW(_status,_message) OpenCL_V_Throw(_status, _message, __LINE__)
155
// Multiplier applied to element counts: how many scalars are stored per matrix element.
enum complexity_t
{
    not_complex = 1,  // one scalar per element
    yes_complex = 2   // two scalars per element
};
157
158 //can be cl_float, cl_double
159 //TODO should be cl_float2 and cl_double2 instead of using float/double * complexity?
160 template< class T >
161 class buffers
162 {
163 public:
164 size_t M, N, K;
165 size_t lda, ldb, ldc;
166 complexity_t complexity;
167 T* A;
168 T* B;
169 T* C;
170 cl_mem bufA, bufB, bufC;
171 cl_command_queue queue;
172 std::map<std::string, T*> buffer_map;
173 std::map<std::string, size_t> rows_map;
174 std::map<std::string, size_t> ldx_map;
175
buffers(cl_context ctx,cl_command_queue _queue,size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,complexity_t _complexity)176 buffers( cl_context ctx, cl_command_queue _queue,
177 size_t _M, size_t _N, size_t _K,
178 size_t _lda, size_t _ldb, size_t _ldc,
179 complexity_t _complexity )
180 : M(_M)
181 , N(_N)
182 , K(_K)
183 , lda(_lda)
184 , ldb(_ldb)
185 , ldc(_ldc)
186 , complexity(_complexity)
187 , A(new T[M*lda*sizeof(T)*complexity])
188 , B(new T[K*ldb*sizeof(T)*complexity])
189 , C(new T[M*ldc*sizeof(T)*complexity])
190 , queue(_queue)
191 {
192 // request and initialize openCL memory
193 bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * lda * sizeof(*A) * complexity,
194 NULL, &gemm_err);
195 OPENCL_V_THROW( gemm_err, "creating buffer A" );
196 bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * ldb * sizeof(*B) * complexity,
197 NULL, &gemm_err);
198 OPENCL_V_THROW( gemm_err, "creating buffer B" );
199 bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * ldc * sizeof(*C) * complexity,
200 NULL, &gemm_err);
201 OPENCL_V_THROW( gemm_err, "creating buffer C" );
202
203 buffer_map.insert(std::pair<std::string,T*>("A",A));
204 buffer_map.insert(std::pair<std::string,T*>("B",B));
205 buffer_map.insert(std::pair<std::string,T*>("C",C));
206 rows_map.insert(std::pair<std::string,size_t>("A",M));
207 rows_map.insert(std::pair<std::string,size_t>("B",K));
208 rows_map.insert(std::pair<std::string,size_t>("C",M));
209 ldx_map.insert(std::pair<std::string,size_t>("A",lda));
210 ldx_map.insert(std::pair<std::string,size_t>("B",ldb));
211 ldx_map.insert(std::pair<std::string,size_t>("C",ldc));
212
213 initialize_data();
214 }
215
~buffers()216 ~buffers()
217 {
218 OPENCL_V_THROW( clReleaseMemObject(bufC), "releasing buffer A");
219 OPENCL_V_THROW( clReleaseMemObject(bufB), "releasing buffer B");
220 OPENCL_V_THROW( clReleaseMemObject(bufA), "releasing buffer C");
221 delete[] A;
222 delete[] B;
223 delete[] C;
224 }
225
initialize_data()226 void initialize_data()
227 {
228 initializeLocalMatrix("A");
229 initializeLocalMatrix("B");
230 initializeLocalMatrix("C");
231
232 gemm_err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
233 M * K * sizeof(*A) * complexity, A, 0, NULL, NULL);
234 OPENCL_V_THROW( gemm_err, "writing to buffer A" );
235 gemm_err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
236 K * N * sizeof(*B) * complexity, A, 0, NULL, NULL);
237 OPENCL_V_THROW( gemm_err, "writing to buffer B" );
238 gemm_err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
239 M * N * sizeof(*C) * complexity, C, 0, NULL, NULL);
240 OPENCL_V_THROW( gemm_err, "writing to buffer C" );
241 }
242
read_back_result()243 void read_back_result()
244 {
245 OPENCL_V_THROW( clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C) * complexity, C, 0, NULL, NULL),
246 "reading from buffer C" );
247 }
248
initializeLocalMatrix(std::string matrix)249 void initializeLocalMatrix(std::string matrix)
250 {
251 for (size_t i = 0; i < rows_map[matrix]*complexity; i++) {
252 for (size_t j = 0; j < ldx_map[matrix]; j++) {
253 buffer_map[matrix][i * ldx_map[matrix] + j] = (i+1)*10 + (j+1);
254 }
255 }
256 }
257
printLocalMatrix(std::string matrix)258 void printLocalMatrix(std::string matrix)
259 {
260 for (size_t i = 0; i < rows_map[matrix]*complexity; i++) {
261 for (size_t j = 0; j < ldx_map[matrix]; j++) {
262 std::cout << (int)buffer_map[matrix][i * ldx_map[matrix] + j] << " ";
263 }
264 std::cout << std::endl;
265 }
266 std::cout << std::endl;
267 }
268 };
269
270 class clGemm
271 {
272 public:
273 size_t M;
274 size_t N;
275 size_t K;
276 size_t lda;
277 size_t ldb;
278 size_t ldc;
279 clblasOrder order;
280 clblasTranspose transA;
281 clblasTranspose transB;
282 cl_context_properties props[3];
283 cl_platform_id platform;
284 cl_device_id device;
285 cl_context ctx;
286 cl_device_type deviceType;
287 cl_command_queue queue;
288 cl_event event;
289 cl_uint commandQueueFlags;
290 bool useimages;
291 cl_ulong imgA;
292 cl_ulong imgB;
293 StatisticalTimer& timer;
294 StatisticalTimer::sTimerID gemm_timer_id;
295
clGemm(size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,bool _useimages,clblasOrder _order,clblasTranspose _transA,clblasTranspose _transB,cl_device_type _deviceType,cl_uint _commandQueueFlags,StatisticalTimer & _timer)296 clGemm( size_t _M, size_t _N, size_t _K,
297 size_t _lda, size_t _ldb, size_t _ldc,
298 bool _useimages,
299 clblasOrder _order,
300 clblasTranspose _transA, clblasTranspose _transB,
301 cl_device_type _deviceType, cl_uint _commandQueueFlags,
302 StatisticalTimer& _timer )
303 : M(_M)
304 , N(_N)
305 , K(_K)
306 , lda(_lda)
307 , ldb(_ldb)
308 , ldc(_ldc)
309 , order(_order)
310 , transA(_transA)
311 , transB(_transB)
312 , deviceType(_deviceType)
313 , event(NULL)
314 , commandQueueFlags(_commandQueueFlags)
315 , useimages(_useimages)
316 , imgA(0)
317 , imgB(0)
318 , timer(_timer)
319 {
320 props[0] = CL_CONTEXT_PLATFORM;
321 props[1] = 0;
322 props[2] = 0;
323 OPENCL_V_THROW( clGetPlatformIDs(1, &platform, NULL), "getting platform IDs" );
324 OPENCL_V_THROW( clGetDeviceIDs(platform, deviceType, 1, &device, NULL), "getting device IDs" );
325 props[1] = (cl_context_properties)platform;
326 ctx = clCreateContext(props, 1, &device, NULL, NULL, &gemm_err);
327 OPENCL_V_THROW( gemm_err, "creating context" );
328 queue = clCreateCommandQueue(ctx, device, commandQueueFlags, &gemm_err);
329 OPENCL_V_THROW( gemm_err, "creating command queue" );
330
331 gemm_err = clblasSetup();
332 if (gemm_err != CL_SUCCESS) {
333 std::cout << "clblasSetup() failed with " << gemm_err << std::endl;
334 clReleaseCommandQueue(queue);
335 clReleaseContext(ctx);
336 exit(1);
337 }
338
339 if (useimages) {
340 imgA = clblasAddScratchImage(ctx, 16, 64, NULL);
341 imgB = clblasAddScratchImage(ctx, 16, 64, NULL);
342 }
343
344 gemm_timer_id = timer.getUniqueID( "clGemm", 0 );
345 }
346
~clGemm()347 ~clGemm()
348 {
349 if (useimages) {
350 clblasRemoveScratchImage(imgA);
351 clblasRemoveScratchImage(imgB);
352 }
353
354 clblasTeardown();
355 OPENCL_V_THROW( clReleaseCommandQueue(queue), "releasing command queue" );
356 OPENCL_V_THROW( clReleaseContext(ctx), "releasing context" );
357 }
358
wait_and_check()359 void wait_and_check()
360 {
361 cl_int wait_status = clWaitForEvents(1, &event);
362
363 if( wait_status != CL_SUCCESS )
364 {
365 if( wait_status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST )
366 {
367 clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &gemm_err, NULL );
368 std::cout << "blas function execution status error: " << gemm_err << std::endl;
369 exit(1);
370 }
371 else
372 {
373 std::cout << "blas function wait status error: " << wait_status << std::endl;
374 exit(1);
375 }
376 }
377 }
378
time_in_ns()379 double time_in_ns()
380 {
381 StatisticalTimer& timer = StatisticalTimer::getInstance( );
382 return timer.getAverageTime( gemm_timer_id ) * 1e9;
383 }
384
385 virtual void call_gemm() = 0;
386 virtual void clear_buffers() = 0;
387 virtual double gflops() = 0;
388 virtual std::string gflops_formula() = 0;
389 };
390
391 class clSgemm : public clGemm
392 {
393 public:
394 cl_float alpha;
395 cl_float beta;
396 buffers<cl_float> mybuffers;
397
clSgemm(size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,bool _useimages,clblasOrder _order,clblasTranspose _transA,clblasTranspose _transB,cl_float _alpha,cl_float _beta,cl_device_type _deviceType,cl_uint _commandQueueFlags,StatisticalTimer & _timer)398 clSgemm( size_t _M, size_t _N, size_t _K,
399 size_t _lda, size_t _ldb, size_t _ldc,
400 bool _useimages,
401 clblasOrder _order,
402 clblasTranspose _transA, clblasTranspose _transB,
403 cl_float _alpha, cl_float _beta,
404 cl_device_type _deviceType, cl_uint _commandQueueFlags,
405 StatisticalTimer& _timer)
406 : clGemm( _M, _N, _K,
407 _lda, _ldb, _ldc,
408 _useimages, _order, _transA, _transB,
409 _deviceType, _commandQueueFlags, _timer )
410 , alpha(_alpha)
411 , beta(_beta)
412 , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, not_complex )
413 {}
414
call_gemm()415 void call_gemm()
416 {
417 timer.Start(gemm_timer_id);
418 OPENCL_V_THROW( clblasSgemm(order, transA, transB,
419 M, N, K,
420 alpha,
421 mybuffers.bufA, lda,
422 mybuffers.bufB, ldb,
423 beta,
424 mybuffers.bufC, ldc,
425 1, &queue, 0, NULL, &event),
426 "clblasSgemm" );
427 wait_and_check();
428 timer.Stop(gemm_timer_id);
429 //mybuffers.read_back_result();
430 //mybuffers.printLocalMatrix("C");
431 }
432
clear_buffers()433 void clear_buffers()
434 {
435 mybuffers.initialize_data();
436 }
437
gflops()438 double gflops()
439 {
440 return (2*M*N*K)/time_in_ns();
441 }
442
gflops_formula()443 std::string gflops_formula()
444 {
445 return "(2*M*N*K)/time_in_ns";
446 }
447 };
448
449 class clDgemm : public clGemm
450 {
451 public:
452 cl_double alpha;
453 cl_double beta;
454 buffers<cl_double> mybuffers;
455
clDgemm(size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,bool _useimages,clblasOrder _order,clblasTranspose _transA,clblasTranspose _transB,cl_double _alpha,cl_double _beta,cl_device_type _deviceType,cl_uint _commandQueueFlags,StatisticalTimer & _timer)456 clDgemm( size_t _M, size_t _N, size_t _K,
457 size_t _lda, size_t _ldb, size_t _ldc,
458 bool _useimages,
459 clblasOrder _order,
460 clblasTranspose _transA, clblasTranspose _transB,
461 cl_double _alpha, cl_double _beta,
462 cl_device_type _deviceType, cl_uint _commandQueueFlags,
463 StatisticalTimer& _timer)
464 : clGemm( _M, _N, _K,
465 _lda, _ldb, _ldc,
466 _useimages, _order, _transA, _transB,
467 _deviceType, _commandQueueFlags, _timer )
468 , alpha(_alpha)
469 , beta(_beta)
470 , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, not_complex )
471 {}
472
call_gemm()473 void call_gemm()
474 {
475 timer.Start(gemm_timer_id);
476 OPENCL_V_THROW( clblasDgemm(order, transA, transB,
477 M, N, K,
478 alpha,
479 mybuffers.bufA, lda,
480 mybuffers.bufB, ldb,
481 beta,
482 mybuffers.bufC, ldc,
483 1, &queue, 0, NULL, &event),
484 "clblasDgemm" );
485 wait_and_check();
486 timer.Stop(gemm_timer_id);
487 //mybuffers.read_back_result();
488 //mybuffers.printLocalMatrix("C");
489 }
490
clear_buffers()491 void clear_buffers()
492 {
493 mybuffers.initialize_data();
494 }
495
gflops()496 double gflops()
497 {
498 return (2*M*N*K)/time_in_ns();
499 }
500
gflops_formula()501 std::string gflops_formula()
502 {
503 return "(2*M*N*K)/time_in_ns";
504 }
505 };
506
507 class clCgemm : public clGemm
508 {
509 public:
510 cl_float2 alpha;
511 cl_float2 beta;
512 buffers<cl_float> mybuffers;
513
clCgemm(size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,bool _useimages,clblasOrder _order,clblasTranspose _transA,clblasTranspose _transB,cl_float _alpha,cl_float _beta,cl_device_type _deviceType,cl_uint _commandQueueFlags,StatisticalTimer & _timer)514 clCgemm( size_t _M, size_t _N, size_t _K,
515 size_t _lda, size_t _ldb, size_t _ldc,
516 bool _useimages,
517 clblasOrder _order,
518 clblasTranspose _transA, clblasTranspose _transB,
519 cl_float _alpha, cl_float _beta,
520 cl_device_type _deviceType, cl_uint _commandQueueFlags,
521 StatisticalTimer& _timer)
522 : clGemm( _M, _N, _K,
523 _lda, _ldb, _ldc,
524 _useimages, _order, _transA, _transB,
525 _deviceType, _commandQueueFlags, _timer )
526 , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, yes_complex )
527 {
528 alpha.s[0] = _alpha;
529 alpha.s[1] = _alpha;
530 beta.s[0] = _beta;
531 beta.s[1] = _beta;
532 }
533
call_gemm()534 void call_gemm()
535 {
536 timer.Start(gemm_timer_id);
537 OPENCL_V_THROW( clblasCgemm(order, transA, transB,
538 M, N, K,
539 alpha,
540 mybuffers.bufA, lda,
541 mybuffers.bufB, ldb,
542 beta,
543 mybuffers.bufC, ldc,
544 1, &queue, 0, NULL, &event),
545 "clblasCgemm" );
546 wait_and_check();
547 timer.Stop(gemm_timer_id);
548 //mybuffers.read_back_result();
549 //mybuffers.printLocalMatrix("C");
550 }
551
clear_buffers()552 void clear_buffers()
553 {
554 mybuffers.initialize_data();
555 }
556
gflops()557 double gflops()
558 {
559 return (8*M*N*K)/time_in_ns();
560 }
561
gflops_formula()562 std::string gflops_formula()
563 {
564 return "(8*M*N*K)/time_in_ns";
565 }
566 };
567
568 class clZgemm : public clGemm
569 {
570 public:
571 cl_double2 alpha;
572 cl_double2 beta;
573 buffers<cl_double> mybuffers;
574
clZgemm(size_t _M,size_t _N,size_t _K,size_t _lda,size_t _ldb,size_t _ldc,bool _useimages,clblasOrder _order,clblasTranspose _transA,clblasTranspose _transB,cl_double _alpha,cl_double _beta,cl_device_type _deviceType,cl_uint _commandQueueFlags,StatisticalTimer & _timer)575 clZgemm( size_t _M, size_t _N, size_t _K,
576 size_t _lda, size_t _ldb, size_t _ldc,
577 bool _useimages,
578 clblasOrder _order,
579 clblasTranspose _transA, clblasTranspose _transB,
580 cl_double _alpha, cl_double _beta,
581 cl_device_type _deviceType, cl_uint _commandQueueFlags,
582 StatisticalTimer& _timer)
583 : clGemm( _M, _N, _K,
584 _lda, _ldb, _ldc,
585 _useimages, _order, _transA, _transB,
586 _deviceType, _commandQueueFlags, _timer )
587 , mybuffers( ctx, queue, M, N, K, lda, ldb, ldc, yes_complex )
588 {
589 alpha.s[0] = _alpha;
590 alpha.s[1] = _alpha;
591 beta.s[0] = _beta;
592 beta.s[1] = _beta;
593 }
594
call_gemm()595 void call_gemm()
596 {
597 timer.Start(gemm_timer_id);
598 OPENCL_V_THROW( clblasZgemm(order, transA, transB,
599 M, N, K,
600 alpha,
601 mybuffers.bufA, lda,
602 mybuffers.bufB, ldb,
603 beta,
604 mybuffers.bufC, ldc,
605 1, &queue, 0, NULL, &event),
606 "clblasZgemm" );
607 wait_and_check();
608 timer.Stop(gemm_timer_id);
609 //mybuffers.read_back_result();
610 //mybuffers.printLocalMatrix("C");
611 }
612
clear_buffers()613 void clear_buffers()
614 {
615 mybuffers.initialize_data();
616 }
617
gflops()618 double gflops()
619 {
620 return (8*M*N*K)/time_in_ns();
621 }
622
gflops_formula()623 std::string gflops_formula()
624 {
625 return "(8*M*N*K)/time_in_ns";
626 }
627 };
628