1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 // $Id
19 
20 #ifndef CLBLAS_BENCHMARK_XSYMM_HXX__
21 #define CLBLAS_BENCHMARK_XSYMM_HXX__
22 
23 #include "clfunc_common.hpp"
24 
25 template <typename T>
26 struct xSymmBuffer
27 {
28   clblasOrder order;
29   clblasSide side;
30   clblasUplo uplo;
31   size_t M;
32   size_t N;
33   T alpha;
34   T* cpuA;
35   size_t a_num_vectors;
36   cl_mem A;
37   size_t offa;
38   size_t lda;
39   T* cpuB;
40   cl_mem B;
41   size_t offb;
42   size_t ldb;
43   T beta;
44   T* cpuC;
45   cl_mem C;
46   size_t offc;
47   size_t ldc;
48 }; // struct buffer
49 
50 template <typename T>
51 class xSymm : public clblasFunc
52 {
53 public:
xSymm(StatisticalTimer & timer,cl_device_type devType)54   xSymm(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
55   {
56     timer.getUniqueID("clSymm", 0);
57   }
58 
~xSymm()59   ~xSymm()
60   {
61   }
62 
gflops()63   double gflops()
64   {
65     if (buffer.side == clblasLeft)
66       return static_cast<double>((2 * buffer.M * buffer.M * buffer.N)/time_in_ns());
67     else
68       return static_cast<double>((2 * buffer.N * buffer.N * buffer.M)/time_in_ns());
69   }
70 
gflops_formula()71   std::string gflops_formula()
72   {
73     if (buffer.side == clblasLeft)
74       return "2*M*M*N/time";
75     else
76       return "2*N*N*M/time";
77   }
78 
79   void setup_buffer(int order_option, int side_option, int
80                     uplo_option, int diag_option, int transA_option, int
81                     transB_option, size_t M, size_t N, size_t K,
82                     size_t lda, size_t ldb, size_t ldc,size_t offA,
83 					          size_t offB, size_t offC, double alpha,
84                     double beta);
85   void initialize_cpu_buffer();
86   void initialize_gpu_buffer();
87   void reset_gpu_write_buffer();
88   void call_func();
read_gpu_buffer()89   void read_gpu_buffer()
90 	{
91 		cl_int err;
92 		err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE,
93 			                      buffer.offc * sizeof(T), buffer.ldc * buffer.N *
94                                        sizeof(T),
95 								  buffer.cpuC, 0, NULL, NULL);
96 	}
roundtrip_func()97   void roundtrip_func()
98 	{
99 				std::cout << "xSymm::roundtrip_func\n";
100 	}
zerocopy_roundtrip_func()101 	void zerocopy_roundtrip_func()
102 	{
103 		std::cout << "xSymm::zerocopy_roundtrip_func\n";
104 	}
roundtrip_setup_buffer(int order_option,int side_option,int uplo_option,int diag_option,int transA_option,int transB_option,size_t M,size_t N,size_t K,size_t lda,size_t ldb,size_t ldc,size_t offA,size_t offB,size_t offC,double alpha,double beta)105   void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
106                       int diag_option, int transA_option, int  transB_option,
107                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
108                       size_t ldc, size_t offA, size_t offB, size_t offC,
109                       double alpha, double beta)
110   {
111   initialize_scalars(alpha, beta);
112   buffer.offa = offA;
113   buffer.offb = offB;
114   buffer.offc = offC;
115   buffer.M = M;
116   buffer.N = N;
117   if (order_option == 0)
118   {
119   buffer.order = clblasRowMajor;
120   }
121   else
122   {
123   buffer.order = clblasColumnMajor;
124   }
125   if (uplo_option == 0)
126   {
127     buffer.uplo = clblasUpper;
128   }
129   else
130   {
131     buffer.uplo = clblasLower;
132   }
133   if (side_option == 0)
134   {
135       buffer.side = clblasLeft;
136       buffer.a_num_vectors = M;
137       if (lda == 0)
138       {
139         buffer.lda = buffer.M;
140       }
141       else if (lda < buffer.M)
142       {
143         std::cerr << "lda:wrong size\n";
144         exit(1);
145       }
146       else
147       {
148         buffer.lda = lda;
149       }
150   }
151   else
152   {
153       buffer.side = clblasRight;
154       buffer.a_num_vectors = N;
155       if (lda == 0)
156       {
157         buffer.lda = buffer.N;
158       }
159       else if (lda < buffer.N)
160       {
161         std::cerr << "lda:wrong size\n";
162         exit(1);
163       }
164       else
165       {
166         buffer.lda = lda;
167       }
168   }
169   /*}
170   if (lda == 0)
171   {
172     buffer.lda = buffer.M;
173   }
174   else if (lda < buffer.M)
175   {
176     std::cerr << "lda:wrong size\n";
177     exit(1);
178   }
179   else
180   {
181     buffer.lda = lda;
182   }*/
183   if (ldb == 0)
184   {
185     buffer.ldb = buffer.M;
186   }
187   else if (ldb < buffer.M)
188   {
189     std::cerr << "ldb:wrong size\n";
190     exit(1);
191   }
192   else
193   {
194     buffer.ldb = ldb;
195   }
196   if (ldc == 0)
197   {
198     buffer.ldc = buffer.M;
199   }
200   else if (ldc < buffer.M)
201   {
202     std::cerr << "ldc:wrong size\n";
203     exit(1);
204   }
205   else
206   {
207     buffer.ldc = ldc;
208   }
209   buffer.cpuB = new T[buffer.N * buffer.ldb];
210   buffer.cpuC = new T[buffer.N * buffer.ldc];
211   buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
212   }
releaseGPUBuffer_deleteCPUBuffer()213   	void releaseGPUBuffer_deleteCPUBuffer()
214 	{
215 		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
216 		//need to do this before we eventually hit the destructor
217 		delete buffer.cpuA;
218 		delete buffer.cpuB;
219 		delete buffer.cpuC;
220 		OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
221 		OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
222 		OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
223 	}
224 protected:
initialize_scalars(double alpha,double beta)225   void initialize_scalars(double alpha, double beta)
226   {
227       buffer.alpha = makeScalar<T>(alpha);
228       buffer.beta = makeScalar<T>(beta);
229   }
230 
231 private:
232   xSymmBuffer<T> buffer;
233 };
234 
235 template <typename T>
setup_buffer(int order_option,int side_option,int uplo_option,int diag_option,int transA_option,int transB_option,size_t M,size_t N,size_t K,size_t lda,size_t ldb,size_t ldc,size_t offA,size_t offB,size_t offC,double alpha,double beta)236 void xSymm<T>::setup_buffer(int order_option, int side_option, int
237                     uplo_option, int diag_option, int transA_option, int
238                     transB_option, size_t M, size_t N, size_t K,
239                     size_t lda, size_t ldb, size_t ldc,size_t offA,
240 					          size_t offB, size_t offC, double alpha,
241                     double beta)
242 {
243   initialize_scalars(alpha, beta);
244   buffer.offa = offA;
245   buffer.offb = offB;
246   buffer.offc = offC;
247   buffer.M = M;
248   buffer.N = N;
249   if (order_option == 0)
250   {
251   buffer.order = clblasRowMajor;
252   }
253   else
254   {
255   buffer.order = clblasColumnMajor;
256   }
257   if (uplo_option == 0)
258   {
259     buffer.uplo = clblasUpper;
260   }
261   else
262   {
263     buffer.uplo = clblasLower;
264   }
265   if (side_option == 0)
266   {
267       buffer.side = clblasLeft;
268       buffer.a_num_vectors = M;
269       if (lda == 0)
270       {
271         buffer.lda = buffer.M;
272       }
273       else if (lda < buffer.M)
274       {
275         std::cerr << "lda:wrong size\n";
276         exit(1);
277       }
278       else
279       {
280         buffer.lda = lda;
281       }
282   }
283   else
284   {
285       buffer.side = clblasRight;
286       buffer.a_num_vectors = N;
287       if (lda == 0)
288       {
289         buffer.lda = buffer.N;
290       }
291       else if (lda < buffer.N)
292       {
293         std::cerr << "lda:wrong size\n";
294         exit(1);
295       }
296       else
297       {
298         buffer.lda = lda;
299       }
300   }
301   /*}
302   if (lda == 0)
303   {
304     buffer.lda = buffer.M;
305   }
306   else if (lda < buffer.M)
307   {
308     std::cerr << "lda:wrong size\n";
309     exit(1);
310   }
311   else
312   {
313     buffer.lda = lda;
314   }*/
315   if (ldb == 0)
316   {
317     buffer.ldb = buffer.M;
318   }
319   else if (ldb < buffer.M)
320   {
321     std::cerr << "ldb:wrong size\n";
322     exit(1);
323   }
324   else
325   {
326     buffer.ldb = ldb;
327   }
328   if (ldc == 0)
329   {
330     buffer.ldc = buffer.M;
331   }
332   else if (ldc < buffer.M)
333   {
334     std::cerr << "ldc:wrong size\n";
335     exit(1);
336   }
337   else
338   {
339     buffer.ldc = ldc;
340   }
341   buffer.cpuB = new T[buffer.N * buffer.ldb];
342   buffer.cpuC = new T[buffer.N * buffer.ldc];
343   buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
344   cl_int err;
345   buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
346                                 buffer.a_num_vectors * buffer.lda*sizeof(T),
347                                 NULL, &err);
348 
349   buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
350                                     buffer.N*buffer.ldb*sizeof(T),
351                                     NULL, &err);
352   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
353                                     buffer.N*buffer.ldc*sizeof(T),
354                                     NULL, &err);
355 }
356 
357 template <typename T>
initialize_cpu_buffer()358 void xSymm<T>::initialize_cpu_buffer()
359 {
360   srand(10);
361   for (size_t i = 0; i < buffer.a_num_vectors; ++i)
362   {
363     for (size_t j = 0; j < buffer.lda; ++j)
364     {
365         buffer.cpuA[i*buffer.lda+j] = random<T>(UPPER_BOUND<T>()) /
366                                         randomScale<T>();
367     }
368   }
369   for (size_t i = 0; i < buffer.N; ++i)
370   {
371     for (size_t j = 0; j < buffer.ldb; ++j)
372     {
373         buffer.cpuB[i*buffer.ldb+j] = random<T>(UPPER_BOUND<T>()) /
374                                       randomScale<T>();
375     }
376   }
377   for (size_t i = 0; i < buffer.N; ++i)
378   {
379     for (size_t j = 0; j < buffer.ldc; ++j)
380     {
381         buffer.cpuC[i*buffer.ldc+j] = random<T>(UPPER_BOUND<T>()) /
382                                       randomScale<T>();
383     }
384   }
385 }
386 
387 template <typename T>
initialize_gpu_buffer()388 void xSymm<T>::initialize_gpu_buffer()
389 {
390   cl_int err;
391 
392   err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE,
393                               buffer.offa * sizeof(T),
394                               buffer.a_num_vectors * buffer.lda*sizeof(T),
395                               buffer.cpuA, 0, NULL, NULL);
396   err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0,
397                               buffer.ldb*buffer.N*sizeof(T),
398                               buffer.cpuB, 0, NULL, NULL);
399   err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
400                               buffer.ldc*buffer.N*sizeof(T),
401                               buffer.cpuC, 0, NULL, NULL);
402 }
403 
404 template <typename T>
reset_gpu_write_buffer()405 void xSymm<T>::reset_gpu_write_buffer()
406 {
407   cl_int err;
408   err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
409                               buffer.ldc*buffer.N*sizeof(T),
410                               buffer.cpuC, 0, NULL, NULL);
411 }
412 
413 template <>
call_func()414 void xSymm<cl_float>::call_func()
415 {
416   timer.Start(timer_id);
417   clblasSsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
418       buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
419       buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
420       0, NULL,&event_);
421   clWaitForEvents(1, &event_);
422   timer.Stop(timer_id);
423 }
424 
425 template <>
roundtrip_func()426 void xSymm<cl_float>::roundtrip_func()
427 {
428   timer.Start(timer_id);
429   //set up buffer
430     cl_int err;
431   buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
432                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_float),
433                                 NULL, &err);
434 
435   buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
436                                     buffer.N*buffer.ldb*sizeof(cl_float),
437                                     NULL, &err);
438   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
439                                     buffer.N*buffer.ldc*sizeof(cl_float),
440                                     NULL, &err);
441   //initialize gpu buffer
442   err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE,
443                               buffer.offa * sizeof(cl_float),
444                               buffer.a_num_vectors * buffer.lda*sizeof(cl_float),
445                               buffer.cpuA, 0, NULL, NULL);
446   err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0,
447                               buffer.ldb*buffer.N*sizeof(cl_float),
448                               buffer.cpuB, 0, NULL, NULL);
449   err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
450                               buffer.ldc*buffer.N*sizeof(cl_float),
451                               buffer.cpuC, 0, NULL, NULL);
452   //call func
453   clblasSsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
454       buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
455       buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
456       0, NULL,NULL);
457   //read gpu buffer
458   err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE,
459 			                      buffer.offc * sizeof(cl_float), buffer.ldc * buffer.N *
460                                        sizeof(cl_float),
461 								  buffer.cpuC, 0, NULL, &event_);
462   clWaitForEvents(1, &event_);
463   timer.Stop(timer_id);
464 }
465 
466 template <>
call_func()467 void xSymm<cl_double>::call_func()
468 {
469   timer.Start(timer_id);
470   clblasDsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
471       buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
472       buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
473       0, NULL,&event_);
474   clWaitForEvents(1, &event_);
475   timer.Stop(timer_id);
476 }
477 
478 template <>
roundtrip_func()479 void xSymm<cl_double>::roundtrip_func()
480 {
481   timer.Start(timer_id);
482   //set up buffer
483     cl_int err;
484   buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
485                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_double),
486                                 NULL, &err);
487 
488   buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
489                                     buffer.N*buffer.ldb*sizeof(cl_double),
490                                     NULL, &err);
491   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
492                                     buffer.N*buffer.ldc*sizeof(cl_double),
493                                     NULL, &err);
494   //initialize gpu buffer
495   err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE,
496                               buffer.offa * sizeof(cl_double),
497                               buffer.a_num_vectors * buffer.lda*sizeof(cl_double),
498                               buffer.cpuA, 0, NULL, NULL);
499   err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0,
500                               buffer.ldb*buffer.N*sizeof(cl_double),
501                               buffer.cpuB, 0, NULL, NULL);
502   err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
503                               buffer.ldc*buffer.N*sizeof(cl_double),
504                               buffer.cpuC, 0, NULL, NULL);
505   //call func
506   clblasDsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
507       buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
508       buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
509       0, NULL,NULL);
510   //read gpu buffer
511   err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE,
512 			                      buffer.offc * sizeof(cl_double), buffer.ldc * buffer.N *
513                                        sizeof(cl_double),
514 								  buffer.cpuC, 0, NULL, &event_);
515   clWaitForEvents(1, &event_);
516   timer.Stop(timer_id);
517 }
518 
519 template <>
call_func()520 void xSymm<cl_float2>::call_func()
521 {
522   timer.Start(timer_id);
523   clblasCsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
524       buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
525       buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
526       0, NULL,&event_);
527   clWaitForEvents(1, &event_);
528   timer.Stop(timer_id);
529 }
530 
531 template <>
roundtrip_func()532 void xSymm<cl_float2>::roundtrip_func()
533 {
534   timer.Start(timer_id);
535   //set up buffer
536     cl_int err;
537   buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
538                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
539                                 NULL, &err);
540 
541   buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
542                                     buffer.N*buffer.ldb*sizeof(cl_float2),
543                                     NULL, &err);
544   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
545                                     buffer.N*buffer.ldc*sizeof(cl_float2),
546                                     NULL, &err);
547   //initialize gpu buffer
548   err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE,
549                               buffer.offa * sizeof(cl_float2),
550                               buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
551                               buffer.cpuA, 0, NULL, NULL);
552   err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0,
553                               buffer.ldb*buffer.N*sizeof(cl_float2),
554                               buffer.cpuB, 0, NULL, NULL);
555   err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
556                               buffer.ldc*buffer.N*sizeof(cl_float2),
557                               buffer.cpuC, 0, NULL, NULL);
558   //call func
559   clblasCsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
560       buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
561       buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
562       0, NULL,NULL);
563   //read gpu buffer
564   err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE,
565 			                      buffer.offc * sizeof(cl_float2), buffer.ldc * buffer.N *
566                                        sizeof(cl_float2),
567 								  buffer.cpuC, 0, NULL, &event_);
568   clWaitForEvents(1, &event_);
569   timer.Stop(timer_id);
570 }
571 
572 template <>
call_func()573 void xSymm<cl_double2>::call_func()
574 {
575   timer.Start(timer_id);
576   clblasZsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
577       buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
578       buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
579       0, NULL,&event_);
580   clWaitForEvents(1, &event_);
581   timer.Stop(timer_id);
582 }
583 
584 template <>
roundtrip_func()585 void xSymm<cl_double2>::roundtrip_func()
586 {
587   timer.Start(timer_id);
588   //set up buffer
589   cl_int err;
590   buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
591                                 buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
592                                 NULL, &err);
593 
594   buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
595                                     buffer.N*buffer.ldb*sizeof(cl_double2),
596                                     NULL, &err);
597   buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
598                                     buffer.N*buffer.ldc*sizeof(cl_double2),
599                                     NULL, &err);
600   //initialize gpu buffer
601   err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE,
602                               buffer.offa * sizeof(cl_double2),
603                               buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
604                               buffer.cpuA, 0, NULL, NULL);
605   err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0,
606                               buffer.ldb*buffer.N*sizeof(cl_double2),
607                               buffer.cpuB, 0, NULL, NULL);
608   err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
609                               buffer.ldc*buffer.N*sizeof(cl_double2),
610                               buffer.cpuC, 0, NULL, NULL);
611   //call func
612   clblasZsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
613       buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
614       buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
615       0, NULL,NULL);
616   //read gpu buffer
617   err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE,
618 			                      buffer.offc * sizeof(cl_double2), buffer.ldc * buffer.N *
619                                        sizeof(cl_double2),
620 								  buffer.cpuC, 0, NULL, &event_);
621   clWaitForEvents(1, &event_);
622   timer.Stop(timer_id);
623 }
624 
625 template<>
626 double
627 xSymm<cl_float2>::
gflops()628 gflops()
629 {
630   if (buffer.side == clblasLeft)
631     return static_cast<double>((8 * buffer.M * buffer.M * buffer.N)/time_in_ns());
632   else
633     return static_cast<double>((8 * buffer.N * buffer.N * buffer.M)/time_in_ns());
634 }
635 
636 template<>
637 double
638 xSymm<cl_double2>::
gflops()639 gflops()
640 {
641   if (buffer.side == clblasLeft)
642       return static_cast<double>((8 * buffer.M * buffer.M * buffer.N)/time_in_ns());
643   else
644       return static_cast<double>((8 * buffer.N * buffer.N * buffer.M)/time_in_ns());
645 }
646 
647 template<>
648 std::string
649 xSymm<cl_float2>::
gflops_formula()650 gflops_formula()
651 {
652   if (buffer.side == clblasLeft)
653       return "8*M*M*N/time";
654   else
655       return "8*N*N*M/time";
656 }
657 
658 template<>
659 std::string
660 xSymm<cl_double2>::
gflops_formula()661 gflops_formula()
662 {
663   if (buffer.side == clblasLeft)
664       return "8*M*M*N/time";
665   else
666       return "8*N*N*M/time";
667 }
668 
#endif // ifndef CLBLAS_BENCHMARK_XSYMM_HXX__