1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 // $Id
19
20 #ifndef CLBLAS_BENCHMARK_XSYMM_HXX__
21 #define CLBLAS_BENCHMARK_XSYMM_HXX__
22
23 #include "clfunc_common.hpp"
24
25 template <typename T>
26 struct xSymmBuffer
27 {
28 clblasOrder order;
29 clblasSide side;
30 clblasUplo uplo;
31 size_t M;
32 size_t N;
33 T alpha;
34 T* cpuA;
35 size_t a_num_vectors;
36 cl_mem A;
37 size_t offa;
38 size_t lda;
39 T* cpuB;
40 cl_mem B;
41 size_t offb;
42 size_t ldb;
43 T beta;
44 T* cpuC;
45 cl_mem C;
46 size_t offc;
47 size_t ldc;
48 }; // struct buffer
49
50 template <typename T>
51 class xSymm : public clblasFunc
52 {
53 public:
xSymm(StatisticalTimer & timer,cl_device_type devType)54 xSymm(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType)
55 {
56 timer.getUniqueID("clSymm", 0);
57 }
58
~xSymm()59 ~xSymm()
60 {
61 }
62
gflops()63 double gflops()
64 {
65 if (buffer.side == clblasLeft)
66 return static_cast<double>((2 * buffer.M * buffer.M * buffer.N)/time_in_ns());
67 else
68 return static_cast<double>((2 * buffer.N * buffer.N * buffer.M)/time_in_ns());
69 }
70
gflops_formula()71 std::string gflops_formula()
72 {
73 if (buffer.side == clblasLeft)
74 return "2*M*M*N/time";
75 else
76 return "2*N*N*M/time";
77 }
78
79 void setup_buffer(int order_option, int side_option, int
80 uplo_option, int diag_option, int transA_option, int
81 transB_option, size_t M, size_t N, size_t K,
82 size_t lda, size_t ldb, size_t ldc,size_t offA,
83 size_t offB, size_t offC, double alpha,
84 double beta);
85 void initialize_cpu_buffer();
86 void initialize_gpu_buffer();
87 void reset_gpu_write_buffer();
88 void call_func();
read_gpu_buffer()89 void read_gpu_buffer()
90 {
91 cl_int err;
92 err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE,
93 buffer.offc * sizeof(T), buffer.ldc * buffer.N *
94 sizeof(T),
95 buffer.cpuC, 0, NULL, NULL);
96 }
roundtrip_func()97 void roundtrip_func()
98 {
99 std::cout << "xSymm::roundtrip_func\n";
100 }
zerocopy_roundtrip_func()101 void zerocopy_roundtrip_func()
102 {
103 std::cout << "xSymm::zerocopy_roundtrip_func\n";
104 }
roundtrip_setup_buffer(int order_option,int side_option,int uplo_option,int diag_option,int transA_option,int transB_option,size_t M,size_t N,size_t K,size_t lda,size_t ldb,size_t ldc,size_t offA,size_t offB,size_t offC,double alpha,double beta)105 void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
106 int diag_option, int transA_option, int transB_option,
107 size_t M, size_t N, size_t K, size_t lda, size_t ldb,
108 size_t ldc, size_t offA, size_t offB, size_t offC,
109 double alpha, double beta)
110 {
111 initialize_scalars(alpha, beta);
112 buffer.offa = offA;
113 buffer.offb = offB;
114 buffer.offc = offC;
115 buffer.M = M;
116 buffer.N = N;
117 if (order_option == 0)
118 {
119 buffer.order = clblasRowMajor;
120 }
121 else
122 {
123 buffer.order = clblasColumnMajor;
124 }
125 if (uplo_option == 0)
126 {
127 buffer.uplo = clblasUpper;
128 }
129 else
130 {
131 buffer.uplo = clblasLower;
132 }
133 if (side_option == 0)
134 {
135 buffer.side = clblasLeft;
136 buffer.a_num_vectors = M;
137 if (lda == 0)
138 {
139 buffer.lda = buffer.M;
140 }
141 else if (lda < buffer.M)
142 {
143 std::cerr << "lda:wrong size\n";
144 exit(1);
145 }
146 else
147 {
148 buffer.lda = lda;
149 }
150 }
151 else
152 {
153 buffer.side = clblasRight;
154 buffer.a_num_vectors = N;
155 if (lda == 0)
156 {
157 buffer.lda = buffer.N;
158 }
159 else if (lda < buffer.N)
160 {
161 std::cerr << "lda:wrong size\n";
162 exit(1);
163 }
164 else
165 {
166 buffer.lda = lda;
167 }
168 }
169 /*}
170 if (lda == 0)
171 {
172 buffer.lda = buffer.M;
173 }
174 else if (lda < buffer.M)
175 {
176 std::cerr << "lda:wrong size\n";
177 exit(1);
178 }
179 else
180 {
181 buffer.lda = lda;
182 }*/
183 if (ldb == 0)
184 {
185 buffer.ldb = buffer.M;
186 }
187 else if (ldb < buffer.M)
188 {
189 std::cerr << "ldb:wrong size\n";
190 exit(1);
191 }
192 else
193 {
194 buffer.ldb = ldb;
195 }
196 if (ldc == 0)
197 {
198 buffer.ldc = buffer.M;
199 }
200 else if (ldc < buffer.M)
201 {
202 std::cerr << "ldc:wrong size\n";
203 exit(1);
204 }
205 else
206 {
207 buffer.ldc = ldc;
208 }
209 buffer.cpuB = new T[buffer.N * buffer.ldb];
210 buffer.cpuC = new T[buffer.N * buffer.ldc];
211 buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
212 }
releaseGPUBuffer_deleteCPUBuffer()213 void releaseGPUBuffer_deleteCPUBuffer()
214 {
215 //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
216 //need to do this before we eventually hit the destructor
217 delete buffer.cpuA;
218 delete buffer.cpuB;
219 delete buffer.cpuC;
220 OPENCL_V_THROW( clReleaseMemObject(buffer.A), "releasing buffer A");
221 OPENCL_V_THROW( clReleaseMemObject(buffer.B), "releasing buffer B");
222 OPENCL_V_THROW( clReleaseMemObject(buffer.C), "releasing buffer C");
223 }
224 protected:
initialize_scalars(double alpha,double beta)225 void initialize_scalars(double alpha, double beta)
226 {
227 buffer.alpha = makeScalar<T>(alpha);
228 buffer.beta = makeScalar<T>(beta);
229 }
230
231 private:
232 xSymmBuffer<T> buffer;
233 };
234
235 template <typename T>
setup_buffer(int order_option,int side_option,int uplo_option,int diag_option,int transA_option,int transB_option,size_t M,size_t N,size_t K,size_t lda,size_t ldb,size_t ldc,size_t offA,size_t offB,size_t offC,double alpha,double beta)236 void xSymm<T>::setup_buffer(int order_option, int side_option, int
237 uplo_option, int diag_option, int transA_option, int
238 transB_option, size_t M, size_t N, size_t K,
239 size_t lda, size_t ldb, size_t ldc,size_t offA,
240 size_t offB, size_t offC, double alpha,
241 double beta)
242 {
243 initialize_scalars(alpha, beta);
244 buffer.offa = offA;
245 buffer.offb = offB;
246 buffer.offc = offC;
247 buffer.M = M;
248 buffer.N = N;
249 if (order_option == 0)
250 {
251 buffer.order = clblasRowMajor;
252 }
253 else
254 {
255 buffer.order = clblasColumnMajor;
256 }
257 if (uplo_option == 0)
258 {
259 buffer.uplo = clblasUpper;
260 }
261 else
262 {
263 buffer.uplo = clblasLower;
264 }
265 if (side_option == 0)
266 {
267 buffer.side = clblasLeft;
268 buffer.a_num_vectors = M;
269 if (lda == 0)
270 {
271 buffer.lda = buffer.M;
272 }
273 else if (lda < buffer.M)
274 {
275 std::cerr << "lda:wrong size\n";
276 exit(1);
277 }
278 else
279 {
280 buffer.lda = lda;
281 }
282 }
283 else
284 {
285 buffer.side = clblasRight;
286 buffer.a_num_vectors = N;
287 if (lda == 0)
288 {
289 buffer.lda = buffer.N;
290 }
291 else if (lda < buffer.N)
292 {
293 std::cerr << "lda:wrong size\n";
294 exit(1);
295 }
296 else
297 {
298 buffer.lda = lda;
299 }
300 }
301 /*}
302 if (lda == 0)
303 {
304 buffer.lda = buffer.M;
305 }
306 else if (lda < buffer.M)
307 {
308 std::cerr << "lda:wrong size\n";
309 exit(1);
310 }
311 else
312 {
313 buffer.lda = lda;
314 }*/
315 if (ldb == 0)
316 {
317 buffer.ldb = buffer.M;
318 }
319 else if (ldb < buffer.M)
320 {
321 std::cerr << "ldb:wrong size\n";
322 exit(1);
323 }
324 else
325 {
326 buffer.ldb = ldb;
327 }
328 if (ldc == 0)
329 {
330 buffer.ldc = buffer.M;
331 }
332 else if (ldc < buffer.M)
333 {
334 std::cerr << "ldc:wrong size\n";
335 exit(1);
336 }
337 else
338 {
339 buffer.ldc = ldc;
340 }
341 buffer.cpuB = new T[buffer.N * buffer.ldb];
342 buffer.cpuC = new T[buffer.N * buffer.ldc];
343 buffer.cpuA = new T[buffer.a_num_vectors * buffer.lda];
344 cl_int err;
345 buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
346 buffer.a_num_vectors * buffer.lda*sizeof(T),
347 NULL, &err);
348
349 buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
350 buffer.N*buffer.ldb*sizeof(T),
351 NULL, &err);
352 buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
353 buffer.N*buffer.ldc*sizeof(T),
354 NULL, &err);
355 }
356
357 template <typename T>
initialize_cpu_buffer()358 void xSymm<T>::initialize_cpu_buffer()
359 {
360 srand(10);
361 for (size_t i = 0; i < buffer.a_num_vectors; ++i)
362 {
363 for (size_t j = 0; j < buffer.lda; ++j)
364 {
365 buffer.cpuA[i*buffer.lda+j] = random<T>(UPPER_BOUND<T>()) /
366 randomScale<T>();
367 }
368 }
369 for (size_t i = 0; i < buffer.N; ++i)
370 {
371 for (size_t j = 0; j < buffer.ldb; ++j)
372 {
373 buffer.cpuB[i*buffer.ldb+j] = random<T>(UPPER_BOUND<T>()) /
374 randomScale<T>();
375 }
376 }
377 for (size_t i = 0; i < buffer.N; ++i)
378 {
379 for (size_t j = 0; j < buffer.ldc; ++j)
380 {
381 buffer.cpuC[i*buffer.ldc+j] = random<T>(UPPER_BOUND<T>()) /
382 randomScale<T>();
383 }
384 }
385 }
386
387 template <typename T>
initialize_gpu_buffer()388 void xSymm<T>::initialize_gpu_buffer()
389 {
390 cl_int err;
391
392 err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE,
393 buffer.offa * sizeof(T),
394 buffer.a_num_vectors * buffer.lda*sizeof(T),
395 buffer.cpuA, 0, NULL, NULL);
396 err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0,
397 buffer.ldb*buffer.N*sizeof(T),
398 buffer.cpuB, 0, NULL, NULL);
399 err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
400 buffer.ldc*buffer.N*sizeof(T),
401 buffer.cpuC, 0, NULL, NULL);
402 }
403
404 template <typename T>
reset_gpu_write_buffer()405 void xSymm<T>::reset_gpu_write_buffer()
406 {
407 cl_int err;
408 err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
409 buffer.ldc*buffer.N*sizeof(T),
410 buffer.cpuC, 0, NULL, NULL);
411 }
412
413 template <>
call_func()414 void xSymm<cl_float>::call_func()
415 {
416 timer.Start(timer_id);
417 clblasSsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
418 buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
419 buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
420 0, NULL,&event_);
421 clWaitForEvents(1, &event_);
422 timer.Stop(timer_id);
423 }
424
425 template <>
roundtrip_func()426 void xSymm<cl_float>::roundtrip_func()
427 {
428 timer.Start(timer_id);
429 //set up buffer
430 cl_int err;
431 buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
432 buffer.a_num_vectors * buffer.lda*sizeof(cl_float),
433 NULL, &err);
434
435 buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
436 buffer.N*buffer.ldb*sizeof(cl_float),
437 NULL, &err);
438 buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
439 buffer.N*buffer.ldc*sizeof(cl_float),
440 NULL, &err);
441 //initialize gpu buffer
442 err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE,
443 buffer.offa * sizeof(cl_float),
444 buffer.a_num_vectors * buffer.lda*sizeof(cl_float),
445 buffer.cpuA, 0, NULL, NULL);
446 err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0,
447 buffer.ldb*buffer.N*sizeof(cl_float),
448 buffer.cpuB, 0, NULL, NULL);
449 err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
450 buffer.ldc*buffer.N*sizeof(cl_float),
451 buffer.cpuC, 0, NULL, NULL);
452 //call func
453 clblasSsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
454 buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
455 buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
456 0, NULL,NULL);
457 //read gpu buffer
458 err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE,
459 buffer.offc * sizeof(cl_float), buffer.ldc * buffer.N *
460 sizeof(cl_float),
461 buffer.cpuC, 0, NULL, &event_);
462 clWaitForEvents(1, &event_);
463 timer.Stop(timer_id);
464 }
465
466 template <>
call_func()467 void xSymm<cl_double>::call_func()
468 {
469 timer.Start(timer_id);
470 clblasDsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
471 buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
472 buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
473 0, NULL,&event_);
474 clWaitForEvents(1, &event_);
475 timer.Stop(timer_id);
476 }
477
478 template <>
roundtrip_func()479 void xSymm<cl_double>::roundtrip_func()
480 {
481 timer.Start(timer_id);
482 //set up buffer
483 cl_int err;
484 buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
485 buffer.a_num_vectors * buffer.lda*sizeof(cl_double),
486 NULL, &err);
487
488 buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
489 buffer.N*buffer.ldb*sizeof(cl_double),
490 NULL, &err);
491 buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
492 buffer.N*buffer.ldc*sizeof(cl_double),
493 NULL, &err);
494 //initialize gpu buffer
495 err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE,
496 buffer.offa * sizeof(cl_double),
497 buffer.a_num_vectors * buffer.lda*sizeof(cl_double),
498 buffer.cpuA, 0, NULL, NULL);
499 err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0,
500 buffer.ldb*buffer.N*sizeof(cl_double),
501 buffer.cpuB, 0, NULL, NULL);
502 err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
503 buffer.ldc*buffer.N*sizeof(cl_double),
504 buffer.cpuC, 0, NULL, NULL);
505 //call func
506 clblasDsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
507 buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
508 buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
509 0, NULL,NULL);
510 //read gpu buffer
511 err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE,
512 buffer.offc * sizeof(cl_double), buffer.ldc * buffer.N *
513 sizeof(cl_double),
514 buffer.cpuC, 0, NULL, &event_);
515 clWaitForEvents(1, &event_);
516 timer.Stop(timer_id);
517 }
518
519 template <>
call_func()520 void xSymm<cl_float2>::call_func()
521 {
522 timer.Start(timer_id);
523 clblasCsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
524 buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
525 buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
526 0, NULL,&event_);
527 clWaitForEvents(1, &event_);
528 timer.Stop(timer_id);
529 }
530
531 template <>
roundtrip_func()532 void xSymm<cl_float2>::roundtrip_func()
533 {
534 timer.Start(timer_id);
535 //set up buffer
536 cl_int err;
537 buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
538 buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
539 NULL, &err);
540
541 buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
542 buffer.N*buffer.ldb*sizeof(cl_float2),
543 NULL, &err);
544 buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
545 buffer.N*buffer.ldc*sizeof(cl_float2),
546 NULL, &err);
547 //initialize gpu buffer
548 err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE,
549 buffer.offa * sizeof(cl_float2),
550 buffer.a_num_vectors * buffer.lda*sizeof(cl_float2),
551 buffer.cpuA, 0, NULL, NULL);
552 err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0,
553 buffer.ldb*buffer.N*sizeof(cl_float2),
554 buffer.cpuB, 0, NULL, NULL);
555 err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
556 buffer.ldc*buffer.N*sizeof(cl_float2),
557 buffer.cpuC, 0, NULL, NULL);
558 //call func
559 clblasCsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
560 buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
561 buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
562 0, NULL,NULL);
563 //read gpu buffer
564 err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE,
565 buffer.offc * sizeof(cl_float2), buffer.ldc * buffer.N *
566 sizeof(cl_float2),
567 buffer.cpuC, 0, NULL, &event_);
568 clWaitForEvents(1, &event_);
569 timer.Stop(timer_id);
570 }
571
572 template <>
call_func()573 void xSymm<cl_double2>::call_func()
574 {
575 timer.Start(timer_id);
576 clblasZsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
577 buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
578 buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
579 0, NULL,&event_);
580 clWaitForEvents(1, &event_);
581 timer.Stop(timer_id);
582 }
583
584 template <>
roundtrip_func()585 void xSymm<cl_double2>::roundtrip_func()
586 {
587 timer.Start(timer_id);
588 //set up buffer
589 cl_int err;
590 buffer.A = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
591 buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
592 NULL, &err);
593
594 buffer.B = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
595 buffer.N*buffer.ldb*sizeof(cl_double2),
596 NULL, &err);
597 buffer.C = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
598 buffer.N*buffer.ldc*sizeof(cl_double2),
599 NULL, &err);
600 //initialize gpu buffer
601 err = clEnqueueWriteBuffer(queues_[0], buffer.A, CL_TRUE,
602 buffer.offa * sizeof(cl_double2),
603 buffer.a_num_vectors * buffer.lda*sizeof(cl_double2),
604 buffer.cpuA, 0, NULL, NULL);
605 err = clEnqueueWriteBuffer(queues_[0], buffer.B, CL_TRUE, 0,
606 buffer.ldb*buffer.N*sizeof(cl_double2),
607 buffer.cpuB, 0, NULL, NULL);
608 err = clEnqueueWriteBuffer(queues_[0], buffer.C, CL_TRUE, 0,
609 buffer.ldc*buffer.N*sizeof(cl_double2),
610 buffer.cpuC, 0, NULL, NULL);
611 //call func
612 clblasZsymm(buffer.order, buffer.side, buffer.uplo, buffer.M, buffer.N,
613 buffer.alpha, buffer.A, buffer.offa, buffer.lda, buffer.B, buffer.offb,
614 buffer.ldb, buffer.beta, buffer.C, buffer.offc, buffer.ldc, numQueues, queues_,
615 0, NULL,NULL);
616 //read gpu buffer
617 err = clEnqueueReadBuffer(queues_[0], buffer.C, CL_TRUE,
618 buffer.offc * sizeof(cl_double2), buffer.ldc * buffer.N *
619 sizeof(cl_double2),
620 buffer.cpuC, 0, NULL, &event_);
621 clWaitForEvents(1, &event_);
622 timer.Stop(timer_id);
623 }
624
625 template<>
626 double
627 xSymm<cl_float2>::
gflops()628 gflops()
629 {
630 if (buffer.side == clblasLeft)
631 return static_cast<double>((8 * buffer.M * buffer.M * buffer.N)/time_in_ns());
632 else
633 return static_cast<double>((8 * buffer.N * buffer.N * buffer.M)/time_in_ns());
634 }
635
636 template<>
637 double
638 xSymm<cl_double2>::
gflops()639 gflops()
640 {
641 if (buffer.side == clblasLeft)
642 return static_cast<double>((8 * buffer.M * buffer.M * buffer.N)/time_in_ns());
643 else
644 return static_cast<double>((8 * buffer.N * buffer.N * buffer.M)/time_in_ns());
645 }
646
647 template<>
648 std::string
649 xSymm<cl_float2>::
gflops_formula()650 gflops_formula()
651 {
652 if (buffer.side == clblasLeft)
653 return "8*M*M*N/time";
654 else
655 return "8*N*N*M/time";
656 }
657
658 template<>
659 std::string
660 xSymm<cl_double2>::
gflops_formula()661 gflops_formula()
662 {
663 if (buffer.side == clblasLeft)
664 return "8*M*M*N/time";
665 else
666 return "8*N*N*M/time";
667 }
668
#endif // ifndef CLBLAS_BENCHMARK_XSYMM_HXX__