1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 // $Id
19
20 #ifndef CLBLAS_BENCHMARK_XHERK_HXX__
21 #define CLBLAS_BENCHMARK_XHERK_HXX__
22
23 #include "clfunc_common.hpp"
24
// Plain aggregate holding the parameters, device handles and host arrays
// that xHerk<T> uses for one HERK benchmark run. T is the element type of
// the matrices; this file provides clBLAS calls for cl_float2 and cl_double2.
template <typename T>
struct xHerkBuffer
{
    // NOTE(review): xHerk never touches buffer_.order_; it assigns an
    // `order_` member that is not declared in this file (presumably in
    // clblasFunc) -- confirm whether this field is still needed.
    clblasOrder order_;
    clblasUplo uplo_;           // clblasUpper when uplo_option == 0, clblasLower otherwise
    clblasTranspose transA_;    // transpose mode handed to the clBLAS HERK call
    size_t N_;                  // N argument of the HERK call; also the number of C vectors
    size_t K_;                  // K argument of the HERK call
    T alpha_;                   // built by makeScalar<T>(alpha); only component s[0] is passed on
    cl_mem A_;                  // device buffer for A
    size_t offa_;               // offset of A inside A_, counted in elements of T
    size_t lda_;                // leading dimension of A, in elements
    T beta_;                    // built by makeScalar<T>(beta); only component s[0] is passed on
    cl_mem C_;                  // device buffer for C
    size_t offc_;               // offset of C inside C_, counted in elements of T
    size_t ldc_;                // leading dimension of C, in elements
    size_t a_num_vectors_;      // number of lda_-long vectors stored for A (N or K, see setup)
    size_t c_num_vectors_;      // number of ldc_-long vectors stored for C (always N)
    T* cpuA_;                   // host copy of A, lda_ * a_num_vectors_ elements (new[])
    T* cpuC_;                   // host copy of C, ldc_ * c_num_vectors_ elements (new[])
}; // struct buffer
46
47 template <typename T>
48 class xHerk : public clblasFunc
49 {
50 public:
xHerk(StatisticalTimer & timer,cl_device_type devType)51 xHerk(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer, devType)
52 {
53 timer.getUniqueID("clHerk", 0);
54 }
55
~xHerk()56 ~xHerk()
57 {
58 }
59
gflops()60 double gflops()
61 {
62 return static_cast<double>(4*(buffer_.K_ * buffer_.N_ * (buffer_.N_+1))/time_in_ns());
63 }
64
gflops_formula()65 std::string gflops_formula()
66 {
67 return "4*K*N*(N+1)/time";
68 }
69
setup_buffer(int order_option,int side_option,int uplo_option,int diag_option,int transA_option,int transB_option,size_t M,size_t N,size_t K,size_t lda,size_t ldb,size_t ldc,size_t offA,size_t offB,size_t offC,double alpha,double beta)70 void setup_buffer(int order_option, int side_option, int
71 uplo_option, int diag_option, int transA_option, int
72 transB_option, size_t M, size_t N, size_t K,
73 size_t lda, size_t ldb, size_t ldc,size_t offA,
74 size_t offB, size_t offC, double alpha,
75 double beta)
76 {
77 DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
78 DUMMY_ARGS_USAGE_2(ldb, offB);
79
80 initialize_scalars(alpha,beta);
81
82 buffer_.N_ = N;
83 buffer_.K_ = K;
84 buffer_.offa_ = offA;
85 buffer_.offc_ = offC;
86
87 if (uplo_option == 0)
88 {
89 buffer_.uplo_ = clblasUpper;
90 }
91 else
92 {
93 buffer_.uplo_ = clblasLower;
94 }
95
96 if (ldc == 0)
97 {
98 buffer_.ldc_ = N;
99 }
100 else if (ldc < N)
101 {
102 std::cerr << "ldc:wrong size\n";
103 }
104 else
105 {
106 buffer_.ldc_ = ldc;
107 }
108
109 buffer_.c_num_vectors_ = N;
110
111 if (order_option == 0)
112 {
113 order_ = clblasRowMajor;
114 if (transA_option == 0)
115 {
116 buffer_.transA_ = clblasNoTrans;
117 buffer_.a_num_vectors_ = N;
118 if (lda == 0)
119 {
120 buffer_.lda_ = K;
121 }
122 else if (lda < K)
123 {
124 std::cerr << "lda:wrong size\n";
125 exit(1);
126 }
127 else
128 {
129 buffer_.lda_ = lda;
130 }
131 }
132 else
133 {
134 buffer_.a_num_vectors_ = K;
135 if (transA_option == 1)
136 {
137 buffer_.transA_ = clblasTrans;
138 }
139 else if (transA_option == 2)
140 {
141 buffer_.transA_ = clblasConjTrans;
142 }
143 if (lda == 0)
144 {
145 buffer_.lda_ = N;
146 }
147 else if (lda < N)
148 {
149 std::cerr << "lda:wrong size\n";
150 exit(1);
151 }
152 else
153 {
154 buffer_.lda_ = lda;
155 }
156 }
157 }
158 else
159 {
160 order_ = clblasColumnMajor;
161 if (transA_option == 0)
162 {
163 buffer_.a_num_vectors_ = K;
164 buffer_.transA_ = clblasNoTrans;
165 if (lda == 0)
166 {
167 buffer_.lda_ = N;
168 }
169 else if (lda < N)
170 {
171 std::cerr << "lda:wrong size\n";
172 exit(1);
173 }
174 else
175 {
176 buffer_.lda_ = lda;
177 }
178 }
179 else
180 {
181 buffer_.a_num_vectors_ = N;
182 if (transA_option == 1)
183 {
184 buffer_.transA_ = clblasTrans;
185 }
186 else if (transA_option == 2)
187 {
188 buffer_.transA_ = clblasConjTrans;
189 }
190
191 if (lda == 0)
192 {
193 buffer_.lda_ = K;
194 }
195 else if (lda < K)
196 {
197 std::cerr << "lda:wrong size\n";
198 exit(1);
199 }
200 else
201 {
202 buffer_.lda_ = lda;
203 }
204 }
205 }
206
207 buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
208 buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
209
210 cl_int err;
211 buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
212 (buffer_.lda_ * buffer_.a_num_vectors_ +
213 buffer_.offa_) * sizeof(T),
214 NULL, &err);
215
216 buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
217 (buffer_.ldc_ * buffer_.c_num_vectors_ +
218 buffer_.offc_) * sizeof(T),
219 NULL, &err);
220 }
initialize_cpu_buffer()221 void initialize_cpu_buffer()
222 {
223 srand(10);
224 for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
225 {
226 for (size_t j = 0; j < buffer_.lda_; ++j)
227 {
228 buffer_.cpuA_[i*buffer_.lda_+j] = random<T>(UPPER_BOUND<T>()) /
229 randomScale<T>();
230 }
231 }
232 for (size_t i = 0; i < buffer_.N_; ++i)
233 {
234 for (size_t j = 0; j < buffer_.ldc_; ++j)
235 {
236 buffer_.cpuC_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) /
237 randomScale<T>();
238 }
239 }
240 }
initialize_gpu_buffer()241 void initialize_gpu_buffer()
242 {
243 cl_int err;
244
245 err = clEnqueueWriteBuffer(queues_[0], buffer_.A_, CL_TRUE,
246 buffer_.offa_ * sizeof(T),
247 buffer_.lda_ * buffer_.a_num_vectors_ *
248 sizeof(T),
249 buffer_.cpuA_, 0, NULL, NULL);
250
251 err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE,
252 buffer_.offa_ * sizeof(T),
253 buffer_.ldc_ * buffer_.c_num_vectors_ *
254 sizeof(T),
255 buffer_.cpuC_, 0, NULL, NULL);
256 }
reset_gpu_write_buffer()257 void reset_gpu_write_buffer()
258 {
259 cl_int err;
260
261 err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE,
262 buffer_.offc_ * sizeof(T),
263 buffer_.ldc_ * buffer_.c_num_vectors_ *
264 sizeof(T),
265 buffer_.cpuC_, 0, NULL, NULL);
266 }
267 void call_func();
read_gpu_buffer()268 void read_gpu_buffer()
269 {
270 cl_int err;
271 err = clEnqueueReadBuffer(queues_[0], buffer_.C_, CL_TRUE,
272 buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
273 buffer_.cpuC_, 0, NULL, NULL);
274 }
275 void roundtrip_func();
zerocopy_roundtrip_func()276 void zerocopy_roundtrip_func()
277 {
278 std::cout << "xTrmm::zerocopy_roundtrip_func\n";
279 }
roundtrip_setup_buffer(int order_option,int side_option,int uplo_option,int diag_option,int transA_option,int transB_option,size_t M,size_t N,size_t K,size_t lda,size_t ldb,size_t ldc,size_t offA,size_t offBX,size_t offCY,double alpha,double beta)280 void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
281 int diag_option, int transA_option, int transB_option,
282 size_t M, size_t N, size_t K, size_t lda, size_t ldb,
283 size_t ldc, size_t offA, size_t offBX, size_t offCY,
284 double alpha, double beta)
285 {
286 DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
287 DUMMY_ARGS_USAGE_2(ldb, offBX);
288
289 initialize_scalars(alpha,beta);
290
291 buffer_.N_ = N;
292 buffer_.K_ = K;
293 buffer_.offa_ = offA;
294 buffer_.offc_ = offCY;
295
296 if (uplo_option == 0)
297 {
298 buffer_.uplo_ = clblasUpper;
299 }
300 else
301 {
302 buffer_.uplo_ = clblasLower;
303 }
304
305 if (ldc == 0)
306 {
307 buffer_.ldc_ = N;
308 }
309 else if (ldc < N)
310 {
311 std::cerr << "ldc:wrong size\n";
312 }
313 else
314 {
315 buffer_.ldc_ = ldc;
316 }
317
318 buffer_.c_num_vectors_ = N;
319
320 if (order_option == 0)
321 {
322 order_ = clblasRowMajor;
323 if (transA_option == 0)
324 {
325 buffer_.transA_ = clblasNoTrans;
326 buffer_.a_num_vectors_ = N;
327 if (lda == 0)
328 {
329 buffer_.lda_ = K;
330 }
331 else if (lda < K)
332 {
333 std::cerr << "lda:wrong size\n";
334 exit(1);
335 }
336 else
337 {
338 buffer_.lda_ = lda;
339 }
340 }
341 else
342 {
343 buffer_.a_num_vectors_ = K;
344 if (transA_option == 1)
345 {
346 buffer_.transA_ = clblasTrans;
347 }
348 else if (transA_option == 2)
349 {
350 buffer_.transA_ = clblasConjTrans;
351 }
352 if (lda == 0)
353 {
354 buffer_.lda_ = N;
355 }
356 else if (lda < N)
357 {
358 std::cerr << "lda:wrong size\n";
359 exit(1);
360 }
361 else
362 {
363 buffer_.lda_ = lda;
364 }
365 }
366 }
367 else
368 {
369 order_ = clblasColumnMajor;
370 if (transA_option == 0)
371 {
372 buffer_.a_num_vectors_ = K;
373 buffer_.transA_ = clblasNoTrans;
374 if (lda == 0)
375 {
376 buffer_.lda_ = N;
377 }
378 else if (lda < N)
379 {
380 std::cerr << "lda:wrong size\n";
381 exit(1);
382 }
383 else
384 {
385 buffer_.lda_ = lda;
386 }
387 }
388 else
389 {
390 buffer_.a_num_vectors_ = N;
391 if (transA_option == 1)
392 {
393 buffer_.transA_ = clblasTrans;
394 }
395 else if (transA_option == 2)
396 {
397 buffer_.transA_ = clblasConjTrans;
398 }
399
400 if (lda == 0)
401 {
402 buffer_.lda_ = K;
403 }
404 else if (lda < K)
405 {
406 std::cerr << "lda:wrong size\n";
407 exit(1);
408 }
409 else
410 {
411 buffer_.lda_ = lda;
412 }
413 }
414 }
415
416 buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
417 buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
418 }
releaseGPUBuffer_deleteCPUBuffer()419 void releaseGPUBuffer_deleteCPUBuffer()
420 {
421 //this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
422 //need to do this before we eventually hit the destructor
423 delete buffer_.cpuA_;
424 delete buffer_.cpuC_;
425 OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A");
426 OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C");
427 }
428 protected:
429 protected:
initialize_scalars(double alpha,double beta)430 void initialize_scalars(double alpha, double beta)
431 {
432 buffer_.alpha_ = makeScalar<T>(alpha);
433 buffer_.beta_ = makeScalar<T>(beta);
434 }
435
436 private:
437 xHerkBuffer<T> buffer_;
438 };
439
// cl_float2 specialisation of the timed HERK call.
// Everything between timer.Start and timer.Stop is measured, so the
// statement order here is significant.
template<>
void
xHerk<cl_float2>::call_func()
{
    timer.Start(timer_id);

    // Only the first component (s[0]) of alpha_ and beta_ is handed to
    // clblasCherk. Completion is signalled through event_.
    clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
                buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
                buffer_.A_, buffer_.offa_, buffer_.lda_,
                buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
                buffer_.ldc_, numQueues, queues_, 0, NULL, &event_);

    // Block until the enqueued work has finished before stopping the timer.
    clWaitForEvents(1, &event_);
    timer.Stop(timer_id);
}
455
456 template<>
457 void
roundtrip_func()458 xHerk<cl_float2>::roundtrip_func()
459 {
460 timer.Start(timer_id);
461 cl_int err;
462 buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
463 (buffer_.lda_ * buffer_.a_num_vectors_ +
464 buffer_.offa_) * sizeof(cl_float2),
465 NULL, &err);
466
467 buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
468 (buffer_.ldc_ * buffer_.c_num_vectors_ +
469 buffer_.offc_) * sizeof(cl_float2),
470 NULL, &err);
471 this->initialize_gpu_buffer();
472
473 clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
474 buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
475 buffer_.A_, buffer_.offa_, buffer_.lda_,
476 buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
477 buffer_.ldc_, numQueues, queues_, 0, NULL, NULL);
478
479 err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE,
480 buffer_.offc_ * sizeof(cl_float2),
481 buffer_.ldc_ * buffer_.c_num_vectors_ *
482 sizeof(cl_float2),
483 buffer_.cpuC_, 0, NULL, &event_);
484 clWaitForEvents(1, &event_);
485 timer.Stop(timer_id);
486 }
487
// cl_double2 specialisation of the timed HERK call.
// Everything between timer.Start and timer.Stop is measured, so the
// statement order here is significant.
template<>
void
xHerk<cl_double2>::call_func()
{
    timer.Start(timer_id);

    // Only the first component (s[0]) of alpha_ and beta_ is handed to
    // clblasZherk. Completion is signalled through event_.
    clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
                buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
                buffer_.A_, buffer_.offa_, buffer_.lda_,
                buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
                buffer_.ldc_, numQueues, queues_, 0, NULL, &event_);

    // Block until the enqueued work has finished before stopping the timer.
    clWaitForEvents(1, &event_);
    timer.Stop(timer_id);
}
503
504 template<>
505 void
roundtrip_func()506 xHerk<cl_double2>::roundtrip_func()
507 {
508 timer.Start(timer_id);
509 cl_int err;
510 buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
511 (buffer_.lda_ * buffer_.a_num_vectors_ +
512 buffer_.offa_) * sizeof(cl_double2),
513 NULL, &err);
514
515 buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
516 (buffer_.ldc_ * buffer_.c_num_vectors_ +
517 buffer_.offc_) * sizeof(cl_double2),
518 NULL, &err);
519 this->initialize_gpu_buffer();
520
521 clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
522 buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
523 buffer_.A_, buffer_.offa_, buffer_.lda_,
524 buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
525 buffer_.ldc_, numQueues, queues_, 0, NULL, NULL);
526
527 err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE,
528 buffer_.offc_ * sizeof(cl_double2),
529 buffer_.ldc_ * buffer_.c_num_vectors_ *
530 sizeof(cl_double2),
531 buffer_.cpuC_, 0, NULL, &event_);
532 clWaitForEvents(1, &event_);
533 timer.Stop(timer_id);
534 }
#endif // ifndef CLBLAS_BENCHMARK_XHERK_HXX__