1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 // $Id
19 
20 #ifndef CLBLAS_BENCHMARK_XHERK_HXX__
21 #define CLBLAS_BENCHMARK_XHERK_HXX__
22 
23 #include "clfunc_common.hpp"
24 
// Plain aggregate holding every argument and buffer the xHerk benchmark needs
// for one HERK problem. T is the complex element type of the matrices and of
// the stored alpha/beta scalars.
template <typename T>
struct xHerkBuffer
{
    // Storage order slot. NOTE(review): the xHerk code visible in this header
    // writes and reads the unqualified order_ instead of this field - confirm
    // whether this member is used anywhere.
    clblasOrder order_;
    // Which triangle of C is referenced (clblasUpper or clblasLower).
    clblasUplo uplo_;
    // Transposition mode applied to A.
    clblasTranspose transA_;
    // Problem dimensions handed to the HERK call.
    size_t N_;
    size_t K_;
    // Scaling factor for the A term; only component s[0] is passed to clBLAS.
    T alpha_;
    // Device buffer for A.
	cl_mem A_;
    // Offset of A inside A_, counted in elements of T.
    size_t offa_;
    // Leading dimension of A, in elements.
    size_t lda_;
    // Scaling factor for C; only component s[0] is passed to clBLAS.
    T beta_;
    // Device buffer for C.
    cl_mem C_;
    // Offset of C inside C_, counted in elements of T.
    size_t offc_;
    // Leading dimension of C, in elements.
    size_t ldc_;
	// Number of lda_-long vectors stored for A (lda_ * a_num_vectors_ elements).
	size_t a_num_vectors_;
    // Number of ldc_-long vectors stored for C (ldc_ * c_num_vectors_ elements).
    size_t c_num_vectors_;
	// Host copy of A, allocated with new T[].
	T* cpuA_;
	// Host copy of C, allocated with new T[].
	T* cpuC_;
}; // struct xHerkBuffer
46 
47 template <typename T>
48 class xHerk : public clblasFunc
49 {
50 public:
xHerk(StatisticalTimer & timer,cl_device_type devType)51   xHerk(StatisticalTimer& timer, cl_device_type devType) : clblasFunc(timer,  devType)
52   {
53     timer.getUniqueID("clHerk", 0);
54   }
55 
~xHerk()56   ~xHerk()
57   {
58   }
59 
gflops()60   double gflops()
61   {
62     return static_cast<double>(4*(buffer_.K_ * buffer_.N_ * (buffer_.N_+1))/time_in_ns());
63   }
64 
gflops_formula()65   std::string gflops_formula()
66   {
67     return "4*K*N*(N+1)/time";
68   }
69 
setup_buffer(int order_option,int side_option,int uplo_option,int diag_option,int transA_option,int transB_option,size_t M,size_t N,size_t K,size_t lda,size_t ldb,size_t ldc,size_t offA,size_t offB,size_t offC,double alpha,double beta)70   void setup_buffer(int order_option, int side_option, int
71                     uplo_option, int diag_option, int transA_option, int
72                     transB_option, size_t M, size_t N, size_t K,
73                     size_t lda, size_t ldb, size_t ldc,size_t offA,
74 					          size_t offB, size_t offC, double alpha,
75                     double beta)
76   {
77         DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
78         DUMMY_ARGS_USAGE_2(ldb, offB);
79 
80 		initialize_scalars(alpha,beta);
81 
82 		buffer_.N_ = N;
83 		buffer_.K_ = K;
84 		buffer_.offa_ = offA;
85 		buffer_.offc_ = offC;
86 
87 		if (uplo_option == 0)
88         {
89             buffer_.uplo_ = clblasUpper;
90         }
91         else
92         {
93             buffer_.uplo_ = clblasLower;
94         }
95 
96 		if (ldc == 0)
97         {
98             buffer_.ldc_ = N;
99         }
100         else if (ldc < N)
101         {
102             std::cerr << "ldc:wrong size\n";
103         }
104         else
105         {
106             buffer_.ldc_ = ldc;
107         }
108 
109 		buffer_.c_num_vectors_ = N;
110 
111 		if (order_option == 0)
112         {
113             order_ = clblasRowMajor;
114             if (transA_option == 0)
115             {
116                 buffer_.transA_ = clblasNoTrans;
117                 buffer_.a_num_vectors_ = N;
118                 if (lda == 0)
119                 {
120                     buffer_.lda_ = K;
121                 }
122                 else if (lda < K)
123                 {
124                     std::cerr << "lda:wrong size\n";
125                     exit(1);
126                 }
127                 else
128                 {
129                     buffer_.lda_ = lda;
130                 }
131             }
132             else
133             {
134                 buffer_.a_num_vectors_ = K;
135                 if (transA_option == 1)
136                 {
137                     buffer_.transA_ = clblasTrans;
138                 }
139                 else if (transA_option == 2)
140                 {
141                     buffer_.transA_ = clblasConjTrans;
142                 }
143                 if (lda == 0)
144                 {
145                     buffer_.lda_ = N;
146                 }
147                 else if (lda < N)
148                 {
149                     std::cerr << "lda:wrong size\n";
150                     exit(1);
151                 }
152                 else
153                 {
154                     buffer_.lda_ = lda;
155                 }
156             }
157         }
158         else
159         {
160             order_ = clblasColumnMajor;
161             if (transA_option == 0)
162             {
163                 buffer_.a_num_vectors_ = K;
164                 buffer_.transA_ = clblasNoTrans;
165                 if (lda == 0)
166                 {
167                     buffer_.lda_ = N;
168                 }
169                 else if (lda < N)
170                 {
171                     std::cerr << "lda:wrong size\n";
172                     exit(1);
173                 }
174                 else
175                 {
176                     buffer_.lda_ = lda;
177                 }
178             }
179             else
180             {
181                 buffer_.a_num_vectors_ = N;
182                 if (transA_option == 1)
183                 {
184                     buffer_.transA_ = clblasTrans;
185                 }
186                 else if (transA_option == 2)
187                 {
188                     buffer_.transA_ = clblasConjTrans;
189                 }
190 
191                 if (lda == 0)
192                 {
193                     buffer_.lda_ = K;
194                 }
195                 else if (lda < K)
196                 {
197                     std::cerr << "lda:wrong size\n";
198                     exit(1);
199                 }
200                 else
201                 {
202                     buffer_.lda_ = lda;
203                 }
204             }
205         }
206 
207         buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
208         buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
209 
210         cl_int err;
211         buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
212                                         (buffer_.lda_ * buffer_.a_num_vectors_ +
213                                             buffer_.offa_) * sizeof(T),
214                                         NULL, &err);
215 
216         buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
217                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
218                                             buffer_.offc_) * sizeof(T),
219                                         NULL, &err);
220   }
initialize_cpu_buffer()221   void initialize_cpu_buffer()
222   {
223 	  srand(10);
224 	  for (size_t i = 0; i < buffer_.a_num_vectors_; ++i)
225 	  {
226 		  for (size_t j = 0; j < buffer_.lda_; ++j)
227 		  {
228                 buffer_.cpuA_[i*buffer_.lda_+j] = random<T>(UPPER_BOUND<T>()) /
229                                                randomScale<T>();
230 		  }
231 	  }
232 	  for (size_t i = 0; i < buffer_.N_; ++i)
233 	  {
234 		  for (size_t j = 0; j < buffer_.ldc_; ++j)
235 		  {
236                 buffer_.cpuC_[i*buffer_.ldc_+j] = random<T>(UPPER_BOUND<T>()) /
237                                                randomScale<T>();
238 		  }
239 	  }
240   }
initialize_gpu_buffer()241   void initialize_gpu_buffer()
242   {
243 	    cl_int err;
244 
245         err = clEnqueueWriteBuffer(queues_[0], buffer_.A_, CL_TRUE,
246                                    buffer_.offa_ * sizeof(T),
247                                    buffer_.lda_ * buffer_.a_num_vectors_ *
248                                        sizeof(T),
249                                    buffer_.cpuA_, 0, NULL, NULL);
250 
251         err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE,
252                                    buffer_.offa_ * sizeof(T),
253                                    buffer_.ldc_ * buffer_.c_num_vectors_ *
254                                        sizeof(T),
255                                    buffer_.cpuC_, 0, NULL, NULL);
256   }
reset_gpu_write_buffer()257   void reset_gpu_write_buffer()
258   {
259 	    cl_int err;
260 
261         err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE,
262                                    buffer_.offc_ * sizeof(T),
263                                    buffer_.ldc_ * buffer_.c_num_vectors_ *
264                                        sizeof(T),
265                                    buffer_.cpuC_, 0, NULL, NULL);
266   }
267   void call_func();
read_gpu_buffer()268   void read_gpu_buffer()
269 	{
270 		cl_int err;
271 		err = clEnqueueReadBuffer(queues_[0], buffer_.C_, CL_TRUE,
272 								  buffer_.offc_*sizeof(T), buffer_.ldc_*buffer_.c_num_vectors_*sizeof(T),
273 								  buffer_.cpuC_, 0, NULL, NULL);
274 	}
275 	void roundtrip_func();
zerocopy_roundtrip_func()276 	void zerocopy_roundtrip_func()
277 	{
278 		std::cout << "xTrmm::zerocopy_roundtrip_func\n";
279 	}
roundtrip_setup_buffer(int order_option,int side_option,int uplo_option,int diag_option,int transA_option,int transB_option,size_t M,size_t N,size_t K,size_t lda,size_t ldb,size_t ldc,size_t offA,size_t offBX,size_t offCY,double alpha,double beta)280 	void roundtrip_setup_buffer(int order_option, int side_option, int uplo_option,
281                       int diag_option, int transA_option, int  transB_option,
282                       size_t M, size_t N, size_t K, size_t lda, size_t ldb,
283                       size_t ldc, size_t offA, size_t offBX, size_t offCY,
284                       double alpha, double beta)
285 	{
286         DUMMY_ARGS_USAGE_4(side_option, diag_option, transB_option, M);
287         DUMMY_ARGS_USAGE_2(ldb, offBX);
288 
289 		initialize_scalars(alpha,beta);
290 
291 		buffer_.N_ = N;
292 		buffer_.K_ = K;
293 		buffer_.offa_ = offA;
294 		buffer_.offc_ = offCY;
295 
296 		if (uplo_option == 0)
297         {
298             buffer_.uplo_ = clblasUpper;
299         }
300         else
301         {
302             buffer_.uplo_ = clblasLower;
303         }
304 
305 		if (ldc == 0)
306         {
307             buffer_.ldc_ = N;
308         }
309         else if (ldc < N)
310         {
311             std::cerr << "ldc:wrong size\n";
312         }
313         else
314         {
315             buffer_.ldc_ = ldc;
316         }
317 
318 		buffer_.c_num_vectors_ = N;
319 
320 		if (order_option == 0)
321         {
322             order_ = clblasRowMajor;
323             if (transA_option == 0)
324             {
325                 buffer_.transA_ = clblasNoTrans;
326                 buffer_.a_num_vectors_ = N;
327                 if (lda == 0)
328                 {
329                     buffer_.lda_ = K;
330                 }
331                 else if (lda < K)
332                 {
333                     std::cerr << "lda:wrong size\n";
334                     exit(1);
335                 }
336                 else
337                 {
338                     buffer_.lda_ = lda;
339                 }
340             }
341             else
342             {
343                 buffer_.a_num_vectors_ = K;
344                 if (transA_option == 1)
345                 {
346                     buffer_.transA_ = clblasTrans;
347                 }
348                 else if (transA_option == 2)
349                 {
350                     buffer_.transA_ = clblasConjTrans;
351                 }
352                 if (lda == 0)
353                 {
354                     buffer_.lda_ = N;
355                 }
356                 else if (lda < N)
357                 {
358                     std::cerr << "lda:wrong size\n";
359                     exit(1);
360                 }
361                 else
362                 {
363                     buffer_.lda_ = lda;
364                 }
365             }
366         }
367         else
368         {
369             order_ = clblasColumnMajor;
370             if (transA_option == 0)
371             {
372                 buffer_.a_num_vectors_ = K;
373                 buffer_.transA_ = clblasNoTrans;
374                 if (lda == 0)
375                 {
376                     buffer_.lda_ = N;
377                 }
378                 else if (lda < N)
379                 {
380                     std::cerr << "lda:wrong size\n";
381                     exit(1);
382                 }
383                 else
384                 {
385                     buffer_.lda_ = lda;
386                 }
387             }
388             else
389             {
390                 buffer_.a_num_vectors_ = N;
391                 if (transA_option == 1)
392                 {
393                     buffer_.transA_ = clblasTrans;
394                 }
395                 else if (transA_option == 2)
396                 {
397                     buffer_.transA_ = clblasConjTrans;
398                 }
399 
400                 if (lda == 0)
401                 {
402                     buffer_.lda_ = K;
403                 }
404                 else if (lda < K)
405                 {
406                     std::cerr << "lda:wrong size\n";
407                     exit(1);
408                 }
409                 else
410                 {
411                     buffer_.lda_ = lda;
412                 }
413             }
414         }
415 
416         buffer_.cpuA_ = new T[buffer_.lda_*buffer_.a_num_vectors_];
417         buffer_.cpuC_ = new T[buffer_.ldc_*buffer_.c_num_vectors_];
418 	}
releaseGPUBuffer_deleteCPUBuffer()419 	void releaseGPUBuffer_deleteCPUBuffer()
420 	{
421 		//this is necessary since we are running a iteration of tests and calculate the average time. (in client.cpp)
422 		//need to do this before we eventually hit the destructor
423 		delete buffer_.cpuA_;
424 		delete buffer_.cpuC_;
425 		OPENCL_V_THROW( clReleaseMemObject(buffer_.A_), "releasing buffer A");
426 		OPENCL_V_THROW( clReleaseMemObject(buffer_.C_), "releasing buffer C");
427 	}
428 protected:
429 protected:
initialize_scalars(double alpha,double beta)430   void initialize_scalars(double alpha, double beta)
431   {
432       buffer_.alpha_ = makeScalar<T>(alpha);
433       buffer_.beta_ = makeScalar<T>(beta);
434   }
435 
436 private:
437   xHerkBuffer<T> buffer_;
438 };
439 
// Single-precision complex specialization: times one clblasCherk call.
// The timer is started, the HERK operation is enqueued on queues_ with the
// stored problem description, the code blocks on event_ and then stops the
// timer. Only component s[0] of the stored alpha/beta scalars is passed.
// NOTE(review): the status returned by clblasCherk and by clWaitForEvents is
// not inspected here - confirm whether a failed call should abort the run.
template<>
void
xHerk<cl_float2>::call_func()
{
	timer.Start(timer_id);

	clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
				buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
				buffer_.A_, buffer_.offa_, buffer_.lda_,
				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
				buffer_.ldc_, numQueues, queues_, 0, NULL, &event_);

    // Block until the enqueued operation has completed so that it is included
    // in the measured interval.
    clWaitForEvents(1, &event_);
    timer.Stop(timer_id);
}
455 
456 template<>
457 void
roundtrip_func()458 xHerk<cl_float2>::roundtrip_func()
459 {
460 		timer.Start(timer_id);
461         cl_int err;
462         buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
463                                         (buffer_.lda_ * buffer_.a_num_vectors_ +
464                                             buffer_.offa_) * sizeof(cl_float2),
465                                         NULL, &err);
466 
467         buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
468                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
469                                             buffer_.offc_) * sizeof(cl_float2),
470                                         NULL, &err);
471 		this->initialize_gpu_buffer();
472 
473 		clblasCherk(order_, buffer_.uplo_, buffer_.transA_,
474 				buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
475 				buffer_.A_, buffer_.offa_, buffer_.lda_,
476 				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
477 				buffer_.ldc_, numQueues, queues_, 0, NULL, NULL);
478 
479 		err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE,
480                                    buffer_.offc_ * sizeof(cl_float2),
481                                    buffer_.ldc_ * buffer_.c_num_vectors_ *
482                                        sizeof(cl_float2),
483                                    buffer_.cpuC_, 0, NULL, &event_);
484 		clWaitForEvents(1, &event_);
485 		timer.Stop(timer_id);
486 }
487 
// Double-precision complex specialization: times one clblasZherk call.
// The timer is started, the HERK operation is enqueued on queues_ with the
// stored problem description, the code blocks on event_ and then stops the
// timer. Only component s[0] of the stored alpha/beta scalars is passed.
// NOTE(review): the status returned by clblasZherk and by clWaitForEvents is
// not inspected here - confirm whether a failed call should abort the run.
template<>
void
xHerk<cl_double2>::call_func()
{
	timer.Start(timer_id);

	clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
				buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
				buffer_.A_, buffer_.offa_, buffer_.lda_,
				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
				buffer_.ldc_, numQueues, queues_, 0, NULL, &event_);

    // Block until the enqueued operation has completed so that it is included
    // in the measured interval.
    clWaitForEvents(1, &event_);
    timer.Stop(timer_id);
}
503 
504 template<>
505 void
roundtrip_func()506 xHerk<cl_double2>::roundtrip_func()
507 {
508 		timer.Start(timer_id);
509         cl_int err;
510         buffer_.A_ = clCreateBuffer(ctx_, CL_MEM_READ_ONLY,
511                                         (buffer_.lda_ * buffer_.a_num_vectors_ +
512                                             buffer_.offa_) * sizeof(cl_double2),
513                                         NULL, &err);
514 
515         buffer_.C_ = clCreateBuffer(ctx_, CL_MEM_READ_WRITE,
516                                         (buffer_.ldc_ * buffer_.c_num_vectors_ +
517                                             buffer_.offc_) * sizeof(cl_double2),
518                                         NULL, &err);
519 		this->initialize_gpu_buffer();
520 
521 		clblasZherk(order_, buffer_.uplo_, buffer_.transA_,
522 				buffer_.N_, buffer_.K_, buffer_.alpha_.s[0],
523 				buffer_.A_, buffer_.offa_, buffer_.lda_,
524 				buffer_.beta_.s[0], buffer_.C_, buffer_.offc_,
525 				buffer_.ldc_, numQueues, queues_, 0, NULL, NULL);
526 
527 		err = clEnqueueWriteBuffer(queues_[0], buffer_.C_, CL_TRUE,
528                                    buffer_.offc_ * sizeof(cl_double2),
529                                    buffer_.ldc_ * buffer_.c_num_vectors_ *
530                                        sizeof(cl_double2),
531                                    buffer_.cpuC_, 0, NULL, &event_);
532 		clWaitForEvents(1, &event_);
533 		timer.Stop(timer_id);
534 }
#endif // ifndef CLBLAS_BENCHMARK_XHERK_HXX__