1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 /*
19  * GER performance test cases
20  */
21 
#include <stdlib.h>             // srand()
#include <string.h>             // memcpy()
#include <new>                  // std::nothrow
#include <gtest/gtest.h>
#include <clBLAS.h>
26 
27 #include <common.h>
28 #include <clBLAS-wrapper.h>
29 #include <BlasBase.h>
30 #include <ger.h>
31 #include <blas-random.h>
32 
33 #ifdef PERF_TEST_WITH_ACML
34 #include <blas-internal.h>
35 #include <blas-wrapper.h>
36 #endif
37 
38 #include "PerformanceTest.h"
39 
40 /*
41  * NOTE: operation factor means overall number
42  *       of multiply and add per each operation involving
43  *       2 matrix elements
44  */
45 
46 using namespace std;
47 using namespace clMath;
48 
/*
 * Check the status code produced by a performance run.
 * A negative value is reported as a fatal failure (ASSERT), any other
 * non-zero value as a non-fatal one (EXPECT).
 */
#define CHECK_RESULT(ret)                                                   \
    do {                                                                    \
        ASSERT_GE(ret, 0)                                                   \
            << "Fatal error: can not allocate resources or "                \
               "perform an OpenCL request!" << endl;                        \
        EXPECT_EQ(0, ret)                                                   \
            << "The OpenCL version is slower in the case" << endl;          \
    } while (0)
56 
57 namespace clMath {
58 
59 template <typename ElemType> class GerPerformanceTest : public PerformanceTest
60 {
61 public:
62     virtual ~GerPerformanceTest();
63 
64     virtual int prepare(void);
65     virtual nano_time_t etalonPerfSingle(void);
66     virtual nano_time_t clblasPerfSingle(void);
67 
runInstance(BlasFunction fn,TestParams * params)68     static void runInstance(BlasFunction fn, TestParams *params)
69     {
70         GerPerformanceTest<ElemType> perfCase(fn, params);
71         int ret = 0;
72         int opFactor;
73         BlasBase *base;
74 
75         base = clMath::BlasBase::getInstance();
76 
77         opFactor =1;
78 
79         if ((fn == FN_DGER || fn == FN_ZGERU) &&
80             !base->isDevSupportDoublePrecision()) {
81 
82             std::cerr << ">> WARNING: The target device doesn't support native "
83                          "double precision floating point arithmetic" <<
84                          std::endl << ">> Test skipped" << std::endl;
85             return;
86         }
87 
88         if (!perfCase.areResourcesSufficient(params)) {
89             std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
90                         std::endl;
91 			return;
92         }
93         else {
94             ret = perfCase.run(opFactor);
95         }
96 
97         ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
98                              "perform an OpenCL request!" << endl;
99         EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
100     }
101 
102 private:
103     GerPerformanceTest(BlasFunction fn, TestParams *params);
104 
105     bool areResourcesSufficient(TestParams *params);
106 
107     TestParams params_;
108     ElemType alpha_;
109     ElemType *A_;
110     ElemType *backA_;
111     ElemType *x_;
112     ElemType *y_;
113     cl_mem mobjA_;
114     cl_mem mobjx_;
115     size_t  lengthA;
116     cl_mem mobjy_;
117     ::clMath::BlasBase *base_;
118 };
119 
120 template <typename ElemType>
GerPerformanceTest(BlasFunction fn,TestParams * params)121 GerPerformanceTest<ElemType>::GerPerformanceTest(
122     BlasFunction fn,
123     TestParams *params) : PerformanceTest(fn,(problem_size_t) ( ( (3 * params->M * params->N) + params->M )  * sizeof(ElemType) ) ), params_(*params), mobjA_(NULL), mobjx_(NULL), mobjy_(NULL)
124 {
125 
126 	if( params_.order == clblasColumnMajor )
127 			lengthA = params_.N * params_.lda;
128 		else
129                         lengthA = params_.M * params_.lda;
130 
131     A_ = new ElemType[lengthA + params_.offa];
132     backA_ = new ElemType[lengthA + params_.offa];
133     x_ = new ElemType[(1 + (params->M - 1) * abs(params_.incx))+ params_.offBX];
134     y_ = new ElemType[(1 + (params->N - 1) * abs(params_.incy)) + params_.offCY] ;
135 
136     base_ = ::clMath::BlasBase::getInstance();
137 }
138 
139 template <typename ElemType>
~GerPerformanceTest()140 GerPerformanceTest<ElemType>::~GerPerformanceTest()
141 {
142     if(A_ != NULL)
143     {
144     delete[] A_;
145     }
146 	if(x_ != NULL)
147 	{
148     delete[] x_;
149 	}
150 	if(y_ != NULL)
151 	{
152     delete[] y_;
153 	}
154 	if(backA_ != NULL)
155 	{
156     delete[] backA_;
157 	}
158 
159 	if( mobjy_ != NULL )
160 	    clReleaseMemObject(mobjy_);
161     if( mobjx_ != NULL )
162 		clReleaseMemObject(mobjx_);
163 	if( mobjA_ != NULL )
164 	    clReleaseMemObject(mobjA_);
165 }
166 
167 /*
168  * Check if available OpenCL resources are sufficient to
169  * run the test case
170  */
171 template <typename ElemType> bool
areResourcesSufficient(TestParams * params)172 GerPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
173 {
174     clMath::BlasBase *base;
175     size_t gmemSize, allocSize;
176     bool ret;
177     size_t m = params->M, n = params->N;
178 
179 	if((A_ == NULL) || (backA_ == NULL) || (x_ == NULL) || (y_ == NULL))
180 	{
181 		return 0;
182 	}
183 
184     base = clMath::BlasBase::getInstance();
185     gmemSize = (size_t)base->availGlobalMemSize( 0 );
186     allocSize = (size_t)base->maxMemAllocSize();
187 
188     ret = std::max(m, n) * params_.lda * sizeof(ElemType) < allocSize;
189     ret = ret && ( ((1 + (params_.M-1)*abs(params_.incx)))* sizeof(ElemType) < allocSize);
190     ret = ret && ( ((1 + (params_.N-1)*abs(params_.incy))) * sizeof(ElemType) < allocSize);
191 
192     ret = ret && (((std::max(m, n) * params_.lda) + ((1 + (params_.M-1)*abs(params_.incx))) +  ((1 + (params_.N-1)*abs(params_.incy)))) < gmemSize);
193 
194     return ret;
195 }
196 
197 template <typename ElemType> int
prepare(void)198 GerPerformanceTest<ElemType>::prepare(void)
199 {
200     bool useAlpha = base_->useAlpha();
201 
202     if (useAlpha) {
203         alpha_ = convertMultiplier<ElemType>(params_.alpha);
204     }
205 
206 
207     int creationFlags = 0;
208     creationFlags =  creationFlags | RANDOM_INIT;
209 
210     creationFlags = ( (params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
211 	BlasRoutineID BlasFn = CLBLAS_GER;
212 
213 	populate( A_+ params_.offa, params_.M, params_.N, params_.lda, BlasFn, creationFlags);
214 	populate( x_, (1 + (params_.M-1) * abs(params_.incx) + params_.offBX), 1, (1 + (params_.M-1) * abs(params_.incx) + params_.offBX), BlasFn, creationFlags );
215 	populate( y_, (1 + (params_.N-1) * abs(params_.incy) + params_.offCY), 1, (1 + (params_.N-1) * abs(params_.incy) + params_.offCY), BlasFn, creationFlags );
216 
217 
218     memcpy(backA_, A_, (lengthA + params_.offa)* sizeof(ElemType));
219 
220 	mobjA_ = base_->createEnqueueBuffer(A_, (lengthA + params_.offa) * sizeof(*A_), 0, CL_MEM_READ_WRITE);
221 	mobjx_ = base_->createEnqueueBuffer(x_, ( (1 + (params_.M-1) * abs(params_.incx) + params_.offBX)) * sizeof(*x_), 0, CL_MEM_READ_WRITE);
222 	mobjy_ = base_->createEnqueueBuffer(y_,( (1 + (params_.N-1) * abs(params_.incy) + params_.offCY)) * sizeof(*y_) , 0, CL_MEM_READ_WRITE);
223 
224      return ( (mobjA_ != NULL) &&  (mobjx_ != NULL) && (mobjy_ != NULL) ) ? 0 : -1;
225 }
226 
227 template <typename ElemType> nano_time_t
etalonPerfSingle(void)228 GerPerformanceTest<ElemType>::etalonPerfSingle(void)
229 {
230     nano_time_t time = 0;
231     clblasOrder order;
232     size_t lda, fN, fM;
233 
234 
235 #ifndef PERF_TEST_WITH_ROW_MAJOR
236     if (params_.order == clblasRowMajor) {
237         cerr << "Row major order is not allowed" << endl;
238         return NANOTIME_ERR;
239     }
240 #endif
241 
242     order = params_.order;
243     lda = params_.lda;
244      fM = params_.M;
245     fN = params_.N;
246 
247 #ifdef PERF_TEST_WITH_ACML
248 
249     clblasOrder fOrder;
250     size_t fOffx, fOffy;
251     int fIncx, fIncy;
252     ElemType *fX, *fY;
253     fOrder = params_.order;
254     fM = params_.M;
255     fN = params_.N;
256     fIncx = params_.incx;
257     fIncy = params_.incy;
258     fX = x_;
259     fY = y_;
260     fOffx = params_.offBX;
261     fOffy = params_.offCY;
262 
263     if (fOrder != clblasColumnMajor) {
264            fOrder = clblasColumnMajor;
265            fM = params_.N;
266            fN = params_.M;
267            fX = y_;
268            fY = x_;
269            fIncx = params_.incy;
270            fIncy = params_.incx;
271            fOffx = params_.offCY;
272            fOffy = params_.offBX;
273 		}
274 		time = getCurrentTime();
275 		clMath::blas::ger(order, fM, fN, alpha_, fX, fOffx, fIncx, fY, fOffy, fIncy,  A_, params_.offa, lda);
276 		time = getCurrentTime() - time;
277 
278 #endif  // PERF_TEST_WITH_ACML
279 
280     return time;
281 }
282 
283 
284 template <typename ElemType> nano_time_t
clblasPerfSingle(void)285 GerPerformanceTest<ElemType>::clblasPerfSingle(void)
286 {
287     nano_time_t time;
288     cl_event event;
289     cl_int status;
290     cl_command_queue queue = base_->commandQueues()[0];
291 
292     status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
293                                   (lengthA + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event);
294     if (status != CL_SUCCESS) {
295         cerr << "Matrix A buffer object enqueuing error, status = " <<
296                  status << endl;
297 
298         return NANOTIME_ERR;
299     }
300 
301     status = clWaitForEvents(1, &event);
302     if (status != CL_SUCCESS) {
303         cout << "Wait on event failed, status = " <<
304                 status << endl;
305 
306         return NANOTIME_ERR;
307     }
308 
309     event = NULL;
310      time = getCurrentTime();
311 
312 #define TIMING
313 #ifdef TIMING
314         clFinish( queue);
315 
316         int iter = 20;
317         for ( int i = 1; i <= iter; i++)
318         {
319 #endif
320 
321     status = (cl_int)clMath::clblas::ger(params_.order, params_.M, params_.N, alpha_,
322          mobjx_, params_.offBX, params_.incx, mobjy_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1,
323         &queue, 0, NULL, &event);
324     if (status != CL_SUCCESS) {
325         cerr << "The CLBLAS GER function failed, status = " <<
326                 status << endl;
327 
328         return NANOTIME_ERR;
329     }
330 #ifdef TIMING
331         } // iter loop
332         clFinish( queue);
333     time = getCurrentTime() - time;
334         time /= iter;
335 #else
336 
337     status = flushAll(1, &queue);
338     if (status != CL_SUCCESS) {
339         cerr << "clFlush() failed, status = " << status << endl;
340         return NANOTIME_ERR;
341     }
342 
343     time = getCurrentTime();
344     status = waitForSuccessfulFinish(1, &queue, &event);
345     if (status == CL_SUCCESS) {
346         time = getCurrentTime() - time;
347     }
348     else {
349         cerr << "Waiting for completion of commands to the queue failed, "
350                 "status = " << status << endl;
351         time = NANOTIME_ERR;
352     }
353 #endif
354     return time;
355 }
356 
357 } // namespace clMath
358 
359 // ger performance test
360 
// Single precision real GER (FN_SGER)
TEST_P(GER, sger)
{
    TestParams testParams;
    getParams(&testParams);

    GerPerformanceTest<float>::runInstance(FN_SGER, &testParams);
}
368 
369 
// Double precision real GER (FN_DGER)
TEST_P(GER, dger)
{
    TestParams testParams;
    getParams(&testParams);

    GerPerformanceTest<double>::runInstance(FN_DGER, &testParams);
}
377 
// Single precision complex GERU (FN_CGERU)
TEST_P(GER, cgeru)
{
    TestParams testParams;
    getParams(&testParams);

    GerPerformanceTest<FloatComplex>::runInstance(FN_CGERU, &testParams);
}
385 
386 
// Double precision complex GERU (FN_ZGERU)
TEST_P(GER, zgeru)
{
    TestParams testParams;
    getParams(&testParams);

    GerPerformanceTest<DoubleComplex>::runInstance(FN_ZGERU, &testParams);
}
394