1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 /*
19  * Sbmv performance test cases
20  */
21 
22 #include <stdlib.h>             // srand()
23 #include <string.h>             // memcpy()
24 #include <gtest/gtest.h>
25 #include <clBLAS.h>
26 #include <common.h>
27 #include <clBLAS-wrapper.h>
28 #include <BlasBase.h>
29 #include <gbmv.h>
30 #include <sbmv.h>
31 #include <blas-random.h>
32 
33 #ifdef PERF_TEST_WITH_ACML
34 #include <blas-internal.h>
35 #include <blas-wrapper.h>
36 #endif
37 
38 #include "PerformanceTest.h"
39 
40 using namespace std;
41 using namespace clMath;
42 
/*
 * Check the return code of a performance run.
 *
 * A negative value aborts the current test (fatal assertion); any other
 * non-zero value is reported as a non-fatal failure. Must be expanded inside
 * a function returning void, because the fatal assertion returns from it.
 */
#define CHECK_RESULT(ret)                                                   \
do {                                                                        \
    ASSERT_GE(ret, 0)                                                       \
        << "Fatal error: can not allocate resources or "                    \
           "perform an OpenCL request!" << std::endl;                       \
    EXPECT_EQ(0, ret)                                                       \
        << "The OpenCL version is slower in the case" << std::endl;         \
} while (0)
50 
51 namespace clMath {
52 
53 template <typename ElemType> class SbmvPerformanceTest : public PerformanceTest
54 {
55 public:
56     virtual ~SbmvPerformanceTest();
57 
58     virtual int prepare(void);
59     virtual nano_time_t etalonPerfSingle(void);
60     virtual nano_time_t clblasPerfSingle(void);
61 
runInstance(BlasFunction fn,TestParams * params)62     static void runInstance(BlasFunction fn, TestParams *params)
63     {
64         SbmvPerformanceTest<ElemType> perfCase(fn, params);
65         int ret = 0;
66         int opFactor = 1;
67         BlasBase *base;
68 
69         base = clMath::BlasBase::getInstance();
70 
71         if ((fn == FN_DSBMV) &&
72             !base->isDevSupportDoublePrecision()) {
73 
74             std::cerr << ">> WARNING: The target device doesn't support native "
75                          "double precision floating point arithmetic" <<
76                          std::endl << ">> Test skipped" << std::endl;
77             return;
78         }
79 
80         if (!perfCase.areResourcesSufficient(params)) {
81             std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
82                         std::endl;
83         }
84         else {
85             ret = perfCase.run(opFactor);
86         }
87 
88         ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
89                              "perform an OpenCL request!" << endl;
90         EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
91     }
92 
93 private:
94     SbmvPerformanceTest(BlasFunction fn, TestParams *params);
95 
96     bool areResourcesSufficient(TestParams *params);
97 
98     TestParams params_;
99     ElemType alpha;
100     ElemType beta;
101     ElemType *A_;
102     ElemType *X_;
103     ElemType *Y_;
104     ElemType *backY_;
105     cl_mem mobjA_;
106     cl_mem mobjX_;
107     cl_mem mobjY_;
108     ::clMath::BlasBase *base_;
109 };
110 
111 template <typename ElemType>
SbmvPerformanceTest(BlasFunction fn,TestParams * params)112 SbmvPerformanceTest<ElemType>::SbmvPerformanceTest(
113     BlasFunction fn,
114     TestParams *params) : PerformanceTest(fn,
115     (problem_size_t)( ( (2 * (params->N) * (params->K + 1)   // A-access
116                           - (2 * params->K *  (params->K+1)) )       // Substract hole-part for A & X
117                         +( ((2*params->K + 1) * params->N + 2*params->N))   // X & Y access
118                                                                                                               ) * sizeof(ElemType) ) ),
119                           params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL)
120 {
121     size_t lenA, lenX, lenY;
122     lenA = params_.N  * (params_.lda) + params_.offA;
123     lenX = params_.N - 1* params_.incx + 1 + params_.offBX;
124     lenY = params_.N - 1* params_.incy + 1 + params_.offCY;
125     A_ = new ElemType[ lenA ];
126     X_ = new ElemType[ lenX ];
127     Y_ = new ElemType[ lenY ];
128     backY_ = new ElemType[ lenY ];
129     alpha = convertMultiplier<ElemType>(params_.alpha);
130 	beta  = convertMultiplier<ElemType>(params_.beta);
131 
132     base_ = ::clMath::BlasBase::getInstance();
133 
134 	mobjA_ = NULL;
135 	mobjX_ = NULL;
136 	mobjY_ = NULL;
137 }
138 
139 template <typename ElemType>
~SbmvPerformanceTest()140 SbmvPerformanceTest<ElemType>::~SbmvPerformanceTest()
141 {
142     if(A_ != NULL)
143     {
144         delete[] A_;
145     }
146 	if(X_ != NULL)
147 	{
148         delete[] X_;
149 	}
150 	if(backY_ != NULL)
151 	{
152 		delete[] backY_;
153 	}
154 	if(Y_ != NULL)
155 	{
156 	    delete[] Y_;
157 	}
158 
159     if ( mobjA_ != NULL )
160 		clReleaseMemObject(mobjA_);
161 	if ( mobjX_ != NULL )
162 	    clReleaseMemObject(mobjX_);
163 	if ( mobjY_ != NULL )
164 		clReleaseMemObject(mobjY_);
165 }
166 
167 /*
168  * Check if available OpenCL resources are sufficient to
169  * run the test case
170  */
171 template <typename ElemType> bool
areResourcesSufficient(TestParams * params)172 SbmvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
173 {
174     clMath::BlasBase *base;
175     size_t gmemSize, allocSize;
176     size_t n = params->N, lda = params->lda;
177     size_t lenA = (n * lda)  + params->offA* sizeof(ElemType);
178     size_t lenX = (n - 1) * params->incx + 1 + params->offBX * sizeof(ElemType);
179     size_t lenY = (n - 1) * params->incy + 1 + params->offCY * sizeof(ElemType);
180 
181     if((A_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL))
182 	{
183 		return 0;
184 	}
185 
186     base = clMath::BlasBase::getInstance();
187     gmemSize = (size_t)base->availGlobalMemSize(0);
188     allocSize = (size_t)base->maxMemAllocSize();
189 
190     bool suff = (lenA < allocSize) && ( (lenA + lenX + lenY) < gmemSize );
191 
192     return suff;
193 }
194 
195 template <typename ElemType> int
prepare(void)196 SbmvPerformanceTest<ElemType>::prepare(void)
197 {
198     size_t lenX, lenY, lenA;
199 
200     lenA = (params_.N * params_.lda) + params_.offA;
201 
202     if (params_.transA == clblasNoTrans) {
203         lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
204         lenY = (params_.N - 1) * abs(params_.incy) + 1 + params_.offCY;
205     }
206     else {
207         lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX;
208         lenY = (params_.N - 1)*abs(params_.incy) + 1 + params_.offCY;
209     }
210 
211     randomGbmvMatrices(params_.order, clblasNoTrans , params_.N, params_.N, &alpha, &beta,
212                         (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx, (Y_+params_.offCY), params_.incy );
213 
214     memcpy(backY_, Y_, lenY * sizeof(ElemType));
215 
216     mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
217     mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
218     mobjY_ = base_->createEnqueueBuffer(backY_, lenY * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
219 
220     return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 0 : -1;
221 }
222 
223 template <typename ElemType> nano_time_t
etalonPerfSingle(void)224 SbmvPerformanceTest<ElemType>::etalonPerfSingle(void)
225 {
226     nano_time_t time = 0;
227     clblasOrder fOrder;
228     clblasUplo fUplo;
229     size_t lda, lenY;
230     size_t fN = params_.N, fK = params_.K;
231 
232     lenY = (params_.N - 1) * params_.incy + 1 + params_.offCY;
233 
234     memcpy(Y_, backY_, lenY * sizeof(ElemType));
235     fOrder = params_.order;
236     fUplo = params_.uplo;
237     lda = params_.lda;
238 
239     if (fOrder != clblasColumnMajor)
240     {
241         fOrder = clblasColumnMajor;
242         fUplo = (params_.uplo == clblasLower)? clblasUpper : clblasLower;
243         fN = params_.N;
244    	}
245 
246 #ifdef PERF_TEST_WITH_ACML
247 
248    	time = getCurrentTime();
249    	clMath::blas::sbmv(fOrder, fUplo, fN, fK , alpha, A_, params_.offA, lda,
250 							X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy);
251   	time = getCurrentTime() - time;
252 
253 #endif  // PERF_TEST_WITH_ACML
254 
255     return time;
256 }
257 
258 
259 template <typename ElemType> nano_time_t
clblasPerfSingle(void)260 SbmvPerformanceTest<ElemType>::clblasPerfSingle(void)
261 {
262     nano_time_t time;
263     cl_event event;
264     cl_int status;
265     size_t lenY;
266     cl_command_queue queue = base_->commandQueues()[0];
267 
268     lenY = (params_.N - 1)* params_.incy + 1 + params_.offCY;
269 
270     status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
271                                   lenY * sizeof(ElemType), backY_, 0, NULL, &event);
272 
273     if (status != CL_SUCCESS) {
274         cerr << "Vector Y buffer object enqueuing error, status = " <<
275                  status << endl;
276 
277         return NANOTIME_ERR;
278     }
279 
280     status = clWaitForEvents(1, &event);
281     if (status != CL_SUCCESS) {
282         cout << "Wait on event failed, status = " <<
283                 status << endl;
284 
285         return NANOTIME_ERR;
286     }
287 
288     event = NULL;
289     time = getCurrentTime();
290     int iter = 20;
291 	for ( int i = 1; i <= iter; i++)
292 	{
293         status = clMath::clblas::sbmv(params_.order, params_.uplo, params_.N, params_.K,
294                                         alpha, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx,
295                                         beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event);
296 
297         if (status != CL_SUCCESS) {
298             cerr << "The CLBLAS GBMV function failed, status = " <<
299                     status << endl;
300             return NANOTIME_ERR;
301         }
302     }
303     clFinish( queue );
304     time = getCurrentTime() - time;
305 	time /= iter;
306 
307     return time;
308 }
309 
310 } // namespace clMath
311 
312 // sgbmv performance test
TEST_P(SBMV,ssbmv)313 TEST_P(SBMV, ssbmv)
314 {
315     TestParams params;
316 
317     getParams(&params);
318     SbmvPerformanceTest<float>::runInstance(FN_SSBMV, &params);
319 }
320 
321 // dgbmv performance test case
TEST_P(SBMV,dsbmv)322 TEST_P(SBMV, dsbmv)
323 {
324     TestParams params;
325 
326     getParams(&params);
327     SbmvPerformanceTest<double>::runInstance(FN_DSBMV, &params);
328 }
329