1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 /*
19 * Sbmv performance test cases
20 */
21
22 #include <stdlib.h> // srand()
23 #include <string.h> // memcpy()
24 #include <gtest/gtest.h>
25 #include <clBLAS.h>
26 #include <common.h>
27 #include <clBLAS-wrapper.h>
28 #include <BlasBase.h>
29 #include <gbmv.h>
30 #include <sbmv.h>
31 #include <blas-random.h>
32
33 #ifdef PERF_TEST_WITH_ACML
34 #include <blas-internal.h>
35 #include <blas-wrapper.h>
36 #endif
37
38 #include "PerformanceTest.h"
39
40 using namespace std;
41 using namespace clMath;
42
/*
 * Validate the status code of a performance run.
 *
 * A negative value fails the ASSERT_GE check, any other non-zero value fails
 * the EXPECT_EQ check. Note that `ret` is expanded twice, so it should be a
 * plain variable rather than an expression with side effects.
 */
#define CHECK_RESULT(ret)                                                     \
    do {                                                                      \
        ASSERT_GE(ret, 0)                                                     \
            << "Fatal error: can not allocate resources or perform an OpenCL request!" \
            << endl;                                                          \
        EXPECT_EQ(0, ret)                                                     \
            << "The OpenCL version is slower in the case"                     \
            << endl;                                                          \
    } while (0)
50
51 namespace clMath {
52
53 template <typename ElemType> class SbmvPerformanceTest : public PerformanceTest
54 {
55 public:
56 virtual ~SbmvPerformanceTest();
57
58 virtual int prepare(void);
59 virtual nano_time_t etalonPerfSingle(void);
60 virtual nano_time_t clblasPerfSingle(void);
61
runInstance(BlasFunction fn,TestParams * params)62 static void runInstance(BlasFunction fn, TestParams *params)
63 {
64 SbmvPerformanceTest<ElemType> perfCase(fn, params);
65 int ret = 0;
66 int opFactor = 1;
67 BlasBase *base;
68
69 base = clMath::BlasBase::getInstance();
70
71 if ((fn == FN_DSBMV) &&
72 !base->isDevSupportDoublePrecision()) {
73
74 std::cerr << ">> WARNING: The target device doesn't support native "
75 "double precision floating point arithmetic" <<
76 std::endl << ">> Test skipped" << std::endl;
77 return;
78 }
79
80 if (!perfCase.areResourcesSufficient(params)) {
81 std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
82 std::endl;
83 }
84 else {
85 ret = perfCase.run(opFactor);
86 }
87
88 ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
89 "perform an OpenCL request!" << endl;
90 EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
91 }
92
93 private:
94 SbmvPerformanceTest(BlasFunction fn, TestParams *params);
95
96 bool areResourcesSufficient(TestParams *params);
97
98 TestParams params_;
99 ElemType alpha;
100 ElemType beta;
101 ElemType *A_;
102 ElemType *X_;
103 ElemType *Y_;
104 ElemType *backY_;
105 cl_mem mobjA_;
106 cl_mem mobjX_;
107 cl_mem mobjY_;
108 ::clMath::BlasBase *base_;
109 };
110
111 template <typename ElemType>
SbmvPerformanceTest(BlasFunction fn,TestParams * params)112 SbmvPerformanceTest<ElemType>::SbmvPerformanceTest(
113 BlasFunction fn,
114 TestParams *params) : PerformanceTest(fn,
115 (problem_size_t)( ( (2 * (params->N) * (params->K + 1) // A-access
116 - (2 * params->K * (params->K+1)) ) // Substract hole-part for A & X
117 +( ((2*params->K + 1) * params->N + 2*params->N)) // X & Y access
118 ) * sizeof(ElemType) ) ),
119 params_(*params), mobjA_(NULL), mobjX_(NULL), mobjY_(NULL)
120 {
121 size_t lenA, lenX, lenY;
122 lenA = params_.N * (params_.lda) + params_.offA;
123 lenX = params_.N - 1* params_.incx + 1 + params_.offBX;
124 lenY = params_.N - 1* params_.incy + 1 + params_.offCY;
125 A_ = new ElemType[ lenA ];
126 X_ = new ElemType[ lenX ];
127 Y_ = new ElemType[ lenY ];
128 backY_ = new ElemType[ lenY ];
129 alpha = convertMultiplier<ElemType>(params_.alpha);
130 beta = convertMultiplier<ElemType>(params_.beta);
131
132 base_ = ::clMath::BlasBase::getInstance();
133
134 mobjA_ = NULL;
135 mobjX_ = NULL;
136 mobjY_ = NULL;
137 }
138
139 template <typename ElemType>
~SbmvPerformanceTest()140 SbmvPerformanceTest<ElemType>::~SbmvPerformanceTest()
141 {
142 if(A_ != NULL)
143 {
144 delete[] A_;
145 }
146 if(X_ != NULL)
147 {
148 delete[] X_;
149 }
150 if(backY_ != NULL)
151 {
152 delete[] backY_;
153 }
154 if(Y_ != NULL)
155 {
156 delete[] Y_;
157 }
158
159 if ( mobjA_ != NULL )
160 clReleaseMemObject(mobjA_);
161 if ( mobjX_ != NULL )
162 clReleaseMemObject(mobjX_);
163 if ( mobjY_ != NULL )
164 clReleaseMemObject(mobjY_);
165 }
166
167 /*
168 * Check if available OpenCL resources are sufficient to
169 * run the test case
170 */
171 template <typename ElemType> bool
areResourcesSufficient(TestParams * params)172 SbmvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
173 {
174 clMath::BlasBase *base;
175 size_t gmemSize, allocSize;
176 size_t n = params->N, lda = params->lda;
177 size_t lenA = (n * lda) + params->offA* sizeof(ElemType);
178 size_t lenX = (n - 1) * params->incx + 1 + params->offBX * sizeof(ElemType);
179 size_t lenY = (n - 1) * params->incy + 1 + params->offCY * sizeof(ElemType);
180
181 if((A_ == NULL) || (X_ == NULL) || (Y_ == NULL) || (backY_ == NULL))
182 {
183 return 0;
184 }
185
186 base = clMath::BlasBase::getInstance();
187 gmemSize = (size_t)base->availGlobalMemSize(0);
188 allocSize = (size_t)base->maxMemAllocSize();
189
190 bool suff = (lenA < allocSize) && ( (lenA + lenX + lenY) < gmemSize );
191
192 return suff;
193 }
194
195 template <typename ElemType> int
prepare(void)196 SbmvPerformanceTest<ElemType>::prepare(void)
197 {
198 size_t lenX, lenY, lenA;
199
200 lenA = (params_.N * params_.lda) + params_.offA;
201
202 if (params_.transA == clblasNoTrans) {
203 lenX = (params_.N - 1) * abs(params_.incx) + 1 + params_.offBX;
204 lenY = (params_.N - 1) * abs(params_.incy) + 1 + params_.offCY;
205 }
206 else {
207 lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX;
208 lenY = (params_.N - 1)*abs(params_.incy) + 1 + params_.offCY;
209 }
210
211 randomGbmvMatrices(params_.order, clblasNoTrans , params_.N, params_.N, &alpha, &beta,
212 (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx, (Y_+params_.offCY), params_.incy );
213
214 memcpy(backY_, Y_, lenY * sizeof(ElemType));
215
216 mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
217 mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
218 mobjY_ = base_->createEnqueueBuffer(backY_, lenY * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
219
220 return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjY_ != NULL)) ? 0 : -1;
221 }
222
223 template <typename ElemType> nano_time_t
etalonPerfSingle(void)224 SbmvPerformanceTest<ElemType>::etalonPerfSingle(void)
225 {
226 nano_time_t time = 0;
227 clblasOrder fOrder;
228 clblasUplo fUplo;
229 size_t lda, lenY;
230 size_t fN = params_.N, fK = params_.K;
231
232 lenY = (params_.N - 1) * params_.incy + 1 + params_.offCY;
233
234 memcpy(Y_, backY_, lenY * sizeof(ElemType));
235 fOrder = params_.order;
236 fUplo = params_.uplo;
237 lda = params_.lda;
238
239 if (fOrder != clblasColumnMajor)
240 {
241 fOrder = clblasColumnMajor;
242 fUplo = (params_.uplo == clblasLower)? clblasUpper : clblasLower;
243 fN = params_.N;
244 }
245
246 #ifdef PERF_TEST_WITH_ACML
247
248 time = getCurrentTime();
249 clMath::blas::sbmv(fOrder, fUplo, fN, fK , alpha, A_, params_.offA, lda,
250 X_, params_.offBX, params_.incx, beta, Y_, params_.offCY, params_.incy);
251 time = getCurrentTime() - time;
252
253 #endif // PERF_TEST_WITH_ACML
254
255 return time;
256 }
257
258
259 template <typename ElemType> nano_time_t
clblasPerfSingle(void)260 SbmvPerformanceTest<ElemType>::clblasPerfSingle(void)
261 {
262 nano_time_t time;
263 cl_event event;
264 cl_int status;
265 size_t lenY;
266 cl_command_queue queue = base_->commandQueues()[0];
267
268 lenY = (params_.N - 1)* params_.incy + 1 + params_.offCY;
269
270 status = clEnqueueWriteBuffer(queue, mobjY_, CL_TRUE, 0,
271 lenY * sizeof(ElemType), backY_, 0, NULL, &event);
272
273 if (status != CL_SUCCESS) {
274 cerr << "Vector Y buffer object enqueuing error, status = " <<
275 status << endl;
276
277 return NANOTIME_ERR;
278 }
279
280 status = clWaitForEvents(1, &event);
281 if (status != CL_SUCCESS) {
282 cout << "Wait on event failed, status = " <<
283 status << endl;
284
285 return NANOTIME_ERR;
286 }
287
288 event = NULL;
289 time = getCurrentTime();
290 int iter = 20;
291 for ( int i = 1; i <= iter; i++)
292 {
293 status = clMath::clblas::sbmv(params_.order, params_.uplo, params_.N, params_.K,
294 alpha, mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx,
295 beta, mobjY_, params_.offCY, params_.incy, 1, &queue, 0, NULL, &event);
296
297 if (status != CL_SUCCESS) {
298 cerr << "The CLBLAS GBMV function failed, status = " <<
299 status << endl;
300 return NANOTIME_ERR;
301 }
302 }
303 clFinish( queue );
304 time = getCurrentTime() - time;
305 time /= iter;
306
307 return time;
308 }
309
310 } // namespace clMath
311
312 // sgbmv performance test
TEST_P(SBMV,ssbmv)313 TEST_P(SBMV, ssbmv)
314 {
315 TestParams params;
316
317 getParams(¶ms);
318 SbmvPerformanceTest<float>::runInstance(FN_SSBMV, ¶ms);
319 }
320
321 // dgbmv performance test case
TEST_P(SBMV,dsbmv)322 TEST_P(SBMV, dsbmv)
323 {
324 TestParams params;
325
326 getParams(¶ms);
327 SbmvPerformanceTest<double>::runInstance(FN_DSBMV, ¶ms);
328 }
329