1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 /*
19 * Tbsv performance test cases
20 */
21
22 #include <stdlib.h> // srand()
23 #include <string.h> // memcpy()
24 #include <gtest/gtest.h>
25 #include <clBLAS.h>
26 #include <common.h>
27 #include <clBLAS-wrapper.h>
28 #include <BlasBase.h>
29 #include <tbmv.h>
30 #include <tbsv.h>
31 #include <blas-random.h>
32
33 #ifdef PERF_TEST_WITH_ACML
34 #include <blas-internal.h>
35 #include <blas-wrapper.h>
36 #endif
37
38 #include "PerformanceTest.h"
39
40 using namespace std;
41 using namespace clMath;
42
/*
 * Validate the integer status of a performance run.
 *
 * A negative value is reported as a fatal failure (ASSERT_GE aborts the
 * calling test function); any other non-zero value is reported as a
 * non-fatal failure. Wrapped in do/while(0) so that the macro behaves as a
 * single statement.
 */
#define CHECK_RESULT(ret)                                                   \
do {                                                                        \
    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
                         "perform an OpenCL request!" << std::endl;         \
    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case"         \
                      << std::endl;                                         \
} while (0)
50
51 namespace clMath {
52
53 template <typename ElemType> class TbsvPerformanceTest : public PerformanceTest
54 {
55 public:
56 virtual ~TbsvPerformanceTest();
57
58 virtual int prepare(void);
59 virtual nano_time_t etalonPerfSingle(void);
60 virtual nano_time_t clblasPerfSingle(void);
61
runInstance(BlasFunction fn,TestParams * params)62 static void runInstance(BlasFunction fn, TestParams *params)
63 {
64 TbsvPerformanceTest<ElemType> perfCase(fn, params);
65 int ret = 0;
66 int opFactor = 1;
67 BlasBase *base;
68
69 base = clMath::BlasBase::getInstance();
70
71 if ((fn == FN_DTBSV || fn == FN_ZTBSV) &&
72 !base->isDevSupportDoublePrecision()) {
73
74 std::cerr << ">> WARNING: The target device doesn't support native "
75 "double precision floating point arithmetic" <<
76 std::endl << ">> Test skipped" << std::endl;
77 return;
78 }
79
80 if (!perfCase.areResourcesSufficient(params)) {
81 std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
82 std::endl;
83 }
84 else {
85 ret = perfCase.run(opFactor);
86 }
87
88 ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
89 "perform an OpenCL request!" << endl;
90 EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
91 }
92
93 private:
94 TbsvPerformanceTest(BlasFunction fn, TestParams *params);
95
96 bool areResourcesSufficient(TestParams *params);
97
98 TestParams params_;
99 ElemType *A_;
100 ElemType *X_;
101 ElemType *backX_;
102 cl_mem mobjA_;
103 cl_mem mobjX_;
104 cl_mem mobjScratch_;
105 ::clMath::BlasBase *base_;
106 };
107
108 template <typename ElemType>
TbsvPerformanceTest(BlasFunction fn,TestParams * params)109 TbsvPerformanceTest<ElemType>::TbsvPerformanceTest(
110 BlasFunction fn,
111 TestParams *params) : PerformanceTest(fn,
112 (problem_size_t)( params->N * (params->K+1) * 2 // A & X access
113 - (params->K * (params->K+1) ) // Substract hole-part for A & X
114 + (2*params->N) /* Y access */ ) * sizeof(ElemType) ),
115 params_(*params), mobjA_(NULL), mobjX_(NULL), mobjScratch_(NULL)
116 {
117 size_t lenA, lenX;
118 lenA = params_.N * params_.lda + params_.offA;
119 lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX;
120 A_ = new ElemType[ lenA ];
121 X_ = new ElemType[ lenX ];
122 backX_ = new ElemType[ lenX ];
123
124 base_ = ::clMath::BlasBase::getInstance();
125
126 mobjA_ = NULL;
127 mobjX_ = NULL;
128 mobjScratch_ = NULL;
129 }
130
131 template <typename ElemType>
~TbsvPerformanceTest()132 TbsvPerformanceTest<ElemType>::~TbsvPerformanceTest()
133 {
134 if(A_ != NULL)
135 {
136 delete[] A_;
137 }
138 if(X_ != NULL)
139 {
140 delete[] X_;
141 }
142 if(backX_ != NULL)
143 {
144 delete[] backX_;
145 }
146
147 if ( mobjA_ != NULL )
148 clReleaseMemObject(mobjA_);
149 if ( mobjX_ != NULL )
150 clReleaseMemObject(mobjX_);
151 if ( mobjScratch_ != NULL )
152 clReleaseMemObject(mobjScratch_);
153 }
154
155 /*
156 * Check if available OpenCL resources are sufficient to
157 * run the test case
158 */
159 template <typename ElemType> bool
areResourcesSufficient(TestParams * params)160 TbsvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
161 {
162 clMath::BlasBase *base;
163 size_t gmemSize, allocSize;
164 size_t n = params->N, lda = params->lda;
165 size_t lenA = (n * lda + params->offA)* sizeof(ElemType);
166 size_t lenX = ((params->N - 1)* params->incx + 1 + params->offBX) * sizeof(ElemType);
167
168 if((A_ == NULL) || (X_ == NULL) || (backX_ == NULL))
169 {
170 return 0;
171 }
172
173 base = clMath::BlasBase::getInstance();
174 gmemSize = (size_t)base->availGlobalMemSize(0);
175 allocSize = (size_t)base->maxMemAllocSize();
176
177 bool suff = (lenA < allocSize) && ( (lenA + 2 * lenX) < gmemSize );
178
179 return suff;
180 }
181
182 template <typename ElemType> int
prepare(void)183 TbsvPerformanceTest<ElemType>::prepare(void)
184 {
185 size_t lenX, lenA;
186
187 lenA = params_.N * params_.lda + params_.offA;
188 lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX;
189
190 randomTbsvMatrices( params_.order, params_.uplo, params_.diag, params_.N, params_.K,
191 (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx );
192
193 memcpy(backX_, X_, lenX * sizeof(ElemType));
194
195 mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
196 mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
197 mobjScratch_ = base_->createEnqueueBuffer(backX_, lenX * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
198
199 return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjScratch_ != NULL)) ? 0 : -1;
200 }
201
202 template <typename ElemType> nano_time_t
etalonPerfSingle(void)203 TbsvPerformanceTest<ElemType>::etalonPerfSingle(void)
204 {
205 nano_time_t time = 0;
206 clblasOrder fOrder;
207 clblasTranspose fTrans;
208 clblasUplo fUplo;
209 size_t lda, lenA, lenX;
210
211 lenA = params_.N * params_.lda;
212 lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX;
213
214 memcpy(X_, backX_, lenX * sizeof(ElemType));
215 fOrder = params_.order;
216 fTrans = params_.transA;
217 fUplo = params_.uplo;
218 lda = params_.lda;
219
220 if (fOrder != clblasColumnMajor)
221 {
222 fOrder = clblasColumnMajor;
223 fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
224 fUplo = (params_.uplo == clblasLower)? clblasUpper : clblasLower;
225
226 if( params_.transA == clblasConjTrans )
227 doConjugate( (A_+params_.offA), 1, lenA, lda );
228 }
229
230 #ifdef PERF_TEST_WITH_ACML
231
232 time = getCurrentTime();
233 clMath::blas::tbsv(fOrder, fUplo, fTrans, params_.diag, params_.N, params_.K, A_, params_.offA, lda, X_, params_.offBX, params_.incx);
234 time = getCurrentTime() - time;
235
236 #endif // PERF_TEST_WITH_ACML
237
238 return time;
239 }
240
241
242 template <typename ElemType> nano_time_t
clblasPerfSingle(void)243 TbsvPerformanceTest<ElemType>::clblasPerfSingle(void)
244 {
245 nano_time_t time;
246 cl_event event;
247 cl_int status;
248 size_t lenX;
249 cl_command_queue queue = base_->commandQueues()[0];
250
251 lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX;
252 status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
253 lenX * sizeof(ElemType), backX_, 0, NULL, &event);
254
255 if (status != CL_SUCCESS) {
256 cerr << "Vector X buffer object enqueuing error, status = " <<
257 status << endl;
258
259 return NANOTIME_ERR;
260 }
261
262 status = clWaitForEvents(1, &event);
263 if (status != CL_SUCCESS) {
264 cout << "Wait on event failed, status = " <<
265 status << endl;
266
267 return NANOTIME_ERR;
268 }
269 DataType type;
270 type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
271 ( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
272
273 event = NULL;
274 time = getCurrentTime();
275 int iter = 20;
276 for ( int i = 1; i <= iter; i++)
277 {
278 status = clMath::clblas::tbsv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, params_.K,
279 mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx,
280 1, &queue, 0, NULL, &event);
281
282 if (status != CL_SUCCESS) {
283 cerr << "The CLBLAS TBSV function failed, status = " <<
284 status << endl;
285 return NANOTIME_ERR;
286 }
287 }
288 clFinish( queue );
289 time = getCurrentTime() - time;
290 time /= iter;
291
292 return time;
293 }
294
295 } // namespace clMath
296
// Single precision real TBSV performance case.
TEST_P(TBSV, stbsv)
{
    TestParams params;

    // Presumably fills params from the current test parameter set; the
    // fixture is defined outside this file.
    getParams(&params);
    TbsvPerformanceTest<float>::runInstance(FN_STBSV, &params);
}
304
// Double precision real TBSV performance case; runInstance() skips it when
// the device lacks double precision support.
TEST_P(TBSV, dtbsv)
{
    TestParams params;

    // Presumably fills params from the current test parameter set; the
    // fixture is defined outside this file.
    getParams(&params);
    TbsvPerformanceTest<double>::runInstance(FN_DTBSV, &params);
}
312
// Single precision complex TBSV performance case.
TEST_P(TBSV, ctbsv)
{
    TestParams params;

    // Presumably fills params from the current test parameter set; the
    // fixture is defined outside this file.
    getParams(&params);
    TbsvPerformanceTest<FloatComplex>::runInstance(FN_CTBSV, &params);
}
320
// Double precision complex TBSV performance case; runInstance() skips it
// when the device lacks double precision support.
TEST_P(TBSV, ztbsv)
{
    TestParams params;

    // Presumably fills params from the current test parameter set; the
    // fixture is defined outside this file.
    getParams(&params);
    TbsvPerformanceTest<DoubleComplex>::runInstance(FN_ZTBSV, &params);
}
328