1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 /*
19  * Tbsv performance test cases
20  */
21 
22 #include <stdlib.h>             // srand()
23 #include <string.h>             // memcpy()
24 #include <gtest/gtest.h>
25 #include <clBLAS.h>
26 #include <common.h>
27 #include <clBLAS-wrapper.h>
28 #include <BlasBase.h>
29 #include <tbmv.h>
30 #include <tbsv.h>
31 #include <blas-random.h>
32 
33 #ifdef PERF_TEST_WITH_ACML
34 #include <blas-internal.h>
35 #include <blas-wrapper.h>
36 #endif
37 
38 #include "PerformanceTest.h"
39 
40 using namespace std;
41 using namespace clMath;
42 
/*
 * Report the outcome of a performance run through gtest.
 *
 * A negative `ret` triggers a fatal assertion (ASSERT_GE), any other
 * non-zero `ret` a non-fatal one (EXPECT_EQ). The do/while(0) wrapper lets
 * the macro be used as a single statement. Because ASSERT_GE may return
 * from the enclosing function, use it only in functions returning void.
 */
#define CHECK_RESULT(ret)                                                   \
do {                                                                        \
    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
                         "perform an OpenCL request!" << endl;              \
    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
                         endl;                                              \
} while (0)
50 
51 namespace clMath {
52 
53 template <typename ElemType> class TbsvPerformanceTest : public PerformanceTest
54 {
55 public:
56     virtual ~TbsvPerformanceTest();
57 
58     virtual int prepare(void);
59     virtual nano_time_t etalonPerfSingle(void);
60     virtual nano_time_t clblasPerfSingle(void);
61 
runInstance(BlasFunction fn,TestParams * params)62     static void runInstance(BlasFunction fn, TestParams *params)
63     {
64         TbsvPerformanceTest<ElemType> perfCase(fn, params);
65         int ret = 0;
66         int opFactor = 1;
67         BlasBase *base;
68 
69         base = clMath::BlasBase::getInstance();
70 
71         if ((fn == FN_DTBSV || fn == FN_ZTBSV) &&
72             !base->isDevSupportDoublePrecision()) {
73 
74             std::cerr << ">> WARNING: The target device doesn't support native "
75                          "double precision floating point arithmetic" <<
76                          std::endl << ">> Test skipped" << std::endl;
77             return;
78         }
79 
80         if (!perfCase.areResourcesSufficient(params)) {
81             std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
82                         std::endl;
83         }
84         else {
85             ret = perfCase.run(opFactor);
86         }
87 
88         ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
89                              "perform an OpenCL request!" << endl;
90         EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
91     }
92 
93 private:
94     TbsvPerformanceTest(BlasFunction fn, TestParams *params);
95 
96     bool areResourcesSufficient(TestParams *params);
97 
98     TestParams params_;
99     ElemType *A_;
100     ElemType *X_;
101     ElemType *backX_;
102     cl_mem mobjA_;
103     cl_mem mobjX_;
104     cl_mem mobjScratch_;
105     ::clMath::BlasBase *base_;
106 };
107 
108 template <typename ElemType>
TbsvPerformanceTest(BlasFunction fn,TestParams * params)109 TbsvPerformanceTest<ElemType>::TbsvPerformanceTest(
110     BlasFunction fn,
111     TestParams *params) : PerformanceTest(fn,
112     (problem_size_t)(   params->N * (params->K+1) * 2           // A & X access
113                      - (params->K * (params->K+1) )             // Substract hole-part for A & X
114                      + (2*params->N)   /* Y access */  ) * sizeof(ElemType)  ),
115                             params_(*params), mobjA_(NULL), mobjX_(NULL), mobjScratch_(NULL)
116 {
117     size_t lenA, lenX;
118     lenA = params_.N  * params_.lda + params_.offA;
119     lenX = (params_.N  - 1)* params_.incx + 1 + params_.offBX;
120     A_ = new ElemType[ lenA ];
121     X_ = new ElemType[ lenX ];
122     backX_ = new ElemType[ lenX ];
123 
124     base_ = ::clMath::BlasBase::getInstance();
125 
126 	mobjA_ = NULL;
127 	mobjX_ = NULL;
128 	mobjScratch_ = NULL;
129 }
130 
131 template <typename ElemType>
~TbsvPerformanceTest()132 TbsvPerformanceTest<ElemType>::~TbsvPerformanceTest()
133 {
134     if(A_ != NULL)
135     {
136         delete[] A_;
137     }
138 	if(X_ != NULL)
139 	{
140         delete[] X_;
141 	}
142 	if(backX_ != NULL)
143 	{
144 		delete[] backX_;
145 	}
146 
147     if ( mobjA_ != NULL )
148 		clReleaseMemObject(mobjA_);
149 	if ( mobjX_ != NULL )
150 	    clReleaseMemObject(mobjX_);
151 	if ( mobjScratch_ != NULL )
152 		clReleaseMemObject(mobjScratch_);
153 }
154 
155 /*
156  * Check if available OpenCL resources are sufficient to
157  * run the test case
158  */
159 template <typename ElemType> bool
areResourcesSufficient(TestParams * params)160 TbsvPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
161 {
162     clMath::BlasBase *base;
163     size_t gmemSize, allocSize;
164     size_t n = params->N, lda = params->lda;
165     size_t lenA = (n * lda  + params->offA)* sizeof(ElemType);
166     size_t lenX = ((params->N - 1)* params->incx + 1 + params->offBX) * sizeof(ElemType);
167 
168     if((A_ == NULL) || (X_ == NULL) || (backX_ == NULL))
169 	{
170 		return 0;
171 	}
172 
173     base = clMath::BlasBase::getInstance();
174     gmemSize = (size_t)base->availGlobalMemSize(0);
175     allocSize = (size_t)base->maxMemAllocSize();
176 
177     bool suff = (lenA < allocSize) && ( (lenA + 2 * lenX) < gmemSize );
178 
179     return suff;
180 }
181 
182 template <typename ElemType> int
prepare(void)183 TbsvPerformanceTest<ElemType>::prepare(void)
184 {
185     size_t lenX, lenA;
186 
187     lenA = params_.N * params_.lda + params_.offA;
188     lenX = (params_.N - 1)*abs(params_.incx) + 1 + params_.offBX;
189 
190     randomTbsvMatrices( params_.order, params_.uplo, params_.diag, params_.N, params_.K,
191                             (A_+params_.offA), params_.lda, (X_+params_.offBX), params_.incx );
192 
193     memcpy(backX_, X_, lenX * sizeof(ElemType));
194 
195     mobjA_ = base_->createEnqueueBuffer(A_, lenA * sizeof(ElemType), 0, CL_MEM_READ_ONLY);
196     mobjX_ = base_->createEnqueueBuffer(X_, lenX * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
197     mobjScratch_ = base_->createEnqueueBuffer(backX_, lenX * sizeof(ElemType), 0, CL_MEM_READ_WRITE);
198 
199     return ((mobjA_ != NULL) && (mobjX_ != NULL) && (mobjScratch_ != NULL)) ? 0 : -1;
200 }
201 
202 template <typename ElemType> nano_time_t
etalonPerfSingle(void)203 TbsvPerformanceTest<ElemType>::etalonPerfSingle(void)
204 {
205     nano_time_t time = 0;
206     clblasOrder fOrder;
207     clblasTranspose fTrans;
208     clblasUplo fUplo;
209     size_t lda, lenA, lenX;
210 
211     lenA = params_.N * params_.lda;
212     lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX;
213 
214     memcpy(X_, backX_, lenX * sizeof(ElemType));
215     fOrder = params_.order;
216     fTrans = params_.transA;
217     fUplo = params_.uplo;
218     lda = params_.lda;
219 
220     if (fOrder != clblasColumnMajor)
221     {
222         fOrder = clblasColumnMajor;
223         fTrans = (params_.transA == clblasNoTrans)? clblasTrans : clblasNoTrans;
224         fUplo = (params_.uplo == clblasLower)? clblasUpper : clblasLower;
225 
226 		if( params_.transA == clblasConjTrans )
227             doConjugate( (A_+params_.offA), 1, lenA, lda );
228    	}
229 
230 #ifdef PERF_TEST_WITH_ACML
231 
232    	time = getCurrentTime();
233    	clMath::blas::tbsv(fOrder, fUplo, fTrans, params_.diag, params_.N, params_.K, A_, params_.offA, lda, X_, params_.offBX, params_.incx);
234   	time = getCurrentTime() - time;
235 
236 #endif  // PERF_TEST_WITH_ACML
237 
238     return time;
239 }
240 
241 
242 template <typename ElemType> nano_time_t
clblasPerfSingle(void)243 TbsvPerformanceTest<ElemType>::clblasPerfSingle(void)
244 {
245     nano_time_t time;
246     cl_event event;
247     cl_int status;
248     size_t lenX;
249     cl_command_queue queue = base_->commandQueues()[0];
250 
251     lenX = (params_.N - 1)* params_.incx + 1 + params_.offBX;
252     status = clEnqueueWriteBuffer(queue, mobjX_, CL_TRUE, 0,
253                                   lenX * sizeof(ElemType), backX_, 0, NULL, &event);
254 
255     if (status != CL_SUCCESS) {
256         cerr << "Vector X buffer object enqueuing error, status = " <<
257                  status << endl;
258 
259         return NANOTIME_ERR;
260     }
261 
262     status = clWaitForEvents(1, &event);
263     if (status != CL_SUCCESS) {
264         cout << "Wait on event failed, status = " <<
265                 status << endl;
266 
267         return NANOTIME_ERR;
268     }
269     DataType type;
270     type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
271 										( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
272 
273     event = NULL;
274     time = getCurrentTime();
275     int iter = 20;
276 	for ( int i = 1; i <= iter; i++)
277 	{
278         status = clMath::clblas::tbsv(type, params_.order, params_.uplo, params_.transA, params_.diag, params_.N, params_.K,
279                                         mobjA_, params_.offA, params_.lda, mobjX_, params_.offBX, params_.incx,
280                                          1, &queue, 0, NULL, &event);
281 
282         if (status != CL_SUCCESS) {
283             cerr << "The CLBLAS TBSV function failed, status = " <<
284                     status << endl;
285             return NANOTIME_ERR;
286         }
287     }
288     clFinish( queue );
289     time = getCurrentTime() - time;
290 	time /= iter;
291 
292     return time;
293 }
294 
295 } // namespace clMath
296 
// TBSV performance case for float elements.
TEST_P(TBSV, stbsv)
{
    TestParams caseParams;
    getParams(&caseParams);

    TbsvPerformanceTest<float>::runInstance(FN_STBSV, &caseParams);
}
304 
// TBSV performance case for double elements.
TEST_P(TBSV, dtbsv)
{
    TestParams caseParams;
    getParams(&caseParams);

    TbsvPerformanceTest<double>::runInstance(FN_DTBSV, &caseParams);
}
312 
// TBSV performance case for FloatComplex elements.
TEST_P(TBSV, ctbsv)
{
    TestParams caseParams;
    getParams(&caseParams);

    TbsvPerformanceTest<FloatComplex>::runInstance(FN_CTBSV, &caseParams);
}
320 
// TBSV performance case for DoubleComplex elements.
TEST_P(TBSV, ztbsv)
{
    TestParams caseParams;
    getParams(&caseParams);

    TbsvPerformanceTest<DoubleComplex>::runInstance(FN_ZTBSV, &caseParams);
}
328