1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
/*
 * Gerc performance test cases
 */
21
22 #include <stdlib.h> // srand()
23 #include <string.h> // memcpy()
24 #include <gtest/gtest.h>
25 #include <clBLAS.h>
26
27 #include <common.h>
28 #include <clBLAS-wrapper.h>
29 #include <BlasBase.h>
30 #include <gerc.h>
31 #include <blas-random.h>
32
33 #ifdef PERF_TEST_WITH_ACML
34 #include <blas-internal.h>
35 #include <blas-wrapper.h>
36 #endif
37
38 #include "PerformanceTest.h"
39
40 /*
41 * NOTE: operation factor means overall number
42 * of multiply and add per each operation involving
43 * 2 matrix elements
44 */
45
46 using namespace std;
47 using namespace clMath;
48
/*
 * Report the outcome of a performance run inside a gtest test body.
 *
 * A negative 'ret' is treated as fatal (ASSERT_GE aborts the current test);
 * any other non-zero value is recorded as a non-fatal failure meaning the
 * OpenCL implementation was slower than the reference.
 *
 * NOTE: 'ret' is expanded twice, so pass a plain variable rather than an
 * expression with side effects.
 */
#define CHECK_RESULT(ret)                                                    \
do {                                                                         \
    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
                         "perform an OpenCL request!" << endl;               \
    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<       \
                         endl;                                               \
} while (0)
56
57 namespace clMath {
58
59 template <typename ElemType> class GercPerformanceTest : public PerformanceTest
60 {
61 public:
62 virtual ~GercPerformanceTest();
63
64 virtual int prepare(void);
65 virtual nano_time_t etalonPerfSingle(void);
66 virtual nano_time_t clblasPerfSingle(void);
67
runInstance(BlasFunction fn,TestParams * params)68 static void runInstance(BlasFunction fn, TestParams *params)
69 {
70 GercPerformanceTest<ElemType> perfCase(fn, params);
71 int ret = 0;
72 int opFactor;
73 BlasBase *base;
74
75 base = clMath::BlasBase::getInstance();
76
77 opFactor =1;
78
79 if (fn == FN_ZGERC &&
80 !base->isDevSupportDoublePrecision()) {
81
82 std::cerr << ">> WARNING: The target device doesn't support native "
83 "double precision floating point arithmetic" <<
84 std::endl << ">> Test skipped" << std::endl;
85 return;
86 }
87
88 if (!perfCase.areResourcesSufficient(params)) {
89 std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
90 std::endl;
91 return;
92 }
93 else {
94 ret = perfCase.run(opFactor);
95 }
96
97 ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
98 "perform an OpenCL request!" << endl;
99 EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
100 }
101
102 private:
103 GercPerformanceTest(BlasFunction fn, TestParams *params);
104
105 bool areResourcesSufficient(TestParams *params);
106
107 TestParams params_;
108 ElemType alpha_;
109 ElemType *A_;
110 ElemType *backA_;
111 ElemType *x_;
112 ElemType *y_;
113 cl_mem mobjA_;
114 cl_mem mobjx_;
115 cl_mem mobjy_;
116 int lengthA;
117 ::clMath::BlasBase *base_;
118 };
119
120 template <typename ElemType>
GercPerformanceTest(BlasFunction fn,TestParams * params)121 GercPerformanceTest<ElemType>::GercPerformanceTest(
122 BlasFunction fn,
123 TestParams *params) : PerformanceTest(fn,(problem_size_t) (((2 * params->M * params->N) + params->M + params->N ) * sizeof(ElemType) ) ), params_(*params), mobjA_(NULL), mobjx_(NULL), mobjy_(NULL)
124 {
125 //if( params_.side == clblasLeft )
126 // ka = params_.M;
127 //else ka = params_.N;
128
129 if( params_.order == clblasColumnMajor )
130 lengthA = params_.N * params_.lda;
131 else
132 lengthA = params_.M * params_.lda;
133
134 A_ = new ElemType[(lengthA) + params_.offa];
135 backA_ = new ElemType[lengthA+ params_.offa];
136 x_ = new ElemType[(1 + (params->M - 1) * abs(params_.incx))+ params_.offBX];
137 y_ = new ElemType[(1 + (params->N - 1) * abs(params_.incy)) + params_.offCY] ;
138
139 base_ = ::clMath::BlasBase::getInstance();
140 }
141
142 template <typename ElemType>
~GercPerformanceTest()143 GercPerformanceTest<ElemType>::~GercPerformanceTest()
144 {
145 if(A_ != NULL)
146 {
147 delete[] A_;
148 }
149 if(x_ != NULL)
150 {
151 delete[] x_;
152 }
153 if(y_ != NULL)
154 {
155 delete[] y_;
156 }
157 if(backA_ != NULL)
158 {
159 delete[] backA_;
160 }
161
162 if( mobjy_ != NULL )
163 clReleaseMemObject(mobjy_);
164 if( mobjx_ != NULL )
165 clReleaseMemObject(mobjx_);
166 if( mobjA_ != NULL )
167 clReleaseMemObject(mobjA_);
168 }
169
170 /*
171 * Check if available OpenCL resources are sufficient to
172 * run the test case
173 */
174 template <typename ElemType> bool
areResourcesSufficient(TestParams * params)175 GercPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
176 {
177 clMath::BlasBase *base;
178 size_t gmemSize, allocSize;
179 bool ret;
180 size_t m = params->M, n = params->N;
181
182 if((A_ == NULL) || (backA_ == NULL) || (x_ == NULL) || (y_ == NULL))
183 {
184 return 0;
185 }
186
187 base = clMath::BlasBase::getInstance();
188 gmemSize = (size_t)base->availGlobalMemSize( 0 );
189 allocSize = (size_t)base->maxMemAllocSize();
190
191 ret = std::max(m, n) * params_.lda * sizeof(ElemType) < allocSize;
192 ret = ret && ( ((1 + (params_.M-1)*abs(params_.incx)))* sizeof(ElemType) < allocSize);
193 ret = ret && ( ((1 + (params_.N-1)*abs(params_.incy))) * sizeof(ElemType) < allocSize);
194
195 ret = ret && (((std::max(m, n) * params_.lda) + ((1 + (params_.M-1)*abs(params_.incx))) + ((1 + (params_.N-1)*abs(params_.incy)))) < gmemSize);
196
197 return ret;
198 }
199
200 template <typename ElemType> int
prepare(void)201 GercPerformanceTest<ElemType>::prepare(void)
202 {
203 bool useAlpha = base_->useAlpha();
204
205 if (useAlpha) {
206 alpha_ = convertMultiplier<ElemType>(params_.alpha);
207 }
208
209
210 int creationFlags = 0;
211 creationFlags = creationFlags | RANDOM_INIT;
212
213 creationFlags = ( (params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
214 BlasRoutineID funcId = CLBLAS_GER;
215
216 populate( A_ + params_.offa, params_.M, params_.N, params_.lda, funcId, creationFlags);
217 populate( x_ , (1 + (params_.M-1) * abs(params_.incx) + params_.offBX),1, (1 + (params_.M-1) * abs(params_.incx) + params_.offBX), funcId, 0 );
218 populate( y_ , (1 + (params_.N-1) * abs(params_.incy) + params_.offCY),1, (1 + (params_.N-1) * abs(params_.incy) + params_.offCY), funcId, 0 );
219
220
221 memcpy(backA_, A_, (lengthA + params_.offa)* sizeof(ElemType));
222
223 mobjA_ = base_->createEnqueueBuffer(A_, (lengthA + params_.offa) * sizeof(*A_), 0, CL_MEM_READ_WRITE);
224 mobjx_ = base_->createEnqueueBuffer(x_, ( (1 + (params_.M-1) * abs(params_.incx) + params_.offBX)) * sizeof(*x_), 0, CL_MEM_READ_WRITE);
225 mobjy_ = base_->createEnqueueBuffer(y_,( (1 + (params_.N-1) * abs(params_.incy) + params_.offCY)) * sizeof(*y_) , 0, CL_MEM_READ_WRITE);
226
227 return ( (mobjA_ != NULL) && (mobjx_ != NULL) && (mobjy_ != NULL) ) ? 0 : -1;
228 }
229
230 template <typename ElemType> nano_time_t
etalonPerfSingle(void)231 GercPerformanceTest<ElemType>::etalonPerfSingle(void)
232 {
233 nano_time_t time = 0;
234 clblasOrder order;
235 size_t lda;
236 //int fIncx, fIncy;
237
238 #ifndef PERF_TEST_WITH_ROW_MAJOR
239 if (params_.order == clblasRowMajor) {
240 cerr << "Row major order is not allowed" << endl;
241 return NANOTIME_ERR;
242 }
243 #endif
244
245 order = params_.order;
246 lda = params_.lda;
247
248 #ifdef PERF_TEST_WITH_ACML
249
250 clblasOrder fOrder;
251 size_t fN, fM;
252 size_t fOffx, fOffy;
253 int fIncx, fIncy;
254 ElemType *fX, *fY;
255 fOrder = params_.order;
256 fM = params_.M;
257 fN = params_.N;
258 fIncx = params_.incx;
259 fIncy = params_.incy;
260 fX = x_;
261 fY = y_;
262 fOffx = params_.offBX;
263 fOffy = params_.offCY;
264
265 if (fOrder != clblasColumnMajor) {
266 fOrder = clblasColumnMajor;
267
268 doConjugate( (y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 );
269 fM = params_.N;
270 fN = params_.M;
271 fX = y_;
272 fY = x_;
273 fIncx = params_.incy;
274 fIncy = params_.incx;
275 fOffx = params_.offCY;
276 fOffy = params_.offBX;
277 // Note this according to the Legacy guide
278 time = getCurrentTime();
279 clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy, A_, params_.offa, params_.lda);
280 }
281 else{
282 time = getCurrentTime();
283 clMath::blas::gerc(order, fM, fN, alpha_, fX, fOffx, params_.incx, fY, fOffy, params_.incy, A_, params_.offa, lda);
284 }
285 time = getCurrentTime() - time;
286
287 #endif // PERF_TEST_WITH_ACML<F2>
288
289 return time;
290 }
291
292
293 template <typename ElemType> nano_time_t
clblasPerfSingle(void)294 GercPerformanceTest<ElemType>::clblasPerfSingle(void)
295 {
296 nano_time_t time;
297 cl_event event;
298 cl_int status;
299 cl_command_queue queue = base_->commandQueues()[0];
300
301 status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
302 (lengthA + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event);
303 if (status != CL_SUCCESS) {
304 cerr << "Matrix A buffer object enqueuing error, status = " <<
305 status << endl;
306
307 return NANOTIME_ERR;
308 }
309
310 status = clWaitForEvents(1, &event);
311 if (status != CL_SUCCESS) {
312 cout << "Wait on event failed, status = " <<
313 status << endl;
314
315 return NANOTIME_ERR;
316 }
317
318 event = NULL;
319 time = getCurrentTime();
320
321 #define TIMING
322 #ifdef TIMING
323 clFinish( queue);
324
325 int iter = 20;
326 for ( int i = 1; i <= iter; i++)
327 {
328 #endif
329
330 status = (cl_int)clMath::clblas::gerc(params_.order, params_.M, params_.N, alpha_,
331 mobjx_, params_.offBX, params_.incx, mobjy_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1,
332 &queue, 0, NULL, &event);
333 if (status != CL_SUCCESS) {
334 cerr << "The CLBLAS GERC function failed, status = " <<
335 status << endl;
336
337 return NANOTIME_ERR;
338 }
339 #ifdef TIMING
340 } // iter loop
341 clFinish( queue);
342 time = getCurrentTime() - time;
343 time /= iter;
344 #else
345
346 status = flushAll(1, &queue);
347 if (status != CL_SUCCESS) {
348 cerr << "clFlush() failed, status = " << status << endl;
349 return NANOTIME_ERR;
350 }
351
352 time = getCurrentTime();
353 status = waitForSuccessfulFinish(1, &queue, &event);
354 if (status == CL_SUCCESS) {
355 time = getCurrentTime() - time;
356 }
357 else {
358 cerr << "Waiting for completion of commands to the queue failed, "
359 "status = " << status << endl;
360 time = NANOTIME_ERR;
361 }
362 #endif
363 return time;
364 }
365
366 } // namespace clMath
367
368
// Performance test of GERC on FloatComplex data: fetches the parameters of
// the current parameterized instance and runs it as FN_CGERC.
TEST_P(GERC, cgerc)
{
    TestParams params;

    getParams(&params);
    GercPerformanceTest<FloatComplex>::runInstance(FN_CGERC, &params);
}
376
377
// Performance test of GERC on DoubleComplex data: fetches the parameters of
// the current parameterized instance and runs it as FN_ZGERC. runInstance()
// skips the case when the device lacks double precision support.
TEST_P(GERC, zgerc)
{
    TestParams params;

    getParams(&params);
    GercPerformanceTest<DoubleComplex>::runInstance(FN_ZGERC, &params);
}
385