1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 /*
19 * GER performance test cases
20 */
21
22 #include <stdlib.h> // srand()
23 #include <string.h> // memcpy()
24 #include <gtest/gtest.h>
25 #include <clBLAS.h>
26
27 #include <common.h>
28 #include <clBLAS-wrapper.h>
29 #include <BlasBase.h>
30 #include <ger.h>
31 #include <blas-random.h>
32
33 #ifdef PERF_TEST_WITH_ACML
34 #include <blas-internal.h>
35 #include <blas-wrapper.h>
36 #endif
37
38 #include "PerformanceTest.h"
39
40 /*
41 * NOTE: operation factor means overall number
42 * of multiply and add per each operation involving
43 * 2 matrix elements
44 */
45
46 using namespace std;
47 using namespace clMath;
48
/*
 * Verify the integer status of a performance run.
 *
 * A negative value raises a fatal gtest failure and leaves the enclosing
 * function; any other non-zero value is recorded as a non-fatal failure.
 * NOTE: `ret` is expanded twice, so pass a plain variable.
 */
#define CHECK_RESULT(ret)                                                   \
    do {                                                                    \
        ASSERT_GE(ret, 0)                                                   \
            << "Fatal error: can not allocate resources or "                \
               "perform an OpenCL request!"                                 \
            << std::endl;                                                   \
        EXPECT_EQ(0, ret)                                                   \
            << "The OpenCL version is slower in the case"                   \
            << std::endl;                                                   \
    } while (0)
56
57 namespace clMath {
58
59 template <typename ElemType> class GerPerformanceTest : public PerformanceTest
60 {
61 public:
62 virtual ~GerPerformanceTest();
63
64 virtual int prepare(void);
65 virtual nano_time_t etalonPerfSingle(void);
66 virtual nano_time_t clblasPerfSingle(void);
67
runInstance(BlasFunction fn,TestParams * params)68 static void runInstance(BlasFunction fn, TestParams *params)
69 {
70 GerPerformanceTest<ElemType> perfCase(fn, params);
71 int ret = 0;
72 int opFactor;
73 BlasBase *base;
74
75 base = clMath::BlasBase::getInstance();
76
77 opFactor =1;
78
79 if ((fn == FN_DGER || fn == FN_ZGERU) &&
80 !base->isDevSupportDoublePrecision()) {
81
82 std::cerr << ">> WARNING: The target device doesn't support native "
83 "double precision floating point arithmetic" <<
84 std::endl << ">> Test skipped" << std::endl;
85 return;
86 }
87
88 if (!perfCase.areResourcesSufficient(params)) {
89 std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
90 std::endl;
91 return;
92 }
93 else {
94 ret = perfCase.run(opFactor);
95 }
96
97 ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
98 "perform an OpenCL request!" << endl;
99 EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
100 }
101
102 private:
103 GerPerformanceTest(BlasFunction fn, TestParams *params);
104
105 bool areResourcesSufficient(TestParams *params);
106
107 TestParams params_;
108 ElemType alpha_;
109 ElemType *A_;
110 ElemType *backA_;
111 ElemType *x_;
112 ElemType *y_;
113 cl_mem mobjA_;
114 cl_mem mobjx_;
115 size_t lengthA;
116 cl_mem mobjy_;
117 ::clMath::BlasBase *base_;
118 };
119
120 template <typename ElemType>
GerPerformanceTest(BlasFunction fn,TestParams * params)121 GerPerformanceTest<ElemType>::GerPerformanceTest(
122 BlasFunction fn,
123 TestParams *params) : PerformanceTest(fn,(problem_size_t) ( ( (3 * params->M * params->N) + params->M ) * sizeof(ElemType) ) ), params_(*params), mobjA_(NULL), mobjx_(NULL), mobjy_(NULL)
124 {
125
126 if( params_.order == clblasColumnMajor )
127 lengthA = params_.N * params_.lda;
128 else
129 lengthA = params_.M * params_.lda;
130
131 A_ = new ElemType[lengthA + params_.offa];
132 backA_ = new ElemType[lengthA + params_.offa];
133 x_ = new ElemType[(1 + (params->M - 1) * abs(params_.incx))+ params_.offBX];
134 y_ = new ElemType[(1 + (params->N - 1) * abs(params_.incy)) + params_.offCY] ;
135
136 base_ = ::clMath::BlasBase::getInstance();
137 }
138
139 template <typename ElemType>
~GerPerformanceTest()140 GerPerformanceTest<ElemType>::~GerPerformanceTest()
141 {
142 if(A_ != NULL)
143 {
144 delete[] A_;
145 }
146 if(x_ != NULL)
147 {
148 delete[] x_;
149 }
150 if(y_ != NULL)
151 {
152 delete[] y_;
153 }
154 if(backA_ != NULL)
155 {
156 delete[] backA_;
157 }
158
159 if( mobjy_ != NULL )
160 clReleaseMemObject(mobjy_);
161 if( mobjx_ != NULL )
162 clReleaseMemObject(mobjx_);
163 if( mobjA_ != NULL )
164 clReleaseMemObject(mobjA_);
165 }
166
167 /*
168 * Check if available OpenCL resources are sufficient to
169 * run the test case
170 */
171 template <typename ElemType> bool
areResourcesSufficient(TestParams * params)172 GerPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
173 {
174 clMath::BlasBase *base;
175 size_t gmemSize, allocSize;
176 bool ret;
177 size_t m = params->M, n = params->N;
178
179 if((A_ == NULL) || (backA_ == NULL) || (x_ == NULL) || (y_ == NULL))
180 {
181 return 0;
182 }
183
184 base = clMath::BlasBase::getInstance();
185 gmemSize = (size_t)base->availGlobalMemSize( 0 );
186 allocSize = (size_t)base->maxMemAllocSize();
187
188 ret = std::max(m, n) * params_.lda * sizeof(ElemType) < allocSize;
189 ret = ret && ( ((1 + (params_.M-1)*abs(params_.incx)))* sizeof(ElemType) < allocSize);
190 ret = ret && ( ((1 + (params_.N-1)*abs(params_.incy))) * sizeof(ElemType) < allocSize);
191
192 ret = ret && (((std::max(m, n) * params_.lda) + ((1 + (params_.M-1)*abs(params_.incx))) + ((1 + (params_.N-1)*abs(params_.incy)))) < gmemSize);
193
194 return ret;
195 }
196
197 template <typename ElemType> int
prepare(void)198 GerPerformanceTest<ElemType>::prepare(void)
199 {
200 bool useAlpha = base_->useAlpha();
201
202 if (useAlpha) {
203 alpha_ = convertMultiplier<ElemType>(params_.alpha);
204 }
205
206
207 int creationFlags = 0;
208 creationFlags = creationFlags | RANDOM_INIT;
209
210 creationFlags = ( (params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
211 BlasRoutineID BlasFn = CLBLAS_GER;
212
213 populate( A_+ params_.offa, params_.M, params_.N, params_.lda, BlasFn, creationFlags);
214 populate( x_, (1 + (params_.M-1) * abs(params_.incx) + params_.offBX), 1, (1 + (params_.M-1) * abs(params_.incx) + params_.offBX), BlasFn, creationFlags );
215 populate( y_, (1 + (params_.N-1) * abs(params_.incy) + params_.offCY), 1, (1 + (params_.N-1) * abs(params_.incy) + params_.offCY), BlasFn, creationFlags );
216
217
218 memcpy(backA_, A_, (lengthA + params_.offa)* sizeof(ElemType));
219
220 mobjA_ = base_->createEnqueueBuffer(A_, (lengthA + params_.offa) * sizeof(*A_), 0, CL_MEM_READ_WRITE);
221 mobjx_ = base_->createEnqueueBuffer(x_, ( (1 + (params_.M-1) * abs(params_.incx) + params_.offBX)) * sizeof(*x_), 0, CL_MEM_READ_WRITE);
222 mobjy_ = base_->createEnqueueBuffer(y_,( (1 + (params_.N-1) * abs(params_.incy) + params_.offCY)) * sizeof(*y_) , 0, CL_MEM_READ_WRITE);
223
224 return ( (mobjA_ != NULL) && (mobjx_ != NULL) && (mobjy_ != NULL) ) ? 0 : -1;
225 }
226
227 template <typename ElemType> nano_time_t
etalonPerfSingle(void)228 GerPerformanceTest<ElemType>::etalonPerfSingle(void)
229 {
230 nano_time_t time = 0;
231 clblasOrder order;
232 size_t lda, fN, fM;
233
234
235 #ifndef PERF_TEST_WITH_ROW_MAJOR
236 if (params_.order == clblasRowMajor) {
237 cerr << "Row major order is not allowed" << endl;
238 return NANOTIME_ERR;
239 }
240 #endif
241
242 order = params_.order;
243 lda = params_.lda;
244 fM = params_.M;
245 fN = params_.N;
246
247 #ifdef PERF_TEST_WITH_ACML
248
249 clblasOrder fOrder;
250 size_t fOffx, fOffy;
251 int fIncx, fIncy;
252 ElemType *fX, *fY;
253 fOrder = params_.order;
254 fM = params_.M;
255 fN = params_.N;
256 fIncx = params_.incx;
257 fIncy = params_.incy;
258 fX = x_;
259 fY = y_;
260 fOffx = params_.offBX;
261 fOffy = params_.offCY;
262
263 if (fOrder != clblasColumnMajor) {
264 fOrder = clblasColumnMajor;
265 fM = params_.N;
266 fN = params_.M;
267 fX = y_;
268 fY = x_;
269 fIncx = params_.incy;
270 fIncy = params_.incx;
271 fOffx = params_.offCY;
272 fOffy = params_.offBX;
273 }
274 time = getCurrentTime();
275 clMath::blas::ger(order, fM, fN, alpha_, fX, fOffx, fIncx, fY, fOffy, fIncy, A_, params_.offa, lda);
276 time = getCurrentTime() - time;
277
278 #endif // PERF_TEST_WITH_ACML
279
280 return time;
281 }
282
283
284 template <typename ElemType> nano_time_t
clblasPerfSingle(void)285 GerPerformanceTest<ElemType>::clblasPerfSingle(void)
286 {
287 nano_time_t time;
288 cl_event event;
289 cl_int status;
290 cl_command_queue queue = base_->commandQueues()[0];
291
292 status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
293 (lengthA + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event);
294 if (status != CL_SUCCESS) {
295 cerr << "Matrix A buffer object enqueuing error, status = " <<
296 status << endl;
297
298 return NANOTIME_ERR;
299 }
300
301 status = clWaitForEvents(1, &event);
302 if (status != CL_SUCCESS) {
303 cout << "Wait on event failed, status = " <<
304 status << endl;
305
306 return NANOTIME_ERR;
307 }
308
309 event = NULL;
310 time = getCurrentTime();
311
312 #define TIMING
313 #ifdef TIMING
314 clFinish( queue);
315
316 int iter = 20;
317 for ( int i = 1; i <= iter; i++)
318 {
319 #endif
320
321 status = (cl_int)clMath::clblas::ger(params_.order, params_.M, params_.N, alpha_,
322 mobjx_, params_.offBX, params_.incx, mobjy_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1,
323 &queue, 0, NULL, &event);
324 if (status != CL_SUCCESS) {
325 cerr << "The CLBLAS GER function failed, status = " <<
326 status << endl;
327
328 return NANOTIME_ERR;
329 }
330 #ifdef TIMING
331 } // iter loop
332 clFinish( queue);
333 time = getCurrentTime() - time;
334 time /= iter;
335 #else
336
337 status = flushAll(1, &queue);
338 if (status != CL_SUCCESS) {
339 cerr << "clFlush() failed, status = " << status << endl;
340 return NANOTIME_ERR;
341 }
342
343 time = getCurrentTime();
344 status = waitForSuccessfulFinish(1, &queue, &event);
345 if (status == CL_SUCCESS) {
346 time = getCurrentTime() - time;
347 }
348 else {
349 cerr << "Waiting for completion of commands to the queue failed, "
350 "status = " << status << endl;
351 time = NANOTIME_ERR;
352 }
353 #endif
354 return time;
355 }
356
357 } // namespace clMath
358
359 // ger performance test
360
// Single precision real GER performance test.
TEST_P(GER, sger)
{
    TestParams params;

    getParams(&params);
    GerPerformanceTest<float>::runInstance(FN_SGER, &params);
}
368
369
// Double precision real GER performance test.
TEST_P(GER, dger)
{
    TestParams params;

    getParams(&params);
    GerPerformanceTest<double>::runInstance(FN_DGER, &params);
}
377
// Single precision complex GERU performance test.
TEST_P(GER, cgeru)
{
    TestParams params;

    getParams(&params);
    GerPerformanceTest<FloatComplex>::runInstance(FN_CGERU, &params);
}
385
386
// Double precision complex GERU performance test.
TEST_P(GER, zgeru)
{
    TestParams params;

    getParams(&params);
    GerPerformanceTest<DoubleComplex>::runInstance(FN_ZGERU, &params);
}
394