1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
/*
 * Gerc performance test cases
 */
21 
#include <stdlib.h>             // srand()
#include <string.h>             // memcpy()
#include <algorithm>            // std::max()
#include <new>                  // std::nothrow
#include <gtest/gtest.h>
#include <clBLAS.h>

#include <common.h>
#include <clBLAS-wrapper.h>
#include <BlasBase.h>
#include <gerc.h>
#include <blas-random.h>

#ifdef PERF_TEST_WITH_ACML
#include <blas-internal.h>
#include <blas-wrapper.h>
#endif

#include "PerformanceTest.h"
39 
/*
 * NOTE: the operation factor is the overall number of
 *       multiplications and additions performed per
 *       operation involving 2 matrix elements
 */
45 
46 using namespace std;
47 using namespace clMath;
48 
/*
 * Evaluate the status code of a performance run:
 *   - a negative value trips ASSERT_GE (fatal for the current test);
 *   - any other non-zero value trips EXPECT_EQ (non-fatal failure).
 * Must be expanded inside a function returning void, as required by the
 * gtest ASSERT_* macros.
 */
#define CHECK_RESULT(ret)                                                   \
    do {                                                                    \
        ASSERT_GE(ret, 0)                                                   \
            << "Fatal error: can not allocate resources or "                \
               "perform an OpenCL request!" << endl;                        \
        EXPECT_EQ(0, ret)                                                   \
            << "The OpenCL version is slower in the case" << endl;          \
    } while (0)
56 
57 namespace clMath {
58 
59 template <typename ElemType> class GercPerformanceTest : public PerformanceTest
60 {
61 public:
62     virtual ~GercPerformanceTest();
63 
64     virtual int prepare(void);
65     virtual nano_time_t etalonPerfSingle(void);
66     virtual nano_time_t clblasPerfSingle(void);
67 
runInstance(BlasFunction fn,TestParams * params)68     static void runInstance(BlasFunction fn, TestParams *params)
69     {
70         GercPerformanceTest<ElemType> perfCase(fn, params);
71         int ret = 0;
72         int opFactor;
73         BlasBase *base;
74 
75         base = clMath::BlasBase::getInstance();
76 
77         opFactor =1;
78 
79 		if (fn == FN_ZGERC &&
80             !base->isDevSupportDoublePrecision()) {
81 
82             std::cerr << ">> WARNING: The target device doesn't support native "
83                          "double precision floating point arithmetic" <<
84                          std::endl << ">> Test skipped" << std::endl;
85             return;
86         }
87 
88         if (!perfCase.areResourcesSufficient(params)) {
89             std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
90                         std::endl;
91 			return;
92         }
93         else {
94             ret = perfCase.run(opFactor);
95         }
96 
97         ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
98                              "perform an OpenCL request!" << endl;
99         EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
100     }
101 
102 private:
103     GercPerformanceTest(BlasFunction fn, TestParams *params);
104 
105     bool areResourcesSufficient(TestParams *params);
106 
107     TestParams params_;
108     ElemType alpha_;
109     ElemType *A_;
110     ElemType *backA_;
111     ElemType *x_;
112     ElemType *y_;
113     cl_mem mobjA_;
114     cl_mem mobjx_;
115     cl_mem mobjy_;
116     int lengthA;
117     ::clMath::BlasBase *base_;
118 };
119 
120 template <typename ElemType>
GercPerformanceTest(BlasFunction fn,TestParams * params)121 GercPerformanceTest<ElemType>::GercPerformanceTest(
122     BlasFunction fn,
123     TestParams *params) : PerformanceTest(fn,(problem_size_t) (((2 *  params->M * params->N) +  params->M + params->N ) * sizeof(ElemType) ) ), params_(*params), mobjA_(NULL), mobjx_(NULL), mobjy_(NULL)
124 {
125 	//if( params_.side == clblasLeft )
126           //      ka = params_.M;
127         //else    ka = params_.N;
128 
129 	if( params_.order == clblasColumnMajor )
130 			lengthA = params_.N * params_.lda;
131 		else
132 			lengthA = params_.M * params_.lda;
133 
134     A_ = new ElemType[(lengthA) + params_.offa];
135     backA_ = new ElemType[lengthA+ params_.offa];
136     x_ = new ElemType[(1 + (params->M - 1) * abs(params_.incx))+ params_.offBX];
137     y_ = new ElemType[(1 + (params->N - 1) * abs(params_.incy)) + params_.offCY] ;
138 
139     base_ = ::clMath::BlasBase::getInstance();
140 }
141 
142 template <typename ElemType>
~GercPerformanceTest()143 GercPerformanceTest<ElemType>::~GercPerformanceTest()
144 {
145     if(A_ != NULL)
146     {
147     delete[] A_;
148     }
149 	if(x_ != NULL)
150 	{
151     delete[] x_;
152 	}
153 	if(y_ != NULL)
154 	{
155     delete[] y_;
156 	}
157 	if(backA_ != NULL)
158 	{
159     delete[] backA_;
160 	}
161 
162 	if( mobjy_ != NULL )
163 	    clReleaseMemObject(mobjy_);
164     if( mobjx_ != NULL )
165 		clReleaseMemObject(mobjx_);
166 	if( mobjA_ != NULL )
167 	    clReleaseMemObject(mobjA_);
168 }
169 
170 /*
171  * Check if available OpenCL resources are sufficient to
172  * run the test case
173  */
174 template <typename ElemType> bool
areResourcesSufficient(TestParams * params)175 GercPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
176 {
177     clMath::BlasBase *base;
178     size_t gmemSize, allocSize;
179     bool ret;
180     size_t m = params->M, n = params->N;
181 
182 	if((A_ == NULL) || (backA_ == NULL) || (x_ == NULL) || (y_ == NULL))
183 	{
184         return 0;
185 	}
186 
187     base = clMath::BlasBase::getInstance();
188     gmemSize = (size_t)base->availGlobalMemSize( 0 );
189     allocSize = (size_t)base->maxMemAllocSize();
190 
191     ret = std::max(m, n) * params_.lda * sizeof(ElemType) < allocSize;
192     ret = ret && ( ((1 + (params_.M-1)*abs(params_.incx)))* sizeof(ElemType) < allocSize);
193     ret = ret && ( ((1 + (params_.N-1)*abs(params_.incy))) * sizeof(ElemType) < allocSize);
194 
195     ret = ret && (((std::max(m, n) * params_.lda) + ((1 + (params_.M-1)*abs(params_.incx))) +  ((1 + (params_.N-1)*abs(params_.incy)))) < gmemSize);
196 
197     return ret;
198 }
199 
200 template <typename ElemType> int
prepare(void)201 GercPerformanceTest<ElemType>::prepare(void)
202 {
203     bool useAlpha = base_->useAlpha();
204 
205     if (useAlpha) {
206         alpha_ = convertMultiplier<ElemType>(params_.alpha);
207     }
208 
209 
210     int creationFlags = 0;
211     creationFlags =  creationFlags | RANDOM_INIT;
212 
213     creationFlags = ( (params_.order) == clblasRowMajor)? (creationFlags | ROW_MAJOR_ORDER) : (creationFlags);
214 	BlasRoutineID funcId = CLBLAS_GER;
215 
216 	populate( A_ + params_.offa, params_.M, params_.N, params_.lda, funcId, creationFlags);
217 	populate( x_ , (1 + (params_.M-1) * abs(params_.incx) + params_.offBX),1, (1 + (params_.M-1) * abs(params_.incx) + params_.offBX), funcId, 0 );
218 	populate( y_ , (1 + (params_.N-1) * abs(params_.incy) + params_.offCY),1, (1 + (params_.N-1) * abs(params_.incy) + params_.offCY), funcId, 0 );
219 
220 
221         memcpy(backA_, A_, (lengthA + params_.offa)* sizeof(ElemType));
222 
223 	mobjA_ = base_->createEnqueueBuffer(A_, (lengthA + params_.offa) * sizeof(*A_), 0, CL_MEM_READ_WRITE);
224 	mobjx_ = base_->createEnqueueBuffer(x_, ( (1 + (params_.M-1) * abs(params_.incx) + params_.offBX)) * sizeof(*x_), 0, CL_MEM_READ_WRITE);
225 	mobjy_ = base_->createEnqueueBuffer(y_,( (1 + (params_.N-1) * abs(params_.incy) + params_.offCY)) * sizeof(*y_) , 0, CL_MEM_READ_WRITE);
226 
227      return ( (mobjA_ != NULL) &&  (mobjx_ != NULL) && (mobjy_ != NULL) ) ? 0 : -1;
228 }
229 
230 template <typename ElemType> nano_time_t
etalonPerfSingle(void)231 GercPerformanceTest<ElemType>::etalonPerfSingle(void)
232 {
233     nano_time_t time = 0;
234     clblasOrder order;
235     size_t lda;
236     //int fIncx, fIncy;
237 
238 #ifndef PERF_TEST_WITH_ROW_MAJOR
239     if (params_.order == clblasRowMajor) {
240         cerr << "Row major order is not allowed" << endl;
241         return NANOTIME_ERR;
242     }
243 #endif
244 
245     order = params_.order;
246     lda = params_.lda;
247 
248 #ifdef PERF_TEST_WITH_ACML
249 
250 	 clblasOrder fOrder;
251     size_t fN, fM;
252     size_t fOffx, fOffy;
253     int fIncx, fIncy;
254     ElemType  *fX, *fY;
255     fOrder = params_.order;
256     fM = params_.M;
257     fN = params_.N;
258     fIncx = params_.incx;
259     fIncy = params_.incy;
260     fX = x_;
261     fY = y_;
262     fOffx = params_.offBX;
263     fOffy = params_.offCY;
264 
265     if (fOrder != clblasColumnMajor) {
266            fOrder = clblasColumnMajor;
267 
268 		   doConjugate( (y_ + params_.offCY), (1 + (params_.N-1) * abs(params_.incy)), 1, 1 );
269            fM = params_.N;
270            fN = params_.M;
271            fX = y_;
272            fY = x_;
273            fIncx = params_.incy;
274            fIncy = params_.incx;
275            fOffx = params_.offCY;
276            fOffy = params_.offBX;
277 		   // Note this according to the Legacy guide
278 		   time = getCurrentTime();
279 			clMath::blas::ger(fOrder, fM, fN, alpha_, fX , fOffx, fIncx, fY, fOffy, fIncy,  A_, params_.offa, params_.lda);
280        }
281 	else{
282 		time = getCurrentTime();
283 		clMath::blas::gerc(order, fM, fN, alpha_, fX, fOffx, params_.incx, fY, fOffy, params_.incy,  A_, params_.offa, lda);
284 	}
285     time = getCurrentTime() - time;
286 
287 #endif  // PERF_TEST_WITH_ACML<F2>
288 
289     return time;
290 }
291 
292 
293 template <typename ElemType> nano_time_t
clblasPerfSingle(void)294 GercPerformanceTest<ElemType>::clblasPerfSingle(void)
295 {
296     nano_time_t time;
297     cl_event event;
298     cl_int status;
299     cl_command_queue queue = base_->commandQueues()[0];
300 
301     status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0,
302                                   (lengthA + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event);
303     if (status != CL_SUCCESS) {
304         cerr << "Matrix A buffer object enqueuing error, status = " <<
305                  status << endl;
306 
307         return NANOTIME_ERR;
308     }
309 
310     status = clWaitForEvents(1, &event);
311     if (status != CL_SUCCESS) {
312         cout << "Wait on event failed, status = " <<
313                 status << endl;
314 
315         return NANOTIME_ERR;
316     }
317 
318     event = NULL;
319      time = getCurrentTime();
320 
321 #define TIMING
322 #ifdef TIMING
323         clFinish( queue);
324 
325         int iter = 20;
326         for ( int i = 1; i <= iter; i++)
327         {
328 #endif
329 
330     status = (cl_int)clMath::clblas::gerc(params_.order, params_.M, params_.N, alpha_,
331          mobjx_, params_.offBX, params_.incx, mobjy_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1,
332         &queue, 0, NULL, &event);
333     if (status != CL_SUCCESS) {
334         cerr << "The CLBLAS GERC function failed, status = " <<
335                 status << endl;
336 
337         return NANOTIME_ERR;
338     }
339 #ifdef TIMING
340         } // iter loop
341         clFinish( queue);
342     time = getCurrentTime() - time;
343         time /= iter;
344 #else
345 
346     status = flushAll(1, &queue);
347     if (status != CL_SUCCESS) {
348         cerr << "clFlush() failed, status = " << status << endl;
349         return NANOTIME_ERR;
350     }
351 
352     time = getCurrentTime();
353     status = waitForSuccessfulFinish(1, &queue, &event);
354     if (status == CL_SUCCESS) {
355         time = getCurrentTime() - time;
356     }
357     else {
358         cerr << "Waiting for completion of commands to the queue failed, "
359                 "status = " << status << endl;
360         time = NANOTIME_ERR;
361     }
362 #endif
363     return time;
364 }
365 
366 } // namespace clMath
367 
368 
// Single precision complex GERC performance case.
TEST_P(GERC, cgerc)
{
    TestParams testParams;
    getParams(&testParams);

    GercPerformanceTest<FloatComplex>::runInstance(FN_CGERC, &testParams);
}
376 
377 
// Double precision complex GERC performance case.
TEST_P(GERC, zgerc)
{
    TestParams testParams;
    getParams(&testParams);

    GercPerformanceTest<DoubleComplex>::runInstance(FN_ZGERC, &testParams);
}
385