1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 #include <stdlib.h>             // srand()
18 #include <string.h>             // memcpy()
19 #include <gtest/gtest.h>
20 #include <clBLAS.h>
21 
22 #include <common.h>
23 #include <clBLAS-wrapper.h>
24 #include <BlasBase.h>
25 #include <hpr.h>
26 #include <blas-random.h>
27 
28 #ifdef PERF_TEST_WITH_ACML
29 #include <blas-internal.h>
30 #include <blas-wrapper.h>
31 #endif
32 
33 #include "PerformanceTest.h"
34 
/*
 * NOTE: the operation factor is the total number of multiply
 *       and add operations performed per elementary operation
 *       involving 2 matrix elements
 */
40 
41 using namespace std;
42 using namespace clMath;
43 
/*
 * Validates the integer result code of a performance run.
 *
 * A negative value is treated as fatal (ASSERT_GE aborts the current test
 * function); a non-zero, non-negative value is reported as a non-fatal
 * failure (EXPECT_EQ) with the "OpenCL version is slower" message.
 * Wrapped in do { } while (0) so it can be used as a single statement.
 */
#define CHECK_RESULT(ret)                                                   \
do {                                                                        \
    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "      \
                         "perform an OpenCL request!" << endl;              \
    EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" <<      \
                         endl;                                              \
} while (0)
51 
52 namespace clMath {
53 
54 template <typename ElemType> class HprPerformanceTest : public PerformanceTest
55 {
56 public:
57     virtual ~HprPerformanceTest();
58 
59     virtual int prepare(void);
60     virtual nano_time_t etalonPerfSingle(void);
61     virtual nano_time_t clblasPerfSingle(void);
62 
runInstance(BlasFunction fn,TestParams * params)63     static void runInstance(BlasFunction fn, TestParams *params)
64     {
65         HprPerformanceTest<ElemType> perfCase(fn, params);
66         int ret = 0;
67         int opFactor;
68         BlasBase *base;
69 
70         base = clMath::BlasBase::getInstance();
71 
72         opFactor = 1;
73 
74         if ((fn == FN_ZHPR) &&
75             !base->isDevSupportDoublePrecision()) {
76 
77             std::cerr << ">> WARNING: The target device doesn't support native "
78                          "double precision floating point arithmetic" <<
79                          std::endl << ">> Test skipped" << std::endl;
80             return;
81         }
82 
83         if (!perfCase.areResourcesSufficient(params)) {
84             std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
85                         std::endl;
86         }
87         else {
88             ret = perfCase.run(opFactor);
89         }
90 
91         ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
92                              "perform an OpenCL request!" << endl;
93         EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
94     }
95 
96 private:
97     HprPerformanceTest(BlasFunction fn, TestParams *params);
98 
99     bool areResourcesSufficient(TestParams *params);
100 
101     TestParams params_;
102     ElemType alpha_;
103     ElemType *AP_;
104     ElemType *X_;
105     ElemType *backAP_;
106     cl_mem mobjAP_;
107     cl_mem mobjX_;
108     ::clMath::BlasBase *base_;
109 };
110 
111 template <typename ElemType>
HprPerformanceTest(BlasFunction fn,TestParams * params)112 HprPerformanceTest<ElemType>::HprPerformanceTest(
113     BlasFunction fn,
114     TestParams *params) : PerformanceTest(fn, (problem_size_t)((((params->N * params->N) + params->N) * 2 ) * sizeof(ElemType))),
115                           params_(*params), mobjAP_(NULL), mobjX_(NULL)
116 {
117     AP_ = new ElemType[( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa];
118     X_ = new ElemType[ 1 + (params_.N-1) * abs(params_.incx)  + params_.offBX];
119     backAP_ = new ElemType[( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa];
120 
121     base_ = ::clMath::BlasBase::getInstance();
122 }
123 
124 template <typename ElemType>
~HprPerformanceTest()125 HprPerformanceTest<ElemType>::~HprPerformanceTest()
126 {
127     if(AP_ != NULL)
128     {
129         delete[] AP_;
130     }
131 	if(backAP_ != NULL)
132 	{
133 		delete[] backAP_;
134 	}
135 	if(X_ != NULL)
136 	{
137         delete[] X_;
138 	}
139 
140 	if(mobjX_ != NULL) {
141 		clReleaseMemObject(mobjX_);
142     }
143 	if(mobjAP_ != NULL) {
144 		clReleaseMemObject(mobjAP_);
145 	}
146 }
147 
148 /*
149  * Check if available OpenCL resources are sufficient to
150  * run the test case
151  */
152 template <typename ElemType> bool
areResourcesSufficient(TestParams * params)153 HprPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
154 {
155     clMath::BlasBase *base;
156     size_t gmemSize, allocSize;
157     size_t n = params->N;
158 
159 	if((AP_ == NULL) || (backAP_ == NULL) || (X_ == NULL))
160 	{
161         return 0;
162 	}
163 
164     base = clMath::BlasBase::getInstance();
165     gmemSize = (size_t)base->availGlobalMemSize(0);
166     allocSize = (size_t)base->maxMemAllocSize();
167 
168     bool suff = ( sizeof(ElemType) *(( n*( n + 1 ) )/2 )< allocSize ) && ((1 + (n-1)*abs(params->incx))*sizeof(ElemType) < allocSize); //for individual allocations
169     suff = suff && (((( (n*( n + 1 ) )/2 )  + (1 + (n-1)*abs(params->incx))*2)*sizeof(ElemType)) < gmemSize) ; //for total global allocations
170 
171     return suff ;
172 }
173 
174 template <typename ElemType> int
prepare(void)175 HprPerformanceTest<ElemType>::prepare(void)
176 {
177 	size_t lenX = 1 + (params_.N-1) * abs(params_.incx);
178     alpha_ = convertMultiplier<ElemType>(params_.alpha);
179 	randomHerMatrices( params_.order, params_.uplo, params_.N, &alpha_, (AP_ + params_.offa), 0, (X_ + params_.offBX), params_.incx );
180 	memcpy(backAP_, AP_, ((( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa)* sizeof(ElemType)));
181 
182     mobjAP_ = base_->createEnqueueBuffer(AP_, (( ( params_.N*( params_.N + 1 ) )/2 )+ params_.offa)* sizeof(*AP_), 0, CL_MEM_READ_WRITE);
183     mobjX_ = base_->createEnqueueBuffer(X_, (lenX + params_.offBX )* sizeof(*X_), 0, CL_MEM_READ_ONLY);
184 
185     return ( (mobjAP_ != NULL) &&  (mobjX_ != NULL) ) ? 0 : -1;
186 }
187 
188 template <typename ElemType> nano_time_t
etalonPerfSingle(void)189 HprPerformanceTest<ElemType>::etalonPerfSingle(void)
190 {
191     nano_time_t time = 0;
192 	clblasOrder order;
193 //	size_t lda;
194 
195 #ifndef PERF_TEST_WITH_ROW_MAJOR
196     if (params_.order == clblasRowMajor) {
197         cerr << "Row major order is not allowed" << endl;
198         return NANOTIME_ERR;
199     }
200 #endif
201 
202     order = params_.order;
203 
204 
205 #ifdef PERF_TEST_WITH_ACML
206 
207     clblasOrder fOrder;
208     clblasUplo fUplo;
209     fOrder = params_.order;
210 	fUplo = params_.uplo;
211 
212 	if (order != clblasColumnMajor)
213     {
214 		doConjugate( (X_ + params_.offBX), (1 + (params_.N-1) * abs(params_.incx)), 1, 1 );
215         fOrder = clblasColumnMajor;
216         fUplo = (fUplo == clblasLower)? clblasUpper : clblasLower;
217     }
218 
219    	time = getCurrentTime();
220    	clMath::blas::hpr(fOrder, fUplo, params_.N, CREAL(alpha_), X_, params_.offBX, params_.incx, AP_, params_.offa);
221 	time = getCurrentTime() - time;
222 
223 #endif  // PERF_TEST_WITH_ACML
224 
225     return time;
226 }
227 
228 
229 template <typename ElemType> nano_time_t
clblasPerfSingle(void)230 HprPerformanceTest<ElemType>::clblasPerfSingle(void)
231 {
232     nano_time_t time;
233     cl_event event;
234     cl_int status;
235     cl_command_queue queue = base_->commandQueues()[0];
236 
237     status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0,
238                                   ((( params_.N*( params_.N + 1 ) )/2 ) + params_.offa) *
239                                   sizeof(ElemType), backAP_, 0, NULL, &event);
240     if (status != CL_SUCCESS) {
241         cerr << "Matrix A buffer object enqueuing error, status = " <<
242                  status << endl;
243 
244         return NANOTIME_ERR;
245     }
246 
247     status = clWaitForEvents(1, &event);
248     if (status != CL_SUCCESS) {
249         cout << "Wait on event failed, status = " <<
250                 status << endl;
251 
252         return NANOTIME_ERR;
253     }
254 
255     event = NULL;
256 
257 #define TIMING
258 #ifdef TIMING
259     clFinish( queue);
260     time = getCurrentTime();
261 
262     int iter = 20;
263     for ( int i = 1; i <= iter; i++)
264     {
265 #endif
266     status = (cl_int)clMath::clblas::hpr(params_.order, params_.uplo, params_.N, CREAL(alpha_), mobjX_, params_.offBX, params_.incx,
267 				mobjAP_, params_.offa, 1, &queue, 0, NULL, &event);
268 
269     if (status != CL_SUCCESS) {
270         cerr << "The CLBLAS HPR function failed, status = " <<
271                 status << endl;
272 
273         return NANOTIME_ERR;
274     }
275 
276 #ifdef TIMING
277     } // iter loop
278     clFinish( queue);
279     time = getCurrentTime() - time;
280     time /= iter;
281 #else
282     status = flushAll(1, &queue);
283     if (status != CL_SUCCESS) {
284         cerr << "clFlush() failed, status = " << status << endl;
285         return NANOTIME_ERR;
286     }
287 
288     time = getCurrentTime();
289     status = waitForSuccessfulFinish(1, &queue, &event);
290     if (status == CL_SUCCESS) {
291         time = getCurrentTime() - time;
292     }
293     else {
294         cerr << "Waiting for completion of commands to the queue failed, "
295                 "status = " << status << endl;
296         time = NANOTIME_ERR;
297     }
298 #endif
299 
300     return time;
301 }
302 
303 } // namespace clMath
304 
// Performance case for FN_CHPR, run with FloatComplex elements.
TEST_P(HPR, chpr)
{
    TestParams caseParams;
    getParams(&caseParams);

    HprPerformanceTest<FloatComplex>::runInstance(FN_CHPR, &caseParams);
}
312 
// Performance case for FN_ZHPR, run with DoubleComplex elements.
TEST_P(HPR, zhpr)
{
    TestParams caseParams;
    getParams(&caseParams);

    HprPerformanceTest<DoubleComplex>::runInstance(FN_ZHPR, &caseParams);
}
320