1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 #include <stdlib.h>             // srand()
18 #include <string.h>             // memcpy()
19 #include <gtest/gtest.h>
20 #include <clBLAS.h>
21 
22 #include <common.h>
23 #include <clBLAS-wrapper.h>
24 #include <BlasBase.h>
25 #include <dot.h>
26 #include <blas-random.h>
27 
28 #ifdef PERF_TEST_WITH_ACML
29 #include <blas-internal.h>
30 #include <blas-wrapper.h>
31 #endif
32 
33 #include "PerformanceTest.h"
34 
35 using namespace std;
36 using namespace clMath;
37 
/*
 * Reports the result code of a performance run through gtest:
 *   - a negative value trips ASSERT_GE and is reported as a fatal error;
 *   - any other non-zero value trips EXPECT_EQ and is reported as the
 *     OpenCL version being slower.
 * Note that the argument is expanded twice.
 */
#define CHECK_RESULT(ret)                                                   \
    do {                                                                    \
        ASSERT_GE(ret, 0)                                                   \
            << "Fatal error: can not allocate resources or "                \
               "perform an OpenCL request!" << endl;                        \
        EXPECT_EQ(0, ret)                                                   \
            << "The OpenCL version is slower in the case" << endl;          \
    } while (0)
45 
46 namespace clMath {
47 
48 template <typename ElemType> class DotPerformanceTest : public PerformanceTest
49 {
50 public:
51     virtual ~DotPerformanceTest();
52 
53     virtual int prepare(void);
54     virtual nano_time_t etalonPerfSingle(void);
55     virtual nano_time_t clblasPerfSingle(void);
56 
runInstance(BlasFunction fn,TestParams * params)57     static void runInstance(BlasFunction fn, TestParams *params)
58     {
59         DotPerformanceTest<ElemType> perfCase(fn, params);
60         int ret = 0;
61         int opFactor;
62         BlasBase *base;
63 
64         base = clMath::BlasBase::getInstance();
65 
66         opFactor =1;
67 
68         if (((fn == FN_DDOT) || (fn == FN_ZDOTU)) &&
69             !base->isDevSupportDoublePrecision()) {
70 
71             std::cerr << ">> WARNING: The target device doesn't support native "
72                          "double precision floating point arithmetic" <<
73                          std::endl << ">> Test skipped" << std::endl;
74             return;
75         }
76 
77         if (!perfCase.areResourcesSufficient(params)) {
78             std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" <<
79                         std::endl;
80 			return;
81         }
82         else {
83             ret = perfCase.run(opFactor);
84         }
85 
86         ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
87                              "perform an OpenCL request!" << endl;
88         EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
89     }
90 
91 private:
92     DotPerformanceTest(BlasFunction fn, TestParams *params);
93 
94     bool areResourcesSufficient(TestParams *params);
95 
96     TestParams params_;
97     ElemType *blasX_;
98     ElemType *blasY_;
99     cl_mem mobjX_;
100     cl_mem mobjY_;
101 	cl_mem mobjDP_;
102 	cl_mem scratchBuff;
103     size_t  lengthX;
104     size_t  lengthY;
105     ::clMath::BlasBase *base_;
106 };
107 
108 template <typename ElemType>
DotPerformanceTest(BlasFunction fn,TestParams * params)109 DotPerformanceTest<ElemType>::DotPerformanceTest(
110     BlasFunction fn,
111     TestParams *params) : PerformanceTest(fn,(problem_size_t) ( (2 * params->N)  * sizeof(ElemType) ) ), params_(*params), mobjX_(NULL), mobjY_(NULL),mobjDP_(NULL)
112 {
113 
114     blasX_ = NULL;
115     blasY_ = NULL;
116 	mobjX_= mobjY_ = mobjDP_= scratchBuff = NULL;
117     lengthX = 1 + (params->N - 1) * abs(params_.incx);
118     lengthY = 1 + (params->N - 1) * abs(params_.incy);
119 
120     try
121     {
122         blasX_ = new ElemType[lengthX + params_.offBX];
123         blasY_ = new ElemType[lengthY + params_.offCY];
124     }
125     catch(bad_alloc& ba) {
126         blasX_ = blasY_ = NULL;     // areResourcesSufficient() will handle the rest and return
127         mobjX_= mobjY_ = mobjDP_= scratchBuff = NULL;
128         ba = ba;
129     }
130 
131     base_ = ::clMath::BlasBase::getInstance();
132 }
133 
134 template <typename ElemType>
~DotPerformanceTest()135 DotPerformanceTest<ElemType>::~DotPerformanceTest()
136 {
137 	if(blasX_ != NULL)
138     {
139         delete[] blasX_;
140 	}
141 	if(blasY_ != NULL)
142     {
143         delete[] blasY_;
144 	}
145     if( mobjX_ != NULL )
146     {
147 		clReleaseMemObject(mobjX_);
148     }
149     if( mobjY_ != NULL )
150     {
151 		clReleaseMemObject(mobjY_);
152     }
153 	if( mobjDP_ != NULL )
154     {
155         clReleaseMemObject(mobjDP_);
156     }
157 	if( scratchBuff!= NULL )
158     {
159         clReleaseMemObject(scratchBuff);
160     }
161 
162 }
163 
164 /*
165  * Check if available OpenCL resources are sufficient to
166  * run the test case
167  */
168 template <typename ElemType> bool
areResourcesSufficient(TestParams * params)169 DotPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
170 {
171     clMath::BlasBase *base;
172     size_t gmemSize, allocSize;
173     bool ret;
174     size_t sizeX, sizeY, sizeDP;
175 
176 	if((blasX_ == NULL) || (blasY_ == NULL) ) {
177 		return 0;
178 	}
179 
180     base = clMath::BlasBase::getInstance();
181     gmemSize = (size_t)base->availGlobalMemSize( 0 );
182     allocSize = (size_t)base->maxMemAllocSize();
183     sizeX = (lengthX + params->offBX) * sizeof(ElemType);
184     sizeY = (lengthY + params->offCY) * sizeof(ElemType);
185 	sizeDP = (1 + params->offa) * sizeof(ElemType);
186 
187     ret = ((sizeX < allocSize) && (sizeY < allocSize) && (sizeDP < allocSize));
188     ret = (ret && ((sizeX + sizeY + sizeDP) < gmemSize));
189 
190     return ret;
191 }
192 
193 template <typename ElemType> int
prepare(void)194 DotPerformanceTest<ElemType>::prepare(void)
195 {
196 
197     randomVectors(params_.N, (blasX_ + params_.offBX), params_.incx, (blasY_ + params_.offCY), params_.incy, true);
198 
199 	mobjX_ = base_->createEnqueueBuffer(blasX_, ((lengthX + params_.offBX) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
200 	mobjY_ = base_->createEnqueueBuffer(blasY_, ((lengthY + params_.offCY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
201 	mobjDP_ = base_->createEnqueueBuffer(NULL, ((1 + params_.offa) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
202 	scratchBuff = base_->createEnqueueBuffer(NULL, ((lengthY) * sizeof(ElemType)), 0, CL_MEM_READ_WRITE);
203 
204     return ((mobjX_ != NULL) && (mobjY_ != NULL) &&  (mobjDP_ != NULL)&& (scratchBuff != NULL) )? 0 : -1;
205 }
206 
207 template <typename ElemType> nano_time_t
etalonPerfSingle(void)208 DotPerformanceTest<ElemType>::etalonPerfSingle(void)
209 {
210     nano_time_t time = 0;
211 
212 #ifdef PERF_TEST_WITH_ACML
213 
214 	time = getCurrentTime();
215 	clMath::blas::dot(params_.N, blasX_, params_.offBX, params_.incx, blasY_, params_.offCY, params_.incy);
216 	time = getCurrentTime() - time;
217 
218 #endif  // PERF_TEST_WITH_ACML
219 
220     return time;
221 }
222 
223 
224 template <typename ElemType> nano_time_t
clblasPerfSingle(void)225 DotPerformanceTest<ElemType>::clblasPerfSingle(void)
226 {
227     nano_time_t time;
228     cl_event event;
229     cl_int status;
230     cl_command_queue queue = base_->commandQueues()[0];
231 
232     DataType type;
233     type = ( typeid(ElemType) == typeid(float))? TYPE_FLOAT:( typeid(ElemType) == typeid(double))? TYPE_DOUBLE:
234 										( typeid(ElemType) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
235 
236     event = NULL;
237     clFinish( queue);
238     time = getCurrentTime();
239 
240 #define TIMING
241 #ifdef TIMING
242     int iter = 100;
243     for ( int i=1; i <= iter; i++)
244     {
245 #endif
246 
247         status = (cl_int)clMath::clblas::dot( type, params_.N, mobjDP_, params_.offa, mobjX_, params_.offBX, params_.incx,
248                              mobjY_, params_.offCY, params_.incy, scratchBuff, 1, &queue, 0, NULL, &event);
249         if (status != CL_SUCCESS) {
250             cerr << "The CLBLAS DOT function failed, status = " <<
251                     status << endl;
252 
253             return NANOTIME_ERR;
254         }
255 #ifdef TIMING
256     } // iter loop
257     clFinish( queue);
258     time = getCurrentTime() - time;
259     time /= iter;
260 #else
261 
262     status = flushAll(1, &queue);
263     if (status != CL_SUCCESS) {
264         cerr << "clFlush() failed, status = " << status << endl;
265         return NANOTIME_ERR;
266     }
267 
268     time = getCurrentTime();
269     status = waitForSuccessfulFinish(1, &queue, &event);
270     if (status == CL_SUCCESS) {
271         time = getCurrentTime() - time;
272     }
273     else {
274         cerr << "Waiting for completion of commands to the queue failed, "
275                 "status = " << status << endl;
276         time = NANOTIME_ERR;
277     }
278 #endif
279     return time;
280 }
281 
282 } // namespace clMath
283 
// DOT performance case for float elements (FN_SDOT).
TEST_P(DOT, sdot)
{
    TestParams testParams;
    getParams(&testParams);

    DotPerformanceTest<float>::runInstance(FN_SDOT, &testParams);
}
291 
292 
// DOT performance case for double elements (FN_DDOT).
TEST_P(DOT, ddot)
{
    TestParams testParams;
    getParams(&testParams);

    DotPerformanceTest<double>::runInstance(FN_DDOT, &testParams);
}
300 
// DOT performance case for FloatComplex elements (FN_CDOTU).
TEST_P(DOT, cdotu)
{
    TestParams testParams;
    getParams(&testParams);

    DotPerformanceTest<FloatComplex>::runInstance(FN_CDOTU, &testParams);
}
308 
309 
// DOT performance case for DoubleComplex elements (FN_ZDOTU).
TEST_P(DOT, zdotu)
{
    TestParams testParams;
    getParams(&testParams);

    DotPerformanceTest<DoubleComplex>::runInstance(FN_ZDOTU, &testParams);
}
317