1 /* ************************************************************************
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 * ************************************************************************/
16
17
18 /*
19 * ROTG performance test cases
20 */
21
22 #include <stdlib.h> // srand()
23 #include <string.h> // memcpy()
24 #include <gtest/gtest.h>
25 #include <clBLAS.h>
26
27 #include <common.h>
28 #include <clBLAS-wrapper.h>
29 #include <BlasBase.h>
30 #include <rotg.h>
31 #include <blas-random.h>
32
33 #ifdef PERF_TEST_WITH_ACML
34 #include <blas-internal.h>
35 #include <blas-wrapper.h>
36 #endif
37
38 #include "PerformanceTest.h"
39
/*
 * NOTE: the operation factor is the total number of multiplications
 * and additions performed per operation involving
 * 2 matrix elements.
 */
45
46 using namespace std;
47 using namespace clMath;
48
/*
 * Evaluate the status code `ret` of a performance run:
 *   - a negative value trips a fatal gtest assertion;
 *   - any other non-zero value is reported as a non-fatal failure.
 * `ret` is expanded twice, so pass a plain variable, not an expression
 * with side effects.
 */
#define CHECK_RESULT(ret)                                                   \
    do {                                                                    \
        ASSERT_GE(ret, 0)                                                   \
            << "Fatal error: can not allocate resources or "                \
               "perform an OpenCL request!" << std::endl;                   \
        EXPECT_EQ(0, ret)                                                   \
            << "The OpenCL version is slower in the case" << std::endl;     \
    } while (0)
56
57 namespace clMath {
58
59 // ElemType1 for storing general type, ElemType2 to store type of C which is only float/double
60 template <typename ElemType1, typename ElemType2> class RotgPerformanceTest : public PerformanceTest
61 {
62 public:
63 virtual ~RotgPerformanceTest();
64
65 virtual int prepare(void);
66 virtual nano_time_t etalonPerfSingle(void);
67 virtual nano_time_t clblasPerfSingle(void);
68
runInstance(BlasFunction fn,TestParams * params)69 static void runInstance(BlasFunction fn, TestParams *params)
70 {
71 RotgPerformanceTest<ElemType1, ElemType2> perfCase(fn, params);
72 int ret = 0;
73 int opFactor;
74 BlasBase *base;
75
76 base = clMath::BlasBase::getInstance();
77
78 opFactor =1;
79
80 if (((fn == FN_DROTG) || (fn == FN_ZROTG)) &&
81 !base->isDevSupportDoublePrecision())
82 {
83 std::cerr << ">> WARNING: The target device doesn't support native "
84 "double precision floating point arithmetic" <<
85 std::endl << ">> Test skipped" << std::endl;
86 return;
87 }
88
89 if (!perfCase.areResourcesSufficient(params))
90 {
91 std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
92 std::endl;
93 return;
94 }
95 else
96 {
97 ret = perfCase.run(opFactor);
98 }
99
100 ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
101 "perform an OpenCL request!" << endl;
102 EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
103 }
104
105 private:
106 RotgPerformanceTest(BlasFunction fn, TestParams *params);
107
108 bool areResourcesSufficient(TestParams *params);
109
110 TestParams params_;
111 ElemType1 *SA_, *SB_, *S_, *back_SA_, *back_SB_, *back_S_;
112 ElemType2 *C_, *back_C_;
113 cl_mem mobjSA_, mobjSB_, mobjC_, mobjS_;
114 ::clMath::BlasBase *base_;
115 };
116
117 template <typename ElemType1, typename ElemType2>
RotgPerformanceTest(BlasFunction fn,TestParams * params)118 RotgPerformanceTest<ElemType1, ElemType2>::RotgPerformanceTest(
119 BlasFunction fn,
120 TestParams *params) : PerformanceTest(fn,(problem_size_t) (5 * sizeof(ElemType1) + sizeof(ElemType2))), params_(*params)
121 {
122
123 SA_ = SB_ = S_ = NULL;
124 back_SA_ = back_SB_ = back_S_ = NULL;
125 C_ = back_C_ = NULL;
126 mobjSA_= mobjSB_ = mobjC_ = mobjS_ = NULL;
127
128 try
129 {
130 SA_ = new ElemType1[1 + params_.offBX];
131 back_SA_ = new ElemType1[1 + params_.offBX];
132 SB_ = new ElemType1[1 + params_.offCY];
133 back_SB_ = new ElemType1[1 + params_.offCY];
134 C_ = new ElemType2[1 + params_.offa];
135 back_C_ = new ElemType2[1 + params_.offa];
136 S_ = new ElemType1[1 + params_.offb];
137 back_S_ = new ElemType1[1 + params_.offb];
138 }
139 catch(bad_alloc& ba)
140 {
141 SA_ = back_SA_ = SB_ = back_SB_ = NULL; // areResourcesSufficient() will handle the rest and return
142 S_ = back_S_ = NULL;
143 C_ = back_C_ = NULL;
144 ba = ba;
145 }
146
147 base_ = ::clMath::BlasBase::getInstance();
148 }
149
150 template <typename ElemType1, typename ElemType2>
~RotgPerformanceTest()151 RotgPerformanceTest<ElemType1, ElemType2>::~RotgPerformanceTest()
152 {
153 if(SA_ != NULL)
154 {
155 delete[] SA_;
156 }
157 if(back_SA_ != NULL)
158 {
159 delete[] back_SA_;
160 }
161 if( mobjSA_ != NULL )
162 {
163 clReleaseMemObject(mobjSA_);
164 }
165
166 if(SB_ != NULL)
167 {
168 delete[] SB_;
169 }
170 if(back_SB_ != NULL)
171 {
172 delete[] back_SB_;
173 }
174 if( mobjSB_ != NULL )
175 {
176 clReleaseMemObject(mobjSB_);
177 }
178
179 if(C_ != NULL)
180 {
181 delete[] C_;
182 }
183 if(back_C_ != NULL)
184 {
185 delete[] back_C_;
186 }
187 if( mobjC_ != NULL )
188 {
189 clReleaseMemObject(mobjC_);
190 }
191
192 if(S_ != NULL)
193 {
194 delete[] S_;
195 }
196 if(back_S_ != NULL)
197 {
198 delete[] back_S_;
199 }
200 if( mobjS_ != NULL )
201 {
202 clReleaseMemObject(mobjS_);
203 }
204 }
205
206 /*
207 * Check if available OpenCL resources are sufficient to
208 * run the test case
209 */
210 template <typename ElemType1, typename ElemType2> bool
areResourcesSufficient(TestParams * params)211 RotgPerformanceTest<ElemType1, ElemType2>::areResourcesSufficient(TestParams *params)
212 {
213 clMath::BlasBase *base;
214 size_t gmemSize, allocSize;
215 size_t offSA_ = params->offBX;
216 size_t offSB_ = params->offCY;
217 size_t offC_ = params->offa;
218 size_t offS_ = params->offb;
219 bool ret;
220 size_t sizeRequired = ((1 + offSA_) + (1 + offSB_) + (1 + offS_)) * sizeof(ElemType1)
221 + ((1 + offC_) * sizeof(ElemType2));
222
223 if((SA_ == NULL) || (back_SA_ == NULL) || (SB_ == NULL) || (back_SB_ == NULL) ||
224 (C_ == NULL) || (back_C_ == NULL) || (S_ == NULL) || (back_S_ == NULL))
225 {
226 return 0;
227 }
228
229 base = clMath::BlasBase::getInstance();
230 gmemSize = (size_t)base->availGlobalMemSize( 0 );
231 allocSize = (size_t)base->maxMemAllocSize();
232
233 ret = (sizeRequired) < allocSize;
234 ret = ret && (sizeRequired < gmemSize);
235
236 return ret;
237 }
238
239 template <typename ElemType1, typename ElemType2> int
prepare(void)240 RotgPerformanceTest<ElemType1, ElemType2>::prepare(void)
241 {
242 randomVectors(1, (SA_ + params_.offBX), 1, (SB_ + params_.offCY), 1);
243 C_[params_.offa] = back_C_[params_.offa] = ZERO<ElemType2>();
244 S_[params_.offb] = back_S_[params_.offb] = ZERO<ElemType1>();
245 back_SA_[params_.offBX] = SA_[params_.offBX];
246 back_SB_[params_.offCY] = SB_[params_.offCY];
247
248 //printing the inputs, as they change after processing
249 ::std::cerr << "A = ";
250 printElement<ElemType1>(SA_[params_.offBX]);
251 ::std::cerr << "\tB = ";
252 printElement<ElemType1>(SB_[params_.offCY]);
253 ::std::cerr << "\tC = ";
254 printElement<ElemType2>(C_[params_.offa]);
255 ::std::cerr << "\tS = ";
256 printElement<ElemType1>(S_[params_.offb]);
257 ::std::cout << std::endl << std::endl;
258
259 // Allocate buffers
260 mobjSA_ = base_->createEnqueueBuffer(SA_, (1 + params_.offBX) * sizeof(ElemType1), 0, CL_MEM_READ_WRITE);
261 mobjSB_ = base_->createEnqueueBuffer(SB_, (1 + params_.offCY) * sizeof(ElemType1), 0, CL_MEM_READ_WRITE);
262 mobjC_ = base_->createEnqueueBuffer(C_, (1 + params_.offa ) * sizeof(ElemType2), 0, CL_MEM_WRITE_ONLY);
263 mobjS_ = base_->createEnqueueBuffer(S_, (1 + params_.offb ) * sizeof(ElemType1), 0, CL_MEM_WRITE_ONLY);
264
265 if((mobjSA_ == NULL) || (mobjSB_ == NULL) || (mobjC_ == NULL) || (mobjS_ == NULL))
266 {
267 return -1;
268 }
269 return 0;
270 }
271
272 template <typename ElemType1, typename ElemType2> nano_time_t
etalonPerfSingle(void)273 RotgPerformanceTest<ElemType1, ElemType2>::etalonPerfSingle(void)
274 {
275 nano_time_t time = 0;
276
277 #ifdef PERF_TEST_WITH_ACML
278
279 time = getCurrentTime();
280 clMath::blas::rotg(back_SA_, params_.offBX, back_SB_, params_.offCY, back_C_, params_.offa, back_S_, params_.offb);
281 time = getCurrentTime() - time;
282
283 #endif // PERF_TEST_WITH_ACML
284
285 return time;
286 }
287
288
289 template <typename ElemType1, typename ElemType2> nano_time_t
clblasPerfSingle(void)290 RotgPerformanceTest<ElemType1, ElemType2>::clblasPerfSingle(void)
291 {
292 nano_time_t time;
293 cl_event event;
294 cl_int status;
295 cl_command_queue queue = base_->commandQueues()[0];
296
297 DataType type;
298 type = ( typeid(ElemType1) == typeid(float))? TYPE_FLOAT:( typeid(ElemType1) == typeid(double))? TYPE_DOUBLE:
299 ( typeid(ElemType1) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
300
301 status = clEnqueueWriteBuffer(queue, mobjSA_, CL_TRUE, 0, (1 + params_.offBX) * sizeof(ElemType1), SA_, 0, NULL, &event);
302 if (status != CL_SUCCESS)
303 {
304 cerr << "Vector SA buffer object enqueuing error, status = " << status << endl;
305 return NANOTIME_ERR;
306 }
307
308 status = clEnqueueWriteBuffer(queue, mobjSB_, CL_TRUE, 0, (1 + params_.offCY) * sizeof(ElemType1), SB_, 0, NULL, &event);
309 if (status != CL_SUCCESS)
310 {
311 cerr << "Vector SB buffer object enqueuing error, status = " << status << endl;
312 return NANOTIME_ERR;
313 }
314
315 status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, (1 + params_.offa) * sizeof(ElemType2), C_, 0, NULL, &event);
316 if (status != CL_SUCCESS)
317 {
318 cerr << "Vector C buffer object enqueuing error, status = " << status << endl;
319 return NANOTIME_ERR;
320 }
321
322 status = clEnqueueWriteBuffer(queue, mobjS_, CL_TRUE, 0, (1 + params_.offb) * sizeof(ElemType1), S_, 0, NULL, &event);
323 if (status != CL_SUCCESS)
324 {
325 cerr << "Vector S buffer object enqueuing error, status = " << status << endl;
326 return NANOTIME_ERR;
327 }
328
329 status = clWaitForEvents(1, &event);
330 if (status != CL_SUCCESS)
331 {
332 cout << "Wait on event failed, status = " << status << endl;
333 return NANOTIME_ERR;
334 }
335
336 event = NULL;
337 time = getCurrentTime();
338
339 #define TIMING
340 #ifdef TIMING
341 clFinish( queue);
342 int iter = 50;
343 for ( int i=1; i <= iter; i++)
344 {
345 #endif
346 status = (cl_int)clMath::clblas::rotg(type, mobjSA_, params_.offBX, mobjSB_, params_.offCY, mobjC_, params_.offa, mobjS_, params_.offb,
347 1, &queue, 0, NULL, &event);
348 if (status != CL_SUCCESS)
349 {
350 cerr << "The CLBLAS ROTG function failed, status = " << status << endl;
351 return NANOTIME_ERR;
352 }
353 #ifdef TIMING
354 } // iter loop
355 clFinish( queue);
356 time = getCurrentTime() - time;
357 time /= iter;
358 #else
359
360 status = flushAll(1, &queue);
361 if (status != CL_SUCCESS)
362 {
363 cerr << "clFlush() failed, status = " << status << endl;
364 return NANOTIME_ERR;
365 }
366
367 time = getCurrentTime();
368 status = waitForSuccessfulFinish(1, &queue, &event);
369 if (status == CL_SUCCESS)
370 {
371 time = getCurrentTime() - time;
372 }
373 else
374 {
375 cerr << "Waiting for completion of commands to the queue failed, "
376 "status = " << status << endl;
377 time = NANOTIME_ERR;
378 }
379 #endif
380 return time;
381 }
382
383 } // namespace clMath
384
385 // rotg performance test
// Runs the ROTG performance case for FN_SROTG (float elements, float C).
TEST_P(ROTG, srotg)
{
    TestParams params;

    getParams(&params);
    RotgPerformanceTest<float, float>::runInstance(FN_SROTG, &params);
}
393
394
// Runs the ROTG performance case for FN_DROTG (double elements, double C).
TEST_P(ROTG, drotg)
{
    TestParams params;

    getParams(&params);
    RotgPerformanceTest<double, double>::runInstance(FN_DROTG, &params);
}
402
// Runs the ROTG performance case for FN_CROTG (FloatComplex elements, float C).
TEST_P(ROTG, crotg)
{
    TestParams params;

    getParams(&params);
    RotgPerformanceTest<FloatComplex, float>::runInstance(FN_CROTG, &params);
}
410
411
// Runs the ROTG performance case for FN_ZROTG (DoubleComplex elements, double C).
TEST_P(ROTG, zrotg)
{
    TestParams params;

    getParams(&params);
    RotgPerformanceTest<DoubleComplex, double>::runInstance(FN_ZROTG, &params);
}
419