1 /* ************************************************************************
2  * Copyright 2013 Advanced Micro Devices, Inc.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  * http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  * ************************************************************************/
16 
17 
18 /*
19  * ROTG performance test cases
20  */
21 
22 #include <stdlib.h>             // srand()
23 #include <string.h>             // memcpy()
24 #include <gtest/gtest.h>
25 #include <clBLAS.h>
26 
27 #include <common.h>
28 #include <clBLAS-wrapper.h>
29 #include <BlasBase.h>
30 #include <rotg.h>
31 #include <blas-random.h>
32 
33 #ifdef PERF_TEST_WITH_ACML
34 #include <blas-internal.h>
35 #include <blas-wrapper.h>
36 #endif
37 
38 #include "PerformanceTest.h"
39 
/*
 * NOTE: the operation factor is the total number of multiply and add
 *       operations performed per elementary operation involving
 *       2 matrix elements
 */
45 
46 using namespace std;
47 using namespace clMath;
48 
/*
 * Validate the status code produced by a performance run.
 *
 * A negative status is fatal: ASSERT_GE records the failure and returns from
 * the enclosing (void) test function. Any other non-zero status is recorded
 * as a non-fatal failure by EXPECT_EQ.
 *
 * The argument is evaluated exactly once and stored in a local, so passing an
 * expression with side effects does not execute it twice.
 */
#define CHECK_RESULT(ret)                                                   \
do {                                                                        \
    const int checkResultStatus_ = (ret);                                   \
    ASSERT_GE(checkResultStatus_, 0) <<                                     \
        "Fatal error: can not allocate resources or "                       \
        "perform an OpenCL request!" << endl;                               \
    EXPECT_EQ(0, checkResultStatus_) <<                                     \
        "The OpenCL version is slower in the case" << endl;                 \
} while (0)
56 
57 namespace clMath {
58 
59 // ElemType1 for storing general type, ElemType2 to store type of C which is only float/double
60 template <typename ElemType1, typename ElemType2> class RotgPerformanceTest : public PerformanceTest
61 {
62 public:
63     virtual ~RotgPerformanceTest();
64 
65     virtual int prepare(void);
66     virtual nano_time_t etalonPerfSingle(void);
67     virtual nano_time_t clblasPerfSingle(void);
68 
runInstance(BlasFunction fn,TestParams * params)69     static void runInstance(BlasFunction fn, TestParams *params)
70     {
71         RotgPerformanceTest<ElemType1, ElemType2> perfCase(fn, params);
72         int ret = 0;
73         int opFactor;
74         BlasBase *base;
75 
76         base = clMath::BlasBase::getInstance();
77 
78         opFactor =1;
79 
80         if (((fn == FN_DROTG) || (fn == FN_ZROTG)) &&
81             !base->isDevSupportDoublePrecision())
82         {
83             std::cerr << ">> WARNING: The target device doesn't support native "
84                          "double precision floating point arithmetic" <<
85                          std::endl << ">> Test skipped" << std::endl;
86             return;
87         }
88 
89         if (!perfCase.areResourcesSufficient(params))
90         {
91             std::cerr << ">> RESOURCE CHECK: Skip due to unsufficient resources" <<
92                         std::endl;
93 			return;
94         }
95         else
96         {
97             ret = perfCase.run(opFactor);
98         }
99 
100         ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
101                              "perform an OpenCL request!" << endl;
102         EXPECT_EQ(0, ret) << "The OpenCL version is slower in the case" << endl;
103     }
104 
105 private:
106     RotgPerformanceTest(BlasFunction fn, TestParams *params);
107 
108     bool areResourcesSufficient(TestParams *params);
109 
110     TestParams params_;
111     ElemType1 *SA_, *SB_, *S_, *back_SA_, *back_SB_, *back_S_;
112     ElemType2 *C_, *back_C_;
113     cl_mem mobjSA_, mobjSB_, mobjC_, mobjS_;
114     ::clMath::BlasBase *base_;
115 };
116 
117 template <typename ElemType1, typename ElemType2>
RotgPerformanceTest(BlasFunction fn,TestParams * params)118 RotgPerformanceTest<ElemType1, ElemType2>::RotgPerformanceTest(
119     BlasFunction fn,
120     TestParams *params) : PerformanceTest(fn,(problem_size_t) (5 * sizeof(ElemType1) + sizeof(ElemType2))), params_(*params)
121 {
122 
123     SA_ = SB_ = S_ = NULL;
124     back_SA_ = back_SB_ = back_S_ = NULL;
125     C_ = back_C_ = NULL;
126     mobjSA_= mobjSB_ = mobjC_ = mobjS_ = NULL;
127 
128     try
129     {
130         SA_ = new ElemType1[1 + params_.offBX];
131         back_SA_ = new ElemType1[1 + params_.offBX];
132         SB_ = new ElemType1[1 + params_.offCY];
133         back_SB_ = new ElemType1[1 + params_.offCY];
134         C_ = new ElemType2[1 + params_.offa];
135         back_C_ = new ElemType2[1 + params_.offa];
136         S_ = new ElemType1[1 + params_.offb];
137         back_S_ = new ElemType1[1 + params_.offb];
138     }
139     catch(bad_alloc& ba)
140     {
141         SA_ = back_SA_ = SB_ = back_SB_ = NULL;     // areResourcesSufficient() will handle the rest and return
142         S_ = back_S_ = NULL;
143         C_ = back_C_ = NULL;
144         ba = ba;
145     }
146 
147     base_ = ::clMath::BlasBase::getInstance();
148 }
149 
150 template <typename ElemType1, typename ElemType2>
~RotgPerformanceTest()151 RotgPerformanceTest<ElemType1, ElemType2>::~RotgPerformanceTest()
152 {
153 	if(SA_ != NULL)
154     {
155         delete[] SA_;
156 	}
157 	if(back_SA_ != NULL)
158     {
159         delete[] back_SA_;
160 	}
161     if( mobjSA_ != NULL )
162     {
163 		clReleaseMemObject(mobjSA_);
164     }
165 
166     if(SB_ != NULL)
167     {
168         delete[] SB_;
169 	}
170 	if(back_SB_ != NULL)
171     {
172         delete[] back_SB_;
173 	}
174     if( mobjSB_ != NULL )
175     {
176 		clReleaseMemObject(mobjSB_);
177     }
178 
179     if(C_ != NULL)
180     {
181         delete[] C_;
182 	}
183 	if(back_C_ != NULL)
184     {
185         delete[] back_C_;
186 	}
187     if( mobjC_ != NULL )
188     {
189 		clReleaseMemObject(mobjC_);
190     }
191 
192     if(S_ != NULL)
193     {
194         delete[] S_;
195 	}
196 	if(back_S_ != NULL)
197     {
198         delete[] back_S_;
199 	}
200     if( mobjS_ != NULL )
201     {
202 		clReleaseMemObject(mobjS_);
203     }
204 }
205 
206 /*
207  * Check if available OpenCL resources are sufficient to
208  * run the test case
209  */
210 template <typename ElemType1, typename ElemType2> bool
areResourcesSufficient(TestParams * params)211 RotgPerformanceTest<ElemType1, ElemType2>::areResourcesSufficient(TestParams *params)
212 {
213     clMath::BlasBase *base;
214     size_t gmemSize, allocSize;
215     size_t offSA_ = params->offBX;
216     size_t offSB_ = params->offCY;
217     size_t offC_ = params->offa;
218     size_t offS_ = params->offb;
219     bool ret;
220     size_t sizeRequired = ((1 + offSA_) + (1 + offSB_) + (1 + offS_)) * sizeof(ElemType1)
221                              + ((1 + offC_) * sizeof(ElemType2));
222 
223 	if((SA_ == NULL) || (back_SA_ == NULL) || (SB_ == NULL) || (back_SB_ == NULL) ||
224         (C_ == NULL) || (back_C_ == NULL) || (S_ == NULL) || (back_S_ == NULL))
225     {
226 		return 0;
227 	}
228 
229     base = clMath::BlasBase::getInstance();
230     gmemSize = (size_t)base->availGlobalMemSize( 0 );
231     allocSize = (size_t)base->maxMemAllocSize();
232 
233     ret = (sizeRequired) < allocSize;
234     ret = ret && (sizeRequired < gmemSize);
235 
236     return ret;
237 }
238 
239 template <typename ElemType1, typename ElemType2> int
prepare(void)240 RotgPerformanceTest<ElemType1, ElemType2>::prepare(void)
241 {
242     randomVectors(1, (SA_ + params_.offBX), 1, (SB_ + params_.offCY), 1);
243     C_[params_.offa] = back_C_[params_.offa] = ZERO<ElemType2>();
244     S_[params_.offb] = back_S_[params_.offb] = ZERO<ElemType1>();
245     back_SA_[params_.offBX] = SA_[params_.offBX];
246     back_SB_[params_.offCY] = SB_[params_.offCY];
247 
248     //printing the inputs, as they change after processing
249     ::std::cerr << "A = ";
250     printElement<ElemType1>(SA_[params_.offBX]);
251     ::std::cerr << "\tB = ";
252     printElement<ElemType1>(SB_[params_.offCY]);
253     ::std::cerr << "\tC = ";
254     printElement<ElemType2>(C_[params_.offa]);
255     ::std::cerr << "\tS = ";
256     printElement<ElemType1>(S_[params_.offb]);
257     ::std::cout << std::endl << std::endl;
258 
259 	// Allocate buffers
260     mobjSA_ = base_->createEnqueueBuffer(SA_, (1 + params_.offBX) * sizeof(ElemType1), 0, CL_MEM_READ_WRITE);
261     mobjSB_ = base_->createEnqueueBuffer(SB_, (1 + params_.offCY) * sizeof(ElemType1), 0, CL_MEM_READ_WRITE);
262     mobjC_  = base_->createEnqueueBuffer(C_,  (1 + params_.offa ) * sizeof(ElemType2), 0, CL_MEM_WRITE_ONLY);
263     mobjS_  = base_->createEnqueueBuffer(S_,  (1 + params_.offb ) * sizeof(ElemType1), 0, CL_MEM_WRITE_ONLY);
264 
265     if((mobjSA_ == NULL) || (mobjSB_ == NULL) || (mobjC_ == NULL) || (mobjS_ == NULL))
266     {
267         return -1;
268     }
269     return 0;
270 }
271 
272 template <typename ElemType1, typename ElemType2> nano_time_t
etalonPerfSingle(void)273 RotgPerformanceTest<ElemType1, ElemType2>::etalonPerfSingle(void)
274 {
275     nano_time_t time = 0;
276 
277 #ifdef PERF_TEST_WITH_ACML
278 
279 		time = getCurrentTime();
280 		clMath::blas::rotg(back_SA_, params_.offBX, back_SB_, params_.offCY, back_C_, params_.offa, back_S_, params_.offb);
281 		time = getCurrentTime() - time;
282 
283 #endif  // PERF_TEST_WITH_ACML
284 
285     return time;
286 }
287 
288 
289 template <typename ElemType1, typename ElemType2> nano_time_t
clblasPerfSingle(void)290 RotgPerformanceTest<ElemType1, ElemType2>::clblasPerfSingle(void)
291 {
292     nano_time_t time;
293     cl_event event;
294     cl_int status;
295     cl_command_queue queue = base_->commandQueues()[0];
296 
297     DataType type;
298     type = ( typeid(ElemType1) == typeid(float))? TYPE_FLOAT:( typeid(ElemType1) == typeid(double))? TYPE_DOUBLE:
299 										( typeid(ElemType1) == typeid(FloatComplex))? TYPE_COMPLEX_FLOAT: TYPE_COMPLEX_DOUBLE;
300 
301     status = clEnqueueWriteBuffer(queue, mobjSA_, CL_TRUE, 0, (1 + params_.offBX) * sizeof(ElemType1), SA_, 0, NULL, &event);
302     if (status != CL_SUCCESS)
303     {
304         cerr << "Vector SA buffer object enqueuing error, status = " << status << endl;
305         return NANOTIME_ERR;
306     }
307 
308     status = clEnqueueWriteBuffer(queue, mobjSB_, CL_TRUE, 0, (1 + params_.offCY) * sizeof(ElemType1), SB_, 0, NULL, &event);
309     if (status != CL_SUCCESS)
310     {
311         cerr << "Vector SB buffer object enqueuing error, status = " << status << endl;
312         return NANOTIME_ERR;
313     }
314 
315     status = clEnqueueWriteBuffer(queue, mobjC_, CL_TRUE, 0, (1 + params_.offa) * sizeof(ElemType2), C_, 0, NULL, &event);
316     if (status != CL_SUCCESS)
317     {
318         cerr << "Vector C buffer object enqueuing error, status = " << status << endl;
319         return NANOTIME_ERR;
320     }
321 
322     status = clEnqueueWriteBuffer(queue, mobjS_, CL_TRUE, 0, (1 + params_.offb) * sizeof(ElemType1), S_, 0, NULL, &event);
323     if (status != CL_SUCCESS)
324     {
325         cerr << "Vector S buffer object enqueuing error, status = " << status << endl;
326         return NANOTIME_ERR;
327     }
328 
329     status = clWaitForEvents(1, &event);
330     if (status != CL_SUCCESS)
331     {
332         cout << "Wait on event failed, status = " << status << endl;
333         return NANOTIME_ERR;
334     }
335 
336     event = NULL;
337     time = getCurrentTime();
338 
339 #define TIMING
340 #ifdef TIMING
341     clFinish( queue);
342     int iter = 50;
343     for ( int i=1; i <= iter; i++)
344     {
345 #endif
346         status = (cl_int)clMath::clblas::rotg(type, mobjSA_, params_.offBX, mobjSB_, params_.offCY, mobjC_, params_.offa, mobjS_, params_.offb,
347                                                 1, &queue, 0, NULL, &event);
348         if (status != CL_SUCCESS)
349         {
350             cerr << "The CLBLAS ROTG function failed, status = " << status << endl;
351             return NANOTIME_ERR;
352         }
353 #ifdef TIMING
354     } // iter loop
355     clFinish( queue);
356     time = getCurrentTime() - time;
357     time /= iter;
358 #else
359 
360     status = flushAll(1, &queue);
361     if (status != CL_SUCCESS)
362     {
363         cerr << "clFlush() failed, status = " << status << endl;
364         return NANOTIME_ERR;
365     }
366 
367     time = getCurrentTime();
368     status = waitForSuccessfulFinish(1, &queue, &event);
369     if (status == CL_SUCCESS)
370     {
371         time = getCurrentTime() - time;
372     }
373     else
374     {
375         cerr << "Waiting for completion of commands to the queue failed, "
376                 "status = " << status << endl;
377         time = NANOTIME_ERR;
378     }
379 #endif
380     return time;
381 }
382 
383 } // namespace clMath
384 
385 // rotg performance test
TEST_P(ROTG,srotg)386 TEST_P(ROTG, srotg)
387 {
388     TestParams params;
389 
390     getParams(&params);
391     RotgPerformanceTest<float, float>::runInstance(FN_SROTG, &params);
392 }
393 
394 
// rotg performance test: double operands, double C
TEST_P(ROTG, drotg)
{
    TestParams testParams;
    getParams(&testParams);

    RotgPerformanceTest<double, double>::runInstance(FN_DROTG, &testParams);
}
402 
// rotg performance test: FloatComplex operands, float C
TEST_P(ROTG, crotg)
{
    TestParams testParams;
    getParams(&testParams);

    RotgPerformanceTest<FloatComplex, float>::runInstance(FN_CROTG, &testParams);
}
410 
411 
// rotg performance test: DoubleComplex operands, double C
TEST_P(ROTG, zrotg)
{
    TestParams testParams;
    getParams(&testParams);

    RotgPerformanceTest<DoubleComplex, double>::runInstance(FN_ZROTG, &testParams);
}
419