/* ************************************************************************
 * Copyright 2013 Advanced Micro Devices, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ************************************************************************/

#include <stdlib.h>             // srand()
#include <string.h>             // memcpy()
#include <gtest/gtest.h>
#include <clBLAS.h>

#include <common.h>
#include <clBLAS-wrapper.h>
#include <BlasBase.h>
#include <hpr.h>
#include <blas-random.h>

#ifdef PERF_TEST_WITH_ACML
#include <blas-internal.h>
#include <blas-wrapper.h>
#endif

#include "PerformanceTest.h"

/*
 * NOTE: the operation factor is the overall number of multiplies and adds
 * performed per operation involving two matrix elements.
 */

using namespace std;
using namespace clMath;

#define CHECK_RESULT(ret)                                                   \
do {                                                                        \
    ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "     \
                         "perform an OpenCL request!" << endl;              \
    EXPECT_EQ(0, ret) << "The OpenCL version is slower in this case" <<     \
                         endl;                                              \
} while (0)

namespace clMath {

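/*
 * Performance test for the HPR (Hermitian packed rank-1 update) routine.
 * A single case runs the reference BLAS implementation and the clBLAS
 * implementation on the same data and reports whether the OpenCL version
 * is slower.
 */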
template <typename ElemType> class HprPerformanceTest : public PerformanceTest
{
public:
    virtual ~HprPerformanceTest();

    virtual int prepare(void);
    virtual nano_time_t etalonPerfSingle(void);
    virtual nano_time_t clblasPerfSingle(void);

    static void runInstance(BlasFunction fn, TestParams *params)
    {
        HprPerformanceTest<ElemType> perfCase(fn, params);
        int ret = 0;
        int opFactor;
        BlasBase *base;

        base = clMath::BlasBase::getInstance();

        opFactor = 1;

        if ((fn == FN_ZHPR) &&
            !base->isDevSupportDoublePrecision()) {

            std::cerr << ">> WARNING: The target device doesn't support native "
                         "double precision floating point arithmetic" <<
                         std::endl << ">> Test skipped" << std::endl;
            return;
        }

        if (!perfCase.areResourcesSufficient(params)) {
            std::cerr << ">> RESOURCE CHECK: Skip due to insufficient resources" <<
                         std::endl;
        }
        else {
            ret = perfCase.run(opFactor);
        }

        ASSERT_GE(ret, 0) << "Fatal error: can not allocate resources or "
                             "perform an OpenCL request!" << endl;
        EXPECT_EQ(0, ret) << "The OpenCL version is slower in this case" << endl;
    }

private:
    HprPerformanceTest(BlasFunction fn, TestParams *params);

    bool areResourcesSufficient(TestParams *params);

    TestParams params_;
    ElemType alpha_;
    ElemType *AP_;
    ElemType *X_;
    ElemType *backAP_;
    cl_mem mobjAP_;
    cl_mem mobjX_;
    ::clMath::BlasBase *base_;
};

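/*
 * The constructor allocates host storage for the packed Hermitian matrix AP
 * (N*(N+1)/2 elements plus the offset), the input vector X, and a backup
 * copy of AP used to restore the matrix contents before the clBLAS run.
 */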
template <typename ElemType>
HprPerformanceTest<ElemType>::HprPerformanceTest(
    BlasFunction fn,
    TestParams *params) : PerformanceTest(fn,
        (problem_size_t)((((params->N * params->N) + params->N) * 2) * sizeof(ElemType))),
    params_(*params), mobjAP_(NULL), mobjX_(NULL)
{
    AP_ = new ElemType[((params_.N * (params_.N + 1)) / 2) + params_.offa];
    X_ = new ElemType[1 + (params_.N - 1) * abs(params_.incx) + params_.offBX];
    backAP_ = new ElemType[((params_.N * (params_.N + 1)) / 2) + params_.offa];

    base_ = ::clMath::BlasBase::getInstance();
}

template <typename ElemType>
HprPerformanceTest<ElemType>::~HprPerformanceTest()
{
    if (AP_ != NULL) {
        delete[] AP_;
    }
    if (backAP_ != NULL) {
        delete[] backAP_;
    }
    if (X_ != NULL) {
        delete[] X_;
    }

    if (mobjX_ != NULL) {
        clReleaseMemObject(mobjX_);
    }
    if (mobjAP_ != NULL) {
        clReleaseMemObject(mobjAP_);
    }
}

/*
 * Check if available OpenCL resources are sufficient to
 * run the test case
 */
template <typename ElemType> bool
HprPerformanceTest<ElemType>::areResourcesSufficient(TestParams *params)
{
    clMath::BlasBase *base;
    size_t gmemSize, allocSize;
    size_t n = params->N;

    if ((AP_ == NULL) || (backAP_ == NULL) || (X_ == NULL)) {
        return false;
    }

    base = clMath::BlasBase::getInstance();
    gmemSize = (size_t)base->availGlobalMemSize(0);
    allocSize = (size_t)base->maxMemAllocSize();

    // each individual buffer must fit into a single allocation
    bool suff = (sizeof(ElemType) * ((n * (n + 1)) / 2) < allocSize) &&
                ((1 + (n - 1) * abs(params->incx)) * sizeof(ElemType) < allocSize);
    // all buffers together must fit into global memory
    suff = suff && ((((n * (n + 1)) / 2) +
                     (1 + (n - 1) * abs(params->incx)) * 2) * sizeof(ElemType) < gmemSize);

    return suff;
}

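/*
 * Fill AP with a random Hermitian matrix in packed form and X with random
 * data, keep a backup copy of AP, and create the OpenCL buffer objects
 * used by the clBLAS call.
 */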
template <typename ElemType> int
HprPerformanceTest<ElemType>::prepare(void)
{
    size_t lenX = 1 + (params_.N - 1) * abs(params_.incx);

    alpha_ = convertMultiplier<ElemType>(params_.alpha);
    randomHerMatrices(params_.order, params_.uplo, params_.N, &alpha_,
                      (AP_ + params_.offa), 0, (X_ + params_.offBX), params_.incx);
    memcpy(backAP_, AP_,
           (((params_.N * (params_.N + 1)) / 2) + params_.offa) * sizeof(ElemType));

    mobjAP_ = base_->createEnqueueBuffer(AP_,
        (((params_.N * (params_.N + 1)) / 2) + params_.offa) * sizeof(*AP_),
        0, CL_MEM_READ_WRITE);
    mobjX_ = base_->createEnqueueBuffer(X_,
        (lenX + params_.offBX) * sizeof(*X_), 0, CL_MEM_READ_ONLY);

    return ((mobjAP_ != NULL) && (mobjX_ != NULL)) ? 0 : -1;
}

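/*
 * Time a single run of the reference (ACML) implementation of HPR.
 * The reference routine works in column-major order, so for row-major
 * input the vector X is conjugated and the uplo flag is flipped.
 */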
template <typename ElemType> nano_time_t
HprPerformanceTest<ElemType>::etalonPerfSingle(void)
{
    nano_time_t time = 0;
    clblasOrder order;
    // size_t lda;

#ifndef PERF_TEST_WITH_ROW_MAJOR
    if (params_.order == clblasRowMajor) {
        cerr << "Row major order is not allowed" << endl;
        return NANOTIME_ERR;
    }
#endif

    order = params_.order;

#ifdef PERF_TEST_WITH_ACML

    clblasOrder fOrder;
    clblasUplo fUplo;
    fOrder = params_.order;
    fUplo = params_.uplo;

    if (order != clblasColumnMajor) {
        doConjugate((X_ + params_.offBX),
                    (1 + (params_.N - 1) * abs(params_.incx)), 1, 1);
        fOrder = clblasColumnMajor;
        fUplo = (fUplo == clblasLower) ? clblasUpper : clblasLower;
    }

    time = getCurrentTime();
    clMath::blas::hpr(fOrder, fUplo, params_.N, CREAL(alpha_), X_,
                      params_.offBX, params_.incx, AP_, params_.offa);
    time = getCurrentTime() - time;

#endif  // PERF_TEST_WITH_ACML

    return time;
}

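/*
 * Time the clBLAS implementation of HPR. The packed matrix is restored
 * from the backup copy first so that every run starts from the same data.
 */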
template <typename ElemType> nano_time_t
HprPerformanceTest<ElemType>::clblasPerfSingle(void)
{
    nano_time_t time;
    cl_event event;
    cl_int status;
    cl_command_queue queue = base_->commandQueues()[0];

    status = clEnqueueWriteBuffer(queue, mobjAP_, CL_TRUE, 0,
                                  (((params_.N * (params_.N + 1)) / 2) + params_.offa) *
                                  sizeof(ElemType), backAP_, 0, NULL, &event);
    if (status != CL_SUCCESS) {
        cerr << "Matrix A buffer object enqueuing error, status = " <<
                status << endl;

        return NANOTIME_ERR;
    }

    status = clWaitForEvents(1, &event);
    if (status != CL_SUCCESS) {
        cout << "Wait on event failed, status = " <<
                status << endl;

        return NANOTIME_ERR;
    }

    event = NULL;

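/*
 * With TIMING defined, the HPR routine is launched iter times back to back
 * and the total wall-clock time is divided by the number of iterations.
 * Otherwise a single asynchronous launch is timed through event completion.
 */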
#define TIMING
#ifdef TIMING
    clFinish(queue);
    time = getCurrentTime();

    int iter = 20;
    for (int i = 1; i <= iter; i++)
    {
#endif
        status = (cl_int)clMath::clblas::hpr(params_.order, params_.uplo, params_.N,
                                             CREAL(alpha_), mobjX_, params_.offBX, params_.incx,
                                             mobjAP_, params_.offa, 1, &queue, 0, NULL, &event);

        if (status != CL_SUCCESS) {
            cerr << "The CLBLAS HPR function failed, status = " <<
                    status << endl;

            return NANOTIME_ERR;
        }

#ifdef TIMING
    } // iter loop
    clFinish(queue);
    time = getCurrentTime() - time;
    time /= iter;
#else
    status = flushAll(1, &queue);
    if (status != CL_SUCCESS) {
        cerr << "clFlush() failed, status = " << status << endl;
        return NANOTIME_ERR;
    }

    time = getCurrentTime();
    status = waitForSuccessfulFinish(1, &queue, &event);
    if (status == CL_SUCCESS) {
        time = getCurrentTime() - time;
    }
    else {
        cerr << "Waiting for completion of commands to the queue failed, "
                "status = " << status << endl;
        time = NANOTIME_ERR;
    }
#endif

    return time;
}

} // namespace clMath

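/*
 * Parameterized gtest cases: single-precision complex (chpr) and
 * double-precision complex (zhpr) variants of the HPR performance test.
 */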
TEST_P(HPR, chpr)
{
    TestParams params;

    getParams(&params);
    HprPerformanceTest<FloatComplex>::runInstance(FN_CHPR, &params);
}

TEST_P(HPR, zhpr)
{
    TestParams params;

    getParams(&params);
    HprPerformanceTest<DoubleComplex>::runInstance(FN_ZHPR, &params);
}