1 // performance.cpp : performance benchmarking for native floating-point
2 //
3 // Copyright (C) 2017-2021 Stillwater Supercomputing, Inc.
4 //
5 // This file is part of the universal numbers project, which is released under an MIT Open Source license.
6 #include <universal/utility/directives.hpp>
7 #include <universal/utility/long_double.hpp>
8 #include <universal/utility/bit_cast.hpp> // TODO: can this be integrated in category headers?
9 #include <chrono>
10 #include <vector>
11
12 #include <universal/native/ieee754.hpp>
13 #define CFLOAT_SUPPORT
14 #include <universal/number/cfloat/cfloat.hpp>
15
16 #define POSIT_SUPPORT
17 //#define POSIT_FAST_SPECIALIZATION 1
18 #define POSIT_FAST_POSIT_8_0 0
19 #define POSIT_FAST_POSIT_16_1 1
20 #define POSIT_FAST_POSIT_32_2 1
21 #include <universal/number/posit/posit.hpp>
22 // is representable
23 #include <universal/functions/isrepresentable.hpp>
24 #include <universal/verification/test_suite.hpp>
25 #include <universal/verification/performance_runner.hpp>
26
/// <summary>
/// copy workload: NR_OPS assignments of a native floating-point value,
/// with a cross-check so the optimizer cannot elide the copies.
/// (The previously present 'using namespace sw::universal;' was unused:
/// nothing from that namespace is referenced in this function.)
/// </summary>
/// <param name="NR_OPS">number of copy operations to execute</param>
template<typename NativeFloat>
void CopyWorkload(size_t NR_OPS) {
	NativeFloat a, b, c;

	bool bFail = false;
	size_t j = 0;
	for (size_t i = 0; i < NR_OPS; ++i, ++j) {
		a = NativeFloat(i);
		b = a;               // the copy under test
		c = NativeFloat(j);  // i == j on every iteration, so the copy must compare equal
		if (b != c) {
			bFail = true;
		}
	}
	if (bFail) std::cout << "COPY FAIL\n"; // just a quick double check that all went well
}
44
45 /*
46
47 */
48
49 /// <summary>
50 /// measure performance of copying numbers around
51 /// </summary>
TestCopyPerformance()52 void TestCopyPerformance() {
53 using namespace sw::universal;
54 std::cout << "comparative floating-point copy performance\n";
55
56 uint64_t NR_OPS = 10000000;
57 // single block representations
58 std::cout << "single block representations\n";
59 PerformanceRunner("float copy ", CopyWorkload< float >, NR_OPS);
60 PerformanceRunner("double copy ", CopyWorkload< double >, NR_OPS);
61 #if LONG_DOUBLE_SUPPORT
62 PerformanceRunner("long double copy ", CopyWorkload< long double >, NR_OPS);
63 #endif
64 }
65
66 template<typename NativeFloat>
DecodeWorkload(size_t NR_OPS)67 void DecodeWorkload(size_t NR_OPS) {
68 using namespace sw::universal;
69
70 NativeFloat a{ 1.0f };
71 size_t success{ 0 };
72 bool first{ true };
73 for (size_t i = 0; i < NR_OPS; ++i) {
74 a = NativeFloat(i);
75 bool s;
76 uint64_t e, f;
77 extractFields(a, s, e, f);
78 if (s == false) {
79 ++success;
80 }
81 else {
82 if (first) {
83 first = false;
84 std::cout << typeid(a).name() << " :\n"
85 << to_binary(a) << "\n"
86 << "sign : " << (s ? "-1\n" : "+1\n")
87 << "exponent: " << to_binary(e) << "\n"
88 << "fraction: " << to_binary(f) << "\n";
89 }
90 }
91 }
92 if (success == 0) std::cout << "DECODE FAIL\n"; // just a quick double check that all went well
93 }
94
95 /*
96
97 */
98
99 /// <summary>
100 /// measure performance of decode operator
101 /// NOTE: es is <= 11 due to limits of dynamic range of a 64-bit double
102 /// </summary>
TestDecodePerformance()103 void TestDecodePerformance() {
104 using namespace sw::universal;
105 std::cout << "comparative floating-point decode operator performance\n";
106
107 uint64_t NR_OPS = 100000;
108 // single block representations
109 std::cout << "single block representations\n";
110 PerformanceRunner("float decode ", DecodeWorkload< float >, NR_OPS);
111 PerformanceRunner("double decode ", DecodeWorkload< double >, NR_OPS);
112 #if LONG_DOUBLE_SUPPORT
113 PerformanceRunner("long double decode ", DecodeWorkload< long double >, NR_OPS);
114 #endif
115 }
116
// measure performance of conversion operators
// NOTE: placeholder — no conversion benchmarks implemented yet.
// (Removed the unused 'using namespace sw::universal;': the body
// references nothing from that namespace.)
void TestConversionPerformance() {
	std::cout << "comparative floating-point conversion performance\n";

	// uint64_t NR_OPS = 1000000;
}
124
// Generic set of adds and subtracts for a given number system type:
// alternately accumulate +0.99999 and -1.00001 into a running sum.
template<typename NativeFloat>
void AdditionSubtractionWorkload(size_t NR_OPS) {
	std::vector<NativeFloat> operands = { 0.99999f, -1.00001f };
	NativeFloat accumulator{ 1.0625f };
	for (size_t idx = 1; idx < NR_OPS; ++idx) {
		NativeFloat operand = operands[idx & 1];
		accumulator = accumulator + operand;
	}
	// data-dependent side effect so the optimizer cannot discard the loop
	if (accumulator == 1.0625f) {
		std::cout << "dummy case to fool the optimizer\n";
	}
}
138
// Generic set of multiplies for a given number system type:
// alternately scale a running product by 0.99999 and 1.00001.
template<typename NativeFloat>
void MultiplicationWorkload(size_t NR_OPS) {
	std::vector<NativeFloat> factors = { 0.99999f, 1.00001f };
	NativeFloat product{ 1.0625f };
	for (size_t idx = 1; idx < NR_OPS; ++idx) {
		NativeFloat factor = factors[idx & 1];
		product = product * factor;
	}
	// data-dependent side effect so the optimizer cannot discard the loop
	if (product == 1.0625f) {
		std::cout << "dummy case to fool the optimizer\n";
	}
}
152
// Generic set of divides for a given number system type:
// alternately divide a running quotient by 0.99999 and 1.00001.
template<typename NativeFloat>
void DivisionWorkload(size_t NR_OPS) {
	std::vector<NativeFloat> divisors = { 0.99999f, 1.00001f };
	NativeFloat quotient{ 1.0625f };
	for (size_t idx = 1; idx < NR_OPS; ++idx) {
		NativeFloat divisor = divisors[idx & 1];
		quotient = quotient / divisor;
	}
	// data-dependent side effect so the optimizer cannot discard the loop
	if (quotient == 1.0625f) {
		std::cout << "dummy case to fool the optimizer\n";
	}
}
166
167 // measure performance of arithmetic operators
TestArithmeticOperatorPerformance()168 void TestArithmeticOperatorPerformance() {
169 std::cout << "comparative floating-point arithmetic operator performance\n";
170
171 uint64_t NR_OPS = 16 * 1024 * 1024;
172
173 sw::universal::PerformanceRunner("float add/subtract ", AdditionSubtractionWorkload< float >, NR_OPS);
174 sw::universal::PerformanceRunner("double add/subtract ", AdditionSubtractionWorkload< double >, NR_OPS);
175 #if LONG_DOUBLE_SUPPORT
176 sw::universal::PerformanceRunner("long double add/subtract ", AdditionSubtractionWorkload< long double >, NR_OPS);
177 #endif
178
179 sw::universal::PerformanceRunner("float multiply ", MultiplicationWorkload< float >, NR_OPS);
180 sw::universal::PerformanceRunner("double multiply ", MultiplicationWorkload< double >, NR_OPS);
181 #if LONG_DOUBLE_SUPPORT
182 sw::universal::PerformanceRunner("long double multiply ", MultiplicationWorkload< long double >, NR_OPS);
183 #endif
184
185 sw::universal::PerformanceRunner("float division ", DivisionWorkload< float >, NR_OPS);
186 sw::universal::PerformanceRunner("double division ", DivisionWorkload< double >, NR_OPS);
187 #if LONG_DOUBLE_SUPPORT
188 sw::universal::PerformanceRunner("long double division ", DivisionWorkload< long double >, NR_OPS);
189 #endif
190 }
191
192 // special values handling
193
194 template<typename NativeFloat>
CustomPerfRunner(const std::string & tag,void (f)(std::vector<NativeFloat> &),std::vector<NativeFloat> & data)195 void CustomPerfRunner(const std::string& tag, void (f)(std::vector<NativeFloat>&), std::vector<NativeFloat>& data) {
196 using namespace std;
197 using namespace std::chrono;
198
199 size_t NR_OPS = data.size();
200 steady_clock::time_point begin = steady_clock::now();
201 f(data);
202 steady_clock::time_point end = steady_clock::now();
203 duration<double> time_span = duration_cast<duration<double>> (end - begin);
204 double elapsed_time = time_span.count();
205
206 cout << tag << ' ' << setw(10) << NR_OPS << " per " << setw(15) << elapsed_time << "sec -> " << sw::universal::toPowerOfTen(double(NR_OPS) / elapsed_time) << "ops/sec" << endl;
207 }
208
// Smooth adjacent elements in place: data[i] = 0.5 * (data[i] + data[i+1]).
// BUGFIX: on an empty vector, data.size()-1 wraps around (size_t underflow)
// and the loop would index far out of bounds; guard against fewer than two
// elements (behavior for non-trivial inputs is unchanged).
template<typename NativeFloat>
void ArrayWorkload(std::vector<NativeFloat>& data) {
	if (data.size() < 2) return; // nothing to pair up
	for (size_t i = 0; i < data.size() - 1; ++i)
		data[i] = NativeFloat(0.5) * (data[i] + data[i + 1]);
}
214
215 template<typename NativeFloat>
TestSpecialValueWorkload(const std::string & tag,size_t NR_ELEMENTS)216 void TestSpecialValueWorkload(const std::string& tag, size_t NR_ELEMENTS) {
217 std::vector<NativeFloat> data(NR_ELEMENTS);
218
219 for (size_t i = 0; i < NR_ELEMENTS; ++i) data[i] = 0.0;
220 CustomPerfRunner(tag + std::string("zeros "), ArrayWorkload<NativeFloat>, data);
221
222 for (size_t i = 0; i < NR_ELEMENTS; ++i) data[i] = 1.0;
223 CustomPerfRunner(tag + std::string("ones "), ArrayWorkload<NativeFloat>, data);
224
225 for (size_t i = 0; i < NR_ELEMENTS; ++i) data[i] = std::numeric_limits<NativeFloat>::denorm_min();
226 CustomPerfRunner(tag + std::string("subnormals "), ArrayWorkload<NativeFloat>, data);
227
228 for (size_t i = 0; i < NR_ELEMENTS; ++i) data[i] = std::numeric_limits<NativeFloat>::infinity();
229 CustomPerfRunner(tag + std::string("Inf "), ArrayWorkload<NativeFloat>, data);
230
231 for (size_t i = 0; i < NR_ELEMENTS; ++i) data[i] = std::numeric_limits<NativeFloat>::quiet_NaN();
232 CustomPerfRunner(tag + std::string("NaN "), ArrayWorkload<NativeFloat>, data);
233 }
234
// Level-1 regression: special-value processing over the native IEEE types
// plus the small/common cfloat and posit configurations
void TestSpecialValuePerformanceLevel1() {
	std::cout << "comparative floating-point special value processing performance\n";
	constexpr size_t NR_OPS = 1024 * 1024;

	TestSpecialValueWorkload<float>(std::string("float "), NR_OPS);
	TestSpecialValueWorkload<double>(std::string("double "), NR_OPS);
#if LONG_DOUBLE_SUPPORT
	TestSpecialValueWorkload<long double>(std::string("long double "), NR_OPS);
#endif

#ifdef CFLOAT_SUPPORT
	TestSpecialValueWorkload<sw::universal::cfloat< 8, 2>>
		(std::string("cfloat< 8, 2> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::cfloat< 16, 5>>
		(std::string("cfloat< 16, 5> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::cfloat< 32, 8>>
		(std::string("cfloat< 32, 8> "), NR_OPS);
	// larger cfloat configurations disabled for Level-1 runtime budget
//	TestSpecialValueWorkload<sw::universal::cfloat< 64, 11>>
//		(std::string("cfloat< 64,11> "), NR_OPS);
//	TestSpecialValueWorkload<sw::universal::cfloat< 80, 15>>
//		(std::string("cfloat< 80,15> "), NR_OPS);
//	TestSpecialValueWorkload<sw::universal::cfloat<128, 15>>
//		(std::string("cfloat<128,15> "), NR_OPS);
#endif

#ifdef POSIT_SUPPORT
	TestSpecialValueWorkload<sw::universal::posit< 8, 0>>
		(std::string("posit< 8,0> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::posit< 16, 1>>
		(std::string("posit< 16,1> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::posit< 32, 2>>
		(std::string("posit< 32,2> "), NR_OPS);
#endif
}
269
// Level-4 regression: same special-value sweep as Level-1, extended with
// the large posit configurations (64/128/256 bits)
void TestSpecialValuePerformanceLevel4() {
	std::cout << "comparative floating-point special value processing performance\n";
	constexpr size_t NR_OPS = 1024 * 1024;

	TestSpecialValueWorkload<float> (std::string("float "), NR_OPS);
	TestSpecialValueWorkload<double> (std::string("double "), NR_OPS);
#if LONG_DOUBLE_SUPPORT
	TestSpecialValueWorkload<long double>(std::string("long double "), NR_OPS);
#endif

#ifdef CFLOAT_SUPPORT
	TestSpecialValueWorkload<sw::universal::cfloat< 8, 2>>
		(std::string("cfloat< 8, 2> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::cfloat< 16, 5>>
		(std::string("cfloat< 16, 5> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::cfloat< 32, 8>>
		(std::string("cfloat< 32, 8> "), NR_OPS);
	// larger cfloat configurations currently disabled
//	TestSpecialValueWorkload<sw::universal::cfloat< 64, 11>>
//		(std::string("cfloat< 64,11> "), NR_OPS);
//	TestSpecialValueWorkload<sw::universal::cfloat< 80, 15>>
//		(std::string("cfloat< 80,15> "), NR_OPS);
//	TestSpecialValueWorkload<sw::universal::cfloat<128, 15>>
//		(std::string("cfloat<128,15> "), NR_OPS);
#endif

#ifdef POSIT_SUPPORT
	TestSpecialValueWorkload<sw::universal::posit< 8, 0>>
		(std::string("posit< 8,0> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::posit< 16, 1>>
		(std::string("posit< 16,1> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::posit< 32, 2>>
		(std::string("posit< 32,2> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::posit< 64, 3>>
		(std::string("posit< 64,3> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::posit<128, 4>>
		(std::string("posit<128,4> "), NR_OPS);
	TestSpecialValueWorkload<sw::universal::posit<256, 5>>
		(std::string("posit<256,5> "), NR_OPS);
#endif
}
310
311 // Regression testing guards: typically set by the cmake configuration, but MANUAL_TESTING is an override
312 #define MANUAL_TESTING 0
313 // REGRESSION_LEVEL_OVERRIDE is set by the cmake file to drive a specific regression intensity
314 // It is the responsibility of the regression test to organize the tests in a quartile progression.
315 //#undef REGRESSION_LEVEL_OVERRIDE
316 #ifndef REGRESSION_LEVEL_OVERRIDE
317 #define REGRESSION_LEVEL_1 1
318 #define REGRESSION_LEVEL_2 1
319 #define REGRESSION_LEVEL_3 1
320 #define REGRESSION_LEVEL_4 1
321 #endif
322
// benchmark driver: in MANUAL_TESTING mode runs a hand-picked subset,
// otherwise runs the regression levels selected by the cmake configuration
int main()
try {
	using namespace sw::universal;

	std::string test_suite = "native floating-point operator performance benchmarking ";
	std::cout << test_suite << '\n';

	int nrOfFailedTestCases = 0;

#if MANUAL_TESTING

	// quick sanity check that copy assignment works before timing it
	using Scalar = float;
	Scalar a{ 1.0f }, b;
	b = a;
	std::cout << a << " : " << b << std::endl;

	size_t NR_OPS = 10000000;
	PerformanceRunner("float copy ", CopyWorkload< float >, NR_OPS);
	PerformanceRunner("double copy ", CopyWorkload< double >, NR_OPS);

	TestSpecialValuePerformanceLevel4();

	ReportTestSuiteResults(test_suite, nrOfFailedTestCases);
	return EXIT_SUCCESS; // ignore failures
#else

	// regression levels run progressively heavier benchmark suites
#if REGRESSION_LEVEL_1
	TestSpecialValuePerformanceLevel1();
#endif

#if REGRESSION_LEVEL_2
	//TestSpecialValuePerformanceLevel2();
#endif

#if REGRESSION_LEVEL_3
	//TestSpecialValuePerformanceLevel3();
#endif

#if REGRESSION_LEVEL_4
	TestSpecialValuePerformanceLevel4();
#endif

	ReportTestSuiteResults(test_suite, nrOfFailedTestCases);
	return (nrOfFailedTestCases > 0 ? EXIT_FAILURE : EXIT_SUCCESS);
#endif // MANUAL_TESTING
}
catch (char const* msg) {
	// NOTE(review): presumably raw C-string exceptions thrown by universal components — confirm
	std::cerr << "Caught exception: " << msg << '\n';
	return EXIT_FAILURE;
}
catch (const std::runtime_error& err) {
	std::cerr << "Uncaught runtime exception: " << err.what() << std::endl;
	return EXIT_FAILURE;
}
catch (...) {
	// last-resort guard so the benchmark never terminates via an unhandled exception
	std::cerr << "Caught unknown exception" << '\n';
	return EXIT_FAILURE;
}
382
383 /*
384 ETLO
385 Date run : 2/23/2020
386 Processor: Intel Core i7-7500 CPU @ 2.70GHz, 2 cores, 4 threads, 15W mobile processor
387 Memory : 16GB
388 System : 64-bit Windows 10 Pro, Version 1803, x64-based processor, OS build 17134.165
389
390 */
391