1 // performance.cpp : performance benchmarking for native floating-point
2 //
3 // Copyright (C) 2017-2021 Stillwater Supercomputing, Inc.
4 //
5 // This file is part of the universal numbers project, which is released under an MIT Open Source license.
6 #include <universal/utility/directives.hpp>
7 #include <universal/utility/long_double.hpp>
8 #include <universal/utility/bit_cast.hpp>   // TODO: can this be integrated in category headers?
9 #include <chrono>
10 #include <vector>
11 
12 #include <universal/native/ieee754.hpp>
13 #define CFLOAT_SUPPORT
14 #include <universal/number/cfloat/cfloat.hpp>
15 
16 #define POSIT_SUPPORT
17 //#define POSIT_FAST_SPECIALIZATION 1
18 #define POSIT_FAST_POSIT_8_0  0
19 #define POSIT_FAST_POSIT_16_1 1
20 #define POSIT_FAST_POSIT_32_2 1
21 #include <universal/number/posit/posit.hpp>
22 // is representable
23 #include <universal/functions/isrepresentable.hpp>
24 #include <universal/verification/test_suite.hpp>
25 #include <universal/verification/performance_runner.hpp>
26 
/// <summary>
/// Copy benchmark kernel.
/// For every iteration the loop counter is converted to a NativeFloat, copied into a
/// second variable, and the copy is compared against an independent conversion of a
/// counter that runs in lock-step. "COPY FAIL" is printed once if any comparison differed.
/// </summary>
/// <param name="NR_OPS">number of convert/copy/compare iterations to execute</param>
template<typename NativeFloat>
void CopyWorkload(size_t NR_OPS) {
	NativeFloat source, copy, reference;
	bool mismatch = false;

	for (size_t i = 0, j = 0; i < NR_OPS; ++i, ++j) {
		source    = NativeFloat(i);
		copy      = source;            // the operation being measured
		reference = NativeFloat(j);
		mismatch |= (copy != reference);
	}

	// just a quick double check that all went well
	if (mismatch) {
		std::cout << "COPY FAIL\n";
	}
}
44 
45 /*
46 
47 */
48 
49 /// <summary>
50 /// measure performance of copying numbers around
51 /// </summary>
TestCopyPerformance()52 void TestCopyPerformance() {
53 	using namespace sw::universal;
54 	std::cout << "comparative floating-point copy performance\n";
55 
56 	uint64_t NR_OPS = 10000000;
57 	// single block representations
58 	std::cout << "single block representations\n";
59 	PerformanceRunner("float                    copy           ", CopyWorkload< float >, NR_OPS);
60 	PerformanceRunner("double                   copy           ", CopyWorkload< double >, NR_OPS);
61 #if LONG_DOUBLE_SUPPORT
62 	PerformanceRunner("long double              copy           ", CopyWorkload< long double >, NR_OPS);
63 #endif
64 }
65 
66 template<typename NativeFloat>
DecodeWorkload(size_t NR_OPS)67 void DecodeWorkload(size_t NR_OPS) {
68 	using namespace sw::universal;
69 
70 	NativeFloat a{ 1.0f };
71 	size_t success{ 0 };
72 	bool first{ true };
73 	for (size_t i = 0; i < NR_OPS; ++i) {
74 		a = NativeFloat(i);
75 		bool s;
76 		uint64_t e, f;
77 		extractFields(a, s, e, f);
78 		if (s == false) {
79 			++success;
80 		}
81 		else {
82 			if (first) {
83 				first = false;
84 				std::cout << typeid(a).name() << " :\n"
85 					<< to_binary(a) << "\n"
86 					<< "sign    : " << (s ? "-1\n" : "+1\n")
87 					<< "exponent: " << to_binary(e) << "\n"
88 					<< "fraction: " << to_binary(f) << "\n";
89 			}
90 		}
91 	}
92 	if (success == 0) std::cout << "DECODE FAIL\n"; // just a quick double check that all went well
93 }
94 
95 /*
96 
97 */
98 
99 /// <summary>
100 /// measure performance of decode operator
101 /// NOTE: es is <= 11 due to limits of dynamic range of a 64-bit double
102 /// </summary>
TestDecodePerformance()103 void TestDecodePerformance() {
104 	using namespace sw::universal;
105 	std::cout << "comparative floating-point decode operator performance\n";
106 
107 	uint64_t NR_OPS = 100000;
108 	// single block representations
109 	std::cout << "single block representations\n";
110 	PerformanceRunner("float                    decode         ", DecodeWorkload< float >, NR_OPS);
111 	PerformanceRunner("double                   decode         ", DecodeWorkload< double >, NR_OPS);
112 #if LONG_DOUBLE_SUPPORT
113 	PerformanceRunner("long double              decode         ", DecodeWorkload< long double >, NR_OPS);
114 #endif
115 }
116 
// measure performance of conversion operators
// (placeholder: only the section banner is printed, no conversion workload is run)
void TestConversionPerformance() {
	static constexpr char banner[] = "comparative floating-point conversion performance\n";
	std::cout << banner;

	// uint64_t NR_OPS = 1000000;   // kept for when conversion workloads are added
}
124 
// Generic set of adds and subtracts for a given number system type.
// Starting from 1.0625, alternately adds -1.00001 and 0.99999 for NR_OPS - 1 iterations
// (the loop counter starts at 1). The final comparison only exists so that the
// accumulated value is observed and the optimizer cannot discard the loop.
template<typename NativeFloat>
void AdditionSubtractionWorkload(size_t NR_OPS) {
	std::vector<NativeFloat> operands = { 0.99999f, -1.00001f };
	NativeFloat term, accumulator{ 1.0625f };

	size_t i = 1;
	while (i < NR_OPS) {
		term = operands[i % 2];
		accumulator = accumulator + term;
		++i;
	}

	if (accumulator == 1.0625f) {
		std::cout << "dummy case to fool the optimizer\n";
	}
}
138 
// Generic set of multiplies for a given number system type.
// Starting from 1.0625, alternately multiplies by 1.00001 and 0.99999 for NR_OPS - 1
// iterations (the loop counter starts at 1). The final comparison only exists so that
// the accumulated value is observed and the optimizer cannot discard the loop.
template<typename NativeFloat>
void MultiplicationWorkload(size_t NR_OPS) {
	std::vector<NativeFloat> factors = { 0.99999f, 1.00001f };
	NativeFloat factor, product{ 1.0625f };

	size_t i = 1;
	while (i < NR_OPS) {
		factor = factors[i % 2];
		product = product * factor;
		++i;
	}

	if (product == 1.0625f) {
		std::cout << "dummy case to fool the optimizer\n";
	}
}
152 
// Generic set of divides for a given number system type.
// Starting from 1.0625, alternately divides by 1.00001 and 0.99999 for NR_OPS - 1
// iterations (the loop counter starts at 1). The final comparison only exists so that
// the accumulated value is observed and the optimizer cannot discard the loop.
template<typename NativeFloat>
void DivisionWorkload(size_t NR_OPS) {
	std::vector<NativeFloat> divisors = { 0.99999f, 1.00001f };
	NativeFloat divisor, quotient{ 1.0625f };

	size_t i = 1;
	while (i < NR_OPS) {
		divisor = divisors[i % 2];
		quotient = quotient / divisor;
		++i;
	}

	if (quotient == 1.0625f) {
		std::cout << "dummy case to fool the optimizer\n";
	}
}
166 
167 // measure performance of arithmetic operators
TestArithmeticOperatorPerformance()168 void TestArithmeticOperatorPerformance() {
169 	std::cout << "comparative floating-point  arithmetic operator performance\n";
170 
171 	uint64_t NR_OPS = 16 * 1024 * 1024;
172 
173 	sw::universal::PerformanceRunner("float                    add/subtract   ", AdditionSubtractionWorkload< float >, NR_OPS);
174 	sw::universal::PerformanceRunner("double                   add/subtract   ", AdditionSubtractionWorkload< double >, NR_OPS);
175 #if LONG_DOUBLE_SUPPORT
176 	sw::universal::PerformanceRunner("long double              add/subtract   ", AdditionSubtractionWorkload< long double >, NR_OPS);
177 #endif
178 
179 	sw::universal::PerformanceRunner("float                    multiply       ", MultiplicationWorkload< float >, NR_OPS);
180 	sw::universal::PerformanceRunner("double                   multiply       ", MultiplicationWorkload< double >, NR_OPS);
181 #if LONG_DOUBLE_SUPPORT
182 	sw::universal::PerformanceRunner("long double              multiply       ", MultiplicationWorkload< long double >, NR_OPS);
183 #endif
184 
185 	sw::universal::PerformanceRunner("float                    division       ", DivisionWorkload< float >, NR_OPS);
186 	sw::universal::PerformanceRunner("double                   division       ", DivisionWorkload< double >, NR_OPS);
187 #if LONG_DOUBLE_SUPPORT
188 	sw::universal::PerformanceRunner("long double              division       ", DivisionWorkload< long double >, NR_OPS);
189 #endif
190 }
191 
192 // special values handling
193 
194 template<typename NativeFloat>
CustomPerfRunner(const std::string & tag,void (f)(std::vector<NativeFloat> &),std::vector<NativeFloat> & data)195 void CustomPerfRunner(const std::string& tag, void (f)(std::vector<NativeFloat>&), std::vector<NativeFloat>& data) {
196 	using namespace std;
197 	using namespace std::chrono;
198 
199 	size_t NR_OPS = data.size();
200 	steady_clock::time_point begin = steady_clock::now();
201 	f(data);
202 	steady_clock::time_point end = steady_clock::now();
203 	duration<double> time_span = duration_cast<duration<double>> (end - begin);
204 	double elapsed_time = time_span.count();
205 
206 	cout << tag << ' ' << setw(10) << NR_OPS << " per " << setw(15) << elapsed_time << "sec -> " << sw::universal::toPowerOfTen(double(NR_OPS) / elapsed_time) << "ops/sec" << endl;
207 }
208 
/// <summary>
/// Array benchmark kernel: replaces every element except the last with the average of
/// itself and its right-hand neighbour, sweeping front to back in place. The last element
/// is left untouched. Vectors with fewer than two elements are left unchanged.
/// </summary>
/// <param name="data">array that is read and overwritten in place</param>
template<typename NativeFloat>
void ArrayWorkload(std::vector<NativeFloat>& data) {
	// Bound is written as i + 1 < size() rather than i < size() - 1: size() is unsigned,
	// so size() - 1 wraps around for an empty vector and the loop would index out of bounds.
	for (size_t i = 0; i + 1 < data.size(); ++i) {
		data[i] = NativeFloat(0.5) * (data[i] + data[i + 1]);
	}
}
214 
215 template<typename NativeFloat>
TestSpecialValueWorkload(const std::string & tag,size_t NR_ELEMENTS)216 void TestSpecialValueWorkload(const std::string& tag, size_t NR_ELEMENTS) {
217 	std::vector<NativeFloat> data(NR_ELEMENTS);
218 
219 	for (size_t i = 0; i < NR_ELEMENTS; ++i) data[i] = 0.0;
220 	CustomPerfRunner(tag + std::string("zeros          "), ArrayWorkload<NativeFloat>, data);
221 
222 	for (size_t i = 0; i < NR_ELEMENTS; ++i) data[i] = 1.0;
223 	CustomPerfRunner(tag + std::string("ones           "), ArrayWorkload<NativeFloat>, data);
224 
225 	for (size_t i = 0; i < NR_ELEMENTS; ++i) data[i] = std::numeric_limits<NativeFloat>::denorm_min();
226 	CustomPerfRunner(tag + std::string("subnormals     "), ArrayWorkload<NativeFloat>, data);
227 
228 	for (size_t i = 0; i < NR_ELEMENTS; ++i) data[i] = std::numeric_limits<NativeFloat>::infinity();
229 	CustomPerfRunner(tag + std::string("Inf            "), ArrayWorkload<NativeFloat>, data);
230 
231 	for (size_t i = 0; i < NR_ELEMENTS; ++i) data[i] = std::numeric_limits<NativeFloat>::quiet_NaN();
232 	CustomPerfRunner(tag + std::string("NaN            "), ArrayWorkload<NativeFloat>, data);
233 }
234 
TestSpecialValuePerformanceLevel1()235 void TestSpecialValuePerformanceLevel1() {
236 	std::cout << "comparative floating-point special value processing performance\n";
237 	constexpr size_t NR_OPS = 1024 * 1024;
238 
239 	TestSpecialValueWorkload<float>(std::string("float                    "), NR_OPS);
240 	TestSpecialValueWorkload<double>(std::string("double                   "), NR_OPS);
241 #if LONG_DOUBLE_SUPPORT
242 	TestSpecialValueWorkload<long double>(std::string("long double              "), NR_OPS);
243 #endif
244 
245 #ifdef CFLOAT_SUPPORT
246 	TestSpecialValueWorkload<sw::universal::cfloat< 8, 2>>
247 		(std::string("cfloat<  8, 2>           "), NR_OPS);
248 	TestSpecialValueWorkload<sw::universal::cfloat< 16, 5>>
249 		(std::string("cfloat< 16, 5>           "), NR_OPS);
250 	TestSpecialValueWorkload<sw::universal::cfloat< 32, 8>>
251 		(std::string("cfloat< 32, 8>           "), NR_OPS);
252 	//	TestSpecialValueWorkload<sw::universal::cfloat< 64, 11>>
253 	//                                         (std::string("cfloat< 64,11>           "), NR_OPS);
254 	//	TestSpecialValueWorkload<sw::universal::cfloat< 80, 15>>
255 	//                                         (std::string("cfloat< 80,15>           "), NR_OPS);
256 	//	TestSpecialValueWorkload<sw::universal::cfloat<128, 15>>
257 	//                                         (std::string("cfloat< 80,15>           "), NR_OPS);
258 #endif
259 
260 #ifdef POSIT_SUPPORT
261 	TestSpecialValueWorkload<sw::universal::posit<  8, 0>>
262 		(std::string("posit<  8,0>             "), NR_OPS);
263 	TestSpecialValueWorkload<sw::universal::posit< 16, 1>>
264 		(std::string("posit< 16,1>             "), NR_OPS);
265 	TestSpecialValueWorkload<sw::universal::posit< 32, 2>>
266 		(std::string("posit< 32,2>             "), NR_OPS);
267 #endif
268 }
269 
TestSpecialValuePerformanceLevel4()270 void TestSpecialValuePerformanceLevel4() {
271 	std::cout << "comparative floating-point special value processing performance\n";
272 	constexpr size_t NR_OPS = 1024 * 1024;
273 
274 	TestSpecialValueWorkload<float>      (std::string("float                    "), NR_OPS);
275 	TestSpecialValueWorkload<double>     (std::string("double                   "), NR_OPS);
276 #if LONG_DOUBLE_SUPPORT
277 	TestSpecialValueWorkload<long double>(std::string("long double              "), NR_OPS);
278 #endif
279 
280 #ifdef CFLOAT_SUPPORT
281 	TestSpecialValueWorkload<sw::universal::cfloat< 8, 2>>
282 		                                 (std::string("cfloat<  8, 2>           "), NR_OPS);
283 	TestSpecialValueWorkload<sw::universal::cfloat< 16, 5>>
284 		                                 (std::string("cfloat< 16, 5>           "), NR_OPS);
285 	TestSpecialValueWorkload<sw::universal::cfloat< 32, 8>>
286 		                                 (std::string("cfloat< 32, 8>           "), NR_OPS);
287 //	TestSpecialValueWorkload<sw::universal::cfloat< 64, 11>>
288 //                                         (std::string("cfloat< 64,11>           "), NR_OPS);
289 //	TestSpecialValueWorkload<sw::universal::cfloat< 80, 15>>
290 //                                         (std::string("cfloat< 80,15>           "), NR_OPS);
291 //	TestSpecialValueWorkload<sw::universal::cfloat<128, 15>>
292 //                                         (std::string("cfloat< 80,15>           "), NR_OPS);
293 #endif
294 
295 #ifdef POSIT_SUPPORT
296 	TestSpecialValueWorkload<sw::universal::posit<  8, 0>>
297 		                                 (std::string("posit<  8,0>             "), NR_OPS);
298 	TestSpecialValueWorkload<sw::universal::posit< 16, 1>>
299 		                                 (std::string("posit< 16,1>             "), NR_OPS);
300 	TestSpecialValueWorkload<sw::universal::posit< 32, 2>>
301 		                                 (std::string("posit< 32,2>             "), NR_OPS);
302 	TestSpecialValueWorkload<sw::universal::posit< 64, 3>>
303 		                                 (std::string("posit< 64,3>             "), NR_OPS);
304 	TestSpecialValueWorkload<sw::universal::posit<128, 4>>
305                                          (std::string("posit<128,4>             "), NR_OPS);
306 	TestSpecialValueWorkload<sw::universal::posit<256, 5>>
307                                          (std::string("posit<256,5>             "), NR_OPS);
308 #endif
309 }
310 
311 // Regression testing guards: typically set by the cmake configuration, but MANUAL_TESTING is an override
312 #define MANUAL_TESTING 0
313 // REGRESSION_LEVEL_OVERRIDE is set by the cmake file to drive a specific regression intensity
314 // It is the responsibility of the regression test to organize the tests in a quartile progression.
315 //#undef REGRESSION_LEVEL_OVERRIDE
316 #ifndef REGRESSION_LEVEL_OVERRIDE
317 #define REGRESSION_LEVEL_1 1
318 #define REGRESSION_LEVEL_2 1
319 #define REGRESSION_LEVEL_3 1
320 #define REGRESSION_LEVEL_4 1
321 #endif
322 
main()323 int main()
324 try {
325 	using namespace sw::universal;
326 
327 	std::string test_suite = "native floating-point operator performance benchmarking ";
328 	std::cout << test_suite << '\n';
329 
330 	int nrOfFailedTestCases = 0;
331 
332 #if MANUAL_TESTING
333 
334 	using Scalar = float;
335 	Scalar a{ 1.0f }, b;
336 	b = a;
337 	std::cout << a << " : " << b << std::endl;
338 
339 	size_t NR_OPS = 10000000;
340 	PerformanceRunner("float                    copy           ", CopyWorkload< float >, NR_OPS);
341 	PerformanceRunner("double                   copy           ", CopyWorkload< double >, NR_OPS);
342 
343 	TestSpecialValuePerformanceLevel4();
344 
345 	ReportTestSuiteResults(test_suite, nrOfFailedTestCases);
346 	return EXIT_SUCCESS; // ignore failures
347 #else
348 
349 
350 #if REGRESSION_LEVEL_1
351 	TestSpecialValuePerformanceLevel1();
352 #endif
353 
354 #if REGRESSION_LEVEL_2
355 	//TestSpecialValuePerformanceLevel2();
356 #endif
357 
358 #if REGRESSION_LEVEL_3
359 	//TestSpecialValuePerformanceLevel3();
360 #endif
361 
362 #if REGRESSION_LEVEL_4
363 	TestSpecialValuePerformanceLevel4();
364 #endif
365 
366 	ReportTestSuiteResults(test_suite, nrOfFailedTestCases);
367 	return (nrOfFailedTestCases > 0 ? EXIT_FAILURE : EXIT_SUCCESS);
368 #endif  // MANUAL_TESTING
369 }
370 catch (char const* msg) {
371 	std::cerr << "Caught exception: " << msg << '\n';
372 	return EXIT_FAILURE;
373 }
374 catch (const std::runtime_error& err) {
375 	std::cerr << "Uncaught runtime exception: " << err.what() << std::endl;
376 	return EXIT_FAILURE;
377 }
378 catch (...) {
379 	std::cerr << "Caught unknown exception" << '\n';
380 	return EXIT_FAILURE;
381 }
382 
383 /*
384 ETLO
385 Date run : 2/23/2020
386 Processor: Intel Core i7-7500 CPU @ 2.70GHz, 2 cores, 4 threads, 15W mobile processor
387 Memory   : 16GB
388 System   : 64-bit Windows 10 Pro, Version 1803, x64-based processor, OS build 17134.165
389 
390 */
391