1 #pragma once
2 // cfloat.hpp: 'classic' float: definition of an arbitrary configuration linear floating-point representation
3 //
4 // Copyright (C) 2017-2021 Stillwater Supercomputing, Inc.
5 //
6 // This file is part of the universal numbers project, which is released under an MIT Open Source license.
7 
8 // compiler specific environment has been delegated to be handled by the
9 // number system include file <universal/number/cfloat.hpp>
10 //
11 // supporting types and functions
12 #include <universal/native/ieee754.hpp>
13 #include <universal/native/subnormal.hpp>
14 #include <universal/native/bit_functions.hpp>
15 #include <universal/number/shared/nan_encoding.hpp>
16 #include <universal/number/shared/infinite_encoding.hpp>
17 #include <universal/number/shared/specific_value_encoding.hpp>
18 // cfloat exception structure
19 #include <universal/number/cfloat/exceptions.hpp>
20 // cfloat tracing options
21 #include <universal/number/cfloat/trace_constants.hpp>
22 // composition types used by cfloat
23 #include <universal/internal/blockbinary/blockbinary.hpp>
24 #include <universal/internal/blocktriple/blocktriple.hpp>
25 
26 #ifndef CFLOAT_THROW_ARITHMETIC_EXCEPTION
27 #define CFLOAT_THROW_ARITHMETIC_EXCEPTION 0
28 #endif
29 #ifndef TRACE_CONVERSION
30 #define TRACE_CONVERSION 0
31 #endif
32 
33 namespace sw::universal {
34 
35 /*
36  * classic floats have denorms, but no gradual overflow, and
37  * project values outside of their dynamic range to +-inf
38  *
39  * Behavior flags
40  *   gradual underflow: use all fraction encodings when exponent is all 0's
41  *   gradual overflow: use all fraction encodings when exponent is all 1's
42  *   saturation to maxneg or maxpos when value is out of dynamic range
43  */
44 // Forward definitions
45 template<size_t nbits, size_t es, typename bt,
46 	bool hasSubnormals, bool hasSupernormals, bool isSaturating> class cfloat;
47 template<size_t nbits, size_t es, typename bt,
48 	bool hasSubnormals, bool hasSupernormals, bool isSaturating>
49 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>
50 	abs(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>&);
51 
52 /// <summary>
53 /// decode an cfloat value into its constituent parts
54 /// </summary>
55 /// <typeparam name="bt"></typeparam>
56 /// <param name="v"></param>
57 /// <param name="s"></param>
58 /// <param name="e"></param>
59 /// <param name="f"></param>
60 template<size_t nbits, size_t es, size_t fbits, typename bt,
61 	bool hasSubnormals, bool hasSupernormals, bool isSaturating>
decode(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v,bool & s,blockbinary<es,bt> & e,blockbinary<fbits,bt> & f)62 void decode(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& v, bool& s, blockbinary<es, bt>& e, blockbinary<fbits, bt>& f) {
63 	v.sign(s);
64 	v.exponent(e);
65 	v.fraction(f);
66 }
67 
68 /// <summary>
69 /// return the binary scale of the given number
70 /// </summary>
71 /// <typeparam name="bt">Block type used for storage: derived through ADL</typeparam>
72 /// <param name="v">the cfloat number for which we seek to know the binary scale</param>
73 /// <returns>binary scale, i.e. 2^scale, of the value of the cfloat</returns>
74 template<size_t nbits, size_t es, typename bt,
75 	bool hasSubnormals, bool hasSupernormals, bool isSaturating>
scale(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v)76 int scale(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& v) {
77 	return v.scale();
78 }
79 
80 /// <summary>
81 /// parse a text string into a cfloat value
82 /// </summary>
83 /// <typeparam name="bt"></typeparam>
84 /// <param name="str"></param>
85 /// <returns></returns>
86 template<size_t nbits, size_t es, typename bt,
87 	bool hasSubnormals, bool hasSupernormals, bool isSaturating>
88 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>
parse(const std::string & stringRep)89 parse(const std::string& stringRep) {
90 	cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> a{ 0 };
91 	return a.assign(stringRep);
92 }
93 
94 /// <summary>
95 /// convert a blocktriple to a cfloat. blocktriples come out of the arithmetic
96 /// engine in the form ii.ff...ff and a scale. The conversion must take this
97 /// denormalized form into account during conversion.
98 ///
99 /// The blocktriple must be in this form to round correctly, as all the bits
100 /// after an arithmetic operation must be taken into account.
101 ///
102 /// Transformation:
103 ///    ii.ff...ff  transform to    s.eee.fffff
104 /// All number systems that depend on blocktriple will need to have
105 /// the rounding decision answered, so that functionality can be
106 /// reused if we locate it inside blocktriple.
107 ///
108 /// if (srcbits > fbits) // we need to round
109 ///     if (ii.00..00 > 1)
110 ///         mask is at srcbits - fbits + 1
111 ///     else
112 ///		    mask is at srcbits - fbits
113 /// }
114 /// else {               // no need to round
115 /// }
116 /// </summary>
117 /// <typeparam name="bt">type of the block used for cfloat storage</typeparam>
118 /// <param name="src">the blocktriple to be converted</param>
119 /// <param name="tgt">the resulting cfloat</param>
120 template<size_t srcbits, BlockTripleOperator op, size_t nbits, size_t es, typename bt,
121 	bool hasSubnormals, bool hasSupernormals, bool isSaturating>
convert(const blocktriple<srcbits,op,bt> & src,cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & tgt)122 inline /*constexpr*/ void convert(const blocktriple<srcbits, op, bt>& src,
123 	                              cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& tgt) {
124 //	std::cout << "convert: " << to_binary(src) << std::endl;
125 	using btType = blocktriple<srcbits, op, bt>;
126 	using cfloatType = cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>;
127 	// test special cases
128 	if (src.isnan()) {
129 		tgt.setnan(src.sign() ? NAN_TYPE_SIGNALLING : NAN_TYPE_QUIET);
130 	}
131 	else	if (src.isinf()) {
132 		tgt.setinf(src.sign());
133 	}
134 	else 	if (src.iszero()) {
135 		tgt.setzero();
136 		tgt.setsign(src.sign()); // preserve sign
137 	}
138 	else {
139 		int significantScale = src.significantscale();
140 		int exponent = src.scale() + significantScale;
141 		// special case of underflow
142 		if constexpr (hasSubnormals) {
143 //			std::cout << "exponent = " << exponent << " bias = " << cfloatType::EXP_BIAS << " exp subnormal = " << cfloatType::MIN_EXP_SUBNORMAL << '\n';
144 			//if (exponent < (cfloatType::MIN_EXP_SUBNORMAL - 1) || exponent + cfloatType::EXP_BIAS < 0) { // this culls subnormal values too much
145 			if (exponent < (cfloatType::MIN_EXP_SUBNORMAL - 1)) {
146 
147 				tgt.setzero();
148 				return;
149 			}
150 		}
151 		else {
152 			if (exponent < (cfloatType::MIN_EXP_NORMAL - 1) || exponent + cfloatType::EXP_BIAS < 0) {
153 			//if (exponent < (cfloatType::MIN_EXP_NORMAL - 1)) {
154 				tgt.setzero();
155 				return;
156 			}
157 		}
158 		// special case of overflow
159 		if constexpr (hasSupernormals) {
160 			if constexpr (isSaturating) {
161 				if (exponent > cfloatType::MAX_EXP) {
162 					if (src.sign()) tgt.maxneg(); else tgt.maxpos();
163 					return;
164 				}
165 			}
166 			else {
167 				if (exponent > cfloatType::MAX_EXP) {
168 					tgt.setinf(src.sign());
169 					return;
170 				}
171 			}
172 		}
173 		else {  // no supernormals will saturate at a different encoding: TODO can we hide it all in maxpos?
174 			if constexpr (isSaturating) {
175 				if (exponent > cfloatType::MAX_EXP) {
176 					if (src.sign()) tgt.maxneg(); else tgt.maxpos();
177 					return;
178 				}
179 			}
180 			else {
181 				if (exponent > cfloatType::MAX_EXP) {
182 					tgt.setinf(src.sign());
183 					return;
184 				}
185 			}
186 		}
187 
188 		// our value needs to go through rounding to be correctly interpreted
189 		//
190 		// tgt.clear();  // no need as all bits are going to be set by the code below
191 		int adjustment{ 0 };
192 		if constexpr (btType::bfbits < 65) {
193 			// we can use a uint64_t to construct the cfloat
194 			if constexpr (hasSubnormals) {
195 				if (exponent >= cfloatType::MIN_EXP_SUBNORMAL && exponent < cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>::MIN_EXP_NORMAL) {
196 					// the value is in the subnormal range of the cfloat
197 
198 					// -exponent because we are right shifting and exponent in this range is negative
199 					adjustment = -(exponent + subnormal_reciprocal_shift[es]);
200 					// this is the right shift adjustment required for subnormal representation due
201 					// to the scale of the input number, i.e. the exponent of 2^-adjustment
202 				}
203 				else {
204 					// the value is in the normal range of the cfloat
205 				}
206 			}
207 
208 
209 			// process sign
210 			uint64_t raw = (src.sign() ? 1ull : 0ull);
211 //			std::cout << "raw bits (sign)  " << to_binary(raw) << '\n';
212 			// get the rounding direction and the LSB right shift:
213 			// TODO: do we want to support arbitrary blocktriples instead of the ALU output versions?
214 			std::pair<bool, size_t> alignment = src.roundingDecision(adjustment);
215 			bool roundup = alignment.first;
216 			size_t rightShift = alignment.second;  // this is the shift to get the LSB of the src to the LSB of the tgt
217 //			std::cout << "round-up?        " << (roundup ? "yes" : "no") << '\n';
218 //			std::cout << "rightShift       " << rightShift << '\n';
219 			// process exponent
220 
221 			uint64_t expBits = static_cast<uint64_t>(static_cast<long long>(exponent) + static_cast<long long>(cfloatType::EXP_BIAS)); // this is guaranteed to be positive
222 			raw <<= es; // shift sign to make room for the exponent bits
223 			raw |= (roundup ? (expBits+1) : expBits);
224 //			std::cout << "raw bits (exp)   " << to_binary(raw) << '\n';
225 			// process fraction bits
226 			uint64_t fracbits = src.get_ull(); // get all the bits, including the integer bits
227 //			std::cout << "fracbits         " << to_binary(fracbits) << '\n';
228 			raw <<= cfloatType::fbits;
229 //			int rightShift = cfloatType::MIN_EXP_NORMAL - static_cast<int>(exponent) + (srcbits - cfloatType::fbits);
230 //			std::cout << "right shift      " << rightShift << '\n';
231 			fracbits >>= rightShift;
232 //			std::cout << "fracbits shifted " << to_binary(fracbits) << '\n';
233 			fracbits &= cfloatType::ALL_ONES_FR;
234 //			std::cout << "fracbits masked  " << to_binary(fracbits) << '\n';
235 			raw |= fracbits;
236 			tgt.setbits(raw);
237 //			std::cout << "raw bits (all)   " << to_binary(raw) << '\n';
238 		}
239 		else {
240 			// TODO
241 			// compose the segments
242 			tgt.setsign(src.sign());
243 			tgt.setexponent(src.scale());
244 			// this api doesn't work: tgt.setfraction(src.significant());
245 			std::cerr << "convert nbits > 64 TBD\n";
246 		}
247 	}
248 }
249 
250 
251 /// <summary>
252 /// An arbitrary, fixed-size floating-point number with configurable gradual under/overflow and saturation/non-saturation arithmetic.
253 /// Default configuration offers normal encoding and non-saturating arithmetic.
254 /// /// </summary>
255 /// <typeparam name="nbits">number of bits in the encoding</typeparam>
256 /// <typeparam name="es">number of exponent bits in the encoding</typeparam>
257 /// <typeparam name="bt">the type to use as storage class: one of [uint8_t|uint16_t|uint32_t]</typeparam>
258 /// <typeparam name="hasSubnormals">configure gradual underflow (==subnormals)</typeparam>
259 /// <typeparam name="hasSupernormals">configure graudal overflow (==supernormals)</typeparam>
260 /// <typeparam name="isSaturating">configure saturation arithmetic</typeparam>
261 template<size_t _nbits, size_t _es, typename bt = uint8_t,
262 	bool _hasSubnormals = false, bool _hasSupernormals = false, bool _isSaturating = false>
263 class cfloat {
264 public:
265 	static_assert(_nbits > _es + 1ull, "nbits is too small to accomodate the requested number of exponent bits");
266 	static_assert(_es < 21ull, "my God that is a big number, are you trying to break the Interweb?");
267 	static_assert(_es > 0, "number of exponent bits must be bigger than 0 to be a classic floating point number");
268 	// how do you assert on the condition that if es == 1 then subnormals and supernormals must be true?
269 	static constexpr bool subsuper = (_hasSubnormals && _hasSupernormals);
270 	static constexpr bool special = (subsuper ? true : (_es > 1));
271 	static_assert(special, "when es == 1, cfloat must have both subnormals and supernormals");
272 	static constexpr size_t bitsInByte = 8ull;
273 	static constexpr size_t bitsInBlock = sizeof(bt) * bitsInByte;
274 	static_assert(bitsInBlock <= 64, "storage unit for block arithmetic needs to be <= uint64_t"); // TODO: carry propagation on uint64_t requires assembly code
275 
276 	static constexpr size_t nbits = _nbits;
277 	static constexpr size_t es = _es;
278 	static constexpr size_t fbits  = nbits - 1ull - es;    // number of fraction bits excluding the hidden bit
279 	static constexpr size_t fhbits = nbits - es;           // number of fraction bits including the hidden bit
280 	static constexpr size_t abits = 2 * fhbits;            // size of the addend
281 	static constexpr size_t mbits = 2ull * fbits + 1ull;   // size of the multiplier output
282 	static constexpr size_t divbits = 3ull * fhbits + 4ull;// size of the divider output
283 
284 	static constexpr size_t storageMask = (0xFFFFFFFFFFFFFFFFull >> (64ull - bitsInBlock));
285 	static constexpr bt ALL_ONES = bt(~0); // block type specific all 1's value
286 	static constexpr uint32_t ALL_ONES_ES = (0xFFFF'FFFFul >> (32 - es));
287 	static constexpr uint64_t topfbits = fbits % 64;
288 	static constexpr uint64_t FR_SHIFT = (topfbits > 0 ? (64 - topfbits) : 0);
289 	static constexpr uint64_t ALL_ONES_FR = (topfbits > 0 ? (0xFFFF'FFFF'FFFF'FFFFull >> FR_SHIFT) : 0ull); // special case for nbits <= 64
290 	static constexpr uint64_t INF_ENCODING = (ALL_ONES_FR & ~1ull);
291 
292 	static constexpr size_t nrBlocks = 1ull + ((nbits - 1ull) / bitsInBlock);
293 	static constexpr size_t MSU = nrBlocks - 1ull; // MSU == Most Significant Unit, as MSB is already taken
294 	static constexpr bt     MSU_MASK = (ALL_ONES >> (nrBlocks * bitsInBlock - nbits));
295 	static constexpr size_t bitsInMSU = bitsInBlock - (nrBlocks * bitsInBlock - nbits);
296 	static constexpr size_t fBlocks = 1ull + ((fbits - 1ull) / bitsInBlock); // nr of blocks with fraction bits
297 	static constexpr size_t FSU = fBlocks - 1ull;  // FSU = Fraction Significant Unit: the index of the block that contains the most significant fraction bits
298 	static constexpr bt     FSU_MASK = (ALL_ONES >> (fBlocks * bitsInBlock - fbits));
299 	static constexpr size_t bitsInFSU = bitsInBlock - (fBlocks * bitsInBlock - fbits);
300 
301 	static constexpr bt SIGN_BIT_MASK = bt(bt(1ull) << ((nbits - 1ull) % bitsInBlock));
302 	static constexpr bt LSB_BIT_MASK = bt(1ull);
303 	static constexpr bool MSU_CAPTURES_EXP = (1ull + es) <= bitsInMSU;
304 	static constexpr size_t EXP_SHIFT = (MSU_CAPTURES_EXP ? (1 == nrBlocks ? (nbits - 1ull - es) : (bitsInMSU - 1ull - es)) : 0);
305 	static constexpr bt MSU_EXP_MASK = ((ALL_ONES << EXP_SHIFT) & ~SIGN_BIT_MASK) & MSU_MASK;
306 	static constexpr int EXP_BIAS = ((1l << (es - 1ull)) - 1l);
307 	static constexpr int MAX_EXP = (es == 1) ? 1 : ((1l << es) - EXP_BIAS - 1);
308 	static constexpr int MIN_EXP_NORMAL = 1 - EXP_BIAS;
309 	static constexpr int MIN_EXP_SUBNORMAL = 1 - EXP_BIAS - int(fbits); // the scale of smallest ULP
310 	static constexpr bt BLOCK_MASK = bt(~0);
311 
312 	static constexpr bool hasSubnormals   = _hasSubnormals;
313 	static constexpr bool hasSupernormals = _hasSupernormals;
314 	static constexpr bool isSaturating    = _isSaturating;
315 	typedef bt BlockType;
316 
317 	// constructors
cfloat()318 	constexpr cfloat() noexcept : _block{ 0 } {};
319 
320 	constexpr cfloat(const cfloat&) noexcept = default;
321 	constexpr cfloat(cfloat&&) noexcept = default;
322 
323 	constexpr cfloat& operator=(const cfloat&) noexcept = default;
324 	constexpr cfloat& operator=(cfloat&&) noexcept = default;
325 
326 	// decorated/converting constructors
cfloat(const std::string & stringRep)327 	constexpr cfloat(const std::string& stringRep) {
328 		assign(stringRep);
329 	}
330 	/// <summary>
331 	/// construct an cfloat from another, block type bt must be the same
332 	/// </summary>
333 	/// <param name="rhs"></param>
334 	template<size_t nnbits, size_t ees>
cfloat(const cfloat<nnbits,ees,bt,hasSubnormals,hasSupernormals,isSaturating> & rhs)335 	cfloat(const cfloat<nnbits, ees, bt, hasSubnormals, hasSupernormals, isSaturating>& rhs) {
336 		// this->assign(rhs);
337 	}
338 
339 	// specific value constructor
cfloat(const SpecificValue code)340 	constexpr cfloat(const SpecificValue code) noexcept
341 		: _block{ 0 } {
342 		switch (code) {
343 		case SpecificValue::maxpos:
344 			maxpos();
345 			break;
346 		case SpecificValue::minpos:
347 			minpos();
348 			break;
349 		case SpecificValue::zero:
350 		default:
351 			zero();
352 			break;
353 		case SpecificValue::minneg:
354 			minneg();
355 			break;
356 		case SpecificValue::maxneg:
357 			maxneg();
358 			break;
359 		case SpecificValue::infpos:
360 			setinf(false);
361 			break;
362 		case SpecificValue::infneg:
363 			setinf(true);
364 			break;
365 		case SpecificValue::nar: // approximation as cfloats don't have a NaR
366 		case SpecificValue::qnan:
367 			setnan(NAN_TYPE_QUIET);
368 			break;
369 		case SpecificValue::snan:
370 			setnan(NAN_TYPE_SIGNALLING);
371 			break;
372 		}
373 	}
374 
375 	/// <summary>
376 	/// construct an cfloat from a native type, specialized for size
377 	/// </summary>
378 	/// <param name="iv">initial value to construct</param>
cfloat(signed char iv)379 	constexpr cfloat(signed char iv)                    noexcept : _block{ 0 } { *this = iv; }
cfloat(short iv)380 	constexpr cfloat(short iv)                          noexcept : _block{ 0 } { *this = iv; }
cfloat(int iv)381 	constexpr cfloat(int iv)                            noexcept : _block{ 0 } { *this = iv; }
cfloat(long iv)382 	constexpr cfloat(long iv)                           noexcept : _block{ 0 } { *this = iv; }
cfloat(long long iv)383 	constexpr cfloat(long long iv)                      noexcept : _block{ 0 } { *this = iv; }
cfloat(char iv)384 	constexpr cfloat(char iv)                           noexcept : _block{ 0 } { *this = iv; }
cfloat(unsigned short iv)385 	constexpr cfloat(unsigned short iv)                 noexcept : _block{ 0 } { *this = iv; }
cfloat(unsigned int iv)386 	constexpr cfloat(unsigned int iv)                   noexcept : _block{ 0 } { *this = iv; }
cfloat(unsigned long iv)387 	constexpr cfloat(unsigned long iv)                  noexcept : _block{ 0 } { *this = iv; }
cfloat(unsigned long long iv)388 	constexpr cfloat(unsigned long long iv)             noexcept : _block{ 0 } { *this = iv; }
cfloat(float iv)389 	CONSTEXPRESSION cfloat(float iv)                    noexcept : _block{ 0 } { *this = iv; }
cfloat(double iv)390 	CONSTEXPRESSION cfloat(double iv)                   noexcept : _block{ 0 } { *this = iv; }
391 
392 	// assignment operators
operator =(signed char rhs)393 	constexpr cfloat& operator=(signed char rhs)        noexcept { return convert_signed_integer(rhs); }
operator =(short rhs)394 	constexpr cfloat& operator=(short rhs)              noexcept { return convert_signed_integer(rhs); }
operator =(int rhs)395 	constexpr cfloat& operator=(int rhs)                noexcept { return convert_signed_integer(rhs); }
operator =(long rhs)396 	constexpr cfloat& operator=(long rhs)               noexcept { return convert_signed_integer(rhs); }
operator =(long long rhs)397 	constexpr cfloat& operator=(long long rhs)          noexcept { return convert_signed_integer(rhs); }
398 
operator =(char rhs)399 	constexpr cfloat& operator=(char rhs)               noexcept { return convert_unsigned_integer(rhs); }
operator =(unsigned short rhs)400 	constexpr cfloat& operator=(unsigned short rhs)     noexcept { return convert_unsigned_integer(rhs); }
operator =(unsigned int rhs)401 	constexpr cfloat& operator=(unsigned int rhs)       noexcept { return convert_unsigned_integer(rhs); }
operator =(unsigned long rhs)402 	constexpr cfloat& operator=(unsigned long rhs)      noexcept { return convert_unsigned_integer(rhs); }
operator =(unsigned long long rhs)403 	constexpr cfloat& operator=(unsigned long long rhs) noexcept { return convert_unsigned_integer(rhs); }
404 
operator =(float rhs)405 	CONSTEXPRESSION cfloat& operator=(float rhs)        noexcept { return convert_ieee754(rhs); }
operator =(double rhs)406 	CONSTEXPRESSION cfloat& operator=(double rhs)       noexcept { return convert_ieee754(rhs); }
407 
408 	// guard long double support to enable ARM and RISC-V embedded environments
409 #if LONG_DOUBLE_SUPPORT
cfloat(long double iv)410 	CONSTEXPRESSION cfloat(long double iv)  noexcept : _block{ 0 } { *this = iv; }
operator =(long double rhs)411 	CONSTEXPRESSION cfloat& operator=(long double rhs)  noexcept { return convert_ieee754(rhs); }
operator long double() const412 	explicit operator long double() const { return to_native<long double>(); }
413 #endif
414 
415 	// arithmetic operators
416 	// prefix operator
operator -() const417 	inline cfloat operator-() const {
418 		cfloat tmp(*this);
419 		tmp._block[MSU] ^= SIGN_BIT_MASK;
420 		return tmp;
421 	}
422 
operator +=(const cfloat & rhs)423 	cfloat& operator+=(const cfloat& rhs) {
424 		if constexpr (cfloat_trace_add) std::cout << "---------------------- ADD -------------------" << std::endl;
425 		// special case handling of the inputs
426 #if CFLOAT_THROW_ARITHMETIC_EXCEPTION
427 		if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
428 			throw cfloat_operand_is_nan{};
429 		}
430 #else
431 		if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
432 			setnan(NAN_TYPE_SIGNALLING);
433 			return *this;
434 		}
435 		if (isnan(NAN_TYPE_QUIET) || rhs.isnan(NAN_TYPE_QUIET)) {
436 			setnan(NAN_TYPE_QUIET);
437 			return *this;
438 		}
439 #endif
440 		// normal + inf  = inf
441 		// normal + -inf = -inf
442 		// inf + normal = inf
443 		// inf + inf    = inf
444 		// inf + -inf    = ?
445 		// -inf + normal = -inf
446 		// -inf + -inf   = -inf
447 		// -inf + inf    = ?
448 		if (isinf()) {
449 			if (rhs.isinf()) {
450 				if (sign() != rhs.sign()) {
451 					setnan(NAN_TYPE_SIGNALLING);
452 				}
453 				return *this;
454 			}
455 			else {
456 				return *this;
457 			}
458 		}
459 		else {
460 			if (rhs.isinf()) {
461 				*this = rhs;
462 				return *this;
463 			}
464 		}
465 
466 		if (iszero()) {
467 			*this = rhs;
468 			return *this;
469 		}
470 		if (rhs.iszero()) return *this;
471 
472 		// arithmetic operation
473 		blocktriple<fbits, BlockTripleOperator::ADD, bt> a, b, sum;
474 
475 		// transform the inputs into (sign,scale,significant)
476 		// triples of the correct width
477 		normalizeAddition(a);
478 		rhs.normalizeAddition(b);
479 		sum.add(a, b);
480 
481 		convert(sum, *this);
482 
483 		return *this;
484 	}
operator +=(double rhs)485 	cfloat& operator+=(double rhs) {
486 		return *this += cfloat(rhs);
487 	}
operator -=(const cfloat & rhs)488 	cfloat& operator-=(const cfloat& rhs) {
489 		if constexpr (cfloat_trace_sub) std::cout << "---------------------- SUB -------------------" << std::endl;
490 		if (rhs.isnan())
491 			return *this += rhs;
492 		else
493 			return *this += -rhs;
494 	}
operator -=(double rhs)495 	cfloat& operator-=(double rhs) {
496 		return *this -= cfloat(rhs);
497 	}
operator *=(const cfloat & rhs)498 	cfloat& operator*=(const cfloat& rhs) {
499 		if constexpr (cfloat_trace_mul) std::cout << "---------------------- MUL -------------------" << std::endl;
500 		// special case handling of the inputs
501 #if CFLOAT_THROW_ARITHMETIC_EXCEPTION
502 		if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
503 			throw cfloat_operand_is_nan{};
504 		}
505 #else
506 		if (isnan(NAN_TYPE_SIGNALLING) || rhs.isnan(NAN_TYPE_SIGNALLING)) {
507 			setnan(NAN_TYPE_SIGNALLING);
508 			return *this;
509 		}
510 		if (isnan(NAN_TYPE_QUIET) || rhs.isnan(NAN_TYPE_QUIET)) {
511 			setnan(NAN_TYPE_QUIET);
512 			return *this;
513 		}
514 #endif
515 		//  inf * inf = inf
516 		//  inf * -inf = -inf
517 		// -inf * inf = -inf
518 		// -inf * -inf = inf
519 		//	0 * inf = -nan(ind)
520 		bool resultSign = sign() != rhs.sign();
521 		if (isinf()) {
522 			if (rhs.isinf()) {
523 				setsign(resultSign);
524 				return *this;
525 			}
526 			else {
527 				setnan(NAN_TYPE_SIGNALLING);
528 				return *this;
529 			}
530 		}
531 		else {
532 			if (rhs.isinf()) {
533 				setnan(NAN_TYPE_SIGNALLING);
534 				return *this;
535 			}
536 		}
537 
538 		if (iszero() || rhs.iszero()) {
539 			setzero();
540 			setsign(resultSign); // deal with negative 0
541 			return *this;
542 		}
543 
544 		// arithmetic operation
545 		blocktriple<fbits, BlockTripleOperator::MUL, bt> a, b, product;
546 
547 		// transform the inputs into (sign,scale,significant)
548 		// triples of the correct width
549 		normalizeMultiplication(a);
550 		rhs.normalizeMultiplication(b);
551 		product.mul(a, b);
552 
553 		convert(product, *this);
554 
555 		if constexpr (cfloat_trace_mul) {
556 			std::cout << to_binary(a) << " * " << to_binary(b) << " = " << to_binary(product) << '\n';
557 		}
558 		return *this;
559 	}
operator *=(double rhs)560 	cfloat& operator*=(double rhs) {
561 		return *this *= cfloat(rhs);
562 	}
operator /=(const cfloat & rhs)563 	cfloat& operator/=(const cfloat& rhs) {
564 		if constexpr (cfloat_trace_div) std::cout << "---------------------- DIV -------------------" << std::endl;
565 #if CFLOAT_THROW_ARITHMETIC_EXCEPTION
566 		if (rhs.iszero()) throw cfloat_divide_by_zero();
567 		if (rhs.isnan()) throw cfloat_divide_by_nan();
568 		if (isnan()) throw cfloat_operand_is_nan();
569 #else
570 		if (rhs.iszero()) std::cerr << "cfloat_negative_sqrt_argument" << std::endl;
571 		if (rhs.isnan()) std::cerr << "cfloat_divide_by_nan" << std::endl;
572 		if (isnan()) std::cerr << "cfloat_operand_is_nan" << std::endl;
573 #endif
574 		return *this;
575 	}
operator /=(double rhs)576 	cfloat& operator/=(double rhs) {
577 		return *this /= cfloat(rhs);
578 	}
579 	/// <summary>
580 	/// move to the next bit encoding modulo 2^nbits
581 	/// </summary>
582 	/// <typeparam name="bt"></typeparam>
operator ++()583 	inline cfloat& operator++() {
584 		if constexpr (0 == nrBlocks) {
585 			return *this;
586 		}
587 		else if constexpr (1 == nrBlocks) {
588 			if (ispos()) {
589 				if ((_block[MSU] & (MSU_MASK >> 1)) == (MSU_MASK >> 1)) { // pattern: 0.11.111 = nan
590 					_block[MSU] |= SIGN_BIT_MASK; // pattern: 1.11.111 = snan
591 				}
592 				else {
593 					++_block[MSU];
594 				}
595 			}
596 			else {
597 				if ((_block[MSU] & SIGN_BIT_MASK) == _block[MSU]) { // pattern: 1.00.000 = -0
598 					_block[MSU] = 0; // pattern: 0.00.000 = +0
599 				}
600 				else {
601 					--_block[MSU];
602 				}
603 			}
604 		}
605 		else {
606 			if (ispos()) {
607 				// special case: pattern: 0.11.111 = nan transitions to pattern: 1.11.111 = snan
608 				if (isnanencoding()) {
609 					setnan(NAN_TYPE_SIGNALLING);
610 				}
611 				else {
612 					bool carry = true;
613 					for (unsigned i = 0; i < MSU; ++i) {
614 						if (carry) {
615 							if ((_block[i] & storageMask) == storageMask) { // block will overflow
616 								_block[i] = 0;
617 								carry = true;
618 							}
619 							else {
620 								++_block[i];
621 								carry = false;
622 							}
623 						}
624 					}
625 					if (carry) {
626 						++_block[MSU];
627 					}
628 				}
629 			}
630 			else {
631 				// special case: pattern: 1.00.000 = -0 transitions to pattern: 0.00.000 = +0
632 				if (iszeroencoding()) {
633 					setzero();
634 				}
635 				else {
636 					//  1111 0000
637 					//  1110 1111
638 					bool borrow = true;
639 					for (unsigned i = 0; i < MSU; ++i) {
640 						if (borrow) {
641 							if ((_block[i] & storageMask) == 0) { // block will underflow
642 								--_block[i];
643 								borrow = true;
644 							}
645 							else {
646 								--_block[i];
647 								borrow = false;
648 							}
649 						}
650 					}
651 					if (borrow) {
652 						--_block[MSU];
653 					}
654 				}
655 			}
656 		}
657 		return *this;
658 	}
operator ++(int)659 	inline cfloat operator++(int) {
660 		cfloat tmp(*this);
661 		operator++();
662 		return tmp;
663 	}
operator --()664 	inline cfloat& operator--() {
665 		if constexpr (0 == nrBlocks) {
666 			return *this;
667 		}
668 		else if constexpr (1 == nrBlocks) {
669 			if (ispos()) {
670 				if (_block[MSU] == 0) { // pattern: 0.00.000 = 0
671 					_block[MSU] |= SIGN_BIT_MASK; // pattern: 1.00.000 = -0
672 				}
673 				else {
674 					--_block[MSU];
675 				}
676 			}
677 			else {
678 				if ((_block[MSU] & MSU_MASK) == MSU_MASK) { // pattern: 1.11.111 = snan
679 					_block[MSU] &= ~SIGN_BIT_MASK; // pattern: 0.11.111 = qnan
680 				}
681 				else {
682 					++_block[MSU];
683 				}
684 			}
685 
686 		}
687 		else {
688 			if (ispos()) {
689 				// special case: pattern: 0.00.000 = +0 transitions to pattern: 1.00.000 = -0
690 				if (iszeroencoding()) {
691 					setsign(true);
692 				}
693 				else {
694 					bool borrow = true;
695 					for (unsigned i = 0; i < MSU; ++i) {
696 						if (borrow) {
697 							if ((_block[i] & storageMask) == 0) { // block will underflow
698 								--_block[i];
699 								borrow = true;
700 							}
701 							else {
702 								--_block[i];
703 								borrow = false;
704 							}
705 						}
706 					}
707 					if (borrow) {
708 						--_block[MSU];
709 					}
710 				}
711 			}
712 			else {
713 				// special case: pattern: 1.11.111 = snan transitions to pattern: 0.11.111 = qnan
714 				if (isnanencoding()) {
715 					setsign(false);
716 				}
717 				else {
718 					bool carry = true;
719 					for (unsigned i = 0; i < MSU; ++i) {
720 						if (carry) {
721 							if ((_block[i] & storageMask) == storageMask) { // block will overflow
722 								_block[i] = 0;
723 								carry = true;
724 							}
725 							else {
726 								++_block[i];
727 								carry = false;
728 							}
729 						}
730 					}
731 					if (carry) {
732 						++_block[MSU];
733 					}
734 				}
735 			}
736 		}
737 		return *this;
738 	}
operator --(int)739 	inline cfloat operator--(int) {
740 		cfloat tmp(*this);
741 		operator--();
742 		return tmp;
743 	}
744 
745 	// modifiers
746 
747 	/// <summary>
748 	/// clear the content of this cfloat to zero
749 	/// </summary>
750 	/// <returns>void</returns>
clear()751 	inline constexpr void clear() noexcept {
752 		if constexpr (0 == nrBlocks) {
753 			return;
754 		}
755 		else if constexpr (1 == nrBlocks) {
756 			_block[0] = bt(0);
757 		}
758 		else if constexpr (2 == nrBlocks) {
759 			_block[0] = bt(0);
760 			_block[1] = bt(0);
761 		}
762 		else if constexpr (3 == nrBlocks) {
763 			_block[0] = bt(0);
764 			_block[1] = bt(0);
765 			_block[2] = bt(0);
766 		}
767 		else if constexpr (4 == nrBlocks) {
768 			_block[0] = bt(0);
769 			_block[1] = bt(0);
770 			_block[2] = bt(0);
771 			_block[3] = bt(0);
772 		}
773 		else if constexpr (5 == nrBlocks) {
774 			_block[0] = bt(0);
775 			_block[1] = bt(0);
776 			_block[2] = bt(0);
777 			_block[3] = bt(0);
778 			_block[4] = bt(0);
779 		}
780 		else if constexpr (6 == nrBlocks) {
781 			_block[0] = bt(0);
782 			_block[1] = bt(0);
783 			_block[2] = bt(0);
784 			_block[3] = bt(0);
785 			_block[4] = bt(0);
786 			_block[5] = bt(0);
787 		}
788 		else if constexpr (7 == nrBlocks) {
789 			_block[0] = bt(0);
790 			_block[1] = bt(0);
791 			_block[2] = bt(0);
792 			_block[3] = bt(0);
793 			_block[4] = bt(0);
794 			_block[5] = bt(0);
795 			_block[6] = bt(0);
796 		}
797 		else if constexpr (8 == nrBlocks) {
798 			_block[0] = bt(0);
799 			_block[1] = bt(0);
800 			_block[2] = bt(0);
801 			_block[3] = bt(0);
802 			_block[4] = bt(0);
803 			_block[5] = bt(0);
804 			_block[6] = bt(0);
805 			_block[7] = bt(0);
806 		}
807 		else {
808 			for (size_t i = 0; i < nrBlocks; ++i) {
809 				_block[i] = bt(0);
810 			}
811 		}
812 	}
813 	/// <summary>
814 	/// set the number to +0
815 	/// </summary>
816 	/// <returns>void</returns>
setzero()817 	inline constexpr void setzero() noexcept { clear(); }
818 	/// <summary>
819 	/// set the number to +inf
820 	/// </summary>
821 	/// <param name="sign">boolean to make it + or - infinity, default is -inf</param>
822 	/// <returns>void</returns>
setinf(bool sign=true)823 	inline constexpr void setinf(bool sign = true) noexcept {
824 		if constexpr (0 == nrBlocks) {
825 			return;
826 		}
827 		else if constexpr (1 == nrBlocks) {
828 			_block[MSU] = sign ? bt(MSU_MASK ^ LSB_BIT_MASK) : bt(~SIGN_BIT_MASK & (MSU_MASK ^ LSB_BIT_MASK));
829 		}
830 		else if constexpr (2 == nrBlocks) {
831 			_block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
832 			_block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
833 		}
834 		else if constexpr (3 == nrBlocks) {
835 			_block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
836 			_block[1] = BLOCK_MASK;
837 			_block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
838 		}
839 		else if constexpr (4 == nrBlocks) {
840 			_block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
841 			_block[1] = BLOCK_MASK;
842 			_block[2] = BLOCK_MASK;
843 			_block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
844 		}
845 		else if constexpr (5 == nrBlocks) {
846 			_block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
847 			_block[1] = BLOCK_MASK;
848 			_block[2] = BLOCK_MASK;
849 			_block[3] = BLOCK_MASK;
850 			_block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
851 		}
852 		else if constexpr (6 == nrBlocks) {
853 			_block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
854 			_block[1] = BLOCK_MASK;
855 			_block[2] = BLOCK_MASK;
856 			_block[3] = BLOCK_MASK;
857 			_block[4] = BLOCK_MASK;
858 			_block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
859 		}
860 		else if constexpr (7 == nrBlocks) {
861 			_block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
862 			_block[1] = BLOCK_MASK;
863 			_block[2] = BLOCK_MASK;
864 			_block[3] = BLOCK_MASK;
865 			_block[4] = BLOCK_MASK;
866 			_block[5] = BLOCK_MASK;
867 			_block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
868 		}
869 		else if constexpr (8 == nrBlocks) {
870 			_block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
871 			_block[1] = BLOCK_MASK;
872 			_block[2] = BLOCK_MASK;
873 			_block[3] = BLOCK_MASK;
874 			_block[4] = BLOCK_MASK;
875 			_block[5] = BLOCK_MASK;
876 			_block[6] = BLOCK_MASK;
877 			_block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
878 		}
879 		else {
880 			_block[0] = BLOCK_MASK ^ LSB_BIT_MASK;
881 			for (size_t i = 1; i < nrBlocks - 1; ++i) {
882 				_block[i] = BLOCK_MASK;
883 			}
884 			_block[MSU] = sign ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
885 		}
886 	}
887 	/// <summary>
888 	/// set the number to a quiet NaN (+nan) or a signalling NaN (-nan, default)
889 	/// </summary>
890 	/// <param name="sign">boolean to make it + or - infinity, default is -inf</param>
891 	/// <returns>void</returns>
setnan(int NaNType=NAN_TYPE_SIGNALLING)892 	inline constexpr void setnan(int NaNType = NAN_TYPE_SIGNALLING) noexcept {
893 		if constexpr (0 == nrBlocks) {
894 			return;
895 		}
896 		else if constexpr (1 == nrBlocks) {
897 			// fall through
898 		}
899 		else if constexpr (2 == nrBlocks) {
900 			_block[0] = BLOCK_MASK;
901 		}
902 		else if constexpr (3 == nrBlocks) {
903 			_block[0] = BLOCK_MASK;
904 			_block[1] = BLOCK_MASK;
905 		}
906 		else if constexpr (4 == nrBlocks) {
907 			_block[0] = BLOCK_MASK;
908 			_block[1] = BLOCK_MASK;
909 			_block[2] = BLOCK_MASK;
910 		}
911 		else if constexpr (5 == nrBlocks) {
912 			_block[0] = BLOCK_MASK;
913 			_block[1] = BLOCK_MASK;
914 			_block[2] = BLOCK_MASK;
915 			_block[3] = BLOCK_MASK;
916 		}
917 		else if constexpr (6 == nrBlocks) {
918 			_block[0] = BLOCK_MASK;
919 			_block[1] = BLOCK_MASK;
920 			_block[2] = BLOCK_MASK;
921 			_block[3] = BLOCK_MASK;
922 			_block[4] = BLOCK_MASK;
923 		}
924 		else if constexpr (7 == nrBlocks) {
925 			_block[0] = BLOCK_MASK;
926 			_block[1] = BLOCK_MASK;
927 			_block[2] = BLOCK_MASK;
928 			_block[3] = BLOCK_MASK;
929 			_block[4] = BLOCK_MASK;
930 			_block[5] = BLOCK_MASK;
931 		}
932 		else if constexpr (8 == nrBlocks) {
933 			_block[0] = BLOCK_MASK;
934 			_block[1] = BLOCK_MASK;
935 			_block[2] = BLOCK_MASK;
936 			_block[3] = BLOCK_MASK;
937 			_block[4] = BLOCK_MASK;
938 			_block[5] = BLOCK_MASK;
939 			_block[6] = BLOCK_MASK;
940 		}
941 		else {
942 			for (size_t i = 0; i < nrBlocks - 1; ++i) {
943 				_block[i] = BLOCK_MASK;
944 			}
945 		}
946 		_block[MSU] = NaNType == NAN_TYPE_SIGNALLING ? MSU_MASK : bt(~SIGN_BIT_MASK & MSU_MASK);
947 	}
setsign(bool sign=true)948 	inline constexpr void setsign(bool sign = true) {
949 		if (sign) {
950 			_block[MSU] |= SIGN_BIT_MASK;
951 		}
952 		else {
953 			_block[MSU] &= ~SIGN_BIT_MASK;
954 		}
955 	}
setexponent(int scale)956 	inline constexpr bool setexponent(int scale) {
957 		if (scale < MIN_EXP_SUBNORMAL || scale > MAX_EXP) return false; // this scale cannot be represented
958 		if constexpr (nbits < 65) {
959 			// we can use a uint64_t to construct the cfloat
960 			//uint64_t raw{ 0 };
961 			if (scale >= MIN_EXP_SUBNORMAL && scale < MIN_EXP_NORMAL) {
962 				// we are a subnormal number: all exponent bits are 1
963 				// what do you do know? If you set them all to 1, you still
964 				// don't have the right scale
965 				return false;
966 			}
967 			else {
968 				// TODO: optimize
969 				uint32_t exponentBits = static_cast<uint32_t>(scale + EXP_BIAS);
970 				uint32_t mask = (1ul << (es - 1));
971 				for (size_t i = nbits - 2; i > nbits - 2 - es; --i) {
972 					setbit(i, (mask & exponentBits));
973 					mask >>= 1;
974 				}
975 			}
976 		}
977 		else {
978 			// TODO: optimize
979 			uint32_t exponentBits = static_cast<uint32_t>(scale + EXP_BIAS);
980 			uint32_t mask = (1ul << (es - 1));
981 			for (size_t i = nbits - 2; i > nbits - 2 - es; --i) {
982 				setbit(i, (mask & exponentBits));
983 				mask >>= 1;
984 			}
985 		}
986 		return true;
987 	}
988 	/// <summary>
989 	/// set the fraction bits given a significant in the form ???
990 	/// </summary>
991 	/// <param name="significant"></param>
setfraction(const blockbinary<fbits,bt> & fraction)992 	inline constexpr void setfraction(const blockbinary<fbits, bt>& fraction) {
993 		for (size_t i = 0; i < fbits; ++i) {
994 			setbit(i, fraction.test(i));
995 		}
996 	}
setfraction(uint64_t raw_bits)997 	inline constexpr void setfraction(uint64_t raw_bits) {
998 		// unoptimized as it is not meant to be an end-user API, it is a test API
999 		if constexpr (fbits < 65) {
1000 			uint64_t mask{ 1ull };
1001 			for (size_t i = 0; i < fbits; ++i) {
1002 				setbit(i, (mask & raw_bits));
1003 				mask <<= 1;
1004 			}
1005 		}
1006 	}
1007 	// specific number system values of interest
maxpos()1008 	inline constexpr cfloat& maxpos() noexcept {
1009 		if constexpr (hasSupernormals) {
1010 			// maximum positive value has this bit pattern: 0-1...1-111...101, that is, sign = 0, e = 11..11, f = 111...101
1011 			clear();
1012 			flip();
1013 			setbit(nbits - 1ull, false); // sign = 0
1014 			setbit(1ull, false); // bit1 = 0
1015 		}
1016 		else {
1017 			// maximum positive value has this bit pattern: 0-1...0-111...111, that is, sign = 0, e = 11..10, f = 111...111
1018 			clear();
1019 			flip();
1020 			setbit(fbits, false); // set least significant exponent bit to 0
1021 			setbit(nbits - 1ull, false); // set sign to 0
1022 		}
1023 		return *this;
1024 	}
minpos()1025 	inline constexpr cfloat& minpos() noexcept {
1026 		if constexpr (hasSubnormals) {
1027 			// minimum positive value has this bit pattern: 0-000-00...01, that is, sign = 0, e = 000, f = 00001
1028 			clear();
1029 			setbit(0);
1030 		}
1031 		else {
1032 			// minimum positive value has this bit pattern: 0-001-00...0, that is, sign = 0, e = 001, f = 0000
1033 			clear();
1034 			setbit(fbits);
1035 		}
1036 		return *this;
1037 	}
zero()1038 	inline constexpr cfloat& zero() noexcept {
1039 		// the zero value
1040 		clear();
1041 		return *this;
1042 	}
minneg()1043 	inline constexpr cfloat& minneg() noexcept {
1044 		if constexpr (hasSubnormals) {
1045 			// minimum negative value has this bit pattern: 1-000-00...01, that is, sign = 1, e = 00, f = 00001
1046 			clear();
1047 			setbit(nbits - 1ull);
1048 			setbit(0);
1049 		}
1050 		else {
1051 			// minimum negative value has this bit pattern: 1-001-00...0, that is, sign = 1, e = 001, f = 0000
1052 			clear();
1053 			setbit(fbits);
1054 			setbit(nbits - 1ull);
1055 		}
1056 		return *this;
1057 	}
maxneg()1058 	inline constexpr cfloat& maxneg() noexcept {
1059 		if constexpr (hasSupernormals) {
1060 			// maximum negative value has this bit pattern: 1-1...1-111...101, that is, sign = 1, e = 1..1, f = 111...101
1061 			clear();
1062 			flip();
1063 			setbit(1ull, false);
1064 		}
1065 		else {
1066 			// maximum negative value has this bit pattern: 1-1...0-111...111, that is, sign = 1, e = 11..10, f = 111...111
1067 			clear();
1068 			flip();
1069 			setbit(fbits, false);
1070 		}
1071 		return *this;
1072 	}
1073 
1074 	/// <summary>
1075 	/// set a specific bit in the encoding to true or false. If bit index is out of bounds, no modification takes place.
1076 	/// </summary>
1077 	/// <param name="i">bit index to set</param>
1078 	/// <param name="v">boolean value to set the bit to. Default is true.</param>
1079 	/// <returns>void</returns>
setbit(size_t i,bool v=true)1080 	inline constexpr void setbit(size_t i, bool v = true) noexcept {
1081 		if (i < nbits) {
1082 			bt block = _block[i / bitsInBlock];
1083 			bt null = ~(1ull << (i % bitsInBlock));
1084 			bt bit = bt(v ? 1 : 0);
1085 			bt mask = bt(bit << (i % bitsInBlock));
1086 			_block[i / bitsInBlock] = bt((block & null) | mask);
1087 			return;
1088 		}
1089 	}
1090 	/// <summary>
1091 	/// set the raw bits of the cfloat. This is a required API function for number systems in the Universal Numbers Library
1092 	/// This enables verification test suites to inject specific test bit patterns using a common interface.
1093 	//  This is a memcpy type operator, but the target number system may not have a linear memory layout and
1094 	//  thus needs to steer the bits in potentially more complicated ways then memcpy.
1095 	/// </summary>
1096 	/// <param name="raw_bits">unsigned long long carrying bits that will be written verbatim to the cfloat</param>
1097 	/// <returns>reference to the cfloat</returns>
setbits(uint64_t raw_bits)1098 	inline constexpr cfloat& setbits(uint64_t raw_bits) noexcept {
1099 		if constexpr (0 == nrBlocks) {
1100 			return *this;
1101 		}
1102 		else if constexpr (1 == nrBlocks) {
1103 			_block[0] = raw_bits & storageMask;
1104 		}
1105 		else if constexpr (2 == nrBlocks) {
1106 			if constexpr (bitsInBlock < 64) {
1107 				_block[0] = raw_bits & storageMask;
1108 				raw_bits >>= bitsInBlock;
1109 				_block[1] = raw_bits & storageMask;
1110 			}
1111 			else {
1112 				_block[0] = raw_bits & storageMask;
1113 				_block[1] = 0;
1114 			}
1115 		}
1116 		else if constexpr (3 == nrBlocks) {
1117 			if constexpr (bitsInBlock < 64) {
1118 				_block[0] = raw_bits & storageMask;
1119 				raw_bits >>= bitsInBlock;
1120 				_block[1] = raw_bits & storageMask;
1121 				raw_bits >>= bitsInBlock;
1122 				_block[2] = raw_bits & storageMask;
1123 			}
1124 			else {
1125 				_block[0] = raw_bits & storageMask;
1126 				_block[1] = 0;
1127 				_block[2] = 0;
1128 			}
1129 		}
1130 		else if constexpr (4 == nrBlocks) {
1131 			if constexpr (bitsInBlock < 64) {
1132 				_block[0] = raw_bits & storageMask;
1133 				raw_bits >>= bitsInBlock;
1134 				_block[1] = raw_bits & storageMask;
1135 				raw_bits >>= bitsInBlock;
1136 				_block[2] = raw_bits & storageMask;
1137 				raw_bits >>= bitsInBlock;
1138 				_block[3] = raw_bits & storageMask;
1139 			}
1140 			else {
1141 				_block[0] = raw_bits & storageMask;
1142 				_block[1] = 0;
1143 				_block[2] = 0;
1144 				_block[3] = 0;
1145 			}
1146 		}
1147 		else {
1148 			if constexpr (bitsInBlock < 64) {
1149 				for (size_t i = 0; i < nrBlocks; ++i) {
1150 					_block[i] = raw_bits & storageMask;
1151 					raw_bits >>= bitsInBlock;
1152 				}
1153 			}
1154 			else {
1155 				_block[0] = raw_bits & storageMask;
1156 				for (size_t i = 1; i < nrBlocks; ++i) {
1157 					_block[i] = 0;
1158 				}
1159 			}
1160 		}
1161 		_block[MSU] &= MSU_MASK; // enforce precondition for fast comparison by properly nulling bits that are outside of nbits
1162 		return *this;
1163 	}
1164 
1165 	/// <summary>
1166 	/// 1's complement of the encoding
1167 	/// </summary>
1168 	/// <returns>reference to this cfloat object</returns>
flip()1169 	inline constexpr cfloat& flip() noexcept { // in-place one's complement
1170 		for (size_t i = 0; i < nrBlocks; ++i) {
1171 			_block[i] = bt(~_block[i]);
1172 		}
1173 		_block[MSU] &= MSU_MASK; // assert precondition of properly nulled leading non-bits
1174 		return *this;
1175 	}
1176 
1177 	/// <summary>
1178 	/// assign the value of the string representation to the cfloat
1179 	/// </summary>
1180 	/// <param name="stringRep">decimal scientific notation of a real number to be assigned</param>
1181 	/// <returns>reference to this cfloat</returns>
assign(const std::string & str)1182 	inline constexpr cfloat& assign(const std::string& str) noexcept {
1183 		clear();
1184 		if (str.length() == 0) return *this;
1185 		// TODO: regex based determination of scientific form or binary form
1186 
1187 		if (str.length() > 2) {
1188 			if (str[0] == '0' && str[1] == 'b') {
1189 				// binary string needs to be at least nbits+4 characters
1190 				int field(0);
1191 				int exponentBits(-1); // we start the field with a '.'
1192 				if (str.size() != (nbits + 4)) {
1193 					std::cerr << "provided binary string representation does not contain " << nbits << " bits. Reset to 0\n";
1194 					return *this;
1195 				}
1196 				size_t index = nbits;
1197 				for (size_t i = 1; i < str.size(); ++i) {
1198 					if (str[i] == '1') {
1199 						setbit(--index, true);
1200 					}
1201 					else if (str[i] == '0') {
1202 						setbit(--index, false);
1203 					}
1204 					else if (str[i] == '.' || str[i] == '\'') {
1205 						++field;
1206 						if (field == 2) { // just finished parsing exponent field: we can now check the number of exponent bits
1207 							if (exponentBits != es) {
1208 								std::cerr << "provided binary string representation does not contain " << es << " exponent bits. Found " << exponentBits << ". Reset to 0\n";
1209 								return *this;
1210 							}
1211 						}
1212 					}
1213 					if (field == 1) { // exponent field
1214 						++exponentBits;
1215 					}
1216 				}
1217 				if (field != 2) {
1218 					std::cerr << "provided binary string did not contain three fields separated by '.': Reset to 0\n";
1219 					return *this;
1220 				}
1221 			}
1222 		}
1223 		else {
1224 			std::cerr << "parse/assign currently only parse binary string formats that start with 0b\n";
1225 		}
1226 		return *this;
1227 	}
1228 
1229 	// selectors
sign() const1230 	inline constexpr bool sign() const noexcept { return (_block[MSU] & SIGN_BIT_MASK) == SIGN_BIT_MASK; }
scale() const1231 	inline constexpr int  scale() const noexcept {
1232 		int e{ 0 };
1233 		if constexpr (MSU_CAPTURES_EXP) {
1234 			e = static_cast<int>((_block[MSU] & ~SIGN_BIT_MASK) >> EXP_SHIFT);
1235 			if (e == 0) {
1236 				// subnormal scale is determined by fraction
1237 				// subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1238 				e = (2l - (1l << (es - 1ull))) - 1;
1239 				if constexpr (nbits > 2 + es) {
1240 					for (size_t i = nbits - 2ull - es; i > 0; --i) {
1241 						if (test(i)) break;
1242 						--e;
1243 					}
1244 				}
1245 			}
1246 			else {
1247 				e -= EXP_BIAS;
1248 			}
1249 		}
1250 		else {
1251 			blockbinary<es, bt> ebits;
1252 			exponent(ebits);
1253 			if (ebits.iszero()) {
1254 				// subnormal scale is determined by fraction
1255 				// subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1256 				e = (2l - (1l << (es - 1ull))) - 1;
1257 				if constexpr (nbits > 2 + es) {
1258 					for (size_t i = nbits - 2ull - es; i > 0; --i) {
1259 						if (test(i)) break;
1260 						--e;
1261 					}
1262 				}
1263 			}
1264 			else {
1265 				e = static_cast<int>(unsigned(ebits) - EXP_BIAS);
1266 			}
1267 		}
1268 		return e;
1269 	}
1270 	// tests
isneg() const1271 	inline constexpr bool isneg() const noexcept { return sign(); }
ispos() const1272 	inline constexpr bool ispos() const noexcept { return !sign(); }
iszeroencoding() const1273 	inline constexpr bool iszeroencoding() const noexcept {
1274 		if constexpr (0 == nrBlocks) {
1275 			return true;
1276 		}
1277 		else if constexpr (1 == nrBlocks) {
1278 			return (_block[MSU] & ~SIGN_BIT_MASK) == 0;
1279 		}
1280 		else if constexpr (2 == nrBlocks) {
1281 			return (_block[0] == 0) && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
1282 		}
1283 		else if constexpr (3 == nrBlocks) {
1284 			return (_block[0] == 0) && _block[1] == 0 && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
1285 		}
1286 		else if constexpr (4 == nrBlocks) {
1287 			return (_block[0] == 0) && _block[1] == 0 && _block[2] == 0 && (_block[MSU] & ~SIGN_BIT_MASK) == 0;
1288 		}
1289 		else {
1290 			for (size_t i = 0; i < nrBlocks - 1; ++i) if (_block[i] != 0) return false;
1291 			return (_block[MSU] & ~SIGN_BIT_MASK) == 0;
1292 		}
1293 	}
iszero() const1294 	inline constexpr bool iszero() const noexcept {
1295 		if constexpr (hasSubnormals) {
1296 			return iszeroencoding();
1297 		}
1298 		else { // all subnormals round to 0
1299 			blockbinary<es, bt> ebits;
1300 			exponent(ebits);
1301 			if (ebits.iszero()) return true; else return false;
1302 		}
1303 	}
isone() const1304 	inline constexpr bool isone() const noexcept {
1305 		// unbiased exponent = scale = 0, fraction = 0
1306 		int s = scale();
1307 		if (s == 0) {
1308 			blockbinary<fbits, bt> f;
1309 			fraction(f);
1310 			return f.iszero();
1311 		}
1312 		return false;
1313 	}
1314 	/// <summary>
1315 	/// check if value is infinite, -inf, or +inf.
1316 	/// +inf = 0-1111-11111-0: sign = 0, uncertainty = 0, es/fraction bits = 1
1317 	/// -inf = 1-1111-11111-0: sign = 1, uncertainty = 0, es/fraction bits = 1
1318 	/// </summary>
1319 	/// <param name="InfType">default is 0, both types, -1 checks for -inf, 1 checks for +inf</param>
1320 	/// <returns>true if +-inf, false otherwise</returns>
isinf(int InfType=INF_TYPE_EITHER) const1321 	inline constexpr bool isinf(int InfType = INF_TYPE_EITHER) const noexcept {
1322 		// the bit pattern encoding of Inf is independent of gradual overflow (supernormal) configuration
1323 		bool isNegInf = false;
1324 		bool isPosInf = false;
1325 		if constexpr (0 == nrBlocks) {
1326 			return false;
1327 		}
1328 		else if constexpr (1 == nrBlocks) {
1329 			isNegInf = (_block[MSU] & MSU_MASK) == (MSU_MASK ^ LSB_BIT_MASK);
1330 			isPosInf = (_block[MSU] & MSU_MASK) == ((MSU_MASK ^ SIGN_BIT_MASK) ^ LSB_BIT_MASK);
1331 		}
1332 		else if constexpr (2 == nrBlocks) {
1333 			bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK));
1334 			isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
1335 			isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
1336 		}
1337 		else if constexpr (3 == nrBlocks) {
1338 			bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK)) && (_block[1] == BLOCK_MASK);
1339 			isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
1340 			isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
1341 		}
1342 		else if constexpr (4 == nrBlocks) {
1343 			bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK)) && (_block[1] == BLOCK_MASK) && (_block[2] == BLOCK_MASK);
1344 			isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
1345 			isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
1346 		}
1347 		else {
1348 			bool isInf = (_block[0] == (BLOCK_MASK ^ LSB_BIT_MASK));
1349 			for (size_t i = 1; i < nrBlocks - 1; ++i) {
1350 				if (_block[i] != BLOCK_MASK) {
1351 					isInf = false;
1352 					break;
1353 				}
1354 			}
1355 			isNegInf = isInf && ((_block[MSU] & MSU_MASK) == MSU_MASK);
1356 			isPosInf = isInf && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
1357 		}
1358 
1359 		return (InfType == INF_TYPE_EITHER ? (isNegInf || isPosInf) :
1360 			(InfType == INF_TYPE_NEGATIVE ? isNegInf :
1361 				(InfType == INF_TYPE_POSITIVE ? isPosInf : false)));
1362 	}
1363 	/// <summary>
1364 	/// check if a value is a quiet or a signalling NaN
1365 	/// quiet NaN      = 0-1111-11111-1: sign = 0, uncertainty = 1, es/fraction bits = 1
1366 	/// signalling NaN = 1-1111-11111-1: sign = 1, uncertainty = 1, es/fraction bits = 1
1367 	/// </summary>
1368 	/// <param name="NaNType">default is 0, both types, 1 checks for Signalling NaN, -1 checks for Quiet NaN</param>
1369 	/// <returns>true if the right kind of NaN, false otherwise</returns>
isnanencoding(int NaNType=NAN_TYPE_EITHER) const1370 	inline constexpr bool isnanencoding(int NaNType = NAN_TYPE_EITHER) const noexcept {
1371 		// the bit encoding of NaN is independent of the gradual overflow configuration
1372 		bool isNaN = true;
1373 		bool isNegNaN = false;
1374 		bool isPosNaN = false;
1375 
1376 		if constexpr (0 == nrBlocks) {
1377 			return false;
1378 		}
1379 		else if constexpr (1 == nrBlocks) {
1380 		}
1381 		else if constexpr (2 == nrBlocks) {
1382 			isNaN = (_block[0] == BLOCK_MASK);
1383 		}
1384 		else if constexpr (3 == nrBlocks) {
1385 			isNaN = (_block[0] == BLOCK_MASK) && (_block[1] == BLOCK_MASK);
1386 		}
1387 		else if constexpr (4 == nrBlocks) {
1388 			isNaN = (_block[0] == BLOCK_MASK) && (_block[1] == BLOCK_MASK) && (_block[2] == BLOCK_MASK);
1389 		}
1390 		else {
1391 			for (size_t i = 0; i < nrBlocks - 1; ++i) {
1392 				if (_block[i] != BLOCK_MASK) {
1393 					isNaN = false;
1394 					break;
1395 				}
1396 			}
1397 		}
1398 		isNegNaN = isNaN && ((_block[MSU] & MSU_MASK) == MSU_MASK);
1399 		isPosNaN = isNaN && (_block[MSU] & MSU_MASK) == (MSU_MASK ^ SIGN_BIT_MASK);
1400 
1401 		return (NaNType == NAN_TYPE_EITHER ? (isNegNaN || isPosNaN) :
1402 			(NaNType == NAN_TYPE_SIGNALLING ? isNegNaN :
1403 				(NaNType == NAN_TYPE_QUIET ? isPosNaN : false)));
1404 	}
isnan(int NaNType=NAN_TYPE_EITHER) const1405 	inline constexpr bool isnan(int NaNType = NAN_TYPE_EITHER) const noexcept {
1406 		if constexpr (hasSupernormals) {
1407 			return isnanencoding(NaNType);
1408 		}
1409 		else {
1410 			bool isNaN = true;
1411 			bool isNegNaN = false;
1412 			bool isPosNaN = false;
1413 			if (!issupernormal()) { isNaN = false; }
1414 			isNegNaN = isNaN && sign();
1415 			isPosNaN = isNaN && !sign();
1416 			return (NaNType == NAN_TYPE_EITHER ? (isNegNaN || isPosNaN) :
1417 				(NaNType == NAN_TYPE_SIGNALLING ? isNegNaN :
1418 					(NaNType == NAN_TYPE_QUIET ? isPosNaN : false)));
1419 		}
1420 	}
1421 	// isnormal returns true if exponent bits are not all zero or one, false otherwise
isnormal() const1422 	inline constexpr bool isnormal() const noexcept {
1423 		blockbinary<es, bt> e;
1424 		exponent(e);
1425 //		return !e.iszero() && !isinf() && !isnan();  // old definition that included the supernormals but excluded the extreme encodings
1426 		return !e.iszero() && !e.isallones();
1427 	}
1428 	// isdenormal returns true if exponent bits are all zero, false otherwise
isdenormal() const1429 	inline constexpr bool isdenormal() const noexcept {
1430 		blockbinary<es, bt> e;
1431 		exponent(e);
1432 		return e.iszero();
1433 	}
1434 	// issupernormal returns true if exponent bits are all one, false otherwise
issupernormal() const1435 	inline constexpr bool issupernormal() const noexcept {
1436 		blockbinary<es, bt> e;
1437 		exponent(e);
1438 		return e.isallones();
1439 	}
1440 	template<typename NativeReal>
inrange(NativeReal v)1441 	inline constexpr bool inrange(NativeReal v) {
1442 		// the valid range for this cfloat includes the interval between
1443 		// maxpos and the value that would round down to maxpos
1444 		bool bIsInRange = true;
1445 		if (v > 0) {
1446 			cfloat c(SpecificValue::maxpos);
1447 			cfloat<nbits + 1, es, BlockType, hasSubnormals, hasSupernormals, isSaturating> d;
1448 			d = NativeReal(c);
1449 			++d;
1450 			if (v >= NativeReal(d)) bIsInRange = false;
1451 		}
1452 		else {
1453 			cfloat c(SpecificValue::maxneg);
1454 			cfloat<nbits + 1, es, BlockType, hasSubnormals, hasSupernormals, isSaturating> d;
1455 			d = NativeReal(c);
1456 			--d;
1457 			if (v <= NativeReal(d)) bIsInRange = false;
1458 		}
1459 
1460 		return bIsInRange;
1461 	}
test(size_t bitIndex) const1462 	inline constexpr bool test(size_t bitIndex) const noexcept {
1463 		return at(bitIndex);
1464 	}
at(size_t bitIndex) const1465 	inline constexpr bool at(size_t bitIndex) const noexcept {
1466 		if (bitIndex < nbits) {
1467 			bt word = _block[bitIndex / bitsInBlock];
1468 			bt mask = bt(1ull << (bitIndex % bitsInBlock));
1469 			return (word & mask);
1470 		}
1471 		return false;
1472 	}
nibble(size_t n) const1473 	inline constexpr uint8_t nibble(size_t n) const noexcept {
1474 		if (n < (1 + ((nbits - 1) >> 2))) {
1475 			bt word = _block[(n * 4) / bitsInBlock];
1476 			int nibbleIndexInWord = int(n % (bitsInBlock >> 2ull));
1477 			bt mask = bt(0xF << (nibbleIndexInWord * 4));
1478 			bt nibblebits = bt(mask & word);
1479 			return uint8_t(nibblebits >> (nibbleIndexInWord * 4));
1480 		}
1481 		return false;
1482 	}
block(size_t b) const1483 	inline constexpr bt block(size_t b) const noexcept {
1484 		if (b < nrBlocks) {
1485 			return _block[b];
1486 		}
1487 		return 0;
1488 	}
1489 
1490 	// helper debug function
constexprClassParameters() const1491 	void constexprClassParameters() const {
1492 		std::cout << "-------------------------------------------------------------\n";
1493 		std::cout << "type              : " << typeid(*this).name() << '\n';
1494 		std::cout << "nbits             : " << nbits << '\n';
1495 		std::cout << "es                : " << es << std::endl;
1496 		std::cout << "hasSubnormals     : " << (hasSubnormals ? "true" : "false") << '\n';
1497 		std::cout << "hasSupernormals   : " << (hasSupernormals ? "true" : "false") << '\n';
1498 		std::cout << "isSaturating      : " << (isSaturating ? "true" : "false") << '\n';
1499 		std::cout << "ALL_ONES          : " << to_binary(ALL_ONES, 0, true) << '\n';
1500 		std::cout << "BLOCK_MASK        : " << to_binary(BLOCK_MASK, 0, true) << '\n';
1501 		std::cout << "nrBlocks          : " << nrBlocks << '\n';
1502 		std::cout << "bits in MSU       : " << bitsInMSU << '\n';
1503 		std::cout << "MSU               : " << MSU << '\n';
1504 		std::cout << "MSU MASK          : " << to_binary(MSU_MASK, 0, true) << '\n';
1505 		std::cout << "SIGN_BIT_MASK     : " << to_binary(SIGN_BIT_MASK, 0, true) << '\n';
1506 		std::cout << "LSB_BIT_MASK      : " << to_binary(LSB_BIT_MASK, 0, true) << '\n';
1507 		std::cout << "MSU CAPTURES_EXP  : " << (MSU_CAPTURES_EXP ? "yes\n" : "no\n");
1508 		std::cout << "EXP_SHIFT         : " << EXP_SHIFT << '\n';
1509 		std::cout << "MSU EXP MASK      : " << to_binary(MSU_EXP_MASK, 0, true) << '\n';
1510 		std::cout << "ALL_ONE_MASK_ES   : " << to_binary(ALL_ONES_ES) << '\n';
1511 		std::cout << "EXP_BIAS          : " << EXP_BIAS << '\n';
1512 		std::cout << "MAX_EXP           : " << MAX_EXP << '\n';
1513 		std::cout << "MIN_EXP_NORMAL    : " << MIN_EXP_NORMAL << '\n';
1514 		std::cout << "MIN_EXP_SUBNORMAL : " << MIN_EXP_SUBNORMAL << '\n';
1515 		std::cout << "fraction Blocks   : " << fBlocks << '\n';
1516 		std::cout << "bits in FSU       : " << bitsInFSU << '\n';
1517 		std::cout << "FSU               : " << FSU << '\n';
1518 		std::cout << "FSU MASK          : " << to_binary(FSU_MASK, 0, true) << '\n';
1519 		std::cout << "topfbits          : " << topfbits << '\n';
1520 		std::cout << "ALL_ONE_MASK_FR   : " << to_binary(ALL_ONES_FR) << '\n';
1521 	}
1522 	// extract the sign field from the encoding
sign(bool & s) const1523 	inline constexpr void sign(bool& s) const {
1524 		s = sign();
1525 	}
1526 	// extract the exponent field from the encoding
exponent(blockbinary<es,bt> & e) const1527 	inline constexpr void exponent(blockbinary<es, bt>& e) const {
1528 		e.clear();
1529 		if constexpr (0 == nrBlocks) return;
1530 		else if constexpr (1 == nrBlocks) {
1531 			bt ebits = bt(_block[MSU] & ~SIGN_BIT_MASK);
1532 			e.setbits(uint64_t(ebits >> EXP_SHIFT));
1533 		}
1534 		else if constexpr (nrBlocks > 1) {
1535 			if (MSU_CAPTURES_EXP) {
1536 				bt ebits = bt(_block[MSU] & ~SIGN_BIT_MASK);
1537 				e.setbits(uint64_t(ebits >> ((nbits - 1ull - es) % bitsInBlock)));
1538 			}
1539 			else {
1540 				for (size_t i = 0; i < es; ++i) { e.setbit(i, at(nbits - 1ull - es + i)); }
1541 			}
1542 		}
1543 	}
1544 	// extract the fraction field from the encoding
fraction(blockbinary<fbits,bt> & f) const1545 	inline constexpr void fraction(blockbinary<fbits, bt>& f) const {
1546 		f.clear();
1547 		if constexpr (0 == nrBlocks) return;
1548 		else if constexpr (1 == nrBlocks) {
1549 			bt fraction = bt(_block[MSU] & ~MSU_EXP_MASK);
1550 			f.setbits(fraction);
1551 		}
1552 		else if constexpr (nrBlocks > 1) {
1553 			for (size_t i = 0; i < fbits; ++i) { f.setbit(i, at(i)); } // TODO: TEST!
1554 		}
1555 	}
fraction_ull() const1556 	inline constexpr uint64_t fraction_ull() const {
1557 		uint64_t raw{ 0 };
1558 		if constexpr (nbits - es - 1ull < 65ull) { // no-op if precondition doesn't hold
1559 			if constexpr (1 == nrBlocks) {
1560 				uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
1561 				raw = fbitMask & uint64_t(_block[0]);
1562 			}
1563 			else if constexpr (2 == nrBlocks) {
1564 				uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
1565 				raw = fbitMask & ((uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
1566 			}
1567 			else if constexpr (3 == nrBlocks) {
1568 				uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
1569 				raw = fbitMask & ((uint64_t(_block[2]) << (2 * bitsInBlock)) | (uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
1570 			}
1571 			else if constexpr (4 == nrBlocks) {
1572 				uint64_t fbitMask = 0xFFFF'FFFF'FFFF'FFFF >> (64 - fbits);
1573 				raw = fbitMask & ((uint64_t(_block[3]) << (3 * bitsInBlock)) | (uint64_t(_block[2]) << (2 * bitsInBlock)) | (uint64_t(_block[1]) << bitsInBlock) | uint64_t(_block[0]));
1574 			}
1575 			else {
1576 				uint64_t mask{ 1 };
1577 				for (size_t i = 0; i < fbits; ++i) {
1578 					if (test(i)) {
1579 						raw |= mask;
1580 					}
1581 					mask <<= 1;
1582 				}
1583 			}
1584 		}
1585 		return raw;
1586 	}
1587 	// construct the significant from the encoding, returns normalization offset
significant(blockbinary<fhbits,bt> & s,bool isNormal=true) const1588 	inline constexpr size_t significant(blockbinary<fhbits, bt>& s, bool isNormal = true) const {
1589 		size_t shift = 0;
1590 		if (iszero()) return 0;
1591 		if constexpr (0 == nrBlocks) return 0;
1592 		else if constexpr (1 == nrBlocks) {
1593 			bt significant = bt(_block[MSU] & ~MSU_EXP_MASK & ~SIGN_BIT_MASK);
1594 			if (isNormal) {
1595 				significant |= (bt(0x1ul) << fbits);
1596 			}
1597 			else {
1598 				size_t msb = findMostSignificantBit(significant);
1599 //				std::cout << "msb : " << msb << " : fhbits : " << fhbits << " : " << to_binary(significant, true) << std::endl;
1600 				shift = fhbits - msb;
1601 				significant <<= shift;
1602 			}
1603 			s.setbits(significant);
1604 		}
1605 		else if constexpr (nrBlocks > 1) {
1606 			s.clear();
1607 			// TODO: design and implement a block-oriented algorithm, this sequential algorithm is super slow
1608 			if (isNormal) {
1609 				s.setbit(fbits);
1610 				for (size_t i = 0; i < fbits; ++i) { s.setbit(i, at(i)); }
1611 			}
1612 			else {
1613 				// Find the MSB of the subnormal:
1614 				size_t msb = 0;
1615 				for (size_t i = 0; i < fbits; ++i) { // msb protected from not being assigned through iszero test at prelude of function
1616 					msb = fbits - 1ull - i;
1617 					if (test(msb)) break;
1618 				}
1619 				//      m-----lsb
1620 				// h00001010101
1621 				// 101010100000
1622 				for (size_t i = 0; i <= msb; ++i) {
1623 					s.setbit(fbits - msb + i, at(i));
1624 				}
1625 				shift = fhbits - msb;
1626 			}
1627 		}
1628 		return shift;
1629 	}
1630 	// get the raw bits from the encoding
getbits(blockbinary<nbits,bt> & b) const1631 	inline constexpr void getbits(blockbinary<nbits, bt>& b) const {
1632 		b.clear();
1633 		for (size_t i = 0; i < nbits; ++i) { b.setbit(i, at(i)); }
1634 	}
1635 	// casts to native types
to_long() const1636 	long to_long() const { return long(to_native<double>()); }
to_long_long() const1637 	long long to_long_long() const { return (long long)(to_native<double>()); }
1638 	// transform an cfloat to a native C++ floating-point. We are using the native
1639 	// precision to compute, which means that all sub-values need to be representable
1640 	// by the native precision.
1641 	// A more accurate approximation would require an adaptive precision algorithm
1642 	// with a final rounding step.
1643 	template<typename TargetFloat>
to_native() const1644 	TargetFloat to_native() const {
1645 		TargetFloat v{ 0.0 };
1646 		if (iszero()) {
1647 			// the optimizer might destroy the sign
1648 			return sign() ? -TargetFloat(0) : TargetFloat(0);
1649 		}
1650 		else if (isnan()) {
1651 			v = sign() ? std::numeric_limits<TargetFloat>::signaling_NaN() : std::numeric_limits<TargetFloat>::quiet_NaN();
1652 		}
1653 		else if (isinf()) {
1654 			v = sign() ? -std::numeric_limits<TargetFloat>::infinity() : std::numeric_limits<TargetFloat>::infinity();
1655 		}
1656 		else { // TODO: this approach has catastrophic cancellation when nbits is large and native target float is too small
1657 			TargetFloat f{ 0 };
1658 			TargetFloat fbit{ 0.5 };
1659 			for (int i = static_cast<int>(nbits - 2ull - es); i >= 0; --i) {
1660 				f += at(static_cast<size_t>(i)) ? fbit : TargetFloat(0);
1661 				fbit *= TargetFloat(0.5);
1662 			}
1663 			blockbinary<es, bt> ebits;
1664 			exponent(ebits);
1665 			if constexpr (hasSubnormals) {
1666 				if (ebits.iszero()) {
1667 					// subnormals: (-1)^s * 2^(2-2^(es-1)) * (f/2^fbits))
1668 					TargetFloat exponentiation = TargetFloat(subnormal_exponent[es]); // precomputed values for 2^(2-2^(es-1))
1669 					v = exponentiation * f;  // f is already f/2^fbits
1670 					return sign() ? -v : v;
1671 				}
1672 			}
1673 			else {
1674 				if (ebits.iszero()) { // underflow to 0
1675 					// compiler fast float optimization might destroy the sign
1676 					return sign() ? -TargetFloat(0) : TargetFloat(0);
1677 				}
1678 			}
1679 			if constexpr (hasSupernormals) {
1680 				// regular: (-1)^s * 2^(e+1-2^(es-1)) * (1 + f/2^fbits))
1681 				int exponent = static_cast<int>(unsigned(ebits) - EXP_BIAS);
1682 				if (-64 < exponent && exponent < 64) {
1683 					TargetFloat exponentiation = (exponent >= 0 ? TargetFloat(1ull << exponent) : (1.0f / TargetFloat(1ull << -exponent)));
1684 					v = exponentiation * (TargetFloat(1.0) + f);
1685 				}
1686 				else {
1687 					double exponentiation = ipow(exponent);
1688 					v = TargetFloat(exponentiation * (1.0 + f));
1689 				}
1690 			}
1691 			else {
1692 				if (ebits.isallones()) {
1693 					// supernormals are mapped to quiet NaNs
1694 					v = std::numeric_limits<TargetFloat>::quiet_NaN();
1695 					return v;
1696 				}
1697 				else {
1698 					// regular: (-1)^s * 2^(e+1-2^(es-1)) * (1 + f/2^fbits))
1699 					int exponent = static_cast<int>(unsigned(ebits) - EXP_BIAS);
1700 					if (-64 < exponent && exponent < 64) {
1701 						TargetFloat exponentiation = (exponent >= 0 ? TargetFloat(1ull << exponent) : (1.0f / TargetFloat(1ull << -exponent)));
1702 						v = exponentiation * (TargetFloat(1.0) + f);
1703 					}
1704 					else {
1705 						double exponentiation = ipow(exponent);
1706 						v = TargetFloat(exponentiation * (1.0 + f));
1707 					}
1708 				}
1709 			}
1710 			v = sign() ? -v : v;
1711 		}
1712 		return v;
1713 	}
1714 
1715 	// make conversions to native types explicit
operator int() const1716 	explicit operator int() const { return to_long_long(); }
operator long long() const1717 	explicit operator long long() const { return to_long_long(); }
operator double() const1718 	explicit operator double() const { return to_native<double>(); }
operator float() const1719 	explicit operator float() const { return to_native<float>(); }
1720 
1721 	// convert a cfloat to a blocktriple with the fraction format 1.ffff
1722 	// we are using the same block type so that we can use block copies to move bits around.
1723 	// Since we tend to have at least two exponent bits, this will lead to
1724 	// most cfloat<->blocktriple cases being efficient as the block types are aligned.
1725 	// The relationship between the source cfloat and target blocktriple is not
1726 	// arbitrary, enforce it by setting the blocktriple fbits to the cfloat's (nbits - es - 1)
normalize(blocktriple<fbits,BlockTripleOperator::REPRESENTATION,bt> & tgt) const1727 	constexpr void normalize(blocktriple<fbits, BlockTripleOperator::REPRESENTATION, bt>& tgt) const {
1728 		// test special cases
1729 		if (isnan()) {
1730 			tgt.setnan();
1731 		}
1732 		else if (isinf()) {
1733 			tgt.setinf();
1734 		}
1735 		else if (iszero()) {
1736 			tgt.setzero();
1737 		}
1738 		else {
1739 			tgt.setnormal(); // a blocktriple is always normalized
1740 			int scale = this->scale();
1741 			tgt.setsign(sign());
1742 			tgt.setscale(scale);
1743 			// set significant
1744 			// we are going to unify to the format 01.ffffeeee
1745 			// where 'f' is a fraction bit, and 'e' is an extension bit
1746 			// so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
1747 			if (isnormal()) {
1748 				if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
1749 					uint64_t raw = fraction_ull();
1750 					raw |= (1ull << fbits);
1751 					tgt.setbits(raw);
1752 				}
1753 				else {
1754 					// brute force copy of blocks
1755 					if constexpr (1 == fBlocks) {
1756 						tgt.setblock(0, _block[0] & FSU_MASK);
1757 					}
1758 					else if constexpr (2 == fBlocks) {
1759 						tgt.setblock(0, _block[0]);
1760 						tgt.setblock(1, _block[1] & FSU_MASK);
1761 					}
1762 					else if constexpr (3 == fBlocks) {
1763 						tgt.setblock(0, _block[0]);
1764 						tgt.setblock(1, _block[1]);
1765 						tgt.setblock(2, _block[2] & FSU_MASK);
1766 					}
1767 					else if constexpr (4 == fBlocks) {
1768 						tgt.setblock(0, _block[0]);
1769 						tgt.setblock(1, _block[1]);
1770 						tgt.setblock(2, _block[2]);
1771 						tgt.setblock(3, _block[3] & FSU_MASK);
1772 					}
1773 					else {
1774 						for (size_t i = 0; i < FSU; ++i) {
1775 							tgt.setblock(i, _block[i]);
1776 						}
1777 						tgt.setblock(FSU, _block[FSU] & FSU_MASK);
1778 					}
1779 				}
1780 			}
1781 			else { // it is a subnormal encoding in this target cfloat
1782 				if constexpr (fbits < 64) {
1783 					uint64_t raw = fraction_ull();
1784 					int shift = MIN_EXP_NORMAL - scale;
1785 					raw <<= shift;
1786 					raw |= (1ull << fbits);
1787 					tgt.setbits(raw);
1788 				}
1789 				else {
1790 					// brute force copy of blocks
1791 					if constexpr (1 == fBlocks) {
1792 						tgt.setblock(0, _block[0] & FSU_MASK);
1793 					}
1794 					else if constexpr (2 == fBlocks) {
1795 						tgt.setblock(0, _block[0]);
1796 						tgt.setblock(1, _block[1] & FSU_MASK);
1797 					}
1798 					else if constexpr (3 == fBlocks) {
1799 						tgt.setblock(0, _block[0]);
1800 						tgt.setblock(1, _block[1]);
1801 						tgt.setblock(2, _block[2] & FSU_MASK);
1802 					}
1803 					else if constexpr (4 == fBlocks) {
1804 						tgt.setblock(0, _block[0]);
1805 						tgt.setblock(1, _block[1]);
1806 						tgt.setblock(2, _block[2]);
1807 						tgt.setblock(3, _block[3] & FSU_MASK);
1808 					}
1809 					else {
1810 						for (size_t i = 0; i < FSU; ++i) {
1811 							tgt.setblock(i, _block[i]);
1812 						}
1813 						tgt.setblock(FSU, _block[FSU] & FSU_MASK);
1814 					}
1815 				}
1816 			}
1817 		}
1818 	}
1819 
1820 	// normalize a cfloat to a blocktriple used in add/sub, which has the form 00h.fffff
1821 	// that is 3 + fbits, the 3 extra bits are required to be able to use 2's complement
1822 	// and capture the largest value of an addition/subtraction.
1823 	// TODO: currently abits = 2*fhbits as the worst case input argument size to
1824 	// capture the smallest normal value in aligned form. There is a faster/smaller
1825 	// implementation where the input is constrainted to just the round, guard, and sticky bits.
normalizeAddition(blocktriple<fbits,BlockTripleOperator::ADD,bt> & tgt) const1826 	constexpr void normalizeAddition(blocktriple<fbits, BlockTripleOperator::ADD, bt>& tgt) const {
1827 		// test special cases
1828 		if (isnan()) {
1829 			tgt.setnan();
1830 		}
1831 		else if (isinf()) {
1832 			tgt.setinf();
1833 		}
1834 		else if (iszero()) {
1835 			tgt.setzero();
1836 		}
1837 		else {
1838 			tgt.setnormal(); // a blocktriple is always normalized
1839 			int scale = this->scale();
1840 			tgt.setsign(sign());
1841 			tgt.setscale(scale);
1842 			// set significant
1843 			// we are going to unify to the format 001.ffffeeee
1844 			// where 'f' is a fraction bit, and 'e' is an extension bit
1845 			// so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
1846 			if (isnormal()) {
1847 				if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
1848 					uint64_t raw = fraction_ull();
1849 					raw |= (1ull << fbits); // add the hidden bit
1850 					tgt.setbits(raw);
1851 				}
1852 				else {
1853 					// brute force copy of blocks
1854 					if constexpr (1 == fBlocks) {
1855 						tgt.setblock(0, _block[0] & FSU_MASK);
1856 					}
1857 					else if constexpr (2 == fBlocks) {
1858 						tgt.setblock(0, _block[0]);
1859 						tgt.setblock(1, _block[1] & FSU_MASK);
1860 					}
1861 					else if constexpr (3 == fBlocks) {
1862 						tgt.setblock(0, _block[0]);
1863 						tgt.setblock(1, _block[1]);
1864 						tgt.setblock(2, _block[2] & FSU_MASK);
1865 					}
1866 					else if constexpr (4 == fBlocks) {
1867 						tgt.setblock(0, _block[0]);
1868 						tgt.setblock(1, _block[1]);
1869 						tgt.setblock(2, _block[2]);
1870 						tgt.setblock(3, _block[3] & FSU_MASK);
1871 					}
1872 					else if constexpr (5 == fBlocks) {
1873 						tgt.setblock(0, _block[0]);
1874 						tgt.setblock(1, _block[1]);
1875 						tgt.setblock(2, _block[2]);
1876 						tgt.setblock(3, _block[3]);
1877 						tgt.setblock(4, _block[4] & FSU_MASK);
1878 					}
1879 					else if constexpr (6 == fBlocks) {
1880 						tgt.setblock(0, _block[0]);
1881 						tgt.setblock(1, _block[1]);
1882 						tgt.setblock(2, _block[2]);
1883 						tgt.setblock(3, _block[3]);
1884 						tgt.setblock(4, _block[4]);
1885 						tgt.setblock(5, _block[5] & FSU_MASK);
1886 					}
1887 					else if constexpr (7 == fBlocks) {
1888 						tgt.setblock(0, _block[0]);
1889 						tgt.setblock(1, _block[1]);
1890 						tgt.setblock(2, _block[2]);
1891 						tgt.setblock(3, _block[3]);
1892 						tgt.setblock(4, _block[4]);
1893 						tgt.setblock(5, _block[5]);
1894 						tgt.setblock(6, _block[6] & FSU_MASK);
1895 					}
1896 					else if constexpr (8 == fBlocks) {
1897 						tgt.setblock(0, _block[0]);
1898 						tgt.setblock(1, _block[1]);
1899 						tgt.setblock(2, _block[2]);
1900 						tgt.setblock(3, _block[3]);
1901 						tgt.setblock(4, _block[4]);
1902 						tgt.setblock(5, _block[5]);
1903 						tgt.setblock(6, _block[6]);
1904 						tgt.setblock(7, _block[7] & FSU_MASK);
1905 					}
1906 					else {
1907 						for (size_t i = 0; i < FSU; ++i) {
1908 							tgt.setblock(i, _block[i]);
1909 						}
1910 						tgt.setblock(FSU, _block[FSU] & FSU_MASK);
1911 					}
1912 				}
1913 			}
1914 			else {
1915 				if (isdenormal()) { // it is a subnormal encoding in this target cfloat
1916 					if constexpr (hasSubnormals) {
1917 						if constexpr (fbits < 64) {
1918 							uint64_t raw = fraction_ull();
1919 							int shift = MIN_EXP_NORMAL - scale;
1920 							raw <<= shift; // shift but do NOT add a hidden bit as the MSB of the subnormal is shifted in the hidden bit position
1921 							tgt.setbits(raw);
1922 						}
1923 						else {
1924 							// brute force copy of blocks
1925 							if constexpr (1 == fBlocks) {
1926 								tgt.setblock(0, _block[0] & FSU_MASK);
1927 							}
1928 							else if constexpr (2 == fBlocks) {
1929 								tgt.setblock(0, _block[0]);
1930 								tgt.setblock(1, _block[1] & FSU_MASK);
1931 							}
1932 							else if constexpr (3 == fBlocks) {
1933 								tgt.setblock(0, _block[0]);
1934 								tgt.setblock(1, _block[1]);
1935 								tgt.setblock(2, _block[2] & FSU_MASK);
1936 							}
1937 							else if constexpr (4 == fBlocks) {
1938 								tgt.setblock(0, _block[0]);
1939 								tgt.setblock(1, _block[1]);
1940 								tgt.setblock(2, _block[2]);
1941 								tgt.setblock(3, _block[3] & FSU_MASK);
1942 							}
1943 							else if constexpr (5 == fBlocks) {
1944 								tgt.setblock(0, _block[0]);
1945 								tgt.setblock(1, _block[1]);
1946 								tgt.setblock(2, _block[2]);
1947 								tgt.setblock(3, _block[3]);
1948 								tgt.setblock(4, _block[4] & FSU_MASK);
1949 							}
1950 							else if constexpr (6 == fBlocks) {
1951 								tgt.setblock(0, _block[0]);
1952 								tgt.setblock(1, _block[1]);
1953 								tgt.setblock(2, _block[2]);
1954 								tgt.setblock(3, _block[3]);
1955 								tgt.setblock(4, _block[4]);
1956 								tgt.setblock(5, _block[5] & FSU_MASK);
1957 							}
1958 							else if constexpr (7 == fBlocks) {
1959 								tgt.setblock(0, _block[0]);
1960 								tgt.setblock(1, _block[1]);
1961 								tgt.setblock(2, _block[2]);
1962 								tgt.setblock(3, _block[3]);
1963 								tgt.setblock(4, _block[4]);
1964 								tgt.setblock(5, _block[5]);
1965 								tgt.setblock(6, _block[6] & FSU_MASK);
1966 							}
1967 							else if constexpr (8 == fBlocks) {
1968 								tgt.setblock(0, _block[0]);
1969 								tgt.setblock(1, _block[1]);
1970 								tgt.setblock(2, _block[2]);
1971 								tgt.setblock(3, _block[3]);
1972 								tgt.setblock(4, _block[4]);
1973 								tgt.setblock(5, _block[5]);
1974 								tgt.setblock(6, _block[6]);
1975 								tgt.setblock(7, _block[7] & FSU_MASK);
1976 							}
1977 							else {
1978 								for (size_t i = 0; i < FSU; ++i) {
1979 									tgt.setblock(i, _block[i]);
1980 								}
1981 								tgt.setblock(FSU, _block[FSU] & FSU_MASK);
1982 							}
1983 						}
1984 					}
1985 					else {  // this cfloat has no subnormals
1986 						tgt.setzero(tgt.sign()); // preserve the sign
1987 					}
1988 				}
1989 				else {
1990 					// by design, a cfloat is either normal, subnormal, or supernormal, so this else clause is by deduction covering a supernormal
1991 //					if (issupernormal()) { // it is a supernormal encoding
1992 						if constexpr (hasSupernormals) {
1993 							if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
1994 								uint64_t raw = fraction_ull();
1995 								raw |= (1ull << fbits); // add the hidden bit
1996 								tgt.setbits(raw);
1997 							}
1998 							else {
1999 								// brute force copy of blocks
2000 								if constexpr (1 == fBlocks) {
2001 									tgt.setblock(0, _block[0] & FSU_MASK);
2002 								}
2003 								else if constexpr (2 == fBlocks) {
2004 									tgt.setblock(0, _block[0]);
2005 									tgt.setblock(1, _block[1] & FSU_MASK);
2006 								}
2007 								else if constexpr (3 == fBlocks) {
2008 									tgt.setblock(0, _block[0]);
2009 									tgt.setblock(1, _block[1]);
2010 									tgt.setblock(2, _block[2] & FSU_MASK);
2011 								}
2012 								else if constexpr (4 == fBlocks) {
2013 									tgt.setblock(0, _block[0]);
2014 									tgt.setblock(1, _block[1]);
2015 									tgt.setblock(2, _block[2]);
2016 									tgt.setblock(3, _block[3] & FSU_MASK);
2017 								}
2018 								else if constexpr (5 == fBlocks) {
2019 									tgt.setblock(0, _block[0]);
2020 									tgt.setblock(1, _block[1]);
2021 									tgt.setblock(2, _block[2]);
2022 									tgt.setblock(3, _block[3]);
2023 									tgt.setblock(4, _block[4] & FSU_MASK);
2024 								}
2025 								else if constexpr (6 == fBlocks) {
2026 									tgt.setblock(0, _block[0]);
2027 									tgt.setblock(1, _block[1]);
2028 									tgt.setblock(2, _block[2]);
2029 									tgt.setblock(3, _block[3]);
2030 									tgt.setblock(4, _block[4]);
2031 									tgt.setblock(5, _block[5] & FSU_MASK);
2032 								}
2033 								else if constexpr (7 == fBlocks) {
2034 									tgt.setblock(0, _block[0]);
2035 									tgt.setblock(1, _block[1]);
2036 									tgt.setblock(2, _block[2]);
2037 									tgt.setblock(3, _block[3]);
2038 									tgt.setblock(4, _block[4]);
2039 									tgt.setblock(5, _block[5]);
2040 									tgt.setblock(6, _block[6] & FSU_MASK);
2041 								}
2042 								else if constexpr (8 == fBlocks) {
2043 									tgt.setblock(0, _block[0]);
2044 									tgt.setblock(1, _block[1]);
2045 									tgt.setblock(2, _block[2]);
2046 									tgt.setblock(3, _block[3]);
2047 									tgt.setblock(4, _block[4]);
2048 									tgt.setblock(5, _block[5]);
2049 									tgt.setblock(6, _block[6]);
2050 									tgt.setblock(7, _block[7] & FSU_MASK);
2051 								}
2052 								else {
2053 									for (size_t i = 0; i < FSU; ++i) {
2054 										tgt.setblock(i, _block[i]);
2055 									}
2056 									tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2057 								}
2058 							}
2059 						}
2060 						else {  // this cfloat has no supernormals and thus this represents a nan, signalling or quiet determined by the sign
2061 							tgt.setnan(tgt.sign());
2062 						}
2063 //					}
2064 				}
2065 			}
2066 		}
2067 	}
2068 
2069 	// Normalize a cfloat to a blocktriple used in mul, which has the form 0'00001.fffff
2070 	// that is 2*fbits, plus 1 overflow bit, and the radix set at <fbits>.
2071 	// The result radix will go to 2*fbits after multiplication.
normalizeMultiplication(blocktriple<fbits,BlockTripleOperator::MUL,bt> & tgt) const2072 	constexpr void normalizeMultiplication(blocktriple<fbits, BlockTripleOperator::MUL, bt>& tgt) const {
2073 		// test special cases
2074 		if (isnan()) {
2075 			tgt.setnan();
2076 		}
2077 		else if (isinf()) {
2078 			tgt.setinf();
2079 		}
2080 		else if (iszero()) {
2081 			tgt.setzero();
2082 		}
2083 		else {
2084 			tgt.setnormal(); // a blocktriple is always normalized
2085 			int scale = this->scale();
2086 			tgt.setsign(sign());
2087 			tgt.setscale(scale);
2088 			tgt.setradix(fbits);
2089 			// set significant
2090 			// we are going to unify to the format 01.ffffeeee
2091 			// where 'f' is a fraction bit, and 'e' is an extension bit
2092 			// so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2093 			if (isnormal()) {
2094 				if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
2095 					uint64_t raw = fraction_ull();
2096 					raw |= (1ull << fbits);
2097 					tgt.setbits(raw);
2098 				}
2099 				else {
2100 					// brute force copy of blocks
2101 					if constexpr (1 == fBlocks) {
2102 						tgt.setblock(0, _block[0] & FSU_MASK);
2103 					}
2104 					else if constexpr (2 == fBlocks) {
2105 						tgt.setblock(0, _block[0]);
2106 						tgt.setblock(1, _block[1] & FSU_MASK);
2107 					}
2108 					else if constexpr (3 == fBlocks) {
2109 						tgt.setblock(0, _block[0]);
2110 						tgt.setblock(1, _block[1]);
2111 						tgt.setblock(2, _block[2] & FSU_MASK);
2112 					}
2113 					else if constexpr (4 == fBlocks) {
2114 						tgt.setblock(0, _block[0]);
2115 						tgt.setblock(1, _block[1]);
2116 						tgt.setblock(2, _block[2]);
2117 						tgt.setblock(3, _block[3] & FSU_MASK);
2118 					}
2119 					else if constexpr (5 == fBlocks) {
2120 						tgt.setblock(0, _block[0]);
2121 						tgt.setblock(1, _block[1]);
2122 						tgt.setblock(2, _block[2]);
2123 						tgt.setblock(3, _block[3]);
2124 						tgt.setblock(4, _block[4] & FSU_MASK);
2125 					}
2126 					else if constexpr (6 == fBlocks) {
2127 						tgt.setblock(0, _block[0]);
2128 						tgt.setblock(1, _block[1]);
2129 						tgt.setblock(2, _block[2]);
2130 						tgt.setblock(3, _block[3]);
2131 						tgt.setblock(4, _block[4]);
2132 						tgt.setblock(5, _block[5] & FSU_MASK);
2133 					}
2134 					else if constexpr (7 == fBlocks) {
2135 						tgt.setblock(0, _block[0]);
2136 						tgt.setblock(1, _block[1]);
2137 						tgt.setblock(2, _block[2]);
2138 						tgt.setblock(3, _block[3]);
2139 						tgt.setblock(4, _block[4]);
2140 						tgt.setblock(5, _block[5]);
2141 						tgt.setblock(6, _block[6] & FSU_MASK);
2142 					}
2143 					else if constexpr (8 == fBlocks) {
2144 						tgt.setblock(0, _block[0]);
2145 						tgt.setblock(1, _block[1]);
2146 						tgt.setblock(2, _block[2]);
2147 						tgt.setblock(3, _block[3]);
2148 						tgt.setblock(4, _block[4]);
2149 						tgt.setblock(5, _block[5]);
2150 						tgt.setblock(6, _block[6]);
2151 						tgt.setblock(7, _block[7] & FSU_MASK);
2152 					}
2153 					else {
2154 						for (size_t i = 0; i < FSU; ++i) {
2155 							tgt.setblock(i, _block[i]);
2156 						}
2157 						tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2158 					}
2159 				}
2160 			}
2161 			else {
2162 				if (isdenormal()) { // it is a subnormal encoding in this target cfloat
2163 					if constexpr (hasSubnormals) {
2164 						if constexpr (fbits < 64) {
2165 							uint64_t raw = fraction_ull();
2166 							int shift = MIN_EXP_NORMAL - scale;
2167 							raw <<= shift;
2168 							raw |= (1ull << fbits);
2169 							tgt.setbits(raw);
2170 						}
2171 						else {
2172 							// brute force copy of blocks
2173 							if constexpr (1 == fBlocks) {
2174 								tgt.setblock(0, _block[0] & FSU_MASK);
2175 							}
2176 							else if constexpr (2 == fBlocks) {
2177 								tgt.setblock(0, _block[0]);
2178 								tgt.setblock(1, _block[1] & FSU_MASK);
2179 							}
2180 							else if constexpr (3 == fBlocks) {
2181 								tgt.setblock(0, _block[0]);
2182 								tgt.setblock(1, _block[1]);
2183 								tgt.setblock(2, _block[2] & FSU_MASK);
2184 							}
2185 							else if constexpr (4 == fBlocks) {
2186 								tgt.setblock(0, _block[0]);
2187 								tgt.setblock(1, _block[1]);
2188 								tgt.setblock(2, _block[2]);
2189 								tgt.setblock(3, _block[3] & FSU_MASK);
2190 							}
2191 							else if constexpr (5 == fBlocks) {
2192 								tgt.setblock(0, _block[0]);
2193 								tgt.setblock(1, _block[1]);
2194 								tgt.setblock(2, _block[2]);
2195 								tgt.setblock(3, _block[3]);
2196 								tgt.setblock(4, _block[4] & FSU_MASK);
2197 							}
2198 							else if constexpr (6 == fBlocks) {
2199 								tgt.setblock(0, _block[0]);
2200 								tgt.setblock(1, _block[1]);
2201 								tgt.setblock(2, _block[2]);
2202 								tgt.setblock(3, _block[3]);
2203 								tgt.setblock(4, _block[4]);
2204 								tgt.setblock(5, _block[5] & FSU_MASK);
2205 							}
2206 							else if constexpr (7 == fBlocks) {
2207 								tgt.setblock(0, _block[0]);
2208 								tgt.setblock(1, _block[1]);
2209 								tgt.setblock(2, _block[2]);
2210 								tgt.setblock(3, _block[3]);
2211 								tgt.setblock(4, _block[4]);
2212 								tgt.setblock(5, _block[5]);
2213 								tgt.setblock(6, _block[6] & FSU_MASK);
2214 							}
2215 							else if constexpr (8 == fBlocks) {
2216 								tgt.setblock(0, _block[0]);
2217 								tgt.setblock(1, _block[1]);
2218 								tgt.setblock(2, _block[2]);
2219 								tgt.setblock(3, _block[3]);
2220 								tgt.setblock(4, _block[4]);
2221 								tgt.setblock(5, _block[5]);
2222 								tgt.setblock(6, _block[6]);
2223 								tgt.setblock(7, _block[7] & FSU_MASK);
2224 							}
2225 							else {
2226 								for (size_t i = 0; i < FSU; ++i) {
2227 									tgt.setblock(i, _block[i]);
2228 								}
2229 								tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2230 							}
2231 						}
2232 					}
2233 					else { // this cfloat has no subnormals
2234 						tgt.setzero(tgt.sign()); // preserve the sign
2235 					}
2236 				}
2237 				else {
2238 					// by design, a cfloat is either normal, subnormal, or supernormal, so this else clause is by deduction covering a supernormal
2239 					if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
2240 						uint64_t raw = fraction_ull();
2241 						raw |= (1ull << fbits);
2242 						tgt.setbits(raw);
2243 					}
2244 					else {
2245 						// brute force copy of blocks
2246 						if constexpr (1 == fBlocks) {
2247 							tgt.setblock(0, _block[0] & FSU_MASK);
2248 						}
2249 						else if constexpr (2 == fBlocks) {
2250 							tgt.setblock(0, _block[0]);
2251 							tgt.setblock(1, _block[1] & FSU_MASK);
2252 						}
2253 						else if constexpr (3 == fBlocks) {
2254 							tgt.setblock(0, _block[0]);
2255 							tgt.setblock(1, _block[1]);
2256 							tgt.setblock(2, _block[2] & FSU_MASK);
2257 						}
2258 						else if constexpr (4 == fBlocks) {
2259 							tgt.setblock(0, _block[0]);
2260 							tgt.setblock(1, _block[1]);
2261 							tgt.setblock(2, _block[2]);
2262 							tgt.setblock(3, _block[3] & FSU_MASK);
2263 						}
2264 						else if constexpr (5 == fBlocks) {
2265 							tgt.setblock(0, _block[0]);
2266 							tgt.setblock(1, _block[1]);
2267 							tgt.setblock(2, _block[2]);
2268 							tgt.setblock(3, _block[3]);
2269 							tgt.setblock(4, _block[4] & FSU_MASK);
2270 						}
2271 						else if constexpr (6 == fBlocks) {
2272 							tgt.setblock(0, _block[0]);
2273 							tgt.setblock(1, _block[1]);
2274 							tgt.setblock(2, _block[2]);
2275 							tgt.setblock(3, _block[3]);
2276 							tgt.setblock(4, _block[4]);
2277 							tgt.setblock(5, _block[5] & FSU_MASK);
2278 						}
2279 						else if constexpr (7 == fBlocks) {
2280 							tgt.setblock(0, _block[0]);
2281 							tgt.setblock(1, _block[1]);
2282 							tgt.setblock(2, _block[2]);
2283 							tgt.setblock(3, _block[3]);
2284 							tgt.setblock(4, _block[4]);
2285 							tgt.setblock(5, _block[5]);
2286 							tgt.setblock(6, _block[6] & FSU_MASK);
2287 						}
2288 						else if constexpr (8 == fBlocks) {
2289 							tgt.setblock(0, _block[0]);
2290 							tgt.setblock(1, _block[1]);
2291 							tgt.setblock(2, _block[2]);
2292 							tgt.setblock(3, _block[3]);
2293 							tgt.setblock(4, _block[4]);
2294 							tgt.setblock(5, _block[5]);
2295 							tgt.setblock(6, _block[6]);
2296 							tgt.setblock(7, _block[7] & FSU_MASK);
2297 						}
2298 						else {
2299 							for (size_t i = 0; i < FSU; ++i) {
2300 								tgt.setblock(i, _block[i]);
2301 							}
2302 							tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2303 						}
2304 					}
2305 
2306 				}
2307 			}
2308 		}
2309 	}
2310 
2311 	// normalize a cfloat to a blocktriple used in div, which has the form 0'00000'00001.fffff
2312 	// that is 3*fbits, plus 1 overflow bit, and the radix set at <fbits>.
2313 	// the result radix will go to 2*fbits after multiplication.
2314 	// TODO: needs implementation
normalizeDivision(blocktriple<fbits,BlockTripleOperator::DIV,bt> & tgt) const2315 	constexpr void normalizeDivision(blocktriple<fbits, BlockTripleOperator::DIV, bt>& tgt) const {
2316 		// test special cases
2317 		if (isnan()) {
2318 			tgt.setnan();
2319 		}
2320 		else if (isinf()) {
2321 			tgt.setinf();
2322 		}
2323 		else if (iszero()) {
2324 			tgt.setzero();
2325 		}
2326 		else {
2327 			tgt.setnormal(); // a blocktriple is always normalized
2328 			int scale = this->scale();
2329 			tgt.setsign(sign());
2330 			tgt.setscale(scale);
2331 			// set significant
2332 			// we are going to unify to the format 01.ffffeeee
2333 			// where 'f' is a fraction bit, and 'e' is an extension bit
2334 			// so that normalize can be used to generate blocktriples for add/sub/mul/div/sqrt
2335 			if (isnormal()) {
2336 				if constexpr (fbits < 64) { // max 63 bits of fraction to yield 64bit of raw significant bits
2337 					uint64_t raw = fraction_ull();
2338 					raw |= (1ull << fbits);
2339 					tgt.setbits(raw);
2340 				}
2341 				else {
2342 					// brute force copy of blocks
2343 					if constexpr (1 == fBlocks) {
2344 						tgt.setblock(0, _block[0] & FSU_MASK);
2345 					}
2346 					else if constexpr (2 == fBlocks) {
2347 						tgt.setblock(0, _block[0]);
2348 						tgt.setblock(1, _block[1] & FSU_MASK);
2349 					}
2350 					else if constexpr (3 == fBlocks) {
2351 						tgt.setblock(0, _block[0]);
2352 						tgt.setblock(1, _block[1]);
2353 						tgt.setblock(2, _block[2] & FSU_MASK);
2354 					}
2355 					else if constexpr (4 == fBlocks) {
2356 						tgt.setblock(0, _block[0]);
2357 						tgt.setblock(1, _block[1]);
2358 						tgt.setblock(2, _block[2]);
2359 						tgt.setblock(3, _block[3] & FSU_MASK);
2360 					}
2361 					else {
2362 						for (size_t i = 0; i < FSU; ++i) {
2363 							tgt.setblock(i, _block[i]);
2364 						}
2365 						tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2366 					}
2367 				}
2368 			}
2369 			else { // it is a subnormal encoding in this target cfloat
2370 				if constexpr (fbits < 64) {
2371 					uint64_t raw = fraction_ull();
2372 					int shift = MIN_EXP_NORMAL - scale;
2373 					raw <<= shift;
2374 					raw |= (1ull << fbits);
2375 					tgt.setbits(raw);
2376 				}
2377 				else {
2378 					// brute force copy of blocks
2379 					if constexpr (1 == fBlocks) {
2380 						tgt.setblock(0, _block[0] & FSU_MASK);
2381 					}
2382 					else if constexpr (2 == fBlocks) {
2383 						tgt.setblock(0, _block[0]);
2384 						tgt.setblock(1, _block[1] & FSU_MASK);
2385 					}
2386 					else if constexpr (3 == fBlocks) {
2387 						tgt.setblock(0, _block[0]);
2388 						tgt.setblock(1, _block[1]);
2389 						tgt.setblock(2, _block[2] & FSU_MASK);
2390 					}
2391 					else if constexpr (4 == fBlocks) {
2392 						tgt.setblock(0, _block[0]);
2393 						tgt.setblock(1, _block[1]);
2394 						tgt.setblock(2, _block[2]);
2395 						tgt.setblock(3, _block[3] & FSU_MASK);
2396 					}
2397 					else {
2398 						for (size_t i = 0; i < FSU; ++i) {
2399 							tgt.setblock(i, _block[i]);
2400 						}
2401 						tgt.setblock(FSU, _block[FSU] & FSU_MASK);
2402 					}
2403 				}
2404 			}
2405 		}
2406 	}
2407 
2408 protected:
2409 	// HELPER methods
2410 
2411 	// convert an unsigned integer into a cfloat
2412 	// TODO: this method does not protect against being called with a signed integer
2413 	template<typename Ty>
convert_unsigned_integer(const Ty & rhs)2414 	constexpr cfloat& convert_unsigned_integer(const Ty& rhs) noexcept {
2415 		clear();
2416 		if (0 == rhs) return *this;
2417 		uint64_t raw = static_cast<uint64_t>(rhs);
2418 		int exponent = int(findMostSignificantBit(raw)) - 1; // precondition that msb > 0 is satisfied by the zero test above
2419 		constexpr uint32_t sizeInBits = 8 * sizeof(Ty);
2420 		uint32_t shift = sizeInBits - exponent - 1;
2421 		raw <<= shift;
2422 		raw = round<sizeInBits, uint64_t>(raw, exponent);
2423 		return *this;
2424 	}
2425 	// convert a signed integer into a cfloat
2426 	// TODO: this method does not protect against being called with a signed integer
2427 	template<typename Ty>
convert_signed_integer(const Ty & rhs)2428 	constexpr cfloat& convert_signed_integer(const Ty& rhs) noexcept {
2429 		clear();
2430 		if (0 == rhs) return *this;
2431 		bool s = (rhs < 0);
2432 		uint64_t raw = static_cast<uint64_t>(s ? -rhs : rhs);
2433 		int exponent = int(findMostSignificantBit(raw)) - 1; // precondition that msb > 0 is satisfied by the zero test above
2434 		constexpr uint32_t sizeInBits = 8 * sizeof(Ty);
2435 		uint32_t shift = sizeInBits - exponent - 1;
2436 		raw <<= shift;
2437 		raw = round<sizeInBits, uint64_t>(raw, exponent);
2438 #ifdef TODO
2439 		// construct the target cfloat
2440 		if constexpr (64 >= nbits - es - 1ull) {
2441 			uint64_t bits = (s ? 1u : 0u);
2442 			bits <<= es;
2443 			bits |= exponent + EXP_BIAS;
2444 			bits <<= nbits - 1ull - es;
2445 			bits |= raw;
2446 			bits &= 0xFFFF'FFFF;
2447 			if constexpr (1 == nrBlocks) {
2448 				_block[MSU] = bt(bits);
2449 			}
2450 			else {
2451 				copyBits(bits);
2452 			}
2453 		}
2454 		else {
2455 			std::cerr << "TBD\n";
2456 		}
2457 #endif
2458 		return *this;
2459 	}
2460 
2461 public:
2462 	template<typename Real>
convert_ieee754(Real rhs)2463 	CONSTEXPRESSION cfloat& convert_ieee754(Real rhs) noexcept {
2464 		// when we are a perfect match to single precision IEEE-754
2465 		// take the bits verbatim as a cfloat is a superset of IEEE-754 in all configurations
2466 		if constexpr (nbits == 32 && es == 8) {
2467 			bool s{ false };
2468 			uint64_t rawExponent{ 0 };
2469 			uint64_t rawFraction{ 0 };
2470 			// use native conversion
2471 			extractFields(float(rhs), s, rawExponent, rawFraction);
2472 			uint64_t raw{ s ? 1ull : 0ull };
2473 			raw <<= 31;
2474 			raw |= (rawExponent << fbits);
2475 			raw |= rawFraction;
2476 			setbits(raw);
2477 			return *this;
2478 		}
2479 		// when we are a perfect match to double precision IEEE-754
2480 		else if constexpr (nbits == 64 && es == 11) {
2481 			bool s{ false };
2482 			uint64_t rawExponent{ 0 };
2483 			uint64_t rawFraction{ 0 };
2484 			// use native conversion
2485 			extractFields(double(rhs), s, rawExponent, rawFraction);
2486 			uint64_t raw{ s ? 1ull : 0ull };
2487 			raw <<= 63;
2488 			raw |= (rawExponent << fbits);
2489 			raw |= rawFraction;
2490 			setbits(raw);
2491 			return *this;
2492 		}
2493 		else {
2494 			clear();
2495 			// extract raw IEEE-754 bits
2496 			bool s{ false };
2497 			uint64_t rawExponent{ 0 };
2498 			uint64_t rawFraction{ 0 };
2499 			extractFields(rhs, s, rawExponent, rawFraction);
2500 
2501 			// special case handling
2502 			if (rawExponent == ieee754_parameter<Real>::eallset) { // nan and inf
2503 				if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::snanmask) ||
2504 					rawFraction == (ieee754_parameter<Real>::fmask & (ieee754_parameter<Real>::qnanmask | ieee754_parameter<Real>::snanmask))) {
2505 					// 1.11111111.00000000.......00000001 signalling nan
2506 					// 0.11111111.00000000000000000000001 signalling nan
2507 					// MSVC
2508 					// 1.11111111.10000000.......00000001 signalling nan
2509 					// 0.11111111.10000000.......00000001 signalling nan
2510 					setnan(NAN_TYPE_SIGNALLING);
2511 					return *this;
2512 				}
2513 				if (rawFraction == (ieee754_parameter<Real>::fmask & ieee754_parameter<Real>::qnanmask)) {
2514 					// 1.11111111.10000000.......00000000 quiet nan
2515 					// 0.11111111.10000000.......00000000 quiet nan
2516 					setnan(NAN_TYPE_QUIET);
2517 					return *this;
2518 				}
2519 				if (rawFraction == 0ull) {
2520 					// 1.11111111.0000000.......000000000 -inf
2521 					// 0.11111111.0000000.......000000000 +inf
2522 					setinf(s);
2523 					return *this;
2524 				}
2525 			}
2526 			if (rhs == 0.0) { // IEEE rule: this is valid for + and - 0.0
2527 				setbit(nbits - 1ull, s);
2528 				return *this;
2529 			}
2530 			// normal number consists of fbits fraction bits and one hidden bit
2531 			// subnormal number has no hidden bit
2532 			int exponent = static_cast<int>(rawExponent) - ieee754_parameter<Real>::bias;  // unbias the exponent
2533 
2534 			// check special case of
2535 			//  1- saturating to maxpos/maxneg, or
2536 			//  2- projecting to +-inf
2537 			// if the value is out of range.
2538 			//
2539 			// One problem here is that at the rounding cusps of maxpos <-> inf <-> nan
2540 			// you need to go through the rounding logic to know which encoding you end up
2541 			// with.
2542 			// For each specific cfloat configuration, you can work out these rounding cusps
2543 			// but they need to go through the value transformation to map them back to native
2544 			// IEEE-754. That is a complex computation to do as a static constexpr as you need
2545 			// to construct the value, then evaluate it, and store it.
2546 			//
2547 			// The algorithm used here is to check for the obvious out of range values by
2548 			// comparing their scale to the max scale this cfloat encoding can represent.
2549 			// For the rounding cusps, we go through the rounding logic, and then clean up
2550 			// after rounding using the observation that no conversion from a value can ever
2551 			// yield the NaN encoding.
2552 			//
2553 			// The rounding logic will correctly sort between maxpos and inf, and we clean
2554 			// up any NaN encodings by projecting back to the configuration's saturation rule.
2555 			//
2556 			// We could improve on this by creating the database of rounding cusps and
2557 			// referencing them with a direct value comparison with the input. That would be
2558 			// the most performant implementation.
2559 			if (exponent > MAX_EXP) {
2560 				if constexpr (isSaturating) {
2561 					if (s) this->maxneg(); else this->maxpos(); // saturate to maxpos or maxneg
2562 				}
2563 				else {
2564 					setinf(s);
2565 				}
2566 				return *this;
2567 			}
2568 			if constexpr (hasSubnormals) {
2569 				if (exponent < MIN_EXP_SUBNORMAL - 1) {
2570 					// map to +-0 any values that have a scale less than (MIN_EXP_SUBMORNAL - 1)
2571 					this->setbit(nbits - 1, s);
2572 					return *this;
2573 				}
2574 			}
2575 			else {
2576 				if (exponent < MIN_EXP_NORMAL - 1) {
2577 					// map to +-0 any values that have a scale less than (MIN_EXP_MORNAL - 1)
2578 					this->setbit(nbits - 1, s);
2579 					return *this;
2580 				}
2581 			}
2582 
2583 			/////////////////
2584 			/// end of special case processing, move on to value sampling and rounding
2585 
2586 #if TRACE_CONVERSION
2587 			std::cout << '\n';
2588 			std::cout << "value             : " << rhs << '\n';
2589 			std::cout << "segments          : " << to_binary(rhs) << '\n';
2590 			std::cout << "sign     bit      : " << (s ? '1' : '0') << '\n';
2591 			std::cout << "exponent bits     : " << to_binary(rawExponent, ieee754_parameter<Real>::ebits, true) << '\n';
2592 			std::cout << "fraction bits     : " << to_binary(rawFraction, ieee754_parameter<Real>::fbits, true) << std::endl;
2593 			std::cout << "exponent value    : " << exponent << '\n';
2594 #endif
2595 
2596 			// do the following scenarios have different rounding bits?
2597 			// input is normal, cfloat is normal           <-- rounding can happen with native ieee-754 bits
2598 			// input is normal, cfloat is subnormal
2599 			// input is subnormal, cfloat is normal
2600 			// input is subnormal, cfloat is subnormal
2601 
2602 			// The first condition is the relationship between the number
2603 			// of fraction bits from the source and the number of fraction bits
2604 			// in the target cfloat: these are constexpressions and guard the shifts
2605 			// input fbits >= cfloat fbits                 <-- need to round
2606 			// input fbits < cfloat fbits                  <-- no need to round
2607 
2608 			if constexpr (fbits < ieee754_parameter<Real>::fbits) {
2609 				// this is the common case for cfloats that are smaller in size compared to single and double precision IEEE-754
2610 				constexpr int rightShift = ieee754_parameter<Real>::fbits - fbits; // this is the bit shift to get the MSB of the src to the MSB of the tgt
2611 				uint32_t biasedExponent{ 0 };
2612 				int adjustment{ 0 }; // right shift adjustment for subnormal representation
2613 				uint64_t mask;
2614 				if (rawExponent != 0) {
2615 					// the source real is a normal number,
2616 					if (exponent >= (MIN_EXP_SUBNORMAL - 1) && exponent < MIN_EXP_NORMAL) {
2617 						// the value is a subnormal number in this representation: biasedExponent = 0
2618 						// add the hidden bit to the fraction bits so the denormalization has the correct MSB
2619 						rawFraction |= ieee754_parameter<Real>::hmask;
2620 
2621 						// fraction processing: we have 1 hidden + 23 explicit fraction bits
2622 						// f = 1.ffff 2^exponent * 2^fbits * 2^-(2-2^(es-1)) = 1.ff...ff >> (23 - (-exponent + fbits - (2 -2^(es-1))))
2623 						// -exponent because we are right shifting and exponent in this range is negative
2624 						adjustment = -(exponent + subnormal_reciprocal_shift[es]);
2625 						// this is the right shift adjustment required for subnormal representation due
2626 						// to the scale of the input number, i.e. the exponent of 2^-adjustment
2627 					}
2628 					else {
2629 						// the value is a normal number in this representation: common case
2630 						biasedExponent = static_cast<uint32_t>(exponent + EXP_BIAS); // project the exponent into the target
2631 						// fraction processing
2632 						// float structure is: seee'eeee'efff'ffff'ffff'ffff'ffff'ffff, s = sign, e - exponent bit, f = fraction bit
2633 						// target structure is for example cfloat<8,2>: seef'ffff
2634 						// since both are normals, we can shift the incoming fraction to the target structure bits, and round
2635 						// MSB of source = 23 - 1, MSB of target = fbits - 1: shift = MSB of src - MSB of tgt => 23 - fbits
2636 						adjustment = 0;
2637 					}
2638 					if constexpr (rightShift > 0) {
2639 						// if true we need to round
2640 						// round-to-even logic
2641 						//  ... lsb | guard  round sticky   round
2642 						//       x     0       x     x       down
2643 						//       0     1       0     0       down  round to even
2644 						//       1     1       0     0        up   round to even
2645 						//       x     1       0     1        up
2646 						//       x     1       1     0        up
2647 						//       x     1       1     1        up
2648 						// collect lsb, guard, round, and sticky bits
2649 //						std::cout << "shift to LSB " << (rightShift + adjustment) << " adjustment = " << adjustment << '\n';
2650 						mask = (1ull << (rightShift + adjustment)); // bit mask for the lsb bit
2651 						bool lsb = (mask & rawFraction);
2652 						mask >>= 1;
2653 						bool guard = (mask & rawFraction);
2654 						mask >>= 1;
2655 						bool round = (mask & rawFraction);
2656 						if constexpr (rightShift > 1) {
2657 							mask = (0xFFFF'FFFF'FFFF'FFFFull << (rightShift - 2));
2658 							mask = ~mask;
2659 						}
2660 						else {
2661 							mask = 0;
2662 						}
2663 						bool sticky = (mask & rawFraction);
2664 						rawFraction >>= (static_cast<int64_t>(rightShift) + static_cast<int64_t>(adjustment));
2665 
2666 						// execute rounding operation
2667 						if (guard) {
2668 							if (lsb && (!round && !sticky)) ++rawFraction; // round to even
2669 							if (round || sticky) ++rawFraction;
2670 							if (rawFraction == (1ull << fbits)) { // overflow
2671 								if (biasedExponent == ALL_ONES_ES) { // overflow to INF == .111..01
2672 									rawFraction = INF_ENCODING;
2673 								}
2674 								else {
2675 									++biasedExponent;
2676 									rawFraction = 0;
2677 								}
2678 							}
2679 						}
2680 #if TRACE_CONVERSION
2681 						std::cout << "lsb               : " << (lsb ? "1\n" : "0\n");
2682 						std::cout << "guard             : " << (guard ? "1\n" : "0\n");
2683 						std::cout << "round             : " << (round ? "1\n" : "0\n");
2684 						std::cout << "sticky            : " << (sticky ? "1\n" : "0\n");
2685 						std::cout << "rounding decision : " << (lsb && (!round && !sticky) ? "round to even\n" : "-\n");
2686 						std::cout << "rounding direction: " << (round || sticky ? "round up\n" : "round down\n");
2687 #endif
2688 					}
2689 					else { // all bits of the float go into this representation and need to be shifted up, no rounding necessary
2690 						int shiftLeft = fbits - ieee754_parameter<Real>::fbits;
2691 						rawFraction <<= shiftLeft;
2692 					}
2693 #if TRACE_CONVERSION
2694 					std::cout << "biased exponent   : " << biasedExponent << " : 0x" << std::hex << biasedExponent << std::dec << '\n';
2695 					std::cout << "right shift       : " << rightShift << '\n';
2696 					std::cout << "adjustment shift  : " << adjustment << '\n';
2697 					std::cout << "sticky bit mask   : " << to_binary(mask, 32, true) << '\n';
2698 					std::cout << "fraction bits     : " << to_binary(rawFraction, 32, true) << '\n';
2699 #endif
2700 					// construct the target cfloat
2701 					uint64_t bits = (s ? 1ull : 0ull);
2702 					bits <<= es;
2703 					bits |= biasedExponent;
2704 					bits <<= fbits;
2705 					bits |= rawFraction;
2706 					setbits(bits);
2707 				}
2708 				else {
2709 					// the source real is a subnormal number
2710 					mask = 0x00FF'FFFFu >> (fbits + exponent + subnormal_reciprocal_shift[es] + 1); // mask for sticky bit
2711 
2712 					// fraction processing: we have fbits+1 bits = 1 hidden + fbits explicit fraction bits
2713 					// f = 1.ffff  2^exponent * 2^fbits * 2^-(2-2^(es-1)) = 1.ff...ff >> (23 - (-exponent + fbits - (2 -2^(es-1))))
2714 					// -exponent because we are right shifting and exponent in this range is negative
2715 					adjustment = -(exponent + subnormal_reciprocal_shift[es]); // this is the right shift adjustment due to the scale of the input number, i.e. the exponent of 2^-adjustment
2716 
2717 					std::cout << "source is subnormal: TBD\n";
2718 					std::cout << "shift to LSB    " << (rightShift + adjustment) << '\n';
2719 					std::cout << "adjustment      " << adjustment << '\n';
2720 					std::cout << "exponent        " << exponent << '\n';
2721 					std::cout << "subnormal shift " << subnormal_reciprocal_shift[es] << '\n';
2722 
2723 					if (exponent >= (MIN_EXP_SUBNORMAL - 1) && exponent < MIN_EXP_NORMAL) {
2724 						// the value is a subnormal number in this representation
2725 					}
2726 					else {
2727 						// the value is a normal number in this representation
2728 					}
2729 				}
2730 			}
2731 			else {
2732 				// no need to round, but we need to shift left to deliver the bits
2733 				// cfloat<40,  8> = float
2734 				// cfloat<48,  9> = float
2735 				// cfloat<56, 10> = float
2736 				// cfloat<64, 11> = float
2737 				// cfloat<64, 10> = double
2738 				// can we go from an input subnormal to a cfloat normal?
2739 				// yes, for example a cfloat<64,11> assigned to a subnormal float
2740 
2741 				// map exponent into target cfloat encoding
2742 				uint64_t biasedExponent = static_cast<uint64_t>(static_cast<int64_t>(exponent) + EXP_BIAS);
2743 				constexpr int upshift = fbits - ieee754_parameter<Real>::fbits;
2744 				// output processing
2745 				if constexpr (nbits < 65) {
2746 					// we can compose the bits in a native 64-bit unsigned integer
2747 					// common case: normal to normal
2748 					// nbits = 40, es = 8, fbits = 31: rhs = float fbits = 23; shift left by (31 - 23) = 8
2749 
2750 					if (rawExponent != 0) {
2751 						// rhs is a normal encoding
2752 						uint64_t bits{ s ? 1ull : 0ull };
2753 						bits <<= es;
2754 						bits |= biasedExponent;
2755 						bits <<= fbits;
2756 						rawFraction <<= upshift;
2757 						bits |= rawFraction;
2758 						setbits(bits);
2759 					}
2760 					else {
2761 						// rhs is a subnormal
2762 	//					std::cerr << "rhs is a subnormal : " << to_binary(rhs) << " : " << rhs << '\n';
2763 						// we need to calculate the effective scale to see
2764 						// if this value becomes a normal, or maps to a subnormal encoding
2765 						// in this target format
2766 					}
2767 				}
2768 				else {
2769 					// we need to write and shift bits into place
2770 					// use cases are cfloats like cfloat<80, 11, bt>
2771 					// even though the bits that come in are single or double precision
2772 					// we need to write the fields and then shifting them in place
2773 					//
2774 					// common case: normal to normal
2775 					if (rawExponent != 0) {
2776 						// nbits = 128, es = 15, fbits = 112: rhs = float: shift left by (112 - 23) = 89
2777 						setbits(biasedExponent);
2778 						shiftLeft(fbits);
2779 						bt fractionBlock[nrBlocks]{ 0 };
2780 						// copy fraction bits
2781 						size_t blocksRequired = (8 * sizeof(rawFraction) + 1) / bitsInBlock;
2782 						size_t maxBlockNr = (blocksRequired < nrBlocks ? blocksRequired : nrBlocks);
2783 						uint64_t mask = static_cast<uint64_t>(ALL_ONES); // set up the block mask
2784 						size_t shift = 0;
2785 						for (size_t i = 0; i < maxBlockNr; ++i) {
2786 							fractionBlock[i] = bt((mask & rawFraction) >> shift);
2787 							mask <<= bitsInBlock;
2788 							shift += bitsInBlock;
2789 						}
2790 						// shift fraction bits
2791 						int bitsToShift = upshift;
2792 						if (bitsToShift >= static_cast<int>(bitsInBlock)) {
2793 							int blockShift = static_cast<int>(bitsToShift / bitsInBlock);
2794 							for (int i = MSU; i >= blockShift; --i) {
2795 								fractionBlock[i] = fractionBlock[i - blockShift];
2796 							}
2797 							for (int i = blockShift - 1; i >= 0; --i) {
2798 								fractionBlock[i] = bt(0);
2799 							}
2800 							// adjust the shift
2801 							bitsToShift -= blockShift * bitsInBlock;
2802 						}
2803 						if (bitsToShift > 0) {
2804 							// construct the mask for the upper bits in the block that need to move to the higher word
2805 							bt bitsToMoveMask = bt(ALL_ONES << (bitsInBlock - bitsToShift));
2806 							for (size_t i = MSU; i > 0; --i) {
2807 								fractionBlock[i] <<= bitsToShift;
2808 								// mix in the bits from the right
2809 								bt bits = static_cast<bt>(bitsToMoveMask & fractionBlock[i - 1]); // operator & yields an int
2810 								fractionBlock[i] |= (bits >> (bitsInBlock - bitsToShift));
2811 							}
2812 							fractionBlock[0] <<= bitsToShift;
2813 						}
2814 						// OR the bits in
2815 						for (size_t i = 0; i <= MSU; ++i) {
2816 							_block[i] |= fractionBlock[i];
2817 						}
2818 						// enforce precondition for fast comparison by properly nulling bits that are outside of nbits
2819 						_block[MSU] &= MSU_MASK;
2820 						// finally, set the sign bit
2821 						setsign(s);
2822 					}
2823 					else {
2824 						// rhs is a subnormal
2825 	//					std::cerr << "rhs is a subnormal : " << to_binary(rhs) << " : " << rhs << '\n';
2826 					}
2827 				}
2828 			}
2829 			// post-processing results to implement saturation and projection after rounding logic
2830 			if constexpr (isSaturating) {
2831 				if (isinf(INF_TYPE_POSITIVE) || isnan(NAN_TYPE_QUIET)) {
2832 					maxpos();
2833 				}
2834 				else if (isinf(INF_TYPE_NEGATIVE) || isnan(NAN_TYPE_SIGNALLING)) {
2835 					maxneg();
2836 				}
2837 			}
2838 			else {
2839 				if (isnan(NAN_TYPE_QUIET)) {
2840 					setinf(false);
2841 				}
2842 				else if (isnan(NAN_TYPE_SIGNALLING)) {
2843 					setinf(true);
2844 				}
2845 			}
2846 			return *this;  // TODO: unreachable in some configurations
2847 		}
2848 	}
2849 
2850 	// post-processing results to implement saturation and projection after rounding logic
2851 	// arithmetic bit operations can't produce NaN encodings, so we need to re-interpret
2852 	// these encodings and 'project' them to the proper values.
post_process()2853 	void constexpr post_process() noexcept {
2854 		if constexpr (isSaturating) {
2855 			if (isinf(INF_TYPE_POSITIVE) || isnan(NAN_TYPE_QUIET)) {
2856 				maxpos();
2857 			}
2858 			else if (isinf(INF_TYPE_NEGATIVE) || isnan(NAN_TYPE_SIGNALLING)) {
2859 				maxneg();
2860 			}
2861 		}
2862 		else {
2863 			if (isnan(NAN_TYPE_QUIET)) {
2864 				setinf(false);
2865 			}
2866 			else if (isnan(NAN_TYPE_SIGNALLING)) {
2867 				setinf(true);
2868 			}
2869 		}
2870 	}
2871 
2872 protected:
2873 
2874 	/// <summary>
2875 	/// round a set of source bits to the present representation.
2876 	/// srcbits is the number of bits of significant in the source representation
2877 	/// </summary>
2878 	/// <typeparam name="StorageType"></typeparam>
2879 	/// <param name="raw"></param>
2880 	/// <returns></returns>
2881 	template<size_t srcbits, typename StorageType>
round(StorageType raw,int & exponent)2882 	constexpr uint64_t round(StorageType raw, int& exponent) noexcept {
2883 		if constexpr (fhbits < srcbits) {
2884 			// round to even: lsb guard round sticky
2885 		    // collect guard, round, and sticky bits
2886 		    // this same logic will work for the case where
2887 		    // we only have a guard bit and no round and sticky bits
2888 		    // because the mask logic will make round and sticky both 0
2889 			constexpr uint32_t shift = srcbits - fhbits - 1ull;
2890 			StorageType mask = (StorageType(1ull) << shift);
2891 			bool guard = (mask & raw);
2892 			mask >>= 1;
2893 			bool round = (mask & raw);
2894 			if constexpr (shift > 1u) { // protect against a negative shift
2895 				StorageType allones(StorageType(~0));
2896 				mask = StorageType(allones << (shift - 2));
2897 				mask = ~mask;
2898 			}
2899 			else {
2900 				mask = 0;
2901 			}
2902 			bool sticky = (mask & raw);
2903 
2904 			raw >>= (shift + 1);  // shift out the bits we are rounding away
2905 			bool lsb = (raw & 0x1u);
2906 			//  ... lsb | guard  round sticky   round
2907 			//       x     0       x     x       down
2908 			//       0     1       0     0       down  round to even
2909 			//       1     1       0     0        up   round to even
2910 			//       x     1       0     1        up
2911 			//       x     1       1     0        up
2912 			//       x     1       1     1        up
2913 			if (guard) {
2914 				if (lsb && (!round && !sticky)) ++raw; // round to even
2915 				if (round || sticky) ++raw;
2916 				if (raw == (1ull << nbits)) { // overflow
2917 					++exponent;
2918 					raw >>= 1u;
2919 				}
2920 			}
2921 		}
2922 		else {
2923 			constexpr size_t shift = fhbits - srcbits;
2924 			if constexpr (shift < (sizeof(StorageType))) {
2925 				raw <<= shift;
2926 			}
2927 			else {
2928 				std::cerr << "round: shift " << shift << " >= " << sizeof(StorageType) << std::endl;
2929 				raw = 0;
2930 			}
2931 		}
2932 		uint64_t significant = raw;
2933 		return significant;
2934 	}
2935 	template<typename ArgumentBlockType>
copyBits(ArgumentBlockType v)2936 	constexpr void copyBits(ArgumentBlockType v) {
2937 		size_t blocksRequired = (8 * sizeof(v) + 1 ) / bitsInBlock;
2938 		size_t maxBlockNr = (blocksRequired < nrBlocks ? blocksRequired : nrBlocks);
2939 		bt b{ 0ul }; b = bt(~b);
2940 		ArgumentBlockType mask = ArgumentBlockType(b);
2941 		size_t shift = 0;
2942 		for (size_t i = 0; i < maxBlockNr; ++i) {
2943 			_block[i] = bt((mask & v) >> shift);
2944 			mask <<= bitsInBlock;
2945 			shift += bitsInBlock;
2946 		}
2947 	}
shiftLeft(int leftShift)2948 	void shiftLeft(int leftShift) {
2949 		if (leftShift == 0) return;
2950 		if (leftShift < 0) return shiftRight(-leftShift);
2951 		if (leftShift > long(nbits)) leftShift = nbits; // clip to max
2952 		if (leftShift >= long(bitsInBlock)) {
2953 			int blockShift = leftShift / static_cast<int>(bitsInBlock);
2954 			for (signed i = signed(MSU); i >= blockShift; --i) {
2955 				_block[i] = _block[i - blockShift];
2956 			}
2957 			for (signed i = blockShift - 1; i >= 0; --i) {
2958 				_block[i] = bt(0);
2959 			}
2960 			// adjust the shift
2961 			leftShift -= (long)(blockShift * bitsInBlock);
2962 			if (leftShift == 0) return;
2963 		}
2964 		// construct the mask for the upper bits in the block that need to move to the higher word
2965 //		bt mask = static_cast<bt>(0xFFFFFFFFFFFFFFFFull << (bitsInBlock - leftShift));
2966 		bt mask = ALL_ONES;
2967 		mask <<= (bitsInBlock - leftShift);
2968 		for (unsigned i = MSU; i > 0; --i) {
2969 			_block[i] <<= leftShift;
2970 			// mix in the bits from the right
2971 			bt bits = static_cast<bt>(mask & _block[i - 1]);
2972 			_block[i] |= (bits >> (bitsInBlock - leftShift));
2973 		}
2974 		_block[0] <<= leftShift;
2975 	}
shiftRight(int rightShift)2976 	void shiftRight(int rightShift) {
2977 		if (rightShift == 0) return;
2978 		if (rightShift < 0) return shiftLeft(-rightShift);
2979 		if (rightShift >= long(nbits)) {
2980 			setzero();
2981 			return;
2982 		}
2983 		bool signext = sign();
2984 		size_t blockShift = 0;
2985 		if (rightShift >= long(bitsInBlock)) {
2986 			blockShift = rightShift / bitsInBlock;
2987 			if (MSU >= blockShift) {
2988 				// shift by blocks
2989 				for (size_t i = 0; i <= MSU - blockShift; ++i) {
2990 					_block[i] = _block[i + blockShift];
2991 				}
2992 			}
2993 			// adjust the shift
2994 			rightShift -= (long)(blockShift * bitsInBlock);
2995 			if (rightShift == 0) {
2996 				// fix up the leading zeros if we have a negative number
2997 				if (signext) {
2998 					// rightShift is guaranteed to be less than nbits
2999 					rightShift += (long)(blockShift * bitsInBlock);
3000 					for (size_t i = nbits - rightShift; i < nbits; ++i) {
3001 						this->setbit(i);
3002 					}
3003 				}
3004 				else {
3005 					// clean up the blocks we have shifted clean
3006 					rightShift += (long)(blockShift * bitsInBlock);
3007 					for (size_t i = nbits - rightShift; i < nbits; ++i) {
3008 						this->setbit(i, false);
3009 					}
3010 				}
3011 			}
3012 		}
3013 
3014 		bt mask = ALL_ONES;
3015 		mask >>= (bitsInBlock - rightShift); // this is a mask for the lower bits in the block that need to move to the lower word
3016 		for (unsigned i = 0; i < MSU; ++i) {  // TODO: can this be improved? we should not have to work on the upper blocks in case we block shifted
3017 			_block[i] >>= rightShift;
3018 			// mix in the bits from the left
3019 			bt bits = static_cast<bt>(mask & _block[i + 1]); // & operator returns an int
3020 			_block[i] |= (bits << (bitsInBlock - rightShift));
3021 		}
3022 		_block[MSU] >>= rightShift;
3023 
3024 		// fix up the leading zeros if we have a negative number
3025 		if (signext) {
3026 			// bitsToShift is guaranteed to be less than nbits
3027 			rightShift += (long)(blockShift * bitsInBlock);
3028 			for (size_t i = nbits - rightShift; i < nbits; ++i) {
3029 				this->setbit(i);
3030 			}
3031 		}
3032 		else {
3033 			// clean up the blocks we have shifted clean
3034 			rightShift += (long)(blockShift * bitsInBlock);
3035 			for (size_t i = nbits - rightShift; i < nbits; ++i) {
3036 				this->setbit(i, false);
3037 			}
3038 		}
3039 
3040 		// enforce precondition for fast comparison by properly nulling bits that are outside of nbits
3041 		_block[MSU] &= MSU_MASK;
3042 	}
3043 
3044 	// calculate the integer power 2 ^ b using exponentiation by squaring
ipow(int exponent) const3045 	double ipow(int exponent) const {
3046 		bool negative = (exponent < 0);
3047 		exponent = negative ? -exponent : exponent;
3048 		double result(1.0);
3049 		double base = 2.0;
3050 		for (;;) {
3051 			if (exponent % 2) result *= base;
3052 			exponent >>= 1;
3053 			if (exponent == 0) break;
3054 			base *= base;
3055 		}
3056 		return (negative ? (1.0 / result) : result);
3057 	}
3058 
3059 private:
3060 	bt _block[nrBlocks];
3061 
3062 	//////////////////////////////////////////////////////////////////////////////
3063 	// friend functions
3064 
3065 	// template parameters need names different from class template parameters (for gcc and clang)
3066 	template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3067 	friend std::ostream& operator<< (std::ostream& ostr, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& r);
3068 	template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3069 	friend std::istream& operator>> (std::istream& istr, cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& r);
3070 
3071 	template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3072 	friend bool operator==(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3073 	template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3074 	friend bool operator!=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3075 	template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3076 	friend bool operator< (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3077 	template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3078 	friend bool operator> (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3079 	template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3080 	friend bool operator<=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3081 	template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
3082 	friend bool operator>=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs);
3083 };
3084 
3085 ////////////////////// operators
3086 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <<(std::ostream & ostr,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v)3087 inline std::ostream& operator<<(std::ostream& ostr, const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating>& v) {
3088 	// TODO: make it a native conversion
3089 	ostr << double(v);
3090 	return ostr;
3091 }
3092 
3093 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >>(std::istream & istr,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v)3094 inline std::istream& operator>>(std::istream& istr, const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating>& v) {
3095 	istr >> v._fraction;
3096 	return istr;
3097 }
3098 
3099 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator ==(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3100 inline bool operator==(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) {
3101 	for (size_t i = 0; i < lhs.nrBlocks; ++i) {
3102 		if (lhs._block[i] != rhs._block[i]) {
3103 			return false;
3104 		}
3105 	}
3106 	return true;
3107 }
3108 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator !=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3109 inline bool operator!=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return !operator==(lhs, rhs); }
3110 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator <(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3111 inline bool operator< (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return (lhs - rhs).isneg(); }
3112 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator >(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3113 inline bool operator> (const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return  operator< (rhs, lhs); }
3114 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator <=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3115 inline bool operator<=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return !operator> (lhs, rhs); }
3116 template<size_t nnbits, size_t nes, typename nbt, bool nsub, bool nsup, bool nsat>
operator >=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & lhs,const cfloat<nnbits,nes,nbt,nsub,nsup,nsat> & rhs)3117 inline bool operator>=(const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& lhs, const cfloat<nnbits,nes,nbt,nsub,nsup,nsat>& rhs) { return !operator< (lhs, rhs); }
3118 
3119 // posit - posit binary arithmetic operators
3120 // BINARY ADDITION
3121 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator +(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & rhs)3122 inline cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> operator+(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& rhs) {
3123 	cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> sum(lhs);
3124 	sum += rhs;
3125 	return sum;
3126 }
3127 // BINARY SUBTRACTION
3128 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator -(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & rhs)3129 inline cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> operator-(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& rhs) {
3130 	cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> diff(lhs);
3131 	diff -= rhs;
3132 	return diff;
3133 }
3134 // BINARY MULTIPLICATION
3135 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator *(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & rhs)3136 inline cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> operator*(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& rhs) {
3137 	cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> mul(lhs);
3138 	mul *= rhs;
3139 	return mul;
3140 }
3141 // BINARY DIVISION
3142 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator /(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & rhs)3143 inline cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> operator/(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& rhs) {
3144 	cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> ratio(lhs);
3145 	ratio /= rhs;
3146 	return ratio;
3147 }
3148 
3149 // encoding helpers
3150 
3151 // return the Unit in the Last Position
3152 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
ulp(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & a)3153 inline cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> ulp(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& a) {
3154 	cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating> b(a);
3155 	return ++b - a;
3156 }
3157 
3158 // convert to std::string
3159 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
to_string(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v)3160 inline std::string to_string(const cfloat<nbits,es,bt, hasSubnormals, hasSupernormals, isSaturating>& v) {
3161 	std::stringstream s;
3162 	if (v.iszero()) {
3163 		s << " zero b";
3164 		return s.str();
3165 	}
3166 	else if (v.isinf()) {
3167 		s << " infinite b";
3168 		return s.str();
3169 	}
3170 //	s << "(" << (v.sign() ? "-" : "+") << "," << v.scale() << "," << v.fraction() << ")";
3171 	return s.str();
3172 }
3173 
3174 // transform cfloat to a binary representation
3175 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
to_binary(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & number,bool nibbleMarker=false)3176 inline std::string to_binary(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& number, bool nibbleMarker = false) {
3177 	std::stringstream s;
3178 	s << "0b";
3179 	size_t index = nbits;
3180 	s << (number.at(--index) ? '1' : '0') << '.';
3181 
3182 	for (int i = int(es)-1; i >= 0; --i) {
3183 		s << (number.at(--index) ? '1' : '0');
3184 		if (i > 0 && (i % 4) == 0 && nibbleMarker) s << '\'';
3185 	}
3186 
3187 	s << '.';
3188 
3189 	constexpr int fbits = nbits - 1ull - es;
3190 	for (int i = fbits-1; i >= 0; --i) {
3191 		s << (number.at(--index) ? '1' : '0');
3192 		if (i > 0 && (i % 4) == 0 && nibbleMarker) s << '\'';
3193 	}
3194 
3195 	return s.str();
3196 }
3197 
3198 // transform a cfloat into a triple representation
3199 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
to_triple(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & number,bool nibbleMarker=true)3200 inline std::string to_triple(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& number, bool nibbleMarker = true) {
3201 	std::stringstream s;
3202 	blocktriple<cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>::fbits, BlockTripleOperator::REPRESENTATION, bt> triple;
3203 	number.normalize(triple);
3204 	s << to_triple(triple, nibbleMarker);
3205 	return s.str();
3206 }
3207 
3208 // Magnitude of a cfloat (equivalent to turning the sign bit off).
3209 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
3210 cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>
abs(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & v)3211 abs(const cfloat<nbits,es,bt, hasSubnormals, hasSupernormals, isSaturating>& v) {
3212 	return cfloat<nbits,es,bt, hasSubnormals, hasSupernormals, isSaturating>(false, v.scale(), v.fraction(), v.isZero());
3213 }
3214 
3215 ///////////////////////////////////////////////////////////////////////
3216 ///   binary logic literal comparisons
3217 
3218 // cfloat - literal float logic operators
3219 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator ==(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3220 inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3221 	return float(lhs) == rhs;
3222 }
3223 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator !=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3224 inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3225 	return float(lhs) != rhs;
3226 }
3227 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3228 inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3229 	return float(lhs) < rhs;
3230 }
3231 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3232 inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3233 	return float(lhs) > rhs;
3234 }
3235 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3236 inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3237 	return float(lhs) <= rhs;
3238 }
3239 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,float rhs)3240 inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, float rhs) {
3241 	return float(lhs) >= rhs;
3242 }
3243 // cfloat - literal double logic operators
3244 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator ==(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3245 inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3246 	return double(lhs) == rhs;
3247 }
3248 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator !=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3249 inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3250 	return double(lhs) != rhs;
3251 }
3252 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3253 inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3254 	return double(lhs) < rhs;
3255 }
3256 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3257 inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3258 	return double(lhs) > rhs;
3259 }
3260 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3261 inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3262 	return double(lhs) <= rhs;
3263 }
3264 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,double rhs)3265 inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, double rhs) {
3266 	return double(lhs) >= rhs;
3267 }
3268 
3269 #if LONG_DOUBLE_SUPPORT
3270 // cfloat - literal long double logic operators
3271 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator ==(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3272 inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3273 	return (long double)(lhs) == rhs;
3274 }
3275 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator !=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3276 inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3277 	return (long double)(lhs) != rhs;
3278 }
3279 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3280 inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3281 	return (long double)(lhs) < rhs;
3282 }
3283 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3284 inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3285 	return (long double)(lhs) > rhs;
3286 }
3287 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3288 inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3289 	return (long double)(lhs) <= rhs;
3290 }
3291 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long double rhs)3292 inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long double rhs) {
3293 	return (long double)(lhs) >= rhs;
3294 }
3295 #endif
3296 
3297 // cfloat - literal int logic operators
3298 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator ==(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3299 inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3300 	return operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3301 }
3302 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator !=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3303 inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3304 	return operator!=(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3305 }
3306 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3307 inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3308 	return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3309 }
3310 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3311 inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3312 	return operator<(cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs), lhs);
3313 }
3314 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3315 inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3316 	return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs)) || operator==(lhs, cfloat<nbits, es, bt>(rhs));
3317 }
3318 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,int rhs)3319 inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, int rhs) {
3320 	return !operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3321 }
3322 
3323 // cfloat - long long logic operators
3324 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator ==(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3325 inline bool operator==(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3326 	return operator==(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3327 }
3328 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator !=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3329 inline bool operator!=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3330 	return operator!=(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3331 }
3332 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3333 inline bool operator< (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3334 	return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3335 }
3336 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3337 inline bool operator> (const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3338 	return operator<(cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs), lhs);
3339 }
3340 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator <=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3341 inline bool operator<=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3342 	return operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs)) || operator==(lhs, cfloat<nbits, es, bt>(rhs));
3343 }
3344 template<size_t nbits, size_t es, typename bt, bool hasSubnormals, bool hasSupernormals, bool isSaturating>
operator >=(const cfloat<nbits,es,bt,hasSubnormals,hasSupernormals,isSaturating> & lhs,long long rhs)3345 inline bool operator>=(const cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>& lhs, long long rhs) {
3346 	return !operator<(lhs, cfloat<nbits, es, bt, hasSubnormals, hasSupernormals, isSaturating>(rhs));
3347 }
3348 
3349 }  // namespace sw::universal
3350